This is the best I could come up with.
Extracting:
Code:
;-----------------------------------------------------------------------
; Extract: split packed 4-byte pixels into four separate byte planes.
; Target: 32-bit x86, SSE2 (XMM registers only, so no EMMS is required).
;
; Initial conditions: ecx = pixel count, esi = source, edx = channel 1, ebx = channel 2,
; edi = channel 3, ebp = channel 4
;
; Notes:
;   - esi must be 16-byte aligned (movdqa load).
;   - Pixels are processed four at a time; the remaining (count mod 4)
;     pixels are NOT processed here.
;   - Channel 1 receives the lowest byte of each pixel dword, channel 4
;     the highest.
; Clobbers: eax, ecx, edx, ebx, esi, edi, ebp, xmm0-xmm4, flags.
;-----------------------------------------------------------------------
        shr     ecx, 2                  ; ecx = number of 4-pixel groups
        jz      extractdone             ; fewer than 4 pixels: skip, otherwise dec would wrap
        sub     ebx, edx                ; ebx/edi/ebp become offsets relative to edx,
        sub     edi, edx                ; so only one pointer is advanced per iteration
        sub     ebp, edx
        sub     edx, 4                  ; pre-bias: the loop advances edx at its top
        mov     eax, 0ffh
        movd    xmm4, eax               ; build the mask in XMM (MMX regs are a separate file)
        pshufd  xmm4, xmm4, 0           ; xmm4 = 000000FFh in every dword lane
extractloop:
        add     edx, 4
        movdqa  xmm0, [esi]             ; four packed pixels
        movdqa  xmm1, xmm0
        movdqa  xmm2, xmm0
        movdqa  xmm3, xmm0
        psrld   xmm1, 8                 ; move the wanted byte to bits 0..7 of each lane
        psrld   xmm2, 16
        psrld   xmm3, 24                ; logical shift by 24 already leaves only one byte
        pand    xmm0, xmm4              ; isolate that byte (each lane is now 0..255)
        pand    xmm1, xmm4
        pand    xmm2, xmm4
        packssdw xmm0, xmm0             ; dword -> word; values <= 255 so no saturation
        packssdw xmm1, xmm1
        packssdw xmm2, xmm2
        packssdw xmm3, xmm3
        packuswb xmm0, xmm0             ; word -> byte; low dword = 4 channel bytes
        packuswb xmm1, xmm1
        packuswb xmm2, xmm2
        packuswb xmm3, xmm3
        movd    [edx], xmm0             ; store from XMM, where the results actually live
        movd    [edx+ebx], xmm1
        movd    [edx+edi], xmm2
        movd    [edx+ebp], xmm3
        add     esi, 16
        dec     ecx
        jnz     extractloop
extractdone:
Merging:
Code:
;-----------------------------------------------------------------------
; Merge: interleave four separate byte planes into packed 4-byte pixels.
; Target: 32-bit x86, SSE2 (XMM registers only, so no EMMS is required).
;
; Initial conditions: ecx = pixel count, esi = destination, edx = channel 1, ebx = channel 2,
; edi = channel 3, ebp = channel 4
;
; Notes:
;   - esi must be 16-byte aligned (movdqa store).
;   - Pixels are processed four at a time; the remaining (count mod 4)
;     pixels are NOT processed here.
;   - Channel 1 becomes the lowest byte of each pixel dword, channel 4
;     the highest.
; Clobbers: ecx, edx, ebx, esi, edi, ebp, xmm0-xmm3, flags.
;-----------------------------------------------------------------------
        shr     ecx, 2                  ; ecx = number of 4-pixel groups
        jz      mergedone               ; fewer than 4 pixels: skip, otherwise dec would wrap
        sub     esi, 16                 ; pre-bias: the loop advances esi at its top
        sub     ebx, edx                ; ebx/edi/ebp become offsets relative to edx
        sub     edi, edx
        sub     ebp, edx
mergeloop:
        add     esi, 16
        ; Load 4 bytes of each plane with movd: the SSE2 punpcklbw memory
        ; form reads 16 bytes and requires 16-byte alignment, so it cannot
        ; be used directly on the 4-byte-stepped plane pointers.
        movd    xmm0, [edx]             ; a0 a1 a2 a3
        movd    xmm2, [edx+ebx]         ; b0 b1 b2 b3
        movd    xmm1, [edx+edi]         ; c0 c1 c2 c3
        movd    xmm3, [edx+ebp]         ; d0 d1 d2 d3
        punpcklbw xmm0, xmm2            ; a0 b0 a1 b1 a2 b2 a3 b3
        punpcklbw xmm1, xmm3            ; c0 d0 c1 d1 c2 d2 c3 d3
        punpcklwd xmm0, xmm1            ; a0 b0 c0 d0 a1 b1 c1 d1 ... (4 pixels)
        movdqa  [esi], xmm0
        add     edx, 4
        dec     ecx
        jnz     mergeloop
mergedone: