123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458 |
- %include "jcolsamp.inc"  ; external include, contents not visible here; presumably supplies the xmmA..xmmH aliases and RGB_PIXELSIZE used below -- confirm
- %define out_width(b) (b) + 8  ; 1st stack argument (read as JDIMENSION); b is the register holding esp as it was right after `push ebp`
- %define input_buf(b) (b) + 12  ; 2nd stack argument (read as JSAMPIMAGE)
- %define input_row(b) (b) + 16  ; 3rd stack argument (read as JDIMENSION)
- %define output_buf(b) (b) + 20  ; 4th stack argument (read as JSAMPARRAY)
- %define num_rows(b) (b) + 24  ; 5th stack argument (read as INT)
- %define original_ebp ebp + 0  ; slot at the aligned frame base; the prologue stores the pre-alignment stack pointer there
- %define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; address of XMMWORD scratch slot i; wk(0) is the lowest, the slots end at ebp
- ; The macros above and below expand textually inside [...] operands, so they are deliberately left unparenthesised.
- %define WK_NUM 2  ; number of XMMWORD scratch slots reserved below ebp
- %define gotptr wk(0) - SIZEOF_POINTER  ; pointer-sized slot directly below wk(0); holds the saved GOT base
- align 32  ; start the routine on a 32-byte boundary
- GLOBAL_FUNCTION(jsimd_ycc_rgb_convert_sse2)  ; symbol declaration through a macro defined outside this file
- EXTN(jsimd_ycc_rgb_convert_sse2):  ; entry: for each of num_rows rows, combines three XMMWORD-aligned input sample rows into interleaved RGB_PIXELSIZE-byte pixels, SIZEOF_XMMWORD columns per pass; the five arguments are read from the stack
- push ebp  ; save the caller's ebp
- mov eax, esp  ; eax = address of the saved ebp; return address at eax+4, arguments from eax+8
- sub esp, byte 4  ; one dword that stays valid after the alignment below
- and esp, byte (-SIZEOF_XMMWORD)  ; round esp down to an XMMWORD boundary so movdqa can address wk()
- mov [esp], eax  ; remember the unaligned stack pointer for the epilogue
- mov ebp, esp  ; ebp = aligned frame base; [ebp] holds the value just stored
- lea esp, [wk(0)]  ; reserve the WK_NUM XMMWORD scratch slots below ebp
- pushpic eax  ; macro from outside this file; presumably reserves the gotptr slot under wk(0) -- confirm
- push ebx  ; ebx, esi and edi are restored at .return
- push esi
- push edi
- get_GOT ebx  ; macro from outside this file; presumably loads the PIC/GOT base into ebx -- confirm
- movpic POINTER [gotptr], ebx  ; keep that base in the frame: ebx is reused as a data pointer below
- mov ecx, JDIMENSION [out_width(eax)]  ; ecx = out_width (columns per row)
- test ecx, ecx
- jz near .return  ; zero width: nothing to convert
- push ecx  ; park the width while ecx is borrowed for input_row
- mov edi, JSAMPIMAGE [input_buf(eax)]  ; edi = input_buf, indexed below as an array of three plane entries
- mov ecx, JDIMENSION [input_row(eax)]  ; ecx = input_row (index of the first row to read)
- mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]  ; esi = input_buf[0], row-pointer array of plane 0
- mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]  ; ebx = input_buf[1], row-pointer array of plane 1
- mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]  ; edx = input_buf[2], row-pointer array of plane 2
- lea esi, [esi+ecx*SIZEOF_JSAMPROW]  ; esi = address of input_buf[0][input_row]
- lea ebx, [ebx+ecx*SIZEOF_JSAMPROW]  ; ebx = address of input_buf[1][input_row]
- lea edx, [edx+ecx*SIZEOF_JSAMPROW]  ; edx = address of input_buf[2][input_row]
- pop ecx  ; ecx = out_width again
- mov edi, JSAMPARRAY [output_buf(eax)]  ; edi = output_buf (array of output row pointers)
- mov eax, INT [num_rows(eax)]  ; eax = num_rows; the argument base held in eax is gone after this
- test eax, eax
- jle near .return  ; num_rows is zero or negative: nothing to do
- alignx 16, 7  ; align the loop head (alignx is a macro defined outside this file)
- .rowloop:  ; per row: eax = rows left, ecx = width, esi/ebx/edx/edi = current entries of the row-pointer arrays
- push eax  ; save the row-level state; all six registers are reused inside the column loop
- push edi
- push edx
- push ebx
- push esi
- push ecx
- mov esi, JSAMPROW [esi]  ; esi = plane-0 row data (the Y term in the formulas below)
- mov ebx, JSAMPROW [ebx]  ; ebx = plane-1 row data
- mov edx, JSAMPROW [edx]  ; edx = plane-2 row data
- mov edi, JSAMPROW [edi]  ; edi = output row data
- movpic eax, POINTER [gotptr]  ; eax = saved GOT base, used by every GOTOFF() operand
- alignx 16, 7
- .columnloop:  ; per pass: ecx = columns still to produce in this row
- movdqa xmm5, XMMWORD [ebx]  ; one XMMWORD of plane-1 bytes; aligned load, and the whole XMMWORD is read even when fewer columns remain
- movdqa xmm1, XMMWORD [edx]  ; one XMMWORD of plane-2 bytes (same alignment and over-read note)
- pcmpeqw xmm4, xmm4  ; xmm4 = all ones
- pcmpeqw xmm7, xmm7  ; xmm7 = all ones
- psrlw xmm4, BYTE_BIT  ; xmm4 = low-byte mask in every word (0x00FF when BYTE_BIT is 8)
- psllw xmm7, 7  ; xmm7 = 0xFF80 in every word, i.e. -128
- movdqa xmm0, xmm4  ; second copy of the mask
- pand xmm4, xmm5  ; xmm4 = even-column plane-1 samples, one per word
- psrlw xmm5, BYTE_BIT  ; xmm5 = odd-column plane-1 samples
- pand xmm0, xmm1  ; xmm0 = even-column plane-2 samples
- psrlw xmm1, BYTE_BIT  ; xmm1 = odd-column plane-2 samples
- paddw xmm4, xmm7  ; subtract 128 from every plane-1 / plane-2 sample (four adds)
- paddw xmm5, xmm7
- paddw xmm0, xmm7
- paddw xmm1, xmm7
- ; Per-channel offsets, computed in 16-bit lanes; c1/c2 are the plane-1 and
- ; plane-2 samples after the -128 bias, even columns in xmm4/xmm0 and odd
- ; columns in xmm5/xmm1. The PW_*/PD_* tables are defined outside this file;
- ; going by their names (MF0228 = -0.228, F0402 = 0.402, MF0344_F0285 =
- ; -0.344 and 0.285) the code evaluates -- TODO confirm the values:
- ;   plane-2 term: 0.402*c2 + c2                  (ends up in xmm0/xmm1)
- ;   plane-1 term: -0.228*c1 + c1 + c1            (spilled to wk(0)/wk(1))
- ;   mixed term:   -0.344*c1 + 0.285*c2 - c2      (ends up in xmm2/xmm3)
- ; Each term is later added to the plane-0 sample and saturated to a byte.
- movdqa xmm2, xmm4  ; xmm2 = c1, even columns
- movdqa xmm3, xmm5  ; xmm3 = c1, odd columns
- paddw xmm4, xmm4  ; xmm4 = 2*c1 (even); doubled so the +1 and shift below can round
- paddw xmm5, xmm5  ; xmm5 = 2*c1 (odd)
- movdqa xmm6, xmm0  ; xmm6 = c2, even columns
- movdqa xmm7, xmm1  ; xmm7 = c2, odd columns (the -128 constant is no longer needed)
- paddw xmm0, xmm0  ; xmm0 = 2*c2 (even)
- paddw xmm1, xmm1  ; xmm1 = 2*c2 (odd)
- pmulhw xmm4, [GOTOFF(eax,PW_MF0228)]  ; high word of 2*c1 * PW_MF0228 (even)
- pmulhw xmm5, [GOTOFF(eax,PW_MF0228)]  ; same for odd columns
- pmulhw xmm0, [GOTOFF(eax,PW_F0402)]  ; high word of 2*c2 * PW_F0402 (even)
- pmulhw xmm1, [GOTOFF(eax,PW_F0402)]  ; same for odd columns
- paddw xmm4, [GOTOFF(eax,PW_ONE)]  ; rounding term before halving (PW_ONE presumably holds 1 per word -- confirm)
- paddw xmm5, [GOTOFF(eax,PW_ONE)]
- psraw xmm4, 1  ; undo the doubling: rounded product for even columns
- psraw xmm5, 1  ; rounded product for odd columns
- paddw xmm0, [GOTOFF(eax,PW_ONE)]  ; same rounding for the plane-2 products
- paddw xmm1, [GOTOFF(eax,PW_ONE)]
- psraw xmm0, 1
- psraw xmm1, 1
- paddw xmm4, xmm2  ; plane-1 term: product + c1 ...
- paddw xmm5, xmm3
- paddw xmm4, xmm2  ; ... + c1 again (even)
- paddw xmm5, xmm3  ; ... + c1 again (odd)
- paddw xmm0, xmm6  ; plane-2 term: product + c2 (even)
- paddw xmm1, xmm7  ; plane-2 term: product + c2 (odd)
- movdqa XMMWORD [wk(0)], xmm4  ; spill the even plane-1 term; xmm4 is needed as a temporary
- movdqa XMMWORD [wk(1)], xmm5  ; spill the odd plane-1 term
- movdqa xmm4, xmm2  ; copy of c1 (even) for the high-half unpack
- movdqa xmm5, xmm3  ; copy of c1 (odd) for the high-half unpack
- punpcklwd xmm2, xmm6  ; pair c1 with c2, low four even columns
- punpckhwd xmm4, xmm6  ; pair c1 with c2, high four even columns
- pmaddwd xmm2, [GOTOFF(eax,PW_MF0344_F0285)]  ; per pair: c1*k0 + c2*k1 as a dword, k0/k1 taken from the table
- pmaddwd xmm4, [GOTOFF(eax,PW_MF0344_F0285)]
- punpcklwd xmm3, xmm7  ; same pairing for the odd columns
- punpckhwd xmm5, xmm7
- pmaddwd xmm3, [GOTOFF(eax,PW_MF0344_F0285)]
- pmaddwd xmm5, [GOTOFF(eax,PW_MF0344_F0285)]
- paddd xmm2, [GOTOFF(eax,PD_ONEHALF)]  ; rounding constant before the shift (value defined outside this file)
- paddd xmm4, [GOTOFF(eax,PD_ONEHALF)]
- psrad xmm2, SCALEBITS  ; drop the SCALEBITS fraction bits
- psrad xmm4, SCALEBITS
- paddd xmm3, [GOTOFF(eax,PD_ONEHALF)]
- paddd xmm5, [GOTOFF(eax,PD_ONEHALF)]
- psrad xmm3, SCALEBITS
- psrad xmm5, SCALEBITS
- packssdw xmm2, xmm4  ; back to eight words: mixed sum for even columns
- packssdw xmm3, xmm5  ; mixed sum for odd columns
- psubw xmm2, xmm6  ; mixed term: sum - c2 (even)
- psubw xmm3, xmm7  ; mixed term: sum - c2 (odd)
- movdqa xmm5, XMMWORD [esi]  ; one XMMWORD of plane-0 bytes (aligned load, same over-read note as above)
- pcmpeqw xmm4, xmm4  ; rebuild the low-byte mask
- psrlw xmm4, BYTE_BIT
- pand xmm4, xmm5  ; xmm4 = even-column plane-0 samples
- psrlw xmm5, BYTE_BIT  ; xmm5 = odd-column plane-0 samples
- paddw xmm0, xmm4  ; plane-0 + plane-2 term (even)
- paddw xmm1, xmm5  ; plane-0 + plane-2 term (odd)
- packuswb xmm0, xmm0  ; saturate to unsigned bytes; the eight results appear in both halves
- packuswb xmm1, xmm1
- paddw xmm2, xmm4  ; plane-0 + mixed term (even)
- paddw xmm3, xmm5  ; plane-0 + mixed term (odd)
- packuswb xmm2, xmm2
- packuswb xmm3, xmm3
- paddw xmm4, XMMWORD [wk(0)]  ; plane-0 + spilled plane-1 term (even)
- paddw xmm5, XMMWORD [wk(1)]  ; plane-0 + spilled plane-1 term (odd)
- packuswb xmm4, xmm4
- packuswb xmm5, xmm5
- %if RGB_PIXELSIZE == 3  ; three bytes per output pixel
- ; xmmA..xmmH are register aliases defined outside this file. The shuffle
- ; below appears to treat xmmA, xmmC, xmmE as the even-column bytes and xmmB,
- ; xmmD, xmmF as the odd-column bytes of pixel bytes 0, 1, 2 -- TODO confirm.
- ; It leaves three XMMWORDs of interleaved output in xmmA, xmmD, xmmF.
- punpcklbw xmmA, xmmC  ; byte-interleave the low halves of xmmA and xmmC
- punpcklbw xmmE, xmmB  ; byte-interleave the low halves of xmmE and xmmB
- punpcklbw xmmD, xmmF  ; byte-interleave the low halves of xmmD and xmmF
- movdqa xmmG, xmmA  ; copy for the high-half word unpack
- movdqa xmmH, xmmA  ; copy that is shifted by one word below
- punpcklwd xmmA, xmmE  ; word-interleave low halves
- punpckhwd xmmG, xmmE  ; word-interleave high halves
- psrldq xmmH, 2  ; shift right by two bytes (one word)
- psrldq xmmE, 2  ; shift right by two bytes (one word)
- movdqa xmmC, xmmD  ; copy for the high-half word unpack
- movdqa xmmB, xmmD  ; copy that is shifted by one word below
- punpcklwd xmmD, xmmH  ; word-interleave low halves
- punpckhwd xmmC, xmmH  ; word-interleave high halves
- psrldq xmmB, 2  ; shift right by two bytes (one word)
- movdqa xmmF, xmmE  ; copy for the high-half word unpack
- punpcklwd xmmE, xmmB  ; word-interleave low halves
- punpckhwd xmmF, xmmB  ; word-interleave high halves
- pshufd xmmH, xmmA, 0x4E  ; xmmH = xmmA with its two 64-bit halves swapped
- movdqa xmmB, xmmE  ; keep xmmE before it is overwritten
- punpckldq xmmA, xmmD  ; dword-interleave low halves
- punpckldq xmmE, xmmH
- punpckhdq xmmD, xmmB  ; dword-interleave high halves
- pshufd xmmH, xmmG, 0x4E  ; xmmH = xmmG with its two 64-bit halves swapped
- movdqa xmmB, xmmF  ; keep xmmF before it is overwritten
- punpckldq xmmG, xmmC
- punpckldq xmmF, xmmH
- punpckhdq xmmC, xmmB
- punpcklqdq xmmA, xmmE  ; xmmA = first XMMWORD of output for this pass
- punpcklqdq xmmD, xmmG  ; xmmD = second XMMWORD of output
- punpcklqdq xmmF, xmmC  ; xmmF = third XMMWORD of output
- cmp ecx, byte SIZEOF_XMMWORD  ; fewer columns left than one full pass?
- jb short .column_st32  ; yes: partial store path
- test edi, SIZEOF_XMMWORD-1  ; is the output pointer XMMWORD-aligned?
- jnz short .out1  ; no: use unaligned stores
- ; aligned destination: non-temporal stores, ordered by the sfence after the row loop
- movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
- movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
- movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
- jmp short .out0
- .out1:  ; unaligned destination
- movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
- movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
- movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
- .out0:
- add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; advance the output pointer past the bytes just written
- sub ecx, byte SIZEOF_XMMWORD  ; columns left in this row
- jz near .nextrow  ; row finished
- add esi, byte SIZEOF_XMMWORD  ; advance the three input pointers
- add ebx, byte SIZEOF_XMMWORD
- add edx, byte SIZEOF_XMMWORD
- jmp near .columnloop
- alignx 16, 7
- .column_st32:  ; tail of a row: fewer than SIZEOF_XMMWORD columns remain
- lea ecx, [ecx+ecx*2]  ; ecx = remaining output bytes (three per column)
- cmp ecx, byte 2*SIZEOF_XMMWORD  ; at least two XMMWORDs to write?
- jb short .column_st16
- movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
- movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
- add edi, byte 2*SIZEOF_XMMWORD
- movdqa xmmA, xmmF  ; the remaining bytes come from the third vector
- sub ecx, byte 2*SIZEOF_XMMWORD
- jmp short .column_st15  ; less than one XMMWORD can remain now
- .column_st16:
- cmp ecx, byte SIZEOF_XMMWORD  ; at least one XMMWORD to write?
- jb short .column_st15
- movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
- add edi, byte SIZEOF_XMMWORD
- movdqa xmmA, xmmD  ; the remaining bytes come from the second vector
- sub ecx, byte SIZEOF_XMMWORD
- .column_st15:  ; xmmA = pending bytes, ecx = number of them still to be written
- ; From here the tail is written in shrinking pieces taken from the low end
- ; of xmmA: first an MMWORD (movq) if at least SIZEOF_MMWORD bytes remain.
- cmp ecx, byte SIZEOF_MMWORD
- jb short .column_st7
- movq XMM_MMWORD [edi], xmmA
- add edi, byte SIZEOF_MMWORD
- sub ecx, byte SIZEOF_MMWORD
- psrldq xmmA, SIZEOF_MMWORD  ; discard the bytes just written
- .column_st7:
- ; Then a dword (movd) if at least SIZEOF_DWORD bytes remain; the written
- ; bytes are shifted out of xmmA afterwards.
- cmp ecx, byte SIZEOF_DWORD
- jb short .column_st3
- movd XMM_DWORD [edi], xmmA
- add edi, byte SIZEOF_DWORD
- sub ecx, byte SIZEOF_DWORD
- psrldq xmmA, SIZEOF_DWORD  ; discard the bytes just written
- .column_st3:
- ; The last few bytes go through eax. This overwrites the GOT base, which is
- ; safe only because this path always ends the row (it falls into .nextrow).
- movd eax, xmmA  ; eax = low four pending bytes
- cmp ecx, byte SIZEOF_WORD
- jb short .column_st1
- mov word [edi], ax
- add edi, byte SIZEOF_WORD
- sub ecx, byte SIZEOF_WORD
- shr eax, 16  ; bring the next pending byte into al
- .column_st1:
- ; At most one byte is left at this point; write it from al if ecx is
- ; non-zero.
- test ecx, ecx
- jz short .nextrow
- mov byte [edi], al
- %else  ; RGB_PIXELSIZE is not 3: the code below lays out four bytes per pixel
- %ifdef RGBX_FILLER_0XFF
- pcmpeqb xmm6, xmm6  ; xmm6/xmm7 supply the pixel byte that is not a computed channel: all ones here
- pcmpeqb xmm7, xmm7
- %else
- pxor xmm6, xmm6  ; filler byte is zero in this configuration
- pxor xmm7, xmm7
- %endif
- ; xmmA..xmmH are register aliases defined outside this file. The shuffle
- ; below appears to treat xmmA, xmmC, xmmE, xmmG as the even-column bytes and
- ; xmmB, xmmD, xmmF, xmmH as the odd-column bytes of pixel bytes 0..3 -- TODO
- ; confirm. It leaves four XMMWORDs of output in xmmA, xmmD, xmmC, xmmH.
- punpcklbw xmmA, xmmC  ; byte-interleave the low halves of xmmA and xmmC
- punpcklbw xmmE, xmmG  ; byte-interleave the low halves of xmmE and xmmG
- punpcklbw xmmB, xmmD  ; byte-interleave the low halves of xmmB and xmmD
- punpcklbw xmmF, xmmH  ; byte-interleave the low halves of xmmF and xmmH
- movdqa xmmC, xmmA  ; copy for the high-half word unpack
- punpcklwd xmmA, xmmE  ; word-interleave low halves
- punpckhwd xmmC, xmmE  ; word-interleave high halves
- movdqa xmmG, xmmB  ; copy for the high-half word unpack
- punpcklwd xmmB, xmmF  ; word-interleave low halves
- punpckhwd xmmG, xmmF  ; word-interleave high halves
- movdqa xmmD, xmmA  ; copy for the high-half dword unpack
- punpckldq xmmA, xmmB  ; xmmA = first XMMWORD of output for this pass
- punpckhdq xmmD, xmmB  ; xmmD = second XMMWORD of output
- movdqa xmmH, xmmC  ; copy for the high-half dword unpack
- punpckldq xmmC, xmmG  ; xmmC = third XMMWORD of output
- punpckhdq xmmH, xmmG  ; xmmH = fourth XMMWORD of output
- cmp ecx, byte SIZEOF_XMMWORD  ; fewer columns left than one full pass?
- jb short .column_st32  ; yes: partial store path
- test edi, SIZEOF_XMMWORD-1  ; is the output pointer XMMWORD-aligned?
- jnz short .out1  ; no: use unaligned stores
- ; aligned destination: non-temporal stores, ordered by the sfence after the row loop
- movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
- movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
- movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
- movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
- jmp short .out0
- .out1:  ; unaligned destination
- movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
- movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
- movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
- movdqu XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
- .out0:
- add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; advance the output pointer past the bytes just written
- sub ecx, byte SIZEOF_XMMWORD  ; columns left in this row
- jz near .nextrow  ; row finished
- add esi, byte SIZEOF_XMMWORD  ; advance the three input pointers
- add ebx, byte SIZEOF_XMMWORD
- add edx, byte SIZEOF_XMMWORD
- jmp near .columnloop
- alignx 16, 7
- .column_st32:  ; tail of a row: fewer than SIZEOF_XMMWORD columns remain; ecx stays in columns on this path
- cmp ecx, byte SIZEOF_XMMWORD/2  ; enough columns left for two XMMWORDs of output?
- jb short .column_st16
- movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
- movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
- add edi, byte 2*SIZEOF_XMMWORD
- movdqa xmmA, xmmC  ; the remaining pixels come from the third ...
- movdqa xmmD, xmmH  ; ... and fourth vectors
- sub ecx, byte SIZEOF_XMMWORD/2
- .column_st16:
- cmp ecx, byte SIZEOF_XMMWORD/4  ; enough columns left for one XMMWORD of output?
- jb short .column_st15
- movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
- add edi, byte SIZEOF_XMMWORD
- movdqa xmmA, xmmD  ; the remaining pixels come from the next vector
- sub ecx, byte SIZEOF_XMMWORD/4
- .column_st15:  ; xmmA = pending pixels, ecx = columns still to be written
- ; Write the low eight bytes of xmmA (movq) if at least SIZEOF_XMMWORD/8
- ; columns remain, then shift them out of xmmA.
- cmp ecx, byte SIZEOF_XMMWORD/8
- jb short .column_st7
- movq XMM_MMWORD [edi], xmmA
- add edi, byte SIZEOF_XMMWORD/8*4  ; advance by the bytes movq wrote (four per column)
- sub ecx, byte SIZEOF_XMMWORD/8
- psrldq xmmA, SIZEOF_XMMWORD/8*4  ; discard the bytes just written
- .column_st7:
- ; With 16-byte XMMWORDs at most one column is left here; write its four
- ; bytes with movd if ecx is non-zero.
- test ecx, ecx
- jz short .nextrow
- movd XMM_DWORD [edi], xmmA
- %endif
- alignx 16, 7
- .nextrow:
- pop ecx  ; restore the row-level state saved at .rowloop (reverse push order)
- pop esi
- pop ebx
- pop edx
- pop edi
- pop eax
- add esi, byte SIZEOF_JSAMPROW  ; step each row-pointer array to its next entry
- add ebx, byte SIZEOF_JSAMPROW
- add edx, byte SIZEOF_JSAMPROW
- add edi, byte SIZEOF_JSAMPROW
- dec eax  ; one row done
- jg near .rowloop  ; loop while rows remain
- sfence  ; order the non-temporal (movntdq) stores before returning
- .return:
- pop edi  ; restore the registers saved in the prologue
- pop esi
- pop ebx
- mov esp, ebp  ; drop the scratch area; esp now addresses the slot holding the unaligned stack pointer
- pop esp  ; esp = value saved in the prologue (address of the caller's saved ebp)
- pop ebp  ; restore the caller's ebp
- ret
- align 32  ; pad to the next 32-byte boundary after the routine
|