123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404 |
- %include "jcolsamp.inc" ; external include; the size/type macros and the mmA..mmH aliases used below are not defined in this file (presumably supplied here - verify)
- %define out_width(b) (b) + 8 ; stack argument read as JDIMENSION; b is esp right after "push ebp", so b+0 = saved ebp, b+4 = return address, b+8 = first argument
- %define input_buf(b) (b) + 12 ; stack argument read as JSAMPIMAGE; the function indexes it as an array of three row-pointer arrays
- %define input_row(b) (b) + 16 ; stack argument read as JDIMENSION; index of the first row used in each input plane
- %define output_buf(b) (b) + 20 ; stack argument read as JSAMPARRAY; array of output row pointers
- %define num_rows(b) (b) + 24 ; stack argument read as INT; number of rows to convert
- %define original_ebp ebp + 0 ; slot at the aligned ebp where the prologue stores the pre-alignment frame base
- %define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD ; address of MMWORD-sized scratch slot i, laid out just below the aligned ebp
- ; WK_NUM is defined after wk(i) uses it; NASM expands a plain single-line define at the point of use, so the order is harmless.
- %define WK_NUM 2 ; number of scratch slots; the function uses wk(0) and wk(1)
- %define gotptr wk(0) - SIZEOF_POINTER ; pointer-sized slot directly below wk(0); holds the value saved by movpic in the prologue
- align 32 ; start the function on a 32-byte boundary
- GLOBAL_FUNCTION(jsimd_ycc_rgb_convert_mmx) ; symbol-export macro (defined outside this file)
- EXTN(jsimd_ycc_rgb_convert_mmx): ; Converts num_rows rows from three separate sample planes (Y, Cb, Cr per the function name) into interleaved RGB pixels using MMX. Takes five 32-bit stack arguments (out_width, input_buf, input_row, output_buf, num_rows); no return value is set up.
- push ebp ; save the caller's ebp
- mov eax, esp ; eax = unaligned frame base: [eax] = saved ebp, [eax+4] = return address, [eax+8...] = arguments
- sub esp, byte 4 ; guarantee one free dword below the saved ebp even if the alignment below moves nothing
- and esp, byte (-SIZEOF_MMWORD) ; round esp down to an MMWORD boundary so the wk() slots are aligned
- mov [esp], eax ; store the unaligned frame base in the original_ebp slot
- mov ebp, esp ; ebp = aligned frame pointer
- lea esp, [wk(0)] ; reserve WK_NUM MMWORD scratch slots below ebp
- pushpic eax ; macro defined elsewhere; presumably reserves the gotptr slot in PIC builds - verify
- push ebx ; ebx, esi and edi are restored at .return; ecx and edx are used without being saved
- push esi
- push edi
- get_GOT ebx ; macro defined elsewhere; presumably loads the GOT base into ebx in PIC builds - verify
- movpic POINTER [gotptr], ebx ; keep that value in the frame because ebx is reused as a plane pointer below
- mov ecx, JDIMENSION [out_width(eax)] ; ecx = out_width
- test ecx, ecx
- jz near .return ; zero width: nothing to do (no MMX instruction has run yet, so skipping emms is fine)
- push ecx ; park out_width while ecx temporarily holds input_row
- mov edi, JSAMPIMAGE [input_buf(eax)] ; edi = input_buf
- mov ecx, JDIMENSION [input_row(eax)] ; ecx = input_row
- mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] ; esi = input_buf[0]
- mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] ; ebx = input_buf[1]
- mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] ; edx = input_buf[2]
- lea esi, [esi+ecx*SIZEOF_JSAMPROW] ; esi = &input_buf[0][input_row]
- lea ebx, [ebx+ecx*SIZEOF_JSAMPROW] ; ebx = &input_buf[1][input_row]
- lea edx, [edx+ecx*SIZEOF_JSAMPROW] ; edx = &input_buf[2][input_row]
- pop ecx ; ecx = out_width again
- mov edi, JSAMPARRAY [output_buf(eax)] ; edi = output_buf
- mov eax, INT [num_rows(eax)] ; eax = num_rows; the argument base is not needed after this
- test eax, eax
- jle near .return ; zero or negative row count: nothing to do
- alignx 16, 7 ; alignment macro defined elsewhere
- .rowloop: ; per row: eax = rows left, esi/ebx/edx = current entry in each plane's row-pointer array, edi = current entry in output_buf, ecx = out_width
- push eax ; save the per-row state; it is restored in reverse order at .nextrow
- push edi
- push edx
- push ebx
- push esi
- push ecx
- mov esi, JSAMPROW [esi] ; esi = plane 0 row pointer (Y, going by the function name)
- mov ebx, JSAMPROW [ebx] ; ebx = plane 1 row pointer (Cb)
- mov edx, JSAMPROW [edx] ; edx = plane 2 row pointer (Cr)
- mov edi, JSAMPROW [edi] ; edi = output row pointer
- movpic eax, POINTER [gotptr] ; eax = saved GOT value, used as the base in the GOTOFF() operands below
- alignx 16, 7
- .columnloop: ; per iteration: one MMWORD of samples is read from each plane; ecx = columns still to write
- movq mm5, MMWORD [ebx] ; mm5 = one MMWORD of plane 1 samples
- movq mm1, MMWORD [edx] ; mm1 = one MMWORD of plane 2 samples
- pcmpeqw mm4, mm4 ; mm4 = all ones
- pcmpeqw mm7, mm7 ; mm7 = all ones
- psrlw mm4, BYTE_BIT ; mm4 = low-byte mask in each word (0x00FF if BYTE_BIT is 8)
- psllw mm7, 7 ; mm7 = 0xFF80 in each word, i.e. -128 as a signed word
- movq mm0, mm4 ; mm0 = second copy of the low-byte mask
- pand mm4, mm5 ; mm4 = even-indexed plane 1 samples, zero-extended to words
- psrlw mm5, BYTE_BIT ; mm5 = odd-indexed plane 1 samples as words
- pand mm0, mm1 ; mm0 = even-indexed plane 2 samples as words
- psrlw mm1, BYTE_BIT ; mm1 = odd-indexed plane 2 samples as words
- paddw mm4, mm7 ; subtract 128 from every chroma word (centre on zero)
- paddw mm5, mm7
- paddw mm0, mm7
- paddw mm1, mm7
- ; Conversion computed below, with Cb and Cr already centred on zero.
- ; NOTE(review): coefficients are inferred from the names of the external constants
- ; (PW_MF0228, PW_F0402, PW_MF0344_F0285); verify against the constant table.
- ;   R = Y + Cr + 0.402 * Cr                  (presumably 1.402 * Cr)
- ;   G = Y - Cr - 0.344 * Cb + 0.285 * Cr     (presumably -0.344 * Cb - 0.714 * Cr)
- ;   B = Y + Cb + Cb - 0.228 * Cb             (presumably 1.772 * Cb)
- ; Adding whole multiples of Cb or Cr separately leaves only a small fractional
- ; coefficient for each multiply.
- ; Even and odd samples are processed in separate registers throughout.
- movq mm2, mm4 ; mm2 = Cb even (kept for the later additions)
- movq mm3, mm5 ; mm3 = Cb odd
- paddw mm4, mm4 ; mm4 = 2 * Cb even
- paddw mm5, mm5 ; mm5 = 2 * Cb odd
- movq mm6, mm0 ; mm6 = Cr even
- movq mm7, mm1 ; mm7 = Cr odd
- paddw mm0, mm0 ; mm0 = 2 * Cr even
- paddw mm1, mm1 ; mm1 = 2 * Cr odd
- pmulhw mm4, [GOTOFF(eax,PW_MF0228)] ; high word of (2 * Cb even) * constant
- pmulhw mm5, [GOTOFF(eax,PW_MF0228)] ; same for Cb odd
- pmulhw mm0, [GOTOFF(eax,PW_F0402)] ; high word of (2 * Cr even) * constant
- pmulhw mm1, [GOTOFF(eax,PW_F0402)] ; same for Cr odd
- paddw mm4, [GOTOFF(eax,PW_ONE)] ; add PW_ONE then shift right by one: halve the doubled product (rounded, assuming PW_ONE is 1 per word)
- paddw mm5, [GOTOFF(eax,PW_ONE)]
- psraw mm4, 1
- psraw mm5, 1
- paddw mm0, [GOTOFF(eax,PW_ONE)]
- paddw mm1, [GOTOFF(eax,PW_ONE)]
- psraw mm0, 1
- psraw mm1, 1
- paddw mm4, mm2 ; add Cb twice to the scaled term
- paddw mm5, mm3
- paddw mm4, mm2 ; mm4 = (B - Y) even
- paddw mm5, mm3 ; mm5 = (B - Y) odd
- paddw mm0, mm6 ; mm0 = (R - Y) even: scaled term plus Cr
- paddw mm1, mm7 ; mm1 = (R - Y) odd
- movq MMWORD [wk(0)], mm4 ; spill (B - Y) even; mm4 is needed as scratch
- movq MMWORD [wk(1)], mm5 ; spill (B - Y) odd
- movq mm4, mm2 ; second copy of Cb even for the high-half interleave
- movq mm5, mm3 ; second copy of Cb odd
- punpcklwd mm2, mm6 ; mm2 = (Cb, Cr) word pairs, low half of the even samples
- punpckhwd mm4, mm6 ; mm4 = (Cb, Cr) word pairs, high half of the even samples
- pmaddwd mm2, [GOTOFF(eax,PW_MF0344_F0285)] ; per pair: Cb * first constant + Cr * second constant, as dwords
- pmaddwd mm4, [GOTOFF(eax,PW_MF0344_F0285)]
- punpcklwd mm3, mm7 ; same pairing for the odd samples
- punpckhwd mm5, mm7
- pmaddwd mm3, [GOTOFF(eax,PW_MF0344_F0285)]
- pmaddwd mm5, [GOTOFF(eax,PW_MF0344_F0285)]
- paddd mm2, [GOTOFF(eax,PD_ONEHALF)] ; add PD_ONEHALF then shift by SCALEBITS: scale back from fixed point (rounding, going by the constant name)
- paddd mm4, [GOTOFF(eax,PD_ONEHALF)]
- psrad mm2, SCALEBITS
- psrad mm4, SCALEBITS
- paddd mm3, [GOTOFF(eax,PD_ONEHALF)]
- paddd mm5, [GOTOFF(eax,PD_ONEHALF)]
- psrad mm3, SCALEBITS
- psrad mm5, SCALEBITS
- packssdw mm2, mm4 ; back to four words: weighted Cb/Cr sum, even samples
- packssdw mm3, mm5 ; weighted Cb/Cr sum, odd samples
- psubw mm2, mm6 ; mm2 = (G - Y) even: subtract Cr
- psubw mm3, mm7 ; mm3 = (G - Y) odd
- movq mm5, MMWORD [esi] ; mm5 = one MMWORD of plane 0 (Y) samples
- pcmpeqw mm4, mm4 ; rebuild the low-byte mask
- psrlw mm4, BYTE_BIT
- pand mm4, mm5 ; mm4 = Y even as words
- psrlw mm5, BYTE_BIT ; mm5 = Y odd as words
- paddw mm0, mm4 ; mm0 = R even
- paddw mm1, mm5 ; mm1 = R odd
- packuswb mm0, mm0 ; clamp to unsigned bytes; the four results end up in the low dword
- packuswb mm1, mm1
- paddw mm2, mm4 ; mm2 = G even
- paddw mm3, mm5 ; mm3 = G odd
- packuswb mm2, mm2
- packuswb mm3, mm3
- paddw mm4, MMWORD [wk(0)] ; mm4 = Y even + (B - Y) even = B even
- paddw mm5, MMWORD [wk(1)] ; mm5 = B odd
- packuswb mm4, mm4
- packuswb mm5, mm5
- %if RGB_PIXELSIZE == 3 ; three bytes per output pixel
- ; mmA..mmH are register aliases that are not defined in this file.
- ; NOTE(review): assumed layout, xy = output byte x of pixel y - verify against the include:
- ;   mmA = (00 02 04 06 ...)  mmB = (01 03 05 07 ...)  mmC = (10 12 14 16 ...)
- ;   mmD = (11 13 15 17 ...)  mmE = (20 22 24 26 ...)  mmF = (21 23 25 27 ...)  mmG, mmH = scratch
- punpcklbw mmA, mmC ; mmA = (00 10 02 12 04 14 06 16)
- punpcklbw mmE, mmB ; mmE = (20 01 22 03 24 05 26 07)
- punpcklbw mmD, mmF ; mmD = (11 21 13 23 15 25 17 27)
- movq mmG, mmA
- movq mmH, mmA
- punpcklwd mmA, mmE ; mmA = (00 10 20 01 02 12 22 03)
- punpckhwd mmG, mmE ; mmG = (04 14 24 05 06 16 26 07)
- psrlq mmH, 2*BYTE_BIT ; mmH = (02 12 04 14 06 16 -- --)
- psrlq mmE, 2*BYTE_BIT ; mmE = (22 03 24 05 26 07 -- --)
- movq mmC, mmD
- movq mmB, mmD
- punpcklwd mmD, mmH ; mmD = (11 21 02 12 13 23 04 14)
- punpckhwd mmC, mmH ; mmC = (15 25 06 16 17 27 -- --)
- psrlq mmB, 2*BYTE_BIT ; mmB = (13 23 15 25 17 27 -- --)
- movq mmF, mmE
- punpcklwd mmE, mmB ; mmE = (22 03 13 23 24 05 15 25)
- punpckhwd mmF, mmB ; mmF = (26 07 17 27 -- -- -- --)
- punpckldq mmA, mmD ; mmA = (00 10 20 01 11 21 02 12): first output MMWORD
- punpckldq mmE, mmG ; mmE = (22 03 13 23 04 14 24 05): second output MMWORD
- punpckldq mmC, mmF ; mmC = (15 25 06 16 26 07 17 27): third output MMWORD
- cmp ecx, byte SIZEOF_MMWORD
- jb short .column_st16 ; fewer columns left than one full group: store only the part that is needed
- movq MMWORD [edi+0*SIZEOF_MMWORD], mmA ; full group: write all three MMWORDs
- movq MMWORD [edi+1*SIZEOF_MMWORD], mmE
- movq MMWORD [edi+2*SIZEOF_MMWORD], mmC
- sub ecx, byte SIZEOF_MMWORD ; one group of columns done
- jz short .nextrow ; row finished exactly on a group boundary
- add esi, byte SIZEOF_MMWORD ; advance the three input pointers by one MMWORD
- add ebx, byte SIZEOF_MMWORD
- add edx, byte SIZEOF_MMWORD
- add edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; advance the output pointer by the bytes just written
- jmp near .columnloop
- alignx 16, 7
- .column_st16: ; partial tail: write exactly the remaining bytes in progressively smaller pieces
- lea ecx, [ecx+ecx*2] ; ecx = remaining columns * 3 = remaining output bytes
- cmp ecx, byte 2*SIZEOF_MMWORD
- jb short .column_st8
- movq MMWORD [edi+0*SIZEOF_MMWORD], mmA ; at least two MMWORDs remain: store them
- movq MMWORD [edi+1*SIZEOF_MMWORD], mmE
- movq mmA, mmC ; the remainder is taken from the third MMWORD
- sub ecx, byte 2*SIZEOF_MMWORD
- add edi, byte 2*SIZEOF_MMWORD
- jmp short .column_st4 ; less than one MMWORD can be left now, so the single-MMWORD step is skipped
- .column_st8:
- cmp ecx, byte SIZEOF_MMWORD
- jb short .column_st4
- movq MMWORD [edi+0*SIZEOF_MMWORD], mmA ; at least one MMWORD remains: store it
- movq mmA, mmE ; the remainder is taken from the second MMWORD
- sub ecx, byte SIZEOF_MMWORD
- add edi, byte SIZEOF_MMWORD
- .column_st4:
- movd eax, mmA ; eax = low dword of the pending data; eax is free as scratch here because it is reloaded by movpic at the top of every row
- cmp ecx, byte SIZEOF_DWORD
- jb short .column_st2
- mov dword [edi+0*SIZEOF_DWORD], eax ; store one dword
- psrlq mmA, DWORD_BIT ; bring the high dword down
- movd eax, mmA
- sub ecx, byte SIZEOF_DWORD
- add edi, byte SIZEOF_DWORD
- .column_st2:
- cmp ecx, byte SIZEOF_WORD
- jb short .column_st1
- mov word [edi+0*SIZEOF_WORD], ax ; store one word
- shr eax, WORD_BIT ; bring the next pending byte into al
- sub ecx, byte SIZEOF_WORD
- add edi, byte SIZEOF_WORD
- .column_st1:
- cmp ecx, byte SIZEOF_BYTE
- jb short .nextrow
- mov byte [edi+0*SIZEOF_BYTE], al ; store the final byte
- %else ; any other RGB_PIXELSIZE; the stores below write four bytes per pixel
- %ifdef RGBX_FILLER_0XFF
- pcmpeqb mm6, mm6 ; filler byte = 0xFF (even pixels)
- pcmpeqb mm7, mm7 ; filler byte = 0xFF (odd pixels)
- %else
- pxor mm6, mm6 ; filler byte = 0x00 (even pixels)
- pxor mm7, mm7 ; filler byte = 0x00 (odd pixels)
- %endif
- ; mmA..mmH are register aliases that are not defined in this file.
- ; NOTE(review): assumed layout, xy = output byte x of pixel y - verify against the include:
- ;   mmA/mmB = byte 0 (even/odd pixels), mmC/mmD = byte 1, mmE/mmF = byte 2, mmG/mmH = byte 3
- ;   each register holds its four samples in the low dword.
- punpcklbw mmA, mmC ; mmA = (00 10 02 12 04 14 06 16)
- punpcklbw mmE, mmG ; mmE = (20 30 22 32 24 34 26 36)
- punpcklbw mmB, mmD ; mmB = (01 11 03 13 05 15 07 17)
- punpcklbw mmF, mmH ; mmF = (21 31 23 33 25 35 27 37)
- movq mmC, mmA
- punpcklwd mmA, mmE ; mmA = (00 10 20 30 02 12 22 32)
- punpckhwd mmC, mmE ; mmC = (04 14 24 34 06 16 26 36)
- movq mmG, mmB
- punpcklwd mmB, mmF ; mmB = (01 11 21 31 03 13 23 33)
- punpckhwd mmG, mmF ; mmG = (05 15 25 35 07 17 27 37)
- movq mmD, mmA
- punpckldq mmA, mmB ; mmA = pixels 0 and 1
- punpckhdq mmD, mmB ; mmD = pixels 2 and 3
- movq mmH, mmC
- punpckldq mmC, mmG ; mmC = pixels 4 and 5
- punpckhdq mmH, mmG ; mmH = pixels 6 and 7
- cmp ecx, byte SIZEOF_MMWORD
- jb short .column_st16 ; fewer columns left than one full group: store only the part that is needed
- movq MMWORD [edi+0*SIZEOF_MMWORD], mmA ; full group: write all four MMWORDs
- movq MMWORD [edi+1*SIZEOF_MMWORD], mmD
- movq MMWORD [edi+2*SIZEOF_MMWORD], mmC
- movq MMWORD [edi+3*SIZEOF_MMWORD], mmH
- sub ecx, byte SIZEOF_MMWORD ; one group of columns done
- jz short .nextrow ; row finished exactly on a group boundary
- add esi, byte SIZEOF_MMWORD ; advance the three input pointers by one MMWORD
- add ebx, byte SIZEOF_MMWORD
- add edx, byte SIZEOF_MMWORD
- add edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; advance the output pointer by the bytes just written
- jmp near .columnloop
- alignx 16, 7
- .column_st16: ; partial tail: ecx still counts columns (pixels) here, not bytes
- cmp ecx, byte SIZEOF_MMWORD/2
- jb short .column_st8
- movq MMWORD [edi+0*SIZEOF_MMWORD], mmA ; at least half a group remains: store two MMWORDs
- movq MMWORD [edi+1*SIZEOF_MMWORD], mmD
- movq mmA, mmC ; shift the remaining two MMWORDs into the first positions
- movq mmD, mmH
- sub ecx, byte SIZEOF_MMWORD/2
- add edi, byte 2*SIZEOF_MMWORD
- .column_st8:
- cmp ecx, byte SIZEOF_MMWORD/4
- jb short .column_st4
- movq MMWORD [edi+0*SIZEOF_MMWORD], mmA ; at least a quarter group remains: store one MMWORD
- movq mmA, mmD
- sub ecx, byte SIZEOF_MMWORD/4
- add edi, byte 1*SIZEOF_MMWORD
- .column_st4:
- cmp ecx, byte SIZEOF_MMWORD/8
- jb short .nextrow
- movd dword [edi+0*SIZEOF_DWORD], mmA ; store the final dword (one pixel)
- %endif
- alignx 16, 7 ; NOTE(review): reached by fall-through from the last store above, so any padding emitted here gets executed - confirm the macro emits harmless filler
- .nextrow:
- pop ecx ; restore the per-row state saved at .rowloop (ecx = out_width)
- pop esi
- pop ebx
- pop edx
- pop edi
- pop eax ; eax = rows left
- add esi, byte SIZEOF_JSAMPROW ; step to the next entry in each plane's row-pointer array
- add ebx, byte SIZEOF_JSAMPROW
- add edx, byte SIZEOF_JSAMPROW
- add edi, byte SIZEOF_JSAMPROW ; and to the next output row pointer
- dec eax
- jg near .rowloop ; loop while rows remain
- emms ; leave MMX state so the x87 FPU registers are usable again
- .return: ; also the target of the early exits above
- pop edi ; restore the registers saved in the prologue
- pop esi
- pop ebx
- mov esp, ebp ; drop the gotptr and wk() slots; esp now points at the original_ebp slot
- pop esp ; esp = the unaligned frame base stored there, which points at the saved ebp
- pop ebp ; restore the caller's ebp
- ret
- align 32 ; pad so that whatever follows starts on a 32-byte boundary
|