|
- %include "jcolsamp.inc" ; not visible here; presumably supplies the mmA..mmH register aliases, RGB_PIXELSIZE and the SIZEOF_*/type macros used below
- %define img_width(b) (b) + 8 ; arg 1 (read as JDIMENSION): pixels per row; b = frame base where [b] = saved ebp and [b+4] = return address
- %define input_buf(b) (b) + 12 ; arg 2 (read as JSAMPARRAY): array of input row pointers
- %define output_buf(b) (b) + 16 ; arg 3 (read as JSAMPIMAGE): array of three per-plane row-pointer arrays
- %define output_row(b) (b) + 20 ; arg 4 (read as JDIMENSION): index of the first output row to write
- %define num_rows(b) (b) + 24 ; arg 5 (read as INT): number of rows to convert
- %define original_ebp ebp + 0 ; slot at the aligned ebp; the prologue stores the pre-alignment stack pointer here
- %define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD ; address of MMWORD scratch slot i; the slots sit directly below the aligned ebp, wk(0) lowest
-
- %define WK_NUM 8 ; number of scratch slots; the routine uses wk(0)..wk(7)
- %define gotptr wk(0) - SIZEOF_POINTER ; pointer-sized slot just below wk(0); the prologue stores the get_GOT result here via movpic
- align 32 ; jsimd_rgb_ycc_convert_mmx(img_width, input_buf, output_buf, output_row, num_rows) - 32-bit x86, all five arguments read from the stack
- GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_mmx) ; converts num_rows rows of packed RGB_PIXELSIZE-byte pixels into three separate sample planes (presumably Y, Cb, Cr going by the name), SIZEOF_MMWORD pixels per inner iteration
- EXTN(jsimd_rgb_ycc_convert_mmx): ; preserves ebx/esi/edi/ebp; clobbers eax/ecx/edx and the MMX registers (emms is issued once rows have been processed)
- push ebp
- mov eax, esp ; eax = frame base: [eax] = saved ebp, [eax+4] = return address, arguments from [eax+8]
- sub esp, byte 4 ; room for the saved frame base
- and esp, byte (-SIZEOF_MMWORD) ; round esp down to an MMWORD boundary so the wk() slots are aligned
- mov [esp], eax ; keep the unaligned frame base for the epilogue
- mov ebp, esp ; ebp = aligned frame pointer that wk() and gotptr are relative to
- lea esp, [wk(0)] ; reserve the WK_NUM scratch slots
- pushpic eax ; macro from outside this chunk; presumably reserves the gotptr slot in PIC builds
- push ebx ; ebx/esi/edi are restored at .return
- push esi
- push edi
- get_GOT ebx ; macro from outside this chunk; presumably loads the GOT base for PIC builds
- movpic POINTER [gotptr], ebx ; park it on the stack because ebx is reused as an output pointer
- mov ecx, JDIMENSION [img_width(eax)] ; ecx = pixels per row
- test ecx, ecx
- jz near .return ; zero width: nothing to do (no MMX instruction has run yet, so bypassing emms is harmless)
- push ecx ; free ecx for use as a temporary
- mov esi, JSAMPIMAGE [output_buf(eax)]
- mov ecx, JDIMENSION [output_row(eax)]
- mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY] ; edi = row-pointer array of plane 0
- mov ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY] ; ebx = row-pointer array of plane 1
- mov edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY] ; edx = row-pointer array of plane 2
- lea edi, [edi+ecx*SIZEOF_JSAMPROW] ; step each plane to entry output_row
- lea ebx, [ebx+ecx*SIZEOF_JSAMPROW]
- lea edx, [edx+ecx*SIZEOF_JSAMPROW]
- pop ecx ; ecx = pixels per row again
- mov esi, JSAMPARRAY [input_buf(eax)] ; esi = input row-pointer array
- mov eax, INT [num_rows(eax)] ; eax = rows to do (the frame base is not needed after this)
- test eax, eax
- jle near .return ; signed test: a zero or negative row count does nothing
- alignx 16, 7
- .rowloop: ; per row: esi/edi/ebx/edx -> current row-pointer entries, ecx = width, eax = rows left
- pushpic eax ; save everything the row body overwrites; all of it is restored before the next row
- push edx
- push ebx
- push edi
- push esi
- push ecx
- mov esi, JSAMPROW [esi] ; esi = input pixels of this row
- mov edi, JSAMPROW [edi] ; edi = plane 0 output samples
- mov ebx, JSAMPROW [ebx] ; ebx = plane 1 output samples
- mov edx, JSAMPROW [edx] ; edx = plane 2 output samples
- movpic eax, POINTER [gotptr] ; eax = saved GOT address, the base register of the GOTOFF() operands
- cmp ecx, byte SIZEOF_MMWORD
- jae short .columnloop ; a full block of SIZEOF_MMWORD pixels is available; otherwise fall into the partial load
- alignx 16, 7
- %if RGB_PIXELSIZE == 3 ; 3 bytes per pixel
- .column_ld1: ; partial block (fewer than SIZEOF_MMWORD pixels): fetch only the bytes of the remaining pixels, in byte/word/dword/MMWORD pieces selected by the bits of the byte count
- push eax ; eax and edx serve as scratch for the first pieces
- push edx
- lea ecx, [ecx+ecx*2] ; ecx = remaining pixels * 3 = remaining bytes
- test cl, SIZEOF_BYTE
- jz short .column_ld2
- sub ecx, byte SIZEOF_BYTE
- xor eax, eax
- mov al, byte [esi+ecx] ; eax = last remaining byte, zero-extended
- .column_ld2:
- test cl, SIZEOF_WORD
- jz short .column_ld4
- sub ecx, byte SIZEOF_WORD
- xor edx, edx
- mov dx, word [esi+ecx]
- shl eax, WORD_BIT ; move what was gathered so far above the word just read, keeping memory order
- or eax, edx
- .column_ld4:
- movd mmA, eax ; positions beyond the remaining pixels hold whatever eax contained; they only feed output samples past the row's width
- pop edx
- pop eax
- test cl, SIZEOF_DWORD
- jz short .column_ld8
- sub ecx, byte SIZEOF_DWORD
- movd mmG, dword [esi+ecx]
- psllq mmA, DWORD_BIT ; same splice as above, now with a dword
- por mmA, mmG
- .column_ld8:
- test cl, SIZEOF_MMWORD
- jz short .column_ld16
- movq mmG, mmA ; the gathered tail becomes the second MMWORD
- movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] ; first MMWORD straight from the row
- mov ecx, SIZEOF_MMWORD ; count this as one final full block so the loop tail sees zero afterwards
- jmp short .rgb_ycc_cnv
- .column_ld16:
- test cl, 2*SIZEOF_MMWORD
- mov ecx, SIZEOF_MMWORD ; mov does not touch the flags, so the jz below still acts on the test
- jz short .rgb_ycc_cnv ; everything fetched already sits in mmA
- movq mmF, mmA ; the gathered tail becomes the third MMWORD
- movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
- movq mmG, MMWORD [esi+1*SIZEOF_MMWORD]
- jmp short .rgb_ycc_cnv
- alignx 16, 7
- .columnloop: ; full block: 3 MMWORDs of packed pixel data
- movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
- movq mmG, MMWORD [esi+1*SIZEOF_MMWORD]
- movq mmF, MMWORD [esi+2*SIZEOF_MMWORD]
- .rgb_ycc_cnv:
- ; Layouts list the lowest byte first; "cp" = byte c of pixel p (assuming BYTE_BIT is 8). mmA=(00 10 20 01 11 21 02 12)
- ; mmG=(22 03 13 23 04 14 24 05)
- ; mmF=(15 25 06 16 26 07 17 27)
- movq mmD, mmA
- psllq mmA, 4*BYTE_BIT ; mmA=(-- -- -- -- 00 10 20 01)
- psrlq mmD, 4*BYTE_BIT ; mmD=(11 21 02 12 -- -- -- --)
- punpckhbw mmA, mmG ; mmA=(00 04 10 14 20 24 01 05)
- psllq mmG, 4*BYTE_BIT ; mmG=(-- -- -- -- 22 03 13 23)
- punpcklbw mmD, mmF ; mmD=(11 15 21 25 02 06 12 16)
- punpckhbw mmG, mmF ; mmG=(22 26 03 07 13 17 23 27)
- movq mmE, mmA
- psllq mmA, 4*BYTE_BIT ; mmA=(-- -- -- -- 00 04 10 14)
- psrlq mmE, 4*BYTE_BIT ; mmE=(20 24 01 05 -- -- -- --)
- punpckhbw mmA, mmD ; mmA=(00 02 04 06 10 12 14 16)
- psllq mmD, 4*BYTE_BIT ; mmD=(-- -- -- -- 11 15 21 25)
- punpcklbw mmE, mmG ; mmE=(20 22 24 26 01 03 05 07)
- punpckhbw mmD, mmG ; mmD=(11 13 15 17 21 23 25 27)
- pxor mmH, mmH ; zero, used to widen bytes to words
- movq mmC, mmA
- punpcklbw mmA, mmH ; mmA=(00 02 04 06) as words
- punpckhbw mmC, mmH ; mmC=(10 12 14 16)
- movq mmB, mmE
- punpcklbw mmE, mmH ; mmE=(20 22 24 26)
- punpckhbw mmB, mmH ; mmB=(01 03 05 07)
- movq mmF, mmD
- punpcklbw mmD, mmH ; mmD=(11 13 15 17)
- punpckhbw mmF, mmH ; mmF=(21 23 25 27)
- %else ; otherwise 4 bytes per pixel (a full block below is 4 MMWORDs)
- .column_ld1: ; partial block: fetch 1, 2 and/or 4 pixels according to the bits of the remaining pixel count (assuming SIZEOF_MMWORD is 8)
- test cl, SIZEOF_MMWORD/8
- jz short .column_ld2
- sub ecx, byte SIZEOF_MMWORD/8
- movd mmA, dword [esi+ecx*RGB_PIXELSIZE] ; last remaining pixel
- .column_ld2:
- test cl, SIZEOF_MMWORD/4
- jz short .column_ld4
- sub ecx, byte SIZEOF_MMWORD/4
- movq mmF, mmA ; what was fetched so far moves up one register
- movq mmA, MMWORD [esi+ecx*RGB_PIXELSIZE] ; the next two pixels down
- .column_ld4:
- test cl, SIZEOF_MMWORD/2
- mov ecx, SIZEOF_MMWORD ; count this as one final full block; mov leaves the flags of the test intact
- jz short .rgb_ycc_cnv
- movq mmD, mmA ; the gathered tail becomes MMWORDs 3 and 4
- movq mmC, mmF
- movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] ; the first four pixels straight from the row
- movq mmF, MMWORD [esi+1*SIZEOF_MMWORD]
- jmp short .rgb_ycc_cnv
- alignx 16, 7
- .columnloop: ; full block: 4 MMWORDs of packed pixel data
- movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
- movq mmF, MMWORD [esi+1*SIZEOF_MMWORD]
- movq mmD, MMWORD [esi+2*SIZEOF_MMWORD]
- movq mmC, MMWORD [esi+3*SIZEOF_MMWORD]
- .rgb_ycc_cnv:
- ; Layouts list the lowest byte first; "cp" = byte c of pixel p. mmA=(00 10 20 30 01 11 21 31)
- ; mmF=(02 12 22 32 03 13 23 33)
- ; mmD=(04 14 24 34 05 15 25 35)
- ; mmC=(06 16 26 36 07 17 27 37)
- movq mmB, mmA
- punpcklbw mmA, mmF ; mmA=(00 02 10 12 20 22 30 32)
- punpckhbw mmB, mmF ; mmB=(01 03 11 13 21 23 31 33)
- movq mmG, mmD
- punpcklbw mmD, mmC ; mmD=(04 06 14 16 24 26 34 36)
- punpckhbw mmG, mmC ; mmG=(05 07 15 17 25 27 35 37)
- movq mmE, mmA
- punpcklwd mmA, mmD ; mmA=(00 02 04 06 10 12 14 16)
- punpckhwd mmE, mmD ; mmE=(20 22 24 26 30 32 34 36)
- movq mmH, mmB
- punpcklwd mmB, mmG ; mmB=(01 03 05 07 11 13 15 17)
- punpckhwd mmH, mmG ; mmH=(21 23 25 27 31 33 35 37)
- pxor mmF, mmF ; zero, used to widen bytes to words
- movq mmC, mmA
- punpcklbw mmA, mmF ; mmA=(00 02 04 06) as words
- punpckhbw mmC, mmF ; mmC=(10 12 14 16)
- movq mmD, mmB
- punpcklbw mmB, mmF ; mmB=(01 03 05 07)
- punpckhbw mmD, mmF ; mmD=(11 13 15 17)
- movq mmG, mmE
- punpcklbw mmE, mmF ; mmE=(20 22 24 26)
- punpckhbw mmG, mmF ; mmG=(30 32 34 36)
- punpcklbw mmF, mmH ; mmF is zero here, so the data bytes land in the high byte of each word ...
- punpckhbw mmH, mmH
- psrlw mmF, BYTE_BIT ; ... and the logical shift zero-extends them: mmF=(21 23 25 27)
- psrlw mmH, BYTE_BIT ; mmH=(31 33 35 37)
- %endif ; both variants end with bytes 0/1/2 of the even pixels in mmA/mmC/mmE and of the odd pixels in mmB/mmD/mmF
- ; From here on the code names mm0..mm5 directly; how the mmA..mmH aliases map onto them is defined outside this chunk.
- ; NOTE(review): the comments below presume mm0/mm2/mm4 = R/G/B of the even pixels (RE/GE/BE) and
- ; mm1/mm3/mm5 = R/G/B of the odd pixels (RO/GO/BO), four zero-extended words each - confirm against jcolsamp.inc.
- ; Suffix L/H = low/high two words of such a register, widened to dwords.
- ; Coefficients are quoted from the constants' names (PW_F0299_F0337 etc.); their values are defined elsewhere, so read them as presumed:
- ; Y = 0.299 * R + 0.337 * G + 0.114 * B + 0.250 * G -> plane 0 ([edi])
- ; Cb = -0.16 * R - 0.33 * G + 0.500 * B + bias -> plane 1 ([ebx])
- ; Cr = 0.500 * R - 0.41 * G - 0.08 * B + bias -> plane 2 ([edx])
- ; The 0.500 terms are built without a multiply: the sample is placed in the high word of a dword and shifted right by 1
- ; (that is sample * 2^15, which is 0.5 in this fixed-point format only if SCALEBITS is 16 - TODO confirm).
- ; Even and odd results are finally merged bytewise (odd << BYTE_BIT | even) to restore pixel order before each store.
- movq MMWORD [wk(0)], mm0 ; wk(0)=RE
- movq MMWORD [wk(1)], mm1 ; wk(1)=RO
- movq MMWORD [wk(2)], mm4 ; wk(2)=BE
- movq MMWORD [wk(3)], mm5 ; wk(3)=BO
- movq mm6, mm1
- punpcklwd mm1, mm3 ; mm1=interleaved (RO,GO) words, low half
- punpckhwd mm6, mm3 ; mm6=interleaved (RO,GO) words, high half
- movq mm7, mm1
- movq mm4, mm6
- pmaddwd mm1, [GOTOFF(eax,PW_F0299_F0337)] ; mm1=ROL*0.299+GOL*0.337
- pmaddwd mm6, [GOTOFF(eax,PW_F0299_F0337)] ; mm6=ROH*0.299+GOH*0.337
- pmaddwd mm7, [GOTOFF(eax,PW_MF016_MF033)] ; mm7=ROL*-0.16+GOL*-0.33
- pmaddwd mm4, [GOTOFF(eax,PW_MF016_MF033)] ; mm4=ROH*-0.16+GOH*-0.33
- movq MMWORD [wk(4)], mm1 ; wk(4)=partial Y, odd low (completed in the Y section further down)
- movq MMWORD [wk(5)], mm6 ; wk(5)=partial Y, odd high
- pxor mm1, mm1
- pxor mm6, mm6
- punpcklwd mm1, mm5 ; BO words land in the high half of each dword
- punpckhwd mm6, mm5
- psrld mm1, 1 ; mm1=BOL*0.500
- psrld mm6, 1 ; mm6=BOH*0.500
- movq mm5, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; presumably rounding term plus chroma centre offset (value defined elsewhere)
- paddd mm7, mm1
- paddd mm4, mm6
- paddd mm7, mm5
- paddd mm4, mm5
- psrld mm7, SCALEBITS ; mm7=CbOL
- psrld mm4, SCALEBITS ; mm4=CbOH
- packssdw mm7, mm4 ; mm7=CbO (four words)
- movq mm1, MMWORD [wk(2)] ; mm1=BE
- movq mm6, mm0
- punpcklwd mm0, mm2 ; same steps for the even pixels: interleave (RE,GE)
- punpckhwd mm6, mm2
- movq mm5, mm0
- movq mm4, mm6
- pmaddwd mm0, [GOTOFF(eax,PW_F0299_F0337)] ; mm0=REL*0.299+GEL*0.337
- pmaddwd mm6, [GOTOFF(eax,PW_F0299_F0337)] ; mm6=REH*0.299+GEH*0.337
- pmaddwd mm5, [GOTOFF(eax,PW_MF016_MF033)] ; mm5=REL*-0.16+GEL*-0.33
- pmaddwd mm4, [GOTOFF(eax,PW_MF016_MF033)] ; mm4=REH*-0.16+GEH*-0.33
- movq MMWORD [wk(6)], mm0 ; wk(6)=partial Y, even low
- movq MMWORD [wk(7)], mm6 ; wk(7)=partial Y, even high
- pxor mm0, mm0
- pxor mm6, mm6
- punpcklwd mm0, mm1
- punpckhwd mm6, mm1
- psrld mm0, 1 ; mm0=BEL*0.500
- psrld mm6, 1 ; mm6=BEH*0.500
- movq mm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)]
- paddd mm5, mm0
- paddd mm4, mm6
- paddd mm5, mm1
- paddd mm4, mm1
- psrld mm5, SCALEBITS ; mm5=CbEL
- psrld mm4, SCALEBITS ; mm4=CbEH
- packssdw mm5, mm4 ; mm5=CbE
- psllw mm7, BYTE_BIT
- por mm5, mm7 ; mm5=Cb for the whole block, in pixel order
- movq MMWORD [ebx], mm5 ; store to plane 1; always a full MMWORD, even for a partial block - NOTE(review): output rows are presumably padded accordingly, confirm with the caller
- movq mm0, MMWORD [wk(3)] ; mm0=BO
- movq mm6, MMWORD [wk(2)] ; mm6=BE
- movq mm1, MMWORD [wk(1)] ; mm1=RO
- movq mm4, mm0
- punpcklwd mm0, mm3 ; interleave (BO,GO)
- punpckhwd mm4, mm3
- movq mm7, mm0
- movq mm5, mm4
- pmaddwd mm0, [GOTOFF(eax,PW_F0114_F0250)] ; mm0=BOL*0.114+GOL*0.250
- pmaddwd mm4, [GOTOFF(eax,PW_F0114_F0250)] ; mm4=BOH*0.114+GOH*0.250
- pmaddwd mm7, [GOTOFF(eax,PW_MF008_MF041)] ; mm7=BOL*-0.08+GOL*-0.41
- pmaddwd mm5, [GOTOFF(eax,PW_MF008_MF041)] ; mm5=BOH*-0.08+GOH*-0.41
- movq mm3, [GOTOFF(eax,PD_ONEHALF)] ; presumably the rounding term (value defined elsewhere); the old mm3 contents are not read again in this block
- paddd mm0, MMWORD [wk(4)] ; add the partial Y saved earlier
- paddd mm4, MMWORD [wk(5)]
- paddd mm0, mm3
- paddd mm4, mm3
- psrld mm0, SCALEBITS ; mm0=YOL
- psrld mm4, SCALEBITS ; mm4=YOH
- packssdw mm0, mm4 ; mm0=YO
- pxor mm3, mm3
- pxor mm4, mm4
- punpcklwd mm3, mm1
- punpckhwd mm4, mm1
- psrld mm3, 1 ; mm3=ROL*0.500
- psrld mm4, 1 ; mm4=ROH*0.500
- movq mm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)]
- paddd mm7, mm3
- paddd mm5, mm4
- paddd mm7, mm1
- paddd mm5, mm1
- psrld mm7, SCALEBITS ; mm7=CrOL
- psrld mm5, SCALEBITS ; mm5=CrOH
- packssdw mm7, mm5 ; mm7=CrO
- movq mm3, MMWORD [wk(0)] ; mm3=RE
- movq mm4, mm6
- punpcklwd mm6, mm2 ; interleave (BE,GE)
- punpckhwd mm4, mm2
- movq mm1, mm6
- movq mm5, mm4
- pmaddwd mm6, [GOTOFF(eax,PW_F0114_F0250)] ; mm6=BEL*0.114+GEL*0.250
- pmaddwd mm4, [GOTOFF(eax,PW_F0114_F0250)] ; mm4=BEH*0.114+GEH*0.250
- pmaddwd mm1, [GOTOFF(eax,PW_MF008_MF041)] ; mm1=BEL*-0.08+GEL*-0.41
- pmaddwd mm5, [GOTOFF(eax,PW_MF008_MF041)] ; mm5=BEH*-0.08+GEH*-0.41
- movq mm2, [GOTOFF(eax,PD_ONEHALF)]
- paddd mm6, MMWORD [wk(6)] ; add the partial Y saved earlier
- paddd mm4, MMWORD [wk(7)]
- paddd mm6, mm2
- paddd mm4, mm2
- psrld mm6, SCALEBITS ; mm6=YEL
- psrld mm4, SCALEBITS ; mm4=YEH
- packssdw mm6, mm4 ; mm6=YE
- psllw mm0, BYTE_BIT
- por mm6, mm0 ; mm6=Y in pixel order
- movq MMWORD [edi], mm6 ; store to plane 0
- pxor mm2, mm2
- pxor mm4, mm4
- punpcklwd mm2, mm3
- punpckhwd mm4, mm3
- psrld mm2, 1 ; mm2=REL*0.500
- psrld mm4, 1 ; mm4=REH*0.500
- movq mm0, [GOTOFF(eax,PD_ONEHALFM1_CJ)]
- paddd mm1, mm2
- paddd mm5, mm4
- paddd mm1, mm0
- paddd mm5, mm0
- psrld mm1, SCALEBITS ; mm1=CrEL
- psrld mm5, SCALEBITS ; mm5=CrEH
- packssdw mm1, mm5 ; mm1=CrE
- psllw mm7, BYTE_BIT
- por mm1, mm7 ; mm1=Cr in pixel order
- movq MMWORD [edx], mm1 ; store to plane 2
- sub ecx, byte SIZEOF_MMWORD ; one block of pixels done
- add esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD
- add edi, byte SIZEOF_MMWORD
- add ebx, byte SIZEOF_MMWORD
- add edx, byte SIZEOF_MMWORD
- cmp ecx, byte SIZEOF_MMWORD
- jae near .columnloop ; another full block remains
- test ecx, ecx
- jnz near .column_ld1 ; a partial block remains
- pop ecx ; restore the per-row registers saved at .rowloop
- pop esi
- pop edi
- pop ebx
- pop edx
- poppic eax
- add esi, byte SIZEOF_JSAMPROW ; step all four row-pointer arrays to the next row
- add edi, byte SIZEOF_JSAMPROW
- add ebx, byte SIZEOF_JSAMPROW
- add edx, byte SIZEOF_JSAMPROW
- dec eax
- jg near .rowloop ; loop while rows remain
- emms ; clear the MMX state so that x87 code can run afterwards
- .return:
- pop edi
- pop esi
- pop ebx
- mov esp, ebp ; drop the scratch area; esp now points at the saved frame base
- pop esp ; esp = unaligned frame base stored by the prologue
- pop ebp ; caller's ebp
- ret
- align 32 ; pad the end of the routine to a 32-byte boundary
|