123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276 |
- %include "jsimdext.inc"
- %include "jdct.inc"
- SECTION SEG_TEXT ; emit what follows into the section named by SEG_TEXT (not defined in this file; presumably supplied by one of the includes)
- BITS 32 ; assemble what follows as 32-bit code
- ; ---------------------------------------------------------------------------
- ; jsimd_convsamp_mmx
- ;
- ; Reads 8 bytes from each of DCTSIZE sample rows (four rows per loop pass),
- ; zero-extends every byte to a 16-bit word, adds 0xFF80 to each word (the
- ; same as subtracting 128 modulo 2^16) and stores the words to the workspace.
- ;
- ; Stack arguments (32-bit code, addressed through ebp):
- ;   [ebp +  8]  sample_data   array of row pointers
- ;   [ebp + 12]  start_col     offset applied to every row pointer
- ;   [ebp + 16]  workspace     destination for the 16-bit words
- ;
- ; Saved and restored: ebp, ebx, esi, edi.
- ; Modified:           eax, ecx, edx, mm0-mm7, flags; MMX state is emptied
- ;                     with emms before returning.
- ; ---------------------------------------------------------------------------
- %define sample_data     ebp + 8
- %define start_col       ebp + 12
- %define workspace       ebp + 16
-     align       32
-     GLOBAL_FUNCTION(jsimd_convsamp_mmx)
- EXTN(jsimd_convsamp_mmx):
-     push        ebp
-     mov         ebp, esp                                ; arguments now live at ebp + 8 and up
-     push        ebx
-     push        esi
-     push        edi
- ; --- fetch the arguments and set the pass counter
-     mov         esi, JSAMPARRAY [sample_data]           ; esi -> current group of four row pointers
-     mov         edi, POINTER [workspace]                ; edi -> current output position
-     mov         eax, JDIMENSION [start_col]             ; eax  = offset into every row
-     mov         ecx, DCTSIZE/4                          ; ecx  = passes left (four rows per pass)
- ; --- constants that stay in registers for the whole loop
-     pcmpeqw     mm7, mm7                                ; every word = 0xFFFF
-     psllw       mm7, 7                                  ; every word = 0xFF80
-     pxor        mm6, mm6                                ; all zero: high bytes for the unpacks
-     alignx      16, 7
- .rowgroup:
- ; --- load 8 bytes from each of the four rows; every load precedes every store
-     mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]
-     movq        mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]    ; row 0
-     mov         edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]
-     movq        mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]    ; row 1
-     mov         ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW]
-     movq        mm2, MMWORD [ebx+eax*SIZEOF_JSAMPLE]    ; row 2
-     mov         edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW]
-     movq        mm3, MMWORD [edx+eax*SIZEOF_JSAMPLE]    ; row 3
- ; --- rows 0 and 1: split each into low/high halves widened to words
-     movq        mm4, mm0
-     movq        mm5, mm1
-     punpcklbw   mm0, mm6                                ; row 0, bytes 0-3 as words
-     punpckhbw   mm4, mm6                                ; row 0, bytes 4-7 as words
-     punpcklbw   mm1, mm6                                ; row 1, bytes 0-3 as words
-     punpckhbw   mm5, mm6                                ; row 1, bytes 4-7 as words
- ; --- add 0xFF80 to every word and write the results out
-     paddw       mm0, mm7
-     movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0
-     paddw       mm4, mm7
-     movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm4
-     paddw       mm1, mm7
-     movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_DCTELEM)], mm1
-     paddw       mm5, mm7
-     movq        MMWORD [MMBLOCK(1,1,edi,SIZEOF_DCTELEM)], mm5
- ; --- rows 2 and 3: same treatment, reusing mm0/mm4 now that they are stored
-     movq        mm0, mm2
-     movq        mm4, mm3
-     punpcklbw   mm2, mm6                                ; row 2, bytes 0-3 as words
-     punpckhbw   mm0, mm6                                ; row 2, bytes 4-7 as words
-     punpcklbw   mm3, mm6                                ; row 3, bytes 0-3 as words
-     punpckhbw   mm4, mm6                                ; row 3, bytes 4-7 as words
-     paddw       mm2, mm7
-     movq        MMWORD [MMBLOCK(2,0,edi,SIZEOF_DCTELEM)], mm2
-     paddw       mm0, mm7
-     movq        MMWORD [MMBLOCK(2,1,edi,SIZEOF_DCTELEM)], mm0
-     paddw       mm3, mm7
-     movq        MMWORD [MMBLOCK(3,0,edi,SIZEOF_DCTELEM)], mm3
-     paddw       mm4, mm7
-     movq        MMWORD [MMBLOCK(3,1,edi,SIZEOF_DCTELEM)], mm4
- ; --- step both cursors past the four rows just handled
-     add         esi, byte 4*SIZEOF_JSAMPROW
-     add         edi, byte 4*DCTSIZE*SIZEOF_DCTELEM
-     dec         ecx
-     jnz         short .rowgroup
- ; --- leave MMX mode and unwind the frame in reverse push order
-     emms
-     pop         edi
-     pop         esi
-     pop         ebx
-     pop         ebp
-     ret
- %define RECIPROCAL(m, n, b) \
- MMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM) ; sub-table 0 relative to base b: first MMBLOCK index is offset by DCTSIZE * 0 (MMBLOCK is not defined in this file; presumably supplied by an include)
- %define CORRECTION(m, n, b) \
- MMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM) ; sub-table 1: first MMBLOCK index is offset by DCTSIZE * 1
- %define SCALE(m, n, b) \
- MMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM) ; sub-table 2: first MMBLOCK index is offset by DCTSIZE * 2
- %define SHIFT(m, n, b) \
- MMBLOCK(DCTSIZE * 3 + (m), (n), (b), SIZEOF_DCTELEM) ; sub-table 3: first MMBLOCK index is offset by DCTSIZE * 3; not referenced by the routine below
- %define coef_block ebp + 8 ; first stack argument: output pointer (loaded into edi as JCOEFPTR)
- %define divisors ebp + 12 ; second stack argument: base used with the RECIPROCAL/CORRECTION/SCALE macros (loaded into edx)
- %define workspace ebp + 16 ; third stack argument: input pointer (loaded into esi)
- align 32 ; place the entry point on a 32-byte boundary
- GLOBAL_FUNCTION(jsimd_quantize_mmx) ; jsimd_quantize_mmx: transforms 2 * (DCTSIZE2/8/2) groups of 8 words from workspace into coef_block using the divisors tables (GLOBAL_FUNCTION comes from outside this file; presumably it exports the symbol)
- EXTN(jsimd_quantize_mmx): ; entry point; modifies eax, edx, mm0-mm7 and flags; saves and restores ebp, esi, edi
- push ebp
- mov ebp, esp ; frame pointer: the stack arguments are addressed as ebp + 8 and up
- push esi ; esi and edi are restored before ret
- push edi
- mov esi, POINTER [workspace] ; esi = input cursor
- mov edx, POINTER [divisors] ; edx = cursor into the divisors tables
- mov edi, JCOEFPTR [coef_block] ; edi = output cursor
- mov ah, 2 ; ah = outer loop counter: two outer passes
- alignx 16, 7 ; alignx is defined outside this file; presumably pads so the loop label is aligned
- .quantloop1:
- mov al, DCTSIZE2/8/2 ; al = inner loop counter, reloaded for each outer pass
- alignx 16, 7
- .quantloop2:
- movq mm2, MMWORD [MMBLOCK(0,0,esi,SIZEOF_DCTELEM)] ; first group of input words
- movq mm3, MMWORD [MMBLOCK(0,1,esi,SIZEOF_DCTELEM)] ; second group of input words
- movq mm0, mm2 ; working copies; mm2/mm3 are turned into sign masks next
- movq mm1, mm3
- psraw mm2, (WORD_BIT-1) ; arithmetic shift by WORD_BIT-1 (presumably 15): all ones where the input word is negative, zero elsewhere
- psraw mm3, (WORD_BIT-1)
- pxor mm0, mm2 ; (x XOR s) - s: negates exactly the words whose mask s is all ones
- pxor mm1, mm3
- psubw mm0, mm2
- psubw mm1, mm3
- ;
- ; State at this point, per 16-bit word lane (four lanes in each MMX register):
- ;   mm0/mm1 = (x XOR s) - s, the magnitude |x| of the input word x
- ;   mm2/mm3 = s, the sign mask of x (all ones if x < 0, otherwise zero)
- ;
- ; The rest of the loop body computes, per lane:
- ;   1. t = |x| + CORRECTION word
- ;   2. t = mulhi(t, RECIPROCAL word) + t
- ;   3. t = mulhi(t, SCALE word) + two conditional fix-up terms
- ;   4. result = (t XOR s) - s, which negates t in the lanes where x was negative
- ;
- ; pmulhw returns the high word of a SIGNED 16x16-bit product. For word values
- ; a and b, the high word of the UNSIGNED product is
- ;
- ;   signed_hi(a*b) + (b if a has its top bit set) + (a if b has its top bit set)
- ;
- ; modulo 2^16. Step 3 adds both terms through psraw/pand masks, so it yields
- ; the unsigned high word for any operands.
- ;
- ; Step 2 adds t unconditionally and has no term for t itself having its top
- ; bit set, so it matches the unsigned high word only when every RECIPROCAL
- ; word has its top bit set and t is below 0x8000.
- ; NOTE(review): the divisors tables are not built in the visible code; confirm
- ; that they guarantee this.
- ;
- ; The three tables are addressed from the same moving base (edx), which is
- ; advanced together with the input and output cursors at the end of each pass.
- ;
- ; mm4-mm7 are scratch in every pass; mm2/mm3 must stay untouched until the
- ; final sign restoration.
- ;
- paddw mm0, MMWORD [CORRECTION(0,0,edx)] ; step 1: t = |x| + correction word (wraps modulo 2^16)
- paddw mm1, MMWORD [CORRECTION(0,1,edx)]
- movq mm4, mm0 ; keep t for the addition after the multiply
- movq mm5, mm1
- pmulhw mm0, MMWORD [RECIPROCAL(0,0,edx)] ; step 2: signed high word of t * reciprocal word
- pmulhw mm1, MMWORD [RECIPROCAL(0,1,edx)]
- paddw mm0, mm4 ; ... plus t, added unconditionally
- paddw mm1, mm5
- ;
- ; Step 3: scale. Either operand may have its top bit set here, so both
- ; fix-up terms are applied.
- ;
- movq mm6, MMWORD [SCALE(0,0,edx)] ; mm6/mm7 = scale words
- movq mm7, MMWORD [SCALE(0,1,edx)]
- movq mm4, mm0 ; mm4/mm5 = t before scaling
- movq mm5, mm1
- pmulhw mm0, mm6 ; signed high word of t * scale word
- pmulhw mm1, mm7
- psraw mm6, (WORD_BIT-1) ; mask: all ones where the scale word has its top bit set
- psraw mm7, (WORD_BIT-1)
- pand mm6, mm4 ; t in those lanes, zero elsewhere
- pand mm7, mm5
- paddw mm0, mm6 ; first fix-up term
- paddw mm1, mm7
- psraw mm4, (WORD_BIT-1) ; mask: all ones where t has its top bit set
- psraw mm5, (WORD_BIT-1)
- pand mm4, MMWORD [SCALE(0,0,edx)] ; scale word in those lanes, zero elsewhere
- pand mm5, MMWORD [SCALE(0,1,edx)]
- paddw mm0, mm4 ; second fix-up term
- paddw mm1, mm5
- pxor mm0, mm2 ; step 4: (t XOR s) - s re-applies the sign of the input word
- pxor mm1, mm3
- psubw mm0, mm2
- psubw mm1, mm3
- movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0 ; store the two result groups
- movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm1
- add esi, byte 8*SIZEOF_DCTELEM ; input cursor: 8 elements consumed per pass
- add edx, byte 8*SIZEOF_DCTELEM ; table cursor moves in step with the input
- add edi, byte 8*SIZEOF_JCOEF ; output cursor: 8 coefficients produced per pass
- dec al
- jnz near .quantloop2 ; inner loop: DCTSIZE2/8/2 passes; the branch is explicitly encoded as near
- dec ah
- jnz near .quantloop1 ; outer loop: 2 passes; cursors are not reset, so the second pass continues where the first stopped
- emms ; empty the MMX state before returning
- pop edi ; restore in reverse push order
- pop esi
- pop ebp
- ret
- align 32 ; align whatever follows this routine to a 32-byte boundary
|