123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201 |
; Shared SIMD assembler support.  The helper macros and size constants used
; in this file (EXTN, GLOBAL_FUNCTION, alignx, XMMBLOCK, DCTSIZE, SIZEOF_*,
; ...) are not defined here, so they are expected to come from these includes.
%include "jsimdext.inc"
%include "jdct.inc"

; Everything that follows is 32-bit code placed in the SEG_TEXT section.
        SECTION     SEG_TEXT
        BITS        32
;-----------------------------------------------------------------------------
; jsimd_convsamp_sse2
;
; Reads eight byte samples, starting at column start_col, from each of
; DCTSIZE/4 groups of four sample rows, zero-extends every byte to a 16-bit
; word, adds 0xFF80 to each word (equivalent to subtracting 128 modulo 2^16),
; and stores the resulting words into the workspace.
;
; Arguments (32-bit, on the stack, addressed through the ebp frame):
;   [ebp +  8]  sample_data   pointer to an array of row pointers
;   [ebp + 12]  start_col     column index applied to every row
;   [ebp + 16]  workspace     destination buffer; written with movdqa, so it
;                             has to be 16-byte aligned
;
; Preserved: ebx, esi, edi, ebp (saved and restored below).
; Clobbered: eax, ecx, edx, xmm0-xmm3, xmm6, xmm7, EFLAGS.
;
; NOTE(review): XMMBLOCK is defined outside this file; the code relies on
; XMMBLOCK(m,0,base,SIZEOF_DCTELEM) addressing output row m -- confirm against
; the include.
;-----------------------------------------------------------------------------

%define sample_data     ebp + 8
%define start_col       ebp + 12
%define workspace       ebp + 16

        align       32
        GLOBAL_FUNCTION(jsimd_convsamp_sse2)

EXTN(jsimd_convsamp_sse2):
        push        ebp
        mov         ebp, esp
        push        ebx
        push        esi
        push        edi

        ; Fetch the arguments and initialise the loop counter.
        mov         esi, JSAMPARRAY [sample_data]   ; esi -> next four row pointers
        mov         eax, JDIMENSION [start_col]     ; eax  = column index
        mov         edi, POINTER [workspace]        ; edi -> next four output rows
        mov         ecx, DCTSIZE/4                  ; ecx  = row groups remaining

        ; Loop-invariant constants.
        pxor        xmm6, xmm6                      ; xmm6 = all zero bits
        pcmpeqw     xmm7, xmm7                      ; xmm7 = 0xFFFF in every word
        psllw       xmm7, 7                         ; xmm7 = 0xFF80 in every word

        alignx      16, 7
.next_four_rows:
        ; Pick up each row pointer and load 8 bytes from that row.
        ; ebx and edx alternate as the scratch row pointer.
        mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]
        movq        xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
        mov         edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]
        movq        xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
        mov         ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW]
        movq        xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
        mov         edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW]
        movq        xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]

        ; Interleave with zero: each low byte becomes a zero-extended word.
        punpcklbw   xmm0, xmm6
        punpcklbw   xmm1, xmm6
        punpcklbw   xmm2, xmm6
        punpcklbw   xmm3, xmm6

        ; Add 0xFF80 to every word.
        paddw       xmm0, xmm7
        paddw       xmm1, xmm7
        paddw       xmm2, xmm7
        paddw       xmm3, xmm7

        ; Write the four converted rows (aligned stores).
        movdqa      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
        movdqa      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
        movdqa      XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
        movdqa      XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3

        ; Step both cursors past the four rows just handled.
        add         edi, byte 4*DCTSIZE*SIZEOF_DCTELEM
        add         esi, byte 4*SIZEOF_JSAMPROW
        dec         ecx
        jnz         short .next_four_rows

        pop         edi
        pop         esi
        pop         ebx
        pop         ebp
        ret
; Address helpers for the divisors table read by jsimd_quantize_sse2.
; The table is addressed as three consecutive groups of DCTSIZE rows:
; reciprocals (rows DCTSIZE*0 ..), corrections (rows DCTSIZE*1 ..) and
; scales (rows DCTSIZE*2 ..).  (m) selects the row inside a group; (n) and
; (b) are handed to XMMBLOCK unchanged.
; NOTE(review): XMMBLOCK is defined outside this file -- presumably (b) is
; the base address and (n) an offset within the row; confirm.
%define RECIPROCAL(m, n, b)  XMMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM)
%define CORRECTION(m, n, b)  XMMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM)
%define SCALE(m, n, b)       XMMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM)
;-----------------------------------------------------------------------------
; jsimd_quantize_sse2
;
; Processes the workspace in DCTSIZE2/32 passes of 32 words.  For every
; 16-bit word x read from the workspace it computes
;     a = |x| + correction                 (16-bit wrapping add)
;     q = high 16 bits of (a * reciprocal) (unsigned multiply, pmulhuw)
;     q = high 16 bits of (q * scale)      (unsigned multiply, pmulhuw)
; and stores q to coef_block, negated when x was negative.  The three
; per-element factors are read from the divisors table through the
; CORRECTION / RECIPROCAL / SCALE address macros.
;
; Arguments (32-bit, on the stack, addressed through the ebp frame):
;   [ebp +  8]  coef_block   destination (stored with movdqa)
;   [ebp + 12]  divisors     factor table (used as SSE memory operands)
;   [ebp + 16]  workspace    source words (loaded with movdqa)
; All three buffers are accessed with alignment-requiring instructions, so
; each has to be 16-byte aligned.
;
; Preserved: esi, edi, ebp (saved and restored below); ebx and ecx are not
; touched.
; Clobbered: eax, edx, xmm0-xmm7, EFLAGS.
;
; NOTE(review): the stores address coef_block with SIZEOF_DCTELEM while the
; pointer is advanced with SIZEOF_JCOEF -- this assumes both sizes are equal;
; confirm against the include that defines them.
;-----------------------------------------------------------------------------

%define coef_block      ebp + 8
%define divisors        ebp + 12
%define workspace       ebp + 16

        align       32
        GLOBAL_FUNCTION(jsimd_quantize_sse2)

EXTN(jsimd_quantize_sse2):
        push        ebp
        mov         ebp, esp
        push        esi
        push        edi

        mov         edi, JCOEFPTR [coef_block]      ; edi -> output cursor
        mov         edx, POINTER [divisors]         ; edx -> factor-table cursor
        mov         esi, POINTER [workspace]        ; esi -> input cursor
        mov         eax, DCTSIZE2/32                ; eax  = passes remaining

        alignx      16, 7
.next_32_words:
        ; Load 4 x 8 input words.
        movdqa      xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
        movdqa      xmm5, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_DCTELEM)]
        movdqa      xmm6, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_DCTELEM)]
        movdqa      xmm7, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_DCTELEM)]

        ; Keep the values in xmm0-xmm3 and turn xmm4-xmm7 into per-word sign
        ; masks via an arithmetic shift by WORD_BIT-1 (WORD_BIT is defined
        ; outside this file; with 16-bit words this yields 0xFFFF for
        ; negative inputs and 0 otherwise).
        movdqa      xmm0, xmm4
        psraw       xmm4, (WORD_BIT-1)
        movdqa      xmm1, xmm5
        psraw       xmm5, (WORD_BIT-1)
        movdqa      xmm2, xmm6
        psraw       xmm6, (WORD_BIT-1)
        movdqa      xmm3, xmm7
        psraw       xmm7, (WORD_BIT-1)

        ; Absolute value: (x XOR mask) - mask.
        pxor        xmm0, xmm4
        pxor        xmm1, xmm5
        pxor        xmm2, xmm6
        pxor        xmm3, xmm7
        psubw       xmm0, xmm4
        psubw       xmm1, xmm5
        psubw       xmm2, xmm6
        psubw       xmm3, xmm7

        ; Add the per-element correction term.
        paddw       xmm0, XMMWORD [CORRECTION(0,0,edx)]
        paddw       xmm1, XMMWORD [CORRECTION(1,0,edx)]
        paddw       xmm2, XMMWORD [CORRECTION(2,0,edx)]
        paddw       xmm3, XMMWORD [CORRECTION(3,0,edx)]

        ; First unsigned high-word multiply: by the reciprocal.
        pmulhuw     xmm0, XMMWORD [RECIPROCAL(0,0,edx)]
        pmulhuw     xmm1, XMMWORD [RECIPROCAL(1,0,edx)]
        pmulhuw     xmm2, XMMWORD [RECIPROCAL(2,0,edx)]
        pmulhuw     xmm3, XMMWORD [RECIPROCAL(3,0,edx)]

        ; Second unsigned high-word multiply: by the scale.
        pmulhuw     xmm0, XMMWORD [SCALE(0,0,edx)]
        pmulhuw     xmm1, XMMWORD [SCALE(1,0,edx)]
        pmulhuw     xmm2, XMMWORD [SCALE(2,0,edx)]
        pmulhuw     xmm3, XMMWORD [SCALE(3,0,edx)]

        ; Re-apply the original sign with the same XOR/subtract pair.
        pxor        xmm0, xmm4
        pxor        xmm1, xmm5
        pxor        xmm2, xmm6
        pxor        xmm3, xmm7
        psubw       xmm0, xmm4
        psubw       xmm1, xmm5
        psubw       xmm2, xmm6
        psubw       xmm3, xmm7

        ; Store the 32 results (aligned stores).
        movdqa      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
        movdqa      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
        movdqa      XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
        movdqa      XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3

        ; Advance all three cursors by 32 elements.
        add         edi, byte 32*SIZEOF_JCOEF
        add         edx, byte 32*SIZEOF_DCTELEM
        add         esi, byte 32*SIZEOF_DCTELEM
        dec         eax
        jnz         near .next_32_words

        pop         edi
        pop         esi
        pop         ebp
        ret

; Pad the end of the code to a 32-byte boundary.
; NOTE(review): no code follows in this view, so the reason for the trailing
; alignment is not evident here -- presumably a linker/section-alignment
; requirement; keep unless confirmed unnecessary.
        align       32
|