123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351 |
; Shared macro/constant definitions. EXTN, GLOBAL_FUNCTION, alignx, SEG_TEXT,
; the SIZEOF_* constants, BYTE_BIT and the operand-size names (JDIMENSION,
; INT, JSAMPARRAY, JSAMPROW, JSAMPLE, XMMWORD) are not defined in this file,
; so they presumably come from this include -- confirm against jsimdext.inc.
- %include "jsimdext.inc"
; Everything below is emitted into the section named by SEG_TEXT.
- SECTION SEG_TEXT
; Assemble for 32-bit mode: 32-bit registers, stack-passed arguments.
- BITS 32
;-----------------------------------------------------------------------
; jsimd_h2v1_downsample_sse2
;
; Downsamples rows of byte samples 2:1 horizontally: every output byte is
; the biased average of two horizontally adjacent input bytes.
;
; Arguments are read from the stack relative to ebp (after the standard
; "push ebp / mov ebp, esp" prologue). The macros below turn a base
; register into the address expression of each argument:
;   img_width   [ebp+8]   number of valid samples in each input row
;   max_v_samp  [ebp+12]  number of input rows that get right-edge padding
;   v_samp      [ebp+16]  number of rows to downsample
;   width_blks  [ebp+20]  output width in units of 8 columns
;   input_data  [ebp+24]  array of input row pointers
;   output_data [ebp+28]  array of output row pointers
;
; Preserved: ebp, esi, edi (pushed/popped here).
; Clobbered: eax, ecx, edx, flags, xmm0-xmm3, xmm6, xmm7.
; The direction flag is cleared (cld) only when right-edge padding runs.
;
; Alignment: rows are accessed with movdqa, which faults on addresses that
; are not 16-byte aligned, so both input and output rows must be aligned.
; Output is always written one whole XMMWORD at a time, even for the
; final partial group of columns.
;-----------------------------------------------------------------------
- %define img_width(b) (b) + 8                 ; 1st stack argument
- %define max_v_samp(b) (b) + 12               ; 2nd stack argument
- %define v_samp(b) (b) + 16                   ; 3rd stack argument
- %define width_blks(b) (b) + 20               ; 4th stack argument
- %define input_data(b) (b) + 24               ; 5th stack argument
- %define output_data(b) (b) + 28              ; 6th stack argument
- align 32
- GLOBAL_FUNCTION(jsimd_h2v1_downsample_sse2)
- EXTN(jsimd_h2v1_downsample_sse2):
- push ebp
- mov ebp, esp                                 ; ebp = frame base for the argument macros
- push esi                                     ; esi/edi are used as row pointers below
- push edi
- mov ecx, JDIMENSION [width_blks(ebp)]
- shl ecx, 3                                   ; ecx = width_blks * 8 = output columns per row
- jz near .return                              ; zero-width output: nothing to do
- mov edx, JDIMENSION [img_width(ebp)]         ; edx = valid input samples per row
-
; --- Step 1: pad each input row on the right ---------------------------
; The downsampler reads 2 * output_columns samples per row. If the image
; is narrower than that, replicate the last valid sample into the gap.
- push ecx                                     ; keep the output column count for step 2
- shl ecx, 1                                   ; ecx = input samples consumed per row
- sub ecx, edx                                 ; ecx = number of padding bytes needed
- jle short .expand_end                        ; row already wide enough
- mov eax, INT [max_v_samp(ebp)]               ; eax = number of rows to pad
- test eax, eax
- jle short .expand_end
- cld                                          ; rep stosb must advance edi upward
- mov esi, JSAMPARRAY [input_data(ebp)]        ; esi walks the input row-pointer array
- alignx 16, 7
- .expandloop:
- push eax                                     ; al is overwritten with the fill sample
- push ecx                                     ; rep stosb counts ecx down to zero
- mov edi, JSAMPROW [esi]                      ; edi = start of the current row
- add edi, edx                                 ; edi = first byte past the valid samples
- mov al, JSAMPLE [edi-1]                      ; al = last valid sample in the row
- rep stosb                                    ; write ecx copies of al at [edi]
- pop ecx
- pop eax
- add esi, byte SIZEOF_JSAMPROW                ; advance to the next row pointer
- dec eax
- jg short .expandloop
- .expand_end:
- pop ecx                                      ; ecx = output columns per row again
-
; --- Step 2: downsample --------------------------------------------------
- mov eax, JDIMENSION [v_samp(ebp)]            ; eax = row counter
- test eax, eax
- jle near .return
- mov edx, 0x00010000                          ; low word 0, high word 1
- movd xmm7, edx
- pcmpeqw xmm6, xmm6                           ; xmm6 = all bits set
- pshufd xmm7, xmm7, 0x00                      ; broadcast: xmm7 words = {0,1,0,1,0,1,0,1} (rounding bias)
- psrlw xmm6, BYTE_BIT                         ; per-word low-byte mask (0x00FF if BYTE_BIT is 8 -- defined elsewhere)
- mov esi, JSAMPARRAY [input_data(ebp)]        ; esi walks the input row-pointer array
- mov edi, JSAMPARRAY [output_data(ebp)]       ; edi walks the output row-pointer array
- alignx 16, 7
- .rowloop:
- push ecx                                     ; column count is consumed per row; restore afterwards
- push edi                                     ; save the row-pointer array cursors,
- push esi                                     ; esi/edi are reused as sample pointers
- mov esi, JSAMPROW [esi]                      ; esi = input sample pointer
- mov edi, JSAMPROW [edi]                      ; edi = output sample pointer
- cmp ecx, byte SIZEOF_XMMWORD
- jae short .columnloop                        ; at least one full output vector remains
- alignx 16, 7
; Tail case: fewer than SIZEOF_XMMWORD output columns remain. Only one
; input vector is loaded; the second is zeroed so its results are zero.
- .columnloop_r8:
- movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
- pxor xmm1, xmm1
- mov ecx, SIZEOF_XMMWORD                      ; force the "sub ecx" below to hit zero and end the row
- jmp short .downsample
- alignx 16, 7
; Main case: two input vectors produce one output vector.
- .columnloop:
- movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
- movdqa xmm1, XMMWORD [esi+1*SIZEOF_XMMWORD]
- .downsample:
- movdqa xmm2, xmm0
- movdqa xmm3, xmm1
- pand xmm0, xmm6                              ; xmm0 = low byte of each word (even-indexed samples)
- psrlw xmm2, BYTE_BIT                         ; xmm2 = high byte of each word (odd-indexed samples)
- pand xmm1, xmm6                              ; same split for the second vector
- psrlw xmm3, BYTE_BIT
- paddw xmm0, xmm2                             ; per-word sum of each adjacent sample pair
- paddw xmm1, xmm3
- paddw xmm0, xmm7                             ; add the alternating 0/1 bias
- paddw xmm1, xmm7
- psrlw xmm0, 1                                ; divide by 2
- psrlw xmm1, 1
- packuswb xmm0, xmm1                          ; narrow words to bytes: xmm0 results low, xmm1 results high
- movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0  ; always stores one whole vector
- sub ecx, byte SIZEOF_XMMWORD                 ; output columns still to produce
- add esi, byte 2*SIZEOF_XMMWORD               ; two input vectors consumed
- add edi, byte 1*SIZEOF_XMMWORD               ; one output vector written
- cmp ecx, byte SIZEOF_XMMWORD
- jae short .columnloop
- test ecx, ecx
- jnz short .columnloop_r8                     ; partial vector left: take the tail case
- pop esi                                      ; restore the row-pointer array cursors
- pop edi
- pop ecx                                      ; restore output columns per row
- add esi, byte SIZEOF_JSAMPROW                ; next input row
- add edi, byte SIZEOF_JSAMPROW                ; next output row
- dec eax
- jg near .rowloop
- .return:
- pop edi
- pop esi
- pop ebp
- ret
;-----------------------------------------------------------------------
; jsimd_h2v2_downsample_sse2
;
; Downsamples byte samples 2:1 both horizontally and vertically: every
; output byte is the biased average of a 2x2 square of input bytes taken
; from two consecutive input rows.
;
; Arguments are read from the stack relative to ebp (after the standard
; "push ebp / mov ebp, esp" prologue). The macros below are redefined
; with the same offsets as used by the preceding function:
;   img_width   [ebp+8]   number of valid samples in each input row
;   max_v_samp  [ebp+12]  number of input rows that get right-edge padding
;   v_samp      [ebp+16]  number of output rows to produce
;   width_blks  [ebp+20]  output width in units of 8 columns
;   input_data  [ebp+24]  array of input row pointers
;   output_data [ebp+28]  array of output row pointers
;
; Preserved: ebp, esi, edi (pushed/popped here).
; Clobbered: eax, ecx, edx, flags, xmm0-xmm7.
; The direction flag is cleared (cld) only when right-edge padding runs.
;
; Alignment: rows are accessed with movdqa, which faults on addresses that
; are not 16-byte aligned, so both input and output rows must be aligned.
; Output is always written one whole XMMWORD at a time, even for the
; final partial group of columns.
;-----------------------------------------------------------------------
- %define img_width(b) (b) + 8                 ; 1st stack argument
- %define max_v_samp(b) (b) + 12               ; 2nd stack argument
- %define v_samp(b) (b) + 16                   ; 3rd stack argument
- %define width_blks(b) (b) + 20               ; 4th stack argument
- %define input_data(b) (b) + 24               ; 5th stack argument
- %define output_data(b) (b) + 28              ; 6th stack argument
- align 32
- GLOBAL_FUNCTION(jsimd_h2v2_downsample_sse2)
- EXTN(jsimd_h2v2_downsample_sse2):
- push ebp
- mov ebp, esp                                 ; ebp = frame base for the argument macros
- push esi                                     ; esi/edi are used as row pointers below
- push edi
- mov ecx, JDIMENSION [width_blks(ebp)]
- shl ecx, 3                                   ; ecx = width_blks * 8 = output columns per row
- jz near .return                              ; zero-width output: nothing to do
- mov edx, JDIMENSION [img_width(ebp)]         ; edx = valid input samples per row
-
; --- Step 1: pad each input row on the right ---------------------------
; The downsampler reads 2 * output_columns samples per row. If the image
; is narrower than that, replicate the last valid sample into the gap.
- push ecx                                     ; keep the output column count for step 2
- shl ecx, 1                                   ; ecx = input samples consumed per row
- sub ecx, edx                                 ; ecx = number of padding bytes needed
- jle short .expand_end                        ; row already wide enough
- mov eax, INT [max_v_samp(ebp)]               ; eax = number of rows to pad
- test eax, eax
- jle short .expand_end
- cld                                          ; rep stosb must advance edi upward
- mov esi, JSAMPARRAY [input_data(ebp)]        ; esi walks the input row-pointer array
- alignx 16, 7
- .expandloop:
- push eax                                     ; al is overwritten with the fill sample
- push ecx                                     ; rep stosb counts ecx down to zero
- mov edi, JSAMPROW [esi]                      ; edi = start of the current row
- add edi, edx                                 ; edi = first byte past the valid samples
- mov al, JSAMPLE [edi-1]                      ; al = last valid sample in the row
- rep stosb                                    ; write ecx copies of al at [edi]
- pop ecx
- pop eax
- add esi, byte SIZEOF_JSAMPROW                ; advance to the next row pointer
- dec eax
- jg short .expandloop
- .expand_end:
- pop ecx                                      ; ecx = output columns per row again
-
; --- Step 2: downsample --------------------------------------------------
- mov eax, JDIMENSION [v_samp(ebp)]            ; eax = output row counter
- test eax, eax
- jle near .return
- mov edx, 0x00020001                          ; low word 1, high word 2
- movd xmm7, edx
- pcmpeqw xmm6, xmm6                           ; xmm6 = all bits set
- pshufd xmm7, xmm7, 0x00                      ; broadcast: xmm7 words = {1,2,1,2,1,2,1,2} (rounding bias)
- psrlw xmm6, BYTE_BIT                         ; per-word low-byte mask (0x00FF if BYTE_BIT is 8 -- defined elsewhere)
- mov esi, JSAMPARRAY [input_data(ebp)]        ; esi walks the input row-pointer array
- mov edi, JSAMPARRAY [output_data(ebp)]       ; edi walks the output row-pointer array
- alignx 16, 7
- .rowloop:
- push ecx                                     ; column count is consumed per row; restore afterwards
- push edi                                     ; save the row-pointer array cursors,
- push esi                                     ; esi/edi are reused as sample pointers
- mov edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]    ; edx = sample pointer into the upper input row
- mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]    ; esi = sample pointer into the lower input row
- mov edi, JSAMPROW [edi]                      ; edi = output sample pointer
- cmp ecx, byte SIZEOF_XMMWORD
- jae short .columnloop                        ; at least one full output vector remains
- alignx 16, 7
; Tail case: fewer than SIZEOF_XMMWORD output columns remain. Only the
; first vector of each input row is loaded; the second pair is zeroed.
- .columnloop_r8:
- movdqa xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
- movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
- pxor xmm2, xmm2
- pxor xmm3, xmm3
- mov ecx, SIZEOF_XMMWORD                      ; force the "sub ecx" below to hit zero and end the row
- jmp short .downsample
- alignx 16, 7
; Main case: two vectors from each input row produce one output vector.
- .columnloop:
- movdqa xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]  ; upper row, first vector
- movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]  ; lower row, first vector
- movdqa xmm2, XMMWORD [edx+1*SIZEOF_XMMWORD]  ; upper row, second vector
- movdqa xmm3, XMMWORD [esi+1*SIZEOF_XMMWORD]  ; lower row, second vector
- .downsample:
- movdqa xmm4, xmm0
- movdqa xmm5, xmm1
- pand xmm0, xmm6                              ; low byte of each word (even-indexed samples)
- psrlw xmm4, BYTE_BIT                         ; high byte of each word (odd-indexed samples)
- pand xmm1, xmm6
- psrlw xmm5, BYTE_BIT
- paddw xmm0, xmm4                             ; upper row: horizontal pair sums (first vector)
- paddw xmm1, xmm5                             ; lower row: horizontal pair sums (first vector)
- movdqa xmm4, xmm2                            ; xmm4/xmm5 reused as scratch for the second vectors
- movdqa xmm5, xmm3
- pand xmm2, xmm6
- psrlw xmm4, BYTE_BIT
- pand xmm3, xmm6
- psrlw xmm5, BYTE_BIT
- paddw xmm2, xmm4                             ; upper row: horizontal pair sums (second vector)
- paddw xmm3, xmm5                             ; lower row: horizontal pair sums (second vector)
- paddw xmm0, xmm1                             ; add the two rows: per-word sum of each 2x2 square
- paddw xmm2, xmm3
- paddw xmm0, xmm7                             ; add the alternating 1/2 bias
- paddw xmm2, xmm7
- psrlw xmm0, 2                                ; divide by 4
- psrlw xmm2, 2
- packuswb xmm0, xmm2                          ; narrow words to bytes: xmm0 results low, xmm2 results high
- movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0  ; always stores one whole vector
- sub ecx, byte SIZEOF_XMMWORD                 ; output columns still to produce
- add edx, byte 2*SIZEOF_XMMWORD               ; two vectors consumed from the upper row
- add esi, byte 2*SIZEOF_XMMWORD               ; two vectors consumed from the lower row
- add edi, byte 1*SIZEOF_XMMWORD               ; one output vector written
- cmp ecx, byte SIZEOF_XMMWORD
- jae near .columnloop
- test ecx, ecx
- jnz near .columnloop_r8                      ; partial vector left: take the tail case
- pop esi                                      ; restore the row-pointer array cursors
- pop edi
- pop ecx                                      ; restore output columns per row
- add esi, byte 2*SIZEOF_JSAMPROW              ; two input rows consumed
- add edi, byte 1*SIZEOF_JSAMPROW              ; one output row produced
- dec eax
- jg near .rowloop
- .return:
- pop edi
- pop esi
- pop ebp
- ret
; Pad the end of the emitted code to a 32-byte boundary.
- align 32
|