123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460 |
%include "jcolsamp.inc"

; --------------------------------------------------------------------------
; Stack-frame macros for jsimd_h2v1_merged_upsample_mmx.
;
; Incoming arguments.  (b) is a register pointing at the saved-ebp slot that
; `push ebp` creates on entry, so the layout seen through it is
;     (b)+0  saved ebp
;     (b)+4  return address
;     (b)+8  first stack argument, and so on in 4-byte steps.
; The type named in each comment is the size tag the routine uses when it
; loads that argument.

%define output_width(b)      (b) + 8    ; JDIMENSION
%define input_buf(b)         (b) + 12   ; JSAMPIMAGE
%define in_row_group_ctr(b)  (b) + 16   ; JDIMENSION
%define output_buf(b)        (b) + 20   ; JSAMPARRAY

; Locals.  These are addressed from the *aligned* ebp that the routine sets
; up in its prologue: WK_NUM scratch MMWORDs sit directly below ebp, and one
; pointer-sized slot for the GOT address sits directly below wk(0).
; (%define expands at the point of use, so the order of these four lines
; does not matter to the assembler.)

%define WK_NUM        3                 ; number of MMWORD scratch slots
%define original_ebp  ebp + 0           ; slot the prologue fills with the
                                        ;  pre-alignment frame pointer
%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_MMWORD  ; wk(0)..wk(WK_NUM-1)
%define gotptr        wk(0) - SIZEOF_POINTER                ; saved GOT address
; --------------------------------------------------------------------------
; jsimd_h2v1_merged_upsample_mmx(JDIMENSION output_width,
;                                JSAMPIMAGE input_buf,
;                                JDIMENSION in_row_group_ctr,
;                                JSAMPARRAY output_buf)
;
; 32-bit x86; arguments are read from the stack and the routine returns with
; a plain `ret`, so the caller removes them.
;
; Takes row in_row_group_ctr from each of the three planes in input_buf.
; Every sample of planes 1 and 2 is shared by two horizontally adjacent
; pixels; it is combined with the two matching plane-0 samples and the
; result is written as interleaved pixels of RGB_PIXELSIZE bytes to
; output_buf[0].  output_width counts output pixels; 0 returns immediately.
;
; Preserved: ebx, esi, edi, ebp, esp.   Scratch: eax, ecx, edx, mm0-mm7,
; flags.  `emms` is executed before returning whenever MMX was used.
;
; Input rows are always read in whole 8-byte MMWORDs, including the final
; partial group, so the rows must be readable a little past the last
; sample -- presumably guaranteed by the caller's row padding (TODO confirm).
; Output is never written past output_width pixels.
;
; NOTE(review): the comments call planes 0/1/2 "Y", "Cb", "Cr" and the
; three computed channels "R", "G", "B".  That naming is inferred from the
; constant names (PW_F0402, PW_MF0228, PW_MF0344_F0285) matching the usual
; YCbCr->RGB coefficients; the constants themselves are defined outside
; this file.  Likewise mmA..mmH are register aliases supplied by the
; include file; the layout comments assume the mapping stated at each
; interleave block.  Confirm both against jcolsamp.inc / the data tables.
;
; Register roles inside the loops:
;     ecx  output pixels still to produce
;     esi  Y  input pointer      ebx  Cb input pointer
;     edx  Cr input pointer      edi  output pointer
;     eax  GOT base at the top of .columnloop; al is then reused as the
;          Y-half counter, which is why the GOT base is reloaded each pass
; --------------------------------------------------------------------------

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_merged_upsample_mmx)

EXTN(jsimd_h2v1_merged_upsample_mmx):
    ; ---- prologue: build an MMWORD-aligned frame --------------------------
    push        ebp
    mov         eax, esp                    ; eax -> saved ebp; args are at eax+8..
    sub         esp, byte 4                 ; room for the pointer stored below
    and         esp, byte (-SIZEOF_MMWORD)  ; round esp down to an MMWORD boundary
    mov         [esp], eax                  ; remember the unaligned frame pointer
    mov         ebp, esp                    ; ebp = aligned frame base
    lea         esp, [wk(0)]                ; reserve wk(0)..wk(WK_NUM-1)
    pushpic     eax                         ; reserve the gotptr slot (macro)
    push        ebx
    push        esi
    push        edi

    get_GOT     ebx                         ; ebx = GOT address (macro)
    movpic      POINTER [gotptr], ebx       ; keep it in the frame

    mov         ecx, JDIMENSION [output_width(eax)]
    test        ecx, ecx
    jz          near .return                ; nothing to do for width 0

    ; ---- resolve the four row pointers ------------------------------------
    push        ecx                         ; ecx is borrowed for the row index

    mov         edi, JSAMPIMAGE [input_buf(eax)]
    mov         ecx, JDIMENSION [in_row_group_ctr(eax)]
    mov         esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]  ; plane 0 row table
    mov         ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]  ; plane 1 row table
    mov         edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]  ; plane 2 row table
    mov         edi, JSAMPARRAY [output_buf(eax)]
    mov         esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW]    ; esi = Y  row
    mov         ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW]    ; ebx = Cb row
    mov         edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW]    ; edx = Cr row
    mov         edi, JSAMPROW [edi]                        ; edi = output_buf[0]

    pop         ecx                         ; ecx = output_width again

    alignx      16, 7
.columnloop:
    ; One pass consumes 8 Cb + 8 Cr samples and (via the two Y halves below)
    ; up to 16 Y samples.
    movpic      eax, POINTER [gotptr]   ; eax = GOT base (al was the counter)

    movq        mm6, MMWORD [ebx]       ; mm6 = Cb bytes 0..7
    movq        mm7, MMWORD [edx]       ; mm7 = Cr bytes 0..7

    pxor        mm1, mm1                ; mm1 = 0
    pcmpeqw     mm3, mm3
    psllw       mm3, 7                  ; mm3 = 0xFF80 in every word (-128)

    ; Widen bytes to words: L = samples 0..3, H = samples 4..7.
    movq        mm4, mm6
    punpckhbw   mm6, mm1                ; mm6 = CbH
    punpcklbw   mm4, mm1                ; mm4 = CbL
    movq        mm0, mm7
    punpckhbw   mm7, mm1                ; mm7 = CrH
    punpcklbw   mm0, mm1                ; mm0 = CrL

    ; Re-centre the chroma around zero (add -128).
    paddw       mm6, mm3
    paddw       mm4, mm3
    paddw       mm7, mm3
    paddw       mm0, mm3

    ; What the code below computes, written with the coefficients the
    ; constant names suggest (values live outside this file -- confirm):
    ;
    ;   R - Y =                  0.402 * Cr + Cr
    ;   G - Y = -0.344 * Cb  +   0.285 * Cr - Cr
    ;   B - Y = -0.228 * Cb  +   Cb + Cb
    ;
    ; i.e. each large coefficient is split into a small fractional multiply
    ; plus whole additions/subtractions of the sample itself.

    movq        mm5, mm6                ; mm5 = CbH
    movq        mm2, mm4                ; mm2 = CbL
    paddw       mm6, mm6                ; mm6 = 2*CbH
    paddw       mm4, mm4                ; mm4 = 2*CbL
    movq        mm1, mm7                ; mm1 = CrH
    movq        mm3, mm0                ; mm3 = CrL
    paddw       mm7, mm7                ; mm7 = 2*CrH
    paddw       mm0, mm0                ; mm0 = 2*CrL

    ; pmulhw keeps the high word of each signed 16x16 product.
    pmulhw      mm6, [GOTOFF(eax,PW_MF0228)]  ; 2*CbH * PW_MF0228, high word
    pmulhw      mm4, [GOTOFF(eax,PW_MF0228)]  ; 2*CbL * PW_MF0228, high word
    pmulhw      mm7, [GOTOFF(eax,PW_F0402)]   ; 2*CrH * PW_F0402,  high word
    pmulhw      mm0, [GOTOFF(eax,PW_F0402)]   ; 2*CrL * PW_F0402,  high word

    ; Add PW_ONE, then arithmetic shift right by one: undoes the doubling
    ; above (with rounding, assuming PW_ONE holds 1 in each word).
    paddw       mm6, [GOTOFF(eax,PW_ONE)]
    paddw       mm4, [GOTOFF(eax,PW_ONE)]
    psraw       mm6, 1
    psraw       mm4, 1
    paddw       mm7, [GOTOFF(eax,PW_ONE)]
    paddw       mm0, [GOTOFF(eax,PW_ONE)]
    psraw       mm7, 1
    psraw       mm0, 1

    paddw       mm6, mm5
    paddw       mm4, mm2
    paddw       mm6, mm5                ; mm6 = product + 2*CbH = (B-Y)H
    paddw       mm4, mm2                ; mm4 = product + 2*CbL = (B-Y)L
    paddw       mm7, mm1                ; mm7 = product +   CrH = (R-Y)H
    paddw       mm0, mm3                ; mm0 = product +   CrL = (R-Y)L

    ; The H halves are parked until the second Y pass.
    movq        MMWORD [wk(0)], mm6     ; wk(0) = (B-Y)H
    movq        MMWORD [wk(1)], mm7     ; wk(1) = (R-Y)H

    ; G: pair every Cb word with its Cr word so one pmaddwd yields
    ; Cb*c0 + Cr*c1 as a 32-bit sum per sample.
    movq        mm6, mm5                ; mm6 = CbH (copy)
    movq        mm7, mm2                ; mm7 = CbL (copy)
    punpcklwd   mm5, mm1                ; mm5 = Cb4 Cr4 Cb5 Cr5
    punpckhwd   mm6, mm1                ; mm6 = Cb6 Cr6 Cb7 Cr7
    pmaddwd     mm5, [GOTOFF(eax,PW_MF0344_F0285)]
    pmaddwd     mm6, [GOTOFF(eax,PW_MF0344_F0285)]
    punpcklwd   mm2, mm3                ; mm2 = Cb0 Cr0 Cb1 Cr1
    punpckhwd   mm7, mm3                ; mm7 = Cb2 Cr2 Cb3 Cr3
    pmaddwd     mm2, [GOTOFF(eax,PW_MF0344_F0285)]
    pmaddwd     mm7, [GOTOFF(eax,PW_MF0344_F0285)]

    ; Add PD_ONEHALF (presumably the rounding term for SCALEBITS) and scale
    ; the 32-bit sums back down.
    paddd       mm5, [GOTOFF(eax,PD_ONEHALF)]
    paddd       mm6, [GOTOFF(eax,PD_ONEHALF)]
    psrad       mm5, SCALEBITS
    psrad       mm6, SCALEBITS
    paddd       mm2, [GOTOFF(eax,PD_ONEHALF)]
    paddd       mm7, [GOTOFF(eax,PD_ONEHALF)]
    psrad       mm2, SCALEBITS
    psrad       mm7, SCALEBITS

    packssdw    mm5, mm6                ; mm5 = sums for samples 4..7, as words
    packssdw    mm2, mm7                ; mm2 = sums for samples 0..3, as words
    psubw       mm5, mm1                ; mm5 = sum - CrH = (G-Y)H
    psubw       mm2, mm3                ; mm2 = sum - CrL = (G-Y)L

    movq        MMWORD [wk(2)], mm5     ; wk(2) = (G-Y)H

    ; First Y pass uses the L halves, which are already in mm0/mm2/mm4.
    mov         al, 2                   ; al = Y passes left for this chroma group
    jmp         short .Yloop_1st
    alignx      16, 7

.Yloop_2nd:
    ; Second Y pass: bring the parked H halves back in.
    movq        mm0, MMWORD [wk(1)]     ; mm0 = (R-Y)H
    movq        mm2, MMWORD [wk(2)]     ; mm2 = (G-Y)H
    movq        mm4, MMWORD [wk(0)]     ; mm4 = (B-Y)H
    alignx      16, 7

.Yloop_1st:
    ; On entry mm0/mm2/mm4 = (R-Y)/(G-Y)/(B-Y) for four chroma samples.
    ; Each chroma word is applied to one even-indexed and one odd-indexed
    ; Y sample, which is the 2:1 horizontal replication.
    movq        mm7, MMWORD [esi]       ; mm7 = Y bytes 0..7

    pcmpeqw     mm6, mm6
    psrlw       mm6, BYTE_BIT           ; mm6 = low-byte mask in every word
    pand        mm6, mm7                ; mm6 = Y0 Y2 Y4 Y6 as words (YE)
    psrlw       mm7, BYTE_BIT           ; mm7 = Y1 Y3 Y5 Y7 as words (YO)

    movq        mm1, mm0                ; mm1 = R-Y
    movq        mm3, mm2                ; mm3 = G-Y
    movq        mm5, mm4                ; mm5 = B-Y

    ; packuswb saturates to 0..255; only the low four bytes of each result
    ; are used afterwards.
    paddw       mm0, mm6                ; mm0 = R of even pixels
    paddw       mm1, mm7                ; mm1 = R of odd pixels
    packuswb    mm0, mm0                ; mm0 = R0 R2 R4 R6 ** ** ** **
    packuswb    mm1, mm1                ; mm1 = R1 R3 R5 R7 ** ** ** **

    paddw       mm2, mm6                ; mm2 = G of even pixels
    paddw       mm3, mm7                ; mm3 = G of odd pixels
    packuswb    mm2, mm2                ; mm2 = G0 G2 G4 G6 ** ** ** **
    packuswb    mm3, mm3                ; mm3 = G1 G3 G5 G7 ** ** ** **

    paddw       mm4, mm6                ; mm4 = B of even pixels
    paddw       mm5, mm7                ; mm5 = B of odd pixels
    packuswb    mm4, mm4                ; mm4 = B0 B2 B4 B6 ** ** ** **
    packuswb    mm5, mm5                ; mm5 = B1 B3 B5 B7 ** ** ** **

%if RGB_PIXELSIZE == 3  ; ------------------------------------------------

    ; Notation "cp": c = byte position inside the output pixel, p = pixel
    ; number 0..7; "**" = don't care, "--" = zero shifted in.
    ; Assumed on entry (alias mapping comes from the include -- confirm):
    ;   mmA=(00 02 04 06 ** ** ** **)  mmB=(01 03 05 07 ** ** ** **)
    ;   mmC=(10 12 14 16 ** ** ** **)  mmD=(11 13 15 17 ** ** ** **)
    ;   mmE=(20 22 24 26 ** ** ** **)  mmF=(21 23 25 27 ** ** ** **)

    punpcklbw   mmA, mmC                ; mmA=(00 10 02 12 04 14 06 16)
    punpcklbw   mmE, mmB                ; mmE=(20 01 22 03 24 05 26 07)
    punpcklbw   mmD, mmF                ; mmD=(11 21 13 23 15 25 17 27)

    movq        mmG, mmA
    movq        mmH, mmA
    punpcklwd   mmA, mmE                ; mmA=(00 10 20 01 02 12 22 03)
    punpckhwd   mmG, mmE                ; mmG=(04 14 24 05 06 16 26 07)

    psrlq       mmH, 2*BYTE_BIT         ; mmH=(02 12 04 14 06 16 -- --)
    psrlq       mmE, 2*BYTE_BIT         ; mmE=(22 03 24 05 26 07 -- --)

    movq        mmC, mmD
    movq        mmB, mmD
    punpcklwd   mmD, mmH                ; mmD=(11 21 02 12 13 23 04 14)
    punpckhwd   mmC, mmH                ; mmC=(15 25 06 16 17 27 -- --)

    psrlq       mmB, 2*BYTE_BIT         ; mmB=(13 23 15 25 17 27 -- --)

    movq        mmF, mmE
    punpcklwd   mmE, mmB                ; mmE=(22 03 13 23 24 05 15 25)
    punpckhwd   mmF, mmB                ; mmF=(26 07 17 27 -- -- -- --)

    punpckldq   mmA, mmD                ; mmA=(00 10 20 01 11 21 02 12)
    punpckldq   mmE, mmG                ; mmE=(22 03 13 23 04 14 24 05)
    punpckldq   mmC, mmF                ; mmC=(15 25 06 16 26 07 17 27)

    ; mmA, mmE, mmC now hold 8 consecutive 3-byte pixels.
    cmp         ecx, byte SIZEOF_MMWORD
    jb          short .column_st16      ; fewer than 8 pixels left: partial store

    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
    movq        MMWORD [edi+1*SIZEOF_MMWORD], mmE
    movq        MMWORD [edi+2*SIZEOF_MMWORD], mmC

    sub         ecx, byte SIZEOF_MMWORD
    jz          near .endcolumn         ; row finished exactly

    add         edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD  ; output pointer
    add         esi, byte SIZEOF_MMWORD                ; Y pointer
    dec         al                                     ; Y passes left
    jnz         near .Yloop_2nd

    add         ebx, byte SIZEOF_MMWORD                ; Cb pointer
    add         edx, byte SIZEOF_MMWORD                ; Cr pointer
    jmp         near .columnloop
    alignx      16, 7

.column_st16:
    ; Tail: ecx (< 8 pixels) is converted to a byte count (< 24) and the
    ; bytes are written in descending power-of-two pieces.
    lea         ecx, [ecx+ecx*2]        ; ecx = pixels * 3 = bytes remaining
    cmp         ecx, byte 2*SIZEOF_MMWORD
    jb          short .column_st8
    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
    movq        MMWORD [edi+1*SIZEOF_MMWORD], mmE
    movq        mmA, mmC                ; the rest lives in the third MMWORD
    sub         ecx, byte 2*SIZEOF_MMWORD
    add         edi, byte 2*SIZEOF_MMWORD
    jmp         short .column_st4       ; fewer than one MMWORD remains
.column_st8:
    cmp         ecx, byte SIZEOF_MMWORD
    jb          short .column_st4
    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
    movq        mmA, mmE                ; the rest lives in the second MMWORD
    sub         ecx, byte SIZEOF_MMWORD
    add         edi, byte SIZEOF_MMWORD
.column_st4:
    movd        eax, mmA                ; eax = next 4 bytes (needed by every
                                        ;  smaller store below as well)
    cmp         ecx, byte SIZEOF_DWORD
    jb          short .column_st2
    mov         dword [edi+0*SIZEOF_DWORD], eax
    psrlq       mmA, DWORD_BIT
    movd        eax, mmA                ; eax = the following 4 bytes
    sub         ecx, byte SIZEOF_DWORD
    add         edi, byte SIZEOF_DWORD
.column_st2:
    cmp         ecx, byte SIZEOF_WORD
    jb          short .column_st1
    mov         word [edi+0*SIZEOF_WORD], ax
    shr         eax, WORD_BIT           ; expose the next byte in al
    sub         ecx, byte SIZEOF_WORD
    add         edi, byte SIZEOF_WORD
.column_st1:
    cmp         ecx, byte SIZEOF_BYTE
    jb          short .endcolumn
    mov         byte [edi+0*SIZEOF_BYTE], al

%else  ; RGB_PIXELSIZE != 3 (the stores below emit 4 bytes per pixel) ------

    ; mm6/mm7 become the fourth (filler) channel for even/odd pixels.
%ifdef RGBX_FILLER_0XFF
    pcmpeqb     mm6, mm6                ; filler = 0xFF
    pcmpeqb     mm7, mm7
%else
    pxor        mm6, mm6                ; filler = 0x00
    pxor        mm7, mm7
%endif

    ; Same "cp" notation as the 3-byte case.  Assumed on entry (alias
    ; mapping comes from the include -- confirm):
    ;   mmA=(00 02 04 06 ** ** ** **)  mmB=(01 03 05 07 ** ** ** **)
    ;   mmC=(10 12 14 16 ** ** ** **)  mmD=(11 13 15 17 ** ** ** **)
    ;   mmE=(20 22 24 26 ** ** ** **)  mmF=(21 23 25 27 ** ** ** **)
    ;   mmG=(30 32 34 36 ** ** ** **)  mmH=(31 33 35 37 ** ** ** **)

    punpcklbw   mmA, mmC                ; mmA=(00 10 02 12 04 14 06 16)
    punpcklbw   mmE, mmG                ; mmE=(20 30 22 32 24 34 26 36)
    punpcklbw   mmB, mmD                ; mmB=(01 11 03 13 05 15 07 17)
    punpcklbw   mmF, mmH                ; mmF=(21 31 23 33 25 35 27 37)

    movq        mmC, mmA
    punpcklwd   mmA, mmE                ; mmA=(00 10 20 30 02 12 22 32)
    punpckhwd   mmC, mmE                ; mmC=(04 14 24 34 06 16 26 36)
    movq        mmG, mmB
    punpcklwd   mmB, mmF                ; mmB=(01 11 21 31 03 13 23 33)
    punpckhwd   mmG, mmF                ; mmG=(05 15 25 35 07 17 27 37)

    movq        mmD, mmA
    punpckldq   mmA, mmB                ; mmA=(00 10 20 30 01 11 21 31)
    punpckhdq   mmD, mmB                ; mmD=(02 12 22 32 03 13 23 33)
    movq        mmH, mmC
    punpckldq   mmC, mmG                ; mmC=(04 14 24 34 05 15 25 35)
    punpckhdq   mmH, mmG                ; mmH=(06 16 26 36 07 17 27 37)

    ; mmA, mmD, mmC, mmH now hold 8 consecutive 4-byte pixels.
    cmp         ecx, byte SIZEOF_MMWORD
    jb          short .column_st16      ; fewer than 8 pixels left: partial store

    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
    movq        MMWORD [edi+1*SIZEOF_MMWORD], mmD
    movq        MMWORD [edi+2*SIZEOF_MMWORD], mmC
    movq        MMWORD [edi+3*SIZEOF_MMWORD], mmH

    sub         ecx, byte SIZEOF_MMWORD
    jz          short .endcolumn        ; row finished exactly

    add         edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD  ; output pointer
    add         esi, byte SIZEOF_MMWORD                ; Y pointer
    dec         al                                     ; Y passes left
    jnz         near .Yloop_2nd

    add         ebx, byte SIZEOF_MMWORD                ; Cb pointer
    add         edx, byte SIZEOF_MMWORD                ; Cr pointer
    jmp         near .columnloop
    alignx      16, 7

.column_st16:
    ; Tail: ecx stays a pixel count here (< 8); two pixels fill one MMWORD.
    cmp         ecx, byte SIZEOF_MMWORD/2
    jb          short .column_st8
    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA      ; pixels 0-1
    movq        MMWORD [edi+1*SIZEOF_MMWORD], mmD      ; pixels 2-3
    movq        mmA, mmC                ; slide pixels 4-7 into mmA/mmD
    movq        mmD, mmH
    sub         ecx, byte SIZEOF_MMWORD/2
    add         edi, byte 2*SIZEOF_MMWORD
.column_st8:
    cmp         ecx, byte SIZEOF_MMWORD/4
    jb          short .column_st4
    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA      ; two pixels
    movq        mmA, mmD
    sub         ecx, byte SIZEOF_MMWORD/4
    add         edi, byte 1*SIZEOF_MMWORD
.column_st4:
    cmp         ecx, byte SIZEOF_MMWORD/8
    jb          short .endcolumn
    movd        dword [edi+0*SIZEOF_DWORD], mmA        ; last single pixel

%endif  ; RGB_PIXELSIZE ----------------------------------------------------

.endcolumn:
    emms                                ; leave MMX state

.return:
    ; ---- epilogue: mirror of the prologue ---------------------------------
    pop         edi
    pop         esi
    pop         ebx
    mov         esp, ebp                ; esp -> slot holding the unaligned pointer
    pop         esp                     ; esp = pointer to the saved ebp
    pop         ebp
    ret
; --------------------------------------------------------------------------
; Argument offsets for jsimd_h2v2_merged_upsample_mmx.
;
; (b) is that routine's ebp after the usual `push ebp / mov ebp, esp`, so
; +0 is the saved ebp, +4 the return address and +8 the first argument.
; The values repeat the definitions made earlier in this file; the type in
; each comment is the size tag used when the argument is loaded.

%define output_width(b)      (b) + 8    ; JDIMENSION
%define input_buf(b)         (b) + 12   ; JSAMPIMAGE
%define in_row_group_ctr(b)  (b) + 16   ; JDIMENSION
%define output_buf(b)        (b) + 20   ; JSAMPARRAY
; --------------------------------------------------------------------------
; jsimd_h2v2_merged_upsample_mmx(JDIMENSION output_width,
;                                JSAMPIMAGE input_buf,
;                                JDIMENSION in_row_group_ctr,
;                                JSAMPARRAY output_buf)
;
; 32-bit x86; arguments on the stack, caller removes them.
;
; Produces two output rows by calling jsimd_h2v1_merged_upsample_mmx twice.
; Both calls use the same row (index in_row_group_ctr) of planes 1 and 2;
; plane 0 supplies rows 2*in_row_group_ctr and 2*in_row_group_ctr+1, and
; the results go to output_buf[0] and output_buf[1] respectively.
;
; How it is done: a three-entry plane table is built on the stack and
; passed as the callee's input_buf.  Its plane-0 entry is pre-advanced by
; in_row_group_ctr row slots, so when the callee indexes it with
; in_row_group_ctr again it lands on row 2*in_row_group_ctr.  For the
; second call only two stack slots are patched in place: the plane-0 entry
; (one row slot further) and the output_buf argument (one row slot further).
;
; This relies on the callee preserving ebx, esi and edi and leaving its
; argument slots unmodified; the h2v1 routine in this file does both.
;
; Preserved: ebx, esi, edi, ebp.  eax, ecx, edx are used as scratch.
; --------------------------------------------------------------------------

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_merged_upsample_mmx)

EXTN(jsimd_h2v2_merged_upsample_mmx):
    push        ebp
    mov         ebp, esp
    push        ebx
    push        esi
    push        edi

    mov         eax, JDIMENSION [output_width(ebp)]

    mov         edi, JSAMPIMAGE [input_buf(ebp)]
    mov         ecx, JDIMENSION [in_row_group_ctr(ebp)]
    mov         esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]  ; plane 0 row table
    mov         ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]  ; plane 1 row table
    mov         edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]  ; plane 2 row table
    mov         edi, JSAMPARRAY [output_buf(ebp)]
    lea         esi, [esi+ecx*SIZEOF_JSAMPROW]  ; plane 0 table, pre-advanced

    ; Temporary plane table (pushed last-to-first so entry 0 is lowest).
    push        edx                     ; entry 2: plane 2 row table
    push        ebx                     ; entry 1: plane 1 row table
    push        esi                     ; entry 0: advanced plane 0 row table
    mov         ebx, esp                ; ebx -> temporary plane table

    ; Arguments for the h2v1 routine, last to first.
    push        edi                     ; output_buf   (slot is at ebx-4)
    push        ecx                     ; in_row_group_ctr
    push        ebx                     ; input_buf = temporary plane table
    push        eax                     ; output_width

    call        near EXTN(jsimd_h2v1_merged_upsample_mmx)  ; first output row

    add         esi, byte SIZEOF_JSAMPROW  ; next plane-0 row slot
    add         edi, byte SIZEOF_JSAMPROW  ; next output row slot
    mov         POINTER [ebx+0*SIZEOF_POINTER], esi  ; patch table entry 0
    mov         POINTER [ebx-1*SIZEOF_POINTER], edi  ; patch output_buf argument

    call        near EXTN(jsimd_h2v1_merged_upsample_mmx)  ; second output row

    add         esp, byte 7*SIZEOF_DWORD  ; drop 4 arguments + 3 table entries

    pop         edi
    pop         esi
    pop         ebx
    pop         ebp
    ret

; Pad the end of this code to a 32-byte boundary.  NOTE(review): the reason
; for the trailing alignment is not visible in this file -- confirm before
; removing it.
    align       32
|