123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575 |
- %include "jcolsamp.inc"
- %define output_width(b) (b) + 8 ; 1st stack argument (read as JDIMENSION); b must point at the saved ebp
- %define input_buf(b) (b) + 12 ; 2nd stack argument (read as JSAMPIMAGE)
- %define in_row_group_ctr(b) (b) + 16 ; 3rd stack argument (read as JDIMENSION)
- %define output_buf(b) (b) + 20 ; 4th stack argument (read as JSAMPARRAY)
- %define original_ebp ebp + 0 ; [ebp] is where the h2v1 prologue stores the pre-alignment stack pointer
- %define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_YMMWORD ; address of YMM-sized scratch slot i; the slots sit directly below ebp
-
- %define WK_NUM 3 ; number of wk() scratch slots
- %define gotptr wk(0) - SIZEOF_POINTER ; pointer-sized slot immediately below wk(0), used to keep the GOT address
- ; --------------------------------------------------------------------------
- ; jsimd_h2v1_merged_upsample_avx2
- ;
- ; Merged chroma upsampling + colour conversion for one output row where each
- ; chroma sample is shared by two horizontally adjacent luma samples.
- ;
- ; 32-bit x86, all four arguments on the stack (offsets come from the
- ; output_width()/input_buf()/in_row_group_ctr()/output_buf() macros):
- ;   output_width      JDIMENSION  pixels to produce; 0 returns immediately
- ;   input_buf         JSAMPIMAGE  three component row arrays
- ;   in_row_group_ctr  JDIMENSION  row index applied to each component array
- ;   output_buf        JSAMPARRAY  output_buf[0] receives the packed pixels
- ;
- ; Register roles inside the loops:
- ;   esi = component-0 (luma) row      ebx = component-1 row
- ;   edx = component-2 row             edi = output pointer
- ;   ecx = pixels still to write       al  = luma passes left for this chroma block
- ; Preserves ebx, esi, edi, ebp.  Clobbers eax, ecx, edx, ymm0-ymm7, flags.
- ;
- ; NOTE(review): ymmA..ymmH, the PW_*/PD_* constants, SCALEBITS and the
- ; get_GOT/pushpic/movpic/GOTOFF/alignx macros are defined outside this chunk
- ; (presumably via jcolsamp.inc and the common SIMD include); comments about
- ; them below are inferred from how they are used here.
- ; --------------------------------------------------------------------------
- align 32
- GLOBAL_FUNCTION(jsimd_h2v1_merged_upsample_avx2)
- EXTN(jsimd_h2v1_merged_upsample_avx2):
- push ebp
- mov eax, esp ; eax -> saved ebp; also serves as the argument base below
- sub esp, byte 4 ; room for the saved stack pointer
- and esp, byte (-SIZEOF_YMMWORD) ; round esp down to a SIZEOF_YMMWORD boundary (vmovdqa on wk() needs it)
- mov [esp], eax ; remember the pre-alignment stack pointer
- mov ebp, esp ; ebp = aligned frame base
- lea esp, [wk(0)] ; reserve WK_NUM YMM-sized scratch slots
- pushpic eax ; reserve the gotptr slot (value pushed is irrelevant)
- push ebx
- push esi
- push edi
- get_GOT ebx
- movpic POINTER [gotptr], ebx ; keep the GOT address in the frame; ebx is reused below
- mov ecx, JDIMENSION [output_width(eax)]
- test ecx, ecx
- jz near .return ; nothing to do for a zero-width row
- push ecx ; ecx is needed as the row index for a moment
- mov edi, JSAMPIMAGE [input_buf(eax)]
- mov ecx, JDIMENSION [in_row_group_ctr(eax)]
- mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
- mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
- mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
- mov edi, JSAMPARRAY [output_buf(eax)]
- mov esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW] ; esi = input_buf[0][in_row_group_ctr]
- mov ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW] ; ebx = input_buf[1][in_row_group_ctr]
- mov edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW] ; edx = input_buf[2][in_row_group_ctr]
- mov edi, JSAMPROW [edi] ; edi = output_buf[0]
- pop ecx ; ecx = output_width again
- alignx 16, 7
- .columnloop:
- ; One iteration consumes SIZEOF_YMMWORD samples of each chroma row and
- ; derives per-sample colour offsets that are then added to two
- ; consecutive YMM words of luma (.Yloop_1st, then .Yloop_2nd).
- movpic eax, POINTER [gotptr] ; eax = GOT address (al is reused as a counter later)
- vmovdqu ymm6, YMMWORD [ebx] ; component-1 samples (Cb for YCbCr input)
- vmovdqu ymm7, YMMWORD [edx] ; component-2 samples (Cr for YCbCr input)
- vpxor ymm1, ymm1, ymm1 ; ymm1 = 0, used to zero-extend bytes to words
- vpcmpeqw ymm3, ymm3, ymm3
- vpsllw ymm3, ymm3, 7 ; every word = 0xFF80 (-128)
- vpermq ymm6, ymm6, 0xd8 ; qword order 0,2,1,3 so the in-lane unpacks below
- vpermq ymm7, ymm7, 0xd8 ;   yield the first / last half of the samples in order
- vpunpcklbw ymm4, ymm6, ymm1 ; ymm4 = C1 low half as words
- vpunpckhbw ymm6, ymm6, ymm1 ; ymm6 = C1 high half as words
- vpunpcklbw ymm0, ymm7, ymm1 ; ymm0 = C2 low half as words
- vpunpckhbw ymm7, ymm7, ymm1 ; ymm7 = C2 high half as words
- vpaddw ymm5, ymm6, ymm3 ; ymm5 = C1H - 128
- vpaddw ymm2, ymm4, ymm3 ; ymm2 = C1L - 128
- vpaddw ymm1, ymm7, ymm3 ; ymm1 = C2H - 128
- vpaddw ymm3, ymm0, ymm3 ; ymm3 = C2L - 128
-
- ; The offsets are built from the centred chroma values as
- ;   (B-Y) = C1 * PW_MF0228 + C1 + C1
- ;   (R-Y) = C2 * PW_F0402  + C2
- ;   (G-Y) = C1 * k1 + C2 * k2 - C2      (k1,k2 = word pair PW_MF0344_F0285)
- ; NOTE(review): the constant names suggest -0.228, 0.402, -0.344 and 0.285,
- ; i.e. the usual YCbCr->RGB factors 1.772, 1.402, -0.344, -0.714 with the
- ; integer parts split off -- confirm against the constant table.
-
- vpaddw ymm6, ymm5, ymm5 ; doubled first so vpmulhw keeps one extra bit
- vpaddw ymm4, ymm2, ymm2
- vpaddw ymm7, ymm1, ymm1
- vpaddw ymm0, ymm3, ymm3
- vpmulhw ymm6, ymm6, [GOTOFF(eax,PW_MF0228)]
- vpmulhw ymm4, ymm4, [GOTOFF(eax,PW_MF0228)]
- vpmulhw ymm7, ymm7, [GOTOFF(eax,PW_F0402)]
- vpmulhw ymm0, ymm0, [GOTOFF(eax,PW_F0402)]
- vpaddw ymm6, ymm6, [GOTOFF(eax,PW_ONE)] ; +PW_ONE then >>1: rounded halving of the doubled product
- vpaddw ymm4, ymm4, [GOTOFF(eax,PW_ONE)]
- vpsraw ymm6, ymm6, 1
- vpsraw ymm4, ymm4, 1
- vpaddw ymm7, ymm7, [GOTOFF(eax,PW_ONE)]
- vpaddw ymm0, ymm0, [GOTOFF(eax,PW_ONE)]
- vpsraw ymm7, ymm7, 1
- vpsraw ymm0, ymm0, 1
- vpaddw ymm6, ymm6, ymm5
- vpaddw ymm4, ymm4, ymm2
- vpaddw ymm6, ymm6, ymm5 ; ymm6 = (B-Y) high half
- vpaddw ymm4, ymm4, ymm2 ; ymm4 = (B-Y) low half
- vpaddw ymm7, ymm7, ymm1 ; ymm7 = (R-Y) high half
- vpaddw ymm0, ymm0, ymm3 ; ymm0 = (R-Y) low half
- vmovdqa YMMWORD [wk(0)], ymm6 ; high halves are parked for the second luma pass
- vmovdqa YMMWORD [wk(1)], ymm7
- vpunpckhwd ymm6, ymm5, ymm1 ; interleave (C1,C2) word pairs for vpmaddwd
- vpunpcklwd ymm5, ymm5, ymm1
- vpmaddwd ymm5, ymm5, [GOTOFF(eax,PW_MF0344_F0285)]
- vpmaddwd ymm6, ymm6, [GOTOFF(eax,PW_MF0344_F0285)]
- vpunpckhwd ymm7, ymm2, ymm3
- vpunpcklwd ymm2, ymm2, ymm3
- vpmaddwd ymm2, ymm2, [GOTOFF(eax,PW_MF0344_F0285)]
- vpmaddwd ymm7, ymm7, [GOTOFF(eax,PW_MF0344_F0285)]
- vpaddd ymm5, ymm5, [GOTOFF(eax,PD_ONEHALF)] ; round, then drop the SCALEBITS fraction
- vpaddd ymm6, ymm6, [GOTOFF(eax,PD_ONEHALF)]
- vpsrad ymm5, ymm5, SCALEBITS
- vpsrad ymm6, ymm6, SCALEBITS
- vpaddd ymm2, ymm2, [GOTOFF(eax,PD_ONEHALF)]
- vpaddd ymm7, ymm7, [GOTOFF(eax,PD_ONEHALF)]
- vpsrad ymm2, ymm2, SCALEBITS
- vpsrad ymm7, ymm7, SCALEBITS
- vpackssdw ymm5, ymm5, ymm6 ; back to words
- vpackssdw ymm2, ymm2, ymm7
- vpsubw ymm5, ymm5, ymm1 ; ymm5 = (G-Y) high half
- vpsubw ymm2, ymm2, ymm3 ; ymm2 = (G-Y) low half
- vmovdqa YMMWORD [wk(2)], ymm5
- mov al, 2 ; two luma passes per chroma block
- jmp short .Yloop_1st ; low-half offsets are already in ymm0/ymm2/ymm4
- alignx 16, 7
- .Yloop_2nd:
- vmovdqa ymm0, YMMWORD [wk(1)] ; (R-Y) high half
- vmovdqa ymm2, YMMWORD [wk(2)] ; (G-Y) high half
- vmovdqa ymm4, YMMWORD [wk(0)] ; (B-Y) high half
- alignx 16, 7
- .Yloop_1st:
- vmovdqu ymm7, YMMWORD [esi] ; SIZEOF_YMMWORD luma samples
- vpcmpeqw ymm6, ymm6, ymm6
- vpsrlw ymm6, ymm6, BYTE_BIT ; word mask keeping the low byte of each word
- vpand ymm6, ymm6, ymm7 ; ymm6 = even-indexed luma samples as words
- vpsrlw ymm7, ymm7, BYTE_BIT ; ymm7 = odd-indexed luma samples as words
- vmovdqa ymm1, ymm0 ; each offset is applied to an even and an odd pixel
- vmovdqa ymm3, ymm2
- vmovdqa ymm5, ymm4
- vpaddw ymm0, ymm0, ymm6 ; R even
- vpaddw ymm1, ymm1, ymm7 ; R odd
- vpackuswb ymm0, ymm0, ymm0 ; saturate to unsigned bytes
- vpackuswb ymm1, ymm1, ymm1
- vpaddw ymm2, ymm2, ymm6 ; G even
- vpaddw ymm3, ymm3, ymm7 ; G odd
- vpackuswb ymm2, ymm2, ymm2
- vpackuswb ymm3, ymm3, ymm3
- vpaddw ymm4, ymm4, ymm6 ; B even
- vpaddw ymm5, ymm5, ymm7 ; B odd
- vpackuswb ymm4, ymm4, ymm4
- vpackuswb ymm5, ymm5, ymm5
- %if RGB_PIXELSIZE == 3
-
- ; 3 bytes per pixel: interleave the six byte vectors (even/odd of each
- ; colour) into three YMM words of packed pixels, ymmA, ymmD, ymmF.
- ; The instruction order below is significant; do not reorder.
-
- vpunpcklbw ymmA, ymmA, ymmC
- vpunpcklbw ymmE, ymmE, ymmB
- vpunpcklbw ymmD, ymmD, ymmF
- vpsrldq ymmH, ymmA, 2
- vpunpckhwd ymmG, ymmA, ymmE
- vpunpcklwd ymmA, ymmA, ymmE
- vpsrldq ymmE, ymmE, 2
- vpsrldq ymmB, ymmD, 2
- vpunpckhwd ymmC, ymmD, ymmH
- vpunpcklwd ymmD, ymmD, ymmH
- vpunpckhwd ymmF, ymmE, ymmB
- vpunpcklwd ymmE, ymmE, ymmB
- vpshufd ymmH, ymmA, 0x4E
- vpunpckldq ymmA, ymmA, ymmD
- vpunpckhdq ymmD, ymmD, ymmE
- vpunpckldq ymmE, ymmE, ymmH
- vpshufd ymmH, ymmG, 0x4E
- vpunpckldq ymmG, ymmG, ymmC
- vpunpckhdq ymmC, ymmC, ymmF
- vpunpckldq ymmF, ymmF, ymmH
- vpunpcklqdq ymmH, ymmA, ymmE
- vpunpcklqdq ymmG, ymmD, ymmG
- vpunpcklqdq ymmC, ymmF, ymmC
- vperm2i128 ymmA, ymmH, ymmG, 0x20
- vperm2i128 ymmD, ymmC, ymmH, 0x30
- vperm2i128 ymmF, ymmG, ymmC, 0x31
-
- cmp ecx, byte SIZEOF_YMMWORD
- jb short .column_st64 ; fewer than a full group of pixels left: partial store
- test edi, SIZEOF_YMMWORD-1
- jnz short .out1 ; unaligned destination: use ordinary stores
- ; aligned destination: non-temporal stores (ordered by the sfence at .endcolumn)
- vmovntdq YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
- vmovntdq YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
- vmovntdq YMMWORD [edi+2*SIZEOF_YMMWORD], ymmF
- jmp short .out0
- .out1:
- vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
- vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
- vmovdqu YMMWORD [edi+2*SIZEOF_YMMWORD], ymmF
- .out0:
- add edi, byte RGB_PIXELSIZE*SIZEOF_YMMWORD
- sub ecx, byte SIZEOF_YMMWORD
- jz near .endcolumn ; row finished exactly
- add esi, byte SIZEOF_YMMWORD
- dec al
- jnz near .Yloop_2nd ; second luma pass reuses the parked high-half offsets
- add ebx, byte SIZEOF_YMMWORD ; both passes done: advance the chroma rows
- add edx, byte SIZEOF_YMMWORD
- jmp near .columnloop
- alignx 16, 7
- .column_st64:
- ; Partial tail.  From here on ecx counts BYTES, and ymmA always holds the
- ; next bytes to be written.
- lea ecx, [ecx+ecx*2] ; ecx = remaining pixels * 3
- cmp ecx, byte 2*SIZEOF_YMMWORD
- jb short .column_st32
- vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
- vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
- add edi, byte 2*SIZEOF_YMMWORD
- vmovdqa ymmA, ymmF
- sub ecx, byte 2*SIZEOF_YMMWORD
- jmp short .column_st31
- .column_st32:
- cmp ecx, byte SIZEOF_YMMWORD
- jb short .column_st31
- vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
- add edi, byte SIZEOF_YMMWORD
- vmovdqa ymmA, ymmD
- sub ecx, byte SIZEOF_YMMWORD
- ; falls through to .column_st31
- .column_st31:
- cmp ecx, byte SIZEOF_XMMWORD
- jb short .column_st15
- vmovdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
- add edi, byte SIZEOF_XMMWORD
- vperm2i128 ymmA, ymmA, ymmA, 1 ; bring the upper 128 bits down into xmmA
- sub ecx, byte SIZEOF_XMMWORD
- .column_st15:
- ; store SIZEOF_MMWORD bytes from the bottom of xmmA if that many remain
- cmp ecx, byte SIZEOF_MMWORD
- jb short .column_st7
- vmovq XMM_MMWORD [edi], xmmA
- add edi, byte SIZEOF_MMWORD
- sub ecx, byte SIZEOF_MMWORD
- vpsrldq xmmA, xmmA, SIZEOF_MMWORD
- .column_st7:
- ; store SIZEOF_DWORD bytes from the bottom of xmmA if that many remain
- cmp ecx, byte SIZEOF_DWORD
- jb short .column_st3
- vmovd XMM_DWORD [edi], xmmA
- add edi, byte SIZEOF_DWORD
- sub ecx, byte SIZEOF_DWORD
- vpsrldq xmmA, xmmA, SIZEOF_DWORD
- .column_st3:
- ; the last few bytes go through eax (the GOT pointer is not needed any more)
- vmovd eax, xmmA
- cmp ecx, byte SIZEOF_WORD
- jb short .column_st1
- mov word [edi], ax
- add edi, byte SIZEOF_WORD
- sub ecx, byte SIZEOF_WORD
- shr eax, 16
- .column_st1:
- ; at most one byte is left
- test ecx, ecx
- jz short .endcolumn
- mov byte [edi], al
- %else
- ; 4 bytes per pixel: ymm6/ymm7 supply the filler byte for even/odd pixels
- %ifdef RGBX_FILLER_0XFF
- vpcmpeqb ymm6, ymm6, ymm6 ; filler = 0xFF
- vpcmpeqb ymm7, ymm7, ymm7
- %else
- vpxor ymm6, ymm6, ymm6 ; filler = 0x00
- vpxor ymm7, ymm7, ymm7
- %endif
-
- ; Interleave the eight byte vectors into four YMM words of packed
- ; pixels, ymmA, ymmD, ymmC, ymmH.  Instruction order is significant.
-
- vpunpcklbw ymmA, ymmA, ymmC
- vpunpcklbw ymmE, ymmE, ymmG
- vpunpcklbw ymmB, ymmB, ymmD
- vpunpcklbw ymmF, ymmF, ymmH
- vpunpckhwd ymmC, ymmA, ymmE
- vpunpcklwd ymmA, ymmA, ymmE
- vpunpckhwd ymmG, ymmB, ymmF
- vpunpcklwd ymmB, ymmB, ymmF
- vpunpckhdq ymmE, ymmA, ymmB
- vpunpckldq ymmB, ymmA, ymmB
- vpunpckhdq ymmF, ymmC, ymmG
- vpunpckldq ymmG, ymmC, ymmG
- vperm2i128 ymmA, ymmB, ymmE, 0x20
- vperm2i128 ymmD, ymmG, ymmF, 0x20
- vperm2i128 ymmC, ymmB, ymmE, 0x31
- vperm2i128 ymmH, ymmG, ymmF, 0x31
-
- cmp ecx, byte SIZEOF_YMMWORD
- jb short .column_st64 ; fewer than a full group of pixels left: partial store
- test edi, SIZEOF_YMMWORD-1
- jnz short .out1 ; unaligned destination: use ordinary stores
- ; aligned destination: non-temporal stores (ordered by the sfence at .endcolumn)
- vmovntdq YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
- vmovntdq YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
- vmovntdq YMMWORD [edi+2*SIZEOF_YMMWORD], ymmC
- vmovntdq YMMWORD [edi+3*SIZEOF_YMMWORD], ymmH
- jmp short .out0
- .out1:
- vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
- vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
- vmovdqu YMMWORD [edi+2*SIZEOF_YMMWORD], ymmC
- vmovdqu YMMWORD [edi+3*SIZEOF_YMMWORD], ymmH
- .out0:
- add edi, RGB_PIXELSIZE*SIZEOF_YMMWORD ; no "byte" size override on this immediate, unlike the 3-byte variant
- sub ecx, byte SIZEOF_YMMWORD
- jz near .endcolumn ; row finished exactly
- add esi, byte SIZEOF_YMMWORD
- dec al
- jnz near .Yloop_2nd ; second luma pass reuses the parked high-half offsets
- add ebx, byte SIZEOF_YMMWORD ; both passes done: advance the chroma rows
- add edx, byte SIZEOF_YMMWORD
- jmp near .columnloop
- alignx 16, 7
- .column_st64:
- ; Partial tail.  ecx keeps counting PIXELS (4 bytes each); ymmA always
- ; holds the next pixels to be written.
- cmp ecx, byte SIZEOF_YMMWORD/2
- jb short .column_st32
- vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
- vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
- add edi, byte 2*SIZEOF_YMMWORD
- vmovdqa ymmA, ymmC
- vmovdqa ymmD, ymmH
- sub ecx, byte SIZEOF_YMMWORD/2
- .column_st32:
- cmp ecx, byte SIZEOF_YMMWORD/4
- jb short .column_st16
- vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
- add edi, byte SIZEOF_YMMWORD
- vmovdqa ymmA, ymmD
- sub ecx, byte SIZEOF_YMMWORD/4
- .column_st16:
- cmp ecx, byte SIZEOF_YMMWORD/8
- jb short .column_st15
- vmovdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
- add edi, byte SIZEOF_XMMWORD
- vperm2i128 ymmA, ymmA, ymmA, 1 ; bring the upper 128 bits down into xmmA
- sub ecx, byte SIZEOF_YMMWORD/8
- .column_st15:
- ; store SIZEOF_YMMWORD/16 pixels from the bottom of xmmA if that many remain
- cmp ecx, byte SIZEOF_YMMWORD/16
- jb short .column_st7
- vmovq MMWORD [edi], xmmA
- add edi, byte SIZEOF_YMMWORD/16*4
- sub ecx, byte SIZEOF_YMMWORD/16
- vpsrldq xmmA, xmmA, SIZEOF_YMMWORD/16*4
- .column_st7:
- ; store one final pixel if any is left
- test ecx, ecx
- jz short .endcolumn
- vmovd XMM_DWORD [edi], xmmA
- %endif
- .endcolumn:
- sfence ; make the non-temporal stores globally visible before returning
- .return:
- vzeroupper ; avoid AVX/SSE transition penalties in the caller
- pop edi
- pop esi
- pop ebx
- mov esp, ebp ; esp -> slot holding the pre-alignment stack pointer
- pop esp ; esp -> saved ebp
- pop ebp
- ret
- %define output_width(b) (b) + 8 ; 1st stack argument; b is ebp after the push ebp / mov ebp, esp prologue
- %define input_buf(b) (b) + 12 ; 2nd stack argument (read as JSAMPIMAGE)
- %define in_row_group_ctr(b) (b) + 16 ; 3rd stack argument (read as JDIMENSION)
- %define output_buf(b) (b) + 20 ; 4th stack argument (read as JSAMPARRAY)
- ; --------------------------------------------------------------------------
- ; jsimd_h2v2_merged_upsample_avx2
- ;
- ; Takes the same four stack arguments as the h2v1 routine (output_width,
- ; input_buf, in_row_group_ctr, output_buf) and produces two output rows by
- ; calling jsimd_h2v1_merged_upsample_avx2 twice.  A temporary three-entry
- ; component table is built on the stack so that the component-0 pointer and
- ; the output pointer can be advanced by one row between the calls while
- ; components 1 and 2 stay the same.
- ;
- ; Preserves ebx, esi, edi, ebp.  Clobbers eax, ecx, edx and whatever the
- ; callee clobbers.
- ; --------------------------------------------------------------------------
- align 32
- GLOBAL_FUNCTION(jsimd_h2v2_merged_upsample_avx2)
- EXTN(jsimd_h2v2_merged_upsample_avx2):
- push ebp
- mov ebp, esp
- push ebx
- push esi
- push edi
-
- ; --- build the temporary component table: {comp0 + ctr rows, comp1, comp2}
- mov edi, JSAMPIMAGE [input_buf(ebp)]
- mov ecx, JDIMENSION [in_row_group_ctr(ebp)]
- mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
- mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
- mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
- push edx ; table[2]
- push ebx ; table[1]
- lea esi, [esi+ecx*SIZEOF_JSAMPROW] ; pre-offset by ctr rows; the callee adds ctr again
- push esi ; table[0]
- mov ebx, esp ; ebx -> table (ebx survives the calls)
-
- ; --- push the callee's four arguments, last one first
- mov edi, JSAMPARRAY [output_buf(ebp)]
- mov eax, POINTER [output_width(ebp)]
- push edi ; output_buf, stored at [ebx-1*SIZEOF_POINTER]
- push ecx ; in_row_group_ctr
- push ebx ; input_buf = temporary table
- push eax ; output_width
- call near EXTN(jsimd_h2v1_merged_upsample_avx2) ; first output row
-
- ; --- step component 0 and the output array by one row pointer, then repeat
- add edi, byte SIZEOF_JSAMPROW
- add esi, byte SIZEOF_JSAMPROW
- mov POINTER [ebx-1*SIZEOF_POINTER], edi ; patch the output_buf argument in place
- mov POINTER [ebx+0*SIZEOF_POINTER], esi ; patch table[0]
- call near EXTN(jsimd_h2v1_merged_upsample_avx2) ; second output row
-
- add esp, byte 7*SIZEOF_DWORD ; drop 4 arguments + 3 table entries
- pop edi
- pop esi
- pop ebx
- pop ebp
- ret
- align 32
|