123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660 |
- %include "jsimdext.inc"
- SECTION SEG_TEXT
- BITS 32
; --------------------------------------------------------------------------
; LOAD16: gather 16 coefficients into X0 (lanes 0-7) and X1 (lanes 8-15).
;
; For k = 0..15 the 32-bit index LUT[k] is read and the 16-bit word at
; BLOCK + 2*LUT[k] is inserted into lane (k mod 8) of X0 (k < 8) or X1
; (k >= 8).  Every lane of X0 and X1 is written, so neither register needs
; to be cleared first.  N0 and N1 are zeroed for the caller's use.
;
; Relies on the invoking function's aliases LUT, BLOCK, T0, T1, X0, X1, N0
; and N1.  Clobbers T0 and T1.  LOAD_LANE is a scratch preprocessor counter
; that exists only while the macro expands.
; --------------------------------------------------------------------------
%macro LOAD16 0
    pxor        N0, N0
    pxor        N1, N1

  %assign LOAD_LANE 0
  %rep 8
    ; One lane of X0 and the matching lane of X1 per repetition.
    mov         T0, INT [LUT + (LOAD_LANE + 0)*SIZEOF_INT]
    mov         T1, INT [LUT + (LOAD_LANE + 8)*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], LOAD_LANE
    pinsrw      X1, word [BLOCK + T1 * 2], LOAD_LANE
    %assign LOAD_LANE LOAD_LANE + 1
  %endrep
  %undef LOAD_LANE
%endmacro
; --------------------------------------------------------------------------
; LOAD15: gather 8 + LENEND coefficients (tail of 9..15 elements).
;
; X0 receives all eight lanes from LUT[0..7].  X1 is cleared, lane 0 is
; always filled from LUT[8], and lane j (1 <= j <= 6) is filled from
; LUT[8 + j] only while LENEND > j; lanes that are skipped stay zero.
; N0 and N1 are zeroed for the caller's use.
;
; LENEND is compared as a signed value (jl).  Clobbers T0, T1 and the flags.
; LOAD_LANE is a scratch preprocessor counter that exists only while the
; macro expands.
; --------------------------------------------------------------------------
%macro LOAD15 0
    pxor        N0, N0
    pxor        N1, N1
    pxor        X1, X1

    ; Lane 0 of both vectors is loaded unconditionally.
    mov         T0, INT [LUT + 0*SIZEOF_INT]
    mov         T1, INT [LUT + 8*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 0
    pinsrw      X1, word [BLOCK + T1 * 2], 0

    ; Remaining seven lanes of X0.
  %assign LOAD_LANE 1
  %rep 7
    mov         T0, INT [LUT + LOAD_LANE*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], LOAD_LANE
    %assign LOAD_LANE LOAD_LANE + 1
  %endrep

    ; Lanes 1..6 of X1, each guarded by the remaining length.
  %assign LOAD_LANE 1
  %rep 6
    cmp         LENEND, LOAD_LANE + 1
    jl          %%.ELOAD15
    mov         T1, INT [LUT + (LOAD_LANE + 8)*SIZEOF_INT]
    pinsrw      X1, word [BLOCK + T1 * 2], LOAD_LANE
    %assign LOAD_LANE LOAD_LANE + 1
  %endrep
%%.ELOAD15:
  %undef LOAD_LANE
%endmacro
; --------------------------------------------------------------------------
; LOAD8: gather exactly eight coefficients into X0.
;
; Lane k (0..7) of X0 is loaded from the word at BLOCK + 2*LUT[k].  All
; lanes are written, so X0 is not cleared beforehand.  N0 is zeroed for the
; caller's use.  Clobbers T0.  LOAD_LANE is a scratch preprocessor counter
; that exists only while the macro expands.
; --------------------------------------------------------------------------
%macro LOAD8 0
    pxor        N0, N0

  %assign LOAD_LANE 0
  %rep 8
    mov         T0, INT [LUT + LOAD_LANE*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], LOAD_LANE
    %assign LOAD_LANE LOAD_LANE + 1
  %endrep
  %undef LOAD_LANE
%endmacro
; --------------------------------------------------------------------------
; LOAD7: gather up to seven coefficients into X0 (tail shorter than 8).
;
; X0 is cleared, lane 0 is always filled from LUT[0], and lane j
; (1 <= j <= 6) is filled from LUT[j] only while LENEND > j; lanes that are
; skipped stay zero.  N0 is zeroed for the caller's use.
;
; LENEND is compared as a signed value (jl).  Clobbers T1 and the flags.
; LOAD_LANE is a scratch preprocessor counter that exists only while the
; macro expands.
; --------------------------------------------------------------------------
%macro LOAD7 0
    pxor        N0, N0
    pxor        X0, X0

    ; Lane 0 is loaded unconditionally.
    mov         T1, INT [LUT + 0*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 0

    ; Lanes 1..6, each guarded by the remaining length.
  %assign LOAD_LANE 1
  %rep 6
    cmp         LENEND, LOAD_LANE + 1
    jl          %%.ELOAD7
    mov         T1, INT [LUT + LOAD_LANE*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], LOAD_LANE
    %assign LOAD_LANE LOAD_LANE + 1
  %endrep
%%.ELOAD7:
  %undef LOAD_LANE
%endmacro
; --------------------------------------------------------------------------
; REDUCE0: build a 64-bit "nonzero" bitmap from 64 consecutive words.
;
; Reads eight aligned rows of eight 16-bit words starting at VALUES,
; compares every word with zero, condenses the results to one bit per word
; and inverts them, so bit k ends up set when word k is nonzero.  The low
; 32 bits (words 0..31) are stored at [p] and the high 32 bits (words
; 32..63) at [p + SIZEOF_INT], where p is the pointer read from ZEROBITS.
;
; Expects ZERO (xmm7 in both functions of this file) to be all zeroes on
; entry.  Overwrites xmm0-xmm7, eax, ecx, edx, esi and edi; note that this
; destroys ZERO and, because VALUES is edi in this file, VALUES as well.
; --------------------------------------------------------------------------
- %macro REDUCE0 0
; Rows 0..6 are loaded into xmm0..xmm6 (movdqa: VALUES must be 16-byte aligned).
- movdqa xmm0, XMMWORD [VALUES + ( 0*2)]
- movdqa xmm1, XMMWORD [VALUES + ( 8*2)]
- movdqa xmm2, XMMWORD [VALUES + (16*2)]
- movdqa xmm3, XMMWORD [VALUES + (24*2)]
- movdqa xmm4, XMMWORD [VALUES + (32*2)]
- movdqa xmm5, XMMWORD [VALUES + (40*2)]
- movdqa xmm6, XMMWORD [VALUES + (48*2)]
; Each word becomes 0xFFFF where the input word equals zero, else 0x0000.
- pcmpeqw xmm0, ZERO
- pcmpeqw xmm1, ZERO
- pcmpeqw xmm2, ZERO
- pcmpeqw xmm3, ZERO
- pcmpeqw xmm4, ZERO
- pcmpeqw xmm5, ZERO
- pcmpeqw xmm6, ZERO
; Row 7 is compared straight from memory into xmm7; this consumes ZERO.
- pcmpeqw xmm7, XMMWORD [VALUES + (56*2)]
; Narrow the word masks to byte masks, two rows per register.
- packsswb xmm0, xmm1
- packsswb xmm2, xmm3
- packsswb xmm4, xmm5
- packsswb xmm6, xmm7
; Extract one bit per byte: 16 "is zero" bits per register.
- pmovmskb eax, xmm0
- pmovmskb ecx, xmm2
- pmovmskb edx, xmm4
- pmovmskb esi, xmm6
; Assemble two 32-bit masks: eax = words 0..31, edx = words 32..63.
- shl ecx, 16
- shl esi, 16
- or eax, ecx
- or edx, esi
; Invert so that a set bit marks a nonzero word.
- not eax
- not edx
; edi is reused as the destination pointer from here on.
- mov edi, ZEROBITS
- mov INT [edi], eax
- mov INT [edi+SIZEOF_INT], edx
- %endmacro
; --------------------------------------------------------------------------
; jsimd_encode_mcu_AC_first_prepare_sse2  (32-bit x86, SSE2)
;
; Stack arguments, addressed through eax (= esp right after "push ebp"):
;   [eax +  8]  BLOCK     base address of the 16-bit coefficient block
;   [eax + 12]  LUT       table of 32-bit indices into BLOCK
;   [eax + 16]  LEN       number of coefficients to process
;   [eax + 20]  AL        right-shift count applied to every magnitude
;   [eax + 24]  VALUES    output buffer; written with movdqa, so it must be
;                         16-byte aligned
;   [eax + 28]  ZEROBITS  pointer to two INTs that receive the bitmap
; NOTE(review): these presumably map to (block, jpeg_natural_order_start,
; Sl, Al, values, zerobits) in the C prototype -- confirm against the
; declaration.  The tail paths always gather at least one coefficient, so
; LEN is assumed to be >= 1.
;
; Results, with c[k] = word at BLOCK + 2*LUT[k]:
;   VALUES[k]            = |c[k]| >> AL for k < LEN, zero from there up to
;                          DCTSIZE2
;   VALUES[DCTSIZE2 + k] = VALUES[k] XOR (c[k] < 0 ? 0xFFFF : 0), i.e. the
;                          one's complement of the shifted magnitude for
;                          negative inputs.  This half is not zero-padded.
;   ZEROBITS[0..1]       = 64-bit map with bit k set when VALUES[k] != 0
;
; ebx, ecx, esi, edi and ebp are saved and restored; eax, edx and
; xmm0-xmm7 are overwritten.  No value is returned deliberately.
; --------------------------------------------------------------------------

%define ZERO      xmm7
%define X0        xmm0
%define X1        xmm1
%define N0        xmm2
%define N1        xmm3
%define AL        xmm4
%define K         eax
%define LENEND    eax
%define LUT       ebx
%define T0        ecx
%define T1        edx
%define BLOCK     esi
%define VALUES    edi
%define LEN       ebp
%define ZEROBITS  INT [esp + 5 * 4]     ; local slot just above the 5 pushes

    align       32
    GLOBAL_FUNCTION(jsimd_encode_mcu_AC_first_prepare_sse2)

EXTN(jsimd_encode_mcu_AC_first_prepare_sse2):
    push        ebp
    mov         eax, esp                     ; eax = argument base
    sub         esp, byte 4
    and         esp, byte (-SIZEOF_XMMWORD)  ; realign the stack to 16 bytes
    mov         [esp], eax                   ; remember the unaligned esp
    mov         ebp, esp                     ; ebp = aligned frame
    sub         esp, 4                       ; one local: ZEROBITS
    push        ebx
    push        ecx
    push        esi
    push        edi
    push        ebp                          ; ebp is reused as LEN below

    mov         BLOCK, INT [eax + 8]
    mov         LUT, INT [eax + 12]
    mov         VALUES, INT [eax + 24]
    movd        AL, INT [eax + 20]
    mov         T0, INT [eax + 28]
    mov         ZEROBITS, T0
    mov         LEN, INT [eax + 16]
    pxor        ZERO, ZERO

    ; K = number of complete groups of 16 coefficients.
    mov         K, LEN
    and         K, -16
    shr         K, 4
    jz          .ELOOP16

.BLOOP16:
    LOAD16
    pcmpgtw     N0, X0                  ; N = 0xFFFF where the input is < 0
    pcmpgtw     N1, X1
    paddw       X0, N0                  ; (x + N) ^ N  ==  |x|
    paddw       X1, N1
    pxor        X0, N0
    pxor        X1, N1
    psrlw       X0, AL                  ; |x| >> AL
    psrlw       X1, AL
    pxor        N0, X0                  ; N = magnitude, complemented if x < 0
    pxor        N1, X1
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (8) * 2], X1
    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    movdqa      XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
    add         VALUES, 16*2
    add         LUT, 16*SIZEOF_INT
    dec         K
    jnz         .BLOOP16

    ; LEN is an exact multiple of 16: there is no tail to process.
    test        LEN, 15
    je          .PADDING

.ELOOP16:
    ; Tail of 1..15 coefficients.  LENEND = LEN mod 8 drives LOAD15/LOAD7.
    mov         LENEND, LEN
    and         LENEND, 7

    test        LEN, 8
    jz          .TRY7                   ; tail is 1..7
    test        LEN, 7
    jz          .TRY8                   ; tail is exactly 8

    ; Tail is 9..15: two rows are stored, unused lanes are zero.
    LOAD15
    pcmpgtw     N0, X0
    pcmpgtw     N1, X1
    paddw       X0, N0
    paddw       X1, N1
    pxor        X0, N0
    pxor        X1, N1
    psrlw       X0, AL
    psrlw       X1, AL
    pxor        N0, X0
    pxor        N1, X1
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (8) * 2], X1
    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    movdqa      XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
    add         VALUES, 16*2
    jmp         .PADDING

.TRY8:
    LOAD8
    pcmpgtw     N0, X0
    paddw       X0, N0
    pxor        X0, N0
    psrlw       X0, AL
    pxor        N0, X0
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    add         VALUES, 8*2
    jmp         .PADDING

.TRY7:
    LOAD7
    pcmpgtw     N0, X0
    paddw       X0, N0
    pxor        X0, N0
    psrlw       X0, AL
    pxor        N0, X0
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    add         VALUES, 8*2

.PADDING:
    ; K = (rows of 8 already written) - (rows in a full block); it is zero
    ; or negative and is counted up to zero while the remaining rows of the
    ; magnitude half are cleared.
    mov         K, LEN
    add         K, 7
    and         K, -8
    shr         K, 3
    sub         K, DCTSIZE2/8
    jz          .EPADDING
    align       16
.ZEROLOOP:
    movdqa      XMMWORD [VALUES + 0], ZERO
    add         VALUES, 8*2
    inc         K
    jnz         .ZEROLOOP

.EPADDING:
    sub         VALUES, DCTSIZE2*2      ; rewind to the start of the buffer

    REDUCE0                             ; writes the bitmap through ZEROBITS

    pop         ebp                     ; aligned frame pointer
    pop         edi
    pop         esi
    pop         ecx
    pop         ebx
    mov         esp, ebp
    pop         esp                     ; back to the unaligned stack
    pop         ebp
    ret

; Drop every alias defined for this function so that none of them leaks
; into code that follows.
%undef ZERO
%undef X0
%undef X1
%undef N0
%undef N1
%undef AL
%undef K
%undef LENEND
%undef LUT
%undef T0
%undef T1
%undef BLOCK
%undef VALUES
%undef LEN
%undef ZEROBITS
; --------------------------------------------------------------------------
; jsimd_encode_mcu_AC_refine_prepare_sse2  (32-bit x86, SSE2)
;
; Stack arguments, addressed through eax (= esp right after "push ebp"):
;   [eax +  8]  BLOCK   base address of the 16-bit coefficient block
;   [eax + 12]  LUT     table of 32-bit indices into BLOCK
;   [eax + 16]  LEN     number of coefficients to process
;   [eax + 20]  AL      right-shift count applied to every magnitude
;   [eax + 24]  VALUES  output buffer; written with movdqa, so it must be
;                       16-byte aligned
;   [eax + 28]  bits    pointer to four INTs (see below)
; NOTE(review): these presumably map to (block, jpeg_natural_order_start,
; Sl, Al, absvalues, bits) in the C prototype -- confirm against the
; declaration.  The tail paths always gather at least one coefficient, so
; LEN is assumed to be >= 1.
;
; Results, with c[k] = word at BLOCK + 2*LUT[k]:
;   VALUES[k]  = |c[k]| >> AL for k < LEN, zero from there up to DCTSIZE2
;   bits[0..1] = 64-bit map with bit k set when VALUES[k] != 0
;   bits[2..3] = preset to all ones, then overwritten 16 bits per group:
;                bit k is cleared when c[k] is negative
;   eax        = 0 if no VALUES[k] equals 1, otherwise the largest k with
;                VALUES[k] == 1
;
; ebx, ecx, esi, edi and ebp are saved and restored; edx and xmm0-xmm7 are
; overwritten.
; --------------------------------------------------------------------------

%define ZERO      xmm7
%define ONE       xmm5
%define X0        xmm0
%define X1        xmm1
%define N0        xmm2
%define N1        xmm3
%define AL        xmm4
%define K         eax
%define LENEND    eax
%define LUT       ebx
%define T0        ecx
%define T0w       cx
%define T1        edx
%define BLOCK     esi
%define VALUES    edi
%define KK        ebp
; Three of the four local slots sit just above the five pushed registers.
%define ZEROBITS  INT [esp + 5 * 4]
%define EOB       INT [esp + 5 * 4 + 4]
%define LEN       INT [esp + 5 * 4 + 8]

    align       32
    GLOBAL_FUNCTION(jsimd_encode_mcu_AC_refine_prepare_sse2)

EXTN(jsimd_encode_mcu_AC_refine_prepare_sse2):
    push        ebp
    mov         eax, esp                     ; eax = argument base
    sub         esp, byte 4
    and         esp, byte (-SIZEOF_XMMWORD)  ; realign the stack to 16 bytes
    mov         [esp], eax                   ; remember the unaligned esp
    mov         ebp, esp                     ; ebp = aligned frame
    sub         esp, 16                      ; locals: ZEROBITS, EOB, LEN
    push        ebx
    push        ecx
    push        esi
    push        edi
    push        ebp                          ; ebp is reused as KK below

    pcmpeqw     ONE, ONE
    psrlw       ONE, 15                 ; every word of ONE = 1

    mov         BLOCK, INT [eax + 8]
    mov         LUT, INT [eax + 12]
    mov         VALUES, INT [eax + 24]
    movd        AL, INT [eax + 20]
    mov         T0, INT [eax + 28]
    mov         K, INT [eax + 16]       ; K is eax: must be the last load
    mov         INT [T0 + 2 * SIZEOF_INT], -1   ; sign words default to ones
    mov         INT [T0 + 3 * SIZEOF_INT], -1
    mov         ZEROBITS, T0
    mov         LEN, K
    pxor        ZERO, ZERO
    and         K, -16
    mov         EOB, 0
    xor         KK, KK                  ; KK = byte offset into sign words
    shr         K, 4                    ; K = number of full groups of 16
    jz          .ELOOPR16

.BLOOPR16:
    LOAD16
    pcmpgtw     N0, X0                  ; N = 0xFFFF where the input is < 0
    pcmpgtw     N1, X1
    paddw       X0, N0                  ; (x + N) ^ N  ==  |x|
    paddw       X1, N1
    pxor        X0, N0
    pxor        X1, N1
    psrlw       X0, AL                  ; |x| >> AL
    psrlw       X1, AL
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (8) * 2], X1
    pcmpeqw     X0, ONE                 ; mark lanes whose value is exactly 1
    pcmpeqw     X1, ONE
    packsswb    N0, N1
    packsswb    X0, X1
    pmovmskb    T0, N0                  ; 16 sign bits
    mov         T1, ZEROBITS
    not         T0                      ; stored inverted
    mov         word [T1 + 2 * SIZEOF_INT + KK], T0w
    pmovmskb    T1, X0                  ; 16 "value == 1" bits
    bsr         T1, T1                  ; highest such lane; ZF if none
    jz          .CONTINUER16
    lea         T1, [T1+KK*8]           ; KK*8 = index of this group's lane 0
    mov         EOB, T1
.CONTINUER16:
    add         VALUES, 16*2
    add         LUT, 16*SIZEOF_INT
    add         KK, 2
    dec         K
    jnz         .BLOOPR16

    ; LEN is an exact multiple of 16: there is no tail to process.  Running
    ; the tail code with LENEND = 0 would gather one coefficient beyond the
    ; requested range, possibly move EOB, and advance VALUES by an extra
    ; row, which makes the zero fill overrun the buffer and misplaces the
    ; bitmap computed by REDUCE0.
    test        LEN, 15
    je          .PADDINGR

.ELOOPR16:
    ; Tail of 1..15 coefficients.
    mov         LENEND, LEN

    test        LENEND, 8
    jz          .TRYR7                  ; tail is 1..7
    test        LENEND, 7
    jz          .TRYR8                  ; tail is exactly 8

    ; Tail is 9..15: two rows are stored, unused lanes are zero.
    and         LENEND, 7
    LOAD15
    pcmpgtw     N0, X0
    pcmpgtw     N1, X1
    paddw       X0, N0
    paddw       X1, N1
    pxor        X0, N0
    pxor        X1, N1
    psrlw       X0, AL
    psrlw       X1, AL
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (8) * 2], X1
    pcmpeqw     X0, ONE
    pcmpeqw     X1, ONE
    packsswb    N0, N1
    packsswb    X0, X1
    pmovmskb    T0, N0
    mov         T1, ZEROBITS
    not         T0
    mov         word [T1 + 2 * SIZEOF_INT + KK], T0w
    pmovmskb    T1, X0
    bsr         T1, T1
    jz          .CONTINUER15
    lea         T1, [T1+KK*8]
    mov         EOB, T1
.CONTINUER15:
    add         VALUES, 16*2
    jmp         .PADDINGR

.TRYR8:
    LOAD8
    pcmpgtw     N0, X0
    paddw       X0, N0
    pxor        X0, N0
    psrlw       X0, AL
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    pcmpeqw     X0, ONE
    packsswb    N0, ZERO                ; upper eight mask bytes are zero
    packsswb    X0, ZERO
    pmovmskb    T0, N0
    mov         T1, ZEROBITS
    not         T0
    mov         word [T1 + 2 * SIZEOF_INT + KK], T0w
    pmovmskb    T1, X0
    bsr         T1, T1
    jz          .CONTINUER8
    lea         T1, [T1+KK*8]
    mov         EOB, T1
.CONTINUER8:
    add         VALUES, 8*2
    jmp         .PADDINGR

.TRYR7:
    and         LENEND, 7
    LOAD7
    pcmpgtw     N0, X0
    paddw       X0, N0
    pxor        X0, N0
    psrlw       X0, AL
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    pcmpeqw     X0, ONE
    packsswb    N0, ZERO
    packsswb    X0, ZERO
    pmovmskb    T0, N0
    mov         T1, ZEROBITS
    not         T0
    mov         word [T1 + 2 * SIZEOF_INT + KK], T0w
    pmovmskb    T1, X0
    bsr         T1, T1
    jz          .CONTINUER7
    lea         T1, [T1+KK*8]
    mov         EOB, T1
.CONTINUER7:
    add         VALUES, 8*2

.PADDINGR:
    ; K = (rows of 8 already written) - (rows in a full block); it is zero
    ; or negative and is counted up to zero while the remaining rows are
    ; cleared.
    mov         K, LEN
    add         K, 7
    and         K, -8
    shr         K, 3
    sub         K, DCTSIZE2/8
    jz          .EPADDINGR
    align       16
.ZEROLOOPR:
    movdqa      XMMWORD [VALUES + 0], ZERO
    add         VALUES, 8*2
    inc         K
    jnz         .ZEROLOOPR

.EPADDINGR:
    sub         VALUES, DCTSIZE2*2      ; rewind to the start of the buffer

    REDUCE0                             ; writes bits[0..1] through ZEROBITS

    mov         eax, EOB                ; return value

    pop         ebp                     ; aligned frame pointer
    pop         edi
    pop         esi
    pop         ecx
    pop         ebx
    mov         esp, ebp
    pop         esp                     ; back to the unaligned stack
    pop         ebp
    ret

; Drop every alias defined for this function so that none of them leaks
; into code that follows.
%undef ZERO
%undef ONE
%undef X0
%undef X1
%undef N0
%undef N1
%undef AL
%undef K
%undef KK
%undef EOB
%undef SIGN
%undef LUT
%undef T0
%undef T0w
%undef T1
%undef BLOCK
%undef VALUES
%undef LEN
%undef LENEND
%undef ZEROBITS
- align 32
|