;
; jcphuff-sse2.asm - prepare data for progressive Huffman encoding
; (64-bit SSE2)
;
; Copyright (C) 2016, 2018, Matthieu Darbois
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) and can *not*
; be assembled with Microsoft's MASM or any compatible assembler (including
; Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains an SSE2 implementation of data preparation for
; progressive Huffman encoding.  See jcphuff.c for more details.

%include "jsimdext.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        64

; --------------------------------------------------------------------------
; Macros to load data for jsimd_encode_mcu_AC_first_prepare_sse2() and
; jsimd_encode_mcu_AC_refine_prepare_sse2()
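;
; SSE2 lacks a gather instruction, so each macro fetches coefficients one
; at a time: a scalar load pulls the natural-order index out of the LUT,
; then pinsrw inserts the addressed coefficient word into the next lane.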

%macro LOAD16 0
    pxor        N0, N0
    pxor        N1, N1

    mov         T0d, INT [LUT + 0*SIZEOF_INT]
    mov         T1d, INT [LUT + 8*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 0
    pinsrw      X1, word [BLOCK + T1 * 2], 0

    mov         T0d, INT [LUT + 1*SIZEOF_INT]
    mov         T1d, INT [LUT + 9*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 1
    pinsrw      X1, word [BLOCK + T1 * 2], 1

    mov         T0d, INT [LUT + 2*SIZEOF_INT]
    mov         T1d, INT [LUT + 10*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 2
    pinsrw      X1, word [BLOCK + T1 * 2], 2

    mov         T0d, INT [LUT + 3*SIZEOF_INT]
    mov         T1d, INT [LUT + 11*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 3
    pinsrw      X1, word [BLOCK + T1 * 2], 3

    mov         T0d, INT [LUT + 4*SIZEOF_INT]
    mov         T1d, INT [LUT + 12*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 4
    pinsrw      X1, word [BLOCK + T1 * 2], 4

    mov         T0d, INT [LUT + 5*SIZEOF_INT]
    mov         T1d, INT [LUT + 13*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 5
    pinsrw      X1, word [BLOCK + T1 * 2], 5

    mov         T0d, INT [LUT + 6*SIZEOF_INT]
    mov         T1d, INT [LUT + 14*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 6
    pinsrw      X1, word [BLOCK + T1 * 2], 6

    mov         T0d, INT [LUT + 7*SIZEOF_INT]
    mov         T1d, INT [LUT + 15*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 7
    pinsrw      X1, word [BLOCK + T1 * 2], 7
%endmacro
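
; Loads 9 to 15 coefficients: X0 is filled completely, X1 only in its first
; LENEND lanes (LENEND = Sl & 7); the remaining lanes stay zero.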
%macro LOAD15 0
    pxor        N0, N0
    pxor        N1, N1
    pxor        X1, X1

    mov         T0d, INT [LUT + 0*SIZEOF_INT]
    mov         T1d, INT [LUT + 8*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 0
    pinsrw      X1, word [BLOCK + T1 * 2], 0

    mov         T0d, INT [LUT + 1*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 1
    mov         T0d, INT [LUT + 2*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 2
    mov         T0d, INT [LUT + 3*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 3
    mov         T0d, INT [LUT + 4*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 4
    mov         T0d, INT [LUT + 5*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 5
    mov         T0d, INT [LUT + 6*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 6
    mov         T0d, INT [LUT + 7*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 7

    cmp         LENEND, 2
    jl          %%.ELOAD15
    mov         T1d, INT [LUT + 9*SIZEOF_INT]
    pinsrw      X1, word [BLOCK + T1 * 2], 1

    cmp         LENEND, 3
    jl          %%.ELOAD15
    mov         T1d, INT [LUT + 10*SIZEOF_INT]
    pinsrw      X1, word [BLOCK + T1 * 2], 2

    cmp         LENEND, 4
    jl          %%.ELOAD15
    mov         T1d, INT [LUT + 11*SIZEOF_INT]
    pinsrw      X1, word [BLOCK + T1 * 2], 3

    cmp         LENEND, 5
    jl          %%.ELOAD15
    mov         T1d, INT [LUT + 12*SIZEOF_INT]
    pinsrw      X1, word [BLOCK + T1 * 2], 4

    cmp         LENEND, 6
    jl          %%.ELOAD15
    mov         T1d, INT [LUT + 13*SIZEOF_INT]
    pinsrw      X1, word [BLOCK + T1 * 2], 5

    cmp         LENEND, 7
    jl          %%.ELOAD15
    mov         T1d, INT [LUT + 14*SIZEOF_INT]
    pinsrw      X1, word [BLOCK + T1 * 2], 6
%%.ELOAD15:
%endmacro
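
; Loads exactly 8 coefficients into X0.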
%macro LOAD8 0
    pxor        N0, N0

    mov         T0d, INT [LUT + 0*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 0
    mov         T0d, INT [LUT + 1*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 1
    mov         T0d, INT [LUT + 2*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 2
    mov         T0d, INT [LUT + 3*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 3
    mov         T0d, INT [LUT + 4*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 4
    mov         T0d, INT [LUT + 5*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 5
    mov         T0d, INT [LUT + 6*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 6
    mov         T0d, INT [LUT + 7*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 7
%endmacro
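
; Loads 1 to 7 coefficients (LENEND = Sl & 7) into X0; unused lanes stay
; zero.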
%macro LOAD7 0
    pxor        N0, N0
    pxor        X0, X0

    mov         T1d, INT [LUT + 0*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 0

    cmp         LENEND, 2
    jl          %%.ELOAD7
    mov         T1d, INT [LUT + 1*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 1

    cmp         LENEND, 3
    jl          %%.ELOAD7
    mov         T1d, INT [LUT + 2*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 2

    cmp         LENEND, 4
    jl          %%.ELOAD7
    mov         T1d, INT [LUT + 3*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 3

    cmp         LENEND, 5
    jl          %%.ELOAD7
    mov         T1d, INT [LUT + 4*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 4

    cmp         LENEND, 6
    jl          %%.ELOAD7
    mov         T1d, INT [LUT + 5*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 5

    cmp         LENEND, 7
    jl          %%.ELOAD7
    mov         T1d, INT [LUT + 6*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 6
%%.ELOAD7:
%endmacro
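
; REDUCE0 builds the 64-bit zerobits bitmap for all DCTSIZE2 prepared
; coefficients: pcmpeqw marks the zero-valued words, packsswb narrows the
; word masks to bytes, and pmovmskb extracts 16 mask bits per vector; the
; four 16-bit masks are merged into one qword, which is inverted so that
; bit k is set iff values[k] != 0, and stored through r15.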
%macro REDUCE0 0
    movdqa      xmm0, XMMWORD [VALUES + ( 0*2)]
    movdqa      xmm1, XMMWORD [VALUES + ( 8*2)]
    movdqa      xmm2, XMMWORD [VALUES + (16*2)]
    movdqa      xmm3, XMMWORD [VALUES + (24*2)]
    movdqa      xmm4, XMMWORD [VALUES + (32*2)]
    movdqa      xmm5, XMMWORD [VALUES + (40*2)]
    movdqa      xmm6, XMMWORD [VALUES + (48*2)]
    movdqa      xmm7, XMMWORD [VALUES + (56*2)]

    pcmpeqw     xmm0, ZERO
    pcmpeqw     xmm1, ZERO
    pcmpeqw     xmm2, ZERO
    pcmpeqw     xmm3, ZERO
    pcmpeqw     xmm4, ZERO
    pcmpeqw     xmm5, ZERO
    pcmpeqw     xmm6, ZERO
    pcmpeqw     xmm7, ZERO

    packsswb    xmm0, xmm1
    packsswb    xmm2, xmm3
    packsswb    xmm4, xmm5
    packsswb    xmm6, xmm7

    pmovmskb    eax, xmm0
    pmovmskb    ecx, xmm2
    pmovmskb    edx, xmm4
    pmovmskb    esi, xmm6

    shl         rcx, 16
    shl         rdx, 32
    shl         rsi, 48

    or          rax, rcx
    or          rdx, rsi
    or          rax, rdx

    not         rax
    mov         MMWORD [r15], rax
%endmacro

; --------------------------------------------------------------------------
;
; Prepare data for jsimd_encode_mcu_AC_first().
;
; GLOBAL(void)
; jsimd_encode_mcu_AC_first_prepare_sse2(const JCOEF *block,
;                                        const int *jpeg_natural_order_start,
;                                        int Sl, int Al, JCOEF *values,
;                                        size_t *zerobits)
;
; r10 = const JCOEF *block
; r11 = const int *jpeg_natural_order_start
; r12 = int Sl
; r13 = int Al
; r14 = JCOEF *values
; r15 = size_t *zerobits
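;
; A rough scalar sketch of the per-coefficient logic implemented below
; (illustrative only, not the exact code in jcphuff.c):
;
;   temp = block[jpeg_natural_order_start[k]];
;   sign = (temp < 0) ? -1 : 0;
;   temp = (temp + sign) ^ sign;        /* |temp|, branch-free */
;   temp >>= Al;                        /* point transform */
;   values[k] = temp;
;   values[k + DCTSIZE2] = temp ^ sign; /* complemented if negative */
;
; zerobits receives a 64-bit bitmap with bit k set iff values[k] != 0.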

%define ZERO    xmm9
%define X0      xmm0
%define X1      xmm1
%define N0      xmm2
%define N1      xmm3
%define AL      xmm4
%define K       eax
%define LUT     r11
%define T0      rcx
%define T0d     ecx
%define T1      rdx
%define T1d     edx
%define BLOCK   r10
%define VALUES  r14
%define LEN     r12d
%define LENEND  r13d

    align       32
    GLOBAL_FUNCTION(jsimd_encode_mcu_AC_first_prepare_sse2)

EXTN(jsimd_encode_mcu_AC_first_prepare_sse2):
    push        rbp
    mov         rax, rsp                     ; rax = original rbp
    sub         rsp, byte 4
    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    mov         [rsp], rax
    mov         rbp, rsp                     ; rbp = aligned rbp
    lea         rsp, [rbp - 16]
    collect_args 6

    movdqa      XMMWORD [rbp - 16], ZERO

    movd        AL, r13d
    pxor        ZERO, ZERO
    mov         K, LEN
    mov         LENEND, LEN
    and         K, -16
    and         LENEND, 7
    shr         K, 4
    jz          .ELOOP16
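
    ; Main loop: 16 coefficients per iteration.  N becomes the sign mask
    ; (-1 for negative words), X becomes |coef| >> Al, and N ^ X, the
    ; complemented form used for negative coefficients, is stored at
    ; values[k + DCTSIZE2].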
.BLOOP16:
    LOAD16
    pcmpgtw     N0, X0
    pcmpgtw     N1, X1
    paddw       X0, N0
    paddw       X1, N1
    pxor        X0, N0
    pxor        X1, N1
    psrlw       X0, AL
    psrlw       X1, AL
    pxor        N0, X0
    pxor        N1, X1
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (8) * 2], X1
    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    movdqa      XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
    add         VALUES, 16*2
    add         LUT, 16*SIZEOF_INT
    dec         K
    jnz         .BLOOP16
    test        LEN, 15
    je          .PADDING
.ELOOP16:
    test        LEN, 8
    jz          .TRY7
    test        LEN, 7
    jz          .TRY8

    LOAD15
    pcmpgtw     N0, X0
    pcmpgtw     N1, X1
    paddw       X0, N0
    paddw       X1, N1
    pxor        X0, N0
    pxor        X1, N1
    psrlw       X0, AL
    psrlw       X1, AL
    pxor        N0, X0
    pxor        N1, X1
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (8) * 2], X1
    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    movdqa      XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
    add         VALUES, 16*2
    jmp         .PADDING
.TRY8:
    LOAD8
    pcmpgtw     N0, X0
    paddw       X0, N0
    pxor        X0, N0
    psrlw       X0, AL
    pxor        N0, X0
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    add         VALUES, 8*2
    jmp         .PADDING
.TRY7:
    LOAD7
    pcmpgtw     N0, X0
    paddw       X0, N0
    pxor        X0, N0
    psrlw       X0, AL
    pxor        N0, X0
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    add         VALUES, 8*2
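
    ; Zero-fill the remainder of values[] so that REDUCE0 always scans a
    ; full DCTSIZE2-coefficient block.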
.PADDING:
    mov         K, LEN
    add         K, 7
    and         K, -8
    shr         K, 3
    sub         K, DCTSIZE2/8
    jz          .EPADDING
    align       16
.ZEROLOOP:
    movdqa      XMMWORD [VALUES + 0], ZERO
    add         VALUES, 8*2
    inc         K
    jnz         .ZEROLOOP
.EPADDING:
    sub         VALUES, DCTSIZE2*2

    REDUCE0

    movdqa      ZERO, XMMWORD [rbp - 16]
    uncollect_args 6
    mov         rsp, rbp                ; rsp <- aligned rbp
    pop         rsp                     ; rsp <- original rbp
    pop         rbp
    ret

%undef ZERO
%undef X0
%undef X1
%undef N0
%undef N1
%undef AL
%undef K
%undef LUT
%undef T0
%undef T0d
%undef T1
%undef T1d
%undef BLOCK
%undef VALUES
%undef LEN
%undef LENEND

; --------------------------------------------------------------------------
;
; Prepare data for jsimd_encode_mcu_AC_refine().
;
; GLOBAL(int)
; jsimd_encode_mcu_AC_refine_prepare_sse2(const JCOEF *block,
;                                         const int *jpeg_natural_order_start,
;                                         int Sl, int Al, JCOEF *absvalues,
;                                         size_t *bits)
;
; r10 = const JCOEF *block
; r11 = const int *jpeg_natural_order_start
; r12 = int Sl
; r13 = int Al
; r14 = JCOEF *absvalues
; r15 = size_t *bits
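;
; A rough scalar sketch of the per-coefficient logic implemented below
; (illustrative only, not the exact code in jcphuff.c):
;
;   temp = block[jpeg_natural_order_start[k]];
;   if (temp < 0) temp = -temp;         /* |temp| */
;   temp >>= Al;                        /* point transform */
;   absvalues[k] = temp;
;   if (temp == 1)
;     EOB = k;                          /* index of last newly-nonzero coef */
;
; bits[0] receives the zerobits bitmap (bit k set iff absvalues[k] != 0) and
; bits[1] the sign bitmap (bit k clear iff the coefficient was negative);
; the function returns EOB.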

%define ZERO    xmm9
%define ONE     xmm5
%define X0      xmm0
%define X1      xmm1
%define N0      xmm2
%define N1      xmm3
%define AL      xmm4
%define K       eax
%define KK      r9d
%define EOB     r8d
%define SIGN    rdi
%define LUT     r11
%define T0      rcx
%define T0d     ecx
%define T1      rdx
%define T1d     edx
%define BLOCK   r10
%define VALUES  r14
%define LEN     r12d
%define LENEND  r13d

    align       32
    GLOBAL_FUNCTION(jsimd_encode_mcu_AC_refine_prepare_sse2)

EXTN(jsimd_encode_mcu_AC_refine_prepare_sse2):
    push        rbp
    mov         rax, rsp                     ; rax = original rbp
    sub         rsp, byte 4
    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    mov         [rsp], rax
    mov         rbp, rsp                     ; rbp = aligned rbp
    lea         rsp, [rbp - 16]
    collect_args 6

    movdqa      XMMWORD [rbp - 16], ZERO

    xor         SIGN, SIGN
    xor         EOB, EOB
    xor         KK, KK
    movd        AL, r13d
    pxor        ZERO, ZERO
    pcmpeqw     ONE, ONE
    psrlw       ONE, 15
    mov         K, LEN
    mov         LENEND, LEN
    and         K, -16
    and         LENEND, 7
    shr         K, 4
    jz          .ELOOPR16
.BLOOPR16:
    LOAD16
    pcmpgtw     N0, X0
    pcmpgtw     N1, X1
    paddw       X0, N0
    paddw       X1, N1
    pxor        X0, N0
    pxor        X1, N1
    psrlw       X0, AL
    psrlw       X1, AL
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (8) * 2], X1
    pcmpeqw     X0, ONE
    pcmpeqw     X1, ONE
    packsswb    N0, N1
    packsswb    X0, X1
    pmovmskb    T0d, N0                 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
    pmovmskb    T1d, X0                 ; idx = _mm_movemask_epi8(x1);
    shr         SIGN, 16                ; make room for sizebits
    shl         T0, 48
    or          SIGN, T0
    bsr         T1d, T1d                ; idx = 16 - (__builtin_clz(idx) >> 1);
    jz          .CONTINUER16            ; if (idx) {
    mov         EOB, KK
    add         EOB, T1d                ;   EOB = k + idx;
.CONTINUER16:
    add         VALUES, 16*2
    add         LUT, 16*SIZEOF_INT
    add         KK, 16
    dec         K
    jnz         .BLOOPR16
    test        LEN, 15
    je          .PADDINGR
.ELOOPR16:
    test        LEN, 8
    jz          .TRYR7
    test        LEN, 7
    jz          .TRYR8

    LOAD15
    pcmpgtw     N0, X0
    pcmpgtw     N1, X1
    paddw       X0, N0
    paddw       X1, N1
    pxor        X0, N0
    pxor        X1, N1
    psrlw       X0, AL
    psrlw       X1, AL
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (8) * 2], X1
    pcmpeqw     X0, ONE
    pcmpeqw     X1, ONE
    packsswb    N0, N1
    packsswb    X0, X1
    pmovmskb    T0d, N0                 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
    pmovmskb    T1d, X0                 ; idx = _mm_movemask_epi8(x1);
    shr         SIGN, 16                ; make room for sizebits
    shl         T0, 48
    or          SIGN, T0
    bsr         T1d, T1d                ; idx = 16 - (__builtin_clz(idx) >> 1);
    jz          .CONTINUER15            ; if (idx) {
    mov         EOB, KK
    add         EOB, T1d                ;   EOB = k + idx;
.CONTINUER15:
    add         VALUES, 16*2
    jmp         .PADDINGR
.TRYR8:
    LOAD8
    pcmpgtw     N0, X0
    paddw       X0, N0
    pxor        X0, N0
    psrlw       X0, AL
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    pcmpeqw     X0, ONE
    packsswb    N0, ZERO
    packsswb    X0, ZERO
    pmovmskb    T0d, N0                 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
    pmovmskb    T1d, X0                 ; idx = _mm_movemask_epi8(x1);
    shr         SIGN, 8                 ; make room for sizebits
    shl         T0, 56
    or          SIGN, T0
    bsr         T1d, T1d                ; idx = 16 - (__builtin_clz(idx) >> 1);
    jz          .CONTINUER8             ; if (idx) {
    mov         EOB, KK
    add         EOB, T1d                ;   EOB = k + idx;
.CONTINUER8:
    add         VALUES, 8*2
    jmp         .PADDINGR
.TRYR7:
    LOAD7
    pcmpgtw     N0, X0
    paddw       X0, N0
    pxor        X0, N0
    psrlw       X0, AL
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    pcmpeqw     X0, ONE
    packsswb    N0, ZERO
    packsswb    X0, ZERO
    pmovmskb    T0d, N0                 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
    pmovmskb    T1d, X0                 ; idx = _mm_movemask_epi8(x1);
    shr         SIGN, 8                 ; make room for sizebits
    shl         T0, 56
    or          SIGN, T0
    bsr         T1d, T1d                ; idx = 16 - (__builtin_clz(idx) >> 1);
    jz          .CONTINUER7             ; if (idx) {
    mov         EOB, KK
    add         EOB, T1d                ;   EOB = k + idx;
.CONTINUER7:
    add         VALUES, 8*2
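
    ; Zero-fill the remainder of absvalues[] to a full DCTSIZE2 block; each
    ; group of 8 padding coefficients also shifts 8 zero bits into SIGN.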
.PADDINGR:
    mov         K, LEN
    add         K, 7
    and         K, -8
    shr         K, 3
    sub         K, DCTSIZE2/8
    jz          .EPADDINGR
    align       16
.ZEROLOOPR:
    movdqa      XMMWORD [VALUES + 0], ZERO
    shr         SIGN, 8
    add         VALUES, 8*2
    inc         K
    jnz         .ZEROLOOPR
.EPADDINGR:
    not         SIGN
    sub         VALUES, DCTSIZE2*2
    mov         MMWORD [r15+SIZEOF_MMWORD], SIGN

    REDUCE0

    mov         eax, EOB

    movdqa      ZERO, XMMWORD [rbp - 16]
    uncollect_args 6
    mov         rsp, rbp                ; rsp <- aligned rbp
    pop         rsp                     ; rsp <- original rbp
    pop         rbp
    ret

%undef ZERO
%undef ONE
%undef X0
%undef X1
%undef N0
%undef N1
%undef AL
%undef K
%undef KK
%undef EOB
%undef SIGN
%undef LUT
%undef T0
%undef T0d
%undef T1
%undef T1d
%undef BLOCK
%undef VALUES
%undef LEN
%undef LENEND

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32