123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424 |
; Project-wide SIMD helper definitions.  Nothing in this file defines the
; SEG_* section names, EXTN / GLOBAL_DATA / GLOBAL_FUNCTION, alignz,
; get_GOT / movpic / GOTOFF, POINTER / INT / XMMWORD, the SIZEOF_* sizes or
; DCTSIZE2, so they are expected to come from this include.
%include "jsimdext.inc"

; --------------------------------------------------------------------------
; Read-only data: exported under the name jconst_huff_encode_one_block.
; The included file supplies the jpeg_nbits_table label that the encoder
; below indexes one byte at a time.
    SECTION     SEG_CONST

    alignz      32
    GLOBAL_DATA(jconst_huff_encode_one_block)

EXTN(jconst_huff_encode_one_block):

%include "jpeg_nbits_table.inc"

    alignz      32

; --------------------------------------------------------------------------
; Everything from here on is 32-bit code.
    SECTION     SEG_TEXT
    BITS        32
; --------------------------------------------------------------------------
; EMIT_BYTE
;
; Moves the next 8 pending bits of the bit accumulator to the output stream.
;
;   In:       eax        = output pointer
;             put_buffer = bit accumulator, put_bits = number of pending bits
;   Out:      eax advanced by 1, or by 2 when the byte written was 0xFF
;             (a 0x00 byte is written directly after every 0xFF byte);
;             put_bits reduced by 8
;   Clobbers: ecx, edx, flags
%macro EMIT_BYTE 0
    sub         put_bits, 8
    mov         edx, put_buffer
    mov         ecx, put_bits
    shr         edx, cl                 ; dl = the 8 bits just above the new put_bits
    mov         byte [eax], dl
    add         eax, 1
    cmp         dl, 0xFF
    jne         %%.no_stuffing
    mov         byte [eax], 0           ; follow 0xFF with a zero byte
    add         eax, 1
%%.no_stuffing:
%endmacro
; --------------------------------------------------------------------------
; PUT_BITS code
;
; Appends a bit string to the low end of the accumulator.
;
;   In:       %1  = register holding the bits to append (must not be ecx,
;                   since cl supplies the shift count)
;             ecx = number of bits being appended
;   Out:      put_bits   += ecx
;             put_buffer  = (put_buffer << cl) | %1
;   Clobbers: flags
;
; %1 is OR-ed in unmasked, so the caller must already have cleared every bit
; of %1 above the low ecx bits.
%macro PUT_BITS 1
    add         put_bits, ecx
    shl         put_buffer, cl
    or          put_buffer, %1
%endmacro
; --------------------------------------------------------------------------
; CHECKBUF15
;
; Drains the accumulator when 16 or more bits are pending: reloads the output
; pointer from the "buffer" stack slot, writes two bytes with EMIT_BYTE, and
; stores the advanced pointer back.  Does nothing when put_bits < 16 (signed
; compare).
;
;   Clobbers: flags always; eax, ecx, edx when a flush happens
%macro CHECKBUF15 0
    cmp         put_bits, 16
    jl          %%.nothing_to_flush
    mov         eax, POINTER [esp+buffer]
    EMIT_BYTE
    EMIT_BYTE
    mov         POINTER [esp+buffer], eax
%%.nothing_to_flush:
%endmacro
; --------------------------------------------------------------------------
; EMIT_BITS code
;
; Appends ecx bits taken from %1 to the accumulator (PUT_BITS) and then
; flushes two bytes to the output if at least 16 bits are pending
; (CHECKBUF15).
;
;   In:       %1 = bits to append, ecx = bit count
;   Clobbers: flags; eax, ecx and edx must be treated as destroyed, because
;             the flush path overwrites all three
%macro EMIT_BITS 1
    PUT_BITS    %1
    CHECKBUF15
%endmacro
; --------------------------------------------------------------------------
; kloop_prepare  first, i0, i1, ..., i31, ra, rb, rc, rd
;
; Gathers 32 16-bit coefficients from the block at esi in the order given by
; the index list and writes two derived tables to the stack frame:
;
;   t1[first + n] = |x|                       (magnitude)
;   t2[first + n] = x      when x >= 0
;                   x - 1  when x <  0        (bitwise complement of |x|)
;
; where x = word [esi + i_n * SIZEOF_WORD].
;
;   %1        index of the first t1/t2 element written (used with 0 and 32)
;   %2..%33   the 32 source element indices i0..i31
;   %34..%37  four XMM work registers, 8 lanes each (must not be xmm4-xmm7)
;
; Special case: when %1 is 32, the last lane (lane 7 of %37) is taken from
; ecx instead of from memory and %33 is ignored, so the caller decides what
; goes into that final slot.
;
;   In:       esi = coefficient block; esp-relative t1/t2 must be aligned
;             for movdqa; ecx as described above
;   Clobbers: xmm4-xmm7, %34-%37
%macro kloop_prepare 37
    pxor        xmm4, xmm4              ; xmm4..xmm7 = 0, become the sign masks
    pxor        xmm5, xmm5
    pxor        xmm6, xmm6
    pxor        xmm7, xmm7

    ; Gather: one lane of each of the four registers per group of four.
    pinsrw      %34, word [esi + %2  * SIZEOF_WORD], 0
    pinsrw      %35, word [esi + %10 * SIZEOF_WORD], 0
    pinsrw      %36, word [esi + %18 * SIZEOF_WORD], 0
    pinsrw      %37, word [esi + %26 * SIZEOF_WORD], 0
    pinsrw      %34, word [esi + %3  * SIZEOF_WORD], 1
    pinsrw      %35, word [esi + %11 * SIZEOF_WORD], 1
    pinsrw      %36, word [esi + %19 * SIZEOF_WORD], 1
    pinsrw      %37, word [esi + %27 * SIZEOF_WORD], 1
    pinsrw      %34, word [esi + %4  * SIZEOF_WORD], 2
    pinsrw      %35, word [esi + %12 * SIZEOF_WORD], 2
    pinsrw      %36, word [esi + %20 * SIZEOF_WORD], 2
    pinsrw      %37, word [esi + %28 * SIZEOF_WORD], 2
    pinsrw      %34, word [esi + %5  * SIZEOF_WORD], 3
    pinsrw      %35, word [esi + %13 * SIZEOF_WORD], 3
    pinsrw      %36, word [esi + %21 * SIZEOF_WORD], 3
    pinsrw      %37, word [esi + %29 * SIZEOF_WORD], 3
    pinsrw      %34, word [esi + %6  * SIZEOF_WORD], 4
    pinsrw      %35, word [esi + %14 * SIZEOF_WORD], 4
    pinsrw      %36, word [esi + %22 * SIZEOF_WORD], 4
    pinsrw      %37, word [esi + %30 * SIZEOF_WORD], 4
    pinsrw      %34, word [esi + %7  * SIZEOF_WORD], 5
    pinsrw      %35, word [esi + %15 * SIZEOF_WORD], 5
    pinsrw      %36, word [esi + %23 * SIZEOF_WORD], 5
    pinsrw      %37, word [esi + %31 * SIZEOF_WORD], 5
    pinsrw      %34, word [esi + %8  * SIZEOF_WORD], 6
    pinsrw      %35, word [esi + %16 * SIZEOF_WORD], 6
    pinsrw      %36, word [esi + %24 * SIZEOF_WORD], 6
    pinsrw      %37, word [esi + %32 * SIZEOF_WORD], 6
    pinsrw      %34, word [esi + %9  * SIZEOF_WORD], 7
    pinsrw      %35, word [esi + %17 * SIZEOF_WORD], 7
    pinsrw      %36, word [esi + %25 * SIZEOF_WORD], 7
%if %1 != 32
    pinsrw      %37, word [esi + %33 * SIZEOF_WORD], 7
%else
    pinsrw      %37, ecx, 7             ; final slot comes from ecx, not memory
%endif

    ; mask = (0 > x) ? 0xFFFF : 0
    pcmpgtw     xmm4, %34
    pcmpgtw     xmm5, %35
    pcmpgtw     xmm6, %36
    pcmpgtw     xmm7, %37

    ; (x + mask) ^ mask = |x|
    paddw       %34, xmm4
    paddw       %35, xmm5
    paddw       %36, xmm6
    paddw       %37, xmm7
    pxor        %34, xmm4
    pxor        %35, xmm5
    pxor        %36, xmm6
    pxor        %37, xmm7

    ; mask ^ |x| = x for x >= 0, ~|x| for x < 0
    pxor        xmm4, %34
    pxor        xmm5, %35
    pxor        xmm6, %36
    pxor        xmm7, %37

    ; Store magnitudes to t1 ...
    movdqa      XMMWORD [esp + t1 + %1 * SIZEOF_WORD], %34
    movdqa      XMMWORD [esp + t1 + (%1 + 8) * SIZEOF_WORD], %35
    movdqa      XMMWORD [esp + t1 + (%1 + 16) * SIZEOF_WORD], %36
    movdqa      XMMWORD [esp + t1 + (%1 + 24) * SIZEOF_WORD], %37
    ; ... and the sign-adjusted values to t2.
    movdqa      XMMWORD [esp + t2 + %1 * SIZEOF_WORD], xmm4
    movdqa      XMMWORD [esp + t2 + (%1 + 8) * SIZEOF_WORD], xmm5
    movdqa      XMMWORD [esp + t2 + (%1 + 16) * SIZEOF_WORD], xmm6
    movdqa      XMMWORD [esp + t2 + (%1 + 24) * SIZEOF_WORD], xmm7
%endmacro
; --------------------------------------------------------------------------
; Stack-frame layout of jsimd_huff_encode_one_block_sse2, expressed as byte
; offsets from esp once its prologue has finished.
;
; "pad" skips the six dwords the prologue pushes after reserving the frame
; (ebx, ecx, esi, edi, ebp and the state pointer), so t1 sits exactly at the
; bottom of the reserved, XMMWORD-aligned area.
;
;   t1      DCTSIZE2 words  coefficient magnitudes   (kloop_prepare output)
;   t2      DCTSIZE2 words  sign-adjusted values     (kloop_prepare output)
;   block   dword           copy of the coefficient-block argument
;   actbl   dword           copy of the last table argument
;   buffer  dword           current output pointer
;   temp .. temp3           scratch dwords used across EMIT_BITS
;   temp4                   reserved slot, not referenced by the function body
;   temp5                   reserved slot, only used in the frame-size expression
;   gotptr  dword           saved result of get_GOT
;
; Every definition is wrapped in parentheses.  %define substitutes text, so
; an unparenthesised sum such as "t1 + (DCTSIZE2 * SIZEOF_WORD)" silently
; changes value when the name is used next to '*', unary '-' or any other
; operator that binds tighter than '+'.  All existing uses (additive offsets
; inside [esp + ...] and the frame-size expression "temp5+9*SIZEOF_DWORD-pad")
; evaluate to exactly the same numbers as before.
%define pad         (6 * SIZEOF_DWORD)
%define t1          (pad)
%define t2          (t1 + (DCTSIZE2 * SIZEOF_WORD))
%define block       (t2 + (DCTSIZE2 * SIZEOF_WORD))
%define actbl       (block + SIZEOF_DWORD)
%define buffer      (actbl + SIZEOF_DWORD)
%define temp        (buffer + SIZEOF_DWORD)
%define temp2       (temp + SIZEOF_DWORD)
%define temp3       (temp2 + SIZEOF_DWORD)
%define temp4       (temp3 + SIZEOF_DWORD)
%define temp5       (temp4 + SIZEOF_DWORD)
%define gotptr      (temp5 + SIZEOF_DWORD)

; Register assignment for the bit accumulator used by the EMIT_* / PUT_BITS
; macros.
%define put_buffer  ebx
%define put_bits    edi
; --------------------------------------------------------------------------
; jsimd_huff_encode_one_block_sse2
;
; Huffman-encodes one block of DCTSIZE2 16-bit coefficients into a byte
; stream.  SSE2 is used to precompute, for every coefficient after the
; first, its magnitude (t1) and its sign-adjusted value (t2); a 32-bit
; "non-zero" bitmask per half then drives a bit-scan loop that emits one
; (run, size) symbol plus value bits per non-zero coefficient.
;
; ABI: 32-bit, all arguments on the stack.  After "push ebp" the stack
; pointer is copied to eax, so the arguments are read at [eax+8] onwards:
;
;   [eax+ 8]  pointer to a state structure; the dword at +8 is the bit
;             accumulator and the dword at +12 the pending-bit count
;   [eax+12]  output byte pointer
;   [eax+16]  pointer to the coefficient block (16-bit words)
;   [eax+20]  dword subtracted from block[0] before it is coded
;             (presumably the previous DC value -- confirm against the
;             C prototype)
;   [eax+24]  code table used for the first coefficient's symbol
;   [eax+28]  code table used for all remaining symbols
;
; Both tables are read the same way: dword code at [tbl + sym*4] and byte
; length at [tbl + 1024 + sym].
;
; Returns:   eax = output pointer after the last byte written.  The updated
;            accumulator and bit count are stored back into the state
;            structure at +8 / +12.
; Preserves: ebx, ecx, esi, edi, ebp, esp
; Clobbers:  edx, xmm0-xmm7, flags
    align       32
    GLOBAL_FUNCTION(jsimd_huff_encode_one_block_sse2)

EXTN(jsimd_huff_encode_one_block_sse2):
    ; ---- Prologue: build an XMMWORD-aligned frame ------------------------
    push        ebp
    mov         eax, esp                ; eax = argument base for the prologue
    sub         esp, byte 4
    and         esp, byte (-SIZEOF_XMMWORD)  ; round down so movdqa on t1/t2 works
    mov         [esp], eax              ; keep the unaligned esp for the epilogue
    mov         ebp, esp                ; ebp = aligned frame top
    sub         esp, temp5+9*SIZEOF_DWORD-pad
    push        ebx
    push        ecx
    push        esi
    push        edi
    push        ebp

    mov         esi, POINTER [eax+8]    ; esi = state structure
    mov         put_buffer, dword [esi+8]
    mov         put_bits, dword [esi+12]
    push        esi                     ; sixth push; popped at .finish to write back

    get_GOT     edx
    movpic      POINTER [esp+gotptr], edx  ; saved for the GOTOFF table lookups

    ; Copy the arguments needed later into the frame; eax stops being the
    ; argument base as soon as the first EMIT_BITS runs.
    mov         ecx, POINTER [eax+28]
    mov         edx, POINTER [eax+16]
    mov         esi, POINTER [eax+12]
    mov         POINTER [esp+actbl], ecx
    mov         POINTER [esp+block], edx
    mov         POINTER [esp+buffer], esi

    ; ---- First coefficient -----------------------------------------------
    mov         esi, POINTER [esp+block]
    movsx       ecx, word [esi]
    sub         ecx, dword [eax+20]     ; ecx = diff = block[0] - [eax+20]
    mov         esi, ecx                ; esi = diff (becomes the value bits)

    ; Branch-free absolute value: edx = sign mask (0 or -1).
    mov         edx, ecx
    sar         edx, 31
    xor         ecx, edx
    sub         ecx, edx                ; ecx = |diff|

    ; Value bits: diff when non-negative, diff - 1 when negative.
    add         esi, edx
    mov         dword [esp+temp], esi

    movpic      ebp, POINTER [esp+gotptr]
    movzx       edx, byte [GOTOFF(ebp, jpeg_nbits_table + ecx)]  ; edx = nbits of |diff|
    mov         dword [esp+temp2], edx

    mov         ebp, POINTER [eax+24]   ; last read through eax as argument base
    mov         eax, INT [ebp + edx * 4]        ; code for symbol nbits
    movzx       ecx, byte [ebp + edx + 1024]    ; its length in bits
    EMIT_BITS   eax
    mov         ecx, dword [esp+temp2]  ; ecx = nbits again (EMIT_BITS destroys it)

    ; Keep only the low nbits of the value bits, then emit them.
    mov         eax, 1
    shl         eax, cl
    dec         eax
    and         eax, dword [esp+temp]

    EMIT_BITS   eax

    ; ---- Precompute t1 / t2 for the remaining 63 coefficients ------------
    ; The index lists are the JPEG zigzag order with the first entry (0)
    ; left out.  ecx = 0 fills the unused 64th slot written by the second
    ; invocation.
    xor         ecx, ecx
    mov         esi, POINTER [esp+block]
    kloop_prepare  0,  1,  8, 16,  9,  2,  3, 10, 17, 24, 32, 25, \
                   18, 11,  4,  5, 12, 19, 26, 33, 40, 48, 41, 34, \
                   27, 20, 13,  6,  7, 14, 21, 28, 35, \
                   xmm0, xmm1, xmm2, xmm3
    kloop_prepare  32, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, \
                   30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, \
                   53, 60, 61, 54, 47, 55, 62, 63, 63, \
                   xmm0, xmm1, xmm2, xmm3

    ; ---- Non-zero bitmask for t1[0..31] ----------------------------------
    pxor        xmm7, xmm7              ; xmm7 = 0, reused for the second half
    movdqa      xmm0, XMMWORD [esp + t1 + 0 * SIZEOF_WORD]
    movdqa      xmm1, XMMWORD [esp + t1 + 8 * SIZEOF_WORD]
    movdqa      xmm2, XMMWORD [esp + t1 + 16 * SIZEOF_WORD]
    movdqa      xmm3, XMMWORD [esp + t1 + 24 * SIZEOF_WORD]
    pcmpeqw     xmm0, xmm7
    pcmpeqw     xmm1, xmm7
    pcmpeqw     xmm2, xmm7
    pcmpeqw     xmm3, xmm7
    packsswb    xmm0, xmm1
    packsswb    xmm2, xmm3
    pmovmskb    edx, xmm0
    pmovmskb    ecx, xmm2
    shl         ecx, 16
    or          edx, ecx
    not         edx                     ; bit n set <=> t1[n] != 0

    lea         esi, [esp+t1]           ; esi = address of the current t1 element
    mov         ebp, POINTER [esp+actbl]

    ; ---- First half: t1[0..31] -------------------------------------------
.lo_scan:
    bsf         ecx, edx                ; ecx = zeros before the next non-zero
    jz          near .lo_done           ; mask empty: rest of this half is zero
    lea         esi, [esi+ecx*2]
    shr         edx, cl
    mov         dword [esp+temp3], edx  ; mask survives EMIT_BITS in temp3
.lo_run16:
    cmp         ecx, 16
    jl          near .lo_symbol
    sub         ecx, 16                 ; each 16 zeros costs one symbol 240 (0xF0)
    mov         dword [esp+temp], ecx
    mov         eax, INT [ebp + 240 * 4]
    movzx       ecx, byte [ebp + 1024 + 240]
    EMIT_BITS   eax
    mov         ecx, dword [esp+temp]
    jmp         .lo_run16
.lo_symbol:
    movsx       eax, word [esi]         ; magnitude t1[k]
    movpic      edx, POINTER [esp+gotptr]
    movzx       eax, byte [GOTOFF(edx, jpeg_nbits_table + eax)]  ; eax = nbits
    mov         dword [esp+temp2], eax

    shl         ecx, 4                  ; symbol = run * 16 + nbits
    add         ecx, eax
    mov         eax, INT [ebp + ecx * 4]
    movzx       ecx, byte [ebp + ecx + 1024]
    EMIT_BITS   eax
    movsx       edx, word [esi+DCTSIZE2*2]  ; matching t2 entry (t2 follows t1)

    mov         ecx, dword [esp+temp2]
    mov         eax, 1
    shl         eax, cl
    dec         eax
    and         eax, edx                ; low nbits of the value bits
    EMIT_BITS   eax
    mov         edx, dword [esp+temp3]
    add         esi, 2                  ; step past the coefficient just coded
    shr         edx, 1
    jmp         .lo_scan

    ; ---- Non-zero bitmask for t1[32..63] ---------------------------------
.lo_done:
    movdqa      xmm0, XMMWORD [esp + t1 + 32 * SIZEOF_WORD]
    movdqa      xmm1, XMMWORD [esp + t1 + 40 * SIZEOF_WORD]
    movdqa      xmm2, XMMWORD [esp + t1 + 48 * SIZEOF_WORD]
    movdqa      xmm3, XMMWORD [esp + t1 + 56 * SIZEOF_WORD]
    pcmpeqw     xmm0, xmm7
    pcmpeqw     xmm1, xmm7
    pcmpeqw     xmm2, xmm7
    pcmpeqw     xmm3, xmm7
    packsswb    xmm0, xmm1
    packsswb    xmm2, xmm3
    pmovmskb    edx, xmm0
    pmovmskb    ecx, xmm2
    shl         ecx, 16
    or          edx, ecx
    not         edx

    ; eax = number of zero elements left between esi and the end of the
    ; first half; they are added to the first run of the second half.
    lea         eax, [esp + t1 + (DCTSIZE2/2) * 2]
    sub         eax, esi
    shr         eax, 1
    bsf         ecx, edx
    jz          near .hi_done
    shr         edx, cl
    add         ecx, eax
    lea         esi, [esi+ecx*2]
    mov         dword [esp+temp3], edx
    jmp         .hi_run16

    ; ---- Second half: t1[32..63] -----------------------------------------
.hi_scan:
    bsf         ecx, edx
    jz          near .hi_done
    lea         esi, [esi+ecx*2]
    shr         edx, cl
    mov         dword [esp+temp3], edx
.hi_run16:
    cmp         ecx, 16
    jl          near .hi_symbol
    sub         ecx, 16
    mov         dword [esp+temp], ecx
    mov         eax, INT [ebp + 240 * 4]
    movzx       ecx, byte [ebp + 1024 + 240]
    EMIT_BITS   eax
    mov         ecx, dword [esp+temp]
    jmp         .hi_run16
.hi_symbol:
    movsx       eax, word [esi]
    bsr         eax, eax                ; this half derives nbits as
    inc         eax                     ; (index of highest set bit) + 1
    mov         dword [esp+temp2], eax

    shl         ecx, 4
    add         ecx, eax
    mov         eax, INT [ebp + ecx * 4]
    movzx       ecx, byte [ebp + ecx + 1024]
    EMIT_BITS   eax
    movsx       edx, word [esi+DCTSIZE2*2]

    mov         ecx, dword [esp+temp2]
    mov         eax, 1
    shl         eax, cl
    dec         eax
    and         eax, edx
    EMIT_BITS   eax
    mov         edx, dword [esp+temp3]
    add         esi, 2
    shr         edx, 1
    jmp         .hi_scan

    ; ---- Trailing zeros --------------------------------------------------
.hi_done:
    ; esi == &t1[DCTSIZE2-1] only when element DCTSIZE2-2 (the last real
    ; coefficient) was non-zero and has been coded.  Otherwise zeros remain
    ; and symbol 0 is emitted.
    lea         edx, [esp + t1 + (DCTSIZE2-1) * 2]
    cmp         edx, esi
    je          .finish
    mov         eax, INT [ebp]
    movzx       ecx, byte [ebp + 1024]
    EMIT_BITS   eax

    ; ---- Epilogue --------------------------------------------------------
.finish:
    mov         eax, [esp+buffer]       ; return value: updated output pointer
    pop         esi                     ; state pointer pushed in the prologue

    mov         dword [esi+8], put_buffer
    mov         dword [esi+12], put_bits
    pop         ebp                     ; ebp = aligned frame top again
    pop         edi
    pop         esi
    pop         ecx
    pop         ebx
    mov         esp, ebp
    pop         esp                     ; back to the unaligned esp saved at entry
    pop         ebp
    ret

    align       32
|