/*
 * Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"
#include "libyuv/scale_row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for 32-bit x86 Visual C and clang-cl.
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)

// Offsets for source bytes 0 to 9.
static const uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9,
                             128, 128, 128, 128, 128, 128, 128, 128};

// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
static const uvec8 kShuf1 = {3, 4, 5, 7, 8, 9, 11, 12,
                             128, 128, 128, 128, 128, 128, 128, 128};

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static const uvec8 kShuf2 = {5, 7, 8, 9, 11, 12, 13, 15,
                             128, 128, 128, 128, 128, 128, 128, 128};

// Offsets for source bytes 0 to 10.
static const uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10};

// Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
static const uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7,
                              8, 9, 9, 10, 10, 11, 12, 13};

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static const uvec8 kShuf21 = {5, 6, 6, 7, 8, 9, 9, 10,
                              10, 11, 12, 13, 13, 14, 14, 15};

// Coefficients for source bytes 0 to 10.
static const uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2};

// Coefficients for source bytes 10 to 21.
static const uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1};

// Coefficients for source bytes 21 to 31.
static const uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3};

// Rounding constant for the 3/4 box filters: makes (sum + 2) >> 2.
static const vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2};

static const uvec8 kShuf38a = {0, 3, 6, 8, 11, 14, 128, 128,
                               128, 128, 128, 128, 128, 128, 128, 128};
static const uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0, 3,
                               6, 8, 11, 14, 128, 128, 128, 128};

// Arrange words 0,3,6 into 0,1,2.
static const uvec8 kShufAc = {0, 1, 6, 7, 12, 13, 128, 128,
                              128, 128, 128, 128, 128, 128, 128, 128};

// Arrange words 0,3,6 into 3,4,5.
static const uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0, 1,
                               6, 7, 12, 13, 128, 128, 128, 128};

// Scaling values for boxes of 3x3 and 2x3.
static const uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9,
                                  65536 / 9, 65536 / 6, 0, 0};

// Arrange first value for pixels 0,1,2,3,4,5.
static const uvec8 kShufAb0 = {0, 128, 3, 128, 6, 128, 8, 128,
                               11, 128, 14, 128, 128, 128, 128, 128};

// Arrange second value for pixels 0,1,2,3,4,5.
static const uvec8 kShufAb1 = {1, 128, 4, 128, 7, 128, 9, 128,
                               12, 128, 15, 128, 128, 128, 128, 128};

// Arrange third value for pixels 0,1,2,3,4,5.
static const uvec8 kShufAb2 = {2, 128, 5, 128, 128, 128, 10, 128,
                               13, 128, 128, 128, 128, 128, 128, 128};

// Scaling values for boxes of 3x2 and 2x2.
static const uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3,
                                 65536 / 3, 65536 / 2, 0, 0};
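
// The kScaleAc33/kScaleAb2 tables replace division by the box size with a
// 16-bit reciprocal multiply: pmulhuw computes (sum * (65536 / n)) >> 16,
// which approximates sum / n for the 16-bit sums used here. A minimal scalar
// sketch of the same trick (editor's illustration, not part of libyuv):
static inline uint8_t ScaleSumByRecip_Sketch(uint16_t sum, uint16_t recip) {
  // e.g. recip = 65536 / 9 averages a 3x3 box whose sum is at most 9 * 255.
  return (uint8_t)(((uint32_t)sum * recip) >> 16);
}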

// Reads 32 pixels, throws half away and writes 16 pixels.
__declspec(naked) void ScaleRowDown2_SSSE3(const uint8_t* src_ptr,
                                           ptrdiff_t src_stride,
                                           uint8_t* dst_ptr,
                                           int dst_width) {
  __asm {
    mov eax, [esp + 4]  // src_ptr
    // src_stride ignored
    mov edx, [esp + 12]  // dst_ptr
    mov ecx, [esp + 16]  // dst_width

  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    psrlw xmm0, 8  // isolate odd pixels.
    psrlw xmm1, 8
    packuswb xmm0, xmm1
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg wloop

    ret
  }
}
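
// What the kernel above computes, as a scalar sketch (editor's illustration;
// libyuv's portable fallback lives in scale_common.cc): psrlw by 8 keeps the
// odd-indexed byte of each 16-bit pair, so output pixel x is source pixel
// 2 * x + 1.
static void ScaleRowDown2_Sketch(const uint8_t* src_ptr,
                                 uint8_t* dst_ptr,
                                 int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = src_ptr[2 * x + 1];  // keep odd pixels, drop even ones
  }
}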

// Blends 32x1 rectangle to 16x1.
__declspec(naked) void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr,
                                                 ptrdiff_t src_stride,
                                                 uint8_t* dst_ptr,
                                                 int dst_width) {
  __asm {
    mov eax, [esp + 4]  // src_ptr
    // src_stride
    mov edx, [esp + 12]  // dst_ptr
    mov ecx, [esp + 16]  // dst_width
    pcmpeqb xmm4, xmm4  // constant 0x0101
    psrlw xmm4, 15
    packuswb xmm4, xmm4
    pxor xmm5, xmm5  // constant 0

  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    pmaddubsw xmm0, xmm4  // horizontal add
    pmaddubsw xmm1, xmm4
    pavgw xmm0, xmm5  // (x + 1) / 2
    pavgw xmm1, xmm5
    packuswb xmm0, xmm1
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg wloop

    ret
  }
}
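
// Scalar sketch of the linear kernel (editor's illustration): pmaddubsw with
// 0x0101 sums adjacent bytes and pavgw with zero rounds, giving
// (a + b + 1) / 2 per output pixel.
static void ScaleRowDown2Linear_Sketch(const uint8_t* src_ptr,
                                       uint8_t* dst_ptr,
                                       int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = (uint8_t)((src_ptr[2 * x] + src_ptr[2 * x + 1] + 1) >> 1);
  }
}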

// Blends 32x2 rectangle to 16x1.
__declspec(naked) void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr,
                                              ptrdiff_t src_stride,
                                              uint8_t* dst_ptr,
                                              int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]  // src_ptr
    mov esi, [esp + 4 + 8]  // src_stride
    mov edx, [esp + 4 + 12]  // dst_ptr
    mov ecx, [esp + 4 + 16]  // dst_width
    pcmpeqb xmm4, xmm4  // constant 0x0101
    psrlw xmm4, 15
    packuswb xmm4, xmm4
    pxor xmm5, xmm5  // constant 0

  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + esi]
    movdqu xmm3, [eax + esi + 16]
    lea eax, [eax + 32]
    pmaddubsw xmm0, xmm4  // horizontal add
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    paddw xmm0, xmm2  // vertical add
    paddw xmm1, xmm3
    psrlw xmm0, 1
    psrlw xmm1, 1
    pavgw xmm0, xmm5  // (x + 1) / 2
    pavgw xmm1, xmm5
    packuswb xmm0, xmm1
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg wloop

    pop esi
    ret
  }
}
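
// Scalar sketch of the 2x2 box kernel (editor's illustration). The asm forms
// the exact 4-pixel sum, halves it, then uses pavgw(x, 0) = (x + 1) >> 1;
// the identity ((sum >> 1) + 1) >> 1 == (sum + 2) >> 2 holds for all sums.
static void ScaleRowDown2Box_Sketch(const uint8_t* src_ptr,
                                    ptrdiff_t src_stride,
                                    uint8_t* dst_ptr,
                                    int dst_width) {
  const uint8_t* s = src_ptr;
  const uint8_t* t = src_ptr + src_stride;
  int x;
  for (x = 0; x < dst_width; ++x) {
    int sum = s[2 * x] + s[2 * x + 1] + t[2 * x] + t[2 * x + 1];
    dst_ptr[x] = (uint8_t)((sum + 2) >> 2);
  }
}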

#ifdef HAS_SCALEROWDOWN2_AVX2
// Reads 64 pixels, throws half away and writes 32 pixels.
__declspec(naked) void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
                                          ptrdiff_t src_stride,
                                          uint8_t* dst_ptr,
                                          int dst_width) {
  __asm {
    mov eax, [esp + 4]  // src_ptr
    // src_stride ignored
    mov edx, [esp + 12]  // dst_ptr
    mov ecx, [esp + 16]  // dst_width

  wloop:
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    lea eax, [eax + 64]
    vpsrlw ymm0, ymm0, 8  // isolate odd pixels.
    vpsrlw ymm1, ymm1, 8
    vpackuswb ymm0, ymm0, ymm1
    vpermq ymm0, ymm0, 0xd8  // unmutate vpackuswb
    vmovdqu [edx], ymm0
    lea edx, [edx + 32]
    sub ecx, 32
    jg wloop

    vzeroupper
    ret
  }
}

// Blends 64x1 rectangle to 32x1.
__declspec(naked) void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr,
                                                ptrdiff_t src_stride,
                                                uint8_t* dst_ptr,
                                                int dst_width) {
  __asm {
    mov eax, [esp + 4]  // src_ptr
    // src_stride
    mov edx, [esp + 12]  // dst_ptr
    mov ecx, [esp + 16]  // dst_width
    vpcmpeqb ymm4, ymm4, ymm4  // '1' constant, 8b
    vpsrlw ymm4, ymm4, 15
    vpackuswb ymm4, ymm4, ymm4
    vpxor ymm5, ymm5, ymm5  // constant 0

  wloop:
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    lea eax, [eax + 64]
    vpmaddubsw ymm0, ymm0, ymm4  // horizontal add
    vpmaddubsw ymm1, ymm1, ymm4
    vpavgw ymm0, ymm0, ymm5  // (x + 1) / 2
    vpavgw ymm1, ymm1, ymm5
    vpackuswb ymm0, ymm0, ymm1
    vpermq ymm0, ymm0, 0xd8  // unmutate vpackuswb
    vmovdqu [edx], ymm0
    lea edx, [edx + 32]
    sub ecx, 32
    jg wloop

    vzeroupper
    ret
  }
}

// For rounding, average = (sum + 2) / 4 is computed as pavgw(sum >> 1, 0),
// since pavgw(x, 0) = (x + 1) / 2.
// Blends 64x2 rectangle to 32x1.
__declspec(naked) void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr,
                                             ptrdiff_t src_stride,
                                             uint8_t* dst_ptr,
                                             int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]  // src_ptr
    mov esi, [esp + 4 + 8]  // src_stride
    mov edx, [esp + 4 + 12]  // dst_ptr
    mov ecx, [esp + 4 + 16]  // dst_width
    vpcmpeqb ymm4, ymm4, ymm4  // '1' constant, 8b
    vpsrlw ymm4, ymm4, 15
    vpackuswb ymm4, ymm4, ymm4
    vpxor ymm5, ymm5, ymm5  // constant 0

  wloop:
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    vmovdqu ymm2, [eax + esi]
    vmovdqu ymm3, [eax + esi + 32]
    lea eax, [eax + 64]
    vpmaddubsw ymm0, ymm0, ymm4  // horizontal add
    vpmaddubsw ymm1, ymm1, ymm4
    vpmaddubsw ymm2, ymm2, ymm4
    vpmaddubsw ymm3, ymm3, ymm4
    vpaddw ymm0, ymm0, ymm2  // vertical add
    vpaddw ymm1, ymm1, ymm3
    vpsrlw ymm0, ymm0, 1  // (x + 2) / 4 = (x / 2 + 1) / 2
    vpsrlw ymm1, ymm1, 1
    vpavgw ymm0, ymm0, ymm5  // (x + 1) / 2
    vpavgw ymm1, ymm1, ymm5
    vpackuswb ymm0, ymm0, ymm1
    vpermq ymm0, ymm0, 0xd8  // unmutate vpackuswb
    vmovdqu [edx], ymm0
    lea edx, [edx + 32]
    sub ecx, 32
    jg wloop

    pop esi
    vzeroupper
    ret
  }
}
#endif  // HAS_SCALEROWDOWN2_AVX2

// Point samples 32 pixels to 8 pixels.
__declspec(naked) void ScaleRowDown4_SSSE3(const uint8_t* src_ptr,
                                           ptrdiff_t src_stride,
                                           uint8_t* dst_ptr,
                                           int dst_width) {
  __asm {
    mov eax, [esp + 4]  // src_ptr
    // src_stride ignored
    mov edx, [esp + 12]  // dst_ptr
    mov ecx, [esp + 16]  // dst_width
    pcmpeqb xmm5, xmm5  // generate mask 0x00ff0000
    psrld xmm5, 24
    pslld xmm5, 16

  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    pand xmm0, xmm5
    pand xmm1, xmm5
    packuswb xmm0, xmm1
    psrlw xmm0, 8
    packuswb xmm0, xmm0
    movq qword ptr [edx], xmm0
    lea edx, [edx + 8]
    sub ecx, 8
    jg wloop

    ret
  }
}
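
// Scalar sketch (editor's illustration): the 0x00ff0000 mask keeps byte 2 of
// every 4-byte group, so output pixel x is source pixel 4 * x + 2.
static void ScaleRowDown4_Sketch(const uint8_t* src_ptr,
                                 uint8_t* dst_ptr,
                                 int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = src_ptr[4 * x + 2];
  }
}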

// Blends 32x4 rectangle to 8x1.
__declspec(naked) void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr,
                                              ptrdiff_t src_stride,
                                              uint8_t* dst_ptr,
                                              int dst_width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]  // src_ptr
    mov esi, [esp + 8 + 8]  // src_stride
    mov edx, [esp + 8 + 12]  // dst_ptr
    mov ecx, [esp + 8 + 16]  // dst_width
    lea edi, [esi + esi * 2]  // src_stride * 3
    pcmpeqb xmm4, xmm4  // constant 0x0101
    psrlw xmm4, 15
    movdqa xmm5, xmm4
    packuswb xmm4, xmm4
    psllw xmm5, 3  // constant 0x0008

  wloop:
    movdqu xmm0, [eax]  // average rows
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + esi]
    movdqu xmm3, [eax + esi + 16]
    pmaddubsw xmm0, xmm4  // horizontal add
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    paddw xmm0, xmm2  // vertical add rows 0, 1
    paddw xmm1, xmm3
    movdqu xmm2, [eax + esi * 2]
    movdqu xmm3, [eax + esi * 2 + 16]
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    paddw xmm0, xmm2  // add row 2
    paddw xmm1, xmm3
    movdqu xmm2, [eax + edi]
    movdqu xmm3, [eax + edi + 16]
    lea eax, [eax + 32]
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    paddw xmm0, xmm2  // add row 3
    paddw xmm1, xmm3
    phaddw xmm0, xmm1
    paddw xmm0, xmm5  // + 8 for round
    psrlw xmm0, 4  // /16 for average of 4 * 4
    packuswb xmm0, xmm0
    movq qword ptr [edx], xmm0
    lea edx, [edx + 8]
    sub ecx, 8
    jg wloop

    pop edi
    pop esi
    ret
  }
}
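
// Scalar sketch of the 4x4 box kernel (editor's illustration): sum the 16
// contributing pixels, add 8 for rounding, shift by 4 to divide by 16.
static void ScaleRowDown4Box_Sketch(const uint8_t* src_ptr,
                                    ptrdiff_t src_stride,
                                    uint8_t* dst_ptr,
                                    int dst_width) {
  int x, r, c;
  for (x = 0; x < dst_width; ++x) {
    int sum = 0;
    for (r = 0; r < 4; ++r) {
      for (c = 0; c < 4; ++c) {
        sum += src_ptr[r * src_stride + 4 * x + c];
      }
    }
    dst_ptr[x] = (uint8_t)((sum + 8) >> 4);
  }
}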

#ifdef HAS_SCALEROWDOWN4_AVX2
// Point samples 64 pixels to 16 pixels.
__declspec(naked) void ScaleRowDown4_AVX2(const uint8_t* src_ptr,
                                          ptrdiff_t src_stride,
                                          uint8_t* dst_ptr,
                                          int dst_width) {
  __asm {
    mov eax, [esp + 4]  // src_ptr
    // src_stride ignored
    mov edx, [esp + 12]  // dst_ptr
    mov ecx, [esp + 16]  // dst_width
    vpcmpeqb ymm5, ymm5, ymm5  // generate mask 0x00ff0000
    vpsrld ymm5, ymm5, 24
    vpslld ymm5, ymm5, 16

  wloop:
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    lea eax, [eax + 64]
    vpand ymm0, ymm0, ymm5
    vpand ymm1, ymm1, ymm5
    vpackuswb ymm0, ymm0, ymm1
    vpermq ymm0, ymm0, 0xd8  // unmutate vpackuswb
    vpsrlw ymm0, ymm0, 8
    vpackuswb ymm0, ymm0, ymm0
    vpermq ymm0, ymm0, 0xd8  // unmutate vpackuswb
    vmovdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg wloop

    vzeroupper
    ret
  }
}

// Blends 64x4 rectangle to 16x1.
__declspec(naked) void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr,
                                             ptrdiff_t src_stride,
                                             uint8_t* dst_ptr,
                                             int dst_width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]  // src_ptr
    mov esi, [esp + 8 + 8]  // src_stride
    mov edx, [esp + 8 + 12]  // dst_ptr
    mov ecx, [esp + 8 + 16]  // dst_width
    lea edi, [esi + esi * 2]  // src_stride * 3
    vpcmpeqb ymm4, ymm4, ymm4  // constant 0x0101
    vpsrlw ymm4, ymm4, 15
    vpsllw ymm5, ymm4, 3  // constant 0x0008
    vpackuswb ymm4, ymm4, ymm4

  wloop:
    vmovdqu ymm0, [eax]  // average rows
    vmovdqu ymm1, [eax + 32]
    vmovdqu ymm2, [eax + esi]
    vmovdqu ymm3, [eax + esi + 32]
    vpmaddubsw ymm0, ymm0, ymm4  // horizontal add
    vpmaddubsw ymm1, ymm1, ymm4
    vpmaddubsw ymm2, ymm2, ymm4
    vpmaddubsw ymm3, ymm3, ymm4
    vpaddw ymm0, ymm0, ymm2  // vertical add rows 0, 1
    vpaddw ymm1, ymm1, ymm3
    vmovdqu ymm2, [eax + esi * 2]
    vmovdqu ymm3, [eax + esi * 2 + 32]
    vpmaddubsw ymm2, ymm2, ymm4
    vpmaddubsw ymm3, ymm3, ymm4
    vpaddw ymm0, ymm0, ymm2  // add row 2
    vpaddw ymm1, ymm1, ymm3
    vmovdqu ymm2, [eax + edi]
    vmovdqu ymm3, [eax + edi + 32]
    lea eax, [eax + 64]
    vpmaddubsw ymm2, ymm2, ymm4
    vpmaddubsw ymm3, ymm3, ymm4
    vpaddw ymm0, ymm0, ymm2  // add row 3
    vpaddw ymm1, ymm1, ymm3
    vphaddw ymm0, ymm0, ymm1  // mutates
    vpermq ymm0, ymm0, 0xd8  // unmutate vphaddw
    vpaddw ymm0, ymm0, ymm5  // + 8 for round
    vpsrlw ymm0, ymm0, 4  // /16 for average of 4 * 4
    vpackuswb ymm0, ymm0, ymm0
    vpermq ymm0, ymm0, 0xd8  // unmutate vpackuswb
    vmovdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg wloop

    pop edi
    pop esi
    vzeroupper
    ret
  }
}
#endif  // HAS_SCALEROWDOWN4_AVX2

// Point samples 32 pixels to 24 pixels.
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.
__declspec(naked) void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
                                            ptrdiff_t src_stride,
                                            uint8_t* dst_ptr,
                                            int dst_width) {
  __asm {
    mov eax, [esp + 4]  // src_ptr
    // src_stride ignored
    mov edx, [esp + 12]  // dst_ptr
    mov ecx, [esp + 16]  // dst_width
    movdqa xmm3, xmmword ptr kShuf0
    movdqa xmm4, xmmword ptr kShuf1
    movdqa xmm5, xmmword ptr kShuf2

  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    movdqa xmm2, xmm1
    palignr xmm1, xmm0, 8
    pshufb xmm0, xmm3
    pshufb xmm1, xmm4
    pshufb xmm2, xmm5
    movq qword ptr [edx], xmm0
    movq qword ptr [edx + 8], xmm1
    movq qword ptr [edx + 16], xmm2
    lea edx, [edx + 24]
    sub ecx, 24
    jg wloop

    ret
  }
}
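
// Scalar sketch (editor's illustration): the three shuffles together keep
// bytes 0, 1 and 3 of every 4 source pixels, i.e. they drop source pixel
// 4 * n + 2.
static void ScaleRowDown34_Sketch(const uint8_t* src_ptr,
                                  uint8_t* dst_ptr,
                                  int dst_width) {
  int x;
  for (x = 0; x < dst_width / 3; ++x) {
    dst_ptr[3 * x + 0] = src_ptr[4 * x + 0];
    dst_ptr[3 * x + 1] = src_ptr[4 * x + 1];
    dst_ptr[3 * x + 2] = src_ptr[4 * x + 3];
  }
}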

// Blends 32x2 rectangle to 24x1.
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.
// Register usage:
//   xmm0 src_row 0
//   xmm1 src_row 1
//   xmm2 shuf 0
//   xmm3 shuf 1
//   xmm4 shuf 2
//   xmm5 madd 0
//   xmm6 madd 1
//   xmm7 kRound34
// Note that movdqa+palign may be better than movdqu.
__declspec(naked) void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
                                                  ptrdiff_t src_stride,
                                                  uint8_t* dst_ptr,
                                                  int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]  // src_ptr
    mov esi, [esp + 4 + 8]  // src_stride
    mov edx, [esp + 4 + 12]  // dst_ptr
    mov ecx, [esp + 4 + 16]  // dst_width
    movdqa xmm2, xmmword ptr kShuf01
    movdqa xmm3, xmmword ptr kShuf11
    movdqa xmm4, xmmword ptr kShuf21
    movdqa xmm5, xmmword ptr kMadd01
    movdqa xmm6, xmmword ptr kMadd11
    movdqa xmm7, xmmword ptr kRound34

  wloop:
    movdqu xmm0, [eax]  // pixels 0..7
    movdqu xmm1, [eax + esi]
    pavgb xmm0, xmm1
    pshufb xmm0, xmm2
    pmaddubsw xmm0, xmm5
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edx], xmm0
    movdqu xmm0, [eax + 8]  // pixels 8..15
    movdqu xmm1, [eax + esi + 8]
    pavgb xmm0, xmm1
    pshufb xmm0, xmm3
    pmaddubsw xmm0, xmm6
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edx + 8], xmm0
    movdqu xmm0, [eax + 16]  // pixels 16..23
    movdqu xmm1, [eax + esi + 16]
    lea eax, [eax + 32]
    pavgb xmm0, xmm1
    pshufb xmm0, xmm4
    movdqa xmm1, xmmword ptr kMadd21
    pmaddubsw xmm0, xmm1
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edx + 16], xmm0
    lea edx, [edx + 24]
    sub ecx, 24
    jg wloop

    pop esi
    ret
  }
}
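
// Scalar sketch of the horizontal 3/4 filter above (editor's illustration).
// After the vertical pavgb, each group of 4 pixels produces 3 outputs with
// weights (3,1), (2,2) and (1,3), rounded by kRound34 before the >> 2.
static void ScaleRowDown34Filter_Sketch(const uint8_t* s /* blended row */,
                                        uint8_t* d,
                                        int dst_width) {
  int x;
  for (x = 0; x < dst_width / 3; ++x) {
    d[3 * x + 0] = (uint8_t)((s[4 * x + 0] * 3 + s[4 * x + 1] * 1 + 2) >> 2);
    d[3 * x + 1] = (uint8_t)((s[4 * x + 1] * 2 + s[4 * x + 2] * 2 + 2) >> 2);
    d[3 * x + 2] = (uint8_t)((s[4 * x + 2] * 1 + s[4 * x + 3] * 3 + 2) >> 2);
  }
}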

// Note that movdqa+palign may be better than movdqu.
__declspec(naked) void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
                                                  ptrdiff_t src_stride,
                                                  uint8_t* dst_ptr,
                                                  int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]  // src_ptr
    mov esi, [esp + 4 + 8]  // src_stride
    mov edx, [esp + 4 + 12]  // dst_ptr
    mov ecx, [esp + 4 + 16]  // dst_width
    movdqa xmm2, xmmword ptr kShuf01
    movdqa xmm3, xmmword ptr kShuf11
    movdqa xmm4, xmmword ptr kShuf21
    movdqa xmm5, xmmword ptr kMadd01
    movdqa xmm6, xmmword ptr kMadd11
    movdqa xmm7, xmmword ptr kRound34

  wloop:
    movdqu xmm0, [eax]  // pixels 0..7
    movdqu xmm1, [eax + esi]
    pavgb xmm1, xmm0
    pavgb xmm0, xmm1
    pshufb xmm0, xmm2
    pmaddubsw xmm0, xmm5
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edx], xmm0
    movdqu xmm0, [eax + 8]  // pixels 8..15
    movdqu xmm1, [eax + esi + 8]
    pavgb xmm1, xmm0
    pavgb xmm0, xmm1
    pshufb xmm0, xmm3
    pmaddubsw xmm0, xmm6
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edx + 8], xmm0
    movdqu xmm0, [eax + 16]  // pixels 16..23
    movdqu xmm1, [eax + esi + 16]
    lea eax, [eax + 32]
    pavgb xmm1, xmm0
    pavgb xmm0, xmm1
    pshufb xmm0, xmm4
    movdqa xmm1, xmmword ptr kMadd21
    pmaddubsw xmm0, xmm1
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edx + 16], xmm0
    lea edx, [edx + 24]
    sub ecx, 24
    jg wloop

    pop esi
    ret
  }
}
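
// The _0_Box variant needs a 3:1 vertical weighting rather than the 1:1 of
// _1_Box, which the two pavgb instructions above approximate:
// avg(a, avg(b, a)) is roughly (3 * a + b) / 4. Editor's scalar
// illustration of that step:
static inline uint8_t Blend31_Sketch(uint8_t a, uint8_t b) {
  uint8_t ab = (uint8_t)((a + b + 1) >> 1);  // pavgb xmm1, xmm0
  return (uint8_t)((a + ab + 1) >> 1);       // pavgb xmm0, xmm1
}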

// 3/8 point sampler.
// Scale 32 pixels to 12.
__declspec(naked) void ScaleRowDown38_SSSE3(const uint8_t* src_ptr,
                                            ptrdiff_t src_stride,
                                            uint8_t* dst_ptr,
                                            int dst_width) {
  __asm {
    mov eax, [esp + 4]  // src_ptr
    // src_stride ignored
    mov edx, [esp + 12]  // dst_ptr
    mov ecx, [esp + 16]  // dst_width
    movdqa xmm4, xmmword ptr kShuf38a
    movdqa xmm5, xmmword ptr kShuf38b

  xloop:
    movdqu xmm0, [eax]  // 16 pixels -> 0,1,2,3,4,5
    movdqu xmm1, [eax + 16]  // 16 pixels -> 6,7,8,9,10,11
    lea eax, [eax + 32]
    pshufb xmm0, xmm4
    pshufb xmm1, xmm5
    paddusb xmm0, xmm1
    movq qword ptr [edx], xmm0  // write 12 pixels
    movhlps xmm1, xmm0
    movd [edx + 8], xmm1
    lea edx, [edx + 12]
    sub ecx, 12
    jg xloop

    ret
  }
}
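
// Scalar sketch (editor's illustration): kShuf38a/b select source bytes
// 0, 3, 6, 8, 11 and 14 from each group of 16, producing 6 outputs per 16
// inputs, i.e. 3/8 point sampling.
static void ScaleRowDown38_Sketch(const uint8_t* src_ptr,
                                  uint8_t* dst_ptr,
                                  int dst_width) {
  static const int kIdx[6] = {0, 3, 6, 8, 11, 14};
  int x, i;
  for (x = 0; x < dst_width / 6; ++x) {
    for (i = 0; i < 6; ++i) {
      dst_ptr[6 * x + i] = src_ptr[16 * x + kIdx[i]];
    }
  }
}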

// Scale 16x3 pixels to 6x1 with interpolation.
__declspec(naked) void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
                                                  ptrdiff_t src_stride,
                                                  uint8_t* dst_ptr,
                                                  int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]  // src_ptr
    mov esi, [esp + 4 + 8]  // src_stride
    mov edx, [esp + 4 + 12]  // dst_ptr
    mov ecx, [esp + 4 + 16]  // dst_width
    movdqa xmm2, xmmword ptr kShufAc
    movdqa xmm3, xmmword ptr kShufAc3
    movdqa xmm4, xmmword ptr kScaleAc33
    pxor xmm5, xmm5

  xloop:
    movdqu xmm0, [eax]  // sum up 3 rows into xmm0/1
    movdqu xmm6, [eax + esi]
    movhlps xmm1, xmm0
    movhlps xmm7, xmm6
    punpcklbw xmm0, xmm5
    punpcklbw xmm1, xmm5
    punpcklbw xmm6, xmm5
    punpcklbw xmm7, xmm5
    paddusw xmm0, xmm6
    paddusw xmm1, xmm7
    movdqu xmm6, [eax + esi * 2]
    lea eax, [eax + 16]
    movhlps xmm7, xmm6
    punpcklbw xmm6, xmm5
    punpcklbw xmm7, xmm5
    paddusw xmm0, xmm6
    paddusw xmm1, xmm7
    movdqa xmm6, xmm0  // 8 pixels -> 0,1,2 of xmm6
    psrldq xmm0, 2
    paddusw xmm6, xmm0
    psrldq xmm0, 2
    paddusw xmm6, xmm0
    pshufb xmm6, xmm2
    movdqa xmm7, xmm1  // 8 pixels -> 3,4,5 of xmm6
    psrldq xmm1, 2
    paddusw xmm7, xmm1
    psrldq xmm1, 2
    paddusw xmm7, xmm1
    pshufb xmm7, xmm3
    paddusw xmm6, xmm7
    pmulhuw xmm6, xmm4  // divide by 9,9,6, 9,9,6
    packuswb xmm6, xmm6
    movd [edx], xmm6  // write 6 pixels
    psrlq xmm6, 16
    movd [edx + 2], xmm6
    lea edx, [edx + 6]
    sub ecx, 6
    jg xloop

    pop esi
    ret
  }
}

// Scale 16x2 pixels to 6x1 with interpolation.
__declspec(naked) void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
                                                  ptrdiff_t src_stride,
                                                  uint8_t* dst_ptr,
                                                  int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]  // src_ptr
    mov esi, [esp + 4 + 8]  // src_stride
    mov edx, [esp + 4 + 12]  // dst_ptr
    mov ecx, [esp + 4 + 16]  // dst_width
    movdqa xmm2, xmmword ptr kShufAb0
    movdqa xmm3, xmmword ptr kShufAb1
    movdqa xmm4, xmmword ptr kShufAb2
    movdqa xmm5, xmmword ptr kScaleAb2

  xloop:
    movdqu xmm0, [eax]  // average 2 rows into xmm0
    movdqu xmm1, [eax + esi]
    lea eax, [eax + 16]
    pavgb xmm0, xmm1
    movdqa xmm1, xmm0  // 16 pixels -> 0,1,2,3,4,5 of xmm1
    pshufb xmm1, xmm2
    movdqa xmm6, xmm0
    pshufb xmm6, xmm3
    paddusw xmm1, xmm6
    pshufb xmm0, xmm4
    paddusw xmm1, xmm0
    pmulhuw xmm1, xmm5  // divide by 3,3,2, 3,3,2
    packuswb xmm1, xmm1
    movd [edx], xmm1  // write 6 pixels
    psrlq xmm1, 16
    movd [edx + 2], xmm1
    lea edx, [edx + 6]
    sub ecx, 6
    jg xloop

    pop esi
    ret
  }
}

// Reads 16 bytes and accumulates to 16 shorts at a time.
__declspec(naked) void ScaleAddRow_SSE2(const uint8_t* src_ptr,
                                        uint16_t* dst_ptr,
                                        int src_width) {
  __asm {
    mov eax, [esp + 4]  // src_ptr
    mov edx, [esp + 8]  // dst_ptr
    mov ecx, [esp + 12]  // src_width
    pxor xmm5, xmm5

    // sum rows
  xloop:
    movdqu xmm3, [eax]  // read 16 bytes
    lea eax, [eax + 16]
    movdqu xmm0, [edx]  // read 16 words from destination
    movdqu xmm1, [edx + 16]
    movdqa xmm2, xmm3
    punpcklbw xmm2, xmm5
    punpckhbw xmm3, xmm5
    paddusw xmm0, xmm2  // sum 16 words
    paddusw xmm1, xmm3
    movdqu [edx], xmm0  // write 16 words to destination
    movdqu [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 16
    jg xloop

    ret
  }
}
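
// Scalar sketch (editor's illustration): ScaleAddRow accumulates a row of
// bytes into a row of 16-bit sums with unsigned saturation (paddusw), so
// several rows can be summed before a box filter divides once.
static void ScaleAddRow_Sketch(const uint8_t* src_ptr,
                               uint16_t* dst_ptr,
                               int src_width) {
  int x;
  for (x = 0; x < src_width; ++x) {
    uint32_t sum = (uint32_t)dst_ptr[x] + src_ptr[x];
    dst_ptr[x] = (uint16_t)(sum > 65535 ? 65535 : sum);
  }
}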

#ifdef HAS_SCALEADDROW_AVX2
// Reads 32 bytes and accumulates to 32 shorts at a time.
__declspec(naked) void ScaleAddRow_AVX2(const uint8_t* src_ptr,
                                        uint16_t* dst_ptr,
                                        int src_width) {
  __asm {
    mov eax, [esp + 4]  // src_ptr
    mov edx, [esp + 8]  // dst_ptr
    mov ecx, [esp + 12]  // src_width
    vpxor ymm5, ymm5, ymm5

    // sum rows
  xloop:
    vmovdqu ymm3, [eax]  // read 32 bytes
    lea eax, [eax + 32]
    vpermq ymm3, ymm3, 0xd8  // unmutate for vpunpck
    vpunpcklbw ymm2, ymm3, ymm5
    vpunpckhbw ymm3, ymm3, ymm5
    vpaddusw ymm0, ymm2, [edx]  // sum 16 words
    vpaddusw ymm1, ymm3, [edx + 32]
    vmovdqu [edx], ymm0  // write 32 words to destination
    vmovdqu [edx + 32], ymm1
    lea edx, [edx + 64]
    sub ecx, 32
    jg xloop

    vzeroupper
    ret
  }
}
#endif  // HAS_SCALEADDROW_AVX2

// Constant for making pixels signed to avoid pmaddubsw saturation.
static const uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
                              0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};

// Constant for making pixels unsigned and adding .5 for rounding.
static const uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040,
                               0x4040, 0x4040, 0x4040, 0x4040};

// Bilinear column filtering. SSSE3 version.
__declspec(naked) void ScaleFilterCols_SSSE3(uint8_t* dst_ptr,
                                             const uint8_t* src_ptr,
                                             int dst_width,
                                             int x,
                                             int dx) {
  __asm {
    push ebx
    push esi
    push edi
    mov edi, [esp + 12 + 4]  // dst_ptr
    mov esi, [esp + 12 + 8]  // src_ptr
    mov ecx, [esp + 12 + 12]  // dst_width
    movd xmm2, [esp + 12 + 16]  // x
    movd xmm3, [esp + 12 + 20]  // dx
    mov eax, 0x04040000  // shuffle to line up fractions with pixel.
    movd xmm5, eax
    pcmpeqb xmm6, xmm6  // generate 0x007f for inverting fraction.
    psrlw xmm6, 9
    pcmpeqb xmm7, xmm7  // generate 0x0001
    psrlw xmm7, 15
    pextrw eax, xmm2, 1  // get x0 integer. preroll
    sub ecx, 2
    jl xloop29

    movdqa xmm0, xmm2  // x1 = x0 + dx
    paddd xmm0, xmm3
    punpckldq xmm2, xmm0  // x0 x1
    punpckldq xmm3, xmm3  // dx dx
    paddd xmm3, xmm3  // dx * 2, dx * 2
    pextrw edx, xmm2, 3  // get x1 integer. preroll

    // 2 Pixel loop.
  xloop2:
    movdqa xmm1, xmm2  // x0, x1 fractions.
    paddd xmm2, xmm3  // x += dx
    movzx ebx, word ptr [esi + eax]  // 2 source x0 pixels
    movd xmm0, ebx
    psrlw xmm1, 9  // 7 bit fractions.
    movzx ebx, word ptr [esi + edx]  // 2 source x1 pixels
    movd xmm4, ebx
    pshufb xmm1, xmm5  // 0011
    punpcklwd xmm0, xmm4
    psubb xmm0, xmmword ptr kFsub80  // make pixels signed.
    pxor xmm1, xmm6  // 0..7f and 7f..0
    paddusb xmm1, xmm7  // +1 so 0..7f and 80..1
    pmaddubsw xmm1, xmm0  // 16 bit, 2 pixels.
    pextrw eax, xmm2, 1  // get x0 integer. next iteration.
    pextrw edx, xmm2, 3  // get x1 integer. next iteration.
    paddw xmm1, xmmword ptr kFadd40  // make pixels unsigned and round.
    psrlw xmm1, 7  // 8.7 fixed point to low 8 bits.
    packuswb xmm1, xmm1  // 8 bits, 2 pixels.
    movd ebx, xmm1
    mov [edi], bx
    lea edi, [edi + 2]
    sub ecx, 2  // 2 pixels
    jge xloop2

  xloop29:
    add ecx, 2 - 1
    jl xloop99

    // 1 pixel remainder
    movzx ebx, word ptr [esi + eax]  // 2 source x0 pixels
    movd xmm0, ebx
    psrlw xmm2, 9  // 7 bit fractions.
    pshufb xmm2, xmm5  // 0011
    psubb xmm0, xmmword ptr kFsub80  // make pixels signed.
    pxor xmm2, xmm6  // 0..7f and 7f..0
    paddusb xmm2, xmm7  // +1 so 0..7f and 80..1
    pmaddubsw xmm2, xmm0  // 16 bit
    paddw xmm2, xmmword ptr kFadd40  // make pixels unsigned and round.
    psrlw xmm2, 7  // 8.7 fixed point to low 8 bits.
    packuswb xmm2, xmm2  // 8 bits
    movd ebx, xmm2
    mov [edi], bl

  xloop99:
    pop edi
    pop esi
    pop ebx
    ret
  }
}
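
// Scalar sketch of the bilinear column filter (editor's illustration).
// x and dx are 16.16 fixed point; the asm keeps 7 fraction bits, biases the
// pixels signed for pmaddubsw, then kFadd40 (0x4040 = 128 * 128 + 64) undoes
// the bias and adds the rounding term before the >> 7.
static void ScaleFilterCols_Sketch(uint8_t* dst_ptr,
                                   const uint8_t* src_ptr,
                                   int dst_width,
                                   int x,
                                   int dx) {
  int i;
  for (i = 0; i < dst_width; ++i) {
    int xi = x >> 16;
    int f = (x >> 9) & 0x7f;  // top 7 bits of the fraction
    dst_ptr[i] =
        (uint8_t)((src_ptr[xi] * (128 - f) + src_ptr[xi + 1] * f + 64) >> 7);
    x += dx;
  }
}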

// Reads 16 pixels, duplicates them and writes 32 pixels.
__declspec(naked) void ScaleColsUp2_SSE2(uint8_t* dst_ptr,
                                         const uint8_t* src_ptr,
                                         int dst_width,
                                         int x,
                                         int dx) {
  __asm {
    mov edx, [esp + 4]  // dst_ptr
    mov eax, [esp + 8]  // src_ptr
    mov ecx, [esp + 12]  // dst_width

  wloop:
    movdqu xmm0, [eax]
    lea eax, [eax + 16]
    movdqa xmm1, xmm0
    punpcklbw xmm0, xmm0
    punpckhbw xmm1, xmm1
    movdqu [edx], xmm0
    movdqu [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 32
    jg wloop

    ret
  }
}

// Reads 8 pixels, throws half away and writes 4 odd pixels (1, 3, 5, 7).
__declspec(naked) void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
                                              ptrdiff_t src_stride,
                                              uint8_t* dst_argb,
                                              int dst_width) {
  __asm {
    mov eax, [esp + 4]  // src_argb
    // src_stride ignored
    mov edx, [esp + 12]  // dst_argb
    mov ecx, [esp + 16]  // dst_width

  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    shufps xmm0, xmm1, 0xdd  // odd pixels
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jg wloop

    ret
  }
}

// Blends 8x1 rectangle to 4x1.
__declspec(naked) void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
                                                    ptrdiff_t src_stride,
                                                    uint8_t* dst_argb,
                                                    int dst_width) {
  __asm {
    mov eax, [esp + 4]  // src_argb
    // src_stride ignored
    mov edx, [esp + 12]  // dst_argb
    mov ecx, [esp + 16]  // dst_width

  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    movdqa xmm2, xmm0
    shufps xmm0, xmm1, 0x88  // even pixels
    shufps xmm2, xmm1, 0xdd  // odd pixels
    pavgb xmm0, xmm2
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jg wloop

    ret
  }
}

// Blends 8x2 rectangle to 4x1.
__declspec(naked) void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb,
                                                 ptrdiff_t src_stride,
                                                 uint8_t* dst_argb,
                                                 int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]  // src_argb
    mov esi, [esp + 4 + 8]  // src_stride
    mov edx, [esp + 4 + 12]  // dst_argb
    mov ecx, [esp + 4 + 16]  // dst_width

  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + esi]
    movdqu xmm3, [eax + esi + 16]
    lea eax, [eax + 32]
    pavgb xmm0, xmm2  // average rows
    pavgb xmm1, xmm3
    movdqa xmm2, xmm0  // average columns (8 to 4 pixels)
    shufps xmm0, xmm1, 0x88  // even pixels
    shufps xmm2, xmm1, 0xdd  // odd pixels
    pavgb xmm0, xmm2
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jg wloop

    pop esi
    ret
  }
}

// Reads 4 pixels at a time.
__declspec(naked) void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb,
                                                 ptrdiff_t src_stride,
                                                 int src_stepx,
                                                 uint8_t* dst_argb,
                                                 int dst_width) {
  __asm {
    push ebx
    push edi
    mov eax, [esp + 8 + 4]  // src_argb
    // src_stride ignored
    mov ebx, [esp + 8 + 12]  // src_stepx
    mov edx, [esp + 8 + 16]  // dst_argb
    mov ecx, [esp + 8 + 20]  // dst_width
    lea ebx, [ebx * 4]
    lea edi, [ebx + ebx * 2]

  wloop:
    movd xmm0, [eax]
    movd xmm1, [eax + ebx]
    punpckldq xmm0, xmm1
    movd xmm2, [eax + ebx * 2]
    movd xmm3, [eax + edi]
    lea eax, [eax + ebx * 4]
    punpckldq xmm2, xmm3
    punpcklqdq xmm0, xmm2
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jg wloop

    pop edi
    pop ebx
    ret
  }
}

// Blends four 2x2 to 4x1.
__declspec(naked) void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb,
                                                    ptrdiff_t src_stride,
                                                    int src_stepx,
                                                    uint8_t* dst_argb,
                                                    int dst_width) {
  __asm {
    push ebx
    push esi
    push edi
    mov eax, [esp + 12 + 4]  // src_argb
    mov esi, [esp + 12 + 8]  // src_stride
    mov ebx, [esp + 12 + 12]  // src_stepx
    mov edx, [esp + 12 + 16]  // dst_argb
    mov ecx, [esp + 12 + 20]  // dst_width
    lea esi, [eax + esi]  // row1 pointer
    lea ebx, [ebx * 4]
    lea edi, [ebx + ebx * 2]

  wloop:
    movq xmm0, qword ptr [eax]  // row0 4 pairs
    movhps xmm0, qword ptr [eax + ebx]
    movq xmm1, qword ptr [eax + ebx * 2]
    movhps xmm1, qword ptr [eax + edi]
    lea eax, [eax + ebx * 4]
    movq xmm2, qword ptr [esi]  // row1 4 pairs
    movhps xmm2, qword ptr [esi + ebx]
    movq xmm3, qword ptr [esi + ebx * 2]
    movhps xmm3, qword ptr [esi + edi]
    lea esi, [esi + ebx * 4]
    pavgb xmm0, xmm2  // average rows
    pavgb xmm1, xmm3
    movdqa xmm2, xmm0  // average columns (8 to 4 pixels)
    shufps xmm0, xmm1, 0x88  // even pixels
    shufps xmm2, xmm1, 0xdd  // odd pixels
    pavgb xmm0, xmm2
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jg wloop

    pop edi
    pop esi
    pop ebx
    ret
  }
}

// Column scaling unfiltered. SSE2 version.
__declspec(naked) void ScaleARGBCols_SSE2(uint8_t* dst_argb,
                                          const uint8_t* src_argb,
                                          int dst_width,
                                          int x,
                                          int dx) {
  __asm {
    push edi
    push esi
    mov edi, [esp + 8 + 4]  // dst_argb
    mov esi, [esp + 8 + 8]  // src_argb
    mov ecx, [esp + 8 + 12]  // dst_width
    movd xmm2, [esp + 8 + 16]  // x
    movd xmm3, [esp + 8 + 20]  // dx
    pshufd xmm2, xmm2, 0  // x0 x0 x0 x0
    pshufd xmm0, xmm3, 0x11  // dx 0 dx 0
    paddd xmm2, xmm0
    paddd xmm3, xmm3  // 0, 0, 0, dx * 2
    pshufd xmm0, xmm3, 0x05  // dx * 2, dx * 2, 0, 0
    paddd xmm2, xmm0  // x3 x2 x1 x0
    paddd xmm3, xmm3  // 0, 0, 0, dx * 4
    pshufd xmm3, xmm3, 0  // dx * 4, dx * 4, dx * 4, dx * 4
    pextrw eax, xmm2, 1  // get x0 integer.
    pextrw edx, xmm2, 3  // get x1 integer.
    cmp ecx, 0
    jle xloop99
    sub ecx, 4
    jl xloop49

    // 4 Pixel loop.
  xloop4:
    movd xmm0, [esi + eax * 4]  // 1 source x0 pixels
    movd xmm1, [esi + edx * 4]  // 1 source x1 pixels
    pextrw eax, xmm2, 5  // get x2 integer.
    pextrw edx, xmm2, 7  // get x3 integer.
    paddd xmm2, xmm3  // x += dx
    punpckldq xmm0, xmm1  // x0 x1
    movd xmm1, [esi + eax * 4]  // 1 source x2 pixels
    movd xmm4, [esi + edx * 4]  // 1 source x3 pixels
    pextrw eax, xmm2, 1  // get x0 integer. next iteration.
    pextrw edx, xmm2, 3  // get x1 integer. next iteration.
    punpckldq xmm1, xmm4  // x2 x3
    punpcklqdq xmm0, xmm1  // x0 x1 x2 x3
    movdqu [edi], xmm0
    lea edi, [edi + 16]
    sub ecx, 4  // 4 pixels
    jge xloop4

  xloop49:
    test ecx, 2
    je xloop29

    // 2 Pixels.
    movd xmm0, [esi + eax * 4]  // 1 source x0 pixels
    movd xmm1, [esi + edx * 4]  // 1 source x1 pixels
    pextrw eax, xmm2, 5  // get x2 integer.
    punpckldq xmm0, xmm1  // x0 x1
    movq qword ptr [edi], xmm0
    lea edi, [edi + 8]

  xloop29:
    test ecx, 1
    je xloop99

    // 1 Pixels.
    movd xmm0, [esi + eax * 4]  // 1 source x2 pixels
    movd dword ptr [edi], xmm0

  xloop99:
    pop esi
    pop edi
    ret
  }
}
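
// Scalar sketch of the unfiltered column scaler (editor's illustration):
// nearest-neighbor sampling of 32-bit ARGB pixels with 16.16 fixed-point x.
static void ScaleARGBCols_Sketch(uint8_t* dst_argb,
                                 const uint8_t* src_argb,
                                 int dst_width,
                                 int x,
                                 int dx) {
  const uint32_t* src = (const uint32_t*)src_argb;
  uint32_t* dst = (uint32_t*)dst_argb;
  int i;
  for (i = 0; i < dst_width; ++i) {
    dst[i] = src[x >> 16];
    x += dx;
  }
}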

// Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version.
// TODO(fbarchard): Port to Neon

// Shuffle table for arranging 2 pixels into pairs for pmaddubsw.
static const uvec8 kShuffleColARGB = {
    0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,  // bbggrraa 1st pixel
    8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
};

// Shuffle table for duplicating 2 fractions into 8 bytes each.
static const uvec8 kShuffleFractions = {
    0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
};

__declspec(naked) void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
                                                 const uint8_t* src_argb,
                                                 int dst_width,
                                                 int x,
                                                 int dx) {
  __asm {
    push esi
    push edi
    mov edi, [esp + 8 + 4]  // dst_argb
    mov esi, [esp + 8 + 8]  // src_argb
    mov ecx, [esp + 8 + 12]  // dst_width
    movd xmm2, [esp + 8 + 16]  // x
    movd xmm3, [esp + 8 + 20]  // dx
    movdqa xmm4, xmmword ptr kShuffleColARGB
    movdqa xmm5, xmmword ptr kShuffleFractions
    pcmpeqb xmm6, xmm6  // generate 0x007f for inverting fraction.
    psrlw xmm6, 9
    pextrw eax, xmm2, 1  // get x0 integer. preroll
    sub ecx, 2
    jl xloop29

    movdqa xmm0, xmm2  // x1 = x0 + dx
    paddd xmm0, xmm3
    punpckldq xmm2, xmm0  // x0 x1
    punpckldq xmm3, xmm3  // dx dx
    paddd xmm3, xmm3  // dx * 2, dx * 2
    pextrw edx, xmm2, 3  // get x1 integer. preroll

    // 2 Pixel loop.
  xloop2:
    movdqa xmm1, xmm2  // x0, x1 fractions.
    paddd xmm2, xmm3  // x += dx
    movq xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
    psrlw xmm1, 9  // 7 bit fractions.
    movhps xmm0, qword ptr [esi + edx * 4]  // 2 source x1 pixels
    pshufb xmm1, xmm5  // 0000000011111111
    pshufb xmm0, xmm4  // arrange pixels into pairs
    pxor xmm1, xmm6  // 0..7f and 7f..0
    pmaddubsw xmm0, xmm1  // argb_argb 16 bit, 2 pixels.
    pextrw eax, xmm2, 1  // get x0 integer. next iteration.
    pextrw edx, xmm2, 3  // get x1 integer. next iteration.
    psrlw xmm0, 7  // argb 8.7 fixed point to low 8 bits.
    packuswb xmm0, xmm0  // argb_argb 8 bits, 2 pixels.
    movq qword ptr [edi], xmm0
    lea edi, [edi + 8]
    sub ecx, 2  // 2 pixels
    jge xloop2

  xloop29:
    add ecx, 2 - 1
    jl xloop99

    // 1 pixel remainder
    psrlw xmm2, 9  // 7 bit fractions.
    movq xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
    pshufb xmm2, xmm5  // 00000000
    pshufb xmm0, xmm4  // arrange pixels into pairs
    pxor xmm2, xmm6  // 0..7f and 7f..0
    pmaddubsw xmm0, xmm2  // argb 16 bit, 1 pixel.
    psrlw xmm0, 7
    packuswb xmm0, xmm0  // argb 8 bits, 1 pixel.
    movd [edi], xmm0

  xloop99:
    pop edi
    pop esi
    ret
  }
}
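
// Scalar sketch of the ARGB bilinear blend (editor's illustration). The asm
// XORs the 7-bit fraction with 0x7f but, unlike ScaleFilterCols_SSSE3, adds
// no +1 or rounding term, so the effective weights are (127 - f) and f with
// truncation.
static void ScaleARGBFilterCols_Sketch(uint8_t* dst_argb,
                                       const uint8_t* src_argb,
                                       int dst_width,
                                       int x,
                                       int dx) {
  int i, c;
  for (i = 0; i < dst_width; ++i) {
    int xi = x >> 16;
    int f = (x >> 9) & 0x7f;
    const uint8_t* p = src_argb + xi * 4;
    for (c = 0; c < 4; ++c) {  // blend b, g, r, a independently
      dst_argb[4 * i + c] = (uint8_t)((p[c] * (127 - f) + p[4 + c] * f) >> 7);
    }
    x += dx;
  }
}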

// Reads 4 pixels, duplicates them and writes 8 pixels.
__declspec(naked) void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb,
                                             const uint8_t* src_argb,
                                             int dst_width,
                                             int x,
                                             int dx) {
  __asm {
    mov edx, [esp + 4]  // dst_argb
    mov eax, [esp + 8]  // src_argb
    mov ecx, [esp + 12]  // dst_width

  wloop:
    movdqu xmm0, [eax]
    lea eax, [eax + 16]
    movdqa xmm1, xmm0
    punpckldq xmm0, xmm0
    punpckhdq xmm1, xmm1
    movdqu [edx], xmm0
    movdqu [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 8
    jg wloop

    ret
  }
}

// Divide num by div and return as 16.16 fixed point result.
__declspec(naked) int FixedDiv_X86(int num, int div) {
  __asm {
    mov eax, [esp + 4]  // num
    cdq  // extend num to 64 bits
    shld edx, eax, 16  // 32.16
    shl eax, 16
    idiv dword ptr [esp + 8]
    ret
  }
}
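
// Portable equivalent (editor's sketch): the asm builds the 64-bit value
// num << 16 in edx:eax and divides by div.
static int FixedDiv_Sketch(int num, int div) {
  return (int)((((int64_t)num) << 16) / div);
}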

// Divide num by div and return as 16.16 fixed point result.
__declspec(naked) int FixedDiv1_X86(int num, int div) {
  __asm {
    mov eax, [esp + 4]  // num
    mov ecx, [esp + 8]  // div
    cdq  // extend num to 64 bits
    shld edx, eax, 16  // 32.16
    shl eax, 16
    sub eax, 0x00010001
    sbb edx, 0
    sub ecx, 1
    idiv ecx
    ret
  }
}
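
// Portable equivalent (editor's sketch), matching the bias in the asm:
// subtract 0x00010001 from the 16.16 numerator and 1 from the divisor.
static int FixedDiv1_Sketch(int num, int div) {
  return (int)(((((int64_t)num) << 16) - 0x00010001) / (div - 1));
}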

#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif