scale_win.cc

/*
 *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"
#include "libyuv/scale_row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for 32 bit Visual C x86 and clangcl
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)

// Offsets for source bytes 0 to 9
static const uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9,
    128, 128, 128, 128, 128, 128, 128, 128};

// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
static const uvec8 kShuf1 = {3, 4, 5, 7, 8, 9, 11, 12,
    128, 128, 128, 128, 128, 128, 128, 128};

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
static const uvec8 kShuf2 = {5, 7, 8, 9, 11, 12, 13, 15,
    128, 128, 128, 128, 128, 128, 128, 128};

// Offsets for source bytes 0 to 10
static const uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10};

// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
static const uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7,
    8, 9, 9, 10, 10, 11, 12, 13};

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
static const uvec8 kShuf21 = {5, 6, 6, 7, 8, 9, 9, 10,
    10, 11, 12, 13, 13, 14, 14, 15};

// Coefficients for source bytes 0 to 10
static const uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2};

// Coefficients for source bytes 10 to 21
static const uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1};

// Coefficients for source bytes 21 to 31
static const uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3};

// Rounding constant for the 3/4 box filters (weights sum to 4; add 2, then shift right by 2).
static const vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2};

static const uvec8 kShuf38a = {0, 3, 6, 8, 11, 14, 128, 128,
    128, 128, 128, 128, 128, 128, 128, 128};
static const uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0, 3,
    6, 8, 11, 14, 128, 128, 128, 128};

// Arrange words 0,3,6 into 0,1,2
static const uvec8 kShufAc = {0, 1, 6, 7, 12, 13, 128, 128,
    128, 128, 128, 128, 128, 128, 128, 128};

// Arrange words 0,3,6 into 3,4,5
static const uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0, 1,
    6, 7, 12, 13, 128, 128, 128, 128};

// Scaling values for boxes of 3x3 and 2x3
static const uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9,
    65536 / 9, 65536 / 6, 0, 0};

// Arrange first value for pixels 0,1,2,3,4,5
static const uvec8 kShufAb0 = {0, 128, 3, 128, 6, 128, 8, 128,
    11, 128, 14, 128, 128, 128, 128, 128};

// Arrange second value for pixels 0,1,2,3,4,5
static const uvec8 kShufAb1 = {1, 128, 4, 128, 7, 128, 9, 128,
    12, 128, 15, 128, 128, 128, 128, 128};

// Arrange third value for pixels 0,1,2,3,4,5
static const uvec8 kShufAb2 = {2, 128, 5, 128, 128, 128, 10, 128,
    13, 128, 128, 128, 128, 128, 128, 128};

// Scaling values for boxes of 3x2 and 2x2
static const uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3,
    65536 / 3, 65536 / 2, 0, 0};
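
// Editor's note: kScaleAc33 and kScaleAb2 hold 16 bit fixed point reciprocals.
// pmulhuw returns the high 16 bits of a 16x16 multiply, i.e. (a * b) >> 16,
// so multiplying a box sum by 65536 / 9 (or / 6, / 3, / 2) approximates a
// divide by 9, 6, 3 or 2 without an integer division. Because the constant is
// truncated, the result can come out one below the exact quotient, e.g.
// (27 * (65536 / 9)) >> 16 = (27 * 7281) >> 16 = 2 rather than 3.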

// Reads 32 pixels, throws half away and writes 16 pixels.
__declspec(naked) void ScaleRowDown2_SSSE3(const uint8_t* src_ptr,
    ptrdiff_t src_stride,
    uint8_t* dst_ptr,
    int dst_width) {
  __asm {
    mov eax, [esp + 4] // src_ptr
    // src_stride ignored
    mov edx, [esp + 12] // dst_ptr
    mov ecx, [esp + 16] // dst_width

  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    psrlw xmm0, 8 // isolate odd pixels.
    psrlw xmm1, 8
    packuswb xmm0, xmm1
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg wloop
    ret
  }
}
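
// Editor's note: an illustrative scalar equivalent of ScaleRowDown2_SSSE3,
// added for reference only and excluded from compilation. The function name
// is hypothetical and is not part of libyuv; src_stride is ignored just as in
// the assembly above.
#if 0
static void ScaleRowDown2_Reference(const uint8_t* src_ptr,
                                    uint8_t* dst_ptr,
                                    int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = src_ptr[x * 2 + 1];  // keep the odd pixel of each pair.
  }
}
#endif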

// Blends 32x1 rectangle to 16x1.
__declspec(naked) void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr,
    ptrdiff_t src_stride,
    uint8_t* dst_ptr,
    int dst_width) {
  __asm {
    mov eax, [esp + 4] // src_ptr
    // src_stride
    mov edx, [esp + 12] // dst_ptr
    mov ecx, [esp + 16] // dst_width
    pcmpeqb xmm4, xmm4 // constant 0x0101
    psrlw xmm4, 15
    packuswb xmm4, xmm4
    pxor xmm5, xmm5 // constant 0

  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    pmaddubsw xmm0, xmm4 // horizontal add
    pmaddubsw xmm1, xmm4
    pavgw xmm0, xmm5 // (x + 1) / 2
    pavgw xmm1, xmm5
    packuswb xmm0, xmm1
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg wloop
    ret
  }
}
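
// Editor's note: pmaddubsw with the 0x0101 byte pattern adds each pair of
// adjacent pixels into a 16 bit word (a + b, at most 510), and pavgw against
// zero then computes (a + b + 1) >> 1, the rounded average of the two pixels.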

// Blends 32x2 rectangle to 16x1.
__declspec(naked) void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr,
    ptrdiff_t src_stride,
    uint8_t* dst_ptr,
    int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] // src_ptr
    mov esi, [esp + 4 + 8] // src_stride
    mov edx, [esp + 4 + 12] // dst_ptr
    mov ecx, [esp + 4 + 16] // dst_width
    pcmpeqb xmm4, xmm4 // constant 0x0101
    psrlw xmm4, 15
    packuswb xmm4, xmm4
    pxor xmm5, xmm5 // constant 0

  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + esi]
    movdqu xmm3, [eax + esi + 16]
    lea eax, [eax + 32]
    pmaddubsw xmm0, xmm4 // horizontal add
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    paddw xmm0, xmm2 // vertical add
    paddw xmm1, xmm3
    psrlw xmm0, 1
    psrlw xmm1, 1
    pavgw xmm0, xmm5 // (x + 1) / 2
    pavgw xmm1, xmm5
    packuswb xmm0, xmm1
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg wloop
    pop esi
    ret
  }
}
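
// Editor's note: for the 2x2 box above, each word holds the sum of four
// pixels (0..1020). psrlw by 1 followed by pavgw against zero computes
// ((sum >> 1) + 1) >> 1, which equals (sum + 2) >> 2, the rounded average of
// the four pixels; e.g. sum = 10 gives ((10 >> 1) + 1) >> 1 = 3, matching
// (10 + 2) / 4.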

#ifdef HAS_SCALEROWDOWN2_AVX2

// Reads 64 pixels, throws half away and writes 32 pixels.
__declspec(naked) void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
    ptrdiff_t src_stride,
    uint8_t* dst_ptr,
    int dst_width) {
  __asm {
    mov eax, [esp + 4] // src_ptr
    // src_stride ignored
    mov edx, [esp + 12] // dst_ptr
    mov ecx, [esp + 16] // dst_width

  wloop:
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    lea eax, [eax + 64]
    vpsrlw ymm0, ymm0, 8 // isolate odd pixels.
    vpsrlw ymm1, ymm1, 8
    vpackuswb ymm0, ymm0, ymm1
    vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
    vmovdqu [edx], ymm0
    lea edx, [edx + 32]
    sub ecx, 32
    jg wloop
    vzeroupper
    ret
  }
}

// Blends 64x1 rectangle to 32x1.
__declspec(naked) void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr,
    ptrdiff_t src_stride,
    uint8_t* dst_ptr,
    int dst_width) {
  __asm {
    mov eax, [esp + 4] // src_ptr
    // src_stride
    mov edx, [esp + 12] // dst_ptr
    mov ecx, [esp + 16] // dst_width
    vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b
    vpsrlw ymm4, ymm4, 15
    vpackuswb ymm4, ymm4, ymm4
    vpxor ymm5, ymm5, ymm5 // constant 0

  wloop:
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    lea eax, [eax + 64]
    vpmaddubsw ymm0, ymm0, ymm4 // horizontal add
    vpmaddubsw ymm1, ymm1, ymm4
    vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2
    vpavgw ymm1, ymm1, ymm5
    vpackuswb ymm0, ymm0, ymm1
    vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
    vmovdqu [edx], ymm0
    lea edx, [edx + 32]
    sub ecx, 32
    jg wloop
    vzeroupper
    ret
  }
}

// For rounding, average = (sum + 2) / 4
// becomes average((sum >> 1), 0)
// Blends 64x2 rectangle to 32x1.
__declspec(naked) void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr,
    ptrdiff_t src_stride,
    uint8_t* dst_ptr,
    int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] // src_ptr
    mov esi, [esp + 4 + 8] // src_stride
    mov edx, [esp + 4 + 12] // dst_ptr
    mov ecx, [esp + 4 + 16] // dst_width
    vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b
    vpsrlw ymm4, ymm4, 15
    vpackuswb ymm4, ymm4, ymm4
    vpxor ymm5, ymm5, ymm5 // constant 0

  wloop:
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    vmovdqu ymm2, [eax + esi]
    vmovdqu ymm3, [eax + esi + 32]
    lea eax, [eax + 64]
    vpmaddubsw ymm0, ymm0, ymm4 // horizontal add
    vpmaddubsw ymm1, ymm1, ymm4
    vpmaddubsw ymm2, ymm2, ymm4
    vpmaddubsw ymm3, ymm3, ymm4
    vpaddw ymm0, ymm0, ymm2 // vertical add
    vpaddw ymm1, ymm1, ymm3
    vpsrlw ymm0, ymm0, 1 // (x + 2) / 4 = (x / 2 + 1) / 2
    vpsrlw ymm1, ymm1, 1
    vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2
    vpavgw ymm1, ymm1, ymm5
    vpackuswb ymm0, ymm0, ymm1
    vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
    vmovdqu [edx], ymm0
    lea edx, [edx + 32]
    sub ecx, 32
    jg wloop
    pop esi
    vzeroupper
    ret
  }
}
#endif // HAS_SCALEROWDOWN2_AVX2

// Point samples 32 pixels to 8 pixels.
__declspec(naked) void ScaleRowDown4_SSSE3(const uint8_t* src_ptr,
    ptrdiff_t src_stride,
    uint8_t* dst_ptr,
    int dst_width) {
  __asm {
    mov eax, [esp + 4] // src_ptr
    // src_stride ignored
    mov edx, [esp + 12] // dst_ptr
    mov ecx, [esp + 16] // dst_width
    pcmpeqb xmm5, xmm5 // generate mask 0x00ff0000
    psrld xmm5, 24
    pslld xmm5, 16

  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    pand xmm0, xmm5
    pand xmm1, xmm5
    packuswb xmm0, xmm1
    psrlw xmm0, 8
    packuswb xmm0, xmm0
    movq qword ptr [edx], xmm0
    lea edx, [edx + 8]
    sub ecx, 8
    jg wloop
    ret
  }
}

// Blends 32x4 rectangle to 8x1.
__declspec(naked) void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr,
    ptrdiff_t src_stride,
    uint8_t* dst_ptr,
    int dst_width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4] // src_ptr
    mov esi, [esp + 8 + 8] // src_stride
    mov edx, [esp + 8 + 12] // dst_ptr
    mov ecx, [esp + 8 + 16] // dst_width
    lea edi, [esi + esi * 2] // src_stride * 3
    pcmpeqb xmm4, xmm4 // constant 0x0101
    psrlw xmm4, 15
    movdqa xmm5, xmm4
    packuswb xmm4, xmm4
    psllw xmm5, 3 // constant 0x0008

  wloop:
    movdqu xmm0, [eax] // average rows
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + esi]
    movdqu xmm3, [eax + esi + 16]
    pmaddubsw xmm0, xmm4 // horizontal add
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    paddw xmm0, xmm2 // vertical add rows 0, 1
    paddw xmm1, xmm3
    movdqu xmm2, [eax + esi * 2]
    movdqu xmm3, [eax + esi * 2 + 16]
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    paddw xmm0, xmm2 // add row 2
    paddw xmm1, xmm3
    movdqu xmm2, [eax + edi]
    movdqu xmm3, [eax + edi + 16]
    lea eax, [eax + 32]
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    paddw xmm0, xmm2 // add row 3
    paddw xmm1, xmm3
    phaddw xmm0, xmm1
    paddw xmm0, xmm5 // + 8 for round
    psrlw xmm0, 4 // /16 for average of 4 * 4
    packuswb xmm0, xmm0
    movq qword ptr [edx], xmm0
    lea edx, [edx + 8]
    sub ecx, 8
    jg wloop
    pop edi
    pop esi
    ret
  }
}
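
// Editor's note: in the 4x4 box above, pmaddubsw sums horizontal pairs and
// the three paddw steps add four rows, so each word holds the sum of 8
// pixels; phaddw then merges adjacent words into sums of 16 pixels. Adding 8
// and shifting right by 4 computes (sum + 8) / 16, the rounded 4x4 average.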

#ifdef HAS_SCALEROWDOWN4_AVX2

// Point samples 64 pixels to 16 pixels.
__declspec(naked) void ScaleRowDown4_AVX2(const uint8_t* src_ptr,
    ptrdiff_t src_stride,
    uint8_t* dst_ptr,
    int dst_width) {
  __asm {
    mov eax, [esp + 4] // src_ptr
    // src_stride ignored
    mov edx, [esp + 12] // dst_ptr
    mov ecx, [esp + 16] // dst_width
    vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff0000
    vpsrld ymm5, ymm5, 24
    vpslld ymm5, ymm5, 16

  wloop:
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    lea eax, [eax + 64]
    vpand ymm0, ymm0, ymm5
    vpand ymm1, ymm1, ymm5
    vpackuswb ymm0, ymm0, ymm1
    vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
    vpsrlw ymm0, ymm0, 8
    vpackuswb ymm0, ymm0, ymm0
    vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
    vmovdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg wloop
    vzeroupper
    ret
  }
}

// Blends 64x4 rectangle to 16x1.
__declspec(naked) void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr,
    ptrdiff_t src_stride,
    uint8_t* dst_ptr,
    int dst_width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4] // src_ptr
    mov esi, [esp + 8 + 8] // src_stride
    mov edx, [esp + 8 + 12] // dst_ptr
    mov ecx, [esp + 8 + 16] // dst_width
    lea edi, [esi + esi * 2] // src_stride * 3
    vpcmpeqb ymm4, ymm4, ymm4 // constant 0x0101
    vpsrlw ymm4, ymm4, 15
    vpsllw ymm5, ymm4, 3 // constant 0x0008
    vpackuswb ymm4, ymm4, ymm4

  wloop:
    vmovdqu ymm0, [eax] // average rows
    vmovdqu ymm1, [eax + 32]
    vmovdqu ymm2, [eax + esi]
    vmovdqu ymm3, [eax + esi + 32]
    vpmaddubsw ymm0, ymm0, ymm4 // horizontal add
    vpmaddubsw ymm1, ymm1, ymm4
    vpmaddubsw ymm2, ymm2, ymm4
    vpmaddubsw ymm3, ymm3, ymm4
    vpaddw ymm0, ymm0, ymm2 // vertical add rows 0, 1
    vpaddw ymm1, ymm1, ymm3
    vmovdqu ymm2, [eax + esi * 2]
    vmovdqu ymm3, [eax + esi * 2 + 32]
    vpmaddubsw ymm2, ymm2, ymm4
    vpmaddubsw ymm3, ymm3, ymm4
    vpaddw ymm0, ymm0, ymm2 // add row 2
    vpaddw ymm1, ymm1, ymm3
    vmovdqu ymm2, [eax + edi]
    vmovdqu ymm3, [eax + edi + 32]
    lea eax, [eax + 64]
    vpmaddubsw ymm2, ymm2, ymm4
    vpmaddubsw ymm3, ymm3, ymm4
    vpaddw ymm0, ymm0, ymm2 // add row 3
    vpaddw ymm1, ymm1, ymm3
    vphaddw ymm0, ymm0, ymm1 // mutates
    vpermq ymm0, ymm0, 0xd8 // unmutate vphaddw
    vpaddw ymm0, ymm0, ymm5 // + 8 for round
    vpsrlw ymm0, ymm0, 4 // /16 for average of 4 * 4
    vpackuswb ymm0, ymm0, ymm0
    vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
    vmovdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg wloop
    pop edi
    pop esi
    vzeroupper
    ret
  }
}
#endif // HAS_SCALEROWDOWN4_AVX2

// Point samples 32 pixels to 24 pixels.
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.
__declspec(naked) void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
    ptrdiff_t src_stride,
    uint8_t* dst_ptr,
    int dst_width) {
  __asm {
    mov eax, [esp + 4] // src_ptr
    // src_stride ignored
    mov edx, [esp + 12] // dst_ptr
    mov ecx, [esp + 16] // dst_width
    movdqa xmm3, xmmword ptr kShuf0
    movdqa xmm4, xmmword ptr kShuf1
    movdqa xmm5, xmmword ptr kShuf2

  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    movdqa xmm2, xmm1
    palignr xmm1, xmm0, 8
    pshufb xmm0, xmm3
    pshufb xmm1, xmm4
    pshufb xmm2, xmm5
    movq qword ptr [edx], xmm0
    movq qword ptr [edx + 8], xmm1
    movq qword ptr [edx + 16], xmm2
    lea edx, [edx + 24]
    sub ecx, 24
    jg wloop
    ret
  }
}

// Blends 32x2 rectangle to 24x1
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.
// Register usage:
// xmm0 src_row 0
// xmm1 src_row 1
// xmm2 shuf 0
// xmm3 shuf 1
// xmm4 shuf 2
// xmm5 madd 0
// xmm6 madd 1
// xmm7 kRound34
// Note that movdqa+palign may be better than movdqu.
__declspec(naked) void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
    ptrdiff_t src_stride,
    uint8_t* dst_ptr,
    int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] // src_ptr
    mov esi, [esp + 4 + 8] // src_stride
    mov edx, [esp + 4 + 12] // dst_ptr
    mov ecx, [esp + 4 + 16] // dst_width
    movdqa xmm2, xmmword ptr kShuf01
    movdqa xmm3, xmmword ptr kShuf11
    movdqa xmm4, xmmword ptr kShuf21
    movdqa xmm5, xmmword ptr kMadd01
    movdqa xmm6, xmmword ptr kMadd11
    movdqa xmm7, xmmword ptr kRound34

  wloop:
    movdqu xmm0, [eax] // pixels 0..7
    movdqu xmm1, [eax + esi]
    pavgb xmm0, xmm1
    pshufb xmm0, xmm2
    pmaddubsw xmm0, xmm5
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edx], xmm0
    movdqu xmm0, [eax + 8] // pixels 8..15
    movdqu xmm1, [eax + esi + 8]
    pavgb xmm0, xmm1
    pshufb xmm0, xmm3
    pmaddubsw xmm0, xmm6
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edx + 8], xmm0
    movdqu xmm0, [eax + 16] // pixels 16..23
    movdqu xmm1, [eax + esi + 16]
    lea eax, [eax + 32]
    pavgb xmm0, xmm1
    pshufb xmm0, xmm4
    movdqa xmm1, xmmword ptr kMadd21
    pmaddubsw xmm0, xmm1
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edx + 16], xmm0
    lea edx, [edx + 24]
    sub ecx, 24
    jg wloop
    pop esi
    ret
  }
}

// Note that movdqa+palign may be better than movdqu.
__declspec(naked) void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
    ptrdiff_t src_stride,
    uint8_t* dst_ptr,
    int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] // src_ptr
    mov esi, [esp + 4 + 8] // src_stride
    mov edx, [esp + 4 + 12] // dst_ptr
    mov ecx, [esp + 4 + 16] // dst_width
    movdqa xmm2, xmmword ptr kShuf01
    movdqa xmm3, xmmword ptr kShuf11
    movdqa xmm4, xmmword ptr kShuf21
    movdqa xmm5, xmmword ptr kMadd01
    movdqa xmm6, xmmword ptr kMadd11
    movdqa xmm7, xmmword ptr kRound34

  wloop:
    movdqu xmm0, [eax] // pixels 0..7
    movdqu xmm1, [eax + esi]
    pavgb xmm1, xmm0
    pavgb xmm0, xmm1
    pshufb xmm0, xmm2
    pmaddubsw xmm0, xmm5
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edx], xmm0
    movdqu xmm0, [eax + 8] // pixels 8..15
    movdqu xmm1, [eax + esi + 8]
    pavgb xmm1, xmm0
    pavgb xmm0, xmm1
    pshufb xmm0, xmm3
    pmaddubsw xmm0, xmm6
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edx + 8], xmm0
    movdqu xmm0, [eax + 16] // pixels 16..23
    movdqu xmm1, [eax + esi + 16]
    lea eax, [eax + 32]
    pavgb xmm1, xmm0
    pavgb xmm0, xmm1
    pshufb xmm0, xmm4
    movdqa xmm1, xmmword ptr kMadd21
    pmaddubsw xmm0, xmm1
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edx + 16], xmm0
    lea edx, [edx + 24]
    sub ecx, 24
    jg wloop
    pop esi
    ret
  }
}
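
// Editor's note: the two 3/4 box kernels above weight each pair of source
// pixels with the kMadd coefficients (3,1), (2,2) or (1,3), add kRound34
// (= 2) and shift right by 2, i.e. out = (3 * a + b + 2) / 4 and so on.
// The _1_Box variant first averages the two source rows equally with pavgb,
// while the _0_Box variant applies pavgb twice, weighting the rows roughly
// 3:1 in favour of the first row.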

// 3/8 point sampler
// Scale 32 pixels to 12
__declspec(naked) void ScaleRowDown38_SSSE3(const uint8_t* src_ptr,
    ptrdiff_t src_stride,
    uint8_t* dst_ptr,
    int dst_width) {
  __asm {
    mov eax, [esp + 4] // src_ptr
    // src_stride ignored
    mov edx, [esp + 12] // dst_ptr
    mov ecx, [esp + 16] // dst_width
    movdqa xmm4, xmmword ptr kShuf38a
    movdqa xmm5, xmmword ptr kShuf38b

  xloop:
    movdqu xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5
    movdqu xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11
    lea eax, [eax + 32]
    pshufb xmm0, xmm4
    pshufb xmm1, xmm5
    paddusb xmm0, xmm1
    movq qword ptr [edx], xmm0 // write 12 pixels
    movhlps xmm1, xmm0
    movd [edx + 8], xmm1
    lea edx, [edx + 12]
    sub ecx, 12
    jg xloop
    ret
  }
}

// Scale 16x3 pixels to 6x1 with interpolation
__declspec(naked) void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
    ptrdiff_t src_stride,
    uint8_t* dst_ptr,
    int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] // src_ptr
    mov esi, [esp + 4 + 8] // src_stride
    mov edx, [esp + 4 + 12] // dst_ptr
    mov ecx, [esp + 4 + 16] // dst_width
    movdqa xmm2, xmmword ptr kShufAc
    movdqa xmm3, xmmword ptr kShufAc3
    movdqa xmm4, xmmword ptr kScaleAc33
    pxor xmm5, xmm5

  xloop:
    movdqu xmm0, [eax] // sum up 3 rows into xmm0/1
    movdqu xmm6, [eax + esi]
    movhlps xmm1, xmm0
    movhlps xmm7, xmm6
    punpcklbw xmm0, xmm5
    punpcklbw xmm1, xmm5
    punpcklbw xmm6, xmm5
    punpcklbw xmm7, xmm5
    paddusw xmm0, xmm6
    paddusw xmm1, xmm7
    movdqu xmm6, [eax + esi * 2]
    lea eax, [eax + 16]
    movhlps xmm7, xmm6
    punpcklbw xmm6, xmm5
    punpcklbw xmm7, xmm5
    paddusw xmm0, xmm6
    paddusw xmm1, xmm7
    movdqa xmm6, xmm0 // 8 pixels -> 0,1,2 of xmm6
    psrldq xmm0, 2
    paddusw xmm6, xmm0
    psrldq xmm0, 2
    paddusw xmm6, xmm0
    pshufb xmm6, xmm2
    movdqa xmm7, xmm1 // 8 pixels -> 3,4,5 of xmm6
    psrldq xmm1, 2
    paddusw xmm7, xmm1
    psrldq xmm1, 2
    paddusw xmm7, xmm1
    pshufb xmm7, xmm3
    paddusw xmm6, xmm7
    pmulhuw xmm6, xmm4 // divide by 9,9,6, 9,9,6
    packuswb xmm6, xmm6
    movd [edx], xmm6 // write 6 pixels
    psrlq xmm6, 16
    movd [edx + 2], xmm6
    lea edx, [edx + 6]
    sub ecx, 6
    jg xloop
    pop esi
    ret
  }
}

// Scale 16x2 pixels to 6x1 with interpolation
__declspec(naked) void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
    ptrdiff_t src_stride,
    uint8_t* dst_ptr,
    int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] // src_ptr
    mov esi, [esp + 4 + 8] // src_stride
    mov edx, [esp + 4 + 12] // dst_ptr
    mov ecx, [esp + 4 + 16] // dst_width
    movdqa xmm2, xmmword ptr kShufAb0
    movdqa xmm3, xmmword ptr kShufAb1
    movdqa xmm4, xmmword ptr kShufAb2
    movdqa xmm5, xmmword ptr kScaleAb2

  xloop:
    movdqu xmm0, [eax] // average 2 rows into xmm0
    movdqu xmm1, [eax + esi]
    lea eax, [eax + 16]
    pavgb xmm0, xmm1
    movdqa xmm1, xmm0 // 16 pixels -> 0,1,2,3,4,5 of xmm1
    pshufb xmm1, xmm2
    movdqa xmm6, xmm0
    pshufb xmm6, xmm3
    paddusw xmm1, xmm6
    pshufb xmm0, xmm4
    paddusw xmm1, xmm0
    pmulhuw xmm1, xmm5 // divide by 3,3,2, 3,3,2
    packuswb xmm1, xmm1
    movd [edx], xmm1 // write 6 pixels
    psrlq xmm1, 16
    movd [edx + 2], xmm1
    lea edx, [edx + 6]
    sub ecx, 6
    jg xloop
    pop esi
    ret
  }
}

// Reads 16 bytes and accumulates to 16 shorts at a time.
__declspec(naked) void ScaleAddRow_SSE2(const uint8_t* src_ptr,
    uint16_t* dst_ptr,
    int src_width) {
  __asm {
    mov eax, [esp + 4] // src_ptr
    mov edx, [esp + 8] // dst_ptr
    mov ecx, [esp + 12] // src_width
    pxor xmm5, xmm5

    // sum rows
  xloop:
    movdqu xmm3, [eax] // read 16 bytes
    lea eax, [eax + 16]
    movdqu xmm0, [edx] // read 16 words from destination
    movdqu xmm1, [edx + 16]
    movdqa xmm2, xmm3
    punpcklbw xmm2, xmm5
    punpckhbw xmm3, xmm5
    paddusw xmm0, xmm2 // sum 16 words
    paddusw xmm1, xmm3
    movdqu [edx], xmm0 // write 16 words to destination
    movdqu [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 16
    jg xloop
    ret
  }
}
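
// Editor's note: an illustrative scalar equivalent of ScaleAddRow_SSE2, added
// for reference only and excluded from compilation. The function name is
// hypothetical; the plain += ignores the unsigned saturation that paddusw
// provides.
#if 0
static void ScaleAddRow_Reference(const uint8_t* src_ptr,
                                  uint16_t* dst_ptr,
                                  int src_width) {
  int x;
  for (x = 0; x < src_width; ++x) {
    dst_ptr[x] += src_ptr[x];  // accumulate one source row into 16 bit sums.
  }
}
#endif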

#ifdef HAS_SCALEADDROW_AVX2
// Reads 32 bytes and accumulates to 32 shorts at a time.
__declspec(naked) void ScaleAddRow_AVX2(const uint8_t* src_ptr,
    uint16_t* dst_ptr,
    int src_width) {
  __asm {
    mov eax, [esp + 4] // src_ptr
    mov edx, [esp + 8] // dst_ptr
    mov ecx, [esp + 12] // src_width
    vpxor ymm5, ymm5, ymm5

    // sum rows
  xloop:
    vmovdqu ymm3, [eax] // read 32 bytes
    lea eax, [eax + 32]
    vpermq ymm3, ymm3, 0xd8 // unmutate for vpunpck
    vpunpcklbw ymm2, ymm3, ymm5
    vpunpckhbw ymm3, ymm3, ymm5
    vpaddusw ymm0, ymm2, [edx] // sum 16 words
    vpaddusw ymm1, ymm3, [edx + 32]
    vmovdqu [edx], ymm0 // write 32 words to destination
    vmovdqu [edx + 32], ymm1
    lea edx, [edx + 64]
    sub ecx, 32
    jg xloop
    vzeroupper
    ret
  }
}
#endif // HAS_SCALEADDROW_AVX2

// Constant for making pixels signed to avoid pmaddubsw
// saturation.
static const uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};

// Constant for making pixels unsigned and adding .5 for rounding.
static const uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040,
    0x4040, 0x4040, 0x4040, 0x4040};
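
// Editor's note: pmaddubsw treats one operand as unsigned and the other as
// signed and saturates to 16 bits. Subtracting 0x80 from each pixel (kFsub80)
// keeps the products in signed range; since the two 7 bit fractions for a
// pixel pair sum to 128, this biases the weighted sum down by 128 * 128 =
// 16384. Adding kFadd40 (0x4040 = 16384 + 64) removes that bias and adds 64,
// which is 0.5 in the 8.7 fixed point result, so the final psrlw by 7 rounds
// to nearest.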

// Bilinear column filtering. SSSE3 version.
__declspec(naked) void ScaleFilterCols_SSSE3(uint8_t* dst_ptr,
    const uint8_t* src_ptr,
    int dst_width,
    int x,
    int dx) {
  __asm {
    push ebx
    push esi
    push edi
    mov edi, [esp + 12 + 4] // dst_ptr
    mov esi, [esp + 12 + 8] // src_ptr
    mov ecx, [esp + 12 + 12] // dst_width
    movd xmm2, [esp + 12 + 16] // x
    movd xmm3, [esp + 12 + 20] // dx
    mov eax, 0x04040000 // shuffle to line up fractions with pixel.
    movd xmm5, eax
    pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction.
    psrlw xmm6, 9
    pcmpeqb xmm7, xmm7 // generate 0x0001
    psrlw xmm7, 15
    pextrw eax, xmm2, 1 // get x0 integer. preroll
    sub ecx, 2
    jl xloop29
    movdqa xmm0, xmm2 // x1 = x0 + dx
    paddd xmm0, xmm3
    punpckldq xmm2, xmm0 // x0 x1
    punpckldq xmm3, xmm3 // dx dx
    paddd xmm3, xmm3 // dx * 2, dx * 2
    pextrw edx, xmm2, 3 // get x1 integer. preroll

    // 2 Pixel loop.
  xloop2:
    movdqa xmm1, xmm2 // x0, x1 fractions.
    paddd xmm2, xmm3 // x += dx
    movzx ebx, word ptr [esi + eax] // 2 source x0 pixels
    movd xmm0, ebx
    psrlw xmm1, 9 // 7 bit fractions.
    movzx ebx, word ptr [esi + edx] // 2 source x1 pixels
    movd xmm4, ebx
    pshufb xmm1, xmm5 // 0011
    punpcklwd xmm0, xmm4
    psubb xmm0, xmmword ptr kFsub80 // make pixels signed.
    pxor xmm1, xmm6 // 0..7f and 7f..0
    paddusb xmm1, xmm7 // +1 so 0..7f and 80..1
    pmaddubsw xmm1, xmm0 // 16 bit, 2 pixels.
    pextrw eax, xmm2, 1 // get x0 integer. next iteration.
    pextrw edx, xmm2, 3 // get x1 integer. next iteration.
    paddw xmm1, xmmword ptr kFadd40 // make pixels unsigned and round.
    psrlw xmm1, 7 // 8.7 fixed point to low 8 bits.
    packuswb xmm1, xmm1 // 8 bits, 2 pixels.
    movd ebx, xmm1
    mov [edi], bx
    lea edi, [edi + 2]
    sub ecx, 2 // 2 pixels
    jge xloop2

  xloop29:
    add ecx, 2 - 1
    jl xloop99

    // 1 pixel remainder
    movzx ebx, word ptr [esi + eax] // 2 source x0 pixels
    movd xmm0, ebx
    psrlw xmm2, 9 // 7 bit fractions.
    pshufb xmm2, xmm5 // 0011
    psubb xmm0, xmmword ptr kFsub80 // make pixels signed.
    pxor xmm2, xmm6 // 0..7f and 7f..0
    paddusb xmm2, xmm7 // +1 so 0..7f and 80..1
    pmaddubsw xmm2, xmm0 // 16 bit
    paddw xmm2, xmmword ptr kFadd40 // make pixels unsigned and round.
    psrlw xmm2, 7 // 8.7 fixed point to low 8 bits.
    packuswb xmm2, xmm2 // 8 bits
    movd ebx, xmm2
    mov [edi], bl

  xloop99:
    pop edi
    pop esi
    pop ebx
    ret
  }
}
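
// Editor's note: an illustrative scalar equivalent of ScaleFilterCols_SSSE3,
// added for reference only and excluded from compilation. x and dx are 16.16
// fixed point and the kernel keeps the top 7 bits of the fraction. The
// function name is hypothetical.
#if 0
static void ScaleFilterCols_Reference(uint8_t* dst_ptr,
                                      const uint8_t* src_ptr,
                                      int dst_width,
                                      int x,
                                      int dx) {
  int j;
  for (j = 0; j < dst_width; ++j) {
    int xi = x >> 16;         // integer source position.
    int f = (x >> 9) & 0x7f;  // 7 bit fraction.
    dst_ptr[j] = (uint8_t)(
        (src_ptr[xi] * (128 - f) + src_ptr[xi + 1] * f + 64) >> 7);
    x += dx;
  }
}
#endif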

// Reads 16 pixels, duplicates them and writes 32 pixels.
__declspec(naked) void ScaleColsUp2_SSE2(uint8_t* dst_ptr,
    const uint8_t* src_ptr,
    int dst_width,
    int x,
    int dx) {
  __asm {
    mov edx, [esp + 4] // dst_ptr
    mov eax, [esp + 8] // src_ptr
    mov ecx, [esp + 12] // dst_width

  wloop:
    movdqu xmm0, [eax]
    lea eax, [eax + 16]
    movdqa xmm1, xmm0
    punpcklbw xmm0, xmm0
    punpckhbw xmm1, xmm1
    movdqu [edx], xmm0
    movdqu [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 32
    jg wloop
    ret
  }
}

// Reads 8 pixels, throws half away and writes 4 pixels, keeping the odd
// pixel of each pair (1, 3, 5, 7).
__declspec(naked) void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
    ptrdiff_t src_stride,
    uint8_t* dst_argb,
    int dst_width) {
  __asm {
    mov eax, [esp + 4] // src_argb
    // src_stride ignored
    mov edx, [esp + 12] // dst_argb
    mov ecx, [esp + 16] // dst_width

  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    shufps xmm0, xmm1, 0xdd
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jg wloop
    ret
  }
}

// Blends 8x1 rectangle to 4x1.
__declspec(naked) void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
    ptrdiff_t src_stride,
    uint8_t* dst_argb,
    int dst_width) {
  __asm {
    mov eax, [esp + 4] // src_argb
    // src_stride ignored
    mov edx, [esp + 12] // dst_argb
    mov ecx, [esp + 16] // dst_width

  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    movdqa xmm2, xmm0
    shufps xmm0, xmm1, 0x88 // even pixels
    shufps xmm2, xmm1, 0xdd // odd pixels
    pavgb xmm0, xmm2
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jg wloop
    ret
  }
}
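
// Editor's note: each ARGB pixel is one 32 bit dword, so shufps with
// immediate 0x88 gathers dwords 0 and 2 of each source register (the even
// pixels) while 0xdd gathers dwords 1 and 3 (the odd pixels); pavgb then
// averages the two selections into the 2x1 blend.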

// Blends 8x2 rectangle to 4x1.
__declspec(naked) void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb,
    ptrdiff_t src_stride,
    uint8_t* dst_argb,
    int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] // src_argb
    mov esi, [esp + 4 + 8] // src_stride
    mov edx, [esp + 4 + 12] // dst_argb
    mov ecx, [esp + 4 + 16] // dst_width

  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + esi]
    movdqu xmm3, [eax + esi + 16]
    lea eax, [eax + 32]
    pavgb xmm0, xmm2 // average rows
    pavgb xmm1, xmm3
    movdqa xmm2, xmm0 // average columns (8 to 4 pixels)
    shufps xmm0, xmm1, 0x88 // even pixels
    shufps xmm2, xmm1, 0xdd // odd pixels
    pavgb xmm0, xmm2
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jg wloop
    pop esi
    ret
  }
}

// Reads 4 pixels at a time.
__declspec(naked) void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb,
    ptrdiff_t src_stride,
    int src_stepx,
    uint8_t* dst_argb,
    int dst_width) {
  __asm {
    push ebx
    push edi
    mov eax, [esp + 8 + 4] // src_argb
    // src_stride ignored
    mov ebx, [esp + 8 + 12] // src_stepx
    mov edx, [esp + 8 + 16] // dst_argb
    mov ecx, [esp + 8 + 20] // dst_width
    lea ebx, [ebx * 4]
    lea edi, [ebx + ebx * 2]

  wloop:
    movd xmm0, [eax]
    movd xmm1, [eax + ebx]
    punpckldq xmm0, xmm1
    movd xmm2, [eax + ebx * 2]
    movd xmm3, [eax + edi]
    lea eax, [eax + ebx * 4]
    punpckldq xmm2, xmm3
    punpcklqdq xmm0, xmm2
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jg wloop
    pop edi
    pop ebx
    ret
  }
}

// Blends four 2x2 to 4x1.
__declspec(naked) void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb,
    ptrdiff_t src_stride,
    int src_stepx,
    uint8_t* dst_argb,
    int dst_width) {
  __asm {
    push ebx
    push esi
    push edi
    mov eax, [esp + 12 + 4] // src_argb
    mov esi, [esp + 12 + 8] // src_stride
    mov ebx, [esp + 12 + 12] // src_stepx
    mov edx, [esp + 12 + 16] // dst_argb
    mov ecx, [esp + 12 + 20] // dst_width
    lea esi, [eax + esi] // row1 pointer
    lea ebx, [ebx * 4]
    lea edi, [ebx + ebx * 2]

  wloop:
    movq xmm0, qword ptr [eax] // row0 4 pairs
    movhps xmm0, qword ptr [eax + ebx]
    movq xmm1, qword ptr [eax + ebx * 2]
    movhps xmm1, qword ptr [eax + edi]
    lea eax, [eax + ebx * 4]
    movq xmm2, qword ptr [esi] // row1 4 pairs
    movhps xmm2, qword ptr [esi + ebx]
    movq xmm3, qword ptr [esi + ebx * 2]
    movhps xmm3, qword ptr [esi + edi]
    lea esi, [esi + ebx * 4]
    pavgb xmm0, xmm2 // average rows
    pavgb xmm1, xmm3
    movdqa xmm2, xmm0 // average columns (8 to 4 pixels)
    shufps xmm0, xmm1, 0x88 // even pixels
    shufps xmm2, xmm1, 0xdd // odd pixels
    pavgb xmm0, xmm2
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jg wloop
    pop edi
    pop esi
    pop ebx
    ret
  }
}

// Column scaling unfiltered. SSE2 version.
__declspec(naked) void ScaleARGBCols_SSE2(uint8_t* dst_argb,
    const uint8_t* src_argb,
    int dst_width,
    int x,
    int dx) {
  __asm {
    push edi
    push esi
    mov edi, [esp + 8 + 4] // dst_argb
    mov esi, [esp + 8 + 8] // src_argb
    mov ecx, [esp + 8 + 12] // dst_width
    movd xmm2, [esp + 8 + 16] // x
    movd xmm3, [esp + 8 + 20] // dx
    pshufd xmm2, xmm2, 0 // x0 x0 x0 x0
    pshufd xmm0, xmm3, 0x11 // dx 0 dx 0
    paddd xmm2, xmm0
    paddd xmm3, xmm3 // 0, 0, 0, dx * 2
    pshufd xmm0, xmm3, 0x05 // dx * 2, dx * 2, 0, 0
    paddd xmm2, xmm0 // x3 x2 x1 x0
    paddd xmm3, xmm3 // 0, 0, 0, dx * 4
    pshufd xmm3, xmm3, 0 // dx * 4, dx * 4, dx * 4, dx * 4
    pextrw eax, xmm2, 1 // get x0 integer.
    pextrw edx, xmm2, 3 // get x1 integer.
    cmp ecx, 0
    jle xloop99
    sub ecx, 4
    jl xloop49

    // 4 Pixel loop.
  xloop4:
    movd xmm0, [esi + eax * 4] // 1 source x0 pixels
    movd xmm1, [esi + edx * 4] // 1 source x1 pixels
    pextrw eax, xmm2, 5 // get x2 integer.
    pextrw edx, xmm2, 7 // get x3 integer.
    paddd xmm2, xmm3 // x += dx
    punpckldq xmm0, xmm1 // x0 x1
    movd xmm1, [esi + eax * 4] // 1 source x2 pixels
    movd xmm4, [esi + edx * 4] // 1 source x3 pixels
    pextrw eax, xmm2, 1 // get x0 integer. next iteration.
    pextrw edx, xmm2, 3 // get x1 integer. next iteration.
    punpckldq xmm1, xmm4 // x2 x3
    punpcklqdq xmm0, xmm1 // x0 x1 x2 x3
    movdqu [edi], xmm0
    lea edi, [edi + 16]
    sub ecx, 4 // 4 pixels
    jge xloop4

  xloop49:
    test ecx, 2
    je xloop29

    // 2 Pixels.
    movd xmm0, [esi + eax * 4] // 1 source x0 pixels
    movd xmm1, [esi + edx * 4] // 1 source x1 pixels
    pextrw eax, xmm2, 5 // get x2 integer.
    punpckldq xmm0, xmm1 // x0 x1
    movq qword ptr [edi], xmm0
    lea edi, [edi + 8]

  xloop29:
    test ecx, 1
    je xloop99

    // 1 Pixels.
    movd xmm0, [esi + eax * 4] // 1 source x2 pixels
    movd dword ptr [edi], xmm0

  xloop99:
    pop esi
    pop edi
    ret
  }
}

// Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version.
// TODO(fbarchard): Port to Neon

// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
static const uvec8 kShuffleColARGB = {
    0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel
    8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel
};

// Shuffle table for duplicating 2 fractions into 8 bytes each
static const uvec8 kShuffleFractions = {
    0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
};

__declspec(naked) void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
    const uint8_t* src_argb,
    int dst_width,
    int x,
    int dx) {
  __asm {
    push esi
    push edi
    mov edi, [esp + 8 + 4] // dst_argb
    mov esi, [esp + 8 + 8] // src_argb
    mov ecx, [esp + 8 + 12] // dst_width
    movd xmm2, [esp + 8 + 16] // x
    movd xmm3, [esp + 8 + 20] // dx
    movdqa xmm4, xmmword ptr kShuffleColARGB
    movdqa xmm5, xmmword ptr kShuffleFractions
    pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction.
    psrlw xmm6, 9
    pextrw eax, xmm2, 1 // get x0 integer. preroll
    sub ecx, 2
    jl xloop29
    movdqa xmm0, xmm2 // x1 = x0 + dx
    paddd xmm0, xmm3
    punpckldq xmm2, xmm0 // x0 x1
    punpckldq xmm3, xmm3 // dx dx
    paddd xmm3, xmm3 // dx * 2, dx * 2
    pextrw edx, xmm2, 3 // get x1 integer. preroll

    // 2 Pixel loop.
  xloop2:
    movdqa xmm1, xmm2 // x0, x1 fractions.
    paddd xmm2, xmm3 // x += dx
    movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels
    psrlw xmm1, 9 // 7 bit fractions.
    movhps xmm0, qword ptr [esi + edx * 4] // 2 source x1 pixels
    pshufb xmm1, xmm5 // 0000000011111111
    pshufb xmm0, xmm4 // arrange pixels into pairs
    pxor xmm1, xmm6 // 0..7f and 7f..0
    pmaddubsw xmm0, xmm1 // argb_argb 16 bit, 2 pixels.
    pextrw eax, xmm2, 1 // get x0 integer. next iteration.
    pextrw edx, xmm2, 3 // get x1 integer. next iteration.
    psrlw xmm0, 7 // argb 8.7 fixed point to low 8 bits.
    packuswb xmm0, xmm0 // argb_argb 8 bits, 2 pixels.
    movq qword ptr [edi], xmm0
    lea edi, [edi + 8]
    sub ecx, 2 // 2 pixels
    jge xloop2

  xloop29:
    add ecx, 2 - 1
    jl xloop99

    // 1 pixel remainder
    psrlw xmm2, 9 // 7 bit fractions.
    movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels
    pshufb xmm2, xmm5 // 00000000
    pshufb xmm0, xmm4 // arrange pixels into pairs
    pxor xmm2, xmm6 // 0..7f and 7f..0
    pmaddubsw xmm0, xmm2 // argb 16 bit, 1 pixel.
    psrlw xmm0, 7
    packuswb xmm0, xmm0 // argb 8 bits, 1 pixel.
    movd [edi], xmm0

  xloop99:
    pop edi
    pop esi
    ret
  }
}

// Reads 4 pixels, duplicates them and writes 8 pixels.
__declspec(naked) void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb,
    const uint8_t* src_argb,
    int dst_width,
    int x,
    int dx) {
  __asm {
    mov edx, [esp + 4] // dst_argb
    mov eax, [esp + 8] // src_argb
    mov ecx, [esp + 12] // dst_width

  wloop:
    movdqu xmm0, [eax]
    lea eax, [eax + 16]
    movdqa xmm1, xmm0
    punpckldq xmm0, xmm0
    punpckhdq xmm1, xmm1
    movdqu [edx], xmm0
    movdqu [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 8
    jg wloop
    ret
  }
}

// Divide num by div and return as 16.16 fixed point result.
__declspec(naked) int FixedDiv_X86(int num, int div) {
  __asm {
    mov eax, [esp + 4] // num
    cdq // extend num to 64 bits
    shld edx, eax, 16 // 32.16
    shl eax, 16
    idiv dword ptr [esp + 8]
    ret
  }
}
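
// Editor's note: an illustrative C equivalent of FixedDiv_X86, added for
// reference only and excluded from compilation. shld/shl build the 64 bit
// value (num << 16) in edx:eax before the signed divide. The function name
// is hypothetical.
#if 0
static int FixedDiv_Reference(int num, int div) {
  return (int)((((int64_t)num) << 16) / div);  // 16.16 fixed point quotient.
}
#endif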

// Divide num by div and return as 16.16 fixed point result.
__declspec(naked) int FixedDiv1_X86(int num, int div) {
  __asm {
    mov eax, [esp + 4] // num
    mov ecx, [esp + 8] // denom
    cdq // extend num to 64 bits
    shld edx, eax, 16 // 32.16
    shl eax, 16
    sub eax, 0x00010001
    sbb edx, 0
    sub ecx, 1
    idiv ecx
    ret
  }
}
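
// Editor's note: with the subtractions above, FixedDiv1_X86 computes
// (((num - 1) << 16) - 1) / (div - 1) in 16.16 fixed point, i.e. numerator
// and denominator are each reduced by one before dividing.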

#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)

#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif