scale_mmi.cc 56 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168
  1. /*
  2. * Copyright 2013 The LibYuv Project Authors. All rights reserved.
  3. *
  4. * Use of this source code is governed by a BSD-style license
  5. * that can be found in the LICENSE file in the root of the source
  6. * tree. An additional intellectual property rights grant can be found
  7. * in the file PATENTS. All contributing project authors may
  8. * be found in the AUTHORS file in the root of the source tree.
  9. */
  10. #include "libyuv/scale.h"
  11. #include <assert.h>
  12. #include <string.h>
  13. #include "libyuv/cpu_id.h"
  14. #include "libyuv/planar_functions.h" // For CopyARGB
  15. #include "libyuv/row.h"
  16. #include "libyuv/scale_row.h"
  17. #ifdef __cplusplus
  18. namespace libyuv {
  19. extern "C" {
  20. #endif
  21. // This module is for Mips MMI.
  22. #if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
  23. // clang-format off
  24. // CPU agnostic row functions
  25. void ScaleRowDown2_MMI(const uint8_t* src_ptr,
  26. ptrdiff_t src_stride,
  27. uint8_t* dst,
  28. int dst_width) {
  29. (void)src_stride;
  30. uint64_t src0, src1, dest;
  31. const uint64_t shift = 0x8ULL;
  32. __asm__ volatile(
  33. "1: \n\t"
  34. "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t"
  35. "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t"
  36. "psrlh %[src0], %[src0], %[shift] \n\t"
  37. "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t"
  38. "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t"
  39. "psrlh %[src1], %[src1], %[shift] \n\t"
  40. "packushb %[dest], %[src0], %[src1] \n\t"
  41. "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
  42. "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
  43. "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t"
  44. "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
  45. "daddi %[width], %[width], -0x08 \n\t"
  46. "bnez %[width], 1b \n\t"
  47. : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest)
  48. : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width),
  49. [shift] "f"(shift)
  50. : "memory");
  51. }
  52. void ScaleRowDown2Linear_MMI(const uint8_t* src_ptr,
  53. ptrdiff_t src_stride,
  54. uint8_t* dst,
  55. int dst_width) {
  56. (void)src_stride;
  57. uint64_t src0, src1;
  58. uint64_t dest, dest0, dest1;
  59. const uint64_t mask = 0x00ff00ff00ff00ffULL;
  60. const uint64_t shift = 0x8ULL;
  61. __asm__ volatile(
  62. "1: \n\t"
  63. "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t"
  64. "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t"
  65. "and %[dest0], %[src0], %[mask] \n\t"
  66. "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t"
  67. "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t"
  68. "and %[dest1], %[src1], %[mask] \n\t"
  69. "packushb %[dest0], %[dest0], %[dest1] \n\t"
  70. "psrlh %[src0], %[src0], %[shift] \n\t"
  71. "psrlh %[src1], %[src1], %[shift] \n\t"
  72. "packushb %[dest1], %[src0], %[src1] \n\t"
  73. "pavgb %[dest], %[dest0], %[dest1] \n\t"
  74. "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
  75. "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
  76. "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t"
  77. "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
  78. "daddi %[width], %[width], -0x08 \n\t"
  79. "bnez %[width], 1b \n\t"
  80. : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest0] "=&f"(dest0),
  81. [dest1] "=&f"(dest1), [dest] "=&f"(dest)
  82. : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [mask] "f"(mask),
  83. [shift] "f"(shift), [width] "r"(dst_width)
  84. : "memory");
  85. }
  86. void ScaleRowDown2Box_MMI(const uint8_t* src_ptr,
  87. ptrdiff_t src_stride,
  88. uint8_t* dst,
  89. int dst_width) {
  90. const uint8_t* s = src_ptr;
  91. const uint8_t* t = src_ptr + src_stride;
  92. uint64_t s0, s1, t0, t1;
  93. uint64_t dest, dest0, dest1;
  94. const uint64_t ph = 0x0002000200020002ULL;
  95. const uint64_t mask = 0x00ff00ff00ff00ffULL;
  96. const uint64_t shift0 = 0x2ULL;
  97. const uint64_t shift1 = 0x8ULL;
  98. __asm__ volatile(
  99. "1: \n\t"
  100. "gsldrc1 %[s0], 0x00(%[s]) \n\t"
  101. "gsldlc1 %[s0], 0x07(%[s]) \n\t"
  102. "psrlh %[s1], %[s0], %[shift1] \n\t"
  103. "and %[s0], %[s0], %[mask] \n\t"
  104. "gsldrc1 %[t0], 0x00(%[t]) \n\t"
  105. "gsldlc1 %[t0], 0x07(%[t]) \n\t"
  106. "psrlh %[t1], %[t0], %[shift1] \n\t"
  107. "and %[t0], %[t0], %[mask] \n\t"
  108. "paddh %[dest0], %[s0], %[s1] \n\t"
  109. "paddh %[dest0], %[dest0], %[t0] \n\t"
  110. "paddh %[dest0], %[dest0], %[t1] \n\t"
  111. "paddh %[dest0], %[dest0], %[ph] \n\t"
  112. "psrlh %[dest0], %[dest0], %[shift0] \n\t"
  113. "gsldrc1 %[s0], 0x08(%[s]) \n\t"
  114. "gsldlc1 %[s0], 0x0f(%[s]) \n\t"
  115. "psrlh %[s1], %[s0], %[shift1] \n\t"
  116. "and %[s0], %[s0], %[mask] \n\t"
  117. "gsldrc1 %[t0], 0x08(%[t]) \n\t"
  118. "gsldlc1 %[t0], 0x0f(%[t]) \n\t"
  119. "psrlh %[t1], %[t0], %[shift1] \n\t"
  120. "and %[t0], %[t0], %[mask] \n\t"
  121. "paddh %[dest1], %[s0], %[s1] \n\t"
  122. "paddh %[dest1], %[dest1], %[t0] \n\t"
  123. "paddh %[dest1], %[dest1], %[t1] \n\t"
  124. "paddh %[dest1], %[dest1], %[ph] \n\t"
  125. "psrlh %[dest1], %[dest1], %[shift0] \n\t"
  126. "packushb %[dest], %[dest0], %[dest1] \n\t"
  127. "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
  128. "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
  129. "daddiu %[s], %[s], 0x10 \n\t"
  130. "daddiu %[t], %[t], 0x10 \n\t"
  131. "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
  132. "daddi %[width], %[width], -0x08 \n\t"
  133. "bnez %[width], 1b \n\t"
  134. : [s0] "=&f"(s0), [s1] "=&f"(s1), [t0] "=&f"(t0), [t1] "=&f"(t1),
  135. [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest] "=&f"(dest)
  136. : [s] "r"(s), [t] "r"(t), [dst_ptr] "r"(dst), [width] "r"(dst_width),
  137. [shift0] "f"(shift0), [shift1] "f"(shift1), [ph] "f"(ph),
  138. [mask] "f"(mask)
  139. : "memory");
  140. }
  141. void ScaleARGBRowDown2_MMI(const uint8_t* src_argb,
  142. ptrdiff_t src_stride,
  143. uint8_t* dst_argb,
  144. int dst_width) {
  145. (void)src_stride;
  146. const uint32_t* src = (const uint32_t*)(src_argb);
  147. uint32_t* dst = (uint32_t*)(dst_argb);
  148. uint64_t src0, src1, dest;
  149. __asm__ volatile(
  150. "1: \n\t"
  151. "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t"
  152. "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t"
  153. "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t"
  154. "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t"
  155. "punpckhwd %[dest], %[src0], %[src1] \n\t"
  156. "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
  157. "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
  158. "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t"
  159. "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
  160. "daddi %[width], %[width], -0x02 \n\t"
  161. "bnez %[width], 1b \n\t"
  162. : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest)
  163. : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [width] "r"(dst_width)
  164. : "memory");
  165. }
  166. void ScaleARGBRowDown2Linear_MMI(const uint8_t* src_argb,
  167. ptrdiff_t src_stride,
  168. uint8_t* dst_argb,
  169. int dst_width) {
  170. (void)src_stride;
  171. uint64_t src0, src1;
  172. uint64_t dest, dest_hi, dest_lo;
  173. __asm__ volatile(
  174. "1: \n\t"
  175. "lwc1 %[src0], 0x00(%[src_ptr]) \n\t"
  176. "lwc1 %[src1], 0x08(%[src_ptr]) \n\t"
  177. "punpcklwd %[dest_lo], %[src0], %[src1] \n\t"
  178. "lwc1 %[src0], 0x04(%[src_ptr]) \n\t"
  179. "lwc1 %[src1], 0x0c(%[src_ptr]) \n\t"
  180. "punpcklwd %[dest_hi], %[src0], %[src1] \n\t"
  181. "pavgb %[dest], %[dest_lo], %[dest_hi] \n\t"
  182. "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
  183. "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
  184. "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t"
  185. "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
  186. "daddi %[width], %[width], -0x02 \n\t"
  187. "bnez %[width], 1b \n\t"
  188. : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest_hi] "=&f"(dest_hi),
  189. [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest)
  190. : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [width] "r"(dst_width)
  191. : "memory");
  192. }
  193. void ScaleARGBRowDown2Box_MMI(const uint8_t* src_argb,
  194. ptrdiff_t src_stride,
  195. uint8_t* dst_argb,
  196. int dst_width) {
  197. const uint8_t* s = src_argb;
  198. const uint8_t* t = src_argb + src_stride;
  199. uint64_t s0, s_hi, s_lo;
  200. uint64_t t0, t_hi, t_lo;
  201. uint64_t dest, dest_hi, dest_lo;
  202. const uint64_t mask = 0x0ULL;
  203. const uint64_t ph = 0x0002000200020002ULL;
  204. const uint64_t shfit = 0x2ULL;
  205. __asm__ volatile(
  206. "1: \n\t"
  207. "gsldrc1 %[s0], 0x00(%[s]) \n\t"
  208. "gsldlc1 %[s0], 0x07(%[s]) \n\t"
  209. "punpcklbh %[s_lo], %[s0], %[mask] \n\t"
  210. "punpckhbh %[s_hi], %[s0], %[mask] \n\t"
  211. "paddh %[dest_lo], %[s_lo], %[s_hi] \n\t"
  212. "gsldrc1 %[t0], 0x00(%[t]) \n\t"
  213. "gsldlc1 %[t0], 0x07(%[t]) \n\t"
  214. "punpcklbh %[t_lo], %[t0], %[mask] \n\t"
  215. "punpckhbh %[t_hi], %[t0], %[mask] \n\t"
  216. "paddh %[dest_lo], %[dest_lo], %[t_lo] \n\t"
  217. "paddh %[dest_lo], %[dest_lo], %[t_hi] \n\t"
  218. "paddh %[dest_lo], %[dest_lo], %[ph] \n\t"
  219. "psrlh %[dest_lo], %[dest_lo], %[shfit] \n\t"
  220. "gsldrc1 %[s0], 0x08(%[s]) \n\t"
  221. "gsldlc1 %[s0], 0x0f(%[s]) \n\t"
  222. "punpcklbh %[s_lo], %[s0], %[mask] \n\t"
  223. "punpckhbh %[s_hi], %[s0], %[mask] \n\t"
  224. "paddh %[dest_hi], %[s_lo], %[s_hi] \n\t"
  225. "gsldrc1 %[t0], 0x08(%[t]) \n\t"
  226. "gsldlc1 %[t0], 0x0f(%[t]) \n\t"
  227. "punpcklbh %[t_lo], %[t0], %[mask] \n\t"
  228. "punpckhbh %[t_hi], %[t0], %[mask] \n\t"
  229. "paddh %[dest_hi], %[dest_hi], %[t_lo] \n\t"
  230. "paddh %[dest_hi], %[dest_hi], %[t_hi] \n\t"
  231. "paddh %[dest_hi], %[dest_hi], %[ph] \n\t"
  232. "psrlh %[dest_hi], %[dest_hi], %[shfit] \n\t"
  233. "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
  234. "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
  235. "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
  236. "daddiu %[s], %[s], 0x10 \n\t"
  237. "daddiu %[t], %[t], 0x10 \n\t"
  238. "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
  239. "daddi %[width], %[width], -0x02 \n\t"
  240. "bnez %[width], 1b \n\t"
  241. : [s0] "=&f"(s0), [t0] "=&f"(t0), [dest_hi] "=&f"(dest_hi),
  242. [dest_lo] "=&f"(dest_lo), [s_hi] "=&f"(s_hi), [s_lo] "=&f"(s_lo),
  243. [t_hi] "=&f"(t_hi), [t_lo] "=&f"(t_lo), [dest] "=&f"(dest)
  244. : [s] "r"(s), [t] "r"(t), [dst_ptr] "r"(dst_argb), [width] "r"(dst_width),
  245. [mask] "f"(mask), [ph] "f"(ph), [shfit] "f"(shfit)
  246. : "memory");
  247. }
  248. void ScaleRowDown2_16_MMI(const uint16_t* src_ptr,
  249. ptrdiff_t src_stride,
  250. uint16_t* dst,
  251. int dst_width) {
  252. (void)src_stride;
  253. uint64_t src0, src1, dest;
  254. const uint64_t shift = 0x10ULL;
  255. __asm__ volatile(
  256. "1: \n\t"
  257. "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t"
  258. "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t"
  259. "psrlw %[src0], %[src0], %[shift] \n\t"
  260. "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t"
  261. "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t"
  262. "psrlw %[src1], %[src1], %[shift] \n\t"
  263. "packsswh %[dest], %[src0], %[src1] \n\t"
  264. "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
  265. "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
  266. "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t"
  267. "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
  268. "daddi %[width], %[width], -0x04 \n\t"
  269. "bnez %[width], 1b \n\t"
  270. : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest)
  271. : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width),
  272. [shift] "f"(shift)
  273. : "memory");
  274. }
  275. void ScaleRowDown2Linear_16_MMI(const uint16_t* src_ptr,
  276. ptrdiff_t src_stride,
  277. uint16_t* dst,
  278. int dst_width) {
  279. (void)src_stride;
  280. uint64_t src0, src1;
  281. uint64_t dest, dest_hi, dest_lo;
  282. __asm__ volatile(
  283. "1: \n\t"
  284. "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t"
  285. "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t"
  286. "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t"
  287. "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t"
  288. "punpcklhw %[dest_lo], %[src0], %[src1] \n\t"
  289. "punpckhhw %[dest_hi], %[src0], %[src1] \n\t"
  290. "punpcklhw %[src0], %[dest_lo], %[dest_hi] \n\t"
  291. "punpckhhw %[src1], %[dest_lo], %[dest_hi] \n\t"
  292. "pavgh %[dest], %[src0], %[src1] \n\t"
  293. "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
  294. "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
  295. "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t"
  296. "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
  297. "daddi %[width], %[width], -0x04 \n\t"
  298. "bnez %[width], 1b \n\t"
  299. : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest_hi] "=&f"(dest_hi),
  300. [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest)
  301. : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width)
  302. : "memory");
  303. }
  304. void ScaleRowDown2Box_16_MMI(const uint16_t* src_ptr,
  305. ptrdiff_t src_stride,
  306. uint16_t* dst,
  307. int dst_width) {
  308. const uint16_t* s = src_ptr;
  309. const uint16_t* t = src_ptr + src_stride;
  310. uint64_t s0, s1, s_hi, s_lo;
  311. uint64_t t0, t1, t_hi, t_lo;
  312. uint64_t dest, dest0, dest1;
  313. const uint64_t ph = 0x0000000200000002ULL;
  314. const uint64_t mask = 0x0000ffff0000ffffULL;
  315. const uint64_t shift0 = 0x10ULL;
  316. const uint64_t shift1 = 0x2ULL;
  317. __asm__ volatile(
  318. "1: \n\t"
  319. "gsldrc1 %[s0], 0x00(%[s]) \n\t"
  320. "gsldlc1 %[s0], 0x07(%[s]) \n\t"
  321. "psrlw %[s1], %[s0], %[shift0] \n\t"
  322. "and %[s0], %[s0], %[mask] \n\t"
  323. "gsldrc1 %[t0], 0x00(%[t]) \n\t"
  324. "gsldlc1 %[t0], 0x07(%[t]) \n\t"
  325. "psrlw %[t1], %[t0], %[shift0] \n\t"
  326. "and %[t0], %[t0], %[mask] \n\t"
  327. "paddw %[dest0], %[s0], %[s1] \n\t"
  328. "paddw %[dest0], %[dest0], %[t0] \n\t"
  329. "paddw %[dest0], %[dest0], %[t1] \n\t"
  330. "paddw %[dest0], %[dest0], %[ph] \n\t"
  331. "psrlw %[dest0], %[dest0], %[shift1] \n\t"
  332. "gsldrc1 %[s0], 0x08(%[s]) \n\t"
  333. "gsldlc1 %[s0], 0x0f(%[s]) \n\t"
  334. "psrlw %[s1], %[s0], %[shift0] \n\t"
  335. "and %[s0], %[s0], %[mask] \n\t"
  336. "gsldrc1 %[t0], 0x08(%[t]) \n\t"
  337. "gsldlc1 %[t0], 0x0f(%[t]) \n\t"
  338. "psrlw %[t1], %[t0], %[shift0] \n\t"
  339. "and %[t0], %[t0], %[mask] \n\t"
  340. "paddw %[dest1], %[s0], %[s1] \n\t"
  341. "paddw %[dest1], %[dest1], %[t0] \n\t"
  342. "paddw %[dest1], %[dest1], %[t1] \n\t"
  343. "paddw %[dest1], %[dest1], %[ph] \n\t"
  344. "psrlw %[dest1], %[dest1], %[shift1] \n\t"
  345. "packsswh %[dest], %[dest0], %[dest1] \n\t"
  346. "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
  347. "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
  348. "daddiu %[s], %[s], 0x10 \n\t"
  349. "daddiu %[t], %[t], 0x10 \n\t"
  350. "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
  351. "daddi %[width], %[width], -0x04 \n\t"
  352. "bnez %[width], 1b \n\t"
  353. : [s0] "=&f"(s0), [s1] "=&f"(s1), [t0] "=&f"(t0), [t1] "=&f"(t1),
  354. [s_hi] "=&f"(s_hi), [s_lo] "=&f"(s_lo), [t_hi] "=&f"(t_hi),
  355. [t_lo] "=&f"(t_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1),
  356. [dest] "=&f"(dest)
  357. : [s] "r"(s), [t] "r"(t), [dst_ptr] "r"(dst), [width] "r"(dst_width),
  358. [shift0] "f"(shift0), [shift1] "f"(shift1), [ph] "f"(ph),
  359. [mask] "f"(mask)
  360. : "memory");
  361. }
  362. void ScaleRowDown4_MMI(const uint8_t* src_ptr,
  363. ptrdiff_t src_stride,
  364. uint8_t* dst,
  365. int dst_width) {
  366. (void)src_stride;
  367. uint64_t src0, src1;
  368. uint64_t dest, dest_hi, dest_lo;
  369. const uint64_t shift = 0x10ULL;
  370. const uint64_t mask = 0x000000ff000000ffULL;
  371. __asm__ volatile(
  372. "1: \n\t"
  373. "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t"
  374. "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t"
  375. "psrlw %[src0], %[src0], %[shift] \n\t"
  376. "and %[src0], %[src0], %[mask] \n\t"
  377. "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t"
  378. "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t"
  379. "psrlw %[src1], %[src1], %[shift] \n\t"
  380. "and %[src1], %[src1], %[mask] \n\t"
  381. "packsswh %[dest_lo], %[src0], %[src1] \n\t"
  382. "gsldrc1 %[src0], 0x10(%[src_ptr]) \n\t"
  383. "gsldlc1 %[src0], 0x17(%[src_ptr]) \n\t"
  384. "psrlw %[src0], %[src0], %[shift] \n\t"
  385. "and %[src0], %[src0], %[mask] \n\t"
  386. "gsldrc1 %[src1], 0x18(%[src_ptr]) \n\t"
  387. "gsldlc1 %[src1], 0x1f(%[src_ptr]) \n\t"
  388. "psrlw %[src1], %[src1], %[shift] \n\t"
  389. "and %[src1], %[src1], %[mask] \n\t"
  390. "packsswh %[dest_hi], %[src0], %[src1] \n\t"
  391. "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
  392. "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
  393. "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
  394. "daddiu %[src_ptr], %[src_ptr], 0x20 \n\t"
  395. "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
  396. "daddi %[width], %[width], -0x08 \n\t"
  397. "bnez %[width], 1b \n\t"
  398. : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest_hi] "=&f"(dest_hi),
  399. [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest)
  400. : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width),
  401. [shift] "f"(shift), [mask] "f"(mask)
  402. : "memory");
  403. }
  404. void ScaleRowDown4_16_MMI(const uint16_t* src_ptr,
  405. ptrdiff_t src_stride,
  406. uint16_t* dst,
  407. int dst_width) {
  408. (void)src_stride;
  409. uint64_t src0, src1;
  410. uint64_t dest, dest_hi, dest_lo;
  411. const uint64_t mask = 0x0ULL;
  412. __asm__ volatile(
  413. "1: \n\t"
  414. "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t"
  415. "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t"
  416. "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t"
  417. "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t"
  418. "punpckhhw %[dest_lo], %[src0], %[src1] \n\t"
  419. "punpcklhw %[dest_lo], %[dest_lo], %[mask] \n\t"
  420. "gsldrc1 %[src0], 0x10(%[src_ptr]) \n\t"
  421. "gsldlc1 %[src0], 0x17(%[src_ptr]) \n\t"
  422. "gsldrc1 %[src1], 0x18(%[src_ptr]) \n\t"
  423. "gsldlc1 %[src1], 0x1f(%[src_ptr]) \n\t"
  424. "punpckhhw %[dest_hi], %[src0], %[src1] \n\t"
  425. "punpcklhw %[dest_hi], %[dest_hi], %[mask] \n\t"
  426. "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
  427. "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
  428. "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
  429. "daddiu %[src_ptr], %[src_ptr], 0x20 \n\t"
  430. "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
  431. "daddi %[width], %[width], -0x04 \n\t"
  432. "bnez %[width], 1b \n\t"
  433. : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest_hi] "=&f"(dest_hi),
  434. [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest)
  435. : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width),
  436. [mask] "f"(mask)
  437. : "memory");
  438. }
  439. #define DO_SCALEROWDOWN4BOX_PUNPCKADD() \
  440. "punpcklbh %[src_lo], %[src], %[mask0] \n\t" \
  441. "punpckhbh %[src_hi], %[src], %[mask0] \n\t" \
  442. "paddh %[dest_lo], %[dest_lo], %[src_lo] \n\t" \
  443. "paddh %[dest_hi], %[dest_hi], %[src_hi] \n\t"
  444. #define DO_SCALEROWDOWN4BOX_LOOP(reg) \
  445. "ldc1 %[src], 0x00(%[src0_ptr]) \n\t" \
  446. "punpcklbh %[dest_lo], %[src], %[mask0] \n\t" \
  447. "punpckhbh %[dest_hi], %[src], %[mask0] \n\t" \
  448. \
  449. "ldc1 %[src], 0x00(%[src1_ptr]) \n\t" \
  450. DO_SCALEROWDOWN4BOX_PUNPCKADD() \
  451. \
  452. "ldc1 %[src], 0x00(%[src2_ptr]) \n\t" \
  453. DO_SCALEROWDOWN4BOX_PUNPCKADD() \
  454. \
  455. "ldc1 %[src], 0x00(%[src3_ptr]) \n\t" \
  456. DO_SCALEROWDOWN4BOX_PUNPCKADD() \
  457. \
  458. "pmaddhw %[dest_lo], %[dest_lo], %[mask1] \n\t" \
  459. "pmaddhw %[dest_hi], %[dest_hi], %[mask1] \n\t" \
  460. "packsswh " #reg ", %[dest_lo], %[dest_hi] \n\t" \
  461. "pmaddhw " #reg ", " #reg ", %[mask1] \n\t" \
  462. "paddh " #reg ", " #reg ", %[ph] \n\t" \
  463. "psrlh " #reg ", " #reg ", %[shift] \n\t" \
  464. \
  465. "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t" \
  466. "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" \
  467. "daddiu %[src2_ptr], %[src2_ptr], 0x08 \n\t" \
  468. "daddiu %[src3_ptr], %[src3_ptr], 0x08 \n\t"
  469. /* LibYUVScaleTest.ScaleDownBy4_Box */
  470. void ScaleRowDown4Box_MMI(const uint8_t* src_ptr,
  471. ptrdiff_t src_stride,
  472. uint8_t* dst,
  473. int dst_width) {
  474. const uint8_t* src0_ptr = src_ptr;
  475. const uint8_t* src1_ptr = src_ptr + src_stride;
  476. const uint8_t* src2_ptr = src_ptr + src_stride * 2;
  477. const uint8_t* src3_ptr = src_ptr + src_stride * 3;
  478. uint64_t src, src_hi, src_lo;
  479. uint64_t dest, dest_hi, dest_lo, dest0, dest1, dest2, dest3;
  480. const uint64_t mask0 = 0x0ULL;
  481. const uint64_t mask1 = 0x0001000100010001ULL;
  482. const uint64_t ph = 0x0008000800080008ULL;
  483. const uint64_t shift = 0x4ULL;
  484. __asm__ volatile(
  485. "1: \n\t"
  486. DO_SCALEROWDOWN4BOX_LOOP(%[dest0])
  487. DO_SCALEROWDOWN4BOX_LOOP(%[dest1])
  488. DO_SCALEROWDOWN4BOX_LOOP(%[dest2])
  489. DO_SCALEROWDOWN4BOX_LOOP(%[dest3])
  490. "packsswh %[dest_lo], %[dest0], %[dest1] \n\t"
  491. "packsswh %[dest_hi], %[dest2], %[dest3] \n\t"
  492. "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
  493. "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
  494. "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
  495. "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
  496. "daddi %[width], %[width], -0x08 \n\t"
  497. "bnez %[width], 1b \n\t"
  498. : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
  499. [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo),
  500. [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
  501. [dest3] "=&f"(dest3), [src] "=&f"(src), [dest] "=&f"(dest)
  502. : [src0_ptr] "r"(src0_ptr), [src1_ptr] "r"(src1_ptr),
  503. [src2_ptr] "r"(src2_ptr), [src3_ptr] "r"(src3_ptr), [dst_ptr] "r"(dst),
  504. [width] "r"(dst_width), [shift] "f"(shift), [mask0] "f"(mask0),
  505. [ph] "f"(ph), [mask1] "f"(mask1)
  506. : "memory");
  507. }
  508. #define DO_SCALEROWDOWN4BOX_16_PUNPCKADD() \
  509. "punpcklbh %[src_lo], %[src], %[mask0] \n\t" \
  510. "punpckhbh %[src_hi], %[src], %[mask0] \n\t" \
  511. "paddh %[dest_lo], %[dest_lo], %[src_lo] \n\t" \
  512. "paddh %[dest_hi], %[dest_hi], %[src_hi] \n\t"
  513. #define DO_SCALEROWDOWN4BOX_16_LOOP(reg) \
  514. "ldc1 %[src], 0x00(%[src0_ptr]) \n\t" \
  515. "punpcklbh %[dest_lo], %[src], %[mask0] \n\t" \
  516. "punpckhbh %[dest_hi], %[src], %[mask0] \n\t" \
  517. \
  518. "ldc1 %[src], 0x00(%[src1_ptr]) \n\t" \
  519. DO_SCALEROWDOWN4BOX_16_PUNPCKADD() \
  520. \
  521. "ldc1 %[src], 0x00(%[src2_ptr]) \n\t" \
  522. DO_SCALEROWDOWN4BOX_16_PUNPCKADD() \
  523. \
  524. "ldc1 %[src], 0x00(%[src3_ptr]) \n\t" \
  525. DO_SCALEROWDOWN4BOX_16_PUNPCKADD() \
  526. \
  527. "paddw %[dest], %[dest_lo], %[dest_hi] \n\t" \
  528. "punpckhwd %[dest_hi], %[dest], %[dest] \n\t" \
  529. "paddw %[dest], %[dest_hi], %[dest] \n\t" \
  530. "paddw %[dest], %[dest], %[ph] \n\t" \
  531. "psraw %[dest], %[dest], %[shift] \n\t" \
  532. "and " #reg ", %[dest], %[mask1] \n\t" \
  533. \
  534. "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t" \
  535. "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" \
  536. "daddiu %[src2_ptr], %[src2_ptr], 0x08 \n\t" \
  537. "daddiu %[src3_ptr], %[src3_ptr], 0x08 \n\t"
  538. /* LibYUVScaleTest.ScaleDownBy4_Box_16 */
  539. void ScaleRowDown4Box_16_MMI(const uint16_t* src_ptr,
  540. ptrdiff_t src_stride,
  541. uint16_t* dst,
  542. int dst_width) {
  543. const uint16_t* src0_ptr = src_ptr;
  544. const uint16_t* src1_ptr = src_ptr + src_stride;
  545. const uint16_t* src2_ptr = src_ptr + src_stride * 2;
  546. const uint16_t* src3_ptr = src_ptr + src_stride * 3;
  547. uint64_t src, src_hi, src_lo;
  548. uint64_t dest, dest_hi, dest_lo, dest0, dest1, dest2, dest3;
  549. const uint64_t mask0 = 0x0ULL;
  550. const uint64_t mask1 = 0x00000000ffffffffULL;
  551. const uint64_t ph = 0x0000000800000008ULL;
  552. const uint64_t shift = 0x04ULL;
  553. __asm__ volatile(
  554. "1: \n\t"
  555. DO_SCALEROWDOWN4BOX_16_LOOP(%[dest0])
  556. DO_SCALEROWDOWN4BOX_16_LOOP(%[dest1])
  557. DO_SCALEROWDOWN4BOX_16_LOOP(%[dest2])
  558. DO_SCALEROWDOWN4BOX_16_LOOP(%[dest3])
  559. "punpcklwd %[dest_lo], %[dest0], %[dest1] \n\t"
  560. "punpcklwd %[dest_hi], %[dest2], %[dest3] \n\t"
  561. "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
  562. "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
  563. "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
  564. "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
  565. "daddi %[width], %[width], -0x04 \n\t"
  566. "bnez %[width], 1b \n\t"
  567. : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
  568. [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo),
  569. [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
  570. [dest3] "=&f"(dest3), [src] "=&f"(src), [dest] "=&f"(dest)
  571. : [src0_ptr] "r"(src0_ptr), [src1_ptr] "r"(src1_ptr),
  572. [src2_ptr] "r"(src2_ptr), [src3_ptr] "r"(src3_ptr), [dst_ptr] "r"(dst),
  573. [width] "r"(dst_width), [shift] "f"(shift), [mask0] "f"(mask0),
  574. [ph] "f"(ph), [mask1] "f"(mask1)
  575. : "memory");
  576. }
  577. // Scales a single row of pixels up by 2x using point sampling.
  578. void ScaleColsUp2_MMI(uint8_t* dst_ptr,
  579. const uint8_t* src_ptr,
  580. int dst_width,
  581. int x,
  582. int dx) {
  583. uint64_t src, dest;
  584. (void)x;
  585. (void)dx;
  586. __asm__ volatile(
  587. "1: \n\t"
  588. "lwc1 %[src], 0x00(%[src_ptr]) \n\t"
  589. "punpcklbh %[dest], %[src], %[src] \n\t"
  590. "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
  591. "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
  592. "daddiu %[src_ptr], %[src_ptr], 0x04 \n\t"
  593. "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
  594. "daddi %[width], %[width], -0x08 \n\t"
  595. "bnez %[width], 1b \n\t"
  596. : [src] "=&f"(src), [dest] "=&f"(dest)
  597. : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(dst_width)
  598. : "memory");
  599. }
  600. void ScaleColsUp2_16_MMI(uint16_t* dst_ptr,
  601. const uint16_t* src_ptr,
  602. int dst_width,
  603. int x,
  604. int dx) {
  605. uint64_t src, dest;
  606. (void)x;
  607. (void)dx;
  608. __asm__ volatile(
  609. "1: \n\t"
  610. "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
  611. "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
  612. "punpcklhw %[dest], %[src], %[src] \n\t"
  613. "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
  614. "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
  615. "punpckhhw %[dest], %[src], %[src] \n\t"
  616. "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t"
  617. "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t"
  618. "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
  619. "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
  620. "daddi %[width], %[width], -0x08 \n\t"
  621. "bnez %[width], 1b \n\t"
  622. : [src] "=&f"(src), [dest] "=&f"(dest)
  623. : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(dst_width)
  624. : "memory");
  625. }
  626. void ScaleAddRow_MMI(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) {
  627. uint64_t src, src_hi, src_lo, dest0, dest1;
  628. const uint64_t mask = 0x0ULL;
  629. __asm__ volatile(
  630. "1: \n\t"
  631. "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
  632. "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
  633. "punpcklbh %[src_lo], %[src], %[mask] \n\t"
  634. "punpckhbh %[src_hi], %[src], %[mask] \n\t"
  635. "gsldrc1 %[dest0], 0x00(%[dst_ptr]) \n\t"
  636. "gsldlc1 %[dest0], 0x07(%[dst_ptr]) \n\t"
  637. "paddush %[dest0], %[dest0], %[src_lo] \n\t"
  638. "gsldrc1 %[dest1], 0x08(%[dst_ptr]) \n\t"
  639. "gsldlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t"
  640. "paddush %[dest1], %[dest1], %[src_hi] \n\t"
  641. "gssdlc1 %[dest0], 0x07(%[dst_ptr]) \n\t"
  642. "gssdrc1 %[dest0], 0x00(%[dst_ptr]) \n\t"
  643. "gssdlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t"
  644. "gssdrc1 %[dest1], 0x08(%[dst_ptr]) \n\t"
  645. "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
  646. "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
  647. "daddi %[width], %[width], -0x08 \n\t"
  648. "bnez %[width], 1b \n\t"
  649. : [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src_hi] "=&f"(src_hi),
  650. [src_lo] "=&f"(src_lo), [src] "=&f"(src)
  651. : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(src_width),
  652. [mask] "f"(mask)
  653. : "memory");
  654. }
  655. void ScaleAddRow_16_MMI(const uint16_t* src_ptr,
  656. uint32_t* dst_ptr,
  657. int src_width) {
  658. uint64_t src, src_hi, src_lo, dest0, dest1;
  659. const uint64_t mask = 0x0ULL;
  660. __asm__ volatile(
  661. "1: \n\t"
  662. "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
  663. "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
  664. "punpcklhw %[src_lo], %[src], %[mask] \n\t"
  665. "punpckhhw %[src_hi], %[src], %[mask] \n\t"
  666. "gsldrc1 %[dest0], 0x00(%[dst_ptr]) \n\t"
  667. "gsldlc1 %[dest0], 0x07(%[dst_ptr]) \n\t"
  668. "paddw %[dest0], %[dest0], %[src_lo] \n\t"
  669. "gssdlc1 %[dest0], 0x07(%[dst_ptr]) \n\t"
  670. "gssdrc1 %[dest0], 0x00(%[dst_ptr]) \n\t"
  671. "gsldrc1 %[dest1], 0x08(%[dst_ptr]) \n\t"
  672. "gsldlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t"
  673. "paddw %[dest1], %[dest1], %[src_hi] \n\t"
  674. "gssdlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t"
  675. "gssdrc1 %[dest1], 0x08(%[dst_ptr]) \n\t"
  676. "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
  677. "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
  678. "daddi %[width], %[width], -0x04 \n\t"
  679. "bnez %[width], 1b \n\t"
  680. : [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src_hi] "=&f"(src_hi),
  681. [src_lo] "=&f"(src_lo), [src] "=&f"(src)
  682. : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(src_width),
  683. [mask] "f"(mask)
  684. : "memory");
  685. }
  686. void ScaleARGBRowDownEven_MMI(const uint8_t* src_argb,
  687. ptrdiff_t src_stride,
  688. int src_stepx,
  689. uint8_t* dst_argb,
  690. int dst_width) {
  691. (void)src_stride;
  692. uint64_t src0, src1, dest;
  693. __asm__ volatile(
  694. "1: \n\t"
  695. "lwc1 %[src0], 0x00(%[src_ptr]) \n\t"
  696. "dadd %[src_ptr], %[src_ptr], %[src_stepx_4]\n\t"
  697. "lwc1 %[src1], 0x00(%[src_ptr]) \n\t"
  698. "punpcklwd %[dest], %[src0], %[src1] \n\t"
  699. "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
  700. "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
  701. "dadd %[src_ptr], %[src_ptr], %[src_stepx_4]\n\t"
  702. "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
  703. "daddi %[width], %[width], -0x02 \n\t"
  704. "bnez %[width], 1b \n\t"
  705. : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest)
  706. : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb),
  707. [src_stepx_4] "r"(src_stepx << 2), [width] "r"(dst_width)
  708. : "memory");
  709. }
  710. void ScaleARGBRowDownEvenBox_MMI(const uint8_t* src_argb,
  711. ptrdiff_t src_stride,
  712. int src_stepx,
  713. uint8_t* dst_argb,
  714. int dst_width) {
  715. const uint8_t* src0_ptr = src_argb;
  716. const uint8_t* src1_ptr = src_argb + src_stride;
  717. uint64_t src0, src1, src_hi, src_lo;
  718. uint64_t dest, dest_hi, dest_lo, dest0, dest1;
  719. const uint64_t mask = 0x0ULL;
  720. const uint64_t ph = 0x0002000200020002ULL;
  721. const uint64_t shift = 0x2ULL;
  722. __asm__ volatile(
  723. "1: \n\t"
  724. "lwc1 %[src0], 0x00(%[src0_ptr]) \n\t"
  725. "punpcklbh %[dest_lo], %[src0], %[mask] \n\t"
  726. "lwc1 %[src0], 0x04(%[src0_ptr]) \n\t"
  727. "punpcklbh %[dest_hi], %[src0], %[mask] \n\t"
  728. "lwc1 %[src1], 0x00(%[src1_ptr]) \n\t"
  729. "punpcklbh %[src_lo], %[src1], %[mask] \n\t"
  730. "lwc1 %[src1], 0x04(%[src1_ptr]) \n\t"
  731. "punpcklbh %[src_hi], %[src1], %[mask] \n\t"
  732. "paddh %[dest_lo], %[dest_lo], %[src_lo] \n\t"
  733. "paddh %[dest_hi], %[dest_hi], %[src_hi] \n\t"
  734. "paddh %[dest0], %[dest_hi], %[dest_lo] \n\t"
  735. "paddh %[dest0], %[dest0], %[ph] \n\t"
  736. "psrlh %[dest0], %[dest0], %[shift] \n\t"
  737. "dadd %[src0_ptr], %[src0_ptr], %[src_stepx_4] \n\t"
  738. "dadd %[src1_ptr], %[src1_ptr], %[src_stepx_4] \n\t"
  739. "lwc1 %[src0], 0x00(%[src0_ptr]) \n\t"
  740. "punpcklbh %[dest_lo], %[src0], %[mask] \n\t"
  741. "lwc1 %[src0], 0x04(%[src0_ptr]) \n\t"
  742. "punpcklbh %[dest_hi], %[src0], %[mask] \n\t"
  743. "lwc1 %[src1], 0x00(%[src1_ptr]) \n\t"
  744. "punpcklbh %[src_lo], %[src1], %[mask] \n\t"
  745. "lwc1 %[src1], 0x04(%[src1_ptr]) \n\t"
  746. "punpcklbh %[src_hi], %[src1], %[mask] \n\t"
  747. "paddh %[dest_lo], %[dest_lo], %[src_lo] \n\t"
  748. "paddh %[dest_hi], %[dest_hi], %[src_hi] \n\t"
  749. "paddh %[dest1], %[dest_hi], %[dest_lo] \n\t"
  750. "paddh %[dest1], %[dest1], %[ph] \n\t"
  751. "psrlh %[dest1], %[dest1], %[shift] \n\t"
  752. "packushb %[dest], %[dest0], %[dest1] \n\t"
  753. "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
  754. "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
  755. "dadd %[src0_ptr], %[src0_ptr], %[src_stepx_4] \n\t"
  756. "dadd %[src1_ptr], %[src1_ptr], %[src_stepx_4] \n\t"
  757. "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
  758. "daddi %[width], %[width], -0x02 \n\t"
  759. "bnez %[width], 1b \n\t"
  760. : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
  761. [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo),
  762. [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src0] "=&f"(src0),
  763. [src1] "=&f"(src1), [dest] "=&f"(dest)
  764. : [src0_ptr] "r"(src0_ptr), [src1_ptr] "r"(src1_ptr),
  765. [dst_ptr] "r"(dst_argb), [width] "r"(dst_width),
  766. [src_stepx_4] "r"(src_stepx << 2), [shift] "f"(shift), [mask] "f"(mask),
  767. [ph] "f"(ph)
  768. : "memory");
  769. }
  770. // Scales a single row of pixels using point sampling.
  771. void ScaleARGBCols_MMI(uint8_t* dst_argb,
  772. const uint8_t* src_argb,
  773. int dst_width,
  774. int x,
  775. int dx) {
  776. const uint32_t* src = (const uint32_t*)(src_argb);
  777. uint32_t* dst = (uint32_t*)(dst_argb);
  778. const uint32_t* src_tmp;
  779. uint64_t dest, offset;
  780. const uint64_t shift0 = 16;
  781. const uint64_t shift1 = 2;
  782. __asm__ volatile(
  783. "1: \n\t"
  784. "srav %[offset], %[x], %[shift0] \n\t"
  785. "sllv %[offset], %[offset], %[shift1] \n\t"
  786. "dadd %[src_tmp], %[src_ptr], %[offset] \n\t"
  787. "lwc1 %[dest], 0x00(%[src_tmp]) \n\t"
  788. "swc1 %[dest], 0x00(%[dst_ptr]) \n\t"
  789. "dadd %[x], %[x], %[dx] \n\t"
  790. "daddiu %[dst_ptr], %[dst_ptr], 0x04 \n\t"
  791. "daddi %[width], %[width], -0x01 \n\t"
  792. "bnez %[width], 1b \n\t"
  793. : [dest] "=&f"(dest), [offset] "=&r"(offset), [src_tmp] "=&r"(src_tmp)
  794. : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [width] "r"(dst_width),
  795. [dx] "r"(dx), [x] "r"(x), [shift0] "r"(shift0), [shift1] "r"(shift1)
  796. : "memory");
  797. }
  798. // Scales a single row of pixels up by 2x using point sampling.
  799. void ScaleARGBColsUp2_MMI(uint8_t* dst_argb,
  800. const uint8_t* src_argb,
  801. int dst_width,
  802. int x,
  803. int dx) {
  804. uint64_t src, dest0, dest1;
  805. (void)x;
  806. (void)dx;
  807. __asm__ volatile(
  808. "1: \n\t"
  809. "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
  810. "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
  811. "punpcklwd %[dest0], %[src], %[src] \n\t"
  812. "gssdlc1 %[dest0], 0x07(%[dst_ptr]) \n\t"
  813. "gssdrc1 %[dest0], 0x00(%[dst_ptr]) \n\t"
  814. "punpckhwd %[dest1], %[src], %[src] \n\t"
  815. "gssdlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t"
  816. "gssdrc1 %[dest1], 0x08(%[dst_ptr]) \n\t"
  817. "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
  818. "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
  819. "daddi %[width], %[width], -0x04 \n\t"
  820. "bnez %[width], 1b \n\t"
  821. : [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src] "=&f"(src)
  822. : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [width] "r"(dst_width)
  823. : "memory");
  824. }
  825. // Divide num by div and return as 16.16 fixed point result.
  826. /* LibYUVBaseTest.TestFixedDiv */
  827. int FixedDiv_MIPS(int num, int div) {
  828. int quotient = 0;
  829. const int shift = 16;
  830. asm(
  831. "dsll %[num], %[num], %[shift] \n\t"
  832. "ddiv %[num], %[div] \t\n"
  833. "mflo %[quo] \t\n"
  834. : [quo] "+&r"(quotient)
  835. : [num] "r"(num), [div] "r"(div), [shift] "r"(shift));
  836. return quotient;
  837. }
  838. // Divide num by div and return as 16.16 fixed point result.
  839. /* LibYUVScaleTest.ARGBScaleTo320x240_Linear */
  840. int FixedDiv1_MIPS(int num, int div) {
  841. int quotient = 0;
  842. const int shift = 16;
  843. const int val1 = 1;
  844. const int64_t val11 = 0x00010001ULL;
  845. asm(
  846. "dsll %[num], %[num], %[shift] \n\t"
  847. "dsub %[num], %[num], %[val11] \n\t"
  848. "dsub %[div], %[div], %[val1] \n\t"
  849. "ddiv %[num], %[div] \t\n"
  850. "mflo %[quo] \t\n"
  851. : [quo] "+&r"(quotient)
  852. : [num] "r"(num), [div] "r"(div), [val1] "r"(val1), [val11] "r"(val11),
  853. [shift] "r"(shift));
  854. return quotient;
  855. }
  856. // Read 8x2 upsample with filtering and write 16x1.
  857. // actually reads an extra pixel, so 9x2.
  858. void ScaleRowUp2_16_MMI(const uint16_t* src_ptr,
  859. ptrdiff_t src_stride,
  860. uint16_t* dst,
  861. int dst_width) {
  862. const uint16_t* src2_ptr = src_ptr + src_stride;
  863. uint64_t src0, src1;
  864. uint64_t dest, dest04, dest15, dest26, dest37;
  865. uint64_t tmp0, tmp1, tmp2, tmp3;
  866. const uint64_t mask0 = 0x0003000900030009ULL;
  867. const uint64_t mask1 = 0x0001000300010003ULL;
  868. const uint64_t mask2 = 0x0009000300090003ULL;
  869. const uint64_t mask3 = 0x0003000100030001ULL;
  870. const uint64_t ph = 0x0000000800000008ULL;
  871. const uint64_t shift = 4;
  872. __asm__ volatile(
  873. "1: \n\t"
  874. "gsldrc1 %[src0], 0x00(%[src1_ptr]) \n\t"
  875. "gsldlc1 %[src0], 0x07(%[src1_ptr]) \n\t"
  876. "pmaddhw %[dest04], %[src0], %[mask0] \n\t"
  877. "gsldrc1 %[src1], 0x00(%[src2_ptr]) \n\t"
  878. "gsldlc1 %[src1], 0x07(%[src2_ptr]) \n\t"
  879. "pmaddhw %[dest], %[src1], %[mask1] \n\t"
  880. "paddw %[dest04], %[dest04], %[dest] \n\t"
  881. "paddw %[dest04], %[dest04], %[ph] \n\t"
  882. "psrlw %[dest04], %[dest04], %[shift] \n\t"
  883. "pmaddhw %[dest15], %[src0], %[mask2] \n\t"
  884. "pmaddhw %[dest], %[src1], %[mask3] \n\t"
  885. "paddw %[dest15], %[dest15], %[dest] \n\t"
  886. "paddw %[dest15], %[dest15], %[ph] \n\t"
  887. "psrlw %[dest15], %[dest15], %[shift] \n\t"
  888. "gsldrc1 %[src0], 0x02(%[src1_ptr]) \n\t"
  889. "gsldlc1 %[src0], 0x09(%[src1_ptr]) \n\t"
  890. "pmaddhw %[dest26], %[src0], %[mask0] \n\t"
  891. "gsldrc1 %[src1], 0x02(%[src2_ptr]) \n\t"
  892. "gsldlc1 %[src1], 0x09(%[src2_ptr]) \n\t"
  893. "pmaddhw %[dest], %[src1], %[mask1] \n\t"
  894. "paddw %[dest26], %[dest26], %[dest] \n\t"
  895. "paddw %[dest26], %[dest26], %[ph] \n\t"
  896. "psrlw %[dest26], %[dest26], %[shift] \n\t"
  897. "pmaddhw %[dest37], %[src0], %[mask2] \n\t"
  898. "pmaddhw %[dest], %[src1], %[mask3] \n\t"
  899. "paddw %[dest37], %[dest37], %[dest] \n\t"
  900. "paddw %[dest37], %[dest37], %[ph] \n\t"
  901. "psrlw %[dest37], %[dest37], %[shift] \n\t"
  902. /* tmp0 = ( 00 04 02 06 ) */
  903. "packsswh %[tmp0], %[dest04], %[dest26] \n\t"
  904. /* tmp1 = ( 01 05 03 07 ) */
  905. "packsswh %[tmp1], %[dest15], %[dest37] \n\t"
  906. /* tmp2 = ( 00 01 04 05 )*/
  907. "punpcklhw %[tmp2], %[tmp0], %[tmp1] \n\t"
  908. /* tmp3 = ( 02 03 06 07 )*/
  909. "punpckhhw %[tmp3], %[tmp0], %[tmp1] \n\t"
  910. /* ( 00 01 02 03 ) */
  911. "punpcklwd %[dest], %[tmp2], %[tmp3] \n\t"
  912. "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
  913. "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
  914. /* ( 04 05 06 07 ) */
  915. "punpckhwd %[dest], %[tmp2], %[tmp3] \n\t"
  916. "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t"
  917. "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t"
  918. "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t"
  919. "daddiu %[src2_ptr], %[src2_ptr], 0x08 \n\t"
  920. "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
  921. "daddi %[width], %[width], -0x08 \n\t"
  922. "bnez %[width], 1b \n\t"
  923. : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest04] "=&f"(dest04),
  924. [dest15] "=&f"(dest15), [dest26] "=&f"(dest26), [dest37] "=&f"(dest37),
  925. [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), [tmp2] "=&f"(tmp2),
  926. [tmp3] "=&f"(tmp3), [dest] "=&f"(dest)
  927. : [src1_ptr] "r"(src_ptr), [src2_ptr] "r"(src2_ptr), [dst_ptr] "r"(dst),
  928. [width] "r"(dst_width), [mask0] "f"(mask0), [mask1] "f"(mask1),
  929. [mask2] "f"(mask2), [mask3] "f"(mask3), [shift] "f"(shift), [ph] "f"(ph)
  930. : "memory");
  931. }
  932. void ScaleRowDown34_MMI(const uint8_t* src_ptr,
  933. ptrdiff_t src_stride,
  934. uint8_t* dst,
  935. int dst_width) {
  936. (void)src_stride;
  937. assert((dst_width % 3 == 0) && (dst_width > 0));
  938. uint64_t src[2];
  939. uint64_t tmp[2];
  940. __asm__ volatile (
  941. "1: \n\t"
  942. "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t"
  943. "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t"
  944. "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t"
  945. "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t"
  946. "and %[tmp1], %[src0], %[mask1] \n\t"
  947. "psrlw %[tmp0], %[src0], %[rmov] \n\t"
  948. "psllw %[tmp0], %[tmp0], %[lmov1] \n\t"
  949. "or %[src0], %[tmp0], %[tmp1] \n\t"
  950. "punpckhwd %[tmp0], %[src0], %[src0] \n\t"
  951. "psllw %[tmp1], %[tmp0], %[rmov] \n\t"
  952. "or %[src0], %[src0], %[tmp1] \n\t"
  953. "psrlw %[tmp0], %[tmp0], %[rmov8] \n\t"
  954. "pextrh %[tmp0], %[tmp0], %[zero] \n\t"
  955. "pinsrh_2 %[src0], %[src0], %[tmp0] \n\t"
  956. "pextrh %[tmp0], %[src1], %[zero] \n\t"
  957. "pinsrh_3 %[src0], %[src0], %[tmp0] \n\t"
  958. "punpckhwd %[tmp0], %[src1], %[src1] \n\t"
  959. "pextrh %[tmp1], %[tmp0], %[zero] \n\t"
  960. "psrlw %[src1], %[src1], %[rmov] \n\t"
  961. "psllw %[tmp1], %[tmp1], %[rmov8] \n\t"
  962. "or %[src1], %[src1], %[tmp1] \n\t"
  963. "and %[tmp0], %[tmp0], %[mask2] \n\t"
  964. "or %[src1], %[src1], %[tmp0] \n\t"
  965. "gssdlc1 %[src0], 0x07(%[dst_ptr]) \n\t"
  966. "gssdrc1 %[src0], 0x00(%[dst_ptr]) \n\t"
  967. "gsswlc1 %[src1], 0x0b(%[dst_ptr]) \n\t"
  968. "gsswrc1 %[src1], 0x08(%[dst_ptr]) \n\t"
  969. "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t"
  970. "daddi %[width], %[width], -0x0c \n\t"
  971. "daddiu %[dst_ptr], %[dst_ptr], 0x0c \n\t"
  972. "bnez %[width], 1b \n\t"
  973. : [src0]"=&f"(src[0]), [src1]"=&f"(src[1]),
  974. [tmp0]"=&f"(tmp[0]), [tmp1]"=&f"(tmp[1])
  975. : [src_ptr]"r"(src_ptr), [dst_ptr]"r"(dst),
  976. [lmov]"f"(0xc), [rmov]"f"(0x18),
  977. [mask1]"f"(0xffff0000ffff), [rmov8]"f"(0x8),
  978. [zero]"f"(0x0), [mask2]"f"(0xff000000),
  979. [width]"r"(dst_width), [lmov1]"f"(0x10)
  980. : "memory"
  981. );
  982. }
  983. // clang-format on
  984. #endif // !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
  985. #ifdef __cplusplus
  986. } // extern "C"
  987. } // namespace libyuv
  988. #endif