/* jccolext-mmi.c */
/*
 * Loongson MMI optimizations for libjpeg-turbo
 *
 * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
 * Copyright (C) 2014-2015, 2019, D. R. Commander.  All Rights Reserved.
 * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
 *                          All Rights Reserved.
 * Authors:  ZhuChen     <zhuchen@loongson.cn>
 *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
 *           CaiWanwei   <caiwanwei@loongson.cn>
 *           ZhangLixia  <zhanglixia-hf@loongson.cn>
 *
 * Based on the x86 SIMD extension for IJG JPEG library
 * Copyright (C) 1999-2006, MIYASAKA Masaru.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

/* This file is included by jccolor-mmi.c */
  33. #if RGB_RED == 0
  34. #define mmA mm0
  35. #define mmB mm1
  36. #elif RGB_GREEN == 0
  37. #define mmA mm2
  38. #define mmB mm3
  39. #elif RGB_BLUE == 0
  40. #define mmA mm4
  41. #define mmB mm5
  42. #else
  43. #define mmA mm6
  44. #define mmB mm7
  45. #endif
  46. #if RGB_RED == 1
  47. #define mmC mm0
  48. #define mmD mm1
  49. #elif RGB_GREEN == 1
  50. #define mmC mm2
  51. #define mmD mm3
  52. #elif RGB_BLUE == 1
  53. #define mmC mm4
  54. #define mmD mm5
  55. #else
  56. #define mmC mm6
  57. #define mmD mm7
  58. #endif
  59. #if RGB_RED == 2
  60. #define mmE mm0
  61. #define mmF mm1
  62. #elif RGB_GREEN == 2
  63. #define mmE mm2
  64. #define mmF mm3
  65. #elif RGB_BLUE == 2
  66. #define mmE mm4
  67. #define mmF mm5
  68. #else
  69. #define mmE mm6
  70. #define mmF mm7
  71. #endif
  72. #if RGB_RED == 3
  73. #define mmG mm0
  74. #define mmH mm1
  75. #elif RGB_GREEN == 3
  76. #define mmG mm2
  77. #define mmH mm3
  78. #elif RGB_BLUE == 3
  79. #define mmG mm4
  80. #define mmH mm5
  81. #else
  82. #define mmG mm6
  83. #define mmH mm7
  84. #endif
  85. void jsimd_rgb_ycc_convert_mmi(JDIMENSION image_width, JSAMPARRAY input_buf,
  86. JSAMPIMAGE output_buf, JDIMENSION output_row,
  87. int num_rows)
  88. {
  89. JSAMPROW inptr, outptr0, outptr1, outptr2;
  90. int num_cols, col;
  91. __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
  92. __m64 wk[7];
  93. __m64 Y_BG, Cb_RG, Cr_BG;
  94. while (--num_rows >= 0) {
  95. inptr = *input_buf++;
  96. outptr0 = output_buf[0][output_row];
  97. outptr1 = output_buf[1][output_row];
  98. outptr2 = output_buf[2][output_row];
  99. output_row++;
  100. for (num_cols = image_width; num_cols > 0; num_cols -= 8,
  101. outptr0 += 8, outptr1 += 8, outptr2 += 8) {
  102. #if RGB_PIXELSIZE == 3
  103. if (num_cols < 8) {
  104. col = num_cols * 3;
  105. asm(".set noreorder\r\n"
  106. "li $8, 1\r\n"
  107. "move $9, %3\r\n"
  108. "and $10, $9, $8\r\n"
  109. "beqz $10, 1f\r\n"
  110. "nop \r\n"
  111. "subu $9, $9, 1\r\n"
  112. "xor $12, $12, $12\r\n"
  113. "move $13, %5\r\n"
  114. "dadd $13, $13, $9\r\n"
  115. "lbu $12, 0($13)\r\n"
  116. "1: \r\n"
  117. "li $8, 2\r\n"
  118. "and $10, $9, $8\r\n"
  119. "beqz $10, 2f\r\n"
  120. "nop \r\n"
  121. "subu $9, $9, 2\r\n"
  122. "xor $11, $11, $11\r\n"
  123. "move $13, %5\r\n"
  124. "dadd $13, $13, $9\r\n"
  125. "lhu $11, 0($13)\r\n"
  126. "sll $12, $12, 16\r\n"
  127. "or $12, $12, $11\r\n"
  128. "2: \r\n"
  129. "dmtc1 $12, %0\r\n"
  130. "li $8, 4\r\n"
  131. "and $10, $9, $8\r\n"
  132. "beqz $10, 3f\r\n"
  133. "nop \r\n"
  134. "subu $9, $9, 4\r\n"
  135. "move $13, %5\r\n"
  136. "dadd $13, $13, $9\r\n"
  137. "lwu $14, 0($13)\r\n"
  138. "dmtc1 $14, %1\r\n"
  139. "dsll32 $12, $12, 0\r\n"
  140. "or $12, $12, $14\r\n"
  141. "dmtc1 $12, %0\r\n"
  142. "3: \r\n"
  143. "li $8, 8\r\n"
  144. "and $10, $9, $8\r\n"
  145. "beqz $10, 4f\r\n"
  146. "nop \r\n"
  147. "mov.s %1, %0\r\n"
  148. "ldc1 %0, 0(%5)\r\n"
  149. "li $9, 8\r\n"
  150. "j 5f\r\n"
  151. "nop \r\n"
  152. "4: \r\n"
  153. "li $8, 16\r\n"
  154. "and $10, $9, $8\r\n"
  155. "beqz $10, 5f\r\n"
  156. "nop \r\n"
  157. "mov.s %2, %0\r\n"
  158. "ldc1 %0, 0(%5)\r\n"
  159. "ldc1 %1, 8(%5)\r\n"
  160. "5: \r\n"
  161. "nop \r\n"
  162. ".set reorder\r\n"
  163. : "=f" (mmA), "=f" (mmG), "=f" (mmF)
  164. : "r" (col), "r" (num_rows), "r" (inptr)
  165. : "$f0", "$f2", "$f4", "$8", "$9", "$10", "$11", "$12", "$13",
  166. "$14", "memory"
  167. );
  168. } else {
  169. if (!(((long)inptr) & 7)) {
  170. mmA = _mm_load_si64((__m64 *)&inptr[0]);
  171. mmG = _mm_load_si64((__m64 *)&inptr[8]);
  172. mmF = _mm_load_si64((__m64 *)&inptr[16]);
  173. } else {
  174. mmA = _mm_loadu_si64((__m64 *)&inptr[0]);
  175. mmG = _mm_loadu_si64((__m64 *)&inptr[8]);
  176. mmF = _mm_loadu_si64((__m64 *)&inptr[16]);
  177. }
  178. inptr += RGB_PIXELSIZE * 8;
  179. }
  180. mmD = mmA;
  181. mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT);
  182. mmD = _mm_srli_si64(mmD, 4 * BYTE_BIT);
  183. mmA = _mm_unpackhi_pi8(mmA, mmG);
  184. mmG = _mm_slli_si64(mmG, 4 * BYTE_BIT);
  185. mmD = _mm_unpacklo_pi8(mmD, mmF);
  186. mmG = _mm_unpackhi_pi8(mmG, mmF);
  187. mmE = mmA;
  188. mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT);
  189. mmE = _mm_srli_si64(mmE, 4 * BYTE_BIT);
  190. mmA = _mm_unpackhi_pi8(mmA, mmD);
  191. mmD = _mm_slli_si64(mmD, 4 * BYTE_BIT);
  192. mmE = _mm_unpacklo_pi8(mmE, mmG);
  193. mmD = _mm_unpackhi_pi8(mmD, mmG);
  194. mmC = mmA;
  195. mmA = _mm_loadlo_pi8_f(mmA);
  196. mmC = _mm_loadhi_pi8_f(mmC);
  197. mmB = mmE;
  198. mmE = _mm_loadlo_pi8_f(mmE);
  199. mmB = _mm_loadhi_pi8_f(mmB);
  200. mmF = mmD;
  201. mmD = _mm_loadlo_pi8_f(mmD);
  202. mmF = _mm_loadhi_pi8_f(mmF);
  203. #else /* RGB_PIXELSIZE == 4 */
  204. if (num_cols < 8) {
  205. col = num_cols;
  206. asm(".set noreorder\r\n"
  207. "li $8, 1\r\n"
  208. "move $9, %4\r\n"
  209. "and $10, $9, $8\r\n"
  210. "beqz $10, 1f\r\n"
  211. "nop \r\n"
  212. "subu $9, $9, 1\r\n"
  213. "dsll $11, $9, 2\r\n"
  214. "move $13, %5\r\n"
  215. "daddu $13, $13, $11\r\n"
  216. "lwc1 %0, 0($13)\r\n"
  217. "1: \r\n"
  218. "li $8, 2\r\n"
  219. "and $10, $9, $8\r\n"
  220. "beqz $10, 2f\r\n"
  221. "nop \r\n"
  222. "subu $9, $9, 2\r\n"
  223. "dsll $11, $9, 2\r\n"
  224. "move $13, %5\r\n"
  225. "daddu $13, $13, $11\r\n"
  226. "mov.s %1, %0\r\n"
  227. "ldc1 %0, 0($13)\r\n"
  228. "2: \r\n"
  229. "li $8, 4\r\n"
  230. "and $10, $9, $8\r\n"
  231. "beqz $10, 3f\r\n"
  232. "nop \r\n"
  233. "mov.s %2, %0\r\n"
  234. "mov.s %3, %1\r\n"
  235. "ldc1 %0, 0(%5)\r\n"
  236. "ldc1 %1, 8(%5)\r\n"
  237. "3: \r\n"
  238. "nop \r\n"
  239. ".set reorder\r\n"
  240. : "=f" (mmA), "=f" (mmF), "=f" (mmD), "=f" (mmC)
  241. : "r" (col), "r" (inptr)
  242. : "$f0", "$f2", "$8", "$9", "$10", "$11", "$13", "memory"
  243. );
  244. } else {
  245. if (!(((long)inptr) & 7)) {
  246. mmA = _mm_load_si64((__m64 *)&inptr[0]);
  247. mmF = _mm_load_si64((__m64 *)&inptr[8]);
  248. mmD = _mm_load_si64((__m64 *)&inptr[16]);
  249. mmC = _mm_load_si64((__m64 *)&inptr[24]);
  250. } else {
  251. mmA = _mm_loadu_si64((__m64 *)&inptr[0]);
  252. mmF = _mm_loadu_si64((__m64 *)&inptr[8]);
  253. mmD = _mm_loadu_si64((__m64 *)&inptr[16]);
  254. mmC = _mm_loadu_si64((__m64 *)&inptr[24]);
  255. }
  256. inptr += RGB_PIXELSIZE * 8;
  257. }
  258. mmB = mmA;
  259. mmA = _mm_unpacklo_pi8(mmA, mmF);
  260. mmB = _mm_unpackhi_pi8(mmB, mmF);
  261. mmG = mmD;
  262. mmD = _mm_unpacklo_pi8(mmD, mmC);
  263. mmG = _mm_unpackhi_pi8(mmG, mmC);
  264. mmE = mmA;
  265. mmA = _mm_unpacklo_pi16(mmA, mmD);
  266. mmE = _mm_unpackhi_pi16(mmE, mmD);
  267. mmH = mmB;
  268. mmB = _mm_unpacklo_pi16(mmB, mmG);
  269. mmH = _mm_unpackhi_pi16(mmH, mmG);
  270. mmC = mmA;
  271. mmA = _mm_loadlo_pi8_f(mmA);
  272. mmC = _mm_loadhi_pi8_f(mmC);
  273. mmD = mmB;
  274. mmB = _mm_loadlo_pi8_f(mmB);
  275. mmD = _mm_loadhi_pi8_f(mmD);
  276. mmG = mmE;
  277. mmE = _mm_loadlo_pi8_f(mmE);
  278. mmG = _mm_loadhi_pi8_f(mmG);
  279. mmF = mmH;
  280. mmF = _mm_unpacklo_pi8(mmF, mmH);
  281. mmH = _mm_unpackhi_pi8(mmH, mmH);
  282. mmF = _mm_srli_pi16(mmF, BYTE_BIT);
  283. mmH = _mm_srli_pi16(mmH, BYTE_BIT);
  284. #endif
  285. wk[0] = mm0;
  286. wk[1] = mm1;
  287. wk[2] = mm4;
  288. wk[3] = mm5;
  289. mm6 = mm1;
  290. mm1 = _mm_unpacklo_pi16(mm1, mm3);
  291. mm6 = _mm_unpackhi_pi16(mm6, mm3);
  292. mm7 = mm1;
  293. mm4 = mm6;
  294. mm1 = _mm_madd_pi16(mm1, PW_F0299_F0337);
  295. mm6 = _mm_madd_pi16(mm6, PW_F0299_F0337);
  296. mm7 = _mm_madd_pi16(mm7, PW_MF016_MF033);
  297. mm4 = _mm_madd_pi16(mm4, PW_MF016_MF033);
  298. wk[4] = mm1;
  299. wk[5] = mm6;
  300. mm1 = _mm_loadlo_pi16_f(mm5);
  301. mm6 = _mm_loadhi_pi16_f(mm5);
  302. mm1 = _mm_srli_pi32(mm1, 1);
  303. mm6 = _mm_srli_pi32(mm6, 1);
  304. mm5 = PD_ONEHALFM1_CJ;
  305. mm7 = _mm_add_pi32(mm7, mm1);
  306. mm4 = _mm_add_pi32(mm4, mm6);
  307. mm7 = _mm_add_pi32(mm7, mm5);
  308. mm4 = _mm_add_pi32(mm4, mm5);
  309. mm7 = _mm_srli_pi32(mm7, SCALEBITS);
  310. mm4 = _mm_srli_pi32(mm4, SCALEBITS);
  311. mm7 = _mm_packs_pi32(mm7, mm4);
  312. mm1 = wk[2];
  313. mm6 = mm0;
  314. mm0 = _mm_unpacklo_pi16(mm0, mm2);
  315. mm6 = _mm_unpackhi_pi16(mm6, mm2);
  316. mm5 = mm0;
  317. mm4 = mm6;
  318. mm0 = _mm_madd_pi16(mm0, PW_F0299_F0337);
  319. mm6 = _mm_madd_pi16(mm6, PW_F0299_F0337);
  320. mm5 = _mm_madd_pi16(mm5, PW_MF016_MF033);
  321. mm4 = _mm_madd_pi16(mm4, PW_MF016_MF033);
  322. wk[6] = mm0;
  323. wk[7] = mm6;
  324. mm0 = _mm_loadlo_pi16_f(mm1);
  325. mm6 = _mm_loadhi_pi16_f(mm1);
  326. mm0 = _mm_srli_pi32(mm0, 1);
  327. mm6 = _mm_srli_pi32(mm6, 1);
  328. mm1 = PD_ONEHALFM1_CJ;
  329. mm5 = _mm_add_pi32(mm5, mm0);
  330. mm4 = _mm_add_pi32(mm4, mm6);
  331. mm5 = _mm_add_pi32(mm5, mm1);
  332. mm4 = _mm_add_pi32(mm4, mm1);
  333. mm5 = _mm_srli_pi32(mm5, SCALEBITS);
  334. mm4 = _mm_srli_pi32(mm4, SCALEBITS);
  335. mm5 = _mm_packs_pi32(mm5, mm4);
  336. mm7 = _mm_slli_pi16(mm7, BYTE_BIT);
  337. mm5 = _mm_or_si64(mm5, mm7);
  338. Cb_RG = mm5;
  339. mm0 = wk[3];
  340. mm6 = wk[2];
  341. mm1 = wk[1];
  342. mm4 = mm0;
  343. mm0 = _mm_unpacklo_pi16(mm0, mm3);
  344. mm4 = _mm_unpackhi_pi16(mm4, mm3);
  345. mm7 = mm0;
  346. mm5 = mm4;
  347. mm0 = _mm_madd_pi16(mm0, PW_F0114_F0250);
  348. mm4 = _mm_madd_pi16(mm4, PW_F0114_F0250);
  349. mm7 = _mm_madd_pi16(mm7, PW_MF008_MF041);
  350. mm5 = _mm_madd_pi16(mm5, PW_MF008_MF041);
  351. mm3 = PD_ONEHALF;
  352. mm0 = _mm_add_pi32(mm0, wk[4]);
  353. mm4 = _mm_add_pi32(mm4, wk[5]);
  354. mm0 = _mm_add_pi32(mm0, mm3);
  355. mm4 = _mm_add_pi32(mm4, mm3);
  356. mm0 = _mm_srli_pi32(mm0, SCALEBITS);
  357. mm4 = _mm_srli_pi32(mm4, SCALEBITS);
  358. mm0 = _mm_packs_pi32(mm0, mm4);
  359. mm3 = _mm_loadlo_pi16_f(mm1);
  360. mm4 = _mm_loadhi_pi16_f(mm1);
  361. mm3 = _mm_srli_pi32(mm3, 1);
  362. mm4 = _mm_srli_pi32(mm4, 1);
  363. mm1 = PD_ONEHALFM1_CJ;
  364. mm7 = _mm_add_pi32(mm7, mm3);
  365. mm5 = _mm_add_pi32(mm5, mm4);
  366. mm7 = _mm_add_pi32(mm7, mm1);
  367. mm5 = _mm_add_pi32(mm5, mm1);
  368. mm7 = _mm_srli_pi32(mm7, SCALEBITS);
  369. mm5 = _mm_srli_pi32(mm5, SCALEBITS);
  370. mm7 = _mm_packs_pi32(mm7, mm5);
  371. mm3 = wk[0];
  372. mm4 = mm6;
  373. mm6 = _mm_unpacklo_pi16(mm6, mm2);
  374. mm4 = _mm_unpackhi_pi16(mm4, mm2);
  375. mm1 = mm6;
  376. mm5 = mm4;
  377. mm6 = _mm_madd_pi16(mm6, PW_F0114_F0250);
  378. mm4 = _mm_madd_pi16(mm4, PW_F0114_F0250);
  379. mm1 = _mm_madd_pi16(mm1, PW_MF008_MF041);
  380. mm5 = _mm_madd_pi16(mm5, PW_MF008_MF041);
  381. mm2 = PD_ONEHALF;
  382. mm6 = _mm_add_pi32(mm6, wk[6]);
  383. mm4 = _mm_add_pi32(mm4, wk[7]);
  384. mm6 = _mm_add_pi32(mm6, mm2);
  385. mm4 = _mm_add_pi32(mm4, mm2);
  386. mm6 = _mm_srli_pi32(mm6, SCALEBITS);
  387. mm4 = _mm_srli_pi32(mm4, SCALEBITS);
  388. mm6 = _mm_packs_pi32(mm6, mm4);
  389. mm0 = _mm_slli_pi16(mm0, BYTE_BIT);
  390. mm6 = _mm_or_si64(mm6, mm0);
  391. Y_BG = mm6;
  392. mm2 = _mm_loadlo_pi16_f(mm3);
  393. mm4 = _mm_loadhi_pi16_f(mm3);
  394. mm2 = _mm_srli_pi32(mm2, 1);
  395. mm4 = _mm_srli_pi32(mm4, 1);
  396. mm0 = PD_ONEHALFM1_CJ;
  397. mm1 = _mm_add_pi32(mm1, mm2);
  398. mm5 = _mm_add_pi32(mm5, mm4);
  399. mm1 = _mm_add_pi32(mm1, mm0);
  400. mm5 = _mm_add_pi32(mm5, mm0);
  401. mm1 = _mm_srli_pi32(mm1, SCALEBITS);
  402. mm5 = _mm_srli_pi32(mm5, SCALEBITS);
  403. mm1 = _mm_packs_pi32(mm1, mm5);
  404. mm7 = _mm_slli_pi16(mm7, BYTE_BIT);
  405. mm1 = _mm_or_si64(mm1, mm7);
  406. Cr_BG = mm1;
  407. _mm_store_si64((__m64 *)&outptr0[0], Y_BG);
  408. _mm_store_si64((__m64 *)&outptr1[0], Cb_RG);
  409. _mm_store_si64((__m64 *)&outptr2[0], Cr_BG);
  410. }
  411. }
  412. }
  413. #undef mmA
  414. #undef mmB
  415. #undef mmC
  416. #undef mmD
  417. #undef mmE
  418. #undef mmF
  419. #undef mmG
  420. #undef mmH