
/*
 *  Copyright 2016 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#ifndef INCLUDE_LIBYUV_MACROS_MSA_H_
#define INCLUDE_LIBYUV_MACROS_MSA_H_

#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
#include <msa.h>
#include <stdint.h>

#if (__mips_isa_rev >= 6)
#define LW(psrc)                                       \
  ({                                                   \
    const uint8_t* psrc_lw_m = (const uint8_t*)(psrc); \
    uint32_t val_m;                                    \
    asm volatile("lw %[val_m], %[psrc_lw_m] \n"        \
                 : [val_m] "=r"(val_m)                 \
                 : [psrc_lw_m] "m"(*psrc_lw_m));       \
    val_m;                                             \
  })

#if (__mips == 64)
#define LD(psrc)                                       \
  ({                                                   \
    const uint8_t* psrc_ld_m = (const uint8_t*)(psrc); \
    uint64_t val_m = 0;                                \
    asm volatile("ld %[val_m], %[psrc_ld_m] \n"        \
                 : [val_m] "=r"(val_m)                 \
                 : [psrc_ld_m] "m"(*psrc_ld_m));       \
    val_m;                                             \
  })
#else  // !(__mips == 64)
#define LD(psrc)                                                         \
  ({                                                                     \
    const uint8_t* psrc_ld_m = (const uint8_t*)(psrc);                   \
    uint32_t val0_m, val1_m;                                             \
    uint64_t val_m = 0;                                                  \
    val0_m = LW(psrc_ld_m);                                              \
    val1_m = LW(psrc_ld_m + 4);                                          \
    val_m = (uint64_t)(val1_m); /* NOLINT */                             \
    val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); /* NOLINT */ \
    val_m = (uint64_t)(val_m | (uint64_t)val0_m); /* NOLINT */           \
    val_m;                                                               \
  })
#endif  // (__mips == 64)
#define SW(val, pdst)                                   \
  ({                                                    \
    uint8_t* pdst_sw_m = (uint8_t*)(pdst); /* NOLINT */ \
    uint32_t val_m = (val);                             \
    asm volatile("sw %[val_m], %[pdst_sw_m] \n"         \
                 : [pdst_sw_m] "=m"(*pdst_sw_m)         \
                 : [val_m] "r"(val_m));                 \
  })

#if (__mips == 64)
#define SD(val, pdst)                                   \
  ({                                                    \
    uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */ \
    uint64_t val_m = (val);                             \
    asm volatile("sd %[val_m], %[pdst_sd_m] \n"         \
                 : [pdst_sd_m] "=m"(*pdst_sd_m)         \
                 : [val_m] "r"(val_m));                 \
  })
#else  // !(__mips == 64)
#define SD(val, pdst)                                        \
  ({                                                         \
    uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */      \
    uint32_t val0_m, val1_m;                                 \
    val0_m = (uint32_t)((val) & 0x00000000FFFFFFFF);         \
    val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \
    SW(val0_m, pdst_sd_m);                                   \
    SW(val1_m, pdst_sd_m + 4);                               \
  })
#endif  // !(__mips == 64)
#else  // !(__mips_isa_rev >= 6)
#define LW(psrc)                                       \
  ({                                                   \
    const uint8_t* psrc_lw_m = (const uint8_t*)(psrc); \
    uint32_t val_m;                                    \
    asm volatile("ulw %[val_m], %[psrc_lw_m] \n"       \
                 : [val_m] "=r"(val_m)                 \
                 : [psrc_lw_m] "m"(*psrc_lw_m));       \
    val_m;                                             \
  })

#if (__mips == 64)
#define LD(psrc)                                       \
  ({                                                   \
    const uint8_t* psrc_ld_m = (const uint8_t*)(psrc); \
    uint64_t val_m = 0;                                \
    asm volatile("uld %[val_m], %[psrc_ld_m] \n"       \
                 : [val_m] "=r"(val_m)                 \
                 : [psrc_ld_m] "m"(*psrc_ld_m));       \
    val_m;                                             \
  })
#else  // !(__mips == 64)
#define LD(psrc)                                                         \
  ({                                                                     \
    const uint8_t* psrc_ld_m = (const uint8_t*)(psrc);                   \
    uint32_t val0_m, val1_m;                                             \
    uint64_t val_m = 0;                                                  \
    val0_m = LW(psrc_ld_m);                                              \
    val1_m = LW(psrc_ld_m + 4);                                          \
    val_m = (uint64_t)(val1_m); /* NOLINT */                             \
    val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); /* NOLINT */ \
    val_m = (uint64_t)(val_m | (uint64_t)val0_m); /* NOLINT */           \
    val_m;                                                               \
  })
#endif  // (__mips == 64)

#define SW(val, pdst)                                   \
  ({                                                    \
    uint8_t* pdst_sw_m = (uint8_t*)(pdst); /* NOLINT */ \
    uint32_t val_m = (val);                             \
    asm volatile("usw %[val_m], %[pdst_sw_m] \n"        \
                 : [pdst_sw_m] "=m"(*pdst_sw_m)         \
                 : [val_m] "r"(val_m));                 \
  })

#define SD(val, pdst)                                        \
  ({                                                         \
    uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */      \
    uint32_t val0_m, val1_m;                                 \
    val0_m = (uint32_t)((val) & 0x00000000FFFFFFFF);         \
    val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \
    SW(val0_m, pdst_sd_m);                                   \
    SW(val1_m, pdst_sd_m + 4);                               \
  })
#endif  // (__mips_isa_rev >= 6)
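
/* Usage sketch (illustrative, not part of the original header): the macros
   above provide unaligned-safe scalar loads/stores on both pre-R6 targets
   (via ulw/uld/usw) and R6 targets (where plain lw/ld/sw must tolerate
   unaligned addresses). 'src' and 'dst' are hypothetical byte pointers.

     const uint8_t* src = ...;  // possibly unaligned source
     uint8_t* dst = ...;        // possibly unaligned destination
     uint64_t v = LD(src);      // read 8 bytes
     SD(v, dst);                // write 8 bytes
     SW(LW(src + 8), dst + 8);  // copy 4 more bytes
*/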
// TODO(fbarchard): Consider removing __VA_ARGS__ versions.

#define LD_B(RTYPE, psrc) *((RTYPE*)(psrc)) /* NOLINT */
#define LD_UB(...) LD_B(const v16u8, __VA_ARGS__)

#define LD_H(RTYPE, psrc) *((RTYPE*)(psrc)) /* NOLINT */
#define LD_UH(...) LD_H(const v8u16, __VA_ARGS__)

#define ST_B(RTYPE, in, pdst) *((RTYPE*)(pdst)) = (in) /* NOLINT */
#define ST_UB(...) ST_B(v16u8, __VA_ARGS__)

#define ST_H(RTYPE, in, pdst) *((RTYPE*)(pdst)) = (in) /* NOLINT */
#define ST_UH(...) ST_H(v8u16, __VA_ARGS__)
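
/* Usage sketch (illustrative only): LD_UB/ST_UB move 16 bytes through a
   typed vector dereference. 'src' and 'dst' are hypothetical buffers of at
   least 16 bytes.

     v16u8 row = LD_UB(src);            // load 16 bytes as a v16u8
     row = __msa_adds_u_b(row, row);    // any MSA op, e.g. saturating add
     ST_UB(row, dst);                   // store the 16 bytes back
*/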
/* Description : Load two vectors with 16 'byte' sized elements
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Load 16 byte elements in 'out0' from (psrc)
                 Load 16 byte elements in 'out1' from (psrc + stride)
*/
#define LD_B2(RTYPE, psrc, stride, out0, out1) \
  {                                            \
    out0 = LD_B(RTYPE, (psrc));                \
    out1 = LD_B(RTYPE, (psrc) + stride);       \
  }
#define LD_UB2(...) LD_B2(const v16u8, __VA_ARGS__)

#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) \
  {                                                        \
    LD_B2(RTYPE, (psrc), stride, out0, out1);              \
    LD_B2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \
  }
#define LD_UB4(...) LD_B4(const v16u8, __VA_ARGS__)
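
/* Usage sketch (illustrative only): loading four consecutive rows of a
   16-byte-wide tile; 'src' (uint8_t*) and 'stride' (bytes per row) are
   hypothetical.

     v16u8 r0, r1, r2, r3;
     LD_UB4(src, stride, r0, r1, r2, r3);  // r0..r3 = rows 0..3
*/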
/* Description : Store two vectors of 16 'byte' sized elements, with stride
   Arguments   : Inputs - in0, in1, pdst, stride
   Details     : Store 16 byte elements from 'in0' to (pdst)
                 Store 16 byte elements from 'in1' to (pdst + stride)
*/
#define ST_B2(RTYPE, in0, in1, pdst, stride) \
  {                                          \
    ST_B(RTYPE, in0, (pdst));                \
    ST_B(RTYPE, in1, (pdst) + stride);       \
  }
#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)

#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride)   \
  {                                                      \
    ST_B2(RTYPE, in0, in1, (pdst), stride);              \
    ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \
  }
#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
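
/* Usage sketch (illustrative only): writing four rows back out with the
   same stride; 'dst' is a hypothetical uint8_t*.

     ST_UB4(r0, r1, r2, r3, dst, stride);  // rows 0..3 -> dst..dst+3*stride
*/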
/* Description : Store vectors of 8 halfword elements with stride
   Arguments   : Inputs - in0, in1, pdst, stride
   Details     : Store 8 halfword elements from 'in0' to (pdst)
                 Store 8 halfword elements from 'in1' to (pdst + stride)
*/
#define ST_H2(RTYPE, in0, in1, pdst, stride) \
  {                                          \
    ST_H(RTYPE, in0, (pdst));                \
    ST_H(RTYPE, in1, (pdst) + stride);       \
  }
#define ST_UH2(...) ST_H2(v8u16, __VA_ARGS__)
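
/* Usage sketch (illustrative only): note that '(pdst) + stride' uses the
   pointer arithmetic of pdst's declared type, so with a uint16_t*
   destination the stride counts halfwords, not bytes. 'dst16' and
   'stride16' are hypothetical.

     v8u16 sum0, sum1;                     // e.g. intermediate 16-bit sums
     ST_UH2(sum0, sum1, dst16, stride16);  // two rows of 8 halfwords
*/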
// TODO(fbarchard): Consider using __msa_vshf_b and __msa_ilvr_b directly.

/* Description : Shuffle byte vector elements as per mask vector
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Byte elements from 'in0' & 'in1' are copied selectively to
                 'out0' as per control vector 'mask0'; likewise, elements
                 from 'in2' & 'in3' are copied to 'out1' as per 'mask1'
*/
#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)  \
  {                                                                   \
    out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0); \
    out1 = (RTYPE)__msa_vshf_b((v16i8)mask1, (v16i8)in3, (v16i8)in2); \
  }
#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
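
/* Usage sketch (illustrative only): with the operand order used above, mask
   indices 0-15 select bytes of in0 and 16-31 select bytes of in1. A
   reversal mask, for example, flips the byte order of each input; 'a' and
   'b' are hypothetical v16u8 values.

     const v16i8 rev = {15, 14, 13, 12, 11, 10, 9, 8,
                        7, 6, 5, 4, 3, 2, 1, 0};
     v16u8 out0, out1;
     VSHF_B2_UB(a, a, b, b, rev, rev, out0, out1);  // out0/out1 = reversed a/b
*/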
/* Description : Interleave both left and right half of input vectors
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of byte elements from 'in0' and 'in1' are
                 interleaved and written to 'out0'; left half likewise to
                 'out1'
*/
#define ILVRL_B2(RTYPE, in0, in1, out0, out1)           \
  {                                                     \
    out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \
    out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \
  }
#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
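
/* Usage sketch (illustrative only): interleaving with a zero vector widens
   unsigned bytes to halfwords on a little-endian target, since the 'in1'
   bytes land in the even (low) element positions. 'pix' is a hypothetical
   v16u8 input.

     v16u8 zero = {0};
     v16u8 lo, hi;
     ILVRL_B2_UB(zero, pix, lo, hi);  // (v8u16)lo/hi = pix bytes widened to 16 bits
*/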

#endif /* !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) */
#endif  // INCLUDE_LIBYUV_MACROS_MSA_H_