compare_msa.cc 2.9 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697
  1. /*
  2. * Copyright 2017 The LibYuv Project Authors. All rights reserved.
  3. *
  4. * Use of this source code is governed by a BSD-style license
  5. * that can be found in the LICENSE file in the root of the source
  6. * tree. An additional intellectual property rights grant can be found
  7. * in the file PATENTS. All contributing project authors may
  8. * be found in the AUTHORS file in the root of the source tree.
  9. */
  10. #include "libyuv/basic_types.h"
  11. #include "libyuv/compare_row.h"
  12. #include "libyuv/row.h"
  13. // This module is for GCC MSA
  14. #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
  15. #include "libyuv/macros_msa.h"
  16. #ifdef __cplusplus
  17. namespace libyuv {
  18. extern "C" {
  19. #endif
  20. uint32_t HammingDistance_MSA(const uint8_t* src_a,
  21. const uint8_t* src_b,
  22. int count) {
  23. uint32_t diff = 0u;
  24. int i;
  25. v16u8 src0, src1, src2, src3;
  26. v2i64 vec0 = {0}, vec1 = {0};
  27. for (i = 0; i < count; i += 32) {
  28. src0 = (v16u8)__msa_ld_b((v16i8*)src_a, 0);
  29. src1 = (v16u8)__msa_ld_b((v16i8*)src_a, 16);
  30. src2 = (v16u8)__msa_ld_b((v16i8*)src_b, 0);
  31. src3 = (v16u8)__msa_ld_b((v16i8*)src_b, 16);
  32. src0 ^= src2;
  33. src1 ^= src3;
  34. vec0 += __msa_pcnt_d((v2i64)src0);
  35. vec1 += __msa_pcnt_d((v2i64)src1);
  36. src_a += 32;
  37. src_b += 32;
  38. }
  39. vec0 += vec1;
  40. diff = (uint32_t)__msa_copy_u_w((v4i32)vec0, 0);
  41. diff += (uint32_t)__msa_copy_u_w((v4i32)vec0, 2);
  42. return diff;
  43. }
  44. uint32_t SumSquareError_MSA(const uint8_t* src_a,
  45. const uint8_t* src_b,
  46. int count) {
  47. uint32_t sse = 0u;
  48. int i;
  49. v16u8 src0, src1, src2, src3;
  50. v8i16 vec0, vec1, vec2, vec3;
  51. v4i32 reg0 = {0}, reg1 = {0}, reg2 = {0}, reg3 = {0};
  52. v2i64 tmp0;
  53. for (i = 0; i < count; i += 32) {
  54. src0 = (v16u8)__msa_ld_b((v16i8*)src_a, 0);
  55. src1 = (v16u8)__msa_ld_b((v16i8*)src_a, 16);
  56. src2 = (v16u8)__msa_ld_b((v16i8*)src_b, 0);
  57. src3 = (v16u8)__msa_ld_b((v16i8*)src_b, 16);
  58. vec0 = (v8i16)__msa_ilvr_b((v16i8)src2, (v16i8)src0);
  59. vec1 = (v8i16)__msa_ilvl_b((v16i8)src2, (v16i8)src0);
  60. vec2 = (v8i16)__msa_ilvr_b((v16i8)src3, (v16i8)src1);
  61. vec3 = (v8i16)__msa_ilvl_b((v16i8)src3, (v16i8)src1);
  62. vec0 = __msa_hsub_u_h((v16u8)vec0, (v16u8)vec0);
  63. vec1 = __msa_hsub_u_h((v16u8)vec1, (v16u8)vec1);
  64. vec2 = __msa_hsub_u_h((v16u8)vec2, (v16u8)vec2);
  65. vec3 = __msa_hsub_u_h((v16u8)vec3, (v16u8)vec3);
  66. reg0 = __msa_dpadd_s_w(reg0, vec0, vec0);
  67. reg1 = __msa_dpadd_s_w(reg1, vec1, vec1);
  68. reg2 = __msa_dpadd_s_w(reg2, vec2, vec2);
  69. reg3 = __msa_dpadd_s_w(reg3, vec3, vec3);
  70. src_a += 32;
  71. src_b += 32;
  72. }
  73. reg0 += reg1;
  74. reg2 += reg3;
  75. reg0 += reg2;
  76. tmp0 = __msa_hadd_s_d(reg0, reg0);
  77. sse = (uint32_t)__msa_copy_u_w((v4i32)tmp0, 0);
  78. sse += (uint32_t)__msa_copy_u_w((v4i32)tmp0, 2);
  79. return sse;
  80. }
  81. #ifdef __cplusplus
  82. } // extern "C"
  83. } // namespace libyuv
  84. #endif
  85. #endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)