// preprocess.cu
  1. #include "preprocess.h"
  2. #include <opencv2/opencv.hpp>
  // Warps a 3-channel interleaved 8-bit image into a planar float image using an
  // inverse (destination -> source) 2x3 affine mapping with bilinear
  // interpolation, then divides every channel by 255.
  //
  // Launch layout: 1-D grid of 1-D blocks, one thread per destination pixel.
  // Threads whose flat index is >= edge return without touching memory.
  //
  // src             source pixels, 3 bytes per pixel
  // src_line_size   byte distance between the starts of consecutive source rows
  // src_width       source width in pixels
  // src_height      source height in pixels
  // dst             output buffer: three consecutive planes of
  //                 dst_width * dst_height floats each
  // dst_width       destination width in pixels
  // dst_height      destination height in pixels
  // const_value_st  value used for every channel of a sample that falls outside
  //                 the source image
  // d2s             affine coefficients; value[0..2] yield the source x and
  //                 value[3..5] yield the source y
  // edge            number of destination pixels to process
  __global__ void warpaffine_kernel(
      uint8_t* src, int src_line_size, int src_width,
      int src_height, float* dst, int dst_width,
      int dst_height, uint8_t const_value_st,
      AffineMatrix d2s, int edge) {
      const int idx = blockDim.x * blockIdx.x + threadIdx.x;
      if (idx >= edge) {
          return;
      }

      // Destination pixel handled by this thread.
      const int dx = idx % dst_width;
      const int dy = idx / dst_width;

      // Source-space coordinates of that pixel.
      const float src_x = d2s.value[0] * dx + d2s.value[1] * dy + d2s.value[2] + 0.5f;
      const float src_y = d2s.value[3] * dx + d2s.value[4] * dy + d2s.value[5] + 0.5f;

      // Start from the fill value; it is kept when the sample is out of range.
      float c0 = const_value_st;
      float c1 = const_value_st;
      float c2 = const_value_st;

      // Written as the "outside" test on purpose: a NaN coordinate compares false
      // everywhere and therefore takes the interpolation path.
      const bool outside = src_x <= -1 || src_x >= src_width ||
                           src_y <= -1 || src_y >= src_height;
      if (!outside) {
          const int x_lo = floorf(src_x);
          const int y_lo = floorf(src_y);
          const int x_hi = x_lo + 1;
          const int y_hi = y_lo + 1;

          // Stand-in pixel for any of the four neighbours that lies off the image.
          uint8_t fill[] = {const_value_st, const_value_st, const_value_st};

          // Fractional offsets inside the 2x2 neighbourhood and their complements.
          const float fx = src_x - x_lo;
          const float fy = src_y - y_lo;
          const float gx = 1 - fx;
          const float gy = 1 - fy;
          const float w1 = gy * gx;
          const float w2 = gy * fx;
          const float w3 = fy * gx;
          const float w4 = fy * fx;

          const bool row_lo_ok = y_lo >= 0;
          const bool row_hi_ok = y_hi < src_height;
          const bool col_lo_ok = x_lo >= 0;
          const bool col_hi_ok = x_hi < src_width;

          // A source address is only formed when the neighbour is inside the image.
          const uint8_t* p1 = (row_lo_ok && col_lo_ok)
                                  ? src + y_lo * src_line_size + x_lo * 3
                                  : fill;
          const uint8_t* p2 = (row_lo_ok && col_hi_ok)
                                  ? src + y_lo * src_line_size + x_hi * 3
                                  : fill;
          const uint8_t* p3 = (row_hi_ok && col_lo_ok)
                                  ? src + y_hi * src_line_size + x_lo * 3
                                  : fill;
          const uint8_t* p4 = (row_hi_ok && col_hi_ok)
                                  ? src + y_hi * src_line_size + x_hi * 3
                                  : fill;

          c0 = w1 * p1[0] + w2 * p2[0] + w3 * p3[0] + w4 * p4[0];
          c1 = w1 * p1[1] + w2 * p2[1] + w3 * p3[1] + w4 * p4[1];
          c2 = w1 * p1[2] + w2 * p2[2] + w3 * p3[2] + w4 * p4[2];
      }

      // Interleaved -> planar, with the first and third channels exchanged
      // (the original comment describes this as BGR -> RGB) and each value
      // scaled by 1/255.
      const int area = dst_width * dst_height;
      float* out = dst + dy * dst_width + dx;
      *out = c2 / 255.0f;
      out += area;
      *out = c1 / 255.0f;
      out += area;
      *out = c0 / 255.0f;
  }
  // Resizes a 3-channel interleaved 8-bit image into a planar float buffer by
  // launching warpaffine_kernel on `stream`.
  //
  // The source is scaled uniformly by the largest factor that lets it fit inside
  // dst_width x dst_height and is centred in the destination; destination pixels
  // that map outside the source receive the fill value 128 (before the kernel's
  // division by 255).
  //
  // src         source pixels; the kernel dereferences this pointer, so it must
  //             be readable from device code. Rows are passed to the kernel as
  //             tightly packed (src_width * 3 bytes per row).
  // src_width   source width in pixels
  // src_height  source height in pixels
  // dst         output buffer written by the kernel: three planes of
  //             dst_width * dst_height floats
  // dst_width   destination width in pixels
  // dst_height  destination height in pixels
  // stream      CUDA stream the kernel is enqueued on
  //
  // The call returns immediately if any dimension is not positive. The kernel
  // launch is asynchronous and this function does not synchronise.
  // NOTE(review): launch errors are not inspected here; callers that need them
  // should query cudaGetLastError() after this call.
  void preprocess_kernel_img(
      uint8_t* src, int src_width, int src_height,
      float* dst, int dst_width, int dst_height,
      cudaStream_t stream) {
      // Non-positive sizes would divide by zero when computing `scale` and/or
      // request a zero-block grid, which is an invalid launch configuration.
      if (src_width <= 0 || src_height <= 0 || dst_width <= 0 || dst_height <= 0) {
          return;
      }

      // Source -> destination transform: uniform scale plus a translation that
      // centres the scaled source in the destination.
      AffineMatrix s2d, d2s;
      float scale = std::min(dst_height / (float)src_height, dst_width / (float)src_width);
      s2d.value[0] = scale;
      s2d.value[1] = 0;
      s2d.value[2] = -scale * src_width * 0.5 + dst_width * 0.5;
      s2d.value[3] = 0;
      s2d.value[4] = scale;
      s2d.value[5] = -scale * src_height * 0.5 + dst_height * 0.5;

      // The kernel needs the inverse (destination -> source) mapping. The result
      // is written to a Mat that owns its storage and then copied out, so the
      // memcpy never has overlapping source and destination.
      cv::Mat m2x3_s2d(2, 3, CV_32F, s2d.value);
      cv::Mat m2x3_d2s;
      cv::invertAffineTransform(m2x3_s2d, m2x3_d2s);
      const int affine_elems = 6;  // 2 rows x 3 columns
      memcpy(d2s.value, m2x3_d2s.ptr<float>(0), affine_elems * sizeof(float));

      // One thread per destination pixel. The block count uses integer
      // ceil-division: a float division loses precision above 2^24 pixels and
      // could leave the tail of the image unprocessed.
      int jobs = dst_height * dst_width;
      int threads = 256;
      int blocks = jobs / threads + (jobs % threads != 0 ? 1 : 0);
      const uint8_t fill_value = 128;
      warpaffine_kernel<<<blocks, threads, 0, stream>>>(
          src, src_width * 3, src_width,
          src_height, dst, dst_width,
          dst_height, fill_value, d2s, jobs);
  }