rbox_iou_op.cu

// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// The code is based on https://github.com/csuhan/s2anet/blob/master/mmdet/ops/box_iou_rotated
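//
// Overview (added note): this file implements the GPU kernel of a pairwise
// rotated-box IoU op. Given rbox1 of shape [N, 5] and rbox2 of shape [M, 5],
// it fills an [N, M] output where element (i, j) is the IoU of rbox1[i] and
// rbox2[j]. Each row is assumed to hold (x_ctr, y_ctr, w, h, angle), the
// 5-value layout consumed by rbox_iou_single from rbox_iou_op.h.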
#include "rbox_iou_op.h"
#include "paddle/extension.h"

// 2D block with 32 * 16 = 512 threads per block
const int BLOCK_DIM_X = 32;
const int BLOCK_DIM_Y = 16;
/**
   Computes ceil(a / b)
*/
static inline int CeilDiv(const int a, const int b) {
  return (a + b - 1) / b;
}
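// Example (added note): CeilDiv(1000, 32) == 32, so 1000 boxes along x are
// covered by 32 blocks of 32 threads; the last block handles only 8 boxes.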
template <typename T>
__global__ void rbox_iou_cuda_kernel(
    const int rbox1_num,
    const int rbox2_num,
    const T* rbox1_data_ptr,
    const T* rbox2_data_ptr,
    T* output_data_ptr) {
  // first rbox1 / rbox2 index covered by this thread block
  const int rbox1_block_idx = blockIdx.x * blockDim.x;
  const int rbox2_block_idx = blockIdx.y * blockDim.y;

  // number of valid boxes in this block's tile (partial at the edges)
  const int rbox1_thread_num = min(rbox1_num - rbox1_block_idx, blockDim.x);
  const int rbox2_thread_num = min(rbox2_num - rbox2_block_idx, blockDim.y);

  __shared__ T block_boxes1[BLOCK_DIM_X * 5];
  __shared__ T block_boxes2[BLOCK_DIM_Y * 5];
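  // Tiling scheme (added note): each thread block computes one
  // BLOCK_DIM_X x BLOCK_DIM_Y tile of the [rbox1_num, rbox2_num] output
  // matrix. The tile's boxes are staged once into the shared arrays above,
  // so each thread reads its pair of boxes from shared memory instead of
  // re-reading global memory for every IoU it computes.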
  // Loading with threadIdx.x is safe here: only the threadIdx.y == 0 row of
  // threads copies, and threadIdx.x spans the full BLOCK_DIM_X-wide tile.
  if (threadIdx.x < rbox1_thread_num && threadIdx.y == 0) {
    block_boxes1[threadIdx.x * 5 + 0] =
        rbox1_data_ptr[(rbox1_block_idx + threadIdx.x) * 5 + 0];
    block_boxes1[threadIdx.x * 5 + 1] =
        rbox1_data_ptr[(rbox1_block_idx + threadIdx.x) * 5 + 1];
    block_boxes1[threadIdx.x * 5 + 2] =
        rbox1_data_ptr[(rbox1_block_idx + threadIdx.x) * 5 + 2];
    block_boxes1[threadIdx.x * 5 + 3] =
        rbox1_data_ptr[(rbox1_block_idx + threadIdx.x) * 5 + 3];
    block_boxes1[threadIdx.x * 5 + 4] =
        rbox1_data_ptr[(rbox1_block_idx + threadIdx.x) * 5 + 4];
  }
  // rbox2_thread_num <= BLOCK_DIM_Y <= BLOCK_DIM_X, so threadIdx.x can also
  // index the rbox2 tile; reuse the same threadIdx.y == 0 guard as above.
  if (threadIdx.x < rbox2_thread_num && threadIdx.y == 0) {
    block_boxes2[threadIdx.x * 5 + 0] =
        rbox2_data_ptr[(rbox2_block_idx + threadIdx.x) * 5 + 0];
    block_boxes2[threadIdx.x * 5 + 1] =
        rbox2_data_ptr[(rbox2_block_idx + threadIdx.x) * 5 + 1];
    block_boxes2[threadIdx.x * 5 + 2] =
        rbox2_data_ptr[(rbox2_block_idx + threadIdx.x) * 5 + 2];
    block_boxes2[threadIdx.x * 5 + 3] =
        rbox2_data_ptr[(rbox2_block_idx + threadIdx.x) * 5 + 3];
    block_boxes2[threadIdx.x * 5 + 4] =
        rbox2_data_ptr[(rbox2_block_idx + threadIdx.x) * 5 + 4];
  }
  // wait until both shared-memory tiles are fully populated
  __syncthreads();

  if (threadIdx.x < rbox1_thread_num && threadIdx.y < rbox2_thread_num) {
    // row-major offset into the [rbox1_num, rbox2_num] output matrix
    int offset = (rbox1_block_idx + threadIdx.x) * rbox2_num +
                 rbox2_block_idx + threadIdx.y;
    output_data_ptr[offset] = rbox_iou_single<T>(
        block_boxes1 + threadIdx.x * 5, block_boxes2 + threadIdx.y * 5);
  }
}
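// Worked example of the thread-to-output mapping (added note): with the
// 32 x 16 launch below, thread (threadIdx.x = 2, threadIdx.y = 3) in block
// (blockIdx.x = 1, blockIdx.y = 0) handles output element
// (1 * 32 + 2, 0 * 16 + 3) = (34, 3), i.e. offset 34 * rbox2_num + 3.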
#define CHECK_INPUT_GPU(x) \
  PD_CHECK(x.place() == paddle::PlaceType::kGPU, #x " must be a GPU Tensor.")

std::vector<paddle::Tensor> RboxIouCUDAForward(const paddle::Tensor& rbox1,
                                               const paddle::Tensor& rbox2) {
  CHECK_INPUT_GPU(rbox1);
  CHECK_INPUT_GPU(rbox2);

  auto rbox1_num = rbox1.shape()[0];
  auto rbox2_num = rbox2.shape()[0];

  // [rbox1_num, rbox2_num] result matrix, allocated on the GPU
  auto output = paddle::Tensor(paddle::PlaceType::kGPU, {rbox1_num, rbox2_num});

  // one 32 x 16 thread block per tile of the output matrix
  const int blocks_x = CeilDiv(rbox1_num, BLOCK_DIM_X);
  const int blocks_y = CeilDiv(rbox2_num, BLOCK_DIM_Y);

  dim3 blocks(blocks_x, blocks_y);
  dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y);

  // dispatch on the floating-point dtype and launch on rbox1's CUDA stream
  PD_DISPATCH_FLOATING_TYPES(
      rbox1.type(), "rbox_iou_cuda_kernel", ([&] {
        rbox_iou_cuda_kernel<data_t><<<blocks, threads, 0, rbox1.stream()>>>(
            rbox1_num, rbox2_num, rbox1.data<data_t>(), rbox2.data<data_t>(),
            output.mutable_data<data_t>());
      }));

  return {output};
}
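
// Registration sketch (added note, not part of the original file): in the
// source repo this kernel is exposed through a companion rbox_iou_op.cc that
// registers the op with Paddle's custom-operator API. A minimal registration,
// with illustrative input/output names, would look roughly like:
//
//   PD_BUILD_OP(rbox_iou)
//       .Inputs({"RBox1", "RBox2"})
//       .Outputs({"Output"})
//       .SetKernelFn(PD_KERNEL(RboxIouCUDAForward));
//
// The compiled op can then be built and loaded from Python with
// paddle.utils.cpp_extension; the exact names depend on the real registration
// in rbox_iou_op.cc.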