cuda_src.py

cuda_src_forward = '''
__global__
void line_accum_forward_kernel(
        const float* __restrict__ feat,
        const float* tabCos,
        const float* tabSin,
        float* output,
        const int imWidth,
        const int imHeight,
        const int threadW,
        const int threadH,
        const int threadK,
        const int channelSize,
        const int batchSize,
        const int numangle,
        const int numrho)
{
    // Each thread votes a threadW x threadH pixel tile into threadK angle bins
    // of the (batch, channel, numangle, numrho) Hough accumulator.
    int batch   = blockIdx.y;
    int channel = blockIdx.x;
    int x = threadIdx.x*threadW;
    int y = threadIdx.y*threadH;
    int k = threadIdx.z*threadK;
    int imgStartIdx = batch*channelSize*imWidth*imHeight +
                      channel*imWidth*imHeight +
                      y*imWidth +
                      x;
    int angleStartIdx = k;
    if (x < imWidth && y < imHeight && channel < channelSize && batch < batchSize && k < numangle)
    {
        int imgIndex = imgStartIdx;
        int angleIndex;
        int outIndex;
        int r;
        for (int idY = 0; idY < threadH; idY++)
        {
            imgIndex = imgStartIdx + idY*imWidth;
            if (y + idY < imHeight)
            {
                for (int idX = 0; idX < threadW; idX++)
                {
                    if (x + idX < imWidth)
                    {
                        for (int idK = 0; idK < threadK; idK++)
                        {
                            angleIndex = angleStartIdx + idK;
                            if (angleIndex < numangle)
                            {
                                // Centre the pixel, quantise rho, and accumulate the feature value.
                                int xx = x + idX - imWidth / 2, yy = y + idY - imHeight / 2;
                                r = ::round(float(xx) * tabCos[angleIndex] + float(yy) * tabSin[angleIndex]);
                                r += numrho / 2;
                                outIndex = batch*channelSize*numangle*numrho + numangle*numrho*channel + angleIndex*numrho + r;
                                float val = feat[imgIndex];
                                atomicAdd(&(output[outIndex]), val);
                            }
                            else break;
                        }
                        imgIndex++;
                    }
                    else break;
                }
            }
            else break;
        }
    }
}
using namespace std;
int blockSizeX = std::min(8, in0_shape3);
const int threadW = ceil(in0_shape3/(float)blockSizeX);
int blockSizeY = std::min(8, in0_shape2);
const int threadH = ceil(in0_shape2/(float)blockSizeY);
int blockSizeZ = std::min(8, #numangle);
const int threadK = ceil(#numangle/(float)blockSizeZ);
const dim3 blocks(in0_shape1, in0_shape0);
const dim3 threads(blockSizeX, blockSizeY, blockSizeZ);
// Zero the accumulator, then launch one block per (channel, batch) pair.
cudaMemsetAsync(out0_p, 0, out0->size, 0);
line_accum_forward_kernel<<<blocks, threads>>>(
    in0_p,
    in1_p,
    in2_p,
    out0_p,
    in0_shape3,
    in0_shape2,
    threadW,
    threadH,
    threadK,
    in0_shape1,
    in0_shape0,
    #numangle,
    #numrho
);
'''
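
# A minimal, hypothetical sketch of how cuda_src_forward might be bound to a
# Jittor code op. The helper name `dht_forward`, the plain string substitution
# of the #numangle / #numrho placeholders, and the input ordering are
# assumptions for illustration, not code taken from this file; the source above
# reads in0 as the feature map, in1/in2 as the cos/sin tables, and writes the
# accumulator to out0.
import jittor as jt

def dht_forward(feat, tabCos, tabSin, numangle, numrho):
    # feat: [B, C, H, W] float32; tabCos/tabSin: length-numangle float32 tables.
    src = (cuda_src_forward
           .replace('#numangle', str(numangle))
           .replace('#numrho', str(numrho)))
    out_shape = (feat.shape[0], feat.shape[1], numangle, numrho)
    # The backward source below would be registered analogously (e.g. via a
    # second jt.code call or cuda_grad_src) so gradients flow back to feat.
    return jt.code(out_shape, feat.dtype, [feat, tabCos, tabSin], cuda_src=src)
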
cuda_src_backward = '''
__global__
void line_accum_backward_kernel(
        float* grad_in,
        const float* grad_out,
        const float* tabCos,
        const float* tabSin,
        const int imWidth,
        const int imHeight,
        const int threadW,
        const int threadH,
        const int threadK,
        const int channelSize,
        const int batchSize,
        const int numangle,
        const int numrho)
{
    int batch   = blockIdx.y;
    int channel = blockIdx.x;
    int x = threadIdx.x*threadW;
    int y = threadIdx.y*threadH;
    int k = threadIdx.z*threadK;
    int imgStartIdx = batch*channelSize*imWidth*imHeight +
                      channel*imWidth*imHeight +
                      y*imWidth +
                      x;
    int angleStartIdx = k;
    if (x < imWidth && y < imHeight && channel < channelSize && batch < batchSize && k < numangle)
    {
        int imgIndex = imgStartIdx;
        int angleIndex;
        int outIndex;
        int r;
        for (int idY = 0; idY < threadH; idY++)
        {
            imgIndex = imgStartIdx + idY*imWidth;
            if (y + idY < imHeight)
            {
                for (int idX = 0; idX < threadW; idX++)
                {
                    if (x + idX < imWidth)
                    {
                        for (int idK = 0; idK < threadK; idK++)
                        {
                            angleIndex = angleStartIdx + idK;
                            if (angleIndex < numangle)
                            {
                                int xx = x + idX - imWidth / 2, yy = y + idY - imHeight / 2;
                                r = ::round(float(xx)*tabCos[angleIndex] + float(yy)*tabSin[angleIndex]);
                                r += numrho / 2;
                                outIndex = batch*channelSize*numangle*numrho + numangle*numrho*channel + angleIndex*numrho + r;
                                float val = grad_out[outIndex];
                                atomicAdd(&(grad_in[imgIndex]), val);
                            }
                            else break;
                        }
                        imgIndex++;
                    }
                    else break;
                }
            }
            else break;
        }
    }
}
using namespace std;
int blockSizeX = std::min(8, in1_shape3);
const int threadW = ceil(in1_shape3/(float)blockSizeX);
int blockSizeY = std::min(8, in1_shape2);
const int threadH = ceil(in1_shape2/(float)blockSizeY);
int blockSizeZ = std::min(8, #numangle);
const int threadK = ceil(#numangle/(float)blockSizeZ);
const dim3 blocks(in1_shape1, in1_shape0);
const dim3 threads(blockSizeX, blockSizeY, blockSizeZ);
// Zero the image-shaped gradient before scattering votes back into it.
cudaMemsetAsync(out0_p, 0, out0->size, 0);
line_accum_backward_kernel<<<blocks, threads>>>(
    out0_p,
    in1_p,
    in2_p,
    in3_p,
    in1_shape3,
    in1_shape2,
    threadW,
    threadH,
    threadK,
    in1_shape1,
    in1_shape0,
    #numangle,
    #numrho
);
'''
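
# A sketch of how the tabCos / tabSin tables consumed by both kernels might be
# built. It assumes the usual Deep Hough Transform convention (not shown in
# this file): theta covers [0, pi) in `numangle` steps and the rho step `irho`
# is folded into the tables, so round(x*tabCos[k] + y*tabSin[k]) + numrho/2
# indexes directly into the numrho accumulator bins. The helper name and the
# exact irho formula are assumptions for illustration.
import math
import numpy as np

def make_tabs(numangle, numrho, H, W):
    # Assumed convention: rho spans the image diagonal, centred at numrho/2.
    irho = (int(math.sqrt(H * H + W * W)) + 1) / float(numrho - 1)
    itheta = math.pi / numangle
    angles = np.arange(numangle) * itheta
    tabCos = (np.cos(angles) / irho).astype(np.float32)
    tabSin = (np.sin(angles) / irho).astype(np.float32)
    return tabCos, tabSin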