123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206 |
- cuda_src_forward = '''
- __global__
- void line_accum_forward_kernel(
- const float* __restrict__ feat,
- const float* tabCos,
- const float* tabSin,
- float* output,
- const int imWidth,
- const int imHeight,
- const int threadW,
- const int threadH,
- const int threadK,
- const int channelSize,
- const int batchSize,
- const int numangle,
- const int numrho)
- {
- int batch = blockIdx.y;
- int channel = blockIdx.x;
- int x = threadIdx.x*threadW;
- int y = threadIdx.y*threadH;
- int k = threadIdx.z*threadK;
- int imgStartIdx = batch*channelSize*imWidth*imHeight+
- channel*imWidth*imHeight+
- y*imWidth+
- x;
-
- int angleStartIdx = k;
-
- if (x < imWidth && y < imHeight && channel < channelSize && batch < batchSize && k < numangle)
- {
- int imgIndex = imgStartIdx;
- int angleIndex;
- int outIndex;
- int r;
- for (int idY=0; idY < threadH; idY++)
- {
- imgIndex = imgStartIdx + idY*imWidth;
- // labelIndex = labelStartIdx + idY*imWidth;
- if (y+idY < imHeight)
- {
- for (int idX=0; idX<threadW; idX++)
- {
- if (x + idX < imWidth)
- {
- for (int idK=0; idK<threadK; idK++)
- {
- angleIndex = angleStartIdx + idK;
- if(angleIndex < numangle)
- {
- int xx = x + idX - imWidth / 2, yy = y + idY - imHeight / 2;
- r = ::round(float(xx) * (tabCos[angleIndex]) + float(yy) * (tabSin[angleIndex]));
- r += ((numrho) / 2);
- outIndex = batch*channelSize*numangle*numrho + numangle*numrho*channel + angleIndex*numrho + r;
- float val = feat[imgIndex];
- atomicAdd(&(output[outIndex]), val);
-
- }
- else break;
- }
- imgIndex++;
- }
- else break;
- }
- }
- else break;
- }
- }
- }
- using namespace std;
- int blockSizeX = std::min(8, in0_shape3);
- const int threadW = ceil(in0_shape3/(float)blockSizeX);
- int blockSizeY = std::min(8, in0_shape2);
- const int threadH = ceil(in0_shape3/(float)blockSizeY);
- int blockSizeZ = std::min(8, #numangle);
- const int threadK = ceil(#numangle/(float)blockSizeZ);
- const dim3 blocks(in0_shape1, in0_shape0);
- const dim3 threads(blockSizeX, blockSizeY, blockSizeZ);
- cudaMemsetAsync(out0_p, 0, out0->size, 0);
- line_accum_forward_kernel<<<blocks, threads>>>(
- in0_p,
- in1_p,
- in2_p,
- out0_p,
- in0_shape3,
- in0_shape2,
- threadW,
- threadH,
- threadK,
- in0_shape1,
- in0_shape0,
- #numangle,
- #numrho
- );
- '''
- cuda_src_backward = '''
- __global__
- void line_accum_backward_kernel(
- float* grad_in,
- const float* grad_out,
- const float* tabCos,
- const float* tabSin,
- const int imWidth,
- const int imHeight,
- const int threadW,
- const int threadH,
- const int threadK,
- const int channelSize,
- const int batchSize,
- const int numangle,
- const int numrho)
- {
- int batch = blockIdx.y;
- int channel = blockIdx.x;
- int x = threadIdx.x*threadW;
- int y = threadIdx.y*threadH;
- int k = threadIdx.z*threadK;
- int imgStartIdx = batch*channelSize*imWidth*imHeight+
- channel*imWidth*imHeight+
- y*imWidth+
- x;
- int angleStartIdx = k;
- if (x < imWidth && y < imHeight && channel < channelSize && batch < batchSize && k < numangle)
- {
- int imgIndex = imgStartIdx;
- int angleIndex;
- int outIndex;
- int r;
- for (int idY=0; idY < threadH; idY++)
- {
- imgIndex = imgStartIdx + idY*imWidth;
- if (y+idY < imHeight)
- {
- for (int idX=0; idX<threadW; idX++)
- {
- if (x + idX < imWidth)
- {
- for (int idK=0; idK<threadK; idK++)
- {
- angleIndex = angleStartIdx + idK;
- if(angleIndex < numangle)
- {
- int xx = x + idX - imWidth / 2, yy = y + idY - imHeight / 2;
- r = std::round(float(xx)*tabCos[angleIndex] + float(yy)*tabSin[angleIndex]);
- r += ((numrho) / 2);
- outIndex = batch*channelSize*numangle*numrho + numangle*numrho*channel + angleIndex*numrho + r;
- float val = grad_out[outIndex];
- atomicAdd(&(grad_in[imgIndex]), val);
- }
- else break;
- }
- imgIndex++;
- }
- else break;
- }
- }
- else break;
- }
- }
- }
- using namespace std;
- int blockSizeX = std::min(8, in1_shape3);
- const int threadW = ceil(in1_shape3/(float)blockSizeX);
- int blockSizeY = std::min(8, in1_shape2);
- const int threadH = ceil(in1_shape3/(float)blockSizeY);
- int blockSizeZ = std::min(8, #numangle);
- const int threadK = ceil(#numangle/(float)blockSizeZ);
- const dim3 blocks(in1_shape1, in1_shape0);
- const dim3 threads(blockSizeX, blockSizeY, blockSizeZ);
- cudaMemsetAsync(out1_p, 0, out0->size, 0);
- line_accum_backward_kernel<<<blocks, threads>>>(
- out0_p,
- in1_p,
- in2_p,
- in3_p,
- in1_shape3,
- in1_shape2,
- threadW,
- threadH,
- threadK,
- in1_shape1,
- in1_shape0,
- #numangle,
- #numrho
- );
- '''
|