123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209 |
- #include <ATen/ATen.h>
- #include <cuda.h>
- #include <cuda_runtime.h>
- __global__
- void ChamferDistanceKernel(
- int b,
- int n,
- const float* xyz,
- int m,
- const float* xyz2,
- float* result,
- int* result_i)
- {
- const int batch=512;
- __shared__ float buf[batch*3];
- for (int i=blockIdx.x;i<b;i+=gridDim.x){
- for (int k2=0;k2<m;k2+=batch){
- int end_k=min(m,k2+batch)-k2;
- for (int j=threadIdx.x;j<end_k*3;j+=blockDim.x){
- buf[j]=xyz2[(i*m+k2)*3+j];
- }
- __syncthreads();
- for (int j=threadIdx.x+blockIdx.y*blockDim.x;j<n;j+=blockDim.x*gridDim.y){
- float x1=xyz[(i*n+j)*3+0];
- float y1=xyz[(i*n+j)*3+1];
- float z1=xyz[(i*n+j)*3+2];
- int best_i=0;
- float best=0;
- int end_ka=end_k-(end_k&3);
- if (end_ka==batch){
- for (int k=0;k<batch;k+=4){
- {
- float x2=buf[k*3+0]-x1;
- float y2=buf[k*3+1]-y1;
- float z2=buf[k*3+2]-z1;
- float d=x2*x2+y2*y2+z2*z2;
- if (k==0 || d<best){
- best=d;
- best_i=k+k2;
- }
- }
- {
- float x2=buf[k*3+3]-x1;
- float y2=buf[k*3+4]-y1;
- float z2=buf[k*3+5]-z1;
- float d=x2*x2+y2*y2+z2*z2;
- if (d<best){
- best=d;
- best_i=k+k2+1;
- }
- }
- {
- float x2=buf[k*3+6]-x1;
- float y2=buf[k*3+7]-y1;
- float z2=buf[k*3+8]-z1;
- float d=x2*x2+y2*y2+z2*z2;
- if (d<best){
- best=d;
- best_i=k+k2+2;
- }
- }
- {
- float x2=buf[k*3+9]-x1;
- float y2=buf[k*3+10]-y1;
- float z2=buf[k*3+11]-z1;
- float d=x2*x2+y2*y2+z2*z2;
- if (d<best){
- best=d;
- best_i=k+k2+3;
- }
- }
- }
- }else{
- for (int k=0;k<end_ka;k+=4){
- {
- float x2=buf[k*3+0]-x1;
- float y2=buf[k*3+1]-y1;
- float z2=buf[k*3+2]-z1;
- float d=x2*x2+y2*y2+z2*z2;
- if (k==0 || d<best){
- best=d;
- best_i=k+k2;
- }
- }
- {
- float x2=buf[k*3+3]-x1;
- float y2=buf[k*3+4]-y1;
- float z2=buf[k*3+5]-z1;
- float d=x2*x2+y2*y2+z2*z2;
- if (d<best){
- best=d;
- best_i=k+k2+1;
- }
- }
- {
- float x2=buf[k*3+6]-x1;
- float y2=buf[k*3+7]-y1;
- float z2=buf[k*3+8]-z1;
- float d=x2*x2+y2*y2+z2*z2;
- if (d<best){
- best=d;
- best_i=k+k2+2;
- }
- }
- {
- float x2=buf[k*3+9]-x1;
- float y2=buf[k*3+10]-y1;
- float z2=buf[k*3+11]-z1;
- float d=x2*x2+y2*y2+z2*z2;
- if (d<best){
- best=d;
- best_i=k+k2+3;
- }
- }
- }
- }
- for (int k=end_ka;k<end_k;k++){
- float x2=buf[k*3+0]-x1;
- float y2=buf[k*3+1]-y1;
- float z2=buf[k*3+2]-z1;
- float d=x2*x2+y2*y2+z2*z2;
- if (k==0 || d<best){
- best=d;
- best_i=k+k2;
- }
- }
- if (k2==0 || result[(i*n+j)]>best){
- result[(i*n+j)]=best;
- result_i[(i*n+j)]=best_i;
- }
- }
- __syncthreads();
- }
- }
- }
- void ChamferDistanceKernelLauncher(
- const int b, const int n,
- const float* xyz,
- const int m,
- const float* xyz2,
- float* result,
- int* result_i,
- float* result2,
- int* result2_i)
- {
- ChamferDistanceKernel<<<dim3(32,16,1),512>>>(b, n, xyz, m, xyz2, result, result_i);
- ChamferDistanceKernel<<<dim3(32,16,1),512>>>(b, m, xyz2, n, xyz, result2, result2_i);
- cudaError_t err = cudaGetLastError();
- if (err != cudaSuccess)
- printf("error in chamfer distance updateOutput: %s\n", cudaGetErrorString(err));
- }
- __global__
- void ChamferDistanceGradKernel(
- int b, int n,
- const float* xyz1,
- int m,
- const float* xyz2,
- const float* grad_dist1,
- const int* idx1,
- float* grad_xyz1,
- float* grad_xyz2)
- {
- for (int i = blockIdx.x; i<b; i += gridDim.x) {
- for (int j = threadIdx.x + blockIdx.y * blockDim.x; j < n; j += blockDim.x*gridDim.y) {
- float x1=xyz1[(i*n+j)*3+0];
- float y1=xyz1[(i*n+j)*3+1];
- float z1=xyz1[(i*n+j)*3+2];
- int j2=idx1[i*n+j];
- float x2=xyz2[(i*m+j2)*3+0];
- float y2=xyz2[(i*m+j2)*3+1];
- float z2=xyz2[(i*m+j2)*3+2];
- float g=grad_dist1[i*n+j]*2;
- atomicAdd(&(grad_xyz1[(i*n+j)*3+0]),g*(x1-x2));
- atomicAdd(&(grad_xyz1[(i*n+j)*3+1]),g*(y1-y2));
- atomicAdd(&(grad_xyz1[(i*n+j)*3+2]),g*(z1-z2));
- atomicAdd(&(grad_xyz2[(i*m+j2)*3+0]),-(g*(x1-x2)));
- atomicAdd(&(grad_xyz2[(i*m+j2)*3+1]),-(g*(y1-y2)));
- atomicAdd(&(grad_xyz2[(i*m+j2)*3+2]),-(g*(z1-z2)));
- }
- }
- }
- void ChamferDistanceGradKernelLauncher(
- const int b, const int n,
- const float* xyz1,
- const int m,
- const float* xyz2,
- const float* grad_dist1,
- const int* idx1,
- const float* grad_dist2,
- const int* idx2,
- float* grad_xyz1,
- float* grad_xyz2)
- {
- cudaMemset(grad_xyz1, 0, b*n*3*4);
- cudaMemset(grad_xyz2, 0, b*m*3*4);
- ChamferDistanceGradKernel<<<dim3(1,16,1), 256>>>(b, n, xyz1, m, xyz2, grad_dist1, idx1, grad_xyz1, grad_xyz2);
- ChamferDistanceGradKernel<<<dim3(1,16,1), 256>>>(b, m, xyz2, n, xyz1, grad_dist2, idx2, grad_xyz2, grad_xyz1);
- cudaError_t err = cudaGetLastError();
- if (err != cudaSuccess)
- printf("error in chamfer distance get grad: %s\n", cudaGetErrorString(err));
- }
|