// NOTE(review): the text below appears to be damaged and will not compile as
// shown. Spans that would sit between '<' and '>' seem to have been dropped:
//   - the three #include directives carry no header names;
//   - two kernel launch sites read `<<>>`, with no execution configuration;
//   - two loop headers break off mid-condition (`;ibest){` and `; i>>(`),
//     so most of the first kernel body, the rest of the gradient kernel and
//     the header of a following function are not present.
// Restore the missing parts from the upstream copy of this file; do not guess
// them (launch dimensions in particular are not recoverable from this text).
//
// What the surviving fragments establish:
//
// ChamferDistanceKernel(b, n, xyz, m, xyz2, result, result_i)
//   __global__ kernel. Declares a __shared__ float buffer of batch*3 = 1536
//   elements and opens an outer loop starting at i = blockIdx.x. The visible
//   tail of the body stores `best` into result[i*n+j] and `best_i` into
//   result_i[i*n+j] under a condition ending in `best)`, and calls
//   __syncthreads() after the enclosing inner scope closes.
//   Presumably a tiled nearest-neighbour search from each point of xyz into
//   xyz2 (distance in result, index in result_i) — TODO confirm against the
//   restored body.
//
// ChamferDistanceKernelLauncher(b, n, xyz, m, xyz2, result, result_i,
//                               result2, result2_i)
//   Non-kernel wrapper. Launches ChamferDistanceKernel twice: first with
//   (b, n, xyz, m, xyz2, result, result_i), then with the two point arrays and
//   their counts swapped (b, m, xyz2, n, xyz, result2, result2_i). Afterwards
//   it reads cudaGetLastError() and, on failure, only printf()s a message; the
//   function returns void, so the error is not reported to the caller.
//
// ChamferDistanceGradKernel(b, n, xyz1, m, xyz2, grad_dist1, idx1,
//                           grad_xyz1, grad_xyz2)
//   __global__ kernel. Only the signature and the start of a loop at
//   i = blockIdx.x survive.
//
// Trailing fragment (after `; i>>(`)
//   Two ChamferDistanceGradKernel launches, the second with the roles of the
//   two point sets swapped and using grad_dist2 / idx2. Those two names are
//   not declared in any signature visible here, so they presumably belong to
//   the parameter list of a second launcher whose header is missing — verify.
//   The fragment ends with the same cudaGetLastError()/printf pattern as the
//   first launcher.
#include #include #include __global__ void ChamferDistanceKernel( int b, int n, const float* xyz, int m, const float* xyz2, float* result, int* result_i) { const int batch=512; __shared__ float buf[batch*3]; for (int i=blockIdx.x;ibest){ result[(i*n+j)]=best; result_i[(i*n+j)]=best_i; } } __syncthreads(); } } } void ChamferDistanceKernelLauncher( const int b, const int n, const float* xyz, const int m, const float* xyz2, float* result, int* result_i, float* result2, int* result2_i) { ChamferDistanceKernel<<>>(b, n, xyz, m, xyz2, result, result_i); ChamferDistanceKernel<<>>(b, m, xyz2, n, xyz, result2, result2_i); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) printf("error in chamfer distance updateOutput: %s\n", cudaGetErrorString(err)); } __global__ void ChamferDistanceGradKernel( int b, int n, const float* xyz1, int m, const float* xyz2, const float* grad_dist1, const int* idx1, float* grad_xyz1, float* grad_xyz2) { for (int i = blockIdx.x; i>>(b, n, xyz1, m, xyz2, grad_dist1, idx1, grad_xyz1, grad_xyz2); ChamferDistanceGradKernel<<>>(b, m, xyz2, n, xyz1, grad_dist2, idx2, grad_xyz2, grad_xyz1); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) printf("error in chamfer distance get grad: %s\n", cudaGetErrorString(err)); }