yololayer.cu 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313
  1. #include <assert.h>
  2. #include <vector>
  3. #include <iostream>
  4. #include "yololayer.h"
  5. #include "cuda_utils.h"
  6. namespace Tn
  7. {
  8. template<typename T>
  9. void write(char*& buffer, const T& val)
  10. {
  11. *reinterpret_cast<T*>(buffer) = val;
  12. buffer += sizeof(T);
  13. }
  14. template<typename T>
  15. void read(const char*& buffer, T& val)
  16. {
  17. val = *reinterpret_cast<const T*>(buffer);
  18. buffer += sizeof(T);
  19. }
  20. }
  21. using namespace Yolo;
  22. namespace nvinfer1
  23. {
  24. YoloLayerPlugin::YoloLayerPlugin(int classCount, int netWidth, int netHeight, int maxOut, const std::vector<Yolo::YoloKernel>& vYoloKernel)
  25. {
  26. mClassCount = classCount;
  27. mYoloV5NetWidth = netWidth;
  28. mYoloV5NetHeight = netHeight;
  29. mMaxOutObject = maxOut;
  30. mYoloKernel = vYoloKernel;
  31. mKernelCount = vYoloKernel.size();
  32. CUDA_CHECK(cudaMallocHost(&mAnchor, mKernelCount * sizeof(void*)));
  33. size_t AnchorLen = sizeof(float)* CHECK_COUNT * 2;
  34. for (int ii = 0; ii < mKernelCount; ii++)
  35. {
  36. CUDA_CHECK(cudaMalloc(&mAnchor[ii], AnchorLen));
  37. const auto& yolo = mYoloKernel[ii];
  38. CUDA_CHECK(cudaMemcpy(mAnchor[ii], yolo.anchors, AnchorLen, cudaMemcpyHostToDevice));
  39. }
  40. }
  41. YoloLayerPlugin::~YoloLayerPlugin()
  42. {
  43. for (int ii = 0; ii < mKernelCount; ii++)
  44. {
  45. CUDA_CHECK(cudaFree(mAnchor[ii]));
  46. }
  47. CUDA_CHECK(cudaFreeHost(mAnchor));
  48. }
  49. // create the plugin at runtime from a byte stream
  50. YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length)
  51. {
  52. using namespace Tn;
  53. const char *d = reinterpret_cast<const char *>(data), *a = d;
  54. read(d, mClassCount);
  55. read(d, mThreadCount);
  56. read(d, mKernelCount);
  57. read(d, mYoloV5NetWidth);
  58. read(d, mYoloV5NetHeight);
  59. read(d, mMaxOutObject);
  60. mYoloKernel.resize(mKernelCount);
  61. auto kernelSize = mKernelCount * sizeof(YoloKernel);
  62. memcpy(mYoloKernel.data(), d, kernelSize);
  63. d += kernelSize;
  64. CUDA_CHECK(cudaMallocHost(&mAnchor, mKernelCount * sizeof(void*)));
  65. size_t AnchorLen = sizeof(float)* CHECK_COUNT * 2;
  66. for (int ii = 0; ii < mKernelCount; ii++)
  67. {
  68. CUDA_CHECK(cudaMalloc(&mAnchor[ii], AnchorLen));
  69. const auto& yolo = mYoloKernel[ii];
  70. CUDA_CHECK(cudaMemcpy(mAnchor[ii], yolo.anchors, AnchorLen, cudaMemcpyHostToDevice));
  71. }
  72. assert(d == a + length);
  73. }
  74. void YoloLayerPlugin::serialize(void* buffer) const TRT_NOEXCEPT
  75. {
  76. using namespace Tn;
  77. char* d = static_cast<char*>(buffer), *a = d;
  78. write(d, mClassCount);
  79. write(d, mThreadCount);
  80. write(d, mKernelCount);
  81. write(d, mYoloV5NetWidth);
  82. write(d, mYoloV5NetHeight);
  83. write(d, mMaxOutObject);
  84. auto kernelSize = mKernelCount * sizeof(YoloKernel);
  85. memcpy(d, mYoloKernel.data(), kernelSize);
  86. d += kernelSize;
  87. assert(d == a + getSerializationSize());
  88. }
  89. size_t YoloLayerPlugin::getSerializationSize() const TRT_NOEXCEPT
  90. {
  91. return sizeof(mClassCount) + sizeof(mThreadCount) + sizeof(mKernelCount) + sizeof(Yolo::YoloKernel) * mYoloKernel.size() + sizeof(mYoloV5NetWidth) + sizeof(mYoloV5NetHeight) + sizeof(mMaxOutObject);
  92. }
  93. int YoloLayerPlugin::initialize() TRT_NOEXCEPT
  94. {
  95. return 0;
  96. }
  97. Dims YoloLayerPlugin::getOutputDimensions(int index, const Dims* inputs, int nbInputDims) TRT_NOEXCEPT
  98. {
  99. //output the result to channel
  100. int totalsize = mMaxOutObject * sizeof(Detection) / sizeof(float);
  101. return Dims3(totalsize + 1, 1, 1);
  102. }
  103. // Set plugin namespace
  104. void YoloLayerPlugin::setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT
  105. {
  106. mPluginNamespace = pluginNamespace;
  107. }
  108. const char* YoloLayerPlugin::getPluginNamespace() const TRT_NOEXCEPT
  109. {
  110. return mPluginNamespace;
  111. }
  112. // Return the DataType of the plugin output at the requested index
  113. DataType YoloLayerPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT
  114. {
  115. return DataType::kFLOAT;
  116. }
  117. // Return true if output tensor is broadcast across a batch.
  118. bool YoloLayerPlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const TRT_NOEXCEPT
  119. {
  120. return false;
  121. }
  122. // Return true if plugin can use input that is broadcast across batch without replication.
  123. bool YoloLayerPlugin::canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT
  124. {
  125. return false;
  126. }
  127. void YoloLayerPlugin::configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) TRT_NOEXCEPT
  128. {
  129. }
  130. // Attach the plugin object to an execution context and grant the plugin the access to some context resource.
  131. void YoloLayerPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) TRT_NOEXCEPT
  132. {
  133. }
  134. // Detach the plugin object from its execution context.
  135. void YoloLayerPlugin::detachFromContext() TRT_NOEXCEPT {}
  136. const char* YoloLayerPlugin::getPluginType() const TRT_NOEXCEPT
  137. {
  138. return "YoloLayer_TRT";
  139. }
  140. const char* YoloLayerPlugin::getPluginVersion() const TRT_NOEXCEPT
  141. {
  142. return "1";
  143. }
  144. void YoloLayerPlugin::destroy() TRT_NOEXCEPT
  145. {
  146. delete this;
  147. }
  148. // Clone the plugin
  149. IPluginV2IOExt* YoloLayerPlugin::clone() const TRT_NOEXCEPT
  150. {
  151. YoloLayerPlugin* p = new YoloLayerPlugin(mClassCount, mYoloV5NetWidth, mYoloV5NetHeight, mMaxOutObject, mYoloKernel);
  152. p->setPluginNamespace(mPluginNamespace);
  153. return p;
  154. }
  155. __device__ float Logist(float data) { return 1.0f / (1.0f + expf(-data)); };
  156. __global__ void CalDetection(const float *input, float *output, int noElements,
  157. const int netwidth, const int netheight, int maxoutobject, int yoloWidth, int yoloHeight, const float anchors[CHECK_COUNT * 2], int classes, int outputElem)
  158. {
  159. int idx = threadIdx.x + blockDim.x * blockIdx.x;
  160. if (idx >= noElements) return;
  161. int total_grid = yoloWidth * yoloHeight;
  162. int bnIdx = idx / total_grid;
  163. idx = idx - total_grid * bnIdx;
  164. int info_len_i = 5 + classes;
  165. const float* curInput = input + bnIdx * (info_len_i * total_grid * CHECK_COUNT);
  166. for (int k = 0; k < CHECK_COUNT; ++k) {
  167. float box_prob = Logist(curInput[idx + k * info_len_i * total_grid + 4 * total_grid]);
  168. if (box_prob < IGNORE_THRESH) continue;
  169. int class_id = 0;
  170. float max_cls_prob = 0.0;
  171. for (int i = 5; i < info_len_i; ++i) {
  172. float p = Logist(curInput[idx + k * info_len_i * total_grid + i * total_grid]);
  173. if (p > max_cls_prob) {
  174. max_cls_prob = p;
  175. class_id = i - 5;
  176. }
  177. }
  178. float *res_count = output + bnIdx * outputElem;
  179. int count = (int)atomicAdd(res_count, 1);
  180. if (count >= maxoutobject) return;
  181. char *data = (char*)res_count + sizeof(float) + count * sizeof(Detection);
  182. Detection *det = (Detection*)(data);
  183. int row = idx / yoloWidth;
  184. int col = idx % yoloWidth;
  185. //Location
  186. // pytorch:
  187. // y = x[i].sigmoid()
  188. // y[..., 0:2] = (y[..., 0:2] * 2. - 0.5 + self.grid[i].to(x[i].device)) * self.stride[i] # xy
  189. // y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i] # wh
  190. // X: (sigmoid(tx) + cx)/FeaturemapW * netwidth
  191. det->bbox[0] = (col - 0.5f + 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 0 * total_grid])) * netwidth / yoloWidth;
  192. det->bbox[1] = (row - 0.5f + 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 1 * total_grid])) * netheight / yoloHeight;
  193. // W: (Pw * e^tw) / FeaturemapW * netwidth
  194. // v5: https://github.com/ultralytics/yolov5/issues/471
  195. det->bbox[2] = 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 2 * total_grid]);
  196. det->bbox[2] = det->bbox[2] * det->bbox[2] * anchors[2 * k];
  197. det->bbox[3] = 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 3 * total_grid]);
  198. det->bbox[3] = det->bbox[3] * det->bbox[3] * anchors[2 * k + 1];
  199. det->conf = box_prob * max_cls_prob;
  200. det->class_id = class_id;
  201. }
  202. }
  203. void YoloLayerPlugin::forwardGpu(const float* const* inputs, float *output, cudaStream_t stream, int batchSize)
  204. {
  205. int outputElem = 1 + mMaxOutObject * sizeof(Detection) / sizeof(float);
  206. for (int idx = 0; idx < batchSize; ++idx) {
  207. CUDA_CHECK(cudaMemsetAsync(output + idx * outputElem, 0, sizeof(float), stream));
  208. }
  209. int numElem = 0;
  210. for (unsigned int i = 0; i < mYoloKernel.size(); ++i) {
  211. const auto& yolo = mYoloKernel[i];
  212. numElem = yolo.width * yolo.height * batchSize;
  213. if (numElem < mThreadCount) mThreadCount = numElem;
  214. //printf("Net: %d %d \n", mYoloV5NetWidth, mYoloV5NetHeight);
  215. CalDetection << < (numElem + mThreadCount - 1) / mThreadCount, mThreadCount, 0, stream >> >
  216. (inputs[i], output, numElem, mYoloV5NetWidth, mYoloV5NetHeight, mMaxOutObject, yolo.width, yolo.height, (float*)mAnchor[i], mClassCount, outputElem);
  217. }
  218. }
  219. int YoloLayerPlugin::enqueue(int batchSize, const void* const* inputs, void* TRT_CONST_ENQUEUE* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT
  220. {
  221. forwardGpu((const float* const*)inputs, (float*)outputs[0], stream, batchSize);
  222. return 0;
  223. }
  224. PluginFieldCollection YoloPluginCreator::mFC{};
  225. std::vector<PluginField> YoloPluginCreator::mPluginAttributes;
  226. YoloPluginCreator::YoloPluginCreator()
  227. {
  228. mPluginAttributes.clear();
  229. mFC.nbFields = mPluginAttributes.size();
  230. mFC.fields = mPluginAttributes.data();
  231. }
  232. const char* YoloPluginCreator::getPluginName() const TRT_NOEXCEPT
  233. {
  234. return "YoloLayer_TRT";
  235. }
  236. const char* YoloPluginCreator::getPluginVersion() const TRT_NOEXCEPT
  237. {
  238. return "1";
  239. }
  240. const PluginFieldCollection* YoloPluginCreator::getFieldNames() TRT_NOEXCEPT
  241. {
  242. return &mFC;
  243. }
  244. IPluginV2IOExt* YoloPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT
  245. {
  246. assert(fc->nbFields == 2);
  247. assert(strcmp(fc->fields[0].name, "netinfo") == 0);
  248. assert(strcmp(fc->fields[1].name, "kernels") == 0);
  249. int *p_netinfo = (int*)(fc->fields[0].data);
  250. int class_count = p_netinfo[0];
  251. int input_w = p_netinfo[1];
  252. int input_h = p_netinfo[2];
  253. int max_output_object_count = p_netinfo[3];
  254. std::vector<Yolo::YoloKernel> kernels(fc->fields[1].length);
  255. memcpy(&kernels[0], fc->fields[1].data, kernels.size() * sizeof(Yolo::YoloKernel));
  256. YoloLayerPlugin* obj = new YoloLayerPlugin(class_count, input_w, input_h, max_output_object_count, kernels);
  257. obj->setPluginNamespace(mNamespace.c_str());
  258. return obj;
  259. }
  260. IPluginV2IOExt* YoloPluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT
  261. {
  262. // This object will be deleted when the network is destroyed, which will
  263. // call YoloLayerPlugin::destroy()
  264. YoloLayerPlugin* obj = new YoloLayerPlugin(serialData, serialLength);
  265. obj->setPluginNamespace(mNamespace.c_str());
  266. return obj;
  267. }
  268. }