#include "InferTools.hpp" namespace gsd{ #define BATCH_SIZE 1 #define MAX_IMAGE_INPUT_SIZE_THRESH 3000 * 3000 // ensure it exceed the maximum size in the input images ! #define NMS_THRESH 0.1 #define CONF_THRESH 0.1 static gsd::Logger gLogger; // stuff we know about the network and the input/output blobs static const int INPUT_H = Yolo::INPUT_H; static const int INPUT_W = Yolo::INPUT_W; static const int CLASS_NUM = Yolo::CLASS_NUM; static const int OUTPUT_SIZE = Yolo::MAX_OUTPUT_BBOX_COUNT * sizeof(Yolo::Detection) / sizeof(float) + 1; // we assume the yololayer outputs no more than MAX_OUTPUT_BBOX_COUNT boxes that conf >= 0.1 static float prob[BATCH_SIZE * OUTPUT_SIZE]; /** * @description: getPtr * @return {*} */ std::shared_ptr InferTools::getPtr(){ static std::shared_ptr m_InferTools = nullptr; if(m_InferTools == nullptr) m_InferTools = std::shared_ptr(new InferTools); return m_InferTools; } /** * @description: 初始化 * @return {*} */ bool InferTools::Init(std::string enginefile){ cudaSetDevice(this->device); std::ifstream file(enginefile, std::ios::binary); if(!file.good()){ ErrorL << "read " << enginefile << ", error!" << std::endl; return false; } size_t size = 0; file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); if(trtModelStream == nullptr){ trtModelStream = new char[size]; } assert(trtModelStream); file.read(trtModelStream, size); file.close(); if(this->runtime == nullptr){ runtime = createInferRuntime(gLogger); } assert(runtime != nullptr); if(this->engine == nullptr){ this->engine = runtime->deserializeCudaEngine(trtModelStream, size); } assert(engine != nullptr); if(this->context == nullptr){ this->context = engine->createExecutionContext(); } assert(context != nullptr); if(trtModelStream != nullptr){ delete[] trtModelStream; trtModelStream = nullptr; } assert(engine->getNbBindings() == 2); inputIndex = engine->getBindingIndex(INPUT_BLOB_NAME); outputIndex = engine->getBindingIndex(OUTPUT_BLOB_NAME); assert(inputIndex == 0); assert(outputIndex == 1); CUDA_CHECK(cudaMalloc((void**)&buffers[inputIndex], BATCH_SIZE * 3 * INPUT_H * INPUT_W * sizeof(float))); CUDA_CHECK(cudaMalloc((void**)&buffers[outputIndex], BATCH_SIZE * OUTPUT_SIZE * sizeof(float))); CUDA_CHECK(cudaStreamCreate(&stream)); // prepare input data cache in pinned memory CUDA_CHECK(cudaMallocHost((void**)&img_host, MAX_IMAGE_INPUT_SIZE_THRESH * 3)); // prepare input data cache in device memory CUDA_CHECK(cudaMalloc((void**)&img_device, MAX_IMAGE_INPUT_SIZE_THRESH * 3)); return true; } /** * @description: 推理 * @param {Mat&} img * @return {*} */ bool InferTools::Inference(std::shared_ptr img, CNStreamInferData::Ptr result){ unique_lock lk(m_mutex); float* buffer_idx = (float*)buffers[inputIndex]; if(img->empty()) return false; size_t size_image = img->cols * img->rows * 3; size_t size_image_dst = INPUT_H * INPUT_W * 3; //copy data to pinned memory memcpy(img_host, img->data,size_image); //copy data to device memory CUDA_CHECK(cudaMemcpyAsync(img_device,img_host,size_image,cudaMemcpyHostToDevice,stream)); preprocess_kernel_img(img_device, img->cols, img->rows, buffer_idx, INPUT_W, INPUT_H, stream); buffer_idx += size_image_dst; // Run inference auto start = std::chrono::system_clock::now(); doInference(*context, stream, (void**)buffers, prob, BATCH_SIZE); auto end = std::chrono::system_clock::now(); if(config::getPtr()->Debug) DebugL << "inference time: " << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; int fcount = 1; std::vector> batch_res(fcount); for(int b = 0; b < fcount; 
        auto& res = batch_res[b];
        nms(res, &prob[b * OUTPUT_SIZE], CONF_THRESH, NMS_THRESH);
    }

    if(result == nullptr) return false;
    result->width = img->cols;
    result->height = img->rows;

    // convert detections into output objects with normalized coordinates
    for (int b = 0; b < fcount; b++){
        auto& res = batch_res[b];
        for (size_t j = 0; j < res.size(); j++) {
            int class_id = res[j].class_id;
            cv::Rect r = get_rect(*img, res[j].bbox);
            InferInfo data;
            data.Label = std::to_string(class_id);
            data.Score = res[j].conf;
            data.BBox.x = (double)r.x / (double)result->width;
            data.BBox.y = (double)r.y / (double)result->height;
            data.BBox.w = (double)r.width / (double)result->width;
            data.BBox.h = (double)r.height / (double)result->height;
            result->Objects.push_back(data);
        }
    }

    // proportion filter
    if(!InfineFilter::getPtr()->proportionalFilter(result)){
        result->Objects.clear();
        result = nullptr;
        return false;
    }

    // foreign-object filter
    InfineFilter::getPtr()->AlienFilter(result);

#ifdef TEST
    for (int b = 0; b < fcount; b++) {
        auto& res = batch_res[b];
        DebugL << res.size() << std::endl;
        for (size_t j = 0; j < res.size(); j++) {
            cv::Rect r = get_rect(*img, res[j].bbox);
            cv::rectangle(*img, r, cv::Scalar(0x27, 0xC1, 0x36), 2);
            cv::putText(*img, std::to_string((int)res[j].class_id), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 2);
        }
        cv::imwrite("_" + std::to_string(b) + ".jpg", *img);
    }
#endif
    return true;
}

/**
 * @description: Release resources
 * @return {*}
 */
void InferTools::Destroy(){
    cudaStreamDestroy(stream);
    CUDA_CHECK(cudaFree(img_device));
    CUDA_CHECK(cudaFreeHost(img_host));
    CUDA_CHECK(cudaFree(buffers[inputIndex]));
    CUDA_CHECK(cudaFree(buffers[outputIndex]));
    // Destroy the engine
    context->destroy();
    engine->destroy();
    runtime->destroy();
}

/**
 * @description: doInference
 * @param {IExecutionContext&} context
 * @param {cudaStream_t&} stream
 * @param {void**} buffers
 * @param {float*} output
 * @param {int} batchSize
 * @return {*}
 */
void InferTools::doInference(IExecutionContext& context, cudaStream_t& stream, void **buffers, float* output, int batchSize) {
    // infer on the batch asynchronously, and DMA output back to host
    context.enqueue(batchSize, buffers, stream, nullptr);
    CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
}

}
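
/*
 * Usage sketch (illustrative only, not part of the original file): one plausible
 * call sequence for the singleton above. The engine path "yolov5s.engine" and the
 * image path "frame.jpg" are placeholders, and CNStreamInferData::Ptr is assumed
 * to be a std::shared_ptr alias, as the nullptr checks in Inference() suggest.
 *
 *   auto infer = gsd::InferTools::getPtr();
 *   if (infer->Init("yolov5s.engine")) {
 *       auto frame  = std::make_shared<cv::Mat>(cv::imread("frame.jpg"));
 *       CNStreamInferData::Ptr result = std::make_shared<CNStreamInferData>();
 *       if (infer->Inference(frame, result)) {
 *           for (const auto& obj : result->Objects)
 *               std::cout << obj.Label << " " << obj.Score << std::endl; // normalized box in obj.BBox
 *       }
 *       infer->Destroy();
 *   }
 */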