diff --git a/config_infer_primary.txt b/config_infer_primary.txt index b26c95f..cba5b2a 100644 --- a/config_infer_primary.txt +++ b/config_infer_primary.txt @@ -14,12 +14,11 @@ interval=0 gie-unique-id=1 process-mode=1 network-type=0 -cluster-mode=2 +cluster-mode=4 maintain-aspect-ratio=0 parse-bbox-func-name=NvDsInferParseYolo custom-lib-path=nvdsinfer_custom_impl_Yolo/libnvdsinfer_custom_impl_Yolo.so engine-create-func-name=NvDsInferYoloCudaEngineGet [class-attrs-all] -nms-iou-threshold=0.45 -pre-cluster-threshold=0.25 +pre-cluster-threshold=0 diff --git a/config_infer_primary_yoloV2.txt b/config_infer_primary_yoloV2.txt index 8d939d6..84874d6 100644 --- a/config_infer_primary_yoloV2.txt +++ b/config_infer_primary_yoloV2.txt @@ -14,12 +14,11 @@ interval=0 gie-unique-id=1 process-mode=1 network-type=0 -cluster-mode=2 +cluster-mode=4 maintain-aspect-ratio=0 parse-bbox-func-name=NvDsInferParseYolo custom-lib-path=nvdsinfer_custom_impl_Yolo/libnvdsinfer_custom_impl_Yolo.so engine-create-func-name=NvDsInferYoloCudaEngineGet [class-attrs-all] -nms-iou-threshold=0.45 -pre-cluster-threshold=0.25 +pre-cluster-threshold=0 diff --git a/config_infer_primary_yoloV5.txt b/config_infer_primary_yoloV5.txt index 0f2a818..72bb8ed 100644 --- a/config_infer_primary_yoloV5.txt +++ b/config_infer_primary_yoloV5.txt @@ -14,12 +14,11 @@ interval=0 gie-unique-id=1 process-mode=1 network-type=0 -cluster-mode=2 +cluster-mode=4 maintain-aspect-ratio=1 parse-bbox-func-name=NvDsInferParseYolo custom-lib-path=nvdsinfer_custom_impl_Yolo/libnvdsinfer_custom_impl_Yolo.so engine-create-func-name=NvDsInferYoloCudaEngineGet [class-attrs-all] -nms-iou-threshold=0.45 -pre-cluster-threshold=0.25 +pre-cluster-threshold=0 diff --git a/config_infer_primary_yolor.txt b/config_infer_primary_yolor.txt index ad92a86..0a66c0f 100644 --- a/config_infer_primary_yolor.txt +++ b/config_infer_primary_yolor.txt @@ -14,12 +14,11 @@ interval=0 gie-unique-id=1 process-mode=1 network-type=0 -cluster-mode=2 +cluster-mode=4 maintain-aspect-ratio=1 parse-bbox-func-name=NvDsInferParseYolo custom-lib-path=nvdsinfer_custom_impl_Yolo/libnvdsinfer_custom_impl_Yolo.so engine-create-func-name=NvDsInferYoloCudaEngineGet [class-attrs-all] -nms-iou-threshold=0.5 -pre-cluster-threshold=0.25 +pre-cluster-threshold=0 diff --git a/config_nms.txt b/config_nms.txt new file mode 100644 index 0000000..bb658a4 --- /dev/null +++ b/config_nms.txt @@ -0,0 +1,4 @@ +[property] +iou-threshold=0.45 +score-threshold=0.25 +topk=300 diff --git a/docs/customModels.md b/docs/customModels.md index 894869f..381af62 100644 --- a/docs/customModels.md +++ b/docs/customModels.md @@ -272,24 +272,6 @@ interval=0 ## -#### nms-iou-threshold - -``` -# IOU threshold -nms-iou-threshold=0.6 -``` - -## - -#### pre-cluster-threshold - -``` -# Socre threshold -pre-cluster-threshold=0.25 -``` - -## - ### Testing model ``` diff --git a/nvdsinfer_custom_impl_Yolo/Makefile b/nvdsinfer_custom_impl_Yolo/Makefile index 0bc4590..ad30dcc 100644 --- a/nvdsinfer_custom_impl_Yolo/Makefile +++ b/nvdsinfer_custom_impl_Yolo/Makefile @@ -55,7 +55,6 @@ SRCFILES:= nvdsinfer_yolo_engine.cpp \ layers/convolutional_layer.cpp \ layers/implicit_layer.cpp \ layers/channels_layer.cpp \ - layers/dropout_layer.cpp \ layers/shortcut_layer.cpp \ layers/route_layer.cpp \ layers/upsample_layer.cpp \ @@ -67,7 +66,8 @@ SRCFILES:= nvdsinfer_yolo_engine.cpp \ yoloForward.cu \ yoloForward_v2.cu \ yoloForward_nc.cu \ - yoloForward_r.cu + yoloForward_r.cu \ + sortDetections.cu ifeq ($(OPENCV), 1) SRCFILES+= calibrator.cpp diff --git a/nvdsinfer_custom_impl_Yolo/layers/dropout_layer.cpp b/nvdsinfer_custom_impl_Yolo/layers/dropout_layer.cpp deleted file mode 100644 index 467aa0d..0000000 --- a/nvdsinfer_custom_impl_Yolo/layers/dropout_layer.cpp +++ /dev/null @@ -1,15 +0,0 @@ -/* - * Created by Marcos Luciano - * https://www.github.com/marcoslucianops - */ - -#include "dropout_layer.h" - -nvinfer1::ILayer* dropoutLayer( - float probability, - nvinfer1::ITensor* input, - nvinfer1::INetworkDefinition* network) -{ - nvinfer1::ILayer* output; - return output; -} diff --git a/nvdsinfer_custom_impl_Yolo/layers/dropout_layer.h b/nvdsinfer_custom_impl_Yolo/layers/dropout_layer.h deleted file mode 100644 index 3e8a1d4..0000000 --- a/nvdsinfer_custom_impl_Yolo/layers/dropout_layer.h +++ /dev/null @@ -1,16 +0,0 @@ -/* - * Created by Marcos Luciano - * https://www.github.com/marcoslucianops - */ - -#ifndef __DROPOUT_LAYER_H__ -#define __DROPOUT_LAYER_H__ - -#include "NvInfer.h" - -nvinfer1::ILayer* dropoutLayer( - float probability, - nvinfer1::ITensor* input, - nvinfer1::INetworkDefinition* network); - -#endif diff --git a/nvdsinfer_custom_impl_Yolo/nvdsinfer_yolo_engine.cpp b/nvdsinfer_custom_impl_Yolo/nvdsinfer_yolo_engine.cpp index d1e3127..bf5d675 100644 --- a/nvdsinfer_custom_impl_Yolo/nvdsinfer_yolo_engine.cpp +++ b/nvdsinfer_custom_impl_Yolo/nvdsinfer_yolo_engine.cpp @@ -63,15 +63,13 @@ static bool getYoloNetworkInfo (NetworkInfo &networkInfo, const NvDsInferContext if (networkInfo.configFilePath.empty() || networkInfo.wtsFilePath.empty()) { - std::cerr << "YOLO config file or weights file is not specified" - << std::endl; + std::cerr << "YOLO config file or weights file is not specified\n" << std::endl; return false; } if (!fileExists(networkInfo.configFilePath) || !fileExists(networkInfo.wtsFilePath)) { - std::cerr << "YOLO config file or weights file is not exist" - << std::endl; + std::cerr << "YOLO config file or weights file is not exist\n" << std::endl; return false; } diff --git a/nvdsinfer_custom_impl_Yolo/nvdsparsebbox_Yolo.cpp b/nvdsinfer_custom_impl_Yolo/nvdsparsebbox_Yolo.cpp index e31888a..1b45fbb 100644 --- a/nvdsinfer_custom_impl_Yolo/nvdsparsebbox_Yolo.cpp +++ b/nvdsinfer_custom_impl_Yolo/nvdsparsebbox_Yolo.cpp @@ -38,15 +38,15 @@ extern "C" bool NvDsInferParseYolo( std::vector& objectList); static NvDsInferParseObjectInfo convertBBox( - const float& bx, const float& by, const float& bw, - const float& bh, const uint& netW, const uint& netH) + const float& bx1, const float& by1, const float& bx2, + const float& by2, const uint& netW, const uint& netH) { NvDsInferParseObjectInfo b; - float x1 = bx - bw / 2; - float y1 = by - bh / 2; - float x2 = x1 + bw; - float y2 = y1 + bh; + float x1 = bx1; + float y1 = by1; + float x2 = bx2; + float y2 = by2; x1 = clamp(x1, 0, netW); y1 = clamp(y1, 0, netH); @@ -62,11 +62,11 @@ static NvDsInferParseObjectInfo convertBBox( } static void addBBoxProposal( - const float bx, const float by, const float bw, const float bh, + const float bx1, const float by1, const float bx2, const float by2, const uint& netW, const uint& netH, const int maxIndex, const float maxProb, std::vector& binfo) { - NvDsInferParseObjectInfo bbi = convertBBox(bx, by, bw, bh, netW, netH); + NvDsInferParseObjectInfo bbi = convertBBox(bx1, by1, bx2, by2, netW, netH); if (bbi.width < 1 || bbi.height < 1) return; bbi.detectionConfidence = maxProb; @@ -75,34 +75,25 @@ static void addBBoxProposal( } static std::vector decodeYoloTensor( - const float* detections, - const uint gridSizeW, const uint gridSizeH, const uint numBBoxes, - const uint numOutputClasses, const uint& netW, const uint& netH) + const int* counts, const float* boxes, + const float* scores, const float* classes, + const uint& netW, const uint& netH) { std::vector binfo; - for (uint y = 0; y < gridSizeH; ++y) { - for (uint x = 0; x < gridSizeW; ++x) { - for (uint b = 0; b < numBBoxes; ++b) - { - const int numGridCells = gridSizeH * gridSizeW; - const int bbindex = y * gridSizeW + x; - const float bx - = detections[bbindex + numGridCells * (b * (5 + numOutputClasses) + 0)]; - const float by - = detections[bbindex + numGridCells * (b * (5 + numOutputClasses) + 1)]; - const float bw - = detections[bbindex + numGridCells * (b * (5 + numOutputClasses) + 2)]; - const float bh - = detections[bbindex + numGridCells * (b * (5 + numOutputClasses) + 3)]; - const float maxProb - = detections[bbindex + numGridCells * (b * (5 + numOutputClasses) + 4)]; - const int maxIndex - = detections[bbindex + numGridCells * (b * (5 + numOutputClasses) + 5)]; + uint numBoxes = counts[0]; - addBBoxProposal(bx, by, bw, bh, netW, netH, maxIndex, maxProb, binfo); - } - } + for (uint b = 0; b < numBoxes; ++b) + { + float bx1 = boxes[b * 4 + 0]; + float by1 = boxes[b * 4 + 1]; + float bx2 = boxes[b * 4 + 2]; + float by2 = boxes[b * 4 + 3]; + + float maxProb = scores[b]; + int maxIndex = classes[b]; + + addBBoxProposal(bx1, by1, bx2, by2, netW, netH, maxIndex, maxProb, binfo); } return binfo; } @@ -112,7 +103,6 @@ static bool NvDsInferParseCustomYolo( NvDsInferNetworkInfo const& networkInfo, NvDsInferParseDetectionParams const& detectionParams, std::vector& objectList, - const uint &numBBoxes, const uint &numClasses) { if (outputLayersInfo.empty()) @@ -130,18 +120,17 @@ static bool NvDsInferParseCustomYolo( std::vector objects; - for (uint idx = 0; idx < outputLayersInfo.size(); ++idx) + for (uint idx = 0; idx < outputLayersInfo.size() / 4; ++idx) { - const NvDsInferLayerInfo &layer = outputLayersInfo[idx]; - - assert(layer.inferDims.numDims == 3); - const uint gridSizeH = layer.inferDims.d[1]; - const uint gridSizeW = layer.inferDims.d[2]; + const NvDsInferLayerInfo &counts = outputLayersInfo[idx * 4 + 0]; + const NvDsInferLayerInfo &boxes = outputLayersInfo[idx * 4 + 1]; + const NvDsInferLayerInfo &scores = outputLayersInfo[idx * 4 + 2]; + const NvDsInferLayerInfo &classes = outputLayersInfo[idx * 4 + 3]; std::vector outObjs = decodeYoloTensor( - (const float*)(layer.buffer), - gridSizeW, gridSizeH, numBBoxes, numClasses, + (const int*)(counts.buffer), (const float*)(boxes.buffer), + (const float*)(scores.buffer), (const float*)(classes.buffer), networkInfo.width, networkInfo.height); objects.insert(objects.end(), outObjs.begin(), outObjs.end()); @@ -158,11 +147,10 @@ extern "C" bool NvDsInferParseYolo( NvDsInferParseDetectionParams const& detectionParams, std::vector& objectList) { - uint numBBoxes = kNUM_BBOXES; - uint numClasses = kNUM_CLASSES; + int num_classes = kNUM_CLASSES; return NvDsInferParseCustomYolo ( - outputLayersInfo, networkInfo, detectionParams, objectList, numBBoxes, numClasses); + outputLayersInfo, networkInfo, detectionParams, objectList, num_classes); } CHECK_CUSTOM_PARSE_FUNC_PROTOTYPE(NvDsInferParseYolo); diff --git a/nvdsinfer_custom_impl_Yolo/sortDetections.cu b/nvdsinfer_custom_impl_Yolo/sortDetections.cu new file mode 100644 index 0000000..1463644 --- /dev/null +++ b/nvdsinfer_custom_impl_Yolo/sortDetections.cu @@ -0,0 +1,84 @@ +/* + * Created by Marcos Luciano + * https://www.github.com/marcoslucianops + */ + +#include + +__global__ void sortOutput( + int* d_indexes, float* d_scores, float* d_boxes, int* d_classes, float* bboxData, float* scoreData, + const uint numOutputClasses) +{ + uint x_id = blockIdx.x * blockDim.x + threadIdx.x; + + int index = d_indexes[x_id]; + int maxIndex = d_classes[index]; + bboxData[x_id * 4 + 0] = d_boxes[index * 4 + 0]; + bboxData[x_id * 4 + 1] = d_boxes[index * 4 + 1]; + bboxData[x_id * 4 + 2] = d_boxes[index * 4 + 2]; + bboxData[x_id * 4 + 3] = d_boxes[index * 4 + 3]; + scoreData[x_id * numOutputClasses + maxIndex] = d_scores[x_id] - 1.f; +} + +cudaError_t sortDetections( + void* d_indexes, void* d_scores, void* d_boxes, void* d_classes, void* bboxData, void* scoreData, void* countData, + const uint& batchSize, uint64_t& outputSize, uint& topK, const uint& numOutputClasses, cudaStream_t stream); + +cudaError_t sortDetections( + void* d_indexes, void* d_scores, void* d_boxes, void* d_classes, void* bboxData, void* scoreData, void* countData, + const uint& batchSize, uint64_t& outputSize, uint& topK, const uint& numOutputClasses, cudaStream_t stream) +{ + for (unsigned int batch = 0; batch < batchSize; ++batch) + { + int* _d_indexes = reinterpret_cast(d_indexes) + (batch * outputSize); + float* _d_scores = reinterpret_cast(d_scores) + (batch * outputSize); + + int* _countData = reinterpret_cast(countData) + (batch); + int* _count = (int*)malloc(sizeof(int)); + cudaMemcpy(_count, (int*)&_countData[0], sizeof(int), cudaMemcpyDeviceToHost); + int count = _count[0]; + + if (count == 0) + { + free(_count); + return cudaGetLastError(); + } + + size_t begin_bit = 0; + size_t end_bit = sizeof(float) * 8; + + float *d_keys_out = NULL; + int *d_values_out = NULL; + + cudaMalloc((void **)&d_keys_out, count * sizeof(float)); + cudaMalloc((void **)&d_values_out, count * sizeof(int)); + + void* d_temp_storage = NULL; + size_t temp_storage_bytes = 0; + + cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, _d_scores, d_keys_out, _d_indexes, + d_values_out, count, begin_bit, end_bit); + + cudaMalloc(&d_temp_storage, temp_storage_bytes); + + cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, _d_scores, d_keys_out, _d_indexes, + d_values_out, count, begin_bit, end_bit); + + cudaMemcpy(_d_scores, d_keys_out, count * sizeof(float), cudaMemcpyDeviceToDevice); + cudaMemcpy(_d_indexes, d_values_out, count * sizeof(int), cudaMemcpyDeviceToDevice); + + int threads_per_block = count < topK ? count : topK; + + sortOutput<<<1, threads_per_block, 0, stream>>>( + _d_indexes, _d_scores, reinterpret_cast(d_boxes) + (batch * 4 * outputSize), + reinterpret_cast(d_classes) + (batch * outputSize), reinterpret_cast(bboxData) + (batch * topK), + reinterpret_cast(scoreData) + (batch * topK), numOutputClasses); + + cudaFree(d_keys_out); + cudaFree(d_values_out); + cudaFree(d_temp_storage); + + free(_count); + } + return cudaGetLastError(); +} diff --git a/nvdsinfer_custom_impl_Yolo/utils.cpp b/nvdsinfer_custom_impl_Yolo/utils.cpp index 8830a0f..21d8117 100644 --- a/nvdsinfer_custom_impl_Yolo/utils.cpp +++ b/nvdsinfer_custom_impl_Yolo/utils.cpp @@ -57,7 +57,7 @@ bool fileExists(const std::string fileName, bool verbose) { if (!std::experimental::filesystem::exists(std::experimental::filesystem::path(fileName))) { - if (verbose) std::cout << "File does not exist: " << fileName << std::endl; + if (verbose) std::cout << "\nFile does not exist: " << fileName << std::endl; return false; } return true; @@ -101,7 +101,7 @@ std::vector loadWeights(const std::string weightsFilePath, const std::str assert(file.good()); int32_t count; file >> count; - assert(count > 0 && "Invalid .wts file."); + assert(count > 0 && "\nInvalid .wts file."); uint32_t floatWeight; std::string name; @@ -118,7 +118,7 @@ std::vector loadWeights(const std::string weightsFilePath, const std::str } else { - std::cerr << "File " << weightsFilePath << " is not supported" << std::endl; + std::cerr << "\nFile " << weightsFilePath << " is not supported" << std::endl; std::abort(); } @@ -149,11 +149,19 @@ int getNumChannels(nvinfer1::ITensor* t) return d.d[0]; } -void printLayerInfo(std::string layerIndex, std::string layerName, std::string layerInput, - std::string layerOutput, std::string weightPtr) +void printLayerInfo( + std::string layerIndex, std::string layerName, std::string layerInput, std::string layerOutput, std::string weightPtr) { std::cout << std::setw(6) << std::left << layerIndex << std::setw(24) << std::left << layerName; std::cout << std::setw(20) << std::left << layerInput << std::setw(20) << std::left << layerOutput; std::cout << std::setw(7) << std::left << weightPtr << std::endl; } + +std::string getAbsPath(std::string path) +{ + std::size_t found = path.rfind("/"); + if (found != std::string::npos) + path.erase(path.begin() + found, path.end()); + return path; +} diff --git a/nvdsinfer_custom_impl_Yolo/utils.h b/nvdsinfer_custom_impl_Yolo/utils.h index 5013763..e453917 100644 --- a/nvdsinfer_custom_impl_Yolo/utils.h +++ b/nvdsinfer_custom_impl_Yolo/utils.h @@ -41,8 +41,8 @@ bool fileExists(const std::string fileName, bool verbose = true); std::vector loadWeights(const std::string weightsFilePath, const std::string& networkType); std::string dimsToString(const nvinfer1::Dims d); int getNumChannels(nvinfer1::ITensor* t); - -void printLayerInfo(std::string layerIndex, std::string layerName, std::string layerInput, - std::string layerOutput, std::string weightPtr); +void printLayerInfo( + std::string layerIndex, std::string layerName, std::string layerInput, std::string layerOutput, std::string weightPtr); +std::string getAbsPath(std::string path); #endif diff --git a/nvdsinfer_custom_impl_Yolo/yolo.cpp b/nvdsinfer_custom_impl_Yolo/yolo.cpp index 6e406fe..3099f65 100644 --- a/nvdsinfer_custom_impl_Yolo/yolo.cpp +++ b/nvdsinfer_custom_impl_Yolo/yolo.cpp @@ -46,7 +46,12 @@ Yolo::Yolo(const NetworkInfo& networkInfo) m_InputC(0), m_InputSize(0), m_NumClasses(0), - m_LetterBox(0) + m_LetterBox(0), + m_NewCoords(0), + m_YoloCount(0), + m_IouThreshold(0), + m_ScoreThreshold(0), + m_TopK(0) {} Yolo::~Yolo() @@ -61,59 +66,75 @@ nvinfer1::ICudaEngine *Yolo::createEngine (nvinfer1::IBuilder* builder, nvinfer1 m_ConfigBlocks = parseConfigFile(m_ConfigFilePath); parseConfigBlocks(); + std::string configNMS = getAbsPath(m_WtsFilePath) + "/config_nms.txt"; + if (!fileExists(configNMS)) + { + std::cerr << "YOLO config_nms.txt file is not specified\n" << std::endl; + assert(0); + } + m_ConfigNMSBlocks = parseConfigFile(configNMS); + parseConfigNMSBlocks(); + nvinfer1::INetworkDefinition *network = builder->createNetworkV2(0); - if (parseModel(*network) != NVDSINFER_SUCCESS) { + if (parseModel(*network) != NVDSINFER_SUCCESS) + { delete network; return nullptr; } - std::cout << "Building the TensorRT Engine" << std::endl; + std::cout << "Building the TensorRT Engine\n" << std::endl; - if (m_NumClasses != m_NumDetectedClasses) { - std::cout << "\nNOTE: Number of classes mismatch, make sure to set num-detected-classes=" << m_NumClasses << " in config_infer file" << std::endl; + if (m_NumClasses != m_NumDetectedClasses) + { + std::cout << "NOTE: Number of classes mismatch, make sure to set num-detected-classes=" << m_NumClasses + << " in config_infer file\n" << std::endl; } - if (m_LetterBox == 1) { - std::cout << "\nNOTE: letter_box is set in cfg file, make sure to set maintain-aspect-ratio=1 in config_infer file to get better accuracy" << std::endl; + if (m_LetterBox == 1) + { + std::cout << "NOTE: letter_box is set in cfg file, make sure to set maintain-aspect-ratio=1 in config_infer file" + << " to get better accuracy\n" << std::endl; } - if (m_ClusterMode != 2) { - std::cout << "\nNOTE: Wrong cluster-mode is set, make sure to set cluster-mode=2 in config_infer file" << std::endl; + if (m_ClusterMode != 4) + { + std::cout << "NOTE: Wrong cluster-mode is set, make sure to set cluster-mode=4 in config_infer file\n" + << std::endl; } - std::cout << "" << std::endl; - if (m_NetworkMode == "INT8" && !fileExists(m_Int8CalibPath)) { + if (m_NetworkMode == "INT8" && !fileExists(m_Int8CalibPath)) + { assert(builder->platformHasFastInt8()); #ifdef OPENCV std::string calib_image_list; int calib_batch_size; - if (getenv("INT8_CALIB_IMG_PATH")) { + if (getenv("INT8_CALIB_IMG_PATH")) calib_image_list = getenv("INT8_CALIB_IMG_PATH"); - } - else { + else + { std::cerr << "INT8_CALIB_IMG_PATH not set" << std::endl; std::abort(); } - if (getenv("INT8_CALIB_BATCH_SIZE")) { + if (getenv("INT8_CALIB_BATCH_SIZE")) calib_batch_size = std::stoi(getenv("INT8_CALIB_BATCH_SIZE")); - } - else { + else + { std::cerr << "INT8_CALIB_BATCH_SIZE not set" << std::endl; std::abort(); } - nvinfer1::Int8EntropyCalibrator2 *calibrator = new nvinfer1::Int8EntropyCalibrator2(calib_batch_size, m_InputC, m_InputH, m_InputW, m_LetterBox, calib_image_list, m_Int8CalibPath); + nvinfer1::Int8EntropyCalibrator2 *calibrator = new nvinfer1::Int8EntropyCalibrator2( + calib_batch_size, m_InputC, m_InputH, m_InputW, m_LetterBox, calib_image_list, m_Int8CalibPath); config->setFlag(nvinfer1::BuilderFlag::kINT8); config->setInt8Calibrator(calibrator); #else - std::cerr << "OpenCV is required to run INT8 calibrator" << std::endl; - std::abort(); + std::cerr << "OpenCV is required to run INT8 calibrator\n" << std::endl; + assert(0); #endif } nvinfer1::ICudaEngine *engine = builder->buildEngineWithConfig(*network, *config); - if (engine) { + if (engine) std::cout << "Building complete\n" << std::endl; - } else { + else std::cerr << "Building engine failed\n" << std::endl; - } delete network; return engine; @@ -126,28 +147,30 @@ NvDsInferStatus Yolo::parseModel(nvinfer1::INetworkDefinition& network) { std::cout << "Building YOLO network\n" << std::endl; NvDsInferStatus status = buildYoloNetwork(weights, network); - if (status == NVDSINFER_SUCCESS) { + if (status == NVDSINFER_SUCCESS) std::cout << "Building YOLO network complete" << std::endl; - } else { + else std::cerr << "Building YOLO network failed" << std::endl; - } return status; } -NvDsInferStatus Yolo::buildYoloNetwork( - std::vector& weights, nvinfer1::INetworkDefinition& network) { +NvDsInferStatus Yolo::buildYoloNetwork(std::vector& weights, nvinfer1::INetworkDefinition& network) +{ int weightPtr = 0; int channels = m_InputC; std::string weightsType; - - if (m_WtsFilePath.find(".weights") != std::string::npos) { + if (m_WtsFilePath.find(".weights") != std::string::npos) weightsType = "weights"; - } - else { + else weightsType = "wts"; - } + + float eps = 1.0e-5; + if (m_NetworkType.find("yolov5") != std::string::npos) + eps = 1.0e-3; + else if (m_NetworkType.find("yolor") != std::string::npos) + eps = 1.0e-4; nvinfer1::ITensor* data = network.addInput(m_InputBlobName.c_str(), nvinfer1::DataType::kFLOAT, @@ -157,26 +180,24 @@ NvDsInferStatus Yolo::buildYoloNetwork( nvinfer1::ITensor* previous = data; std::vector tensorOutputs; - uint outputTensorCount = 0; + std::vector yoloInputs; + uint inputYoloCount = 0; - for (uint i = 0; i < m_ConfigBlocks.size(); ++i) { + int modelType = -1; + + for (uint i = 0; i < m_ConfigBlocks.size(); ++i) + { assert(getNumChannels(previous) == channels); std::string layerIndex = "(" + std::to_string(tensorOutputs.size()) + ")"; - if (m_ConfigBlocks.at(i).at("type") == "net") { + if (m_ConfigBlocks.at(i).at("type") == "net") printLayerInfo("", "layer", " input", " output", "weightPtr"); - } - else if (m_ConfigBlocks.at(i).at("type") == "convolutional") { - float eps = 1.0e-5; - if (m_NetworkType.find("yolov5") != std::string::npos) { - eps = 1.0e-3; - } - else if (m_NetworkType.find("yolor") != std::string::npos) { - eps = 1.0e-4; - } + else if (m_ConfigBlocks.at(i).at("type") == "convolutional") + { std::string inputVol = dimsToString(previous->getDimensions()); - nvinfer1::ILayer* out = convolutionalLayer(i, m_ConfigBlocks.at(i), weights, m_TrtWeights, weightPtr, weightsType, channels, eps, previous, &network); + nvinfer1::ILayer* out = convolutionalLayer( + i, m_ConfigBlocks.at(i), weights, m_TrtWeights, weightPtr, weightsType, channels, eps, previous, &network); previous = out->getOutput(0); assert(previous != nullptr); channels = getNumChannels(previous); @@ -186,14 +207,13 @@ NvDsInferStatus Yolo::buildYoloNetwork( printLayerInfo(layerIndex, layerType, inputVol, outputVol, std::to_string(weightPtr)); } - else if (m_ConfigBlocks.at(i).at("type") == "implicit_add" || m_ConfigBlocks.at(i).at("type") == "implicit_mul") { + else if (m_ConfigBlocks.at(i).at("type") == "implicit_add" || m_ConfigBlocks.at(i).at("type") == "implicit_mul") + { std::string type; - if (m_ConfigBlocks.at(i).at("type") == "implicit_add") { + if (m_ConfigBlocks.at(i).at("type") == "implicit_add") type = "add"; - } - else if (m_ConfigBlocks.at(i).at("type") == "implicit_mul") { + else if (m_ConfigBlocks.at(i).at("type") == "implicit_mul") type = "mul"; - } assert(m_ConfigBlocks.at(i).find("filters") != m_ConfigBlocks.at(i).end()); int filters = std::stoi(m_ConfigBlocks.at(i).at("filters")); nvinfer1::ILayer* out = implicitLayer(filters, weights, m_TrtWeights, weightPtr, &network); @@ -206,19 +226,17 @@ NvDsInferStatus Yolo::buildYoloNetwork( printLayerInfo(layerIndex, layerType, " -", outputVol, std::to_string(weightPtr)); } - else if (m_ConfigBlocks.at(i).at("type") == "shift_channels" || m_ConfigBlocks.at(i).at("type") == "control_channels") { + else if (m_ConfigBlocks.at(i).at("type") == "shift_channels" || m_ConfigBlocks.at(i).at("type") == "control_channels") + { std::string type; - if (m_ConfigBlocks.at(i).at("type") == "shift_channels") { + if (m_ConfigBlocks.at(i).at("type") == "shift_channels") type = "shift"; - } - else if (m_ConfigBlocks.at(i).at("type") == "control_channels") { + else if (m_ConfigBlocks.at(i).at("type") == "control_channels") type = "control"; - } assert(m_ConfigBlocks.at(i).find("from") != m_ConfigBlocks.at(i).end()); int from = stoi(m_ConfigBlocks.at(i).at("from")); - if (from > 0) { + if (from > 0) from = from - i + 1; - } assert((i - 2 >= 0) && (i - 2 < tensorOutputs.size())); assert((i + from - 1 >= 0) && (i + from - 1 < tensorOutputs.size())); assert(i + from - 1 < i - 2); @@ -231,25 +249,22 @@ NvDsInferStatus Yolo::buildYoloNetwork( printLayerInfo(layerIndex, layerType, " -", outputVol, " -"); } - else if (m_ConfigBlocks.at(i).at("type") == "dropout") { + else if (m_ConfigBlocks.at(i).at("type") == "dropout") + { // Skip dropout layer - assert(m_ConfigBlocks.at(i).find("probability") != m_ConfigBlocks.at(i).end()); - /*float probability = std::stof(m_ConfigBlocks.at(i).at("probability")); - nvinfer1::ILayer* out = dropoutLayer(probability, previous, &network); - previous = out->getOutput(0);*/ assert(previous != nullptr); tensorOutputs.push_back(previous); printLayerInfo(layerIndex, "dropout", " -", " -", " -"); } - else if (m_ConfigBlocks.at(i).at("type") == "shortcut") { + else if (m_ConfigBlocks.at(i).at("type") == "shortcut") + { assert(m_ConfigBlocks.at(i).find("activation") != m_ConfigBlocks.at(i).end()); assert(m_ConfigBlocks.at(i).find("from") != m_ConfigBlocks.at(i).end()); std::string activation = m_ConfigBlocks.at(i).at("activation"); int from = stoi(m_ConfigBlocks.at(i).at("from")); - if (from > 0) { + if (from > 0) from = from - i + 1; - } assert((i - 2 >= 0) && (i - 2 < tensorOutputs.size())); assert((i + from - 1 >= 0) && (i + from - 1 < tensorOutputs.size())); assert(i + from - 1 < i - 2); @@ -267,7 +282,8 @@ NvDsInferStatus Yolo::buildYoloNetwork( } } - else if (m_ConfigBlocks.at(i).at("type") == "route") { + else if (m_ConfigBlocks.at(i).at("type") == "route") + { assert(m_ConfigBlocks.at(i).find("layers") != m_ConfigBlocks.at(i).end()); nvinfer1::ILayer* out = routeLayer(i, m_ConfigBlocks.at(i), tensorOutputs, &network); previous = out->getOutput(0); @@ -278,7 +294,8 @@ NvDsInferStatus Yolo::buildYoloNetwork( printLayerInfo(layerIndex, "route", " -", outputVol, std::to_string(weightPtr)); } - else if (m_ConfigBlocks.at(i).at("type") == "upsample") { + else if (m_ConfigBlocks.at(i).at("type") == "upsample") + { std::string inputVol = dimsToString(previous->getDimensions()); nvinfer1::ILayer* out = upsampleLayer(i - 1, m_ConfigBlocks[i], previous, &network); previous = out->getOutput(0); @@ -288,7 +305,8 @@ NvDsInferStatus Yolo::buildYoloNetwork( printLayerInfo(layerIndex, "upsample", inputVol, outputVol, " -"); } - else if (m_ConfigBlocks.at(i).at("type") == "maxpool") { + else if (m_ConfigBlocks.at(i).at("type") == "maxpool") + { std::string inputVol = dimsToString(previous->getDimensions()); nvinfer1::ILayer* out = maxpoolLayer(i, m_ConfigBlocks.at(i), previous, &network); previous = out->getOutput(0); @@ -298,8 +316,10 @@ NvDsInferStatus Yolo::buildYoloNetwork( printLayerInfo(layerIndex, "maxpool", inputVol, outputVol, std::to_string(weightPtr)); } - else if (m_ConfigBlocks.at(i).at("type") == "reorg") { - if (m_NetworkType.find("yolov5") != std::string::npos || m_NetworkType.find("yolor") != std::string::npos) { + else if (m_ConfigBlocks.at(i).at("type") == "reorg") + { + if (m_NetworkType.find("yolov5") != std::string::npos || m_NetworkType.find("yolor") != std::string::npos) + { std::string inputVol = dimsToString(previous->getDimensions()); nvinfer1::ILayer* out = reorgV5Layer(i, previous, &network); previous = out->getOutput(0); @@ -310,7 +330,8 @@ NvDsInferStatus Yolo::buildYoloNetwork( std::string layerType = "reorgV5"; printLayerInfo(layerIndex, layerType, inputVol, outputVol, std::to_string(weightPtr)); } - else { + else + { std::string inputVol = dimsToString(previous->getDimensions()); nvinfer1::IPluginV2* reorgPlugin = createReorgPlugin(2); assert(reorgPlugin != nullptr); @@ -328,95 +349,127 @@ NvDsInferStatus Yolo::buildYoloNetwork( } } - else if (m_ConfigBlocks.at(i).at("type") == "yolo") { - uint modelType = 1; - uint newCoords = 0; - float scaleXY = 1.0; - if (m_NetworkType.find("yolor") != std::string::npos) { - modelType = 2; - } - if (m_ConfigBlocks.at(i).find("new_coords") != m_ConfigBlocks.at(i).end()) { - newCoords = std::stoi(m_ConfigBlocks.at(i).at("new_coords")); - } - if (m_ConfigBlocks.at(i).find("scale_x_y") != m_ConfigBlocks.at(i).end()) { - scaleXY = std::stof(m_ConfigBlocks.at(i).at("scale_x_y")); + else if (m_ConfigBlocks.at(i).at("type") == "yolo" || m_ConfigBlocks.at(i).at("type") == "region") + { + if (m_ConfigBlocks.at(i).at("type") == "yolo") + { + if (m_NetworkType.find("yolor") != std::string::npos) + modelType = 2; + else + modelType = 1; } + else + modelType = 0; - std::string layerName = "yolo_" + std::to_string(i); + std::string layerName = modelType != 0 ? "yolo_" + std::to_string(i) : "region_" + std::to_string(i); nvinfer1::Dims prevTensorDims = previous->getDimensions(); - TensorInfo& curYoloTensor = m_OutputTensors.at(outputTensorCount); - m_NumClasses = curYoloTensor.numClasses; + TensorInfo& curYoloTensor = m_YoloTensors.at(inputYoloCount); curYoloTensor.blobName = layerName; - nvinfer1::IPluginV2* yoloPlugin - = new YoloLayer(curYoloTensor.numBBoxes, curYoloTensor.numClasses, m_InputW, m_InputH, - prevTensorDims.d[2], prevTensorDims.d[1], modelType, newCoords, scaleXY, - curYoloTensor.anchors, curYoloTensor.mask); - assert(yoloPlugin != nullptr); - nvinfer1::IPluginV2Layer* yolo = - network.addPluginV2(&previous, 1, *yoloPlugin); - assert(yolo != nullptr); - yolo->setName(layerName.c_str()); - std::string inputVol = dimsToString(previous->getDimensions()); - previous = yolo->getOutput(0); - assert(previous != nullptr); - previous->setName(layerName.c_str()); - std::string outputVol = dimsToString(previous->getDimensions()); - network.markOutput(*previous); - channels = getNumChannels(previous); - tensorOutputs.push_back(yolo->getOutput(0)); - printLayerInfo(layerIndex, "yolo", inputVol, outputVol, std::to_string(weightPtr)); - ++outputTensorCount; - } + curYoloTensor.gridSizeX = prevTensorDims.d[2]; + curYoloTensor.gridSizeY = prevTensorDims.d[1]; - else if (m_ConfigBlocks.at(i).at("type") == "region") { - std::vector mask; - std::string layerName = "region_" + std::to_string(i); - nvinfer1::Dims prevTensorDims = previous->getDimensions(); - TensorInfo& curRegionTensor = m_OutputTensors.at(outputTensorCount); - m_NumClasses = curRegionTensor.numClasses; - curRegionTensor.blobName = layerName; - nvinfer1::IPluginV2* regionPlugin - = new YoloLayer(curRegionTensor.numBBoxes, curRegionTensor.numClasses, m_InputW, m_InputH, - prevTensorDims.d[2], prevTensorDims.d[1], 0, 0, 1.0, curRegionTensor.anchors, - mask); - assert(regionPlugin != nullptr); - nvinfer1::IPluginV2Layer* region = - network.addPluginV2(&previous, 1, *regionPlugin); - assert(region != nullptr); - region->setName(layerName.c_str()); std::string inputVol = dimsToString(previous->getDimensions()); - previous = region->getOutput(0); - assert(previous != nullptr); - previous->setName(layerName.c_str()); - std::string outputVol = dimsToString(previous->getDimensions()); - network.markOutput(*previous); channels = getNumChannels(previous); - tensorOutputs.push_back(region->getOutput(0)); - printLayerInfo(layerIndex, "region", inputVol, outputVol, std::to_string(weightPtr)); - ++outputTensorCount; + tensorOutputs.push_back(previous); + yoloInputs.push_back(previous); + ++inputYoloCount; + printLayerInfo(layerIndex, modelType != 0 ? "yolo" : "region", inputVol, " -", " -"); } else { - std::cout << "Unsupported layer type --> \"" - << m_ConfigBlocks.at(i).at("type") << "\"" << std::endl; + std::cout << "\nUnsupported layer type --> \"" << m_ConfigBlocks.at(i).at("type") << "\"" << std::endl; assert(0); } } if ((int)weights.size() != weightPtr) { - std::cout << "Number of unused weights left: " << weights.size() - weightPtr << std::endl; + std::cout << "\nNumber of unused weights left: " << weights.size() - weightPtr << std::endl; assert(0); } - std::cout << "Output YOLO blob names: " << std::endl; - for (auto& tensor : m_OutputTensors) { + if (m_YoloCount == inputYoloCount) + { + assert((modelType != -1) && "\nCould not determine model type"); + + nvinfer1::ITensor* yoloInputTensors[inputYoloCount]; + uint64_t outputSize = 0; + for (uint j = 0; j < inputYoloCount; ++j) + { + yoloInputTensors[j] = yoloInputs[j]; + TensorInfo& curYoloTensor = m_YoloTensors.at(j); + outputSize += curYoloTensor.gridSizeX * curYoloTensor.gridSizeY * curYoloTensor.numBBoxes; + } + + if (m_TopK > outputSize) { + std::cout << "\ntopk > Number of outputs\nPlease change the topk to " << outputSize + << " or less in config_nms.txt file\n" << std::endl; + assert(0); + } + + std::string layerName = "yolo"; + nvinfer1::IPluginV2* yoloPlugin = new YoloLayer( + m_InputW, m_InputH, m_NumClasses, m_NewCoords, m_YoloTensors, outputSize, modelType, m_TopK, + m_ScoreThreshold); + assert(yoloPlugin != nullptr); + nvinfer1::IPluginV2Layer* yolo = network.addPluginV2(yoloInputTensors, inputYoloCount, *yoloPlugin); + assert(yolo != nullptr); + yolo->setName(layerName.c_str()); + previous = yolo->getOutput(0); + assert(previous != nullptr); + previous->setName(layerName.c_str()); + tensorOutputs.push_back(yolo->getOutput(0)); + + nvinfer1::ITensor* yoloTensors[] = {yolo->getOutput(0), yolo->getOutput(1)}; + std::string outputVol = dimsToString(previous->getDimensions()); + + nvinfer1::plugin::NMSParameters nmsParams; + nmsParams.shareLocation = true; + nmsParams.backgroundLabelId = -1; + nmsParams.numClasses = m_NumClasses; + nmsParams.topK = m_TopK; + nmsParams.keepTopK = m_TopK; + nmsParams.scoreThreshold = m_ScoreThreshold; + nmsParams.iouThreshold = m_IouThreshold; + nmsParams.isNormalized = false; + + layerName = "batchedNMS"; + nvinfer1::IPluginV2* batchedNMS = createBatchedNMSPlugin(nmsParams); + nvinfer1::IPluginV2Layer* nms = network.addPluginV2(yoloTensors, 2, *batchedNMS); + nms->setName(layerName.c_str()); + nvinfer1::ITensor* num_detections = nms->getOutput(0); + layerName = "num_detections"; + num_detections->setName(layerName.c_str()); + nvinfer1::ITensor* nmsed_boxes = nms->getOutput(1); + layerName = "nmsed_boxes"; + nmsed_boxes->setName(layerName.c_str()); + nvinfer1::ITensor* nmsed_scores = nms->getOutput(2); + layerName = "nmsed_scores"; + nmsed_scores->setName(layerName.c_str()); + nvinfer1::ITensor* nmsed_classes = nms->getOutput(3); + layerName = "nmsed_classes"; + nmsed_classes->setName(layerName.c_str()); + network.markOutput(*num_detections); + network.markOutput(*nmsed_boxes); + network.markOutput(*nmsed_scores); + network.markOutput(*nmsed_classes); + + printLayerInfo("", "batched_nms", " -", outputVol, " -"); + } + else { + std::cout << "\nError in yolo cfg file" << std::endl; + assert(0); + } + + std::cout << "\nOutput YOLO blob names: " << std::endl; + for (auto& tensor : m_YoloTensors) + { std::cout << tensor.blobName << std::endl; } int nbLayers = network.getNbLayers(); - std::cout << "Total number of YOLO layers: " << nbLayers << std::endl; + std::cout << "\nTotal number of YOLO layers: " << nbLayers << "\n" << std::endl; return NVDSINFER_SUCCESS; } @@ -462,39 +515,44 @@ Yolo::parseConfigFile (const std::string cfgFilePath) void Yolo::parseConfigBlocks() { - for (auto block : m_ConfigBlocks) { + for (auto block : m_ConfigBlocks) + { if (block.at("type") == "net") { - assert((block.find("height") != block.end()) - && "Missing 'height' param in network cfg"); + assert((block.find("height") != block.end()) && "Missing 'height' param in network cfg"); assert((block.find("width") != block.end()) && "Missing 'width' param in network cfg"); - assert((block.find("channels") != block.end()) - && "Missing 'channels' param in network cfg"); + assert((block.find("channels") != block.end()) && "Missing 'channels' param in network cfg"); m_InputH = std::stoul(block.at("height")); m_InputW = std::stoul(block.at("width")); m_InputC = std::stoul(block.at("channels")); m_InputSize = m_InputC * m_InputH * m_InputW; - if (block.find("letter_box") != block.end()) { + if (block.find("letter_box") != block.end()) + { m_LetterBox = std::stoul(block.at("letter_box")); } - else { - m_LetterBox = 0; - } } - else if ((block.at("type") == "region") || (block.at("type") == "yolo") || (block.at("type") == "detect")) + else if ((block.at("type") == "region") || (block.at("type") == "yolo")) { assert((block.find("num") != block.end()) && std::string("Missing 'num' param in " + block.at("type") + " layer").c_str()); assert((block.find("classes") != block.end()) - && std::string("Missing 'classes' param in " + block.at("type") + " layer") - .c_str()); + && std::string("Missing 'classes' param in " + block.at("type") + " layer").c_str()); assert((block.find("anchors") != block.end()) - && std::string("Missing 'anchors' param in " + block.at("type") + " layer") - .c_str()); + && std::string("Missing 'anchors' param in " + block.at("type") + " layer").c_str()); + + ++m_YoloCount; + + m_NumClasses = std::stoul(block.at("classes")); + + if (block.find("new_coords") != block.end()) + { + m_NewCoords = std::stoul(block.at("new_coords")); + } TensorInfo outputTensor; + std::string anchorString = block.at("anchors"); while (!anchorString.empty()) { @@ -513,7 +571,8 @@ void Yolo::parseConfigBlocks() } } - if (block.find("mask") != block.end()) { + if (block.find("mask") != block.end()) + { std::string maskString = block.at("mask"); while (!maskString.empty()) { @@ -533,17 +592,41 @@ void Yolo::parseConfigBlocks() } } - outputTensor.numBBoxes = outputTensor.mask.size() > 0 - ? outputTensor.mask.size() - : std::stoul(trim(block.at("num"))); - outputTensor.numClasses = std::stoul(block.at("classes")); - m_OutputTensors.push_back(outputTensor); + if (block.find("scale_x_y") != block.end()) + { + outputTensor.scaleXY = std::stof(block.at("scale_x_y")); + } + else + { + outputTensor.scaleXY = 1.0; + } + + outputTensor.numBBoxes + = outputTensor.mask.size() > 0 ? outputTensor.mask.size() : std::stoul(trim(block.at("num"))); + + m_YoloTensors.push_back(outputTensor); } } } -void Yolo::destroyNetworkUtils() { - for (uint i = 0; i < m_TrtWeights.size(); ++i) { +void Yolo::parseConfigNMSBlocks() +{ + auto block = m_ConfigNMSBlocks[0]; + + assert((block.at("type") == "property") && "Missing 'property' param in nms cfg"); + assert((block.find("iou-threshold") != block.end()) && "Missing 'iou-threshold' param in nms cfg"); + assert((block.find("score-threshold") != block.end()) && "Missing 'score-threshold' param in nms cfg"); + assert((block.find("topk") != block.end()) && "Missing 'topk' param in nms cfg"); + + m_IouThreshold = std::stof(block.at("iou-threshold")); + m_ScoreThreshold = std::stof(block.at("score-threshold")); + m_TopK = std::stoul(block.at("topk")); +} + +void Yolo::destroyNetworkUtils() +{ + for (uint i = 0; i < m_TrtWeights.size(); ++i) + { if (m_TrtWeights[i].count > 0) free(const_cast(m_TrtWeights[i].values)); } diff --git a/nvdsinfer_custom_impl_Yolo/yolo.h b/nvdsinfer_custom_impl_Yolo/yolo.h index 7b1c0aa..219ccac 100644 --- a/nvdsinfer_custom_impl_Yolo/yolo.h +++ b/nvdsinfer_custom_impl_Yolo/yolo.h @@ -29,7 +29,6 @@ #include "layers/convolutional_layer.h" #include "layers/implicit_layer.h" #include "layers/channels_layer.h" -#include "layers/dropout_layer.h" #include "layers/shortcut_layer.h" #include "layers/route_layer.h" #include "layers/upsample_layer.h" @@ -54,8 +53,10 @@ struct NetworkInfo struct TensorInfo { std::string blobName; + uint gridSizeX {0}; + uint gridSizeY {0}; uint numBBoxes {0}; - uint numClasses {0}; + float scaleXY; std::vector anchors; std::vector mask; }; @@ -63,12 +64,15 @@ struct TensorInfo class Yolo : public IModelParser { public: Yolo(const NetworkInfo& networkInfo); + ~Yolo() override; + bool hasFullDimsSupported() const override { return false; } + const char* getModelName() const override { - return m_ConfigFilePath.empty() ? m_NetworkType.c_str() - : m_ConfigFilePath.c_str(); + return m_ConfigFilePath.empty() ? m_NetworkType.c_str() : m_ConfigFilePath.c_str(); } + NvDsInferStatus parseModel(nvinfer1::INetworkDefinition& network) override; nvinfer1::ICudaEngine *createEngine (nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config); @@ -90,17 +94,26 @@ protected: uint64_t m_InputSize; uint m_NumClasses; uint m_LetterBox; + uint m_NewCoords; + uint m_YoloCount; + float m_IouThreshold; + float m_ScoreThreshold; + uint m_TopK; - std::vector m_OutputTensors; + std::vector m_YoloTensors; std::vector> m_ConfigBlocks; + std::vector> m_ConfigNMSBlocks; std::vector m_TrtWeights; private: - NvDsInferStatus buildYoloNetwork( - std::vector& weights, nvinfer1::INetworkDefinition& network); - std::vector> parseConfigFile( - const std::string cfgFilePath); + NvDsInferStatus buildYoloNetwork(std::vector& weights, nvinfer1::INetworkDefinition& network); + + std::vector> parseConfigFile(const std::string cfgFilePath); + void parseConfigBlocks(); + + void parseConfigNMSBlocks(); + void destroyNetworkUtils(); }; diff --git a/nvdsinfer_custom_impl_Yolo/yoloForward.cu b/nvdsinfer_custom_impl_Yolo/yoloForward.cu index 3e15fa6..7c827b4 100644 --- a/nvdsinfer_custom_impl_Yolo/yoloForward.cu +++ b/nvdsinfer_custom_impl_Yolo/yoloForward.cu @@ -3,18 +3,14 @@ * https://www.github.com/marcoslucianops */ -#include -#include #include -#include -#include inline __device__ float sigmoidGPU(const float& x) { return 1.0f / (1.0f + __expf(-x)); } __global__ void gpuYoloLayer( - const float* input, float* output, const uint netWidth, const uint netHeight, const uint gridSizeX, - const uint gridSizeY, const uint numOutputClasses, const uint numBBoxes, const float scaleXY, - const float* anchors, const int* mask) + const float* input, int* d_indexes, float* d_scores, float* d_boxes, int* d_classes, int* countData, + const float scoreThreshold, const uint netWidth, const uint netHeight, const uint gridSizeX, const uint gridSizeY, + const uint numOutputClasses, const uint numBBoxes, const float scaleXY, const float* anchors, const int* mask) { uint x_id = blockIdx.x * blockDim.x + threadIdx.x; uint y_id = blockIdx.y * blockDim.y + threadIdx.y; @@ -28,28 +24,32 @@ __global__ void gpuYoloLayer( const int numGridCells = gridSizeX * gridSizeY; const int bbindex = y_id * gridSizeX + x_id; + const float objectness + = sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)]); + + if (objectness < scoreThreshold) return; + + int count = (int)atomicAdd(&countData[0], 1); + const float alpha = scaleXY; const float beta = -0.5 * (scaleXY - 1); - output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 0)] + float x = (sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 0)]) * alpha + beta + x_id) * netWidth / gridSizeX; - output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 1)] + float y = (sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 1)]) * alpha + beta + y_id) * netHeight / gridSizeY; - output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 2)] + float w = __expf(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 2)]) * anchors[mask[z_id] * 2]; - output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 3)] + float h = __expf(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 3)]) * anchors[mask[z_id] * 2 + 1]; - const float objectness - = sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)]); - float maxProb = 0.0f; int maxIndex = -1; @@ -65,22 +65,26 @@ __global__ void gpuYoloLayer( } } - output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)] - = objectness * maxProb; - - output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 5)] - = maxIndex; + d_indexes[count] = count; + d_scores[count] = objectness * maxProb + 1.f; + d_boxes[count * 4 + 0] = x - 0.5 * w; + d_boxes[count * 4 + 1] = y - 0.5 * h; + d_boxes[count * 4 + 2] = x + 0.5 * w; + d_boxes[count * 4 + 3] = y + 0.5 * h; + d_classes[count] = maxIndex; } cudaError_t cudaYoloLayer( - const void* input, void* output, const uint& batchSize, const uint& netWidth, const uint& netHeight, - const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes, - uint64_t& outputSize, const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream); + const void* input, void* d_indexes, void* d_scores, void* d_boxes, void* d_classes, void* countData, + const uint& batchSize, uint64_t& inputSize, uint64_t& outputSize, const float& scoreThreshold, const uint& netWidth, + const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes, + const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream); cudaError_t cudaYoloLayer( - const void* input, void* output, const uint& batchSize, const uint& netWidth, const uint& netHeight, - const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes, - uint64_t& outputSize, const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream) + const void* input, void* d_indexes, void* d_scores, void* d_boxes, void* d_classes, void* countData, + const uint& batchSize, uint64_t& inputSize, uint64_t& outputSize, const float& scoreThreshold, const uint& netWidth, + const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes, + const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream) { dim3 threads_per_block(16, 16, 4); dim3 number_of_blocks((gridSizeX / threads_per_block.x) + 1, @@ -90,9 +94,12 @@ cudaError_t cudaYoloLayer( for (unsigned int batch = 0; batch < batchSize; ++batch) { gpuYoloLayer<<>>( - reinterpret_cast(input) + (batch * outputSize), - reinterpret_cast(output) + (batch * outputSize), - netWidth, netHeight, gridSizeX, gridSizeY, numOutputClasses, numBBoxes, scaleXY, + reinterpret_cast(input) + (batch * inputSize), + reinterpret_cast(d_indexes) + (batch * outputSize), + reinterpret_cast(d_scores) + (batch * outputSize), + reinterpret_cast(d_boxes) + (batch * 4 * outputSize), + reinterpret_cast(d_classes) + (batch * outputSize), reinterpret_cast(countData) + (batch), + scoreThreshold, netWidth, netHeight, gridSizeX, gridSizeY, numOutputClasses, numBBoxes, scaleXY, reinterpret_cast(anchors), reinterpret_cast(mask)); } return cudaGetLastError(); diff --git a/nvdsinfer_custom_impl_Yolo/yoloForward_nc.cu b/nvdsinfer_custom_impl_Yolo/yoloForward_nc.cu index 313bb78..7e4872d 100644 --- a/nvdsinfer_custom_impl_Yolo/yoloForward_nc.cu +++ b/nvdsinfer_custom_impl_Yolo/yoloForward_nc.cu @@ -3,16 +3,12 @@ * https://www.github.com/marcoslucianops */ -#include -#include #include -#include -#include __global__ void gpuYoloLayer_nc( - const float* input, float* output, const uint netWidth, const uint netHeight, const uint gridSizeX, - const uint gridSizeY, const uint numOutputClasses, const uint numBBoxes, const float scaleXY, - const float* anchors, const int* mask) + const float* input, int* d_indexes, float* d_scores, float* d_boxes, int* d_classes, int* countData, + const float scoreThreshold, const uint netWidth, const uint netHeight, const uint gridSizeX, const uint gridSizeY, + const uint numOutputClasses, const uint numBBoxes, const float scaleXY, const float* anchors, const int* mask) { uint x_id = blockIdx.x * blockDim.x + threadIdx.x; uint y_id = blockIdx.y * blockDim.y + threadIdx.y; @@ -26,28 +22,32 @@ __global__ void gpuYoloLayer_nc( const int numGridCells = gridSizeX * gridSizeY; const int bbindex = y_id * gridSizeX + x_id; + const float objectness + = input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)]; + + if (objectness < scoreThreshold) return; + + int count = (int)atomicAdd(&countData[0], 1); + const float alpha = scaleXY; const float beta = -0.5 * (scaleXY - 1); - output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 0)] + float x = (input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 0)] * alpha + beta + x_id) * netWidth / gridSizeX; - output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 1)] + float y = (input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 1)] * alpha + beta + y_id) * netHeight / gridSizeY; - output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 2)] + float w = __powf(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 2)] * 2, 2) * anchors[mask[z_id] * 2]; - output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 3)] + float h = __powf(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 3)] * 2, 2) * anchors[mask[z_id] * 2 + 1]; - const float objectness - = input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)]; - float maxProb = 0.0f; int maxIndex = -1; @@ -63,22 +63,26 @@ __global__ void gpuYoloLayer_nc( } } - output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)] - = objectness * maxProb; - - output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 5)] - = maxIndex; + d_indexes[count] = count; + d_scores[count] = objectness * maxProb + 1.f; + d_boxes[count * 4 + 0] = x - 0.5 * w; + d_boxes[count * 4 + 1] = y - 0.5 * h; + d_boxes[count * 4 + 2] = x + 0.5 * w; + d_boxes[count * 4 + 3] = y + 0.5 * h; + d_classes[count] = maxIndex; } cudaError_t cudaYoloLayer_nc( - const void* input, void* output, const uint& batchSize, const uint& netWidth, const uint& netHeight, - const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes, - uint64_t& outputSize, const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream); + const void* input, void* d_indexes, void* d_scores, void* d_boxes, void* d_classes, void* countData, + const uint& batchSize, uint64_t& inputSize, uint64_t& outputSize, const float& scoreThreshold, const uint& netWidth, + const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes, + const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream); cudaError_t cudaYoloLayer_nc( - const void* input, void* output, const uint& batchSize, const uint& netWidth, const uint& netHeight, - const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes, - uint64_t& outputSize, const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream) + const void* input, void* d_indexes, void* d_scores, void* d_boxes, void* d_classes, void* countData, + const uint& batchSize, uint64_t& inputSize, uint64_t& outputSize, const float& scoreThreshold, const uint& netWidth, + const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes, + const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream) { dim3 threads_per_block(16, 16, 4); dim3 number_of_blocks((gridSizeX / threads_per_block.x) + 1, @@ -88,9 +92,12 @@ cudaError_t cudaYoloLayer_nc( for (unsigned int batch = 0; batch < batchSize; ++batch) { gpuYoloLayer_nc<<>>( - reinterpret_cast(input) + (batch * outputSize), - reinterpret_cast(output) + (batch * outputSize), - netWidth, netHeight, gridSizeX, gridSizeY, numOutputClasses, numBBoxes, scaleXY, + reinterpret_cast(input) + (batch * inputSize), + reinterpret_cast(d_indexes) + (batch * outputSize), + reinterpret_cast(d_scores) + (batch * outputSize), + reinterpret_cast(d_boxes) + (batch * 4 * outputSize), + reinterpret_cast(d_classes) + (batch * outputSize), reinterpret_cast(countData) + (batch), + scoreThreshold, netWidth, netHeight, gridSizeX, gridSizeY, numOutputClasses, numBBoxes, scaleXY, reinterpret_cast(anchors), reinterpret_cast(mask)); } return cudaGetLastError(); diff --git a/nvdsinfer_custom_impl_Yolo/yoloForward_r.cu b/nvdsinfer_custom_impl_Yolo/yoloForward_r.cu index 7142e61..bba42bc 100644 --- a/nvdsinfer_custom_impl_Yolo/yoloForward_r.cu +++ b/nvdsinfer_custom_impl_Yolo/yoloForward_r.cu @@ -3,18 +3,14 @@ * https://www.github.com/marcoslucianops */ -#include -#include #include -#include -#include inline __device__ float sigmoidGPU(const float& x) { return 1.0f / (1.0f + __expf(-x)); } __global__ void gpuYoloLayer_r( - const float* input, float* output, const uint netWidth, const uint netHeight, const uint gridSizeX, - const uint gridSizeY, const uint numOutputClasses, const uint numBBoxes, const float scaleXY, - const float* anchors, const int* mask) + const float* input, int* d_indexes, float* d_scores, float* d_boxes, int* d_classes, int* countData, + const float scoreThreshold, const uint netWidth, const uint netHeight, const uint gridSizeX, const uint gridSizeY, + const uint numOutputClasses, const uint numBBoxes, const float scaleXY, const float* anchors, const int* mask) { uint x_id = blockIdx.x * blockDim.x + threadIdx.x; uint y_id = blockIdx.y * blockDim.y + threadIdx.y; @@ -28,28 +24,32 @@ __global__ void gpuYoloLayer_r( const int numGridCells = gridSizeX * gridSizeY; const int bbindex = y_id * gridSizeX + x_id; + const float objectness + = sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)]); + + if (objectness < scoreThreshold) return; + + int count = (int)atomicAdd(&countData[0], 1); + const float alpha = scaleXY; const float beta = -0.5 * (scaleXY - 1); - output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 0)] + float x = (sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 0)]) * alpha + beta + x_id) * netWidth / gridSizeX; - output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 1)] + float y = (sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 1)]) * alpha + beta + y_id) * netHeight / gridSizeY; - output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 2)] + float w = __powf(sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 2)]) * 2, 2) * anchors[mask[z_id] * 2]; - output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 3)] + float h = __powf(sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 3)]) * 2, 2) * anchors[mask[z_id] * 2 + 1]; - const float objectness - = sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)]); - float maxProb = 0.0f; int maxIndex = -1; @@ -65,22 +65,26 @@ __global__ void gpuYoloLayer_r( } } - output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)] - = objectness * maxProb; - - output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 5)] - = maxIndex; + d_indexes[count] = count; + d_scores[count] = objectness * maxProb + 1.f; + d_boxes[count * 4 + 0] = x - 0.5 * w; + d_boxes[count * 4 + 1] = y - 0.5 * h; + d_boxes[count * 4 + 2] = x + 0.5 * w; + d_boxes[count * 4 + 3] = y + 0.5 * h; + d_classes[count] = maxIndex; } cudaError_t cudaYoloLayer_r( - const void* input, void* output, const uint& batchSize, const uint& netWidth, const uint& netHeight, - const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes, - uint64_t& outputSize, const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream); + const void* input, void* d_indexes, void* d_scores, void* d_boxes, void* d_classes, void* countData, + const uint& batchSize, uint64_t& inputSize, uint64_t& outputSize, const float& scoreThreshold, const uint& netWidth, + const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes, + const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream); cudaError_t cudaYoloLayer_r( - const void* input, void* output, const uint& batchSize, const uint& netWidth, const uint& netHeight, - const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes, - uint64_t& outputSize, const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream) + const void* input, void* d_indexes, void* d_scores, void* d_boxes, void* d_classes, void* countData, + const uint& batchSize, uint64_t& inputSize, uint64_t& outputSize, const float& scoreThreshold, const uint& netWidth, + const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes, + const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream) { dim3 threads_per_block(16, 16, 4); dim3 number_of_blocks((gridSizeX / threads_per_block.x) + 1, @@ -90,9 +94,12 @@ cudaError_t cudaYoloLayer_r( for (unsigned int batch = 0; batch < batchSize; ++batch) { gpuYoloLayer_r<<>>( - reinterpret_cast(input) + (batch * outputSize), - reinterpret_cast(output) + (batch * outputSize), - netWidth, netHeight, gridSizeX, gridSizeY, numOutputClasses, numBBoxes, scaleXY, + reinterpret_cast(input) + (batch * inputSize), + reinterpret_cast(d_indexes) + (batch * outputSize), + reinterpret_cast(d_scores) + (batch * outputSize), + reinterpret_cast(d_boxes) + (batch * 4 * outputSize), + reinterpret_cast(d_classes) + (batch * outputSize), reinterpret_cast(countData) + (batch), + scoreThreshold, netWidth, netHeight, gridSizeX, gridSizeY, numOutputClasses, numBBoxes, scaleXY, reinterpret_cast(anchors), reinterpret_cast(mask)); } return cudaGetLastError(); diff --git a/nvdsinfer_custom_impl_Yolo/yoloForward_v2.cu b/nvdsinfer_custom_impl_Yolo/yoloForward_v2.cu index 186cb80..cd21797 100644 --- a/nvdsinfer_custom_impl_Yolo/yoloForward_v2.cu +++ b/nvdsinfer_custom_impl_Yolo/yoloForward_v2.cu @@ -3,17 +3,13 @@ * https://www.github.com/marcoslucianops */ -#include -#include #include -#include -#include inline __device__ float sigmoidGPU(const float& x) { return 1.0f / (1.0f + __expf(-x)); } __device__ void softmaxGPU( - const float* input, const int bbindex, const int numGridCells, uint z_id, - const uint numOutputClasses, float temp, float* output) + const float* input, const int bbindex, const int numGridCells, uint z_id, const uint numOutputClasses, float temp, + float* output) { int i; float sum = 0; @@ -33,9 +29,9 @@ __device__ void softmaxGPU( } __global__ void gpuRegionLayer( - const float* input, float* output, float* softmax, const uint netWidth, const uint netHeight, - const uint gridSizeX, const uint gridSizeY, const uint numOutputClasses, const uint numBBoxes, - const float* anchors) + const float* input, float* softmax, int* d_indexes, float* d_scores, float* d_boxes, int* d_classes, int* countData, + const float scoreThreshold, const uint netWidth, const uint netHeight, const uint gridSizeX, const uint gridSizeY, + const uint numOutputClasses, const uint numBBoxes, const float* anchors) { uint x_id = blockIdx.x * blockDim.x + threadIdx.x; uint y_id = blockIdx.y * blockDim.y + threadIdx.y; @@ -49,27 +45,31 @@ __global__ void gpuRegionLayer( const int numGridCells = gridSizeX * gridSizeY; const int bbindex = y_id * gridSizeX + x_id; - output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 0)] + const float objectness + = sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)]); + + if (objectness < scoreThreshold) return; + + int count = (int)atomicAdd(&countData[0], 1); + + float x = (sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 0)]) + x_id) * netWidth / gridSizeX; - output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 1)] + float y = (sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 1)]) + y_id) * netHeight / gridSizeY; - output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 2)] + float w = __expf(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 2)]) * anchors[z_id * 2] * netWidth / gridSizeX; - output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 3)] + float h = __expf(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 3)]) * anchors[z_id * 2 + 1] * netHeight / gridSizeY; softmaxGPU(input, bbindex, numGridCells, z_id, numOutputClasses, 1.0, softmax); - const float objectness - = sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)]); - float maxProb = 0.0f; int maxIndex = -1; @@ -85,22 +85,26 @@ __global__ void gpuRegionLayer( } } - output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)] - = objectness * maxProb; - - output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 5)] - = maxIndex; + d_indexes[count] = count; + d_scores[count] = objectness * maxProb + 1.f; + d_boxes[count * 4 + 0] = x - 0.5 * w; + d_boxes[count * 4 + 1] = y - 0.5 * h; + d_boxes[count * 4 + 2] = x + 0.5 * w; + d_boxes[count * 4 + 3] = y + 0.5 * h; + d_classes[count] = maxIndex; } cudaError_t cudaRegionLayer( - const void* input, void* output, void* softmax, const uint& batchSize, const uint& netWidth, - const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, - const uint& numBBoxes, uint64_t& outputSize, const void* anchors, cudaStream_t stream); + const void* input, void* softmax, void* d_indexes, void* d_scores, void* d_boxes, void* d_classes, void* countData, + const uint& batchSize, uint64_t& inputSize, uint64_t& outputSize, const float& scoreThreshold, const uint& netWidth, + const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes, + const void* anchors, cudaStream_t stream); cudaError_t cudaRegionLayer( - const void* input, void* output, void* softmax, const uint& batchSize, const uint& netWidth, - const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, - const uint& numBBoxes, uint64_t& outputSize, const void* anchors, cudaStream_t stream) + const void* input, void* softmax, void* d_indexes, void* d_scores, void* d_boxes, void* d_classes, void* countData, + const uint& batchSize, uint64_t& inputSize, uint64_t& outputSize, const float& scoreThreshold, const uint& netWidth, + const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes, + const void* anchors, cudaStream_t stream) { dim3 threads_per_block(16, 16, 4); dim3 number_of_blocks((gridSizeX / threads_per_block.x) + 1, @@ -110,10 +114,13 @@ cudaError_t cudaRegionLayer( for (unsigned int batch = 0; batch < batchSize; ++batch) { gpuRegionLayer<<>>( - reinterpret_cast(input) + (batch * outputSize), - reinterpret_cast(output) + (batch * outputSize), - reinterpret_cast(softmax) + (batch * outputSize), - netWidth, netHeight, gridSizeX, gridSizeY, numOutputClasses, numBBoxes, + reinterpret_cast(input) + (batch * inputSize), + reinterpret_cast(softmax) + (batch * inputSize), + reinterpret_cast(d_indexes) + (batch * outputSize), + reinterpret_cast(d_scores) + (batch * outputSize), + reinterpret_cast(d_boxes) + (batch * 4 * outputSize), + reinterpret_cast(d_classes) + (batch * outputSize), reinterpret_cast(countData) + (batch), + scoreThreshold, netWidth, netHeight, gridSizeX, gridSizeY, numOutputClasses, numBBoxes, reinterpret_cast(anchors)); } return cudaGetLastError(); diff --git a/nvdsinfer_custom_impl_Yolo/yoloPlugins.cpp b/nvdsinfer_custom_impl_Yolo/yoloPlugins.cpp index 86b5324..504e68c 100644 --- a/nvdsinfer_custom_impl_Yolo/yoloPlugins.cpp +++ b/nvdsinfer_custom_impl_Yolo/yoloPlugins.cpp @@ -29,7 +29,6 @@ #include #include -uint kNUM_BBOXES; uint kNUM_CLASSES; namespace { @@ -49,131 +48,108 @@ namespace { } cudaError_t cudaYoloLayer_r( - const void* input, void* output, const uint& batchSize, const uint& netWidth, const uint& netHeight, - const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes, - uint64_t& outputSize, const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream); + const void* input, void* d_indexes, void* d_scores, void* d_boxes, void* d_classes, void* countData, + const uint& batchSize, uint64_t& inputSize, uint64_t& outputSize, const float& scoreThreshold, const uint& netWidth, + const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes, + const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream); cudaError_t cudaYoloLayer_nc( - const void* input, void* output, const uint& batchSize, const uint& netWidth, const uint& netHeight, - const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes, - uint64_t& outputSize, const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream); + const void* input, void* d_indexes, void* d_scores, void* d_boxes, void* d_classes, void* countData, + const uint& batchSize, uint64_t& inputSize, uint64_t& outputSize, const float& scoreThreshold, const uint& netWidth, + const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes, + const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream); cudaError_t cudaYoloLayer( - const void* input, void* output, const uint& batchSize, const uint& netWidth, const uint& netHeight, - const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes, - uint64_t& outputSize, const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream); + const void* input, void* d_indexes, void* d_scores, void* d_boxes, void* d_classes, void* countData, + const uint& batchSize, uint64_t& inputSize, uint64_t& outputSize, const float& scoreThreshold, const uint& netWidth, + const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes, + const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream); cudaError_t cudaRegionLayer( - const void* input, void* output, void* softmax, const uint& batchSize, const uint& netWidth, - const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, - const uint& numBBoxes, uint64_t& outputSize, const void* anchors, cudaStream_t stream); + const void* input, void* softmax, void* d_indexes, void* d_scores, void* d_boxes, void* d_classes, void* countData, + const uint& batchSize, uint64_t& inputSize, uint64_t& outputSize, const float& scoreThreshold, const uint& netWidth, + const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes, + const void* anchors, cudaStream_t stream); + +cudaError_t sortDetections( + void* d_indexes, void* d_scores, void* d_boxes, void* d_classes, void* bboxData, void* scoreData, void* countData, + const uint& batchSize, uint64_t& outputSize, uint& topK, const uint& numOutputClasses, cudaStream_t stream); YoloLayer::YoloLayer (const void* data, size_t length) { const char *d = static_cast(data); - read(d, m_NumBBoxes); - read(d, m_NumClasses); read(d, m_NetWidth); read(d, m_NetHeight); - read(d, m_GridSizeX); - read(d, m_GridSizeY); - read(d, m_Type); + read(d, m_NumClasses); read(d, m_NewCoords); - read(d, m_ScaleXY); read(d, m_OutputSize); + read(d, m_Type); + read(d, m_TopK); + read(d, m_ScoreThreshold); - uint anchorsSize; - read(d, anchorsSize); - for (uint i = 0; i < anchorsSize; i++) { - float result; - read(d, result); - m_Anchors.push_back(result); + uint yoloTensorsSize; + read(d, yoloTensorsSize); + for (uint i = 0; i < yoloTensorsSize; ++i) + { + TensorInfo curYoloTensor; + read(d, curYoloTensor.gridSizeX); + read(d, curYoloTensor.gridSizeY); + read(d, curYoloTensor.numBBoxes); + read(d, curYoloTensor.scaleXY); + + uint anchorsSize; + read(d, anchorsSize); + for (uint j = 0; j < anchorsSize; j++) + { + float result; + read(d, result); + curYoloTensor.anchors.push_back(result); + } + + uint maskSize; + read(d, maskSize); + for (uint j = 0; j < maskSize; j++) + { + int result; + read(d, result); + curYoloTensor.mask.push_back(result); + } + m_YoloTensors.push_back(curYoloTensor); } - uint maskSize; - read(d, maskSize); - for (uint i = 0; i < maskSize; i++) { - int result; - read(d, result); - m_Mask.push_back(result); - } - - if (m_Anchors.size() > 0) { - float* anchors = m_Anchors.data(); - CUDA_CHECK(cudaMallocHost(&p_Anchors, m_Anchors.size() * sizeof(float))); - CUDA_CHECK(cudaMemcpy(p_Anchors, anchors, m_Anchors.size() * sizeof(float), cudaMemcpyHostToDevice)); - } - - if (m_Mask.size() > 0) { - int* mask = m_Mask.data(); - CUDA_CHECK(cudaMallocHost(&p_Mask, m_Mask.size() * sizeof(int))); - CUDA_CHECK(cudaMemcpy(p_Mask, mask, m_Mask.size() * sizeof(int), cudaMemcpyHostToDevice)); - } - - kNUM_BBOXES = m_NumBBoxes; kNUM_CLASSES = m_NumClasses; }; -YoloLayer::YoloLayer ( - const uint& numBBoxes, const uint& numClasses, const uint& netWidth, const uint& netHeight, - const uint& gridSizeX, const uint& gridSizeY, const uint& modelType, const uint& newCoords, - const float& scaleXY, const std::vector anchors, - const std::vector mask) : - m_NumBBoxes(numBBoxes), - m_NumClasses(numClasses), +YoloLayer::YoloLayer( + const uint& netWidth, const uint& netHeight, const uint& numClasses, const uint& newCoords, + const std::vector& yoloTensors, const uint64_t& outputSize, const uint& modelType, const uint& topK, + const float& scoreThreshold) : m_NetWidth(netWidth), m_NetHeight(netHeight), - m_GridSizeX(gridSizeX), - m_GridSizeY(gridSizeY), - m_Type(modelType), + m_NumClasses(numClasses), m_NewCoords(newCoords), - m_ScaleXY(scaleXY), - m_Anchors(anchors), - m_Mask(mask) + m_YoloTensors(yoloTensors), + m_OutputSize(outputSize), + m_Type(modelType), + m_TopK(topK), + m_ScoreThreshold(scoreThreshold) { - assert(m_NumBBoxes > 0); - assert(m_NumClasses > 0); assert(m_NetWidth > 0); assert(m_NetHeight > 0); - assert(m_GridSizeX > 0); - assert(m_GridSizeY > 0); - m_OutputSize = m_GridSizeX * m_GridSizeY * (m_NumBBoxes * (4 + 1 + m_NumClasses)); - - if (m_Anchors.size() > 0) { - float* anchors = m_Anchors.data(); - CUDA_CHECK(cudaMallocHost(&p_Anchors, m_Anchors.size() * sizeof(float))); - CUDA_CHECK(cudaMemcpy(p_Anchors, anchors, m_Anchors.size() * sizeof(float), cudaMemcpyHostToDevice)); - } - - if (m_Mask.size() > 0) { - int* mask = m_Mask.data(); - CUDA_CHECK(cudaMallocHost(&p_Mask, m_Mask.size() * sizeof(int))); - CUDA_CHECK(cudaMemcpy(p_Mask, mask, m_Mask.size() * sizeof(int), cudaMemcpyHostToDevice)); - } - - kNUM_BBOXES = m_NumBBoxes; kNUM_CLASSES = m_NumClasses; }; -YoloLayer::~YoloLayer() -{ - if (m_Anchors.size() > 0) { - CUDA_CHECK(cudaFreeHost(p_Anchors)); - } - if (m_Mask.size() > 0) { - CUDA_CHECK(cudaFreeHost(p_Mask)); - } -} - nvinfer1::Dims YoloLayer::getOutputDimensions( int index, const nvinfer1::Dims* inputs, int nbInputDims) noexcept { - assert(index == 0); - assert(nbInputDims == 1); - return inputs[0]; + assert(index < 3); + if (index == 0) { + return nvinfer1::Dims3(m_TopK, 1, 4); + } + return nvinfer1::DimsHW(m_TopK, m_NumClasses); } bool YoloLayer::supportsFormat ( @@ -188,43 +164,116 @@ YoloLayer::configureWithFormat ( const nvinfer1::Dims* outputDims, int nbOutputs, nvinfer1::DataType type, nvinfer1::PluginFormat format, int maxBatchSize) noexcept { - assert(nbInputs == 1); + assert(nbInputs > 0); assert(format == nvinfer1::PluginFormat::kLINEAR); assert(inputDims != nullptr); } int32_t YoloLayer::enqueue ( - int32_t batchSize, void const* const* inputs, void* const* outputs, void* workspace, + int batchSize, void const* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) noexcept { - if (m_Type == 2) { // YOLOR incorrect param: scale_x_y = 2.0 - CUDA_CHECK(cudaYoloLayer_r( - inputs[0], outputs[0], batchSize, m_NetWidth, m_NetHeight, m_GridSizeX, m_GridSizeY, - m_NumClasses, m_NumBBoxes, m_OutputSize, 2.0, p_Anchors, p_Mask, stream)); - } - else if (m_Type == 1) { - if (m_NewCoords) { - CUDA_CHECK(cudaYoloLayer_nc( - inputs[0], outputs[0], batchSize, m_NetWidth, m_NetHeight, m_GridSizeX, m_GridSizeY, - m_NumClasses, m_NumBBoxes, m_OutputSize, m_ScaleXY, p_Anchors, p_Mask, stream)); + void* countData = workspace; + void* bboxData = outputs[0]; + void* scoreData = outputs[1]; + + CUDA_CHECK(cudaMemsetAsync((int*)countData, 0, sizeof(int) * batchSize, stream)); + CUDA_CHECK(cudaMemsetAsync((float*)bboxData, 0, sizeof(float) * m_TopK * 4 * batchSize, stream)); + CUDA_CHECK(cudaMemsetAsync((float*)scoreData, 0, sizeof(float) * m_TopK * m_NumClasses * batchSize, stream)); + + void* d_indexes; + CUDA_CHECK(cudaMallocHost(&d_indexes, sizeof(int) * m_OutputSize * batchSize)); + CUDA_CHECK(cudaMemsetAsync((float*)d_indexes, 0, sizeof(int) * m_OutputSize * batchSize, stream)); + + void* d_scores; + CUDA_CHECK(cudaMallocHost(&d_scores, sizeof(float) * m_OutputSize * batchSize)); + CUDA_CHECK(cudaMemsetAsync((float*)d_scores, 0, sizeof(float) * m_OutputSize * batchSize, stream)); + + void* d_boxes; + CUDA_CHECK(cudaMallocHost(&d_boxes, sizeof(float) * m_OutputSize * 4 * batchSize)); + CUDA_CHECK(cudaMemsetAsync((float*)d_boxes, 0, sizeof(float) * m_OutputSize * 4 * batchSize, stream)); + + void* d_classes; + CUDA_CHECK(cudaMallocHost(&d_classes, sizeof(int) * m_OutputSize * batchSize)); + CUDA_CHECK(cudaMemsetAsync((float*)d_classes, 0, sizeof(int) * m_OutputSize * batchSize, stream)); + + uint yoloTensorsSize = m_YoloTensors.size(); + for (uint i = 0; i < yoloTensorsSize; ++i) + { + TensorInfo& curYoloTensor = m_YoloTensors.at(i); + + uint numBBoxes = curYoloTensor.numBBoxes; + float scaleXY = curYoloTensor.scaleXY; + uint gridSizeX = curYoloTensor.gridSizeX; + uint gridSizeY = curYoloTensor.gridSizeY; + std::vector anchors = curYoloTensor.anchors; + std::vector mask = curYoloTensor.mask; + + void* v_anchors; + void* v_mask; + if (anchors.size() > 0) { + float* f_anchors = anchors.data(); + CUDA_CHECK(cudaMallocHost(&v_anchors, sizeof(float) * anchors.size())); + CUDA_CHECK(cudaMemcpy(v_anchors, f_anchors, sizeof(float) * anchors.size(), cudaMemcpyHostToDevice)); + } + if (mask.size() > 0) { + int* f_mask = mask.data(); + CUDA_CHECK(cudaMallocHost(&v_mask, sizeof(int) * mask.size())); + CUDA_CHECK(cudaMemcpy(v_mask, f_mask, sizeof(int) * mask.size(), cudaMemcpyHostToDevice)); + } + + uint64_t inputSize = gridSizeX * gridSizeY * (numBBoxes * (4 + 1 + m_NumClasses)); + + if (m_Type == 2) { // YOLOR incorrect param: scale_x_y = 2.0 + CUDA_CHECK(cudaYoloLayer_r( + inputs[i], d_indexes, d_scores, d_boxes, d_classes, countData, batchSize, inputSize, m_OutputSize, + m_ScoreThreshold, m_NetWidth, m_NetHeight, gridSizeX, gridSizeY, m_NumClasses, numBBoxes, 2.0, v_anchors, + v_mask, stream)); + } + else if (m_Type == 1) { + if (m_NewCoords) { + CUDA_CHECK(cudaYoloLayer_nc( + inputs[i], d_indexes, d_scores, d_boxes, d_classes, countData, batchSize, inputSize, m_OutputSize, + m_ScoreThreshold, m_NetWidth, m_NetHeight, gridSizeX, gridSizeY, m_NumClasses, numBBoxes, scaleXY, + v_anchors, v_mask, stream)); + } + else { + CUDA_CHECK(cudaYoloLayer( + inputs[i], d_indexes, d_scores, d_boxes, d_classes, countData, batchSize, inputSize, m_OutputSize, + m_ScoreThreshold, m_NetWidth, m_NetHeight, gridSizeX, gridSizeY, m_NumClasses, numBBoxes, scaleXY, + v_anchors, v_mask, stream)); + } } else { - CUDA_CHECK(cudaYoloLayer( - inputs[0], outputs[0], batchSize, m_NetWidth, m_NetHeight, m_GridSizeX, m_GridSizeY, - m_NumClasses, m_NumBBoxes, m_OutputSize, m_ScaleXY, p_Anchors, p_Mask, stream)); + void* softmax; + CUDA_CHECK(cudaMallocHost(&softmax, sizeof(float) * inputSize * batchSize)); + CUDA_CHECK(cudaMemsetAsync((float*)softmax, 0, sizeof(float) * inputSize * batchSize)); + + CUDA_CHECK(cudaRegionLayer( + inputs[i], softmax, d_indexes, d_scores, d_boxes, d_classes, countData, batchSize, inputSize, m_OutputSize, + m_ScoreThreshold, m_NetWidth, m_NetHeight, gridSizeX, gridSizeY, m_NumClasses, numBBoxes, v_anchors, + stream)); + + CUDA_CHECK(cudaFreeHost(softmax)); + } + + if (anchors.size() > 0) { + CUDA_CHECK(cudaFreeHost(v_anchors)); + } + if (mask.size() > 0) { + CUDA_CHECK(cudaFreeHost(v_mask)); } } - else { - void* softmax; - cudaMallocHost(&softmax, sizeof(outputs[0])); - cudaMemcpy(softmax, outputs[0], sizeof(outputs[0]), cudaMemcpyHostToDevice); - CUDA_CHECK(cudaRegionLayer( - inputs[0], outputs[0], softmax, batchSize, m_NetWidth, m_NetHeight, m_GridSizeX, m_GridSizeY, - m_NumClasses, m_NumBBoxes, m_OutputSize, p_Anchors, stream)); + CUDA_CHECK(sortDetections( + d_indexes, d_scores, d_boxes, d_classes, bboxData, scoreData, countData, batchSize, m_OutputSize, m_TopK, + m_NumClasses, stream)); + + CUDA_CHECK(cudaFreeHost(d_indexes)); + CUDA_CHECK(cudaFreeHost(d_scores)); + CUDA_CHECK(cudaFreeHost(d_boxes)); + CUDA_CHECK(cudaFreeHost(d_classes)); - CUDA_CHECK(cudaFreeHost(softmax)); - } return 0; } @@ -232,18 +281,28 @@ size_t YoloLayer::getSerializationSize() const noexcept { size_t totalSize = 0; - totalSize += sizeof(m_NumBBoxes); - totalSize += sizeof(m_NumClasses); totalSize += sizeof(m_NetWidth); totalSize += sizeof(m_NetHeight); - totalSize += sizeof(m_GridSizeX); - totalSize += sizeof(m_GridSizeY); - totalSize += sizeof(m_Type); + totalSize += sizeof(m_NumClasses); totalSize += sizeof(m_NewCoords); - totalSize += sizeof(m_ScaleXY); totalSize += sizeof(m_OutputSize); - totalSize += sizeof(uint) + sizeof(m_Anchors[0]) * m_Anchors.size(); - totalSize += sizeof(uint) + sizeof(m_Mask[0]) * m_Mask.size(); + totalSize += sizeof(m_Type); + totalSize += sizeof(m_TopK); + totalSize += sizeof(m_ScoreThreshold); + + uint yoloTensorsSize = m_YoloTensors.size(); + totalSize += sizeof(yoloTensorsSize); + + for (uint i = 0; i < yoloTensorsSize; ++i) + { + const TensorInfo& curYoloTensor = m_YoloTensors.at(i); + totalSize += sizeof(curYoloTensor.gridSizeX); + totalSize += sizeof(curYoloTensor.gridSizeY); + totalSize += sizeof(curYoloTensor.numBBoxes); + totalSize += sizeof(curYoloTensor.scaleXY); + totalSize += sizeof(uint) + sizeof(curYoloTensor.anchors[0]) * curYoloTensor.anchors.size(); + totalSize += sizeof(uint) + sizeof(curYoloTensor.mask[0]) * curYoloTensor.mask.size(); + } return totalSize; } @@ -252,35 +311,46 @@ void YoloLayer::serialize(void* buffer) const noexcept { char *d = static_cast(buffer); - write(d, m_NumBBoxes); - write(d, m_NumClasses); write(d, m_NetWidth); write(d, m_NetHeight); - write(d, m_GridSizeX); - write(d, m_GridSizeY); - write(d, m_Type); + write(d, m_NumClasses); write(d, m_NewCoords); - write(d, m_ScaleXY); write(d, m_OutputSize); + write(d, m_Type); + write(d, m_TopK); + write(d, m_ScoreThreshold); - uint anchorsSize = m_Anchors.size(); - write(d, anchorsSize); - for (uint i = 0; i < anchorsSize; i++) { - write(d, m_Anchors[i]); - } + uint yoloTensorsSize = m_YoloTensors.size(); + write(d, yoloTensorsSize); + for (uint i = 0; i < yoloTensorsSize; ++i) + { + const TensorInfo& curYoloTensor = m_YoloTensors.at(i); + write(d, curYoloTensor.gridSizeX); + write(d, curYoloTensor.gridSizeY); + write(d, curYoloTensor.numBBoxes); + write(d, curYoloTensor.scaleXY); - uint maskSize = m_Mask.size(); - write(d, maskSize); - for (uint i = 0; i < maskSize; i++) { - write(d, m_Mask[i]); + uint anchorsSize = curYoloTensor.anchors.size(); + write(d, anchorsSize); + for (uint j = 0; j < anchorsSize; ++j) + { + write(d, curYoloTensor.anchors[j]); + } + + uint maskSize = curYoloTensor.mask.size(); + write(d, maskSize); + for (uint j = 0; j < maskSize; ++j) + { + write(d, curYoloTensor.mask[j]); + } } } nvinfer1::IPluginV2* YoloLayer::clone() const noexcept { return new YoloLayer ( - m_NumBBoxes, m_NumClasses, m_NetWidth, m_NetHeight, m_GridSizeX, m_GridSizeY, m_Type, - m_NewCoords, m_ScaleXY, m_Anchors, m_Mask); + m_NetWidth, m_NetHeight, m_NumClasses, m_NewCoords, m_YoloTensors, m_OutputSize, m_Type, m_TopK, + m_ScoreThreshold); } REGISTER_TENSORRT_PLUGIN(YoloLayerPluginCreator); diff --git a/nvdsinfer_custom_impl_Yolo/yoloPlugins.h b/nvdsinfer_custom_impl_Yolo/yoloPlugins.h index f0d3a2d..e7ae524 100644 --- a/nvdsinfer_custom_impl_Yolo/yoloPlugins.h +++ b/nvdsinfer_custom_impl_Yolo/yoloPlugins.h @@ -36,14 +36,16 @@ #include "NvInferPlugin.h" -#define CUDA_CHECK(status) \ - { \ - if (status != 0) \ - { \ - std::cout << "CUDA failure: " << cudaGetErrorString(status) << " in file " << __FILE__ \ - << " at line " << __LINE__ << std::endl; \ - abort(); \ - } \ +#include "yolo.h" + +#define CUDA_CHECK(status) \ + { \ + if (status != 0) \ + { \ + std::cout << "CUDA failure: " << cudaGetErrorString(status) << " in file " << __FILE__ << " at line " \ + << __LINE__ << std::endl; \ + abort(); \ + } \ } namespace @@ -56,15 +58,17 @@ class YoloLayer : public nvinfer1::IPluginV2 { public: YoloLayer (const void* data, size_t length); + YoloLayer ( - const uint& numBBoxes, const uint& numClasses, const uint& netWidth, const uint& netHeight, - const uint& gridSizeX, const uint& gridSizeY, const uint& modelType, const uint& newCoords, - const float& scaleXY, const std::vector anchors, - const std::vector mask); - ~YoloLayer (); + const uint& netWidth, const uint& netHeight, const uint& numClasses, const uint& newCoords, + const std::vector& yoloTensors, const uint64_t& outputSize, const uint& modelType, const uint& topK, + const float& scoreThreshold); + const char* getPluginType () const noexcept override { return YOLOLAYER_PLUGIN_NAME; } + const char* getPluginVersion () const noexcept override { return YOLOLAYER_PLUGIN_VERSION; } - int getNbOutputs () const noexcept override { return 1; } + + int getNbOutputs () const noexcept override { return 2; } nvinfer1::Dims getOutputDimensions ( int index, const nvinfer1::Dims* inputs, @@ -74,53 +78,59 @@ public: nvinfer1::DataType type, nvinfer1::PluginFormat format) const noexcept override; void configureWithFormat ( - const nvinfer1::Dims* inputDims, int nbInputs, - const nvinfer1::Dims* outputDims, int nbOutputs, + const nvinfer1::Dims* inputDims, int nbInputs, const nvinfer1::Dims* outputDims, int nbOutputs, nvinfer1::DataType type, nvinfer1::PluginFormat format, int maxBatchSize) noexcept override; int initialize () noexcept override { return 0; } + void terminate () noexcept override {} - size_t getWorkspaceSize (int maxBatchSize) const noexcept override { return 0; } + + size_t getWorkspaceSize (int maxBatchSize) const noexcept override { + return maxBatchSize * sizeof(int); + } + int32_t enqueue ( - int32_t batchSize, void const* const* inputs, void* const* outputs, - void* workspace, cudaStream_t stream) noexcept override; + int batchSize, void const* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) + noexcept override; + size_t getSerializationSize() const noexcept override; + void serialize (void* buffer) const noexcept override; + void destroy () noexcept override { delete this; } + nvinfer1::IPluginV2* clone() const noexcept override; void setPluginNamespace (const char* pluginNamespace) noexcept override { m_Namespace = pluginNamespace; } + virtual const char* getPluginNamespace () const noexcept override { return m_Namespace.c_str(); } private: std::string m_Namespace {""}; - uint m_NumBBoxes {0}; - uint m_NumClasses {0}; uint m_NetWidth {0}; uint m_NetHeight {0}; - uint m_GridSizeX {0}; - uint m_GridSizeY {0}; - uint m_Type {0}; + uint m_NumClasses {0}; uint m_NewCoords {0}; - float m_ScaleXY {0}; - std::vector m_Anchors; - std::vector m_Mask; + std::vector m_YoloTensors; uint64_t m_OutputSize {0}; - void* p_Anchors; - void* p_Mask; + uint m_Type {0}; + uint m_TopK {0}; + float m_ScoreThreshold {0}; }; class YoloLayerPluginCreator : public nvinfer1::IPluginCreator { public: YoloLayerPluginCreator () {} + ~YoloLayerPluginCreator () {} const char* getPluginName () const noexcept override { return YOLOLAYER_PLUGIN_NAME; } + const char* getPluginVersion () const noexcept override { return YOLOLAYER_PLUGIN_VERSION; } const nvinfer1::PluginFieldCollection* getFieldNames() noexcept override { @@ -153,7 +163,6 @@ private: std::string m_Namespace {""}; }; -extern uint kNUM_BBOXES; extern uint kNUM_CLASSES; #endif // __YOLO_PLUGINS__ diff --git a/readme.md b/readme.md index 7c5f5a2..556c25e 100644 --- a/readme.md +++ b/readme.md @@ -25,7 +25,7 @@ NVIDIA DeepStream SDK 6.1 / 6.0.1 / 6.0 configuration for YOLO models * YOLOR native support * Models benchmarks (**outdated**) * **GPU YOLO Decoder (moved from CPU to GPU to get better performance)** [#138](https://github.com/marcoslucianops/DeepStream-Yolo/issues/138) -* **Improved NMS** [#142](https://github.com/marcoslucianops/DeepStream-Yolo/issues/142) +* **GPU Batched NMS** [#142](https://github.com/marcoslucianops/DeepStream-Yolo/issues/142) ## @@ -38,6 +38,7 @@ NVIDIA DeepStream SDK 6.1 / 6.0.1 / 6.0 configuration for YOLO models * [Basic usage](#basic-usage) * [YOLOv5 usage](#yolov5-usage) * [YOLOR usage](#yolor-usage) +* [NMS configuration](#nms-configuration) * [INT8 calibration](#int8-calibration) * [Using your custom model](docs/customModels.md) @@ -101,67 +102,7 @@ NVIDIA DeepStream SDK 6.1 / 6.0.1 / 6.0 configuration for YOLO models ### Benchmarks -``` -nms-iou-threshold = 0.6 -pre-cluster-threshold = 0.001 (mAP eval) / 0.25 (FPS measurement) -batch-size = 1 -valid = val2017 (COCO) - 1000 random images for INT8 calibration -sample = 1920x1080 video -NOTE: Used maintain-aspect-ratio=1 in config_infer file for YOLOv4 (with letter_box=1), YOLOv5 and YOLOR models. -``` - -#### NVIDIA GTX 1050 4GB (Mobile) - -##### YOLOR-CSP performance comparison - -| | DeepStream | PyTorch | -|:---------------------:|:----------:|:-------:| -| FPS (without display) | 13.32 | 10.07 | -| FPS (with display) | 12.63 | 9.41 | - -##### YOLOv5n performance comparison - -| | DeepStream | TensorRTx | Ultralytics | -|:---------------------:|:----------:|:---------:|:-----------:| -| FPS (without display) | 110.25 | 87.42 | 97.19 | -| FPS (with display) | 105.62 | 73.07 | 50.37 | - -
More -
- -| DeepStream | Precision | Resolution | IoU=0.5:0.95 | IoU=0.5 | IoU=0.75 | FPS
(without display) | -|:------------------:|:---------:|:----------:|:------------:|:-------:|:--------:|:--------------------------:| -| YOLOR-P6 | FP32 | 1280 | 0.478 | 0.663 | 0.519 | 5.53 | -| YOLOR-CSP-X* | FP32 | 640 | 0.473 | 0.664 | 0.513 | 7.59 | -| YOLOR-CSP-X | FP32 | 640 | 0.470 | 0.661 | 0.507 | 7.52 | -| YOLOR-CSP* | FP32 | 640 | 0.459 | 0.652 | 0.496 | 13.28 | -| YOLOR-CSP | FP32 | 640 | 0.449 | 0.639 | 0.483 | 13.32 | -| YOLOv5x6 6.0 | FP32 | 1280 | 0.504 | 0.681 | 0.547 | 2.22 | -| YOLOv5l6 6.0 | FP32 | 1280 | 0.492 | 0.670 | 0.535 | 4.05 | -| YOLOv5m6 6.0 | FP32 | 1280 | 0.463 | 0.642 | 0.504 | 7.54 | -| YOLOv5s6 6.0 | FP32 | 1280 | 0.394 | 0.572 | 0.424 | 18.64 | -| YOLOv5n6 6.0 | FP32 | 1280 | 0.294 | 0.452 | 0.314 | 26.94 | -| YOLOv5x 6.0 | FP32 | 640 | 0.469 | 0.654 | 0.509 | 8.24 | -| YOLOv5l 6.0 | FP32 | 640 | 0.450 | 0.634 | 0.487 | 14.96 | -| YOLOv5m 6.0 | FP32 | 640 | 0.415 | 0.601 | 0.448 | 28.30 | -| YOLOv5s 6.0 | FP32 | 640 | 0.334 | 0.516 | 0.355 | 63.55 | -| YOLOv5n 6.0 | FP32 | 640 | 0.250 | 0.417 | 0.260 | 110.25 | -| YOLOv4-P6 | FP32 | 1280 | 0.499 | 0.685 | 0.542 | 2.57 | -| YOLOv4-P5 | FP32 | 896 | 0.472 | 0.659 | 0.513 | 5.48 | -| YOLOv4-CSP-X-SWISH | FP32 | 640 | 0.473 | 0.664 | 0.513 | 7.51 | -| YOLOv4-CSP-SWISH | FP32 | 640 | 0.459 | 0.652 | 0.496 | 13.13 | -| YOLOv4x-MISH | FP32 | 640 | 0.459 | 0.650 | 0.495 | 7.53 | -| YOLOv4-CSP | FP32 | 640 | 0.440 | 0.632 | 0.474 | 13.19 | -| YOLOv4 | FP32 | 608 | 0.498 | 0.740 | 0.549 | 12.18 | -| YOLOv4-Tiny | FP32 | 416 | 0.215 | 0.403 | 0.206 | 201.20 | -| YOLOv3-SPP | FP32 | 608 | 0.411 | 0.686 | 0.433 | 12.22 | -| YOLOv3-Tiny-PRN | FP32 | 416 | 0.167 | 0.382 | 0.125 | 277.14 | -| YOLOv3 | FP32 | 608 | 0.377 | 0.672 | 0.385 | 12.51 | -| YOLOv3-Tiny | FP32 | 416 | 0.095 | 0.203 | 0.079 | 218.42 | -| YOLOv2 | FP32 | 608 | 0.286 | 0.541 | 0.273 | 25.28 | -| YOLOv2-Tiny | FP32 | 416 | 0.102 | 0.258 | 0.061 | 231.36 | - -
+New tests comming soon. ## @@ -221,9 +162,11 @@ wget https://us.download.nvidia.com/tesla/510.47.03/NVIDIA-Linux-x86_64-510.47.0 * Run ``` -sudo sh NVIDIA-Linux-x86_64-510.47.03.run --silent --disable-nouveau +sudo sh NVIDIA-Linux-x86_64-510.47.03.run --silent --disable-nouveau --dkms --install-libglvnd ``` +**NOTE**: This step will disable the nouveau drivers. + * Reboot ``` @@ -233,7 +176,7 @@ sudo reboot * Install ``` -sudo sh NVIDIA-Linux-x86_64-510.47.03.run --silent --dkms --install-libglvnd +sudo sh NVIDIA-Linux-x86_64-510.47.03.run --silent --disable-nouveau --dkms --install-libglvnd ``` **NOTE**: If you are using a laptop with NVIDIA Optimius, run @@ -326,7 +269,7 @@ sudo apt install libssl1.0.0 libgstreamer1.0-0 gstreamer1.0-tools gstreamer1.0-p sudo apt-get install linux-headers-$(uname -r) ``` -**NOTE**: Install DKMS only if you are using the default Ubuntu kernel +**NOTE**: Install DKMS only if you are using the default Ubuntu kernel. ``` sudo apt-get install dkms @@ -369,9 +312,11 @@ wget https://us.download.nvidia.com/tesla/470.129.06/NVIDIA-Linux-x86_64-470.129 * Run ``` -sudo sh NVIDIA-Linux-x86_64-470.129.06.run --silent --disable-nouveau +sudo sh NVIDIA-Linux-x86_64-470.129.06.run --silent --disable-nouveau --dkms --install-libglvnd ``` +**NOTE**: This step will disable the nouveau drivers. + * Reboot ``` @@ -381,7 +326,7 @@ sudo reboot * Install ``` -sudo sh NVIDIA-Linux-x86_64-470.129.06.run --silent --dkms --install-libglvnd +sudo sh NVIDIA-Linux-x86_64-470.129.06.run --silent --disable-nouveau --dkms --install-libglvnd ``` **NOTE**: If you are using a laptop with NVIDIA Optimius, run @@ -519,11 +464,6 @@ network-mode=0 # Number of classes in label file num-detected-classes=80 ... -[class-attrs-all] -# IOU threshold -nms-iou-threshold=0.45 -# Score threshold -pre-cluster-threshold=0.25 ``` #### 5. Run @@ -542,6 +482,7 @@ gpu-id=0 gie-unique-id=1 nvbuf-memory-type=0 config-file=config_infer_primary_yoloV2.txt +... ``` ## @@ -618,11 +559,6 @@ network-mode=0 # Number of classes in label file num-detected-classes=80 ... -[class-attrs-all] -# IOU threshold -nms-iou-threshold=0.45 -# Score threshold -pre-cluster-threshold=0.25 ``` #### 8. Change the deepstream_app_config.txt file @@ -749,11 +685,6 @@ network-mode=0 # Number of classes in label file num-detected-classes=80 ... -[class-attrs-all] -# IOU threshold -nms-iou-threshold=0.5 -# Score threshold -pre-cluster-threshold=0.25 ``` #### 8. Change the deepstream_app_config.txt file @@ -776,6 +707,23 @@ deepstream-app -c deepstream_app_config.txt ## +### NMS Configuration + +To change the `iou-threshold`, `score-threshold` and `topk` values, modify the `config_nms.txt` file and regenerate the model engine file. + +**NOTE**: Lower `topk` values will result in more performance. + +**NOTE**: Make sure to set cluster-mode=4 and pre-cluster-threshold=0 in config_infer file. + +``` +[property] +iou-threshold=0.45 +score-threshold=0.25 +topk=300 +``` + +## + ### INT8 calibration #### 1. Install OpenCV diff --git a/utils/gen_wts_yoloV5.py b/utils/gen_wts_yoloV5.py index cde904a..24b49f6 100644 --- a/utils/gen_wts_yoloV5.py +++ b/utils/gen_wts_yoloV5.py @@ -144,6 +144,7 @@ with open(cfg_file, "w") as c: c.write("width=%d\n" % model_width) c.write("height=%d\n" % model_height) c.write("channels=%d\n" % model_channels) + c.write("letter_box=1\n") nc = 0 depth_multiple = 0 width_multiple = 0