From f80aa10cf205f6c62c9bcc28afab7eb77074844b Mon Sep 17 00:00:00 2001
From: Marcos Luciano
Date: Sun, 19 Jun 2022 12:12:04 -0300
Subject: [PATCH] GPU Batched NMS

---
 .../nvdsparsebbox_Yolo.cpp                   | 56 +++++++------------
 nvdsinfer_custom_impl_Yolo/sortDetections.cu | 19 +++++--
 nvdsinfer_custom_impl_Yolo/yoloForward.cu    |  7 +--
 nvdsinfer_custom_impl_Yolo/yoloForward_nc.cu |  7 +--
 nvdsinfer_custom_impl_Yolo/yoloForward_r.cu  |  7 +--
 nvdsinfer_custom_impl_Yolo/yoloForward_v2.cu |  7 +--
 6 files changed, 47 insertions(+), 56 deletions(-)

diff --git a/nvdsinfer_custom_impl_Yolo/nvdsparsebbox_Yolo.cpp b/nvdsinfer_custom_impl_Yolo/nvdsparsebbox_Yolo.cpp
index 1b45fbb..345d143 100644
--- a/nvdsinfer_custom_impl_Yolo/nvdsparsebbox_Yolo.cpp
+++ b/nvdsinfer_custom_impl_Yolo/nvdsparsebbox_Yolo.cpp
@@ -32,14 +32,11 @@
 #include "yoloPlugins.h"
 
 extern "C" bool NvDsInferParseYolo(
-    std::vector<NvDsInferLayerInfo> const& outputLayersInfo,
-    NvDsInferNetworkInfo const& networkInfo,
-    NvDsInferParseDetectionParams const& detectionParams,
-    std::vector<NvDsInferParseObjectInfo>& objectList);
+    std::vector<NvDsInferLayerInfo> const& outputLayersInfo, NvDsInferNetworkInfo const& networkInfo,
+    NvDsInferParseDetectionParams const& detectionParams, std::vector<NvDsInferParseObjectInfo>& objectList);
 
 static NvDsInferParseObjectInfo convertBBox(
-    const float& bx1, const float& by1, const float& bx2,
-    const float& by2, const uint& netW, const uint& netH)
+    const float& bx1, const float& by1, const float& bx2, const float& by2, const uint& netW, const uint& netH)
 {
   NvDsInferParseObjectInfo b;
 
@@ -62,9 +59,8 @@ static NvDsInferParseObjectInfo convertBBox(
 }
 
 static void addBBoxProposal(
-    const float bx1, const float by1, const float bx2, const float by2,
-    const uint& netW, const uint& netH, const int maxIndex,
-    const float maxProb, std::vector<NvDsInferParseObjectInfo>& binfo)
+    const float bx1, const float by1, const float bx2, const float by2, const uint& netW, const uint& netH,
+    const int maxIndex, const float maxProb, std::vector<NvDsInferParseObjectInfo>& binfo)
 {
   NvDsInferParseObjectInfo bbi = convertBBox(bx1, by1, bx2, by2, netW, netH);
   if (bbi.width < 1 || bbi.height < 1) return;
@@ -75,14 +71,11 @@
 static std::vector<NvDsInferParseObjectInfo> decodeYoloTensor(
-    const int* counts, const float* boxes,
-    const float* scores, const float* classes,
-    const uint& netW, const uint& netH)
+    const int* counts, const float* boxes, const float* scores, const float* classes, const uint& netW, const uint& netH)
 {
   std::vector<NvDsInferParseObjectInfo> binfo;
 
   uint numBoxes = counts[0];
-
   for (uint b = 0; b < numBoxes; ++b)
   {
     float bx1 = boxes[b * 4 + 0];
@@ -99,10 +92,8 @@
 static bool NvDsInferParseCustomYolo(
-    std::vector<NvDsInferLayerInfo> const& outputLayersInfo,
-    NvDsInferNetworkInfo const& networkInfo,
-    NvDsInferParseDetectionParams const& detectionParams,
-    std::vector<NvDsInferParseObjectInfo>& objectList,
+    std::vector<NvDsInferLayerInfo> const& outputLayersInfo, NvDsInferNetworkInfo const& networkInfo,
+    NvDsInferParseDetectionParams const& detectionParams, std::vector<NvDsInferParseObjectInfo>& objectList,
     const uint &numClasses)
 {
   if (outputLayersInfo.empty())
@@ -113,28 +104,23 @@ static bool NvDsInferParseCustomYolo(
   if (numClasses != detectionParams.numClassesConfigured)
   {
-    std::cerr << "WARNING: Num classes mismatch. Configured: "
-        << detectionParams.numClassesConfigured
+    std::cerr << "WARNING: Num classes mismatch. Configured: " << detectionParams.numClassesConfigured
Configured: " << detectionParams.numClassesConfigured << ", detected by network: " << numClasses << std::endl; } std::vector objects; - for (uint idx = 0; idx < outputLayersInfo.size() / 4; ++idx) - { - const NvDsInferLayerInfo &counts = outputLayersInfo[idx * 4 + 0]; - const NvDsInferLayerInfo &boxes = outputLayersInfo[idx * 4 + 1]; - const NvDsInferLayerInfo &scores = outputLayersInfo[idx * 4 + 2]; - const NvDsInferLayerInfo &classes = outputLayersInfo[idx * 4 + 3]; + const NvDsInferLayerInfo &counts = outputLayersInfo[0]; + const NvDsInferLayerInfo &boxes = outputLayersInfo[1]; + const NvDsInferLayerInfo &scores = outputLayersInfo[2]; + const NvDsInferLayerInfo &classes = outputLayersInfo[3]; - std::vector outObjs = - decodeYoloTensor( - (const int*)(counts.buffer), (const float*)(boxes.buffer), - (const float*)(scores.buffer), (const float*)(classes.buffer), - networkInfo.width, networkInfo.height); + std::vector outObjs = + decodeYoloTensor( + (const int*)(counts.buffer), (const float*)(boxes.buffer), (const float*)(scores.buffer), + (const float*)(classes.buffer), networkInfo.width, networkInfo.height); - objects.insert(objects.end(), outObjs.begin(), outObjs.end()); - } + objects.insert(objects.end(), outObjs.begin(), outObjs.end()); objectList = objects; @@ -142,10 +128,8 @@ static bool NvDsInferParseCustomYolo( } extern "C" bool NvDsInferParseYolo( - std::vector const& outputLayersInfo, - NvDsInferNetworkInfo const& networkInfo, - NvDsInferParseDetectionParams const& detectionParams, - std::vector& objectList) + std::vector const& outputLayersInfo, NvDsInferNetworkInfo const& networkInfo, + NvDsInferParseDetectionParams const& detectionParams, std::vector& objectList) { int num_classes = kNUM_CLASSES; diff --git a/nvdsinfer_custom_impl_Yolo/sortDetections.cu b/nvdsinfer_custom_impl_Yolo/sortDetections.cu index 1463644..5a7b219 100644 --- a/nvdsinfer_custom_impl_Yolo/sortDetections.cu +++ b/nvdsinfer_custom_impl_Yolo/sortDetections.cu @@ -7,10 +7,13 @@ __global__ void sortOutput( int* d_indexes, float* d_scores, float* d_boxes, int* d_classes, float* bboxData, float* scoreData, - const uint numOutputClasses) + const uint numOutputClasses, const int topk) { uint x_id = blockIdx.x * blockDim.x + threadIdx.x; + if (x_id >= topk) + return; + int index = d_indexes[x_id]; int maxIndex = d_classes[index]; bboxData[x_id * 4 + 0] = d_boxes[index * 4 + 0]; @@ -67,12 +70,20 @@ cudaError_t sortDetections( cudaMemcpy(_d_scores, d_keys_out, count * sizeof(float), cudaMemcpyDeviceToDevice); cudaMemcpy(_d_indexes, d_values_out, count * sizeof(int), cudaMemcpyDeviceToDevice); - int threads_per_block = count < topK ? count : topK; + int _topK = count < topK ? 
+  int _topK = count < topK ? count : topK;
+  int threads_per_block = 0;
+  int number_of_blocks = 4;
+
+  if (_topK % 2 == 0)
+    threads_per_block = _topK / number_of_blocks;
+  else
+    threads_per_block = (_topK / number_of_blocks) + 1;
+
+  sortOutput<<<number_of_blocks, threads_per_block, 0, stream>>>(
       _d_indexes, _d_scores, reinterpret_cast<float*>(d_boxes) + (batch * 4 * outputSize),
       reinterpret_cast<int*>(d_classes) + (batch * outputSize), reinterpret_cast<float*>(bboxData) + (batch * topK),
-      reinterpret_cast<float*>(scoreData) + (batch * topK), numOutputClasses);
+      reinterpret_cast<float*>(scoreData) + (batch * topK), numOutputClasses, _topK);
 
   cudaFree(d_keys_out);
   cudaFree(d_values_out);
diff --git a/nvdsinfer_custom_impl_Yolo/yoloForward.cu b/nvdsinfer_custom_impl_Yolo/yoloForward.cu
index 7c827b4..fb03854 100644
--- a/nvdsinfer_custom_impl_Yolo/yoloForward.cu
+++ b/nvdsinfer_custom_impl_Yolo/yoloForward.cu
@@ -16,10 +16,8 @@ __global__ void gpuYoloLayer(
   uint y_id = blockIdx.y * blockDim.y + threadIdx.y;
   uint z_id = blockIdx.z * blockDim.z + threadIdx.z;
 
-  if ((x_id >= gridSizeX) || (y_id >= gridSizeY) || (z_id >= numBBoxes))
-  {
+  if (x_id >= gridSizeX || y_id >= gridSizeY || z_id >= numBBoxes)
     return;
-  }
 
   const int numGridCells = gridSizeX * gridSizeY;
   const int bbindex = y_id * gridSizeX + x_id;
@@ -27,7 +25,8 @@ __global__ void gpuYoloLayer(
   const float objectness
       = sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)]);
 
-  if (objectness < scoreThreshold) return;
+  if (objectness < scoreThreshold)
+    return;
 
   int count = (int)atomicAdd(&countData[0], 1);
 
diff --git a/nvdsinfer_custom_impl_Yolo/yoloForward_nc.cu b/nvdsinfer_custom_impl_Yolo/yoloForward_nc.cu
index 7e4872d..ff850c3 100644
--- a/nvdsinfer_custom_impl_Yolo/yoloForward_nc.cu
+++ b/nvdsinfer_custom_impl_Yolo/yoloForward_nc.cu
@@ -14,10 +14,8 @@ __global__ void gpuYoloLayer_nc(
   uint y_id = blockIdx.y * blockDim.y + threadIdx.y;
   uint z_id = blockIdx.z * blockDim.z + threadIdx.z;
 
-  if ((x_id >= gridSizeX) || (y_id >= gridSizeY) || (z_id >= numBBoxes))
-  {
+  if (x_id >= gridSizeX || y_id >= gridSizeY || z_id >= numBBoxes)
     return;
-  }
 
   const int numGridCells = gridSizeX * gridSizeY;
   const int bbindex = y_id * gridSizeX + x_id;
@@ -25,7 +23,8 @@ __global__ void gpuYoloLayer_nc(
   const float objectness
       = input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)];
 
-  if (objectness < scoreThreshold) return;
+  if (objectness < scoreThreshold)
+    return;
 
   int count = (int)atomicAdd(&countData[0], 1);
 
diff --git a/nvdsinfer_custom_impl_Yolo/yoloForward_r.cu b/nvdsinfer_custom_impl_Yolo/yoloForward_r.cu
index bba42bc..46bda23 100644
--- a/nvdsinfer_custom_impl_Yolo/yoloForward_r.cu
+++ b/nvdsinfer_custom_impl_Yolo/yoloForward_r.cu
@@ -16,10 +16,8 @@ __global__ void gpuYoloLayer_r(
   uint y_id = blockIdx.y * blockDim.y + threadIdx.y;
   uint z_id = blockIdx.z * blockDim.z + threadIdx.z;
 
-  if ((x_id >= gridSizeX) || (y_id >= gridSizeY) || (z_id >= numBBoxes))
-  {
+  if (x_id >= gridSizeX || y_id >= gridSizeY || z_id >= numBBoxes)
     return;
-  }
 
   const int numGridCells = gridSizeX * gridSizeY;
   const int bbindex = y_id * gridSizeX + x_id;
@@ -27,7 +25,8 @@ __global__ void gpuYoloLayer_r(
   const float objectness
       = sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)]);
 
-  if (objectness < scoreThreshold) return;
+  if (objectness < scoreThreshold)
+    return;
 
   int count = (int)atomicAdd(&countData[0], 1);
 
diff --git a/nvdsinfer_custom_impl_Yolo/yoloForward_v2.cu b/nvdsinfer_custom_impl_Yolo/yoloForward_v2.cu
index cd21797..a1e0cbc 100644
--- a/nvdsinfer_custom_impl_Yolo/yoloForward_v2.cu
+++ b/nvdsinfer_custom_impl_Yolo/yoloForward_v2.cu
@@ -37,10 +37,8 @@ __global__ void gpuRegionLayer(
   uint y_id = blockIdx.y * blockDim.y + threadIdx.y;
   uint z_id = blockIdx.z * blockDim.z + threadIdx.z;
 
-  if ((x_id >= gridSizeX) || (y_id >= gridSizeY) || (z_id >= numBBoxes))
-  {
+  if (x_id >= gridSizeX || y_id >= gridSizeY || z_id >= numBBoxes)
     return;
-  }
 
   const int numGridCells = gridSizeX * gridSizeY;
   const int bbindex = y_id * gridSizeX + x_id;
@@ -48,7 +46,8 @@ __global__ void gpuRegionLayer(
   const float objectness
       = sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)]);
 
-  if (objectness < scoreThreshold) return;
+  if (objectness < scoreThreshold)
+    return;
 
   int count = (int)atomicAdd(&countData[0], 1);