GPU Batched NMS

This commit is contained in:
Marcos Luciano
2022-06-19 12:12:04 -03:00
parent f621c0f429
commit f80aa10cf2
6 changed files with 47 additions and 56 deletions

View File

@@ -32,14 +32,11 @@
#include "yoloPlugins.h" #include "yoloPlugins.h"
extern "C" bool NvDsInferParseYolo( extern "C" bool NvDsInferParseYolo(
std::vector<NvDsInferLayerInfo> const& outputLayersInfo, std::vector<NvDsInferLayerInfo> const& outputLayersInfo, NvDsInferNetworkInfo const& networkInfo,
NvDsInferNetworkInfo const& networkInfo, NvDsInferParseDetectionParams const& detectionParams, std::vector<NvDsInferParseObjectInfo>& objectList);
NvDsInferParseDetectionParams const& detectionParams,
std::vector<NvDsInferParseObjectInfo>& objectList);
static NvDsInferParseObjectInfo convertBBox( static NvDsInferParseObjectInfo convertBBox(
const float& bx1, const float& by1, const float& bx2, const float& bx1, const float& by1, const float& bx2, const float& by2, const uint& netW, const uint& netH)
const float& by2, const uint& netW, const uint& netH)
{ {
NvDsInferParseObjectInfo b; NvDsInferParseObjectInfo b;
@@ -62,9 +59,8 @@ static NvDsInferParseObjectInfo convertBBox(
} }
static void addBBoxProposal( static void addBBoxProposal(
const float bx1, const float by1, const float bx2, const float by2, const float bx1, const float by1, const float bx2, const float by2, const uint& netW, const uint& netH,
const uint& netW, const uint& netH, const int maxIndex, const int maxIndex, const float maxProb, std::vector<NvDsInferParseObjectInfo>& binfo)
const float maxProb, std::vector<NvDsInferParseObjectInfo>& binfo)
{ {
NvDsInferParseObjectInfo bbi = convertBBox(bx1, by1, bx2, by2, netW, netH); NvDsInferParseObjectInfo bbi = convertBBox(bx1, by1, bx2, by2, netW, netH);
if (bbi.width < 1 || bbi.height < 1) return; if (bbi.width < 1 || bbi.height < 1) return;
@@ -75,14 +71,11 @@ static void addBBoxProposal(
} }
static std::vector<NvDsInferParseObjectInfo> decodeYoloTensor( static std::vector<NvDsInferParseObjectInfo> decodeYoloTensor(
const int* counts, const float* boxes, const int* counts, const float* boxes, const float* scores, const float* classes, const uint& netW, const uint& netH)
const float* scores, const float* classes,
const uint& netW, const uint& netH)
{ {
std::vector<NvDsInferParseObjectInfo> binfo; std::vector<NvDsInferParseObjectInfo> binfo;
uint numBoxes = counts[0]; uint numBoxes = counts[0];
for (uint b = 0; b < numBoxes; ++b) for (uint b = 0; b < numBoxes; ++b)
{ {
float bx1 = boxes[b * 4 + 0]; float bx1 = boxes[b * 4 + 0];
@@ -99,10 +92,8 @@ static std::vector<NvDsInferParseObjectInfo> decodeYoloTensor(
} }
static bool NvDsInferParseCustomYolo( static bool NvDsInferParseCustomYolo(
std::vector<NvDsInferLayerInfo> const& outputLayersInfo, std::vector<NvDsInferLayerInfo> const& outputLayersInfo, NvDsInferNetworkInfo const& networkInfo,
NvDsInferNetworkInfo const& networkInfo, NvDsInferParseDetectionParams const& detectionParams, std::vector<NvDsInferParseObjectInfo>& objectList,
NvDsInferParseDetectionParams const& detectionParams,
std::vector<NvDsInferParseObjectInfo>& objectList,
const uint &numClasses) const uint &numClasses)
{ {
if (outputLayersInfo.empty()) if (outputLayersInfo.empty())
@@ -113,28 +104,23 @@ static bool NvDsInferParseCustomYolo(
if (numClasses != detectionParams.numClassesConfigured) if (numClasses != detectionParams.numClassesConfigured)
{ {
std::cerr << "WARNING: Num classes mismatch. Configured: " std::cerr << "WARNING: Num classes mismatch. Configured: " << detectionParams.numClassesConfigured
<< detectionParams.numClassesConfigured
<< ", detected by network: " << numClasses << std::endl; << ", detected by network: " << numClasses << std::endl;
} }
std::vector<NvDsInferParseObjectInfo> objects; std::vector<NvDsInferParseObjectInfo> objects;
for (uint idx = 0; idx < outputLayersInfo.size() / 4; ++idx) const NvDsInferLayerInfo &counts = outputLayersInfo[0];
{ const NvDsInferLayerInfo &boxes = outputLayersInfo[1];
const NvDsInferLayerInfo &counts = outputLayersInfo[idx * 4 + 0]; const NvDsInferLayerInfo &scores = outputLayersInfo[2];
const NvDsInferLayerInfo &boxes = outputLayersInfo[idx * 4 + 1]; const NvDsInferLayerInfo &classes = outputLayersInfo[3];
const NvDsInferLayerInfo &scores = outputLayersInfo[idx * 4 + 2];
const NvDsInferLayerInfo &classes = outputLayersInfo[idx * 4 + 3];
std::vector<NvDsInferParseObjectInfo> outObjs = std::vector<NvDsInferParseObjectInfo> outObjs =
decodeYoloTensor( decodeYoloTensor(
(const int*)(counts.buffer), (const float*)(boxes.buffer), (const int*)(counts.buffer), (const float*)(boxes.buffer), (const float*)(scores.buffer),
(const float*)(scores.buffer), (const float*)(classes.buffer), (const float*)(classes.buffer), networkInfo.width, networkInfo.height);
networkInfo.width, networkInfo.height);
objects.insert(objects.end(), outObjs.begin(), outObjs.end()); objects.insert(objects.end(), outObjs.begin(), outObjs.end());
}
objectList = objects; objectList = objects;
@@ -142,10 +128,8 @@ static bool NvDsInferParseCustomYolo(
} }
extern "C" bool NvDsInferParseYolo( extern "C" bool NvDsInferParseYolo(
std::vector<NvDsInferLayerInfo> const& outputLayersInfo, std::vector<NvDsInferLayerInfo> const& outputLayersInfo, NvDsInferNetworkInfo const& networkInfo,
NvDsInferNetworkInfo const& networkInfo, NvDsInferParseDetectionParams const& detectionParams, std::vector<NvDsInferParseObjectInfo>& objectList)
NvDsInferParseDetectionParams const& detectionParams,
std::vector<NvDsInferParseObjectInfo>& objectList)
{ {
int num_classes = kNUM_CLASSES; int num_classes = kNUM_CLASSES;

View File

@@ -7,10 +7,13 @@
__global__ void sortOutput( __global__ void sortOutput(
int* d_indexes, float* d_scores, float* d_boxes, int* d_classes, float* bboxData, float* scoreData, int* d_indexes, float* d_scores, float* d_boxes, int* d_classes, float* bboxData, float* scoreData,
const uint numOutputClasses) const uint numOutputClasses, const int topk)
{ {
uint x_id = blockIdx.x * blockDim.x + threadIdx.x; uint x_id = blockIdx.x * blockDim.x + threadIdx.x;
if (x_id >= topk)
return;
int index = d_indexes[x_id]; int index = d_indexes[x_id];
int maxIndex = d_classes[index]; int maxIndex = d_classes[index];
bboxData[x_id * 4 + 0] = d_boxes[index * 4 + 0]; bboxData[x_id * 4 + 0] = d_boxes[index * 4 + 0];
@@ -67,12 +70,20 @@ cudaError_t sortDetections(
cudaMemcpy(_d_scores, d_keys_out, count * sizeof(float), cudaMemcpyDeviceToDevice); cudaMemcpy(_d_scores, d_keys_out, count * sizeof(float), cudaMemcpyDeviceToDevice);
cudaMemcpy(_d_indexes, d_values_out, count * sizeof(int), cudaMemcpyDeviceToDevice); cudaMemcpy(_d_indexes, d_values_out, count * sizeof(int), cudaMemcpyDeviceToDevice);
int threads_per_block = count < topK ? count : topK; int _topK = count < topK ? count : topK;
sortOutput<<<1, threads_per_block, 0, stream>>>( int threads_per_block = 0;
int number_of_blocks = 4;
if (_topK % 2 == 0)
threads_per_block = _topK / number_of_blocks;
else
threads_per_block = (_topK / number_of_blocks) + 1;
sortOutput<<<number_of_blocks, threads_per_block, 0, stream>>>(
_d_indexes, _d_scores, reinterpret_cast<float*>(d_boxes) + (batch * 4 * outputSize), _d_indexes, _d_scores, reinterpret_cast<float*>(d_boxes) + (batch * 4 * outputSize),
reinterpret_cast<int*>(d_classes) + (batch * outputSize), reinterpret_cast<float*>(bboxData) + (batch * topK), reinterpret_cast<int*>(d_classes) + (batch * outputSize), reinterpret_cast<float*>(bboxData) + (batch * topK),
reinterpret_cast<float*>(scoreData) + (batch * topK), numOutputClasses); reinterpret_cast<float*>(scoreData) + (batch * topK), numOutputClasses, _topK);
cudaFree(d_keys_out); cudaFree(d_keys_out);
cudaFree(d_values_out); cudaFree(d_values_out);

View File

@@ -16,10 +16,8 @@ __global__ void gpuYoloLayer(
uint y_id = blockIdx.y * blockDim.y + threadIdx.y; uint y_id = blockIdx.y * blockDim.y + threadIdx.y;
uint z_id = blockIdx.z * blockDim.z + threadIdx.z; uint z_id = blockIdx.z * blockDim.z + threadIdx.z;
if ((x_id >= gridSizeX) || (y_id >= gridSizeY) || (z_id >= numBBoxes)) if (x_id >= gridSizeX || y_id >= gridSizeY || z_id >= numBBoxes)
{
return; return;
}
const int numGridCells = gridSizeX * gridSizeY; const int numGridCells = gridSizeX * gridSizeY;
const int bbindex = y_id * gridSizeX + x_id; const int bbindex = y_id * gridSizeX + x_id;
@@ -27,7 +25,8 @@ __global__ void gpuYoloLayer(
const float objectness const float objectness
= sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)]); = sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)]);
if (objectness < scoreThreshold) return; if (objectness < scoreThreshold)
return;
int count = (int)atomicAdd(&countData[0], 1); int count = (int)atomicAdd(&countData[0], 1);

View File

@@ -14,10 +14,8 @@ __global__ void gpuYoloLayer_nc(
uint y_id = blockIdx.y * blockDim.y + threadIdx.y; uint y_id = blockIdx.y * blockDim.y + threadIdx.y;
uint z_id = blockIdx.z * blockDim.z + threadIdx.z; uint z_id = blockIdx.z * blockDim.z + threadIdx.z;
if ((x_id >= gridSizeX) || (y_id >= gridSizeY) || (z_id >= numBBoxes)) if (x_id >= gridSizeX || y_id >= gridSizeY || z_id >= numBBoxes)
{
return; return;
}
const int numGridCells = gridSizeX * gridSizeY; const int numGridCells = gridSizeX * gridSizeY;
const int bbindex = y_id * gridSizeX + x_id; const int bbindex = y_id * gridSizeX + x_id;
@@ -25,7 +23,8 @@ __global__ void gpuYoloLayer_nc(
const float objectness const float objectness
= input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)]; = input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)];
if (objectness < scoreThreshold) return; if (objectness < scoreThreshold)
return;
int count = (int)atomicAdd(&countData[0], 1); int count = (int)atomicAdd(&countData[0], 1);

View File

@@ -16,10 +16,8 @@ __global__ void gpuYoloLayer_r(
uint y_id = blockIdx.y * blockDim.y + threadIdx.y; uint y_id = blockIdx.y * blockDim.y + threadIdx.y;
uint z_id = blockIdx.z * blockDim.z + threadIdx.z; uint z_id = blockIdx.z * blockDim.z + threadIdx.z;
if ((x_id >= gridSizeX) || (y_id >= gridSizeY) || (z_id >= numBBoxes)) if (x_id >= gridSizeX || y_id >= gridSizeY || z_id >= numBBoxes)
{
return; return;
}
const int numGridCells = gridSizeX * gridSizeY; const int numGridCells = gridSizeX * gridSizeY;
const int bbindex = y_id * gridSizeX + x_id; const int bbindex = y_id * gridSizeX + x_id;
@@ -27,7 +25,8 @@ __global__ void gpuYoloLayer_r(
const float objectness const float objectness
= sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)]); = sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)]);
if (objectness < scoreThreshold) return; if (objectness < scoreThreshold)
return;
int count = (int)atomicAdd(&countData[0], 1); int count = (int)atomicAdd(&countData[0], 1);

View File

@@ -37,10 +37,8 @@ __global__ void gpuRegionLayer(
uint y_id = blockIdx.y * blockDim.y + threadIdx.y; uint y_id = blockIdx.y * blockDim.y + threadIdx.y;
uint z_id = blockIdx.z * blockDim.z + threadIdx.z; uint z_id = blockIdx.z * blockDim.z + threadIdx.z;
if ((x_id >= gridSizeX) || (y_id >= gridSizeY) || (z_id >= numBBoxes)) if (x_id >= gridSizeX || y_id >= gridSizeY || z_id >= numBBoxes)
{
return; return;
}
const int numGridCells = gridSizeX * gridSizeY; const int numGridCells = gridSizeX * gridSizeY;
const int bbindex = y_id * gridSizeX + x_id; const int bbindex = y_id * gridSizeX + x_id;
@@ -48,7 +46,8 @@ __global__ void gpuRegionLayer(
const float objectness const float objectness
= sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)]); = sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)]);
if (objectness < scoreThreshold) return; if (objectness < scoreThreshold)
return;
int count = (int)atomicAdd(&countData[0], 1); int count = (int)atomicAdd(&countData[0], 1);