GPU Batched NMS

This commit is contained in:
Marcos Luciano
2022-06-19 03:25:50 -03:00
parent 2300e3b44b
commit f621c0f429
24 changed files with 835 additions and 654 deletions

View File

@@ -14,12 +14,11 @@ interval=0
gie-unique-id=1 gie-unique-id=1
process-mode=1 process-mode=1
network-type=0 network-type=0
cluster-mode=2 cluster-mode=4
maintain-aspect-ratio=0 maintain-aspect-ratio=0
parse-bbox-func-name=NvDsInferParseYolo parse-bbox-func-name=NvDsInferParseYolo
custom-lib-path=nvdsinfer_custom_impl_Yolo/libnvdsinfer_custom_impl_Yolo.so custom-lib-path=nvdsinfer_custom_impl_Yolo/libnvdsinfer_custom_impl_Yolo.so
engine-create-func-name=NvDsInferYoloCudaEngineGet engine-create-func-name=NvDsInferYoloCudaEngineGet
[class-attrs-all] [class-attrs-all]
nms-iou-threshold=0.45 pre-cluster-threshold=0
pre-cluster-threshold=0.25

View File

@@ -14,12 +14,11 @@ interval=0
gie-unique-id=1 gie-unique-id=1
process-mode=1 process-mode=1
network-type=0 network-type=0
cluster-mode=2 cluster-mode=4
maintain-aspect-ratio=0 maintain-aspect-ratio=0
parse-bbox-func-name=NvDsInferParseYolo parse-bbox-func-name=NvDsInferParseYolo
custom-lib-path=nvdsinfer_custom_impl_Yolo/libnvdsinfer_custom_impl_Yolo.so custom-lib-path=nvdsinfer_custom_impl_Yolo/libnvdsinfer_custom_impl_Yolo.so
engine-create-func-name=NvDsInferYoloCudaEngineGet engine-create-func-name=NvDsInferYoloCudaEngineGet
[class-attrs-all] [class-attrs-all]
nms-iou-threshold=0.45 pre-cluster-threshold=0
pre-cluster-threshold=0.25

View File

@@ -14,12 +14,11 @@ interval=0
gie-unique-id=1 gie-unique-id=1
process-mode=1 process-mode=1
network-type=0 network-type=0
cluster-mode=2 cluster-mode=4
maintain-aspect-ratio=1 maintain-aspect-ratio=1
parse-bbox-func-name=NvDsInferParseYolo parse-bbox-func-name=NvDsInferParseYolo
custom-lib-path=nvdsinfer_custom_impl_Yolo/libnvdsinfer_custom_impl_Yolo.so custom-lib-path=nvdsinfer_custom_impl_Yolo/libnvdsinfer_custom_impl_Yolo.so
engine-create-func-name=NvDsInferYoloCudaEngineGet engine-create-func-name=NvDsInferYoloCudaEngineGet
[class-attrs-all] [class-attrs-all]
nms-iou-threshold=0.45 pre-cluster-threshold=0
pre-cluster-threshold=0.25

View File

@@ -14,12 +14,11 @@ interval=0
gie-unique-id=1 gie-unique-id=1
process-mode=1 process-mode=1
network-type=0 network-type=0
cluster-mode=2 cluster-mode=4
maintain-aspect-ratio=1 maintain-aspect-ratio=1
parse-bbox-func-name=NvDsInferParseYolo parse-bbox-func-name=NvDsInferParseYolo
custom-lib-path=nvdsinfer_custom_impl_Yolo/libnvdsinfer_custom_impl_Yolo.so custom-lib-path=nvdsinfer_custom_impl_Yolo/libnvdsinfer_custom_impl_Yolo.so
engine-create-func-name=NvDsInferYoloCudaEngineGet engine-create-func-name=NvDsInferYoloCudaEngineGet
[class-attrs-all] [class-attrs-all]
nms-iou-threshold=0.5 pre-cluster-threshold=0
pre-cluster-threshold=0.25

4
config_nms.txt Normal file
View File

@@ -0,0 +1,4 @@
[property]
iou-threshold=0.45
score-threshold=0.25
topk=300

View File

@@ -272,24 +272,6 @@ interval=0
## ##
#### nms-iou-threshold
```
# IOU threshold
nms-iou-threshold=0.6
```
##
#### pre-cluster-threshold
```
# Score threshold
pre-cluster-threshold=0.25
```
##
### Testing model ### Testing model
``` ```

View File

@@ -55,7 +55,6 @@ SRCFILES:= nvdsinfer_yolo_engine.cpp \
layers/convolutional_layer.cpp \ layers/convolutional_layer.cpp \
layers/implicit_layer.cpp \ layers/implicit_layer.cpp \
layers/channels_layer.cpp \ layers/channels_layer.cpp \
layers/dropout_layer.cpp \
layers/shortcut_layer.cpp \ layers/shortcut_layer.cpp \
layers/route_layer.cpp \ layers/route_layer.cpp \
layers/upsample_layer.cpp \ layers/upsample_layer.cpp \
@@ -67,7 +66,8 @@ SRCFILES:= nvdsinfer_yolo_engine.cpp \
yoloForward.cu \ yoloForward.cu \
yoloForward_v2.cu \ yoloForward_v2.cu \
yoloForward_nc.cu \ yoloForward_nc.cu \
yoloForward_r.cu yoloForward_r.cu \
sortDetections.cu
ifeq ($(OPENCV), 1) ifeq ($(OPENCV), 1)
SRCFILES+= calibrator.cpp SRCFILES+= calibrator.cpp

View File

@@ -1,15 +0,0 @@
/*
* Created by Marcos Luciano
* https://www.github.com/marcoslucianops
*/
#include "dropout_layer.h"
/**
 * Builds the TensorRT stand-in for a dropout block.
 *
 * Dropout only perturbs activations while training; at inference time it is a
 * pass-through, so the layer is realised as an identity layer on `input`.
 *
 * @param probability Drop probability of the block; it has no effect at
 *                    inference time and is therefore ignored.
 * @param input       Tensor feeding the layer.
 * @param network     Network definition the layer is added to.
 * @return The layer that was added, or nullptr when `input` or `network` is
 *         null (or when TensorRT itself fails to add the layer).
 */
nvinfer1::ILayer* dropoutLayer(
    float probability,
    nvinfer1::ITensor* input,
    nvinfer1::INetworkDefinition* network)
{
    (void)probability;

    // Previously an uninitialized pointer was returned here, which is
    // undefined behaviour for any caller that touches the result.
    if (input == nullptr || network == nullptr)
        return nullptr;

    return network->addIdentity(*input);
}

View File

@@ -1,16 +0,0 @@
/*
* Created by Marcos Luciano
* https://www.github.com/marcoslucianops
*/
#ifndef __DROPOUT_LAYER_H__
#define __DROPOUT_LAYER_H__
#include "NvInfer.h"
// Declaration of dropoutLayer.
//   probability - floating-point probability value of the dropout block
//   input       - tensor passed to the layer
//   network     - network definition passed to the layer builder
// Returns a pointer to an nvinfer1::ILayer.
nvinfer1::ILayer* dropoutLayer(
float probability,
nvinfer1::ITensor* input,
nvinfer1::INetworkDefinition* network);
#endif

View File

@@ -63,15 +63,13 @@ static bool getYoloNetworkInfo (NetworkInfo &networkInfo, const NvDsInferContext
if (networkInfo.configFilePath.empty() || if (networkInfo.configFilePath.empty() ||
networkInfo.wtsFilePath.empty()) { networkInfo.wtsFilePath.empty()) {
std::cerr << "YOLO config file or weights file is not specified" std::cerr << "YOLO config file or weights file is not specified\n" << std::endl;
<< std::endl;
return false; return false;
} }
if (!fileExists(networkInfo.configFilePath) || if (!fileExists(networkInfo.configFilePath) ||
!fileExists(networkInfo.wtsFilePath)) { !fileExists(networkInfo.wtsFilePath)) {
std::cerr << "YOLO config file or weights file is not exist" std::cerr << "YOLO config file or weights file is not exist\n" << std::endl;
<< std::endl;
return false; return false;
} }

View File

@@ -38,15 +38,15 @@ extern "C" bool NvDsInferParseYolo(
std::vector<NvDsInferParseObjectInfo>& objectList); std::vector<NvDsInferParseObjectInfo>& objectList);
static NvDsInferParseObjectInfo convertBBox( static NvDsInferParseObjectInfo convertBBox(
const float& bx, const float& by, const float& bw, const float& bx1, const float& by1, const float& bx2,
const float& bh, const uint& netW, const uint& netH) const float& by2, const uint& netW, const uint& netH)
{ {
NvDsInferParseObjectInfo b; NvDsInferParseObjectInfo b;
float x1 = bx - bw / 2; float x1 = bx1;
float y1 = by - bh / 2; float y1 = by1;
float x2 = x1 + bw; float x2 = bx2;
float y2 = y1 + bh; float y2 = by2;
x1 = clamp(x1, 0, netW); x1 = clamp(x1, 0, netW);
y1 = clamp(y1, 0, netH); y1 = clamp(y1, 0, netH);
@@ -62,11 +62,11 @@ static NvDsInferParseObjectInfo convertBBox(
} }
static void addBBoxProposal( static void addBBoxProposal(
const float bx, const float by, const float bw, const float bh, const float bx1, const float by1, const float bx2, const float by2,
const uint& netW, const uint& netH, const int maxIndex, const uint& netW, const uint& netH, const int maxIndex,
const float maxProb, std::vector<NvDsInferParseObjectInfo>& binfo) const float maxProb, std::vector<NvDsInferParseObjectInfo>& binfo)
{ {
NvDsInferParseObjectInfo bbi = convertBBox(bx, by, bw, bh, netW, netH); NvDsInferParseObjectInfo bbi = convertBBox(bx1, by1, bx2, by2, netW, netH);
if (bbi.width < 1 || bbi.height < 1) return; if (bbi.width < 1 || bbi.height < 1) return;
bbi.detectionConfidence = maxProb; bbi.detectionConfidence = maxProb;
@@ -75,34 +75,25 @@ static void addBBoxProposal(
} }
static std::vector<NvDsInferParseObjectInfo> decodeYoloTensor( static std::vector<NvDsInferParseObjectInfo> decodeYoloTensor(
const float* detections, const int* counts, const float* boxes,
const uint gridSizeW, const uint gridSizeH, const uint numBBoxes, const float* scores, const float* classes,
const uint numOutputClasses, const uint& netW, const uint& netH) const uint& netW, const uint& netH)
{ {
std::vector<NvDsInferParseObjectInfo> binfo; std::vector<NvDsInferParseObjectInfo> binfo;
for (uint y = 0; y < gridSizeH; ++y) {
for (uint x = 0; x < gridSizeW; ++x) { uint numBoxes = counts[0];
for (uint b = 0; b < numBBoxes; ++b)
for (uint b = 0; b < numBoxes; ++b)
{ {
const int numGridCells = gridSizeH * gridSizeW; float bx1 = boxes[b * 4 + 0];
const int bbindex = y * gridSizeW + x; float by1 = boxes[b * 4 + 1];
float bx2 = boxes[b * 4 + 2];
float by2 = boxes[b * 4 + 3];
const float bx float maxProb = scores[b];
= detections[bbindex + numGridCells * (b * (5 + numOutputClasses) + 0)]; int maxIndex = classes[b];
const float by
= detections[bbindex + numGridCells * (b * (5 + numOutputClasses) + 1)];
const float bw
= detections[bbindex + numGridCells * (b * (5 + numOutputClasses) + 2)];
const float bh
= detections[bbindex + numGridCells * (b * (5 + numOutputClasses) + 3)];
const float maxProb
= detections[bbindex + numGridCells * (b * (5 + numOutputClasses) + 4)];
const int maxIndex
= detections[bbindex + numGridCells * (b * (5 + numOutputClasses) + 5)];
addBBoxProposal(bx, by, bw, bh, netW, netH, maxIndex, maxProb, binfo); addBBoxProposal(bx1, by1, bx2, by2, netW, netH, maxIndex, maxProb, binfo);
}
}
} }
return binfo; return binfo;
} }
@@ -112,7 +103,6 @@ static bool NvDsInferParseCustomYolo(
NvDsInferNetworkInfo const& networkInfo, NvDsInferNetworkInfo const& networkInfo,
NvDsInferParseDetectionParams const& detectionParams, NvDsInferParseDetectionParams const& detectionParams,
std::vector<NvDsInferParseObjectInfo>& objectList, std::vector<NvDsInferParseObjectInfo>& objectList,
const uint &numBBoxes,
const uint &numClasses) const uint &numClasses)
{ {
if (outputLayersInfo.empty()) if (outputLayersInfo.empty())
@@ -130,18 +120,17 @@ static bool NvDsInferParseCustomYolo(
std::vector<NvDsInferParseObjectInfo> objects; std::vector<NvDsInferParseObjectInfo> objects;
for (uint idx = 0; idx < outputLayersInfo.size(); ++idx) for (uint idx = 0; idx < outputLayersInfo.size() / 4; ++idx)
{ {
const NvDsInferLayerInfo &layer = outputLayersInfo[idx]; const NvDsInferLayerInfo &counts = outputLayersInfo[idx * 4 + 0];
const NvDsInferLayerInfo &boxes = outputLayersInfo[idx * 4 + 1];
assert(layer.inferDims.numDims == 3); const NvDsInferLayerInfo &scores = outputLayersInfo[idx * 4 + 2];
const uint gridSizeH = layer.inferDims.d[1]; const NvDsInferLayerInfo &classes = outputLayersInfo[idx * 4 + 3];
const uint gridSizeW = layer.inferDims.d[2];
std::vector<NvDsInferParseObjectInfo> outObjs = std::vector<NvDsInferParseObjectInfo> outObjs =
decodeYoloTensor( decodeYoloTensor(
(const float*)(layer.buffer), (const int*)(counts.buffer), (const float*)(boxes.buffer),
gridSizeW, gridSizeH, numBBoxes, numClasses, (const float*)(scores.buffer), (const float*)(classes.buffer),
networkInfo.width, networkInfo.height); networkInfo.width, networkInfo.height);
objects.insert(objects.end(), outObjs.begin(), outObjs.end()); objects.insert(objects.end(), outObjs.begin(), outObjs.end());
@@ -158,11 +147,10 @@ extern "C" bool NvDsInferParseYolo(
NvDsInferParseDetectionParams const& detectionParams, NvDsInferParseDetectionParams const& detectionParams,
std::vector<NvDsInferParseObjectInfo>& objectList) std::vector<NvDsInferParseObjectInfo>& objectList)
{ {
uint numBBoxes = kNUM_BBOXES; int num_classes = kNUM_CLASSES;
uint numClasses = kNUM_CLASSES;
return NvDsInferParseCustomYolo ( return NvDsInferParseCustomYolo (
outputLayersInfo, networkInfo, detectionParams, objectList, numBBoxes, numClasses); outputLayersInfo, networkInfo, detectionParams, objectList, num_classes);
} }
CHECK_CUSTOM_PARSE_FUNC_PROTOTYPE(NvDsInferParseYolo); CHECK_CUSTOM_PARSE_FUNC_PROTOTYPE(NvDsInferParseYolo);

View File

@@ -0,0 +1,84 @@
/*
* Created by Marcos Luciano
* https://www.github.com/marcoslucianops
*/
#include <cub/cub.cuh>
/*
 * Gathers detections into the output buffers, one thread per output slot.
 *
 * Thread `slot` looks up the source detection through d_indexes[slot], copies
 * its four box values into bboxData and writes the score into the scoreData
 * row of that slot, in the column given by the detection's class id.
 *
 * There is no bounds check: the launch must provide exactly as many threads
 * as there are valid entries in d_indexes / d_scores.
 *
 * NOTE(review): the score is stored as d_scores[slot] - 1.f; presumably this
 * undoes an offset applied where the scores are produced - confirm upstream.
 */
__global__ void sortOutput(
    int* d_indexes, float* d_scores, float* d_boxes, int* d_classes, float* bboxData, float* scoreData,
    const uint numOutputClasses)
{
    const uint slot = blockIdx.x * blockDim.x + threadIdx.x;

    const int source = d_indexes[slot];
    const int classId = d_classes[source];

    // Copy the four values that make up one box.
    const float* srcBox = d_boxes + source * 4;
    float* dstBox = bboxData + slot * 4;
    for (int c = 0; c < 4; ++c)
        dstBox[c] = srcBox[c];

    float* scoreRow = scoreData + slot * numOutputClasses;
    scoreRow[classId] = d_scores[slot] - 1.f;
}
cudaError_t sortDetections(
    void* d_indexes, void* d_scores, void* d_boxes, void* d_classes, void* bboxData, void* scoreData, void* countData,
    const uint& batchSize, uint64_t& outputSize, uint& topK, const uint& numOutputClasses, cudaStream_t stream);

/*
 * Sorts the detections of every batch entry by descending score and gathers
 * the best min(count, topK) of them into bboxData / scoreData.
 *
 * d_indexes, d_scores, d_classes : device arrays, outputSize entries per batch entry.
 * d_boxes                        : device array, 4 * outputSize floats per batch entry.
 * bboxData                       : device output, 4 floats per slot, topK slots per batch entry.
 * scoreData                      : device output, numOutputClasses floats per slot,
 *                                  topK slots per batch entry.
 * countData                      : device array, one int per batch entry holding the
 *                                  number of valid detections.
 * stream                         : stream on which all device work is issued.
 *
 * Returns the first CUDA error encountered, or the result of
 * cudaGetLastError() when every step succeeded.
 *
 * d_scores and d_indexes are overwritten with their sorted order.
 * NOTE(review): only the winning class column of each scoreData row is
 * written here; the remaining columns are assumed to be initialised by the
 * caller - confirm.
 */
cudaError_t sortDetections(
    void* d_indexes, void* d_scores, void* d_boxes, void* d_classes, void* bboxData, void* scoreData, void* countData,
    const uint& batchSize, uint64_t& outputSize, uint& topK, const uint& numOutputClasses, cudaStream_t stream)
{
    // Upper bound on threads per block for a kernel launch; larger selections
    // are gathered with several launches instead of one invalid one.
    const uint kMaxThreadsPerBlock = 1024;

    // Sort on every bit of the float key.
    const int beginBit = 0;
    const int endBit = static_cast<int>(sizeof(float) * 8);

    for (unsigned int batch = 0; batch < batchSize; ++batch)
    {
        int* batchIndexes = reinterpret_cast<int*>(d_indexes) + (batch * outputSize);
        float* batchScores = reinterpret_cast<float*>(d_scores) + (batch * outputSize);
        float* batchBoxes = reinterpret_cast<float*>(d_boxes) + (batch * 4 * outputSize);
        int* batchClasses = reinterpret_cast<int*>(d_classes) + (batch * outputSize);
        const int* batchCount = reinterpret_cast<int*>(countData) + batch;

        // The kernel writes 4 floats per box and numOutputClasses floats per
        // score row, so each batch entry owns topK * 4 and
        // topK * numOutputClasses floats respectively. Offsetting by
        // batch * topK alone made consecutive batch entries overlap.
        float* batchBoxOut = reinterpret_cast<float*>(bboxData) + (static_cast<uint64_t>(batch) * topK * 4);
        float* batchScoreOut =
            reinterpret_cast<float*>(scoreData) + (static_cast<uint64_t>(batch) * topK * numOutputClasses);

        // Read the detection count on the caller's stream so the copy is
        // ordered after whatever produced it.
        int count = 0;
        cudaError_t status = cudaMemcpyAsync(&count, batchCount, sizeof(int), cudaMemcpyDeviceToHost, stream);
        if (status == cudaSuccess)
            status = cudaStreamSynchronize(stream);
        if (status != cudaSuccess)
            return status;

        // An empty batch entry must not stop the remaining ones from being processed.
        if (count <= 0)
            continue;

        const size_t numItems = static_cast<size_t>(count);
        float* sortedScores = nullptr;
        int* sortedIndexes = nullptr;
        void* tempStorage = nullptr;
        size_t tempStorageBytes = 0;

        status = cudaMalloc(reinterpret_cast<void**>(&sortedScores), numItems * sizeof(float));
        if (status == cudaSuccess)
            status = cudaMalloc(reinterpret_cast<void**>(&sortedIndexes), numItems * sizeof(int));

        // First call only queries the scratch size, second call performs the sort.
        if (status == cudaSuccess)
            status = cub::DeviceRadixSort::SortPairsDescending(
                tempStorage, tempStorageBytes, batchScores, sortedScores, batchIndexes, sortedIndexes, count,
                beginBit, endBit, stream);
        if (status == cudaSuccess)
            status = cudaMalloc(&tempStorage, tempStorageBytes);
        if (status == cudaSuccess)
            status = cub::DeviceRadixSort::SortPairsDescending(
                tempStorage, tempStorageBytes, batchScores, sortedScores, batchIndexes, sortedIndexes, count,
                beginBit, endBit, stream);

        // Write the sorted order back over the inputs, as the kernel reads from them.
        if (status == cudaSuccess)
            status = cudaMemcpyAsync(
                batchScores, sortedScores, numItems * sizeof(float), cudaMemcpyDeviceToDevice, stream);
        if (status == cudaSuccess)
            status = cudaMemcpyAsync(
                batchIndexes, sortedIndexes, numItems * sizeof(int), cudaMemcpyDeviceToDevice, stream);

        if (status == cudaSuccess)
        {
            const uint valid = static_cast<uint>(count);
            const uint selected = valid < topK ? valid : topK;

            // sortOutput has no bounds check and indexes by thread id, so each
            // launch gets exactly the threads it needs and the per-slot
            // pointers are advanced by the chunk offset.
            for (uint offset = 0; offset < selected; offset += kMaxThreadsPerBlock)
            {
                const uint remaining = selected - offset;
                const uint threads = remaining < kMaxThreadsPerBlock ? remaining : kMaxThreadsPerBlock;
                sortOutput<<<1, threads, 0, stream>>>(
                    batchIndexes + offset, batchScores + offset, batchBoxes, batchClasses,
                    batchBoxOut + static_cast<uint64_t>(offset) * 4,
                    batchScoreOut + static_cast<uint64_t>(offset) * numOutputClasses, numOutputClasses);
            }
            status = cudaGetLastError();
        }

        // Make sure nothing queued on the stream still uses the scratch
        // buffers before they are released.
        const cudaError_t syncStatus = cudaStreamSynchronize(stream);
        if (status == cudaSuccess)
            status = syncStatus;

        cudaFree(sortedScores);
        cudaFree(sortedIndexes);
        cudaFree(tempStorage);

        if (status != cudaSuccess)
            return status;
    }
    return cudaGetLastError();
}

View File

@@ -57,7 +57,7 @@ bool fileExists(const std::string fileName, bool verbose)
{ {
if (!std::experimental::filesystem::exists(std::experimental::filesystem::path(fileName))) if (!std::experimental::filesystem::exists(std::experimental::filesystem::path(fileName)))
{ {
if (verbose) std::cout << "File does not exist: " << fileName << std::endl; if (verbose) std::cout << "\nFile does not exist: " << fileName << std::endl;
return false; return false;
} }
return true; return true;
@@ -101,7 +101,7 @@ std::vector<float> loadWeights(const std::string weightsFilePath, const std::str
assert(file.good()); assert(file.good());
int32_t count; int32_t count;
file >> count; file >> count;
assert(count > 0 && "Invalid .wts file."); assert(count > 0 && "\nInvalid .wts file.");
uint32_t floatWeight; uint32_t floatWeight;
std::string name; std::string name;
@@ -118,7 +118,7 @@ std::vector<float> loadWeights(const std::string weightsFilePath, const std::str
} }
else { else {
std::cerr << "File " << weightsFilePath << " is not supported" << std::endl; std::cerr << "\nFile " << weightsFilePath << " is not supported" << std::endl;
std::abort(); std::abort();
} }
@@ -149,11 +149,19 @@ int getNumChannels(nvinfer1::ITensor* t)
return d.d[0]; return d.d[0];
} }
void printLayerInfo(std::string layerIndex, std::string layerName, std::string layerInput, void printLayerInfo(
std::string layerOutput, std::string weightPtr) std::string layerIndex, std::string layerName, std::string layerInput, std::string layerOutput, std::string weightPtr)
{ {
std::cout << std::setw(6) << std::left << layerIndex << std::setw(24) << std::left << layerName; std::cout << std::setw(6) << std::left << layerIndex << std::setw(24) << std::left << layerName;
std::cout << std::setw(20) << std::left << layerInput << std::setw(20) << std::left std::cout << std::setw(20) << std::left << layerInput << std::setw(20) << std::left
<< layerOutput; << layerOutput;
std::cout << std::setw(7) << std::left << weightPtr << std::endl; std::cout << std::setw(7) << std::left << weightPtr << std::endl;
} }
/**
 * Returns the directory part of `path`, i.e. everything before the last '/'.
 *
 * Despite its name the function does not resolve the path; it only strips the
 * final component.
 *
 * @param path File path, with or without directory components.
 * @return The text before the last '/'. This is an empty string when the only
 *         '/' is the leading one, and "." when `path` contains no '/' at all,
 *         so that appending "/<file>" to the result always names a sibling of
 *         `path`.
 */
std::string getAbsPath(std::string path)
{
    const std::size_t found = path.rfind('/');

    // A bare file name lives in the current directory. Returning the file
    // name itself made callers build paths such as "model.wts/config_nms.txt".
    if (found == std::string::npos)
        return ".";

    path.erase(found);
    return path;
}

View File

@@ -41,8 +41,8 @@ bool fileExists(const std::string fileName, bool verbose = true);
std::vector<float> loadWeights(const std::string weightsFilePath, const std::string& networkType); std::vector<float> loadWeights(const std::string weightsFilePath, const std::string& networkType);
std::string dimsToString(const nvinfer1::Dims d); std::string dimsToString(const nvinfer1::Dims d);
int getNumChannels(nvinfer1::ITensor* t); int getNumChannels(nvinfer1::ITensor* t);
void printLayerInfo(
void printLayerInfo(std::string layerIndex, std::string layerName, std::string layerInput, std::string layerIndex, std::string layerName, std::string layerInput, std::string layerOutput, std::string weightPtr);
std::string layerOutput, std::string weightPtr); std::string getAbsPath(std::string path);
#endif #endif

View File

@@ -46,7 +46,12 @@ Yolo::Yolo(const NetworkInfo& networkInfo)
m_InputC(0), m_InputC(0),
m_InputSize(0), m_InputSize(0),
m_NumClasses(0), m_NumClasses(0),
m_LetterBox(0) m_LetterBox(0),
m_NewCoords(0),
m_YoloCount(0),
m_IouThreshold(0),
m_ScoreThreshold(0),
m_TopK(0)
{} {}
Yolo::~Yolo() Yolo::~Yolo()
@@ -61,59 +66,75 @@ nvinfer1::ICudaEngine *Yolo::createEngine (nvinfer1::IBuilder* builder, nvinfer1
m_ConfigBlocks = parseConfigFile(m_ConfigFilePath); m_ConfigBlocks = parseConfigFile(m_ConfigFilePath);
parseConfigBlocks(); parseConfigBlocks();
std::string configNMS = getAbsPath(m_WtsFilePath) + "/config_nms.txt";
if (!fileExists(configNMS))
{
std::cerr << "YOLO config_nms.txt file is not specified\n" << std::endl;
assert(0);
}
m_ConfigNMSBlocks = parseConfigFile(configNMS);
parseConfigNMSBlocks();
nvinfer1::INetworkDefinition *network = builder->createNetworkV2(0); nvinfer1::INetworkDefinition *network = builder->createNetworkV2(0);
if (parseModel(*network) != NVDSINFER_SUCCESS) { if (parseModel(*network) != NVDSINFER_SUCCESS)
{
delete network; delete network;
return nullptr; return nullptr;
} }
std::cout << "Building the TensorRT Engine" << std::endl; std::cout << "Building the TensorRT Engine\n" << std::endl;
if (m_NumClasses != m_NumDetectedClasses) { if (m_NumClasses != m_NumDetectedClasses)
std::cout << "\nNOTE: Number of classes mismatch, make sure to set num-detected-classes=" << m_NumClasses << " in config_infer file" << std::endl; {
std::cout << "NOTE: Number of classes mismatch, make sure to set num-detected-classes=" << m_NumClasses
<< " in config_infer file\n" << std::endl;
} }
if (m_LetterBox == 1) { if (m_LetterBox == 1)
std::cout << "\nNOTE: letter_box is set in cfg file, make sure to set maintain-aspect-ratio=1 in config_infer file to get better accuracy" << std::endl; {
std::cout << "NOTE: letter_box is set in cfg file, make sure to set maintain-aspect-ratio=1 in config_infer file"
<< " to get better accuracy\n" << std::endl;
} }
if (m_ClusterMode != 2) { if (m_ClusterMode != 4)
std::cout << "\nNOTE: Wrong cluster-mode is set, make sure to set cluster-mode=2 in config_infer file" << std::endl; {
std::cout << "NOTE: Wrong cluster-mode is set, make sure to set cluster-mode=4 in config_infer file\n"
<< std::endl;
} }
std::cout << "" << std::endl;
if (m_NetworkMode == "INT8" && !fileExists(m_Int8CalibPath)) { if (m_NetworkMode == "INT8" && !fileExists(m_Int8CalibPath))
{
assert(builder->platformHasFastInt8()); assert(builder->platformHasFastInt8());
#ifdef OPENCV #ifdef OPENCV
std::string calib_image_list; std::string calib_image_list;
int calib_batch_size; int calib_batch_size;
if (getenv("INT8_CALIB_IMG_PATH")) { if (getenv("INT8_CALIB_IMG_PATH"))
calib_image_list = getenv("INT8_CALIB_IMG_PATH"); calib_image_list = getenv("INT8_CALIB_IMG_PATH");
} else
else { {
std::cerr << "INT8_CALIB_IMG_PATH not set" << std::endl; std::cerr << "INT8_CALIB_IMG_PATH not set" << std::endl;
std::abort(); std::abort();
} }
if (getenv("INT8_CALIB_BATCH_SIZE")) { if (getenv("INT8_CALIB_BATCH_SIZE"))
calib_batch_size = std::stoi(getenv("INT8_CALIB_BATCH_SIZE")); calib_batch_size = std::stoi(getenv("INT8_CALIB_BATCH_SIZE"));
} else
else { {
std::cerr << "INT8_CALIB_BATCH_SIZE not set" << std::endl; std::cerr << "INT8_CALIB_BATCH_SIZE not set" << std::endl;
std::abort(); std::abort();
} }
nvinfer1::Int8EntropyCalibrator2 *calibrator = new nvinfer1::Int8EntropyCalibrator2(calib_batch_size, m_InputC, m_InputH, m_InputW, m_LetterBox, calib_image_list, m_Int8CalibPath); nvinfer1::Int8EntropyCalibrator2 *calibrator = new nvinfer1::Int8EntropyCalibrator2(
calib_batch_size, m_InputC, m_InputH, m_InputW, m_LetterBox, calib_image_list, m_Int8CalibPath);
config->setFlag(nvinfer1::BuilderFlag::kINT8); config->setFlag(nvinfer1::BuilderFlag::kINT8);
config->setInt8Calibrator(calibrator); config->setInt8Calibrator(calibrator);
#else #else
std::cerr << "OpenCV is required to run INT8 calibrator" << std::endl; std::cerr << "OpenCV is required to run INT8 calibrator\n" << std::endl;
std::abort(); assert(0);
#endif #endif
} }
nvinfer1::ICudaEngine *engine = builder->buildEngineWithConfig(*network, *config); nvinfer1::ICudaEngine *engine = builder->buildEngineWithConfig(*network, *config);
if (engine) { if (engine)
std::cout << "Building complete\n" << std::endl; std::cout << "Building complete\n" << std::endl;
} else { else
std::cerr << "Building engine failed\n" << std::endl; std::cerr << "Building engine failed\n" << std::endl;
}
delete network; delete network;
return engine; return engine;
@@ -126,28 +147,30 @@ NvDsInferStatus Yolo::parseModel(nvinfer1::INetworkDefinition& network) {
std::cout << "Building YOLO network\n" << std::endl; std::cout << "Building YOLO network\n" << std::endl;
NvDsInferStatus status = buildYoloNetwork(weights, network); NvDsInferStatus status = buildYoloNetwork(weights, network);
if (status == NVDSINFER_SUCCESS) { if (status == NVDSINFER_SUCCESS)
std::cout << "Building YOLO network complete" << std::endl; std::cout << "Building YOLO network complete" << std::endl;
} else { else
std::cerr << "Building YOLO network failed" << std::endl; std::cerr << "Building YOLO network failed" << std::endl;
}
return status; return status;
} }
NvDsInferStatus Yolo::buildYoloNetwork( NvDsInferStatus Yolo::buildYoloNetwork(std::vector<float>& weights, nvinfer1::INetworkDefinition& network)
std::vector<float>& weights, nvinfer1::INetworkDefinition& network) { {
int weightPtr = 0; int weightPtr = 0;
int channels = m_InputC; int channels = m_InputC;
std::string weightsType; std::string weightsType;
if (m_WtsFilePath.find(".weights") != std::string::npos)
if (m_WtsFilePath.find(".weights") != std::string::npos) {
weightsType = "weights"; weightsType = "weights";
} else
else {
weightsType = "wts"; weightsType = "wts";
}
float eps = 1.0e-5;
if (m_NetworkType.find("yolov5") != std::string::npos)
eps = 1.0e-3;
else if (m_NetworkType.find("yolor") != std::string::npos)
eps = 1.0e-4;
nvinfer1::ITensor* data = nvinfer1::ITensor* data =
network.addInput(m_InputBlobName.c_str(), nvinfer1::DataType::kFLOAT, network.addInput(m_InputBlobName.c_str(), nvinfer1::DataType::kFLOAT,
@@ -157,26 +180,24 @@ NvDsInferStatus Yolo::buildYoloNetwork(
nvinfer1::ITensor* previous = data; nvinfer1::ITensor* previous = data;
std::vector<nvinfer1::ITensor*> tensorOutputs; std::vector<nvinfer1::ITensor*> tensorOutputs;
uint outputTensorCount = 0; std::vector<nvinfer1::ITensor*> yoloInputs;
uint inputYoloCount = 0;
for (uint i = 0; i < m_ConfigBlocks.size(); ++i) { int modelType = -1;
for (uint i = 0; i < m_ConfigBlocks.size(); ++i)
{
assert(getNumChannels(previous) == channels); assert(getNumChannels(previous) == channels);
std::string layerIndex = "(" + std::to_string(tensorOutputs.size()) + ")"; std::string layerIndex = "(" + std::to_string(tensorOutputs.size()) + ")";
if (m_ConfigBlocks.at(i).at("type") == "net") { if (m_ConfigBlocks.at(i).at("type") == "net")
printLayerInfo("", "layer", " input", " output", "weightPtr"); printLayerInfo("", "layer", " input", " output", "weightPtr");
}
else if (m_ConfigBlocks.at(i).at("type") == "convolutional") { else if (m_ConfigBlocks.at(i).at("type") == "convolutional")
float eps = 1.0e-5; {
if (m_NetworkType.find("yolov5") != std::string::npos) {
eps = 1.0e-3;
}
else if (m_NetworkType.find("yolor") != std::string::npos) {
eps = 1.0e-4;
}
std::string inputVol = dimsToString(previous->getDimensions()); std::string inputVol = dimsToString(previous->getDimensions());
nvinfer1::ILayer* out = convolutionalLayer(i, m_ConfigBlocks.at(i), weights, m_TrtWeights, weightPtr, weightsType, channels, eps, previous, &network); nvinfer1::ILayer* out = convolutionalLayer(
i, m_ConfigBlocks.at(i), weights, m_TrtWeights, weightPtr, weightsType, channels, eps, previous, &network);
previous = out->getOutput(0); previous = out->getOutput(0);
assert(previous != nullptr); assert(previous != nullptr);
channels = getNumChannels(previous); channels = getNumChannels(previous);
@@ -186,14 +207,13 @@ NvDsInferStatus Yolo::buildYoloNetwork(
printLayerInfo(layerIndex, layerType, inputVol, outputVol, std::to_string(weightPtr)); printLayerInfo(layerIndex, layerType, inputVol, outputVol, std::to_string(weightPtr));
} }
else if (m_ConfigBlocks.at(i).at("type") == "implicit_add" || m_ConfigBlocks.at(i).at("type") == "implicit_mul") { else if (m_ConfigBlocks.at(i).at("type") == "implicit_add" || m_ConfigBlocks.at(i).at("type") == "implicit_mul")
{
std::string type; std::string type;
if (m_ConfigBlocks.at(i).at("type") == "implicit_add") { if (m_ConfigBlocks.at(i).at("type") == "implicit_add")
type = "add"; type = "add";
} else if (m_ConfigBlocks.at(i).at("type") == "implicit_mul")
else if (m_ConfigBlocks.at(i).at("type") == "implicit_mul") {
type = "mul"; type = "mul";
}
assert(m_ConfigBlocks.at(i).find("filters") != m_ConfigBlocks.at(i).end()); assert(m_ConfigBlocks.at(i).find("filters") != m_ConfigBlocks.at(i).end());
int filters = std::stoi(m_ConfigBlocks.at(i).at("filters")); int filters = std::stoi(m_ConfigBlocks.at(i).at("filters"));
nvinfer1::ILayer* out = implicitLayer(filters, weights, m_TrtWeights, weightPtr, &network); nvinfer1::ILayer* out = implicitLayer(filters, weights, m_TrtWeights, weightPtr, &network);
@@ -206,19 +226,17 @@ NvDsInferStatus Yolo::buildYoloNetwork(
printLayerInfo(layerIndex, layerType, " -", outputVol, std::to_string(weightPtr)); printLayerInfo(layerIndex, layerType, " -", outputVol, std::to_string(weightPtr));
} }
else if (m_ConfigBlocks.at(i).at("type") == "shift_channels" || m_ConfigBlocks.at(i).at("type") == "control_channels") { else if (m_ConfigBlocks.at(i).at("type") == "shift_channels" || m_ConfigBlocks.at(i).at("type") == "control_channels")
{
std::string type; std::string type;
if (m_ConfigBlocks.at(i).at("type") == "shift_channels") { if (m_ConfigBlocks.at(i).at("type") == "shift_channels")
type = "shift"; type = "shift";
} else if (m_ConfigBlocks.at(i).at("type") == "control_channels")
else if (m_ConfigBlocks.at(i).at("type") == "control_channels") {
type = "control"; type = "control";
}
assert(m_ConfigBlocks.at(i).find("from") != m_ConfigBlocks.at(i).end()); assert(m_ConfigBlocks.at(i).find("from") != m_ConfigBlocks.at(i).end());
int from = stoi(m_ConfigBlocks.at(i).at("from")); int from = stoi(m_ConfigBlocks.at(i).at("from"));
if (from > 0) { if (from > 0)
from = from - i + 1; from = from - i + 1;
}
assert((i - 2 >= 0) && (i - 2 < tensorOutputs.size())); assert((i - 2 >= 0) && (i - 2 < tensorOutputs.size()));
assert((i + from - 1 >= 0) && (i + from - 1 < tensorOutputs.size())); assert((i + from - 1 >= 0) && (i + from - 1 < tensorOutputs.size()));
assert(i + from - 1 < i - 2); assert(i + from - 1 < i - 2);
@@ -231,25 +249,22 @@ NvDsInferStatus Yolo::buildYoloNetwork(
printLayerInfo(layerIndex, layerType, " -", outputVol, " -"); printLayerInfo(layerIndex, layerType, " -", outputVol, " -");
} }
else if (m_ConfigBlocks.at(i).at("type") == "dropout") { else if (m_ConfigBlocks.at(i).at("type") == "dropout")
{
// Skip dropout layer // Skip dropout layer
assert(m_ConfigBlocks.at(i).find("probability") != m_ConfigBlocks.at(i).end());
/*float probability = std::stof(m_ConfigBlocks.at(i).at("probability"));
nvinfer1::ILayer* out = dropoutLayer(probability, previous, &network);
previous = out->getOutput(0);*/
assert(previous != nullptr); assert(previous != nullptr);
tensorOutputs.push_back(previous); tensorOutputs.push_back(previous);
printLayerInfo(layerIndex, "dropout", " -", " -", " -"); printLayerInfo(layerIndex, "dropout", " -", " -", " -");
} }
else if (m_ConfigBlocks.at(i).at("type") == "shortcut") { else if (m_ConfigBlocks.at(i).at("type") == "shortcut")
{
assert(m_ConfigBlocks.at(i).find("activation") != m_ConfigBlocks.at(i).end()); assert(m_ConfigBlocks.at(i).find("activation") != m_ConfigBlocks.at(i).end());
assert(m_ConfigBlocks.at(i).find("from") != m_ConfigBlocks.at(i).end()); assert(m_ConfigBlocks.at(i).find("from") != m_ConfigBlocks.at(i).end());
std::string activation = m_ConfigBlocks.at(i).at("activation"); std::string activation = m_ConfigBlocks.at(i).at("activation");
int from = stoi(m_ConfigBlocks.at(i).at("from")); int from = stoi(m_ConfigBlocks.at(i).at("from"));
if (from > 0) { if (from > 0)
from = from - i + 1; from = from - i + 1;
}
assert((i - 2 >= 0) && (i - 2 < tensorOutputs.size())); assert((i - 2 >= 0) && (i - 2 < tensorOutputs.size()));
assert((i + from - 1 >= 0) && (i + from - 1 < tensorOutputs.size())); assert((i + from - 1 >= 0) && (i + from - 1 < tensorOutputs.size()));
assert(i + from - 1 < i - 2); assert(i + from - 1 < i - 2);
@@ -267,7 +282,8 @@ NvDsInferStatus Yolo::buildYoloNetwork(
} }
} }
else if (m_ConfigBlocks.at(i).at("type") == "route") { else if (m_ConfigBlocks.at(i).at("type") == "route")
{
assert(m_ConfigBlocks.at(i).find("layers") != m_ConfigBlocks.at(i).end()); assert(m_ConfigBlocks.at(i).find("layers") != m_ConfigBlocks.at(i).end());
nvinfer1::ILayer* out = routeLayer(i, m_ConfigBlocks.at(i), tensorOutputs, &network); nvinfer1::ILayer* out = routeLayer(i, m_ConfigBlocks.at(i), tensorOutputs, &network);
previous = out->getOutput(0); previous = out->getOutput(0);
@@ -278,7 +294,8 @@ NvDsInferStatus Yolo::buildYoloNetwork(
printLayerInfo(layerIndex, "route", " -", outputVol, std::to_string(weightPtr)); printLayerInfo(layerIndex, "route", " -", outputVol, std::to_string(weightPtr));
} }
else if (m_ConfigBlocks.at(i).at("type") == "upsample") { else if (m_ConfigBlocks.at(i).at("type") == "upsample")
{
std::string inputVol = dimsToString(previous->getDimensions()); std::string inputVol = dimsToString(previous->getDimensions());
nvinfer1::ILayer* out = upsampleLayer(i - 1, m_ConfigBlocks[i], previous, &network); nvinfer1::ILayer* out = upsampleLayer(i - 1, m_ConfigBlocks[i], previous, &network);
previous = out->getOutput(0); previous = out->getOutput(0);
@@ -288,7 +305,8 @@ NvDsInferStatus Yolo::buildYoloNetwork(
printLayerInfo(layerIndex, "upsample", inputVol, outputVol, " -"); printLayerInfo(layerIndex, "upsample", inputVol, outputVol, " -");
} }
else if (m_ConfigBlocks.at(i).at("type") == "maxpool") { else if (m_ConfigBlocks.at(i).at("type") == "maxpool")
{
std::string inputVol = dimsToString(previous->getDimensions()); std::string inputVol = dimsToString(previous->getDimensions());
nvinfer1::ILayer* out = maxpoolLayer(i, m_ConfigBlocks.at(i), previous, &network); nvinfer1::ILayer* out = maxpoolLayer(i, m_ConfigBlocks.at(i), previous, &network);
previous = out->getOutput(0); previous = out->getOutput(0);
@@ -298,8 +316,10 @@ NvDsInferStatus Yolo::buildYoloNetwork(
printLayerInfo(layerIndex, "maxpool", inputVol, outputVol, std::to_string(weightPtr)); printLayerInfo(layerIndex, "maxpool", inputVol, outputVol, std::to_string(weightPtr));
} }
else if (m_ConfigBlocks.at(i).at("type") == "reorg") { else if (m_ConfigBlocks.at(i).at("type") == "reorg")
if (m_NetworkType.find("yolov5") != std::string::npos || m_NetworkType.find("yolor") != std::string::npos) { {
if (m_NetworkType.find("yolov5") != std::string::npos || m_NetworkType.find("yolor") != std::string::npos)
{
std::string inputVol = dimsToString(previous->getDimensions()); std::string inputVol = dimsToString(previous->getDimensions());
nvinfer1::ILayer* out = reorgV5Layer(i, previous, &network); nvinfer1::ILayer* out = reorgV5Layer(i, previous, &network);
previous = out->getOutput(0); previous = out->getOutput(0);
@@ -310,7 +330,8 @@ NvDsInferStatus Yolo::buildYoloNetwork(
std::string layerType = "reorgV5"; std::string layerType = "reorgV5";
printLayerInfo(layerIndex, layerType, inputVol, outputVol, std::to_string(weightPtr)); printLayerInfo(layerIndex, layerType, inputVol, outputVol, std::to_string(weightPtr));
} }
else { else
{
std::string inputVol = dimsToString(previous->getDimensions()); std::string inputVol = dimsToString(previous->getDimensions());
nvinfer1::IPluginV2* reorgPlugin = createReorgPlugin(2); nvinfer1::IPluginV2* reorgPlugin = createReorgPlugin(2);
assert(reorgPlugin != nullptr); assert(reorgPlugin != nullptr);
@@ -328,95 +349,127 @@ NvDsInferStatus Yolo::buildYoloNetwork(
} }
} }
else if (m_ConfigBlocks.at(i).at("type") == "yolo") { else if (m_ConfigBlocks.at(i).at("type") == "yolo" || m_ConfigBlocks.at(i).at("type") == "region")
uint modelType = 1; {
uint newCoords = 0; if (m_ConfigBlocks.at(i).at("type") == "yolo")
float scaleXY = 1.0; {
if (m_NetworkType.find("yolor") != std::string::npos) { if (m_NetworkType.find("yolor") != std::string::npos)
modelType = 2; modelType = 2;
else
modelType = 1;
} }
if (m_ConfigBlocks.at(i).find("new_coords") != m_ConfigBlocks.at(i).end()) { else
newCoords = std::stoi(m_ConfigBlocks.at(i).at("new_coords")); modelType = 0;
}
if (m_ConfigBlocks.at(i).find("scale_x_y") != m_ConfigBlocks.at(i).end()) {
scaleXY = std::stof(m_ConfigBlocks.at(i).at("scale_x_y"));
}
std::string layerName = "yolo_" + std::to_string(i); std::string layerName = modelType != 0 ? "yolo_" + std::to_string(i) : "region_" + std::to_string(i);
nvinfer1::Dims prevTensorDims = previous->getDimensions(); nvinfer1::Dims prevTensorDims = previous->getDimensions();
TensorInfo& curYoloTensor = m_OutputTensors.at(outputTensorCount); TensorInfo& curYoloTensor = m_YoloTensors.at(inputYoloCount);
m_NumClasses = curYoloTensor.numClasses;
curYoloTensor.blobName = layerName; curYoloTensor.blobName = layerName;
nvinfer1::IPluginV2* yoloPlugin curYoloTensor.gridSizeX = prevTensorDims.d[2];
= new YoloLayer(curYoloTensor.numBBoxes, curYoloTensor.numClasses, m_InputW, m_InputH, curYoloTensor.gridSizeY = prevTensorDims.d[1];
prevTensorDims.d[2], prevTensorDims.d[1], modelType, newCoords, scaleXY,
curYoloTensor.anchors, curYoloTensor.mask);
assert(yoloPlugin != nullptr);
nvinfer1::IPluginV2Layer* yolo =
network.addPluginV2(&previous, 1, *yoloPlugin);
assert(yolo != nullptr);
yolo->setName(layerName.c_str());
std::string inputVol = dimsToString(previous->getDimensions());
previous = yolo->getOutput(0);
assert(previous != nullptr);
previous->setName(layerName.c_str());
std::string outputVol = dimsToString(previous->getDimensions());
network.markOutput(*previous);
channels = getNumChannels(previous);
tensorOutputs.push_back(yolo->getOutput(0));
printLayerInfo(layerIndex, "yolo", inputVol, outputVol, std::to_string(weightPtr));
++outputTensorCount;
}
else if (m_ConfigBlocks.at(i).at("type") == "region") {
std::vector<int> mask;
std::string layerName = "region_" + std::to_string(i);
nvinfer1::Dims prevTensorDims = previous->getDimensions();
TensorInfo& curRegionTensor = m_OutputTensors.at(outputTensorCount);
m_NumClasses = curRegionTensor.numClasses;
curRegionTensor.blobName = layerName;
nvinfer1::IPluginV2* regionPlugin
= new YoloLayer(curRegionTensor.numBBoxes, curRegionTensor.numClasses, m_InputW, m_InputH,
prevTensorDims.d[2], prevTensorDims.d[1], 0, 0, 1.0, curRegionTensor.anchors,
mask);
assert(regionPlugin != nullptr);
nvinfer1::IPluginV2Layer* region =
network.addPluginV2(&previous, 1, *regionPlugin);
assert(region != nullptr);
region->setName(layerName.c_str());
std::string inputVol = dimsToString(previous->getDimensions()); std::string inputVol = dimsToString(previous->getDimensions());
previous = region->getOutput(0);
assert(previous != nullptr);
previous->setName(layerName.c_str());
std::string outputVol = dimsToString(previous->getDimensions());
network.markOutput(*previous);
channels = getNumChannels(previous); channels = getNumChannels(previous);
tensorOutputs.push_back(region->getOutput(0)); tensorOutputs.push_back(previous);
printLayerInfo(layerIndex, "region", inputVol, outputVol, std::to_string(weightPtr)); yoloInputs.push_back(previous);
++outputTensorCount; ++inputYoloCount;
printLayerInfo(layerIndex, modelType != 0 ? "yolo" : "region", inputVol, " -", " -");
} }
else else
{ {
std::cout << "Unsupported layer type --> \"" std::cout << "\nUnsupported layer type --> \"" << m_ConfigBlocks.at(i).at("type") << "\"" << std::endl;
<< m_ConfigBlocks.at(i).at("type") << "\"" << std::endl;
assert(0); assert(0);
} }
} }
if ((int)weights.size() != weightPtr) if ((int)weights.size() != weightPtr)
{ {
std::cout << "Number of unused weights left: " << weights.size() - weightPtr << std::endl; std::cout << "\nNumber of unused weights left: " << weights.size() - weightPtr << std::endl;
assert(0); assert(0);
} }
std::cout << "Output YOLO blob names: " << std::endl; if (m_YoloCount == inputYoloCount)
for (auto& tensor : m_OutputTensors) { {
assert((modelType != -1) && "\nCould not determine model type");
nvinfer1::ITensor* yoloInputTensors[inputYoloCount];
uint64_t outputSize = 0;
for (uint j = 0; j < inputYoloCount; ++j)
{
yoloInputTensors[j] = yoloInputs[j];
TensorInfo& curYoloTensor = m_YoloTensors.at(j);
outputSize += curYoloTensor.gridSizeX * curYoloTensor.gridSizeY * curYoloTensor.numBBoxes;
}
if (m_TopK > outputSize) {
std::cout << "\ntopk > Number of outputs\nPlease change the topk to " << outputSize
<< " or less in config_nms.txt file\n" << std::endl;
assert(0);
}
std::string layerName = "yolo";
nvinfer1::IPluginV2* yoloPlugin = new YoloLayer(
m_InputW, m_InputH, m_NumClasses, m_NewCoords, m_YoloTensors, outputSize, modelType, m_TopK,
m_ScoreThreshold);
assert(yoloPlugin != nullptr);
nvinfer1::IPluginV2Layer* yolo = network.addPluginV2(yoloInputTensors, inputYoloCount, *yoloPlugin);
assert(yolo != nullptr);
yolo->setName(layerName.c_str());
previous = yolo->getOutput(0);
assert(previous != nullptr);
previous->setName(layerName.c_str());
tensorOutputs.push_back(yolo->getOutput(0));
nvinfer1::ITensor* yoloTensors[] = {yolo->getOutput(0), yolo->getOutput(1)};
std::string outputVol = dimsToString(previous->getDimensions());
nvinfer1::plugin::NMSParameters nmsParams;
nmsParams.shareLocation = true;
nmsParams.backgroundLabelId = -1;
nmsParams.numClasses = m_NumClasses;
nmsParams.topK = m_TopK;
nmsParams.keepTopK = m_TopK;
nmsParams.scoreThreshold = m_ScoreThreshold;
nmsParams.iouThreshold = m_IouThreshold;
nmsParams.isNormalized = false;
layerName = "batchedNMS";
nvinfer1::IPluginV2* batchedNMS = createBatchedNMSPlugin(nmsParams);
nvinfer1::IPluginV2Layer* nms = network.addPluginV2(yoloTensors, 2, *batchedNMS);
nms->setName(layerName.c_str());
nvinfer1::ITensor* num_detections = nms->getOutput(0);
layerName = "num_detections";
num_detections->setName(layerName.c_str());
nvinfer1::ITensor* nmsed_boxes = nms->getOutput(1);
layerName = "nmsed_boxes";
nmsed_boxes->setName(layerName.c_str());
nvinfer1::ITensor* nmsed_scores = nms->getOutput(2);
layerName = "nmsed_scores";
nmsed_scores->setName(layerName.c_str());
nvinfer1::ITensor* nmsed_classes = nms->getOutput(3);
layerName = "nmsed_classes";
nmsed_classes->setName(layerName.c_str());
network.markOutput(*num_detections);
network.markOutput(*nmsed_boxes);
network.markOutput(*nmsed_scores);
network.markOutput(*nmsed_classes);
printLayerInfo("", "batched_nms", " -", outputVol, " -");
}
else {
std::cout << "\nError in yolo cfg file" << std::endl;
assert(0);
}
std::cout << "\nOutput YOLO blob names: " << std::endl;
for (auto& tensor : m_YoloTensors)
{
std::cout << tensor.blobName << std::endl; std::cout << tensor.blobName << std::endl;
} }
int nbLayers = network.getNbLayers(); int nbLayers = network.getNbLayers();
std::cout << "Total number of YOLO layers: " << nbLayers << std::endl; std::cout << "\nTotal number of YOLO layers: " << nbLayers << "\n" << std::endl;
return NVDSINFER_SUCCESS; return NVDSINFER_SUCCESS;
} }
@@ -462,39 +515,44 @@ Yolo::parseConfigFile (const std::string cfgFilePath)
void Yolo::parseConfigBlocks() void Yolo::parseConfigBlocks()
{ {
for (auto block : m_ConfigBlocks) { for (auto block : m_ConfigBlocks)
{
if (block.at("type") == "net") if (block.at("type") == "net")
{ {
assert((block.find("height") != block.end()) assert((block.find("height") != block.end()) && "Missing 'height' param in network cfg");
&& "Missing 'height' param in network cfg");
assert((block.find("width") != block.end()) && "Missing 'width' param in network cfg"); assert((block.find("width") != block.end()) && "Missing 'width' param in network cfg");
assert((block.find("channels") != block.end()) assert((block.find("channels") != block.end()) && "Missing 'channels' param in network cfg");
&& "Missing 'channels' param in network cfg");
m_InputH = std::stoul(block.at("height")); m_InputH = std::stoul(block.at("height"));
m_InputW = std::stoul(block.at("width")); m_InputW = std::stoul(block.at("width"));
m_InputC = std::stoul(block.at("channels")); m_InputC = std::stoul(block.at("channels"));
m_InputSize = m_InputC * m_InputH * m_InputW; m_InputSize = m_InputC * m_InputH * m_InputW;
if (block.find("letter_box") != block.end()) { if (block.find("letter_box") != block.end())
{
m_LetterBox = std::stoul(block.at("letter_box")); m_LetterBox = std::stoul(block.at("letter_box"));
} }
else {
m_LetterBox = 0;
} }
} else if ((block.at("type") == "region") || (block.at("type") == "yolo"))
else if ((block.at("type") == "region") || (block.at("type") == "yolo") || (block.at("type") == "detect"))
{ {
assert((block.find("num") != block.end()) assert((block.find("num") != block.end())
&& std::string("Missing 'num' param in " + block.at("type") + " layer").c_str()); && std::string("Missing 'num' param in " + block.at("type") + " layer").c_str());
assert((block.find("classes") != block.end()) assert((block.find("classes") != block.end())
&& std::string("Missing 'classes' param in " + block.at("type") + " layer") && std::string("Missing 'classes' param in " + block.at("type") + " layer").c_str());
.c_str());
assert((block.find("anchors") != block.end()) assert((block.find("anchors") != block.end())
&& std::string("Missing 'anchors' param in " + block.at("type") + " layer") && std::string("Missing 'anchors' param in " + block.at("type") + " layer").c_str());
.c_str());
++m_YoloCount;
m_NumClasses = std::stoul(block.at("classes"));
if (block.find("new_coords") != block.end())
{
m_NewCoords = std::stoul(block.at("new_coords"));
}
TensorInfo outputTensor; TensorInfo outputTensor;
std::string anchorString = block.at("anchors"); std::string anchorString = block.at("anchors");
while (!anchorString.empty()) while (!anchorString.empty())
{ {
@@ -513,7 +571,8 @@ void Yolo::parseConfigBlocks()
} }
} }
if (block.find("mask") != block.end()) { if (block.find("mask") != block.end())
{
std::string maskString = block.at("mask"); std::string maskString = block.at("mask");
while (!maskString.empty()) while (!maskString.empty())
{ {
@@ -533,17 +592,41 @@ void Yolo::parseConfigBlocks()
} }
} }
outputTensor.numBBoxes = outputTensor.mask.size() > 0 if (block.find("scale_x_y") != block.end())
? outputTensor.mask.size() {
: std::stoul(trim(block.at("num"))); outputTensor.scaleXY = std::stof(block.at("scale_x_y"));
outputTensor.numClasses = std::stoul(block.at("classes")); }
m_OutputTensors.push_back(outputTensor); else
{
outputTensor.scaleXY = 1.0;
}
outputTensor.numBBoxes
= outputTensor.mask.size() > 0 ? outputTensor.mask.size() : std::stoul(trim(block.at("num")));
m_YoloTensors.push_back(outputTensor);
} }
} }
} }
void Yolo::destroyNetworkUtils() { void Yolo::parseConfigNMSBlocks()
for (uint i = 0; i < m_TrtWeights.size(); ++i) { {
auto block = m_ConfigNMSBlocks[0];
assert((block.at("type") == "property") && "Missing 'property' param in nms cfg");
assert((block.find("iou-threshold") != block.end()) && "Missing 'iou-threshold' param in nms cfg");
assert((block.find("score-threshold") != block.end()) && "Missing 'score-threshold' param in nms cfg");
assert((block.find("topk") != block.end()) && "Missing 'topk' param in nms cfg");
m_IouThreshold = std::stof(block.at("iou-threshold"));
m_ScoreThreshold = std::stof(block.at("score-threshold"));
m_TopK = std::stoul(block.at("topk"));
}
void Yolo::destroyNetworkUtils()
{
for (uint i = 0; i < m_TrtWeights.size(); ++i)
{
if (m_TrtWeights[i].count > 0) if (m_TrtWeights[i].count > 0)
free(const_cast<void*>(m_TrtWeights[i].values)); free(const_cast<void*>(m_TrtWeights[i].values));
} }

View File

@@ -29,7 +29,6 @@
#include "layers/convolutional_layer.h" #include "layers/convolutional_layer.h"
#include "layers/implicit_layer.h" #include "layers/implicit_layer.h"
#include "layers/channels_layer.h" #include "layers/channels_layer.h"
#include "layers/dropout_layer.h"
#include "layers/shortcut_layer.h" #include "layers/shortcut_layer.h"
#include "layers/route_layer.h" #include "layers/route_layer.h"
#include "layers/upsample_layer.h" #include "layers/upsample_layer.h"
@@ -54,8 +53,10 @@ struct NetworkInfo
/**
 * Description of a single yolo/region head of the network.
 *
 * Instances are created while the cfg blocks are parsed and are completed
 * (blob name, grid size) while the network is being built.
 */
struct TensorInfo
{
    // Layer name assigned during network build ("yolo_<i>" or "region_<i>").
    std::string blobName;
    // Grid width, taken from dimension index 2 of the tensor feeding the head.
    uint gridSizeX {0};
    // Grid height, taken from dimension index 1 of the tensor feeding the head.
    uint gridSizeY {0};
    // Number of boxes per grid cell: mask.size() when a mask is given, else the cfg 'num' value.
    uint numBBoxes {0};
    // Value of the cfg 'scale_x_y' key. Defaults to 1.0, the same fallback the cfg parser
    // applies when the key is absent, so a default-constructed object is never indeterminate.
    float scaleXY {1.0f};
    // Values parsed from the cfg 'anchors' entry.
    std::vector<float> anchors;
    // Values parsed from the optional cfg 'mask' entry; empty when the key is absent.
    std::vector<int> mask;
};
@@ -63,12 +64,15 @@ struct TensorInfo
class Yolo : public IModelParser { class Yolo : public IModelParser {
public: public:
Yolo(const NetworkInfo& networkInfo); Yolo(const NetworkInfo& networkInfo);
~Yolo() override; ~Yolo() override;
bool hasFullDimsSupported() const override { return false; } bool hasFullDimsSupported() const override { return false; }
const char* getModelName() const override { const char* getModelName() const override {
return m_ConfigFilePath.empty() ? m_NetworkType.c_str() return m_ConfigFilePath.empty() ? m_NetworkType.c_str() : m_ConfigFilePath.c_str();
: m_ConfigFilePath.c_str();
} }
NvDsInferStatus parseModel(nvinfer1::INetworkDefinition& network) override; NvDsInferStatus parseModel(nvinfer1::INetworkDefinition& network) override;
nvinfer1::ICudaEngine *createEngine (nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config); nvinfer1::ICudaEngine *createEngine (nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config);
@@ -90,17 +94,26 @@ protected:
uint64_t m_InputSize; uint64_t m_InputSize;
uint m_NumClasses; uint m_NumClasses;
uint m_LetterBox; uint m_LetterBox;
uint m_NewCoords;
uint m_YoloCount;
float m_IouThreshold;
float m_ScoreThreshold;
uint m_TopK;
std::vector<TensorInfo> m_OutputTensors; std::vector<TensorInfo> m_YoloTensors;
std::vector<std::map<std::string, std::string>> m_ConfigBlocks; std::vector<std::map<std::string, std::string>> m_ConfigBlocks;
std::vector<std::map<std::string, std::string>> m_ConfigNMSBlocks;
std::vector<nvinfer1::Weights> m_TrtWeights; std::vector<nvinfer1::Weights> m_TrtWeights;
private: private:
NvDsInferStatus buildYoloNetwork( NvDsInferStatus buildYoloNetwork(std::vector<float>& weights, nvinfer1::INetworkDefinition& network);
std::vector<float>& weights, nvinfer1::INetworkDefinition& network);
std::vector<std::map<std::string, std::string>> parseConfigFile( std::vector<std::map<std::string, std::string>> parseConfigFile(const std::string cfgFilePath);
const std::string cfgFilePath);
void parseConfigBlocks(); void parseConfigBlocks();
void parseConfigNMSBlocks();
void destroyNetworkUtils(); void destroyNetworkUtils();
}; };

View File

@@ -3,18 +3,14 @@
* https://www.github.com/marcoslucianops * https://www.github.com/marcoslucianops
*/ */
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdint.h> #include <stdint.h>
#include <stdio.h>
#include <string.h>
// Logistic sigmoid 1 / (1 + e^-x), evaluated on the device with the __expf intrinsic.
inline __device__ float sigmoidGPU(const float& x)
{
    const float expNeg = __expf(-x);
    return 1.0f / (1.0f + expNeg);
}
__global__ void gpuYoloLayer( __global__ void gpuYoloLayer(
const float* input, float* output, const uint netWidth, const uint netHeight, const uint gridSizeX, const float* input, int* d_indexes, float* d_scores, float* d_boxes, int* d_classes, int* countData,
const uint gridSizeY, const uint numOutputClasses, const uint numBBoxes, const float scaleXY, const float scoreThreshold, const uint netWidth, const uint netHeight, const uint gridSizeX, const uint gridSizeY,
const float* anchors, const int* mask) const uint numOutputClasses, const uint numBBoxes, const float scaleXY, const float* anchors, const int* mask)
{ {
uint x_id = blockIdx.x * blockDim.x + threadIdx.x; uint x_id = blockIdx.x * blockDim.x + threadIdx.x;
uint y_id = blockIdx.y * blockDim.y + threadIdx.y; uint y_id = blockIdx.y * blockDim.y + threadIdx.y;
@@ -28,28 +24,32 @@ __global__ void gpuYoloLayer(
const int numGridCells = gridSizeX * gridSizeY; const int numGridCells = gridSizeX * gridSizeY;
const int bbindex = y_id * gridSizeX + x_id; const int bbindex = y_id * gridSizeX + x_id;
const float objectness
= sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)]);
if (objectness < scoreThreshold) return;
int count = (int)atomicAdd(&countData[0], 1);
const float alpha = scaleXY; const float alpha = scaleXY;
const float beta = -0.5 * (scaleXY - 1); const float beta = -0.5 * (scaleXY - 1);
output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 0)] float x
= (sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 0)]) = (sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 0)])
* alpha + beta + x_id) * netWidth / gridSizeX; * alpha + beta + x_id) * netWidth / gridSizeX;
output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 1)] float y
= (sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 1)]) = (sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 1)])
* alpha + beta + y_id) * netHeight / gridSizeY; * alpha + beta + y_id) * netHeight / gridSizeY;
output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 2)] float w
= __expf(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 2)]) = __expf(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 2)])
* anchors[mask[z_id] * 2]; * anchors[mask[z_id] * 2];
output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 3)] float h
= __expf(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 3)]) = __expf(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 3)])
* anchors[mask[z_id] * 2 + 1]; * anchors[mask[z_id] * 2 + 1];
const float objectness
= sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)]);
float maxProb = 0.0f; float maxProb = 0.0f;
int maxIndex = -1; int maxIndex = -1;
@@ -65,22 +65,26 @@ __global__ void gpuYoloLayer(
} }
} }
output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)] d_indexes[count] = count;
= objectness * maxProb; d_scores[count] = objectness * maxProb + 1.f;
d_boxes[count * 4 + 0] = x - 0.5 * w;
output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 5)] d_boxes[count * 4 + 1] = y - 0.5 * h;
= maxIndex; d_boxes[count * 4 + 2] = x + 0.5 * w;
d_boxes[count * 4 + 3] = y + 0.5 * h;
d_classes[count] = maxIndex;
} }
cudaError_t cudaYoloLayer( cudaError_t cudaYoloLayer(
const void* input, void* output, const uint& batchSize, const uint& netWidth, const uint& netHeight, const void* input, void* d_indexes, void* d_scores, void* d_boxes, void* d_classes, void* countData,
const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes, const uint& batchSize, uint64_t& inputSize, uint64_t& outputSize, const float& scoreThreshold, const uint& netWidth,
uint64_t& outputSize, const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream); const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes,
const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream);
cudaError_t cudaYoloLayer( cudaError_t cudaYoloLayer(
const void* input, void* output, const uint& batchSize, const uint& netWidth, const uint& netHeight, const void* input, void* d_indexes, void* d_scores, void* d_boxes, void* d_classes, void* countData,
const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes, const uint& batchSize, uint64_t& inputSize, uint64_t& outputSize, const float& scoreThreshold, const uint& netWidth,
uint64_t& outputSize, const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream) const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes,
const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream)
{ {
dim3 threads_per_block(16, 16, 4); dim3 threads_per_block(16, 16, 4);
dim3 number_of_blocks((gridSizeX / threads_per_block.x) + 1, dim3 number_of_blocks((gridSizeX / threads_per_block.x) + 1,
@@ -90,9 +94,12 @@ cudaError_t cudaYoloLayer(
for (unsigned int batch = 0; batch < batchSize; ++batch) for (unsigned int batch = 0; batch < batchSize; ++batch)
{ {
gpuYoloLayer<<<number_of_blocks, threads_per_block, 0, stream>>>( gpuYoloLayer<<<number_of_blocks, threads_per_block, 0, stream>>>(
reinterpret_cast<const float*>(input) + (batch * outputSize), reinterpret_cast<const float*>(input) + (batch * inputSize),
reinterpret_cast<float*>(output) + (batch * outputSize), reinterpret_cast<int*>(d_indexes) + (batch * outputSize),
netWidth, netHeight, gridSizeX, gridSizeY, numOutputClasses, numBBoxes, scaleXY, reinterpret_cast<float*>(d_scores) + (batch * outputSize),
reinterpret_cast<float*>(d_boxes) + (batch * 4 * outputSize),
reinterpret_cast<int*>(d_classes) + (batch * outputSize), reinterpret_cast<int*>(countData) + (batch),
scoreThreshold, netWidth, netHeight, gridSizeX, gridSizeY, numOutputClasses, numBBoxes, scaleXY,
reinterpret_cast<const float*>(anchors), reinterpret_cast<const int*>(mask)); reinterpret_cast<const float*>(anchors), reinterpret_cast<const int*>(mask));
} }
return cudaGetLastError(); return cudaGetLastError();

View File

@@ -3,16 +3,12 @@
* https://www.github.com/marcoslucianops * https://www.github.com/marcoslucianops
*/ */
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdint.h> #include <stdint.h>
#include <stdio.h>
#include <string.h>
__global__ void gpuYoloLayer_nc( __global__ void gpuYoloLayer_nc(
const float* input, float* output, const uint netWidth, const uint netHeight, const uint gridSizeX, const float* input, int* d_indexes, float* d_scores, float* d_boxes, int* d_classes, int* countData,
const uint gridSizeY, const uint numOutputClasses, const uint numBBoxes, const float scaleXY, const float scoreThreshold, const uint netWidth, const uint netHeight, const uint gridSizeX, const uint gridSizeY,
const float* anchors, const int* mask) const uint numOutputClasses, const uint numBBoxes, const float scaleXY, const float* anchors, const int* mask)
{ {
uint x_id = blockIdx.x * blockDim.x + threadIdx.x; uint x_id = blockIdx.x * blockDim.x + threadIdx.x;
uint y_id = blockIdx.y * blockDim.y + threadIdx.y; uint y_id = blockIdx.y * blockDim.y + threadIdx.y;
@@ -26,28 +22,32 @@ __global__ void gpuYoloLayer_nc(
const int numGridCells = gridSizeX * gridSizeY; const int numGridCells = gridSizeX * gridSizeY;
const int bbindex = y_id * gridSizeX + x_id; const int bbindex = y_id * gridSizeX + x_id;
const float objectness
= input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)];
if (objectness < scoreThreshold) return;
int count = (int)atomicAdd(&countData[0], 1);
const float alpha = scaleXY; const float alpha = scaleXY;
const float beta = -0.5 * (scaleXY - 1); const float beta = -0.5 * (scaleXY - 1);
output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 0)] float x
= (input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 0)] = (input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 0)]
* alpha + beta + x_id) * netWidth / gridSizeX; * alpha + beta + x_id) * netWidth / gridSizeX;
output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 1)] float y
= (input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 1)] = (input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 1)]
* alpha + beta + y_id) * netHeight / gridSizeY; * alpha + beta + y_id) * netHeight / gridSizeY;
output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 2)] float w
= __powf(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 2)] * 2, 2) = __powf(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 2)] * 2, 2)
* anchors[mask[z_id] * 2]; * anchors[mask[z_id] * 2];
output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 3)] float h
= __powf(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 3)] * 2, 2) = __powf(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 3)] * 2, 2)
* anchors[mask[z_id] * 2 + 1]; * anchors[mask[z_id] * 2 + 1];
const float objectness
= input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)];
float maxProb = 0.0f; float maxProb = 0.0f;
int maxIndex = -1; int maxIndex = -1;
@@ -63,22 +63,26 @@ __global__ void gpuYoloLayer_nc(
} }
} }
output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)] d_indexes[count] = count;
= objectness * maxProb; d_scores[count] = objectness * maxProb + 1.f;
d_boxes[count * 4 + 0] = x - 0.5 * w;
output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 5)] d_boxes[count * 4 + 1] = y - 0.5 * h;
= maxIndex; d_boxes[count * 4 + 2] = x + 0.5 * w;
d_boxes[count * 4 + 3] = y + 0.5 * h;
d_classes[count] = maxIndex;
} }
cudaError_t cudaYoloLayer_nc( cudaError_t cudaYoloLayer_nc(
const void* input, void* output, const uint& batchSize, const uint& netWidth, const uint& netHeight, const void* input, void* d_indexes, void* d_scores, void* d_boxes, void* d_classes, void* countData,
const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes, const uint& batchSize, uint64_t& inputSize, uint64_t& outputSize, const float& scoreThreshold, const uint& netWidth,
uint64_t& outputSize, const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream); const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes,
const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream);
cudaError_t cudaYoloLayer_nc( cudaError_t cudaYoloLayer_nc(
const void* input, void* output, const uint& batchSize, const uint& netWidth, const uint& netHeight, const void* input, void* d_indexes, void* d_scores, void* d_boxes, void* d_classes, void* countData,
const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes, const uint& batchSize, uint64_t& inputSize, uint64_t& outputSize, const float& scoreThreshold, const uint& netWidth,
uint64_t& outputSize, const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream) const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes,
const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream)
{ {
dim3 threads_per_block(16, 16, 4); dim3 threads_per_block(16, 16, 4);
dim3 number_of_blocks((gridSizeX / threads_per_block.x) + 1, dim3 number_of_blocks((gridSizeX / threads_per_block.x) + 1,
@@ -88,9 +92,12 @@ cudaError_t cudaYoloLayer_nc(
for (unsigned int batch = 0; batch < batchSize; ++batch) for (unsigned int batch = 0; batch < batchSize; ++batch)
{ {
gpuYoloLayer_nc<<<number_of_blocks, threads_per_block, 0, stream>>>( gpuYoloLayer_nc<<<number_of_blocks, threads_per_block, 0, stream>>>(
reinterpret_cast<const float*>(input) + (batch * outputSize), reinterpret_cast<const float*>(input) + (batch * inputSize),
reinterpret_cast<float*>(output) + (batch * outputSize), reinterpret_cast<int*>(d_indexes) + (batch * outputSize),
netWidth, netHeight, gridSizeX, gridSizeY, numOutputClasses, numBBoxes, scaleXY, reinterpret_cast<float*>(d_scores) + (batch * outputSize),
reinterpret_cast<float*>(d_boxes) + (batch * 4 * outputSize),
reinterpret_cast<int*>(d_classes) + (batch * outputSize), reinterpret_cast<int*>(countData) + (batch),
scoreThreshold, netWidth, netHeight, gridSizeX, gridSizeY, numOutputClasses, numBBoxes, scaleXY,
reinterpret_cast<const float*>(anchors), reinterpret_cast<const int*>(mask)); reinterpret_cast<const float*>(anchors), reinterpret_cast<const int*>(mask));
} }
return cudaGetLastError(); return cudaGetLastError();

View File

@@ -3,18 +3,14 @@
* https://www.github.com/marcoslucianops * https://www.github.com/marcoslucianops
*/ */
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdint.h> #include <stdint.h>
#include <stdio.h>
#include <string.h>
// Logistic sigmoid 1 / (1 + e^-x), evaluated on the device with the __expf intrinsic.
inline __device__ float sigmoidGPU(const float& x)
{
    const float expNeg = __expf(-x);
    return 1.0f / (1.0f + expNeg);
}
__global__ void gpuYoloLayer_r( __global__ void gpuYoloLayer_r(
const float* input, float* output, const uint netWidth, const uint netHeight, const uint gridSizeX, const float* input, int* d_indexes, float* d_scores, float* d_boxes, int* d_classes, int* countData,
const uint gridSizeY, const uint numOutputClasses, const uint numBBoxes, const float scaleXY, const float scoreThreshold, const uint netWidth, const uint netHeight, const uint gridSizeX, const uint gridSizeY,
const float* anchors, const int* mask) const uint numOutputClasses, const uint numBBoxes, const float scaleXY, const float* anchors, const int* mask)
{ {
uint x_id = blockIdx.x * blockDim.x + threadIdx.x; uint x_id = blockIdx.x * blockDim.x + threadIdx.x;
uint y_id = blockIdx.y * blockDim.y + threadIdx.y; uint y_id = blockIdx.y * blockDim.y + threadIdx.y;
@@ -28,28 +24,32 @@ __global__ void gpuYoloLayer_r(
const int numGridCells = gridSizeX * gridSizeY; const int numGridCells = gridSizeX * gridSizeY;
const int bbindex = y_id * gridSizeX + x_id; const int bbindex = y_id * gridSizeX + x_id;
const float objectness
= sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)]);
if (objectness < scoreThreshold) return;
int count = (int)atomicAdd(&countData[0], 1);
const float alpha = scaleXY; const float alpha = scaleXY;
const float beta = -0.5 * (scaleXY - 1); const float beta = -0.5 * (scaleXY - 1);
output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 0)] float x
= (sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 0)]) = (sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 0)])
* alpha + beta + x_id) * netWidth / gridSizeX; * alpha + beta + x_id) * netWidth / gridSizeX;
output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 1)] float y
= (sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 1)]) = (sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 1)])
* alpha + beta + y_id) * netHeight / gridSizeY; * alpha + beta + y_id) * netHeight / gridSizeY;
output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 2)] float w
= __powf(sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 2)]) * 2, 2) = __powf(sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 2)]) * 2, 2)
* anchors[mask[z_id] * 2]; * anchors[mask[z_id] * 2];
output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 3)] float h
= __powf(sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 3)]) * 2, 2) = __powf(sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 3)]) * 2, 2)
* anchors[mask[z_id] * 2 + 1]; * anchors[mask[z_id] * 2 + 1];
const float objectness
= sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)]);
float maxProb = 0.0f; float maxProb = 0.0f;
int maxIndex = -1; int maxIndex = -1;
@@ -65,22 +65,26 @@ __global__ void gpuYoloLayer_r(
} }
} }
output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)] d_indexes[count] = count;
= objectness * maxProb; d_scores[count] = objectness * maxProb + 1.f;
d_boxes[count * 4 + 0] = x - 0.5 * w;
output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 5)] d_boxes[count * 4 + 1] = y - 0.5 * h;
= maxIndex; d_boxes[count * 4 + 2] = x + 0.5 * w;
d_boxes[count * 4 + 3] = y + 0.5 * h;
d_classes[count] = maxIndex;
} }
cudaError_t cudaYoloLayer_r( cudaError_t cudaYoloLayer_r(
const void* input, void* output, const uint& batchSize, const uint& netWidth, const uint& netHeight, const void* input, void* d_indexes, void* d_scores, void* d_boxes, void* d_classes, void* countData,
const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes, const uint& batchSize, uint64_t& inputSize, uint64_t& outputSize, const float& scoreThreshold, const uint& netWidth,
uint64_t& outputSize, const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream); const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes,
const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream);
cudaError_t cudaYoloLayer_r( cudaError_t cudaYoloLayer_r(
const void* input, void* output, const uint& batchSize, const uint& netWidth, const uint& netHeight, const void* input, void* d_indexes, void* d_scores, void* d_boxes, void* d_classes, void* countData,
const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes, const uint& batchSize, uint64_t& inputSize, uint64_t& outputSize, const float& scoreThreshold, const uint& netWidth,
uint64_t& outputSize, const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream) const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes,
const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream)
{ {
dim3 threads_per_block(16, 16, 4); dim3 threads_per_block(16, 16, 4);
dim3 number_of_blocks((gridSizeX / threads_per_block.x) + 1, dim3 number_of_blocks((gridSizeX / threads_per_block.x) + 1,
@@ -90,9 +94,12 @@ cudaError_t cudaYoloLayer_r(
for (unsigned int batch = 0; batch < batchSize; ++batch) for (unsigned int batch = 0; batch < batchSize; ++batch)
{ {
gpuYoloLayer_r<<<number_of_blocks, threads_per_block, 0, stream>>>( gpuYoloLayer_r<<<number_of_blocks, threads_per_block, 0, stream>>>(
reinterpret_cast<const float*>(input) + (batch * outputSize), reinterpret_cast<const float*>(input) + (batch * inputSize),
reinterpret_cast<float*>(output) + (batch * outputSize), reinterpret_cast<int*>(d_indexes) + (batch * outputSize),
netWidth, netHeight, gridSizeX, gridSizeY, numOutputClasses, numBBoxes, scaleXY, reinterpret_cast<float*>(d_scores) + (batch * outputSize),
reinterpret_cast<float*>(d_boxes) + (batch * 4 * outputSize),
reinterpret_cast<int*>(d_classes) + (batch * outputSize), reinterpret_cast<int*>(countData) + (batch),
scoreThreshold, netWidth, netHeight, gridSizeX, gridSizeY, numOutputClasses, numBBoxes, scaleXY,
reinterpret_cast<const float*>(anchors), reinterpret_cast<const int*>(mask)); reinterpret_cast<const float*>(anchors), reinterpret_cast<const int*>(mask));
} }
return cudaGetLastError(); return cudaGetLastError();

View File

@@ -3,17 +3,13 @@
* https://www.github.com/marcoslucianops * https://www.github.com/marcoslucianops
*/ */
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdint.h> #include <stdint.h>
#include <stdio.h>
#include <string.h>
inline __device__ float sigmoidGPU(const float& x) { return 1.0f / (1.0f + __expf(-x)); } inline __device__ float sigmoidGPU(const float& x) { return 1.0f / (1.0f + __expf(-x)); }
__device__ void softmaxGPU( __device__ void softmaxGPU(
const float* input, const int bbindex, const int numGridCells, uint z_id, const float* input, const int bbindex, const int numGridCells, uint z_id, const uint numOutputClasses, float temp,
const uint numOutputClasses, float temp, float* output) float* output)
{ {
int i; int i;
float sum = 0; float sum = 0;
@@ -33,9 +29,9 @@ __device__ void softmaxGPU(
} }
__global__ void gpuRegionLayer( __global__ void gpuRegionLayer(
const float* input, float* output, float* softmax, const uint netWidth, const uint netHeight, const float* input, float* softmax, int* d_indexes, float* d_scores, float* d_boxes, int* d_classes, int* countData,
const uint gridSizeX, const uint gridSizeY, const uint numOutputClasses, const uint numBBoxes, const float scoreThreshold, const uint netWidth, const uint netHeight, const uint gridSizeX, const uint gridSizeY,
const float* anchors) const uint numOutputClasses, const uint numBBoxes, const float* anchors)
{ {
uint x_id = blockIdx.x * blockDim.x + threadIdx.x; uint x_id = blockIdx.x * blockDim.x + threadIdx.x;
uint y_id = blockIdx.y * blockDim.y + threadIdx.y; uint y_id = blockIdx.y * blockDim.y + threadIdx.y;
@@ -49,27 +45,31 @@ __global__ void gpuRegionLayer(
const int numGridCells = gridSizeX * gridSizeY; const int numGridCells = gridSizeX * gridSizeY;
const int bbindex = y_id * gridSizeX + x_id; const int bbindex = y_id * gridSizeX + x_id;
output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 0)] const float objectness
= sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)]);
if (objectness < scoreThreshold) return;
int count = (int)atomicAdd(&countData[0], 1);
float x
= (sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 0)]) = (sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 0)])
+ x_id) * netWidth / gridSizeX; + x_id) * netWidth / gridSizeX;
output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 1)] float y
= (sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 1)]) = (sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 1)])
+ y_id) * netHeight / gridSizeY; + y_id) * netHeight / gridSizeY;
output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 2)] float w
= __expf(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 2)]) = __expf(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 2)])
* anchors[z_id * 2] * netWidth / gridSizeX; * anchors[z_id * 2] * netWidth / gridSizeX;
output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 3)] float h
= __expf(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 3)]) = __expf(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 3)])
* anchors[z_id * 2 + 1] * netHeight / gridSizeY; * anchors[z_id * 2 + 1] * netHeight / gridSizeY;
softmaxGPU(input, bbindex, numGridCells, z_id, numOutputClasses, 1.0, softmax); softmaxGPU(input, bbindex, numGridCells, z_id, numOutputClasses, 1.0, softmax);
const float objectness
= sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)]);
float maxProb = 0.0f; float maxProb = 0.0f;
int maxIndex = -1; int maxIndex = -1;
@@ -85,22 +85,26 @@ __global__ void gpuRegionLayer(
} }
} }
output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)] d_indexes[count] = count;
= objectness * maxProb; d_scores[count] = objectness * maxProb + 1.f;
d_boxes[count * 4 + 0] = x - 0.5 * w;
output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 5)] d_boxes[count * 4 + 1] = y - 0.5 * h;
= maxIndex; d_boxes[count * 4 + 2] = x + 0.5 * w;
d_boxes[count * 4 + 3] = y + 0.5 * h;
d_classes[count] = maxIndex;
} }
cudaError_t cudaRegionLayer( cudaError_t cudaRegionLayer(
const void* input, void* output, void* softmax, const uint& batchSize, const uint& netWidth, const void* input, void* softmax, void* d_indexes, void* d_scores, void* d_boxes, void* d_classes, void* countData,
const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& batchSize, uint64_t& inputSize, uint64_t& outputSize, const float& scoreThreshold, const uint& netWidth,
const uint& numBBoxes, uint64_t& outputSize, const void* anchors, cudaStream_t stream); const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes,
const void* anchors, cudaStream_t stream);
cudaError_t cudaRegionLayer( cudaError_t cudaRegionLayer(
const void* input, void* output, void* softmax, const uint& batchSize, const uint& netWidth, const void* input, void* softmax, void* d_indexes, void* d_scores, void* d_boxes, void* d_classes, void* countData,
const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& batchSize, uint64_t& inputSize, uint64_t& outputSize, const float& scoreThreshold, const uint& netWidth,
const uint& numBBoxes, uint64_t& outputSize, const void* anchors, cudaStream_t stream) const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes,
const void* anchors, cudaStream_t stream)
{ {
dim3 threads_per_block(16, 16, 4); dim3 threads_per_block(16, 16, 4);
dim3 number_of_blocks((gridSizeX / threads_per_block.x) + 1, dim3 number_of_blocks((gridSizeX / threads_per_block.x) + 1,
@@ -110,10 +114,13 @@ cudaError_t cudaRegionLayer(
for (unsigned int batch = 0; batch < batchSize; ++batch) for (unsigned int batch = 0; batch < batchSize; ++batch)
{ {
gpuRegionLayer<<<number_of_blocks, threads_per_block, 0, stream>>>( gpuRegionLayer<<<number_of_blocks, threads_per_block, 0, stream>>>(
reinterpret_cast<const float*>(input) + (batch * outputSize), reinterpret_cast<const float*>(input) + (batch * inputSize),
reinterpret_cast<float*>(output) + (batch * outputSize), reinterpret_cast<float*>(softmax) + (batch * inputSize),
reinterpret_cast<float*>(softmax) + (batch * outputSize), reinterpret_cast<int*>(d_indexes) + (batch * outputSize),
netWidth, netHeight, gridSizeX, gridSizeY, numOutputClasses, numBBoxes, reinterpret_cast<float*>(d_scores) + (batch * outputSize),
reinterpret_cast<float*>(d_boxes) + (batch * 4 * outputSize),
reinterpret_cast<int*>(d_classes) + (batch * outputSize), reinterpret_cast<int*>(countData) + (batch),
scoreThreshold, netWidth, netHeight, gridSizeX, gridSizeY, numOutputClasses, numBBoxes,
reinterpret_cast<const float*>(anchors)); reinterpret_cast<const float*>(anchors));
} }
return cudaGetLastError(); return cudaGetLastError();

View File

@@ -29,7 +29,6 @@
#include <iostream> #include <iostream>
#include <memory> #include <memory>
uint kNUM_BBOXES;
uint kNUM_CLASSES; uint kNUM_CLASSES;
namespace { namespace {
@@ -49,131 +48,108 @@ namespace {
} }
cudaError_t cudaYoloLayer_r( cudaError_t cudaYoloLayer_r(
const void* input, void* output, const uint& batchSize, const uint& netWidth, const uint& netHeight, const void* input, void* d_indexes, void* d_scores, void* d_boxes, void* d_classes, void* countData,
const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes, const uint& batchSize, uint64_t& inputSize, uint64_t& outputSize, const float& scoreThreshold, const uint& netWidth,
uint64_t& outputSize, const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream); const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes,
const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream);
cudaError_t cudaYoloLayer_nc( cudaError_t cudaYoloLayer_nc(
const void* input, void* output, const uint& batchSize, const uint& netWidth, const uint& netHeight, const void* input, void* d_indexes, void* d_scores, void* d_boxes, void* d_classes, void* countData,
const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes, const uint& batchSize, uint64_t& inputSize, uint64_t& outputSize, const float& scoreThreshold, const uint& netWidth,
uint64_t& outputSize, const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream); const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes,
const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream);
cudaError_t cudaYoloLayer( cudaError_t cudaYoloLayer(
const void* input, void* output, const uint& batchSize, const uint& netWidth, const uint& netHeight, const void* input, void* d_indexes, void* d_scores, void* d_boxes, void* d_classes, void* countData,
const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes, const uint& batchSize, uint64_t& inputSize, uint64_t& outputSize, const float& scoreThreshold, const uint& netWidth,
uint64_t& outputSize, const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream); const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes,
const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream);
cudaError_t cudaRegionLayer( cudaError_t cudaRegionLayer(
const void* input, void* output, void* softmax, const uint& batchSize, const uint& netWidth, const void* input, void* softmax, void* d_indexes, void* d_scores, void* d_boxes, void* d_classes, void* countData,
const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& batchSize, uint64_t& inputSize, uint64_t& outputSize, const float& scoreThreshold, const uint& netWidth,
const uint& numBBoxes, uint64_t& outputSize, const void* anchors, cudaStream_t stream); const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes,
const void* anchors, cudaStream_t stream);
cudaError_t sortDetections(
void* d_indexes, void* d_scores, void* d_boxes, void* d_classes, void* bboxData, void* scoreData, void* countData,
const uint& batchSize, uint64_t& outputSize, uint& topK, const uint& numOutputClasses, cudaStream_t stream);
YoloLayer::YoloLayer (const void* data, size_t length) YoloLayer::YoloLayer (const void* data, size_t length)
{ {
const char *d = static_cast<const char*>(data); const char *d = static_cast<const char*>(data);
read(d, m_NumBBoxes);
read(d, m_NumClasses);
read(d, m_NetWidth); read(d, m_NetWidth);
read(d, m_NetHeight); read(d, m_NetHeight);
read(d, m_GridSizeX); read(d, m_NumClasses);
read(d, m_GridSizeY);
read(d, m_Type);
read(d, m_NewCoords); read(d, m_NewCoords);
read(d, m_ScaleXY);
read(d, m_OutputSize); read(d, m_OutputSize);
read(d, m_Type);
read(d, m_TopK);
read(d, m_ScoreThreshold);
uint yoloTensorsSize;
read(d, yoloTensorsSize);
for (uint i = 0; i < yoloTensorsSize; ++i)
{
TensorInfo curYoloTensor;
read(d, curYoloTensor.gridSizeX);
read(d, curYoloTensor.gridSizeY);
read(d, curYoloTensor.numBBoxes);
read(d, curYoloTensor.scaleXY);
uint anchorsSize; uint anchorsSize;
read(d, anchorsSize); read(d, anchorsSize);
for (uint i = 0; i < anchorsSize; i++) { for (uint j = 0; j < anchorsSize; j++)
{
float result; float result;
read(d, result); read(d, result);
m_Anchors.push_back(result); curYoloTensor.anchors.push_back(result);
} }
uint maskSize; uint maskSize;
read(d, maskSize); read(d, maskSize);
for (uint i = 0; i < maskSize; i++) { for (uint j = 0; j < maskSize; j++)
{
int result; int result;
read(d, result); read(d, result);
m_Mask.push_back(result); curYoloTensor.mask.push_back(result);
}
m_YoloTensors.push_back(curYoloTensor);
} }
if (m_Anchors.size() > 0) {
float* anchors = m_Anchors.data();
CUDA_CHECK(cudaMallocHost(&p_Anchors, m_Anchors.size() * sizeof(float)));
CUDA_CHECK(cudaMemcpy(p_Anchors, anchors, m_Anchors.size() * sizeof(float), cudaMemcpyHostToDevice));
}
if (m_Mask.size() > 0) {
int* mask = m_Mask.data();
CUDA_CHECK(cudaMallocHost(&p_Mask, m_Mask.size() * sizeof(int)));
CUDA_CHECK(cudaMemcpy(p_Mask, mask, m_Mask.size() * sizeof(int), cudaMemcpyHostToDevice));
}
kNUM_BBOXES = m_NumBBoxes;
kNUM_CLASSES = m_NumClasses; kNUM_CLASSES = m_NumClasses;
}; };
YoloLayer::YoloLayer( YoloLayer::YoloLayer(
const uint& numBBoxes, const uint& numClasses, const uint& netWidth, const uint& netHeight, const uint& netWidth, const uint& netHeight, const uint& numClasses, const uint& newCoords,
const uint& gridSizeX, const uint& gridSizeY, const uint& modelType, const uint& newCoords, const std::vector<TensorInfo>& yoloTensors, const uint64_t& outputSize, const uint& modelType, const uint& topK,
const float& scaleXY, const std::vector<float> anchors, const float& scoreThreshold) :
const std::vector<int> mask) :
m_NumBBoxes(numBBoxes),
m_NumClasses(numClasses),
m_NetWidth(netWidth), m_NetWidth(netWidth),
m_NetHeight(netHeight), m_NetHeight(netHeight),
m_GridSizeX(gridSizeX), m_NumClasses(numClasses),
m_GridSizeY(gridSizeY),
m_Type(modelType),
m_NewCoords(newCoords), m_NewCoords(newCoords),
m_ScaleXY(scaleXY), m_YoloTensors(yoloTensors),
m_Anchors(anchors), m_OutputSize(outputSize),
m_Mask(mask) m_Type(modelType),
m_TopK(topK),
m_ScoreThreshold(scoreThreshold)
{ {
assert(m_NumBBoxes > 0);
assert(m_NumClasses > 0);
assert(m_NetWidth > 0); assert(m_NetWidth > 0);
assert(m_NetHeight > 0); assert(m_NetHeight > 0);
assert(m_GridSizeX > 0);
assert(m_GridSizeY > 0);
m_OutputSize = m_GridSizeX * m_GridSizeY * (m_NumBBoxes * (4 + 1 + m_NumClasses));
if (m_Anchors.size() > 0) {
float* anchors = m_Anchors.data();
CUDA_CHECK(cudaMallocHost(&p_Anchors, m_Anchors.size() * sizeof(float)));
CUDA_CHECK(cudaMemcpy(p_Anchors, anchors, m_Anchors.size() * sizeof(float), cudaMemcpyHostToDevice));
}
if (m_Mask.size() > 0) {
int* mask = m_Mask.data();
CUDA_CHECK(cudaMallocHost(&p_Mask, m_Mask.size() * sizeof(int)));
CUDA_CHECK(cudaMemcpy(p_Mask, mask, m_Mask.size() * sizeof(int), cudaMemcpyHostToDevice));
}
kNUM_BBOXES = m_NumBBoxes;
kNUM_CLASSES = m_NumClasses; kNUM_CLASSES = m_NumClasses;
}; };
YoloLayer::~YoloLayer()
{
if (m_Anchors.size() > 0) {
CUDA_CHECK(cudaFreeHost(p_Anchors));
}
if (m_Mask.size() > 0) {
CUDA_CHECK(cudaFreeHost(p_Mask));
}
}
nvinfer1::Dims nvinfer1::Dims
YoloLayer::getOutputDimensions( YoloLayer::getOutputDimensions(
int index, const nvinfer1::Dims* inputs, int nbInputDims) noexcept int index, const nvinfer1::Dims* inputs, int nbInputDims) noexcept
{ {
assert(index == 0); assert(index < 3);
assert(nbInputDims == 1); if (index == 0) {
return inputs[0]; return nvinfer1::Dims3(m_TopK, 1, 4);
}
return nvinfer1::DimsHW(m_TopK, m_NumClasses);
} }
bool YoloLayer::supportsFormat ( bool YoloLayer::supportsFormat (
@@ -188,43 +164,116 @@ YoloLayer::configureWithFormat (
const nvinfer1::Dims* outputDims, int nbOutputs, const nvinfer1::Dims* outputDims, int nbOutputs,
nvinfer1::DataType type, nvinfer1::PluginFormat format, int maxBatchSize) noexcept nvinfer1::DataType type, nvinfer1::PluginFormat format, int maxBatchSize) noexcept
{ {
assert(nbInputs == 1); assert(nbInputs > 0);
assert(format == nvinfer1::PluginFormat::kLINEAR); assert(format == nvinfer1::PluginFormat::kLINEAR);
assert(inputDims != nullptr); assert(inputDims != nullptr);
} }
int32_t YoloLayer::enqueue ( int32_t YoloLayer::enqueue (
int32_t batchSize, void const* const* inputs, void* const* outputs, void* workspace, int batchSize, void const* const* inputs, void* const* outputs, void* workspace,
cudaStream_t stream) noexcept cudaStream_t stream) noexcept
{ {
void* countData = workspace;
void* bboxData = outputs[0];
void* scoreData = outputs[1];
CUDA_CHECK(cudaMemsetAsync((int*)countData, 0, sizeof(int) * batchSize, stream));
CUDA_CHECK(cudaMemsetAsync((float*)bboxData, 0, sizeof(float) * m_TopK * 4 * batchSize, stream));
CUDA_CHECK(cudaMemsetAsync((float*)scoreData, 0, sizeof(float) * m_TopK * m_NumClasses * batchSize, stream));
void* d_indexes;
CUDA_CHECK(cudaMallocHost(&d_indexes, sizeof(int) * m_OutputSize * batchSize));
CUDA_CHECK(cudaMemsetAsync((float*)d_indexes, 0, sizeof(int) * m_OutputSize * batchSize, stream));
void* d_scores;
CUDA_CHECK(cudaMallocHost(&d_scores, sizeof(float) * m_OutputSize * batchSize));
CUDA_CHECK(cudaMemsetAsync((float*)d_scores, 0, sizeof(float) * m_OutputSize * batchSize, stream));
void* d_boxes;
CUDA_CHECK(cudaMallocHost(&d_boxes, sizeof(float) * m_OutputSize * 4 * batchSize));
CUDA_CHECK(cudaMemsetAsync((float*)d_boxes, 0, sizeof(float) * m_OutputSize * 4 * batchSize, stream));
void* d_classes;
CUDA_CHECK(cudaMallocHost(&d_classes, sizeof(int) * m_OutputSize * batchSize));
CUDA_CHECK(cudaMemsetAsync((float*)d_classes, 0, sizeof(int) * m_OutputSize * batchSize, stream));
uint yoloTensorsSize = m_YoloTensors.size();
for (uint i = 0; i < yoloTensorsSize; ++i)
{
TensorInfo& curYoloTensor = m_YoloTensors.at(i);
uint numBBoxes = curYoloTensor.numBBoxes;
float scaleXY = curYoloTensor.scaleXY;
uint gridSizeX = curYoloTensor.gridSizeX;
uint gridSizeY = curYoloTensor.gridSizeY;
std::vector<float> anchors = curYoloTensor.anchors;
std::vector<int> mask = curYoloTensor.mask;
void* v_anchors;
void* v_mask;
if (anchors.size() > 0) {
float* f_anchors = anchors.data();
CUDA_CHECK(cudaMallocHost(&v_anchors, sizeof(float) * anchors.size()));
CUDA_CHECK(cudaMemcpy(v_anchors, f_anchors, sizeof(float) * anchors.size(), cudaMemcpyHostToDevice));
}
if (mask.size() > 0) {
int* f_mask = mask.data();
CUDA_CHECK(cudaMallocHost(&v_mask, sizeof(int) * mask.size()));
CUDA_CHECK(cudaMemcpy(v_mask, f_mask, sizeof(int) * mask.size(), cudaMemcpyHostToDevice));
}
uint64_t inputSize = gridSizeX * gridSizeY * (numBBoxes * (4 + 1 + m_NumClasses));
if (m_Type == 2) { // YOLOR incorrect param: scale_x_y = 2.0 if (m_Type == 2) { // YOLOR incorrect param: scale_x_y = 2.0
CUDA_CHECK(cudaYoloLayer_r( CUDA_CHECK(cudaYoloLayer_r(
inputs[0], outputs[0], batchSize, m_NetWidth, m_NetHeight, m_GridSizeX, m_GridSizeY, inputs[i], d_indexes, d_scores, d_boxes, d_classes, countData, batchSize, inputSize, m_OutputSize,
m_NumClasses, m_NumBBoxes, m_OutputSize, 2.0, p_Anchors, p_Mask, stream)); m_ScoreThreshold, m_NetWidth, m_NetHeight, gridSizeX, gridSizeY, m_NumClasses, numBBoxes, 2.0, v_anchors,
v_mask, stream));
} }
else if (m_Type == 1) { else if (m_Type == 1) {
if (m_NewCoords) { if (m_NewCoords) {
CUDA_CHECK(cudaYoloLayer_nc( CUDA_CHECK(cudaYoloLayer_nc(
inputs[0], outputs[0], batchSize, m_NetWidth, m_NetHeight, m_GridSizeX, m_GridSizeY, inputs[i], d_indexes, d_scores, d_boxes, d_classes, countData, batchSize, inputSize, m_OutputSize,
m_NumClasses, m_NumBBoxes, m_OutputSize, m_ScaleXY, p_Anchors, p_Mask, stream)); m_ScoreThreshold, m_NetWidth, m_NetHeight, gridSizeX, gridSizeY, m_NumClasses, numBBoxes, scaleXY,
v_anchors, v_mask, stream));
} }
else { else {
CUDA_CHECK(cudaYoloLayer( CUDA_CHECK(cudaYoloLayer(
inputs[0], outputs[0], batchSize, m_NetWidth, m_NetHeight, m_GridSizeX, m_GridSizeY, inputs[i], d_indexes, d_scores, d_boxes, d_classes, countData, batchSize, inputSize, m_OutputSize,
m_NumClasses, m_NumBBoxes, m_OutputSize, m_ScaleXY, p_Anchors, p_Mask, stream)); m_ScoreThreshold, m_NetWidth, m_NetHeight, gridSizeX, gridSizeY, m_NumClasses, numBBoxes, scaleXY,
v_anchors, v_mask, stream));
} }
} }
else { else {
void* softmax; void* softmax;
cudaMallocHost(&softmax, sizeof(outputs[0])); CUDA_CHECK(cudaMallocHost(&softmax, sizeof(float) * inputSize * batchSize));
cudaMemcpy(softmax, outputs[0], sizeof(outputs[0]), cudaMemcpyHostToDevice); CUDA_CHECK(cudaMemsetAsync((float*)softmax, 0, sizeof(float) * inputSize * batchSize));
CUDA_CHECK(cudaRegionLayer( CUDA_CHECK(cudaRegionLayer(
inputs[0], outputs[0], softmax, batchSize, m_NetWidth, m_NetHeight, m_GridSizeX, m_GridSizeY, inputs[i], softmax, d_indexes, d_scores, d_boxes, d_classes, countData, batchSize, inputSize, m_OutputSize,
m_NumClasses, m_NumBBoxes, m_OutputSize, p_Anchors, stream)); m_ScoreThreshold, m_NetWidth, m_NetHeight, gridSizeX, gridSizeY, m_NumClasses, numBBoxes, v_anchors,
stream));
CUDA_CHECK(cudaFreeHost(softmax)); CUDA_CHECK(cudaFreeHost(softmax));
} }
if (anchors.size() > 0) {
CUDA_CHECK(cudaFreeHost(v_anchors));
}
if (mask.size() > 0) {
CUDA_CHECK(cudaFreeHost(v_mask));
}
}
CUDA_CHECK(sortDetections(
d_indexes, d_scores, d_boxes, d_classes, bboxData, scoreData, countData, batchSize, m_OutputSize, m_TopK,
m_NumClasses, stream));
CUDA_CHECK(cudaFreeHost(d_indexes));
CUDA_CHECK(cudaFreeHost(d_scores));
CUDA_CHECK(cudaFreeHost(d_boxes));
CUDA_CHECK(cudaFreeHost(d_classes));
return 0; return 0;
} }
@@ -232,18 +281,28 @@ size_t YoloLayer::getSerializationSize() const noexcept
{ {
size_t totalSize = 0; size_t totalSize = 0;
totalSize += sizeof(m_NumBBoxes);
totalSize += sizeof(m_NumClasses);
totalSize += sizeof(m_NetWidth); totalSize += sizeof(m_NetWidth);
totalSize += sizeof(m_NetHeight); totalSize += sizeof(m_NetHeight);
totalSize += sizeof(m_GridSizeX); totalSize += sizeof(m_NumClasses);
totalSize += sizeof(m_GridSizeY);
totalSize += sizeof(m_Type);
totalSize += sizeof(m_NewCoords); totalSize += sizeof(m_NewCoords);
totalSize += sizeof(m_ScaleXY);
totalSize += sizeof(m_OutputSize); totalSize += sizeof(m_OutputSize);
totalSize += sizeof(uint) + sizeof(m_Anchors[0]) * m_Anchors.size(); totalSize += sizeof(m_Type);
totalSize += sizeof(uint) + sizeof(m_Mask[0]) * m_Mask.size(); totalSize += sizeof(m_TopK);
totalSize += sizeof(m_ScoreThreshold);
uint yoloTensorsSize = m_YoloTensors.size();
totalSize += sizeof(yoloTensorsSize);
for (uint i = 0; i < yoloTensorsSize; ++i)
{
const TensorInfo& curYoloTensor = m_YoloTensors.at(i);
totalSize += sizeof(curYoloTensor.gridSizeX);
totalSize += sizeof(curYoloTensor.gridSizeY);
totalSize += sizeof(curYoloTensor.numBBoxes);
totalSize += sizeof(curYoloTensor.scaleXY);
totalSize += sizeof(uint) + sizeof(curYoloTensor.anchors[0]) * curYoloTensor.anchors.size();
totalSize += sizeof(uint) + sizeof(curYoloTensor.mask[0]) * curYoloTensor.mask.size();
}
return totalSize; return totalSize;
} }
@@ -252,35 +311,46 @@ void YoloLayer::serialize(void* buffer) const noexcept
{ {
char *d = static_cast<char*>(buffer); char *d = static_cast<char*>(buffer);
write(d, m_NumBBoxes);
write(d, m_NumClasses);
write(d, m_NetWidth); write(d, m_NetWidth);
write(d, m_NetHeight); write(d, m_NetHeight);
write(d, m_GridSizeX); write(d, m_NumClasses);
write(d, m_GridSizeY);
write(d, m_Type);
write(d, m_NewCoords); write(d, m_NewCoords);
write(d, m_ScaleXY);
write(d, m_OutputSize); write(d, m_OutputSize);
write(d, m_Type);
write(d, m_TopK);
write(d, m_ScoreThreshold);
uint anchorsSize = m_Anchors.size(); uint yoloTensorsSize = m_YoloTensors.size();
write(d, yoloTensorsSize);
for (uint i = 0; i < yoloTensorsSize; ++i)
{
const TensorInfo& curYoloTensor = m_YoloTensors.at(i);
write(d, curYoloTensor.gridSizeX);
write(d, curYoloTensor.gridSizeY);
write(d, curYoloTensor.numBBoxes);
write(d, curYoloTensor.scaleXY);
uint anchorsSize = curYoloTensor.anchors.size();
write(d, anchorsSize); write(d, anchorsSize);
for (uint i = 0; i < anchorsSize; i++) { for (uint j = 0; j < anchorsSize; ++j)
write(d, m_Anchors[i]); {
write(d, curYoloTensor.anchors[j]);
} }
uint maskSize = m_Mask.size(); uint maskSize = curYoloTensor.mask.size();
write(d, maskSize); write(d, maskSize);
for (uint i = 0; i < maskSize; i++) { for (uint j = 0; j < maskSize; ++j)
write(d, m_Mask[i]); {
write(d, curYoloTensor.mask[j]);
}
} }
} }
nvinfer1::IPluginV2* YoloLayer::clone() const noexcept nvinfer1::IPluginV2* YoloLayer::clone() const noexcept
{ {
return new YoloLayer ( return new YoloLayer (
m_NumBBoxes, m_NumClasses, m_NetWidth, m_NetHeight, m_GridSizeX, m_GridSizeY, m_Type, m_NetWidth, m_NetHeight, m_NumClasses, m_NewCoords, m_YoloTensors, m_OutputSize, m_Type, m_TopK,
m_NewCoords, m_ScaleXY, m_Anchors, m_Mask); m_ScoreThreshold);
} }
REGISTER_TENSORRT_PLUGIN(YoloLayerPluginCreator); REGISTER_TENSORRT_PLUGIN(YoloLayerPluginCreator);

View File

@@ -36,12 +36,14 @@
#include "NvInferPlugin.h" #include "NvInferPlugin.h"
#include "yolo.h"
#define CUDA_CHECK(status) \ #define CUDA_CHECK(status) \
{ \ { \
if (status != 0) \ if (status != 0) \
{ \ { \
std::cout << "CUDA failure: " << cudaGetErrorString(status) << " in file " << __FILE__ \ std::cout << "CUDA failure: " << cudaGetErrorString(status) << " in file " << __FILE__ << " at line " \
<< " at line " << __LINE__ << std::endl; \ << __LINE__ << std::endl; \
abort(); \ abort(); \
} \ } \
} }
@@ -56,15 +58,17 @@ class YoloLayer : public nvinfer1::IPluginV2
{ {
public: public:
YoloLayer (const void* data, size_t length); YoloLayer (const void* data, size_t length);
YoloLayer ( YoloLayer (
const uint& numBBoxes, const uint& numClasses, const uint& netWidth, const uint& netHeight, const uint& netWidth, const uint& netHeight, const uint& numClasses, const uint& newCoords,
const uint& gridSizeX, const uint& gridSizeY, const uint& modelType, const uint& newCoords, const std::vector<TensorInfo>& yoloTensors, const uint64_t& outputSize, const uint& modelType, const uint& topK,
const float& scaleXY, const std::vector<float> anchors, const float& scoreThreshold);
const std::vector<int> mask);
~YoloLayer ();
const char* getPluginType () const noexcept override { return YOLOLAYER_PLUGIN_NAME; } const char* getPluginType () const noexcept override { return YOLOLAYER_PLUGIN_NAME; }
const char* getPluginVersion () const noexcept override { return YOLOLAYER_PLUGIN_VERSION; } const char* getPluginVersion () const noexcept override { return YOLOLAYER_PLUGIN_VERSION; }
int getNbOutputs () const noexcept override { return 1; }
int getNbOutputs () const noexcept override { return 2; }
nvinfer1::Dims getOutputDimensions ( nvinfer1::Dims getOutputDimensions (
int index, const nvinfer1::Dims* inputs, int index, const nvinfer1::Dims* inputs,
@@ -74,53 +78,59 @@ public:
nvinfer1::DataType type, nvinfer1::PluginFormat format) const noexcept override; nvinfer1::DataType type, nvinfer1::PluginFormat format) const noexcept override;
void configureWithFormat ( void configureWithFormat (
const nvinfer1::Dims* inputDims, int nbInputs, const nvinfer1::Dims* inputDims, int nbInputs, const nvinfer1::Dims* outputDims, int nbOutputs,
const nvinfer1::Dims* outputDims, int nbOutputs,
nvinfer1::DataType type, nvinfer1::PluginFormat format, int maxBatchSize) noexcept override; nvinfer1::DataType type, nvinfer1::PluginFormat format, int maxBatchSize) noexcept override;
int initialize () noexcept override { return 0; } int initialize () noexcept override { return 0; }
void terminate () noexcept override {} void terminate () noexcept override {}
size_t getWorkspaceSize (int maxBatchSize) const noexcept override { return 0; }
size_t getWorkspaceSize (int maxBatchSize) const noexcept override {
return maxBatchSize * sizeof(int);
}
int32_t enqueue ( int32_t enqueue (
int32_t batchSize, void const* const* inputs, void* const* outputs, int batchSize, void const* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream)
void* workspace, cudaStream_t stream) noexcept override; noexcept override;
size_t getSerializationSize() const noexcept override; size_t getSerializationSize() const noexcept override;
void serialize (void* buffer) const noexcept override; void serialize (void* buffer) const noexcept override;
void destroy () noexcept override { delete this; } void destroy () noexcept override { delete this; }
nvinfer1::IPluginV2* clone() const noexcept override; nvinfer1::IPluginV2* clone() const noexcept override;
void setPluginNamespace (const char* pluginNamespace) noexcept override { void setPluginNamespace (const char* pluginNamespace) noexcept override {
m_Namespace = pluginNamespace; m_Namespace = pluginNamespace;
} }
virtual const char* getPluginNamespace () const noexcept override { virtual const char* getPluginNamespace () const noexcept override {
return m_Namespace.c_str(); return m_Namespace.c_str();
} }
private: private:
std::string m_Namespace {""}; std::string m_Namespace {""};
uint m_NumBBoxes {0};
uint m_NumClasses {0};
uint m_NetWidth {0}; uint m_NetWidth {0};
uint m_NetHeight {0}; uint m_NetHeight {0};
uint m_GridSizeX {0}; uint m_NumClasses {0};
uint m_GridSizeY {0};
uint m_Type {0};
uint m_NewCoords {0}; uint m_NewCoords {0};
float m_ScaleXY {0}; std::vector<TensorInfo> m_YoloTensors;
std::vector<float> m_Anchors;
std::vector<int> m_Mask;
uint64_t m_OutputSize {0}; uint64_t m_OutputSize {0};
void* p_Anchors; uint m_Type {0};
void* p_Mask; uint m_TopK {0};
float m_ScoreThreshold {0};
}; };
class YoloLayerPluginCreator : public nvinfer1::IPluginCreator class YoloLayerPluginCreator : public nvinfer1::IPluginCreator
{ {
public: public:
YoloLayerPluginCreator () {} YoloLayerPluginCreator () {}
~YoloLayerPluginCreator () {} ~YoloLayerPluginCreator () {}
const char* getPluginName () const noexcept override { return YOLOLAYER_PLUGIN_NAME; } const char* getPluginName () const noexcept override { return YOLOLAYER_PLUGIN_NAME; }
const char* getPluginVersion () const noexcept override { return YOLOLAYER_PLUGIN_VERSION; } const char* getPluginVersion () const noexcept override { return YOLOLAYER_PLUGIN_VERSION; }
const nvinfer1::PluginFieldCollection* getFieldNames() noexcept override { const nvinfer1::PluginFieldCollection* getFieldNames() noexcept override {
@@ -153,7 +163,6 @@ private:
std::string m_Namespace {""}; std::string m_Namespace {""};
}; };
extern uint kNUM_BBOXES;
extern uint kNUM_CLASSES; extern uint kNUM_CLASSES;
#endif // __YOLO_PLUGINS__ #endif // __YOLO_PLUGINS__

112
readme.md
View File

@@ -25,7 +25,7 @@ NVIDIA DeepStream SDK 6.1 / 6.0.1 / 6.0 configuration for YOLO models
* YOLOR native support * YOLOR native support
* Models benchmarks (**outdated**) * Models benchmarks (**outdated**)
* **GPU YOLO Decoder (moved from CPU to GPU to get better performance)** [#138](https://github.com/marcoslucianops/DeepStream-Yolo/issues/138) * **GPU YOLO Decoder (moved from CPU to GPU to get better performance)** [#138](https://github.com/marcoslucianops/DeepStream-Yolo/issues/138)
* **Improved NMS** [#142](https://github.com/marcoslucianops/DeepStream-Yolo/issues/142) * **GPU Batched NMS** [#142](https://github.com/marcoslucianops/DeepStream-Yolo/issues/142)
## ##
@@ -38,6 +38,7 @@ NVIDIA DeepStream SDK 6.1 / 6.0.1 / 6.0 configuration for YOLO models
* [Basic usage](#basic-usage) * [Basic usage](#basic-usage)
* [YOLOv5 usage](#yolov5-usage) * [YOLOv5 usage](#yolov5-usage)
* [YOLOR usage](#yolor-usage) * [YOLOR usage](#yolor-usage)
* [NMS configuration](#nms-configuration)
* [INT8 calibration](#int8-calibration) * [INT8 calibration](#int8-calibration)
* [Using your custom model](docs/customModels.md) * [Using your custom model](docs/customModels.md)
@@ -101,67 +102,7 @@ NVIDIA DeepStream SDK 6.1 / 6.0.1 / 6.0 configuration for YOLO models
### Benchmarks ### Benchmarks
``` New tests coming soon.
nms-iou-threshold = 0.6
pre-cluster-threshold = 0.001 (mAP eval) / 0.25 (FPS measurement)
batch-size = 1
valid = val2017 (COCO) - 1000 random images for INT8 calibration
sample = 1920x1080 video
NOTE: Used maintain-aspect-ratio=1 in config_infer file for YOLOv4 (with letter_box=1), YOLOv5 and YOLOR models.
```
#### NVIDIA GTX 1050 4GB (Mobile)
##### YOLOR-CSP performance comparison
| | DeepStream | PyTorch |
|:---------------------:|:----------:|:-------:|
| FPS (without display) | 13.32 | 10.07 |
| FPS (with display) | 12.63 | 9.41 |
##### YOLOv5n performance comparison
| | DeepStream | TensorRTx | Ultralytics |
|:---------------------:|:----------:|:---------:|:-----------:|
| FPS (without display) | 110.25 | 87.42 | 97.19 |
| FPS (with display) | 105.62 | 73.07 | 50.37 |
<details><summary>More</summary>
<br>
| DeepStream | Precision | Resolution | IoU=0.5:0.95 | IoU=0.5 | IoU=0.75 | FPS<br />(without display) |
|:------------------:|:---------:|:----------:|:------------:|:-------:|:--------:|:--------------------------:|
| YOLOR-P6 | FP32 | 1280 | 0.478 | 0.663 | 0.519 | 5.53 |
| YOLOR-CSP-X* | FP32 | 640 | 0.473 | 0.664 | 0.513 | 7.59 |
| YOLOR-CSP-X | FP32 | 640 | 0.470 | 0.661 | 0.507 | 7.52 |
| YOLOR-CSP* | FP32 | 640 | 0.459 | 0.652 | 0.496 | 13.28 |
| YOLOR-CSP | FP32 | 640 | 0.449 | 0.639 | 0.483 | 13.32 |
| YOLOv5x6 6.0 | FP32 | 1280 | 0.504 | 0.681 | 0.547 | 2.22 |
| YOLOv5l6 6.0 | FP32 | 1280 | 0.492 | 0.670 | 0.535 | 4.05 |
| YOLOv5m6 6.0 | FP32 | 1280 | 0.463 | 0.642 | 0.504 | 7.54 |
| YOLOv5s6 6.0 | FP32 | 1280 | 0.394 | 0.572 | 0.424 | 18.64 |
| YOLOv5n6 6.0 | FP32 | 1280 | 0.294 | 0.452 | 0.314 | 26.94 |
| YOLOv5x 6.0 | FP32 | 640 | 0.469 | 0.654 | 0.509 | 8.24 |
| YOLOv5l 6.0 | FP32 | 640 | 0.450 | 0.634 | 0.487 | 14.96 |
| YOLOv5m 6.0 | FP32 | 640 | 0.415 | 0.601 | 0.448 | 28.30 |
| YOLOv5s 6.0 | FP32 | 640 | 0.334 | 0.516 | 0.355 | 63.55 |
| YOLOv5n 6.0 | FP32 | 640 | 0.250 | 0.417 | 0.260 | 110.25 |
| YOLOv4-P6 | FP32 | 1280 | 0.499 | 0.685 | 0.542 | 2.57 |
| YOLOv4-P5 | FP32 | 896 | 0.472 | 0.659 | 0.513 | 5.48 |
| YOLOv4-CSP-X-SWISH | FP32 | 640 | 0.473 | 0.664 | 0.513 | 7.51 |
| YOLOv4-CSP-SWISH | FP32 | 640 | 0.459 | 0.652 | 0.496 | 13.13 |
| YOLOv4x-MISH | FP32 | 640 | 0.459 | 0.650 | 0.495 | 7.53 |
| YOLOv4-CSP | FP32 | 640 | 0.440 | 0.632 | 0.474 | 13.19 |
| YOLOv4 | FP32 | 608 | 0.498 | 0.740 | 0.549 | 12.18 |
| YOLOv4-Tiny | FP32 | 416 | 0.215 | 0.403 | 0.206 | 201.20 |
| YOLOv3-SPP | FP32 | 608 | 0.411 | 0.686 | 0.433 | 12.22 |
| YOLOv3-Tiny-PRN | FP32 | 416 | 0.167 | 0.382 | 0.125 | 277.14 |
| YOLOv3 | FP32 | 608 | 0.377 | 0.672 | 0.385 | 12.51 |
| YOLOv3-Tiny | FP32 | 416 | 0.095 | 0.203 | 0.079 | 218.42 |
| YOLOv2 | FP32 | 608 | 0.286 | 0.541 | 0.273 | 25.28 |
| YOLOv2-Tiny | FP32 | 416 | 0.102 | 0.258 | 0.061 | 231.36 |
</details>
## ##
@@ -221,9 +162,11 @@ wget https://us.download.nvidia.com/tesla/510.47.03/NVIDIA-Linux-x86_64-510.47.0
* Run * Run
``` ```
sudo sh NVIDIA-Linux-x86_64-510.47.03.run --silent --disable-nouveau sudo sh NVIDIA-Linux-x86_64-510.47.03.run --silent --disable-nouveau --dkms --install-libglvnd
``` ```
**NOTE**: This step will disable the nouveau drivers.
* Reboot * Reboot
``` ```
@@ -233,7 +176,7 @@ sudo reboot
* Install * Install
``` ```
sudo sh NVIDIA-Linux-x86_64-510.47.03.run --silent --dkms --install-libglvnd sudo sh NVIDIA-Linux-x86_64-510.47.03.run --silent --disable-nouveau --dkms --install-libglvnd
``` ```
**NOTE**: If you are using a laptop with NVIDIA Optimus, run **NOTE**: If you are using a laptop with NVIDIA Optimus, run
@@ -326,7 +269,7 @@ sudo apt install libssl1.0.0 libgstreamer1.0-0 gstreamer1.0-tools gstreamer1.0-p
sudo apt-get install linux-headers-$(uname -r) sudo apt-get install linux-headers-$(uname -r)
``` ```
**NOTE**: Install DKMS only if you are using the default Ubuntu kernel **NOTE**: Install DKMS only if you are using the default Ubuntu kernel.
``` ```
sudo apt-get install dkms sudo apt-get install dkms
@@ -369,9 +312,11 @@ wget https://us.download.nvidia.com/tesla/470.129.06/NVIDIA-Linux-x86_64-470.129
* Run * Run
``` ```
sudo sh NVIDIA-Linux-x86_64-470.129.06.run --silent --disable-nouveau sudo sh NVIDIA-Linux-x86_64-470.129.06.run --silent --disable-nouveau --dkms --install-libglvnd
``` ```
**NOTE**: This step will disable the nouveau drivers.
* Reboot * Reboot
``` ```
@@ -381,7 +326,7 @@ sudo reboot
* Install * Install
``` ```
sudo sh NVIDIA-Linux-x86_64-470.129.06.run --silent --dkms --install-libglvnd sudo sh NVIDIA-Linux-x86_64-470.129.06.run --silent --disable-nouveau --dkms --install-libglvnd
``` ```
**NOTE**: If you are using a laptop with NVIDIA Optimus, run **NOTE**: If you are using a laptop with NVIDIA Optimus, run
@@ -519,11 +464,6 @@ network-mode=0
# Number of classes in label file # Number of classes in label file
num-detected-classes=80 num-detected-classes=80
... ...
[class-attrs-all]
# IOU threshold
nms-iou-threshold=0.45
# Score threshold
pre-cluster-threshold=0.25
``` ```
#### 5. Run #### 5. Run
@@ -542,6 +482,7 @@ gpu-id=0
gie-unique-id=1 gie-unique-id=1
nvbuf-memory-type=0 nvbuf-memory-type=0
config-file=config_infer_primary_yoloV2.txt config-file=config_infer_primary_yoloV2.txt
...
``` ```
## ##
@@ -618,11 +559,6 @@ network-mode=0
# Number of classes in label file # Number of classes in label file
num-detected-classes=80 num-detected-classes=80
... ...
[class-attrs-all]
# IOU threshold
nms-iou-threshold=0.45
# Score threshold
pre-cluster-threshold=0.25
``` ```
#### 8. Change the deepstream_app_config.txt file #### 8. Change the deepstream_app_config.txt file
@@ -749,11 +685,6 @@ network-mode=0
# Number of classes in label file # Number of classes in label file
num-detected-classes=80 num-detected-classes=80
... ...
[class-attrs-all]
# IOU threshold
nms-iou-threshold=0.5
# Score threshold
pre-cluster-threshold=0.25
``` ```
#### 8. Change the deepstream_app_config.txt file #### 8. Change the deepstream_app_config.txt file
@@ -776,6 +707,23 @@ deepstream-app -c deepstream_app_config.txt
## ##
### NMS Configuration
To change the `iou-threshold`, `score-threshold` and `topk` values, modify the `config_nms.txt` file and regenerate the model engine file.
**NOTE**: Lower `topk` values will result in better performance.
**NOTE**: Make sure to set `cluster-mode=4` and `pre-cluster-threshold=0` in the config_infer file.
```
[property]
iou-threshold=0.45
score-threshold=0.25
topk=300
```
##
### INT8 calibration ### INT8 calibration
#### 1. Install OpenCV #### 1. Install OpenCV

View File

@@ -144,6 +144,7 @@ with open(cfg_file, "w") as c:
c.write("width=%d\n" % model_width) c.write("width=%d\n" % model_width)
c.write("height=%d\n" % model_height) c.write("height=%d\n" % model_height)
c.write("channels=%d\n" % model_channels) c.write("channels=%d\n" % model_channels)
c.write("letter_box=1\n")
nc = 0 nc = 0
depth_multiple = 0 depth_multiple = 0
width_multiple = 0 width_multiple = 0