GPU Batched NMS

This commit is contained in:
Marcos Luciano
2022-06-19 03:25:50 -03:00
parent 2300e3b44b
commit f621c0f429
24 changed files with 835 additions and 654 deletions

View File

@@ -14,12 +14,11 @@ interval=0
gie-unique-id=1
process-mode=1
network-type=0
cluster-mode=2
cluster-mode=4
maintain-aspect-ratio=0
parse-bbox-func-name=NvDsInferParseYolo
custom-lib-path=nvdsinfer_custom_impl_Yolo/libnvdsinfer_custom_impl_Yolo.so
engine-create-func-name=NvDsInferYoloCudaEngineGet
[class-attrs-all]
nms-iou-threshold=0.45
pre-cluster-threshold=0.25
pre-cluster-threshold=0

View File

@@ -14,12 +14,11 @@ interval=0
gie-unique-id=1
process-mode=1
network-type=0
cluster-mode=2
cluster-mode=4
maintain-aspect-ratio=0
parse-bbox-func-name=NvDsInferParseYolo
custom-lib-path=nvdsinfer_custom_impl_Yolo/libnvdsinfer_custom_impl_Yolo.so
engine-create-func-name=NvDsInferYoloCudaEngineGet
[class-attrs-all]
nms-iou-threshold=0.45
pre-cluster-threshold=0.25
pre-cluster-threshold=0

View File

@@ -14,12 +14,11 @@ interval=0
gie-unique-id=1
process-mode=1
network-type=0
cluster-mode=2
cluster-mode=4
maintain-aspect-ratio=1
parse-bbox-func-name=NvDsInferParseYolo
custom-lib-path=nvdsinfer_custom_impl_Yolo/libnvdsinfer_custom_impl_Yolo.so
engine-create-func-name=NvDsInferYoloCudaEngineGet
[class-attrs-all]
nms-iou-threshold=0.45
pre-cluster-threshold=0.25
pre-cluster-threshold=0

View File

@@ -14,12 +14,11 @@ interval=0
gie-unique-id=1
process-mode=1
network-type=0
cluster-mode=2
cluster-mode=4
maintain-aspect-ratio=1
parse-bbox-func-name=NvDsInferParseYolo
custom-lib-path=nvdsinfer_custom_impl_Yolo/libnvdsinfer_custom_impl_Yolo.so
engine-create-func-name=NvDsInferYoloCudaEngineGet
[class-attrs-all]
nms-iou-threshold=0.5
pre-cluster-threshold=0.25
pre-cluster-threshold=0

4
config_nms.txt Normal file
View File

@@ -0,0 +1,4 @@
[property]
iou-threshold=0.45
score-threshold=0.25
topk=300

View File

@@ -272,24 +272,6 @@ interval=0
##
#### nms-iou-threshold
```
# IOU threshold
nms-iou-threshold=0.6
```
##
#### pre-cluster-threshold
```
# Socre threshold
pre-cluster-threshold=0.25
```
##
### Testing model
```

View File

@@ -55,7 +55,6 @@ SRCFILES:= nvdsinfer_yolo_engine.cpp \
layers/convolutional_layer.cpp \
layers/implicit_layer.cpp \
layers/channels_layer.cpp \
layers/dropout_layer.cpp \
layers/shortcut_layer.cpp \
layers/route_layer.cpp \
layers/upsample_layer.cpp \
@@ -67,7 +66,8 @@ SRCFILES:= nvdsinfer_yolo_engine.cpp \
yoloForward.cu \
yoloForward_v2.cu \
yoloForward_nc.cu \
yoloForward_r.cu
yoloForward_r.cu \
sortDetections.cu
ifeq ($(OPENCV), 1)
SRCFILES+= calibrator.cpp

View File

@@ -1,15 +0,0 @@
/*
* Created by Marcos Luciano
* https://www.github.com/marcoslucianops
*/
#include "dropout_layer.h"
nvinfer1::ILayer* dropoutLayer(
float probability,
nvinfer1::ITensor* input,
nvinfer1::INetworkDefinition* network)
{
nvinfer1::ILayer* output;
return output;
}

View File

@@ -1,16 +0,0 @@
/*
* Created by Marcos Luciano
* https://www.github.com/marcoslucianops
*/
#ifndef __DROPOUT_LAYER_H__
#define __DROPOUT_LAYER_H__
#include "NvInfer.h"
nvinfer1::ILayer* dropoutLayer(
float probability,
nvinfer1::ITensor* input,
nvinfer1::INetworkDefinition* network);
#endif

View File

@@ -63,15 +63,13 @@ static bool getYoloNetworkInfo (NetworkInfo &networkInfo, const NvDsInferContext
if (networkInfo.configFilePath.empty() ||
networkInfo.wtsFilePath.empty()) {
std::cerr << "YOLO config file or weights file is not specified"
<< std::endl;
std::cerr << "YOLO config file or weights file is not specified\n" << std::endl;
return false;
}
if (!fileExists(networkInfo.configFilePath) ||
!fileExists(networkInfo.wtsFilePath)) {
std::cerr << "YOLO config file or weights file is not exist"
<< std::endl;
std::cerr << "YOLO config file or weights file is not exist\n" << std::endl;
return false;
}

View File

@@ -38,15 +38,15 @@ extern "C" bool NvDsInferParseYolo(
std::vector<NvDsInferParseObjectInfo>& objectList);
static NvDsInferParseObjectInfo convertBBox(
const float& bx, const float& by, const float& bw,
const float& bh, const uint& netW, const uint& netH)
const float& bx1, const float& by1, const float& bx2,
const float& by2, const uint& netW, const uint& netH)
{
NvDsInferParseObjectInfo b;
float x1 = bx - bw / 2;
float y1 = by - bh / 2;
float x2 = x1 + bw;
float y2 = y1 + bh;
float x1 = bx1;
float y1 = by1;
float x2 = bx2;
float y2 = by2;
x1 = clamp(x1, 0, netW);
y1 = clamp(y1, 0, netH);
@@ -62,11 +62,11 @@ static NvDsInferParseObjectInfo convertBBox(
}
static void addBBoxProposal(
const float bx, const float by, const float bw, const float bh,
const float bx1, const float by1, const float bx2, const float by2,
const uint& netW, const uint& netH, const int maxIndex,
const float maxProb, std::vector<NvDsInferParseObjectInfo>& binfo)
{
NvDsInferParseObjectInfo bbi = convertBBox(bx, by, bw, bh, netW, netH);
NvDsInferParseObjectInfo bbi = convertBBox(bx1, by1, bx2, by2, netW, netH);
if (bbi.width < 1 || bbi.height < 1) return;
bbi.detectionConfidence = maxProb;
@@ -75,34 +75,25 @@ static void addBBoxProposal(
}
static std::vector<NvDsInferParseObjectInfo> decodeYoloTensor(
const float* detections,
const uint gridSizeW, const uint gridSizeH, const uint numBBoxes,
const uint numOutputClasses, const uint& netW, const uint& netH)
const int* counts, const float* boxes,
const float* scores, const float* classes,
const uint& netW, const uint& netH)
{
std::vector<NvDsInferParseObjectInfo> binfo;
for (uint y = 0; y < gridSizeH; ++y) {
for (uint x = 0; x < gridSizeW; ++x) {
for (uint b = 0; b < numBBoxes; ++b)
{
const int numGridCells = gridSizeH * gridSizeW;
const int bbindex = y * gridSizeW + x;
const float bx
= detections[bbindex + numGridCells * (b * (5 + numOutputClasses) + 0)];
const float by
= detections[bbindex + numGridCells * (b * (5 + numOutputClasses) + 1)];
const float bw
= detections[bbindex + numGridCells * (b * (5 + numOutputClasses) + 2)];
const float bh
= detections[bbindex + numGridCells * (b * (5 + numOutputClasses) + 3)];
const float maxProb
= detections[bbindex + numGridCells * (b * (5 + numOutputClasses) + 4)];
const int maxIndex
= detections[bbindex + numGridCells * (b * (5 + numOutputClasses) + 5)];
uint numBoxes = counts[0];
addBBoxProposal(bx, by, bw, bh, netW, netH, maxIndex, maxProb, binfo);
}
}
for (uint b = 0; b < numBoxes; ++b)
{
float bx1 = boxes[b * 4 + 0];
float by1 = boxes[b * 4 + 1];
float bx2 = boxes[b * 4 + 2];
float by2 = boxes[b * 4 + 3];
float maxProb = scores[b];
int maxIndex = classes[b];
addBBoxProposal(bx1, by1, bx2, by2, netW, netH, maxIndex, maxProb, binfo);
}
return binfo;
}
@@ -112,7 +103,6 @@ static bool NvDsInferParseCustomYolo(
NvDsInferNetworkInfo const& networkInfo,
NvDsInferParseDetectionParams const& detectionParams,
std::vector<NvDsInferParseObjectInfo>& objectList,
const uint &numBBoxes,
const uint &numClasses)
{
if (outputLayersInfo.empty())
@@ -130,18 +120,17 @@ static bool NvDsInferParseCustomYolo(
std::vector<NvDsInferParseObjectInfo> objects;
for (uint idx = 0; idx < outputLayersInfo.size(); ++idx)
for (uint idx = 0; idx < outputLayersInfo.size() / 4; ++idx)
{
const NvDsInferLayerInfo &layer = outputLayersInfo[idx];
assert(layer.inferDims.numDims == 3);
const uint gridSizeH = layer.inferDims.d[1];
const uint gridSizeW = layer.inferDims.d[2];
const NvDsInferLayerInfo &counts = outputLayersInfo[idx * 4 + 0];
const NvDsInferLayerInfo &boxes = outputLayersInfo[idx * 4 + 1];
const NvDsInferLayerInfo &scores = outputLayersInfo[idx * 4 + 2];
const NvDsInferLayerInfo &classes = outputLayersInfo[idx * 4 + 3];
std::vector<NvDsInferParseObjectInfo> outObjs =
decodeYoloTensor(
(const float*)(layer.buffer),
gridSizeW, gridSizeH, numBBoxes, numClasses,
(const int*)(counts.buffer), (const float*)(boxes.buffer),
(const float*)(scores.buffer), (const float*)(classes.buffer),
networkInfo.width, networkInfo.height);
objects.insert(objects.end(), outObjs.begin(), outObjs.end());
@@ -158,11 +147,10 @@ extern "C" bool NvDsInferParseYolo(
NvDsInferParseDetectionParams const& detectionParams,
std::vector<NvDsInferParseObjectInfo>& objectList)
{
uint numBBoxes = kNUM_BBOXES;
uint numClasses = kNUM_CLASSES;
int num_classes = kNUM_CLASSES;
return NvDsInferParseCustomYolo (
outputLayersInfo, networkInfo, detectionParams, objectList, numBBoxes, numClasses);
outputLayersInfo, networkInfo, detectionParams, objectList, num_classes);
}
CHECK_CUSTOM_PARSE_FUNC_PROTOTYPE(NvDsInferParseYolo);

View File

@@ -0,0 +1,84 @@
/*
* Created by Marcos Luciano
* https://www.github.com/marcoslucianops
*/
#include <cub/cub.cuh>
__global__ void sortOutput(
int* d_indexes, float* d_scores, float* d_boxes, int* d_classes, float* bboxData, float* scoreData,
const uint numOutputClasses)
{
uint x_id = blockIdx.x * blockDim.x + threadIdx.x;
int index = d_indexes[x_id];
int maxIndex = d_classes[index];
bboxData[x_id * 4 + 0] = d_boxes[index * 4 + 0];
bboxData[x_id * 4 + 1] = d_boxes[index * 4 + 1];
bboxData[x_id * 4 + 2] = d_boxes[index * 4 + 2];
bboxData[x_id * 4 + 3] = d_boxes[index * 4 + 3];
scoreData[x_id * numOutputClasses + maxIndex] = d_scores[x_id] - 1.f;
}
cudaError_t sortDetections(
void* d_indexes, void* d_scores, void* d_boxes, void* d_classes, void* bboxData, void* scoreData, void* countData,
const uint& batchSize, uint64_t& outputSize, uint& topK, const uint& numOutputClasses, cudaStream_t stream);
cudaError_t sortDetections(
void* d_indexes, void* d_scores, void* d_boxes, void* d_classes, void* bboxData, void* scoreData, void* countData,
const uint& batchSize, uint64_t& outputSize, uint& topK, const uint& numOutputClasses, cudaStream_t stream)
{
for (unsigned int batch = 0; batch < batchSize; ++batch)
{
int* _d_indexes = reinterpret_cast<int*>(d_indexes) + (batch * outputSize);
float* _d_scores = reinterpret_cast<float*>(d_scores) + (batch * outputSize);
int* _countData = reinterpret_cast<int*>(countData) + (batch);
int* _count = (int*)malloc(sizeof(int));
cudaMemcpy(_count, (int*)&_countData[0], sizeof(int), cudaMemcpyDeviceToHost);
int count = _count[0];
if (count == 0)
{
free(_count);
return cudaGetLastError();
}
size_t begin_bit = 0;
size_t end_bit = sizeof(float) * 8;
float *d_keys_out = NULL;
int *d_values_out = NULL;
cudaMalloc((void **)&d_keys_out, count * sizeof(float));
cudaMalloc((void **)&d_values_out, count * sizeof(int));
void* d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, _d_scores, d_keys_out, _d_indexes,
d_values_out, count, begin_bit, end_bit);
cudaMalloc(&d_temp_storage, temp_storage_bytes);
cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, _d_scores, d_keys_out, _d_indexes,
d_values_out, count, begin_bit, end_bit);
cudaMemcpy(_d_scores, d_keys_out, count * sizeof(float), cudaMemcpyDeviceToDevice);
cudaMemcpy(_d_indexes, d_values_out, count * sizeof(int), cudaMemcpyDeviceToDevice);
int threads_per_block = count < topK ? count : topK;
sortOutput<<<1, threads_per_block, 0, stream>>>(
_d_indexes, _d_scores, reinterpret_cast<float*>(d_boxes) + (batch * 4 * outputSize),
reinterpret_cast<int*>(d_classes) + (batch * outputSize), reinterpret_cast<float*>(bboxData) + (batch * topK),
reinterpret_cast<float*>(scoreData) + (batch * topK), numOutputClasses);
cudaFree(d_keys_out);
cudaFree(d_values_out);
cudaFree(d_temp_storage);
free(_count);
}
return cudaGetLastError();
}

View File

@@ -57,7 +57,7 @@ bool fileExists(const std::string fileName, bool verbose)
{
if (!std::experimental::filesystem::exists(std::experimental::filesystem::path(fileName)))
{
if (verbose) std::cout << "File does not exist: " << fileName << std::endl;
if (verbose) std::cout << "\nFile does not exist: " << fileName << std::endl;
return false;
}
return true;
@@ -101,7 +101,7 @@ std::vector<float> loadWeights(const std::string weightsFilePath, const std::str
assert(file.good());
int32_t count;
file >> count;
assert(count > 0 && "Invalid .wts file.");
assert(count > 0 && "\nInvalid .wts file.");
uint32_t floatWeight;
std::string name;
@@ -118,7 +118,7 @@ std::vector<float> loadWeights(const std::string weightsFilePath, const std::str
}
else {
std::cerr << "File " << weightsFilePath << " is not supported" << std::endl;
std::cerr << "\nFile " << weightsFilePath << " is not supported" << std::endl;
std::abort();
}
@@ -149,11 +149,19 @@ int getNumChannels(nvinfer1::ITensor* t)
return d.d[0];
}
void printLayerInfo(std::string layerIndex, std::string layerName, std::string layerInput,
std::string layerOutput, std::string weightPtr)
void printLayerInfo(
std::string layerIndex, std::string layerName, std::string layerInput, std::string layerOutput, std::string weightPtr)
{
std::cout << std::setw(6) << std::left << layerIndex << std::setw(24) << std::left << layerName;
std::cout << std::setw(20) << std::left << layerInput << std::setw(20) << std::left
<< layerOutput;
std::cout << std::setw(7) << std::left << weightPtr << std::endl;
}
std::string getAbsPath(std::string path)
{
std::size_t found = path.rfind("/");
if (found != std::string::npos)
path.erase(path.begin() + found, path.end());
return path;
}

View File

@@ -41,8 +41,8 @@ bool fileExists(const std::string fileName, bool verbose = true);
std::vector<float> loadWeights(const std::string weightsFilePath, const std::string& networkType);
std::string dimsToString(const nvinfer1::Dims d);
int getNumChannels(nvinfer1::ITensor* t);
void printLayerInfo(std::string layerIndex, std::string layerName, std::string layerInput,
std::string layerOutput, std::string weightPtr);
void printLayerInfo(
std::string layerIndex, std::string layerName, std::string layerInput, std::string layerOutput, std::string weightPtr);
std::string getAbsPath(std::string path);
#endif

View File

@@ -46,7 +46,12 @@ Yolo::Yolo(const NetworkInfo& networkInfo)
m_InputC(0),
m_InputSize(0),
m_NumClasses(0),
m_LetterBox(0)
m_LetterBox(0),
m_NewCoords(0),
m_YoloCount(0),
m_IouThreshold(0),
m_ScoreThreshold(0),
m_TopK(0)
{}
Yolo::~Yolo()
@@ -61,59 +66,75 @@ nvinfer1::ICudaEngine *Yolo::createEngine (nvinfer1::IBuilder* builder, nvinfer1
m_ConfigBlocks = parseConfigFile(m_ConfigFilePath);
parseConfigBlocks();
std::string configNMS = getAbsPath(m_WtsFilePath) + "/config_nms.txt";
if (!fileExists(configNMS))
{
std::cerr << "YOLO config_nms.txt file is not specified\n" << std::endl;
assert(0);
}
m_ConfigNMSBlocks = parseConfigFile(configNMS);
parseConfigNMSBlocks();
nvinfer1::INetworkDefinition *network = builder->createNetworkV2(0);
if (parseModel(*network) != NVDSINFER_SUCCESS) {
if (parseModel(*network) != NVDSINFER_SUCCESS)
{
delete network;
return nullptr;
}
std::cout << "Building the TensorRT Engine" << std::endl;
std::cout << "Building the TensorRT Engine\n" << std::endl;
if (m_NumClasses != m_NumDetectedClasses) {
std::cout << "\nNOTE: Number of classes mismatch, make sure to set num-detected-classes=" << m_NumClasses << " in config_infer file" << std::endl;
if (m_NumClasses != m_NumDetectedClasses)
{
std::cout << "NOTE: Number of classes mismatch, make sure to set num-detected-classes=" << m_NumClasses
<< " in config_infer file\n" << std::endl;
}
if (m_LetterBox == 1) {
std::cout << "\nNOTE: letter_box is set in cfg file, make sure to set maintain-aspect-ratio=1 in config_infer file to get better accuracy" << std::endl;
if (m_LetterBox == 1)
{
std::cout << "NOTE: letter_box is set in cfg file, make sure to set maintain-aspect-ratio=1 in config_infer file"
<< " to get better accuracy\n" << std::endl;
}
if (m_ClusterMode != 2) {
std::cout << "\nNOTE: Wrong cluster-mode is set, make sure to set cluster-mode=2 in config_infer file" << std::endl;
if (m_ClusterMode != 4)
{
std::cout << "NOTE: Wrong cluster-mode is set, make sure to set cluster-mode=4 in config_infer file\n"
<< std::endl;
}
std::cout << "" << std::endl;
if (m_NetworkMode == "INT8" && !fileExists(m_Int8CalibPath)) {
if (m_NetworkMode == "INT8" && !fileExists(m_Int8CalibPath))
{
assert(builder->platformHasFastInt8());
#ifdef OPENCV
std::string calib_image_list;
int calib_batch_size;
if (getenv("INT8_CALIB_IMG_PATH")) {
if (getenv("INT8_CALIB_IMG_PATH"))
calib_image_list = getenv("INT8_CALIB_IMG_PATH");
}
else {
else
{
std::cerr << "INT8_CALIB_IMG_PATH not set" << std::endl;
std::abort();
}
if (getenv("INT8_CALIB_BATCH_SIZE")) {
if (getenv("INT8_CALIB_BATCH_SIZE"))
calib_batch_size = std::stoi(getenv("INT8_CALIB_BATCH_SIZE"));
}
else {
else
{
std::cerr << "INT8_CALIB_BATCH_SIZE not set" << std::endl;
std::abort();
}
nvinfer1::Int8EntropyCalibrator2 *calibrator = new nvinfer1::Int8EntropyCalibrator2(calib_batch_size, m_InputC, m_InputH, m_InputW, m_LetterBox, calib_image_list, m_Int8CalibPath);
nvinfer1::Int8EntropyCalibrator2 *calibrator = new nvinfer1::Int8EntropyCalibrator2(
calib_batch_size, m_InputC, m_InputH, m_InputW, m_LetterBox, calib_image_list, m_Int8CalibPath);
config->setFlag(nvinfer1::BuilderFlag::kINT8);
config->setInt8Calibrator(calibrator);
#else
std::cerr << "OpenCV is required to run INT8 calibrator" << std::endl;
std::abort();
std::cerr << "OpenCV is required to run INT8 calibrator\n" << std::endl;
assert(0);
#endif
}
nvinfer1::ICudaEngine *engine = builder->buildEngineWithConfig(*network, *config);
if (engine) {
if (engine)
std::cout << "Building complete\n" << std::endl;
} else {
else
std::cerr << "Building engine failed\n" << std::endl;
}
delete network;
return engine;
@@ -126,28 +147,30 @@ NvDsInferStatus Yolo::parseModel(nvinfer1::INetworkDefinition& network) {
std::cout << "Building YOLO network\n" << std::endl;
NvDsInferStatus status = buildYoloNetwork(weights, network);
if (status == NVDSINFER_SUCCESS) {
if (status == NVDSINFER_SUCCESS)
std::cout << "Building YOLO network complete" << std::endl;
} else {
else
std::cerr << "Building YOLO network failed" << std::endl;
}
return status;
}
NvDsInferStatus Yolo::buildYoloNetwork(
std::vector<float>& weights, nvinfer1::INetworkDefinition& network) {
NvDsInferStatus Yolo::buildYoloNetwork(std::vector<float>& weights, nvinfer1::INetworkDefinition& network)
{
int weightPtr = 0;
int channels = m_InputC;
std::string weightsType;
if (m_WtsFilePath.find(".weights") != std::string::npos) {
if (m_WtsFilePath.find(".weights") != std::string::npos)
weightsType = "weights";
}
else {
else
weightsType = "wts";
}
float eps = 1.0e-5;
if (m_NetworkType.find("yolov5") != std::string::npos)
eps = 1.0e-3;
else if (m_NetworkType.find("yolor") != std::string::npos)
eps = 1.0e-4;
nvinfer1::ITensor* data =
network.addInput(m_InputBlobName.c_str(), nvinfer1::DataType::kFLOAT,
@@ -157,26 +180,24 @@ NvDsInferStatus Yolo::buildYoloNetwork(
nvinfer1::ITensor* previous = data;
std::vector<nvinfer1::ITensor*> tensorOutputs;
uint outputTensorCount = 0;
std::vector<nvinfer1::ITensor*> yoloInputs;
uint inputYoloCount = 0;
for (uint i = 0; i < m_ConfigBlocks.size(); ++i) {
int modelType = -1;
for (uint i = 0; i < m_ConfigBlocks.size(); ++i)
{
assert(getNumChannels(previous) == channels);
std::string layerIndex = "(" + std::to_string(tensorOutputs.size()) + ")";
if (m_ConfigBlocks.at(i).at("type") == "net") {
if (m_ConfigBlocks.at(i).at("type") == "net")
printLayerInfo("", "layer", " input", " output", "weightPtr");
}
else if (m_ConfigBlocks.at(i).at("type") == "convolutional") {
float eps = 1.0e-5;
if (m_NetworkType.find("yolov5") != std::string::npos) {
eps = 1.0e-3;
}
else if (m_NetworkType.find("yolor") != std::string::npos) {
eps = 1.0e-4;
}
else if (m_ConfigBlocks.at(i).at("type") == "convolutional")
{
std::string inputVol = dimsToString(previous->getDimensions());
nvinfer1::ILayer* out = convolutionalLayer(i, m_ConfigBlocks.at(i), weights, m_TrtWeights, weightPtr, weightsType, channels, eps, previous, &network);
nvinfer1::ILayer* out = convolutionalLayer(
i, m_ConfigBlocks.at(i), weights, m_TrtWeights, weightPtr, weightsType, channels, eps, previous, &network);
previous = out->getOutput(0);
assert(previous != nullptr);
channels = getNumChannels(previous);
@@ -186,14 +207,13 @@ NvDsInferStatus Yolo::buildYoloNetwork(
printLayerInfo(layerIndex, layerType, inputVol, outputVol, std::to_string(weightPtr));
}
else if (m_ConfigBlocks.at(i).at("type") == "implicit_add" || m_ConfigBlocks.at(i).at("type") == "implicit_mul") {
else if (m_ConfigBlocks.at(i).at("type") == "implicit_add" || m_ConfigBlocks.at(i).at("type") == "implicit_mul")
{
std::string type;
if (m_ConfigBlocks.at(i).at("type") == "implicit_add") {
if (m_ConfigBlocks.at(i).at("type") == "implicit_add")
type = "add";
}
else if (m_ConfigBlocks.at(i).at("type") == "implicit_mul") {
else if (m_ConfigBlocks.at(i).at("type") == "implicit_mul")
type = "mul";
}
assert(m_ConfigBlocks.at(i).find("filters") != m_ConfigBlocks.at(i).end());
int filters = std::stoi(m_ConfigBlocks.at(i).at("filters"));
nvinfer1::ILayer* out = implicitLayer(filters, weights, m_TrtWeights, weightPtr, &network);
@@ -206,19 +226,17 @@ NvDsInferStatus Yolo::buildYoloNetwork(
printLayerInfo(layerIndex, layerType, " -", outputVol, std::to_string(weightPtr));
}
else if (m_ConfigBlocks.at(i).at("type") == "shift_channels" || m_ConfigBlocks.at(i).at("type") == "control_channels") {
else if (m_ConfigBlocks.at(i).at("type") == "shift_channels" || m_ConfigBlocks.at(i).at("type") == "control_channels")
{
std::string type;
if (m_ConfigBlocks.at(i).at("type") == "shift_channels") {
if (m_ConfigBlocks.at(i).at("type") == "shift_channels")
type = "shift";
}
else if (m_ConfigBlocks.at(i).at("type") == "control_channels") {
else if (m_ConfigBlocks.at(i).at("type") == "control_channels")
type = "control";
}
assert(m_ConfigBlocks.at(i).find("from") != m_ConfigBlocks.at(i).end());
int from = stoi(m_ConfigBlocks.at(i).at("from"));
if (from > 0) {
if (from > 0)
from = from - i + 1;
}
assert((i - 2 >= 0) && (i - 2 < tensorOutputs.size()));
assert((i + from - 1 >= 0) && (i + from - 1 < tensorOutputs.size()));
assert(i + from - 1 < i - 2);
@@ -231,25 +249,22 @@ NvDsInferStatus Yolo::buildYoloNetwork(
printLayerInfo(layerIndex, layerType, " -", outputVol, " -");
}
else if (m_ConfigBlocks.at(i).at("type") == "dropout") {
else if (m_ConfigBlocks.at(i).at("type") == "dropout")
{
// Skip dropout layer
assert(m_ConfigBlocks.at(i).find("probability") != m_ConfigBlocks.at(i).end());
/*float probability = std::stof(m_ConfigBlocks.at(i).at("probability"));
nvinfer1::ILayer* out = dropoutLayer(probability, previous, &network);
previous = out->getOutput(0);*/
assert(previous != nullptr);
tensorOutputs.push_back(previous);
printLayerInfo(layerIndex, "dropout", " -", " -", " -");
}
else if (m_ConfigBlocks.at(i).at("type") == "shortcut") {
else if (m_ConfigBlocks.at(i).at("type") == "shortcut")
{
assert(m_ConfigBlocks.at(i).find("activation") != m_ConfigBlocks.at(i).end());
assert(m_ConfigBlocks.at(i).find("from") != m_ConfigBlocks.at(i).end());
std::string activation = m_ConfigBlocks.at(i).at("activation");
int from = stoi(m_ConfigBlocks.at(i).at("from"));
if (from > 0) {
if (from > 0)
from = from - i + 1;
}
assert((i - 2 >= 0) && (i - 2 < tensorOutputs.size()));
assert((i + from - 1 >= 0) && (i + from - 1 < tensorOutputs.size()));
assert(i + from - 1 < i - 2);
@@ -267,7 +282,8 @@ NvDsInferStatus Yolo::buildYoloNetwork(
}
}
else if (m_ConfigBlocks.at(i).at("type") == "route") {
else if (m_ConfigBlocks.at(i).at("type") == "route")
{
assert(m_ConfigBlocks.at(i).find("layers") != m_ConfigBlocks.at(i).end());
nvinfer1::ILayer* out = routeLayer(i, m_ConfigBlocks.at(i), tensorOutputs, &network);
previous = out->getOutput(0);
@@ -278,7 +294,8 @@ NvDsInferStatus Yolo::buildYoloNetwork(
printLayerInfo(layerIndex, "route", " -", outputVol, std::to_string(weightPtr));
}
else if (m_ConfigBlocks.at(i).at("type") == "upsample") {
else if (m_ConfigBlocks.at(i).at("type") == "upsample")
{
std::string inputVol = dimsToString(previous->getDimensions());
nvinfer1::ILayer* out = upsampleLayer(i - 1, m_ConfigBlocks[i], previous, &network);
previous = out->getOutput(0);
@@ -288,7 +305,8 @@ NvDsInferStatus Yolo::buildYoloNetwork(
printLayerInfo(layerIndex, "upsample", inputVol, outputVol, " -");
}
else if (m_ConfigBlocks.at(i).at("type") == "maxpool") {
else if (m_ConfigBlocks.at(i).at("type") == "maxpool")
{
std::string inputVol = dimsToString(previous->getDimensions());
nvinfer1::ILayer* out = maxpoolLayer(i, m_ConfigBlocks.at(i), previous, &network);
previous = out->getOutput(0);
@@ -298,8 +316,10 @@ NvDsInferStatus Yolo::buildYoloNetwork(
printLayerInfo(layerIndex, "maxpool", inputVol, outputVol, std::to_string(weightPtr));
}
else if (m_ConfigBlocks.at(i).at("type") == "reorg") {
if (m_NetworkType.find("yolov5") != std::string::npos || m_NetworkType.find("yolor") != std::string::npos) {
else if (m_ConfigBlocks.at(i).at("type") == "reorg")
{
if (m_NetworkType.find("yolov5") != std::string::npos || m_NetworkType.find("yolor") != std::string::npos)
{
std::string inputVol = dimsToString(previous->getDimensions());
nvinfer1::ILayer* out = reorgV5Layer(i, previous, &network);
previous = out->getOutput(0);
@@ -310,7 +330,8 @@ NvDsInferStatus Yolo::buildYoloNetwork(
std::string layerType = "reorgV5";
printLayerInfo(layerIndex, layerType, inputVol, outputVol, std::to_string(weightPtr));
}
else {
else
{
std::string inputVol = dimsToString(previous->getDimensions());
nvinfer1::IPluginV2* reorgPlugin = createReorgPlugin(2);
assert(reorgPlugin != nullptr);
@@ -328,95 +349,127 @@ NvDsInferStatus Yolo::buildYoloNetwork(
}
}
else if (m_ConfigBlocks.at(i).at("type") == "yolo") {
uint modelType = 1;
uint newCoords = 0;
float scaleXY = 1.0;
if (m_NetworkType.find("yolor") != std::string::npos) {
modelType = 2;
}
if (m_ConfigBlocks.at(i).find("new_coords") != m_ConfigBlocks.at(i).end()) {
newCoords = std::stoi(m_ConfigBlocks.at(i).at("new_coords"));
}
if (m_ConfigBlocks.at(i).find("scale_x_y") != m_ConfigBlocks.at(i).end()) {
scaleXY = std::stof(m_ConfigBlocks.at(i).at("scale_x_y"));
else if (m_ConfigBlocks.at(i).at("type") == "yolo" || m_ConfigBlocks.at(i).at("type") == "region")
{
if (m_ConfigBlocks.at(i).at("type") == "yolo")
{
if (m_NetworkType.find("yolor") != std::string::npos)
modelType = 2;
else
modelType = 1;
}
else
modelType = 0;
std::string layerName = "yolo_" + std::to_string(i);
std::string layerName = modelType != 0 ? "yolo_" + std::to_string(i) : "region_" + std::to_string(i);
nvinfer1::Dims prevTensorDims = previous->getDimensions();
TensorInfo& curYoloTensor = m_OutputTensors.at(outputTensorCount);
m_NumClasses = curYoloTensor.numClasses;
TensorInfo& curYoloTensor = m_YoloTensors.at(inputYoloCount);
curYoloTensor.blobName = layerName;
nvinfer1::IPluginV2* yoloPlugin
= new YoloLayer(curYoloTensor.numBBoxes, curYoloTensor.numClasses, m_InputW, m_InputH,
prevTensorDims.d[2], prevTensorDims.d[1], modelType, newCoords, scaleXY,
curYoloTensor.anchors, curYoloTensor.mask);
assert(yoloPlugin != nullptr);
nvinfer1::IPluginV2Layer* yolo =
network.addPluginV2(&previous, 1, *yoloPlugin);
assert(yolo != nullptr);
yolo->setName(layerName.c_str());
std::string inputVol = dimsToString(previous->getDimensions());
previous = yolo->getOutput(0);
assert(previous != nullptr);
previous->setName(layerName.c_str());
std::string outputVol = dimsToString(previous->getDimensions());
network.markOutput(*previous);
channels = getNumChannels(previous);
tensorOutputs.push_back(yolo->getOutput(0));
printLayerInfo(layerIndex, "yolo", inputVol, outputVol, std::to_string(weightPtr));
++outputTensorCount;
}
curYoloTensor.gridSizeX = prevTensorDims.d[2];
curYoloTensor.gridSizeY = prevTensorDims.d[1];
else if (m_ConfigBlocks.at(i).at("type") == "region") {
std::vector<int> mask;
std::string layerName = "region_" + std::to_string(i);
nvinfer1::Dims prevTensorDims = previous->getDimensions();
TensorInfo& curRegionTensor = m_OutputTensors.at(outputTensorCount);
m_NumClasses = curRegionTensor.numClasses;
curRegionTensor.blobName = layerName;
nvinfer1::IPluginV2* regionPlugin
= new YoloLayer(curRegionTensor.numBBoxes, curRegionTensor.numClasses, m_InputW, m_InputH,
prevTensorDims.d[2], prevTensorDims.d[1], 0, 0, 1.0, curRegionTensor.anchors,
mask);
assert(regionPlugin != nullptr);
nvinfer1::IPluginV2Layer* region =
network.addPluginV2(&previous, 1, *regionPlugin);
assert(region != nullptr);
region->setName(layerName.c_str());
std::string inputVol = dimsToString(previous->getDimensions());
previous = region->getOutput(0);
assert(previous != nullptr);
previous->setName(layerName.c_str());
std::string outputVol = dimsToString(previous->getDimensions());
network.markOutput(*previous);
channels = getNumChannels(previous);
tensorOutputs.push_back(region->getOutput(0));
printLayerInfo(layerIndex, "region", inputVol, outputVol, std::to_string(weightPtr));
++outputTensorCount;
tensorOutputs.push_back(previous);
yoloInputs.push_back(previous);
++inputYoloCount;
printLayerInfo(layerIndex, modelType != 0 ? "yolo" : "region", inputVol, " -", " -");
}
else
{
std::cout << "Unsupported layer type --> \""
<< m_ConfigBlocks.at(i).at("type") << "\"" << std::endl;
std::cout << "\nUnsupported layer type --> \"" << m_ConfigBlocks.at(i).at("type") << "\"" << std::endl;
assert(0);
}
}
if ((int)weights.size() != weightPtr)
{
std::cout << "Number of unused weights left: " << weights.size() - weightPtr << std::endl;
std::cout << "\nNumber of unused weights left: " << weights.size() - weightPtr << std::endl;
assert(0);
}
std::cout << "Output YOLO blob names: " << std::endl;
for (auto& tensor : m_OutputTensors) {
if (m_YoloCount == inputYoloCount)
{
assert((modelType != -1) && "\nCould not determine model type");
nvinfer1::ITensor* yoloInputTensors[inputYoloCount];
uint64_t outputSize = 0;
for (uint j = 0; j < inputYoloCount; ++j)
{
yoloInputTensors[j] = yoloInputs[j];
TensorInfo& curYoloTensor = m_YoloTensors.at(j);
outputSize += curYoloTensor.gridSizeX * curYoloTensor.gridSizeY * curYoloTensor.numBBoxes;
}
if (m_TopK > outputSize) {
std::cout << "\ntopk > Number of outputs\nPlease change the topk to " << outputSize
<< " or less in config_nms.txt file\n" << std::endl;
assert(0);
}
std::string layerName = "yolo";
nvinfer1::IPluginV2* yoloPlugin = new YoloLayer(
m_InputW, m_InputH, m_NumClasses, m_NewCoords, m_YoloTensors, outputSize, modelType, m_TopK,
m_ScoreThreshold);
assert(yoloPlugin != nullptr);
nvinfer1::IPluginV2Layer* yolo = network.addPluginV2(yoloInputTensors, inputYoloCount, *yoloPlugin);
assert(yolo != nullptr);
yolo->setName(layerName.c_str());
previous = yolo->getOutput(0);
assert(previous != nullptr);
previous->setName(layerName.c_str());
tensorOutputs.push_back(yolo->getOutput(0));
nvinfer1::ITensor* yoloTensors[] = {yolo->getOutput(0), yolo->getOutput(1)};
std::string outputVol = dimsToString(previous->getDimensions());
nvinfer1::plugin::NMSParameters nmsParams;
nmsParams.shareLocation = true;
nmsParams.backgroundLabelId = -1;
nmsParams.numClasses = m_NumClasses;
nmsParams.topK = m_TopK;
nmsParams.keepTopK = m_TopK;
nmsParams.scoreThreshold = m_ScoreThreshold;
nmsParams.iouThreshold = m_IouThreshold;
nmsParams.isNormalized = false;
layerName = "batchedNMS";
nvinfer1::IPluginV2* batchedNMS = createBatchedNMSPlugin(nmsParams);
nvinfer1::IPluginV2Layer* nms = network.addPluginV2(yoloTensors, 2, *batchedNMS);
nms->setName(layerName.c_str());
nvinfer1::ITensor* num_detections = nms->getOutput(0);
layerName = "num_detections";
num_detections->setName(layerName.c_str());
nvinfer1::ITensor* nmsed_boxes = nms->getOutput(1);
layerName = "nmsed_boxes";
nmsed_boxes->setName(layerName.c_str());
nvinfer1::ITensor* nmsed_scores = nms->getOutput(2);
layerName = "nmsed_scores";
nmsed_scores->setName(layerName.c_str());
nvinfer1::ITensor* nmsed_classes = nms->getOutput(3);
layerName = "nmsed_classes";
nmsed_classes->setName(layerName.c_str());
network.markOutput(*num_detections);
network.markOutput(*nmsed_boxes);
network.markOutput(*nmsed_scores);
network.markOutput(*nmsed_classes);
printLayerInfo("", "batched_nms", " -", outputVol, " -");
}
else {
std::cout << "\nError in yolo cfg file" << std::endl;
assert(0);
}
std::cout << "\nOutput YOLO blob names: " << std::endl;
for (auto& tensor : m_YoloTensors)
{
std::cout << tensor.blobName << std::endl;
}
int nbLayers = network.getNbLayers();
std::cout << "Total number of YOLO layers: " << nbLayers << std::endl;
std::cout << "\nTotal number of YOLO layers: " << nbLayers << "\n" << std::endl;
return NVDSINFER_SUCCESS;
}
@@ -462,39 +515,44 @@ Yolo::parseConfigFile (const std::string cfgFilePath)
void Yolo::parseConfigBlocks()
{
for (auto block : m_ConfigBlocks) {
for (auto block : m_ConfigBlocks)
{
if (block.at("type") == "net")
{
assert((block.find("height") != block.end())
&& "Missing 'height' param in network cfg");
assert((block.find("height") != block.end()) && "Missing 'height' param in network cfg");
assert((block.find("width") != block.end()) && "Missing 'width' param in network cfg");
assert((block.find("channels") != block.end())
&& "Missing 'channels' param in network cfg");
assert((block.find("channels") != block.end()) && "Missing 'channels' param in network cfg");
m_InputH = std::stoul(block.at("height"));
m_InputW = std::stoul(block.at("width"));
m_InputC = std::stoul(block.at("channels"));
m_InputSize = m_InputC * m_InputH * m_InputW;
if (block.find("letter_box") != block.end()) {
if (block.find("letter_box") != block.end())
{
m_LetterBox = std::stoul(block.at("letter_box"));
}
else {
m_LetterBox = 0;
}
}
else if ((block.at("type") == "region") || (block.at("type") == "yolo") || (block.at("type") == "detect"))
else if ((block.at("type") == "region") || (block.at("type") == "yolo"))
{
assert((block.find("num") != block.end())
&& std::string("Missing 'num' param in " + block.at("type") + " layer").c_str());
assert((block.find("classes") != block.end())
&& std::string("Missing 'classes' param in " + block.at("type") + " layer")
.c_str());
&& std::string("Missing 'classes' param in " + block.at("type") + " layer").c_str());
assert((block.find("anchors") != block.end())
&& std::string("Missing 'anchors' param in " + block.at("type") + " layer")
.c_str());
&& std::string("Missing 'anchors' param in " + block.at("type") + " layer").c_str());
++m_YoloCount;
m_NumClasses = std::stoul(block.at("classes"));
if (block.find("new_coords") != block.end())
{
m_NewCoords = std::stoul(block.at("new_coords"));
}
TensorInfo outputTensor;
std::string anchorString = block.at("anchors");
while (!anchorString.empty())
{
@@ -513,7 +571,8 @@ void Yolo::parseConfigBlocks()
}
}
if (block.find("mask") != block.end()) {
if (block.find("mask") != block.end())
{
std::string maskString = block.at("mask");
while (!maskString.empty())
{
@@ -533,17 +592,41 @@ void Yolo::parseConfigBlocks()
}
}
outputTensor.numBBoxes = outputTensor.mask.size() > 0
? outputTensor.mask.size()
: std::stoul(trim(block.at("num")));
outputTensor.numClasses = std::stoul(block.at("classes"));
m_OutputTensors.push_back(outputTensor);
if (block.find("scale_x_y") != block.end())
{
outputTensor.scaleXY = std::stof(block.at("scale_x_y"));
}
else
{
outputTensor.scaleXY = 1.0;
}
outputTensor.numBBoxes
= outputTensor.mask.size() > 0 ? outputTensor.mask.size() : std::stoul(trim(block.at("num")));
m_YoloTensors.push_back(outputTensor);
}
}
}
void Yolo::destroyNetworkUtils() {
for (uint i = 0; i < m_TrtWeights.size(); ++i) {
void Yolo::parseConfigNMSBlocks()
{
auto block = m_ConfigNMSBlocks[0];
assert((block.at("type") == "property") && "Missing 'property' param in nms cfg");
assert((block.find("iou-threshold") != block.end()) && "Missing 'iou-threshold' param in nms cfg");
assert((block.find("score-threshold") != block.end()) && "Missing 'score-threshold' param in nms cfg");
assert((block.find("topk") != block.end()) && "Missing 'topk' param in nms cfg");
m_IouThreshold = std::stof(block.at("iou-threshold"));
m_ScoreThreshold = std::stof(block.at("score-threshold"));
m_TopK = std::stoul(block.at("topk"));
}
void Yolo::destroyNetworkUtils()
{
for (uint i = 0; i < m_TrtWeights.size(); ++i)
{
if (m_TrtWeights[i].count > 0)
free(const_cast<void*>(m_TrtWeights[i].values));
}

View File

@@ -29,7 +29,6 @@
#include "layers/convolutional_layer.h"
#include "layers/implicit_layer.h"
#include "layers/channels_layer.h"
#include "layers/dropout_layer.h"
#include "layers/shortcut_layer.h"
#include "layers/route_layer.h"
#include "layers/upsample_layer.h"
@@ -54,8 +53,10 @@ struct NetworkInfo
struct TensorInfo
{
std::string blobName;
uint gridSizeX {0};
uint gridSizeY {0};
uint numBBoxes {0};
uint numClasses {0};
float scaleXY;
std::vector<float> anchors;
std::vector<int> mask;
};
@@ -63,12 +64,15 @@ struct TensorInfo
class Yolo : public IModelParser {
public:
Yolo(const NetworkInfo& networkInfo);
~Yolo() override;
bool hasFullDimsSupported() const override { return false; }
const char* getModelName() const override {
return m_ConfigFilePath.empty() ? m_NetworkType.c_str()
: m_ConfigFilePath.c_str();
return m_ConfigFilePath.empty() ? m_NetworkType.c_str() : m_ConfigFilePath.c_str();
}
NvDsInferStatus parseModel(nvinfer1::INetworkDefinition& network) override;
nvinfer1::ICudaEngine *createEngine (nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config);
@@ -90,17 +94,26 @@ protected:
uint64_t m_InputSize;
uint m_NumClasses;
uint m_LetterBox;
uint m_NewCoords;
uint m_YoloCount;
float m_IouThreshold;
float m_ScoreThreshold;
uint m_TopK;
std::vector<TensorInfo> m_OutputTensors;
std::vector<TensorInfo> m_YoloTensors;
std::vector<std::map<std::string, std::string>> m_ConfigBlocks;
std::vector<std::map<std::string, std::string>> m_ConfigNMSBlocks;
std::vector<nvinfer1::Weights> m_TrtWeights;
private:
NvDsInferStatus buildYoloNetwork(
std::vector<float>& weights, nvinfer1::INetworkDefinition& network);
std::vector<std::map<std::string, std::string>> parseConfigFile(
const std::string cfgFilePath);
NvDsInferStatus buildYoloNetwork(std::vector<float>& weights, nvinfer1::INetworkDefinition& network);
std::vector<std::map<std::string, std::string>> parseConfigFile(const std::string cfgFilePath);
void parseConfigBlocks();
void parseConfigNMSBlocks();
void destroyNetworkUtils();
};

View File

@@ -3,18 +3,14 @@
* https://www.github.com/marcoslucianops
*/
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
inline __device__ float sigmoidGPU(const float& x) { return 1.0f / (1.0f + __expf(-x)); }
__global__ void gpuYoloLayer(
const float* input, float* output, const uint netWidth, const uint netHeight, const uint gridSizeX,
const uint gridSizeY, const uint numOutputClasses, const uint numBBoxes, const float scaleXY,
const float* anchors, const int* mask)
const float* input, int* d_indexes, float* d_scores, float* d_boxes, int* d_classes, int* countData,
const float scoreThreshold, const uint netWidth, const uint netHeight, const uint gridSizeX, const uint gridSizeY,
const uint numOutputClasses, const uint numBBoxes, const float scaleXY, const float* anchors, const int* mask)
{
uint x_id = blockIdx.x * blockDim.x + threadIdx.x;
uint y_id = blockIdx.y * blockDim.y + threadIdx.y;
@@ -28,28 +24,32 @@ __global__ void gpuYoloLayer(
const int numGridCells = gridSizeX * gridSizeY;
const int bbindex = y_id * gridSizeX + x_id;
const float objectness
= sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)]);
if (objectness < scoreThreshold) return;
int count = (int)atomicAdd(&countData[0], 1);
const float alpha = scaleXY;
const float beta = -0.5 * (scaleXY - 1);
output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 0)]
float x
= (sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 0)])
* alpha + beta + x_id) * netWidth / gridSizeX;
output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 1)]
float y
= (sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 1)])
* alpha + beta + y_id) * netHeight / gridSizeY;
output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 2)]
float w
= __expf(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 2)])
* anchors[mask[z_id] * 2];
output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 3)]
float h
= __expf(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 3)])
* anchors[mask[z_id] * 2 + 1];
const float objectness
= sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)]);
float maxProb = 0.0f;
int maxIndex = -1;
@@ -65,22 +65,26 @@ __global__ void gpuYoloLayer(
}
}
output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)]
= objectness * maxProb;
output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 5)]
= maxIndex;
d_indexes[count] = count;
d_scores[count] = objectness * maxProb + 1.f;
d_boxes[count * 4 + 0] = x - 0.5 * w;
d_boxes[count * 4 + 1] = y - 0.5 * h;
d_boxes[count * 4 + 2] = x + 0.5 * w;
d_boxes[count * 4 + 3] = y + 0.5 * h;
d_classes[count] = maxIndex;
}
cudaError_t cudaYoloLayer(
const void* input, void* output, const uint& batchSize, const uint& netWidth, const uint& netHeight,
const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes,
uint64_t& outputSize, const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream);
const void* input, void* d_indexes, void* d_scores, void* d_boxes, void* d_classes, void* countData,
const uint& batchSize, uint64_t& inputSize, uint64_t& outputSize, const float& scoreThreshold, const uint& netWidth,
const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes,
const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream);
cudaError_t cudaYoloLayer(
const void* input, void* output, const uint& batchSize, const uint& netWidth, const uint& netHeight,
const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes,
uint64_t& outputSize, const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream)
const void* input, void* d_indexes, void* d_scores, void* d_boxes, void* d_classes, void* countData,
const uint& batchSize, uint64_t& inputSize, uint64_t& outputSize, const float& scoreThreshold, const uint& netWidth,
const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes,
const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream)
{
dim3 threads_per_block(16, 16, 4);
dim3 number_of_blocks((gridSizeX / threads_per_block.x) + 1,
@@ -90,9 +94,12 @@ cudaError_t cudaYoloLayer(
for (unsigned int batch = 0; batch < batchSize; ++batch)
{
gpuYoloLayer<<<number_of_blocks, threads_per_block, 0, stream>>>(
reinterpret_cast<const float*>(input) + (batch * outputSize),
reinterpret_cast<float*>(output) + (batch * outputSize),
netWidth, netHeight, gridSizeX, gridSizeY, numOutputClasses, numBBoxes, scaleXY,
reinterpret_cast<const float*>(input) + (batch * inputSize),
reinterpret_cast<int*>(d_indexes) + (batch * outputSize),
reinterpret_cast<float*>(d_scores) + (batch * outputSize),
reinterpret_cast<float*>(d_boxes) + (batch * 4 * outputSize),
reinterpret_cast<int*>(d_classes) + (batch * outputSize), reinterpret_cast<int*>(countData) + (batch),
scoreThreshold, netWidth, netHeight, gridSizeX, gridSizeY, numOutputClasses, numBBoxes, scaleXY,
reinterpret_cast<const float*>(anchors), reinterpret_cast<const int*>(mask));
}
return cudaGetLastError();

View File

@@ -3,16 +3,12 @@
* https://www.github.com/marcoslucianops
*/
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
__global__ void gpuYoloLayer_nc(
const float* input, float* output, const uint netWidth, const uint netHeight, const uint gridSizeX,
const uint gridSizeY, const uint numOutputClasses, const uint numBBoxes, const float scaleXY,
const float* anchors, const int* mask)
const float* input, int* d_indexes, float* d_scores, float* d_boxes, int* d_classes, int* countData,
const float scoreThreshold, const uint netWidth, const uint netHeight, const uint gridSizeX, const uint gridSizeY,
const uint numOutputClasses, const uint numBBoxes, const float scaleXY, const float* anchors, const int* mask)
{
uint x_id = blockIdx.x * blockDim.x + threadIdx.x;
uint y_id = blockIdx.y * blockDim.y + threadIdx.y;
@@ -26,28 +22,32 @@ __global__ void gpuYoloLayer_nc(
const int numGridCells = gridSizeX * gridSizeY;
const int bbindex = y_id * gridSizeX + x_id;
const float objectness
= input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)];
if (objectness < scoreThreshold) return;
int count = (int)atomicAdd(&countData[0], 1);
const float alpha = scaleXY;
const float beta = -0.5 * (scaleXY - 1);
output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 0)]
float x
= (input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 0)]
* alpha + beta + x_id) * netWidth / gridSizeX;
output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 1)]
float y
= (input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 1)]
* alpha + beta + y_id) * netHeight / gridSizeY;
output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 2)]
float w
= __powf(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 2)] * 2, 2)
* anchors[mask[z_id] * 2];
output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 3)]
float h
= __powf(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 3)] * 2, 2)
* anchors[mask[z_id] * 2 + 1];
const float objectness
= input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)];
float maxProb = 0.0f;
int maxIndex = -1;
@@ -63,22 +63,26 @@ __global__ void gpuYoloLayer_nc(
}
}
output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)]
= objectness * maxProb;
output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 5)]
= maxIndex;
d_indexes[count] = count;
d_scores[count] = objectness * maxProb + 1.f;
d_boxes[count * 4 + 0] = x - 0.5 * w;
d_boxes[count * 4 + 1] = y - 0.5 * h;
d_boxes[count * 4 + 2] = x + 0.5 * w;
d_boxes[count * 4 + 3] = y + 0.5 * h;
d_classes[count] = maxIndex;
}
cudaError_t cudaYoloLayer_nc(
const void* input, void* output, const uint& batchSize, const uint& netWidth, const uint& netHeight,
const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes,
uint64_t& outputSize, const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream);
const void* input, void* d_indexes, void* d_scores, void* d_boxes, void* d_classes, void* countData,
const uint& batchSize, uint64_t& inputSize, uint64_t& outputSize, const float& scoreThreshold, const uint& netWidth,
const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes,
const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream);
cudaError_t cudaYoloLayer_nc(
const void* input, void* output, const uint& batchSize, const uint& netWidth, const uint& netHeight,
const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes,
uint64_t& outputSize, const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream)
const void* input, void* d_indexes, void* d_scores, void* d_boxes, void* d_classes, void* countData,
const uint& batchSize, uint64_t& inputSize, uint64_t& outputSize, const float& scoreThreshold, const uint& netWidth,
const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes,
const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream)
{
dim3 threads_per_block(16, 16, 4);
dim3 number_of_blocks((gridSizeX / threads_per_block.x) + 1,
@@ -88,9 +92,12 @@ cudaError_t cudaYoloLayer_nc(
for (unsigned int batch = 0; batch < batchSize; ++batch)
{
gpuYoloLayer_nc<<<number_of_blocks, threads_per_block, 0, stream>>>(
reinterpret_cast<const float*>(input) + (batch * outputSize),
reinterpret_cast<float*>(output) + (batch * outputSize),
netWidth, netHeight, gridSizeX, gridSizeY, numOutputClasses, numBBoxes, scaleXY,
reinterpret_cast<const float*>(input) + (batch * inputSize),
reinterpret_cast<int*>(d_indexes) + (batch * outputSize),
reinterpret_cast<float*>(d_scores) + (batch * outputSize),
reinterpret_cast<float*>(d_boxes) + (batch * 4 * outputSize),
reinterpret_cast<int*>(d_classes) + (batch * outputSize), reinterpret_cast<int*>(countData) + (batch),
scoreThreshold, netWidth, netHeight, gridSizeX, gridSizeY, numOutputClasses, numBBoxes, scaleXY,
reinterpret_cast<const float*>(anchors), reinterpret_cast<const int*>(mask));
}
return cudaGetLastError();

View File

@@ -3,18 +3,14 @@
* https://www.github.com/marcoslucianops
*/
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
inline __device__ float sigmoidGPU(const float& x) { return 1.0f / (1.0f + __expf(-x)); }
__global__ void gpuYoloLayer_r(
const float* input, float* output, const uint netWidth, const uint netHeight, const uint gridSizeX,
const uint gridSizeY, const uint numOutputClasses, const uint numBBoxes, const float scaleXY,
const float* anchors, const int* mask)
const float* input, int* d_indexes, float* d_scores, float* d_boxes, int* d_classes, int* countData,
const float scoreThreshold, const uint netWidth, const uint netHeight, const uint gridSizeX, const uint gridSizeY,
const uint numOutputClasses, const uint numBBoxes, const float scaleXY, const float* anchors, const int* mask)
{
uint x_id = blockIdx.x * blockDim.x + threadIdx.x;
uint y_id = blockIdx.y * blockDim.y + threadIdx.y;
@@ -28,28 +24,32 @@ __global__ void gpuYoloLayer_r(
const int numGridCells = gridSizeX * gridSizeY;
const int bbindex = y_id * gridSizeX + x_id;
const float objectness
= sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)]);
if (objectness < scoreThreshold) return;
int count = (int)atomicAdd(&countData[0], 1);
const float alpha = scaleXY;
const float beta = -0.5 * (scaleXY - 1);
output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 0)]
float x
= (sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 0)])
* alpha + beta + x_id) * netWidth / gridSizeX;
output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 1)]
float y
= (sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 1)])
* alpha + beta + y_id) * netHeight / gridSizeY;
output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 2)]
float w
= __powf(sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 2)]) * 2, 2)
* anchors[mask[z_id] * 2];
output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 3)]
float h
= __powf(sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 3)]) * 2, 2)
* anchors[mask[z_id] * 2 + 1];
const float objectness
= sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)]);
float maxProb = 0.0f;
int maxIndex = -1;
@@ -65,22 +65,26 @@ __global__ void gpuYoloLayer_r(
}
}
output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)]
= objectness * maxProb;
output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 5)]
= maxIndex;
d_indexes[count] = count;
d_scores[count] = objectness * maxProb + 1.f;
d_boxes[count * 4 + 0] = x - 0.5 * w;
d_boxes[count * 4 + 1] = y - 0.5 * h;
d_boxes[count * 4 + 2] = x + 0.5 * w;
d_boxes[count * 4 + 3] = y + 0.5 * h;
d_classes[count] = maxIndex;
}
cudaError_t cudaYoloLayer_r(
const void* input, void* output, const uint& batchSize, const uint& netWidth, const uint& netHeight,
const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes,
uint64_t& outputSize, const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream);
const void* input, void* d_indexes, void* d_scores, void* d_boxes, void* d_classes, void* countData,
const uint& batchSize, uint64_t& inputSize, uint64_t& outputSize, const float& scoreThreshold, const uint& netWidth,
const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes,
const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream);
cudaError_t cudaYoloLayer_r(
const void* input, void* output, const uint& batchSize, const uint& netWidth, const uint& netHeight,
const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes,
uint64_t& outputSize, const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream)
const void* input, void* d_indexes, void* d_scores, void* d_boxes, void* d_classes, void* countData,
const uint& batchSize, uint64_t& inputSize, uint64_t& outputSize, const float& scoreThreshold, const uint& netWidth,
const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes,
const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream)
{
dim3 threads_per_block(16, 16, 4);
dim3 number_of_blocks((gridSizeX / threads_per_block.x) + 1,
@@ -90,9 +94,12 @@ cudaError_t cudaYoloLayer_r(
for (unsigned int batch = 0; batch < batchSize; ++batch)
{
gpuYoloLayer_r<<<number_of_blocks, threads_per_block, 0, stream>>>(
reinterpret_cast<const float*>(input) + (batch * outputSize),
reinterpret_cast<float*>(output) + (batch * outputSize),
netWidth, netHeight, gridSizeX, gridSizeY, numOutputClasses, numBBoxes, scaleXY,
reinterpret_cast<const float*>(input) + (batch * inputSize),
reinterpret_cast<int*>(d_indexes) + (batch * outputSize),
reinterpret_cast<float*>(d_scores) + (batch * outputSize),
reinterpret_cast<float*>(d_boxes) + (batch * 4 * outputSize),
reinterpret_cast<int*>(d_classes) + (batch * outputSize), reinterpret_cast<int*>(countData) + (batch),
scoreThreshold, netWidth, netHeight, gridSizeX, gridSizeY, numOutputClasses, numBBoxes, scaleXY,
reinterpret_cast<const float*>(anchors), reinterpret_cast<const int*>(mask));
}
return cudaGetLastError();

View File

@@ -3,17 +3,13 @@
* https://www.github.com/marcoslucianops
*/
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
inline __device__ float sigmoidGPU(const float& x) { return 1.0f / (1.0f + __expf(-x)); }
__device__ void softmaxGPU(
const float* input, const int bbindex, const int numGridCells, uint z_id,
const uint numOutputClasses, float temp, float* output)
const float* input, const int bbindex, const int numGridCells, uint z_id, const uint numOutputClasses, float temp,
float* output)
{
int i;
float sum = 0;
@@ -33,9 +29,9 @@ __device__ void softmaxGPU(
}
__global__ void gpuRegionLayer(
const float* input, float* output, float* softmax, const uint netWidth, const uint netHeight,
const uint gridSizeX, const uint gridSizeY, const uint numOutputClasses, const uint numBBoxes,
const float* anchors)
const float* input, float* softmax, int* d_indexes, float* d_scores, float* d_boxes, int* d_classes, int* countData,
const float scoreThreshold, const uint netWidth, const uint netHeight, const uint gridSizeX, const uint gridSizeY,
const uint numOutputClasses, const uint numBBoxes, const float* anchors)
{
uint x_id = blockIdx.x * blockDim.x + threadIdx.x;
uint y_id = blockIdx.y * blockDim.y + threadIdx.y;
@@ -49,27 +45,31 @@ __global__ void gpuRegionLayer(
const int numGridCells = gridSizeX * gridSizeY;
const int bbindex = y_id * gridSizeX + x_id;
output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 0)]
const float objectness
= sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)]);
if (objectness < scoreThreshold) return;
int count = (int)atomicAdd(&countData[0], 1);
float x
= (sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 0)])
+ x_id) * netWidth / gridSizeX;
output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 1)]
float y
= (sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 1)])
+ y_id) * netHeight / gridSizeY;
output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 2)]
float w
= __expf(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 2)])
* anchors[z_id * 2] * netWidth / gridSizeX;
output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 3)]
float h
= __expf(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 3)])
* anchors[z_id * 2 + 1] * netHeight / gridSizeY;
softmaxGPU(input, bbindex, numGridCells, z_id, numOutputClasses, 1.0, softmax);
const float objectness
= sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)]);
float maxProb = 0.0f;
int maxIndex = -1;
@@ -85,22 +85,26 @@ __global__ void gpuRegionLayer(
}
}
output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)]
= objectness * maxProb;
output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 5)]
= maxIndex;
d_indexes[count] = count;
d_scores[count] = objectness * maxProb + 1.f;
d_boxes[count * 4 + 0] = x - 0.5 * w;
d_boxes[count * 4 + 1] = y - 0.5 * h;
d_boxes[count * 4 + 2] = x + 0.5 * w;
d_boxes[count * 4 + 3] = y + 0.5 * h;
d_classes[count] = maxIndex;
}
cudaError_t cudaRegionLayer(
const void* input, void* output, void* softmax, const uint& batchSize, const uint& netWidth,
const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses,
const uint& numBBoxes, uint64_t& outputSize, const void* anchors, cudaStream_t stream);
const void* input, void* softmax, void* d_indexes, void* d_scores, void* d_boxes, void* d_classes, void* countData,
const uint& batchSize, uint64_t& inputSize, uint64_t& outputSize, const float& scoreThreshold, const uint& netWidth,
const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes,
const void* anchors, cudaStream_t stream);
cudaError_t cudaRegionLayer(
const void* input, void* output, void* softmax, const uint& batchSize, const uint& netWidth,
const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses,
const uint& numBBoxes, uint64_t& outputSize, const void* anchors, cudaStream_t stream)
const void* input, void* softmax, void* d_indexes, void* d_scores, void* d_boxes, void* d_classes, void* countData,
const uint& batchSize, uint64_t& inputSize, uint64_t& outputSize, const float& scoreThreshold, const uint& netWidth,
const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes,
const void* anchors, cudaStream_t stream)
{
dim3 threads_per_block(16, 16, 4);
dim3 number_of_blocks((gridSizeX / threads_per_block.x) + 1,
@@ -110,10 +114,13 @@ cudaError_t cudaRegionLayer(
for (unsigned int batch = 0; batch < batchSize; ++batch)
{
gpuRegionLayer<<<number_of_blocks, threads_per_block, 0, stream>>>(
reinterpret_cast<const float*>(input) + (batch * outputSize),
reinterpret_cast<float*>(output) + (batch * outputSize),
reinterpret_cast<float*>(softmax) + (batch * outputSize),
netWidth, netHeight, gridSizeX, gridSizeY, numOutputClasses, numBBoxes,
reinterpret_cast<const float*>(input) + (batch * inputSize),
reinterpret_cast<float*>(softmax) + (batch * inputSize),
reinterpret_cast<int*>(d_indexes) + (batch * outputSize),
reinterpret_cast<float*>(d_scores) + (batch * outputSize),
reinterpret_cast<float*>(d_boxes) + (batch * 4 * outputSize),
reinterpret_cast<int*>(d_classes) + (batch * outputSize), reinterpret_cast<int*>(countData) + (batch),
scoreThreshold, netWidth, netHeight, gridSizeX, gridSizeY, numOutputClasses, numBBoxes,
reinterpret_cast<const float*>(anchors));
}
return cudaGetLastError();

View File

@@ -29,7 +29,6 @@
#include <iostream>
#include <memory>
uint kNUM_BBOXES;
uint kNUM_CLASSES;
namespace {
@@ -49,131 +48,108 @@ namespace {
}
cudaError_t cudaYoloLayer_r(
const void* input, void* output, const uint& batchSize, const uint& netWidth, const uint& netHeight,
const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes,
uint64_t& outputSize, const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream);
const void* input, void* d_indexes, void* d_scores, void* d_boxes, void* d_classes, void* countData,
const uint& batchSize, uint64_t& inputSize, uint64_t& outputSize, const float& scoreThreshold, const uint& netWidth,
const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes,
const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream);
cudaError_t cudaYoloLayer_nc(
const void* input, void* output, const uint& batchSize, const uint& netWidth, const uint& netHeight,
const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes,
uint64_t& outputSize, const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream);
const void* input, void* d_indexes, void* d_scores, void* d_boxes, void* d_classes, void* countData,
const uint& batchSize, uint64_t& inputSize, uint64_t& outputSize, const float& scoreThreshold, const uint& netWidth,
const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes,
const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream);
cudaError_t cudaYoloLayer(
const void* input, void* output, const uint& batchSize, const uint& netWidth, const uint& netHeight,
const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes,
uint64_t& outputSize, const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream);
const void* input, void* d_indexes, void* d_scores, void* d_boxes, void* d_classes, void* countData,
const uint& batchSize, uint64_t& inputSize, uint64_t& outputSize, const float& scoreThreshold, const uint& netWidth,
const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes,
const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream);
cudaError_t cudaRegionLayer(
const void* input, void* output, void* softmax, const uint& batchSize, const uint& netWidth,
const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses,
const uint& numBBoxes, uint64_t& outputSize, const void* anchors, cudaStream_t stream);
const void* input, void* softmax, void* d_indexes, void* d_scores, void* d_boxes, void* d_classes, void* countData,
const uint& batchSize, uint64_t& inputSize, uint64_t& outputSize, const float& scoreThreshold, const uint& netWidth,
const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes,
const void* anchors, cudaStream_t stream);
cudaError_t sortDetections(
void* d_indexes, void* d_scores, void* d_boxes, void* d_classes, void* bboxData, void* scoreData, void* countData,
const uint& batchSize, uint64_t& outputSize, uint& topK, const uint& numOutputClasses, cudaStream_t stream);
YoloLayer::YoloLayer (const void* data, size_t length)
{
const char *d = static_cast<const char*>(data);
read(d, m_NumBBoxes);
read(d, m_NumClasses);
read(d, m_NetWidth);
read(d, m_NetHeight);
read(d, m_GridSizeX);
read(d, m_GridSizeY);
read(d, m_Type);
read(d, m_NumClasses);
read(d, m_NewCoords);
read(d, m_ScaleXY);
read(d, m_OutputSize);
read(d, m_Type);
read(d, m_TopK);
read(d, m_ScoreThreshold);
uint anchorsSize;
read(d, anchorsSize);
for (uint i = 0; i < anchorsSize; i++) {
float result;
read(d, result);
m_Anchors.push_back(result);
uint yoloTensorsSize;
read(d, yoloTensorsSize);
for (uint i = 0; i < yoloTensorsSize; ++i)
{
TensorInfo curYoloTensor;
read(d, curYoloTensor.gridSizeX);
read(d, curYoloTensor.gridSizeY);
read(d, curYoloTensor.numBBoxes);
read(d, curYoloTensor.scaleXY);
uint anchorsSize;
read(d, anchorsSize);
for (uint j = 0; j < anchorsSize; j++)
{
float result;
read(d, result);
curYoloTensor.anchors.push_back(result);
}
uint maskSize;
read(d, maskSize);
for (uint j = 0; j < maskSize; j++)
{
int result;
read(d, result);
curYoloTensor.mask.push_back(result);
}
m_YoloTensors.push_back(curYoloTensor);
}
uint maskSize;
read(d, maskSize);
for (uint i = 0; i < maskSize; i++) {
int result;
read(d, result);
m_Mask.push_back(result);
}
if (m_Anchors.size() > 0) {
float* anchors = m_Anchors.data();
CUDA_CHECK(cudaMallocHost(&p_Anchors, m_Anchors.size() * sizeof(float)));
CUDA_CHECK(cudaMemcpy(p_Anchors, anchors, m_Anchors.size() * sizeof(float), cudaMemcpyHostToDevice));
}
if (m_Mask.size() > 0) {
int* mask = m_Mask.data();
CUDA_CHECK(cudaMallocHost(&p_Mask, m_Mask.size() * sizeof(int)));
CUDA_CHECK(cudaMemcpy(p_Mask, mask, m_Mask.size() * sizeof(int), cudaMemcpyHostToDevice));
}
kNUM_BBOXES = m_NumBBoxes;
kNUM_CLASSES = m_NumClasses;
};
YoloLayer::YoloLayer (
const uint& numBBoxes, const uint& numClasses, const uint& netWidth, const uint& netHeight,
const uint& gridSizeX, const uint& gridSizeY, const uint& modelType, const uint& newCoords,
const float& scaleXY, const std::vector<float> anchors,
const std::vector<int> mask) :
m_NumBBoxes(numBBoxes),
m_NumClasses(numClasses),
YoloLayer::YoloLayer(
const uint& netWidth, const uint& netHeight, const uint& numClasses, const uint& newCoords,
const std::vector<TensorInfo>& yoloTensors, const uint64_t& outputSize, const uint& modelType, const uint& topK,
const float& scoreThreshold) :
m_NetWidth(netWidth),
m_NetHeight(netHeight),
m_GridSizeX(gridSizeX),
m_GridSizeY(gridSizeY),
m_Type(modelType),
m_NumClasses(numClasses),
m_NewCoords(newCoords),
m_ScaleXY(scaleXY),
m_Anchors(anchors),
m_Mask(mask)
m_YoloTensors(yoloTensors),
m_OutputSize(outputSize),
m_Type(modelType),
m_TopK(topK),
m_ScoreThreshold(scoreThreshold)
{
assert(m_NumBBoxes > 0);
assert(m_NumClasses > 0);
assert(m_NetWidth > 0);
assert(m_NetHeight > 0);
assert(m_GridSizeX > 0);
assert(m_GridSizeY > 0);
m_OutputSize = m_GridSizeX * m_GridSizeY * (m_NumBBoxes * (4 + 1 + m_NumClasses));
if (m_Anchors.size() > 0) {
float* anchors = m_Anchors.data();
CUDA_CHECK(cudaMallocHost(&p_Anchors, m_Anchors.size() * sizeof(float)));
CUDA_CHECK(cudaMemcpy(p_Anchors, anchors, m_Anchors.size() * sizeof(float), cudaMemcpyHostToDevice));
}
if (m_Mask.size() > 0) {
int* mask = m_Mask.data();
CUDA_CHECK(cudaMallocHost(&p_Mask, m_Mask.size() * sizeof(int)));
CUDA_CHECK(cudaMemcpy(p_Mask, mask, m_Mask.size() * sizeof(int), cudaMemcpyHostToDevice));
}
kNUM_BBOXES = m_NumBBoxes;
kNUM_CLASSES = m_NumClasses;
};
YoloLayer::~YoloLayer()
{
if (m_Anchors.size() > 0) {
CUDA_CHECK(cudaFreeHost(p_Anchors));
}
if (m_Mask.size() > 0) {
CUDA_CHECK(cudaFreeHost(p_Mask));
}
}
nvinfer1::Dims
YoloLayer::getOutputDimensions(
int index, const nvinfer1::Dims* inputs, int nbInputDims) noexcept
{
assert(index == 0);
assert(nbInputDims == 1);
return inputs[0];
assert(index < 3);
if (index == 0) {
return nvinfer1::Dims3(m_TopK, 1, 4);
}
return nvinfer1::DimsHW(m_TopK, m_NumClasses);
}
bool YoloLayer::supportsFormat (
@@ -188,43 +164,116 @@ YoloLayer::configureWithFormat (
const nvinfer1::Dims* outputDims, int nbOutputs,
nvinfer1::DataType type, nvinfer1::PluginFormat format, int maxBatchSize) noexcept
{
assert(nbInputs == 1);
assert(nbInputs > 0);
assert(format == nvinfer1::PluginFormat::kLINEAR);
assert(inputDims != nullptr);
}
int32_t YoloLayer::enqueue (
int32_t batchSize, void const* const* inputs, void* const* outputs, void* workspace,
int batchSize, void const* const* inputs, void* const* outputs, void* workspace,
cudaStream_t stream) noexcept
{
if (m_Type == 2) { // YOLOR incorrect param: scale_x_y = 2.0
CUDA_CHECK(cudaYoloLayer_r(
inputs[0], outputs[0], batchSize, m_NetWidth, m_NetHeight, m_GridSizeX, m_GridSizeY,
m_NumClasses, m_NumBBoxes, m_OutputSize, 2.0, p_Anchors, p_Mask, stream));
}
else if (m_Type == 1) {
if (m_NewCoords) {
CUDA_CHECK(cudaYoloLayer_nc(
inputs[0], outputs[0], batchSize, m_NetWidth, m_NetHeight, m_GridSizeX, m_GridSizeY,
m_NumClasses, m_NumBBoxes, m_OutputSize, m_ScaleXY, p_Anchors, p_Mask, stream));
void* countData = workspace;
void* bboxData = outputs[0];
void* scoreData = outputs[1];
CUDA_CHECK(cudaMemsetAsync((int*)countData, 0, sizeof(int) * batchSize, stream));
CUDA_CHECK(cudaMemsetAsync((float*)bboxData, 0, sizeof(float) * m_TopK * 4 * batchSize, stream));
CUDA_CHECK(cudaMemsetAsync((float*)scoreData, 0, sizeof(float) * m_TopK * m_NumClasses * batchSize, stream));
void* d_indexes;
CUDA_CHECK(cudaMallocHost(&d_indexes, sizeof(int) * m_OutputSize * batchSize));
CUDA_CHECK(cudaMemsetAsync((float*)d_indexes, 0, sizeof(int) * m_OutputSize * batchSize, stream));
void* d_scores;
CUDA_CHECK(cudaMallocHost(&d_scores, sizeof(float) * m_OutputSize * batchSize));
CUDA_CHECK(cudaMemsetAsync((float*)d_scores, 0, sizeof(float) * m_OutputSize * batchSize, stream));
void* d_boxes;
CUDA_CHECK(cudaMallocHost(&d_boxes, sizeof(float) * m_OutputSize * 4 * batchSize));
CUDA_CHECK(cudaMemsetAsync((float*)d_boxes, 0, sizeof(float) * m_OutputSize * 4 * batchSize, stream));
void* d_classes;
CUDA_CHECK(cudaMallocHost(&d_classes, sizeof(int) * m_OutputSize * batchSize));
CUDA_CHECK(cudaMemsetAsync((float*)d_classes, 0, sizeof(int) * m_OutputSize * batchSize, stream));
uint yoloTensorsSize = m_YoloTensors.size();
for (uint i = 0; i < yoloTensorsSize; ++i)
{
TensorInfo& curYoloTensor = m_YoloTensors.at(i);
uint numBBoxes = curYoloTensor.numBBoxes;
float scaleXY = curYoloTensor.scaleXY;
uint gridSizeX = curYoloTensor.gridSizeX;
uint gridSizeY = curYoloTensor.gridSizeY;
std::vector<float> anchors = curYoloTensor.anchors;
std::vector<int> mask = curYoloTensor.mask;
void* v_anchors;
void* v_mask;
if (anchors.size() > 0) {
float* f_anchors = anchors.data();
CUDA_CHECK(cudaMallocHost(&v_anchors, sizeof(float) * anchors.size()));
CUDA_CHECK(cudaMemcpy(v_anchors, f_anchors, sizeof(float) * anchors.size(), cudaMemcpyHostToDevice));
}
if (mask.size() > 0) {
int* f_mask = mask.data();
CUDA_CHECK(cudaMallocHost(&v_mask, sizeof(int) * mask.size()));
CUDA_CHECK(cudaMemcpy(v_mask, f_mask, sizeof(int) * mask.size(), cudaMemcpyHostToDevice));
}
uint64_t inputSize = gridSizeX * gridSizeY * (numBBoxes * (4 + 1 + m_NumClasses));
if (m_Type == 2) { // YOLOR incorrect param: scale_x_y = 2.0
CUDA_CHECK(cudaYoloLayer_r(
inputs[i], d_indexes, d_scores, d_boxes, d_classes, countData, batchSize, inputSize, m_OutputSize,
m_ScoreThreshold, m_NetWidth, m_NetHeight, gridSizeX, gridSizeY, m_NumClasses, numBBoxes, 2.0, v_anchors,
v_mask, stream));
}
else if (m_Type == 1) {
if (m_NewCoords) {
CUDA_CHECK(cudaYoloLayer_nc(
inputs[i], d_indexes, d_scores, d_boxes, d_classes, countData, batchSize, inputSize, m_OutputSize,
m_ScoreThreshold, m_NetWidth, m_NetHeight, gridSizeX, gridSizeY, m_NumClasses, numBBoxes, scaleXY,
v_anchors, v_mask, stream));
}
else {
CUDA_CHECK(cudaYoloLayer(
inputs[i], d_indexes, d_scores, d_boxes, d_classes, countData, batchSize, inputSize, m_OutputSize,
m_ScoreThreshold, m_NetWidth, m_NetHeight, gridSizeX, gridSizeY, m_NumClasses, numBBoxes, scaleXY,
v_anchors, v_mask, stream));
}
}
else {
CUDA_CHECK(cudaYoloLayer(
inputs[0], outputs[0], batchSize, m_NetWidth, m_NetHeight, m_GridSizeX, m_GridSizeY,
m_NumClasses, m_NumBBoxes, m_OutputSize, m_ScaleXY, p_Anchors, p_Mask, stream));
void* softmax;
CUDA_CHECK(cudaMallocHost(&softmax, sizeof(float) * inputSize * batchSize));
CUDA_CHECK(cudaMemsetAsync((float*)softmax, 0, sizeof(float) * inputSize * batchSize));
CUDA_CHECK(cudaRegionLayer(
inputs[i], softmax, d_indexes, d_scores, d_boxes, d_classes, countData, batchSize, inputSize, m_OutputSize,
m_ScoreThreshold, m_NetWidth, m_NetHeight, gridSizeX, gridSizeY, m_NumClasses, numBBoxes, v_anchors,
stream));
CUDA_CHECK(cudaFreeHost(softmax));
}
if (anchors.size() > 0) {
CUDA_CHECK(cudaFreeHost(v_anchors));
}
if (mask.size() > 0) {
CUDA_CHECK(cudaFreeHost(v_mask));
}
}
else {
void* softmax;
cudaMallocHost(&softmax, sizeof(outputs[0]));
cudaMemcpy(softmax, outputs[0], sizeof(outputs[0]), cudaMemcpyHostToDevice);
CUDA_CHECK(cudaRegionLayer(
inputs[0], outputs[0], softmax, batchSize, m_NetWidth, m_NetHeight, m_GridSizeX, m_GridSizeY,
m_NumClasses, m_NumBBoxes, m_OutputSize, p_Anchors, stream));
CUDA_CHECK(sortDetections(
d_indexes, d_scores, d_boxes, d_classes, bboxData, scoreData, countData, batchSize, m_OutputSize, m_TopK,
m_NumClasses, stream));
CUDA_CHECK(cudaFreeHost(d_indexes));
CUDA_CHECK(cudaFreeHost(d_scores));
CUDA_CHECK(cudaFreeHost(d_boxes));
CUDA_CHECK(cudaFreeHost(d_classes));
CUDA_CHECK(cudaFreeHost(softmax));
}
return 0;
}
@@ -232,18 +281,28 @@ size_t YoloLayer::getSerializationSize() const noexcept
{
size_t totalSize = 0;
totalSize += sizeof(m_NumBBoxes);
totalSize += sizeof(m_NumClasses);
totalSize += sizeof(m_NetWidth);
totalSize += sizeof(m_NetHeight);
totalSize += sizeof(m_GridSizeX);
totalSize += sizeof(m_GridSizeY);
totalSize += sizeof(m_Type);
totalSize += sizeof(m_NumClasses);
totalSize += sizeof(m_NewCoords);
totalSize += sizeof(m_ScaleXY);
totalSize += sizeof(m_OutputSize);
totalSize += sizeof(uint) + sizeof(m_Anchors[0]) * m_Anchors.size();
totalSize += sizeof(uint) + sizeof(m_Mask[0]) * m_Mask.size();
totalSize += sizeof(m_Type);
totalSize += sizeof(m_TopK);
totalSize += sizeof(m_ScoreThreshold);
uint yoloTensorsSize = m_YoloTensors.size();
totalSize += sizeof(yoloTensorsSize);
for (uint i = 0; i < yoloTensorsSize; ++i)
{
const TensorInfo& curYoloTensor = m_YoloTensors.at(i);
totalSize += sizeof(curYoloTensor.gridSizeX);
totalSize += sizeof(curYoloTensor.gridSizeY);
totalSize += sizeof(curYoloTensor.numBBoxes);
totalSize += sizeof(curYoloTensor.scaleXY);
totalSize += sizeof(uint) + sizeof(curYoloTensor.anchors[0]) * curYoloTensor.anchors.size();
totalSize += sizeof(uint) + sizeof(curYoloTensor.mask[0]) * curYoloTensor.mask.size();
}
return totalSize;
}
@@ -252,35 +311,46 @@ void YoloLayer::serialize(void* buffer) const noexcept
{
char *d = static_cast<char*>(buffer);
write(d, m_NumBBoxes);
write(d, m_NumClasses);
write(d, m_NetWidth);
write(d, m_NetHeight);
write(d, m_GridSizeX);
write(d, m_GridSizeY);
write(d, m_Type);
write(d, m_NumClasses);
write(d, m_NewCoords);
write(d, m_ScaleXY);
write(d, m_OutputSize);
write(d, m_Type);
write(d, m_TopK);
write(d, m_ScoreThreshold);
uint anchorsSize = m_Anchors.size();
write(d, anchorsSize);
for (uint i = 0; i < anchorsSize; i++) {
write(d, m_Anchors[i]);
}
uint yoloTensorsSize = m_YoloTensors.size();
write(d, yoloTensorsSize);
for (uint i = 0; i < yoloTensorsSize; ++i)
{
const TensorInfo& curYoloTensor = m_YoloTensors.at(i);
write(d, curYoloTensor.gridSizeX);
write(d, curYoloTensor.gridSizeY);
write(d, curYoloTensor.numBBoxes);
write(d, curYoloTensor.scaleXY);
uint maskSize = m_Mask.size();
write(d, maskSize);
for (uint i = 0; i < maskSize; i++) {
write(d, m_Mask[i]);
uint anchorsSize = curYoloTensor.anchors.size();
write(d, anchorsSize);
for (uint j = 0; j < anchorsSize; ++j)
{
write(d, curYoloTensor.anchors[j]);
}
uint maskSize = curYoloTensor.mask.size();
write(d, maskSize);
for (uint j = 0; j < maskSize; ++j)
{
write(d, curYoloTensor.mask[j]);
}
}
}
nvinfer1::IPluginV2* YoloLayer::clone() const noexcept
{
return new YoloLayer (
m_NumBBoxes, m_NumClasses, m_NetWidth, m_NetHeight, m_GridSizeX, m_GridSizeY, m_Type,
m_NewCoords, m_ScaleXY, m_Anchors, m_Mask);
m_NetWidth, m_NetHeight, m_NumClasses, m_NewCoords, m_YoloTensors, m_OutputSize, m_Type, m_TopK,
m_ScoreThreshold);
}
REGISTER_TENSORRT_PLUGIN(YoloLayerPluginCreator);

View File

@@ -36,14 +36,16 @@
#include "NvInferPlugin.h"
#define CUDA_CHECK(status) \
{ \
if (status != 0) \
{ \
std::cout << "CUDA failure: " << cudaGetErrorString(status) << " in file " << __FILE__ \
<< " at line " << __LINE__ << std::endl; \
abort(); \
} \
#include "yolo.h"
#define CUDA_CHECK(status) \
{ \
if (status != 0) \
{ \
std::cout << "CUDA failure: " << cudaGetErrorString(status) << " in file " << __FILE__ << " at line " \
<< __LINE__ << std::endl; \
abort(); \
} \
}
namespace
@@ -56,15 +58,17 @@ class YoloLayer : public nvinfer1::IPluginV2
{
public:
YoloLayer (const void* data, size_t length);
YoloLayer (
const uint& numBBoxes, const uint& numClasses, const uint& netWidth, const uint& netHeight,
const uint& gridSizeX, const uint& gridSizeY, const uint& modelType, const uint& newCoords,
const float& scaleXY, const std::vector<float> anchors,
const std::vector<int> mask);
~YoloLayer ();
const uint& netWidth, const uint& netHeight, const uint& numClasses, const uint& newCoords,
const std::vector<TensorInfo>& yoloTensors, const uint64_t& outputSize, const uint& modelType, const uint& topK,
const float& scoreThreshold);
const char* getPluginType () const noexcept override { return YOLOLAYER_PLUGIN_NAME; }
const char* getPluginVersion () const noexcept override { return YOLOLAYER_PLUGIN_VERSION; }
int getNbOutputs () const noexcept override { return 1; }
int getNbOutputs () const noexcept override { return 2; }
nvinfer1::Dims getOutputDimensions (
int index, const nvinfer1::Dims* inputs,
@@ -74,53 +78,59 @@ public:
nvinfer1::DataType type, nvinfer1::PluginFormat format) const noexcept override;
void configureWithFormat (
const nvinfer1::Dims* inputDims, int nbInputs,
const nvinfer1::Dims* outputDims, int nbOutputs,
const nvinfer1::Dims* inputDims, int nbInputs, const nvinfer1::Dims* outputDims, int nbOutputs,
nvinfer1::DataType type, nvinfer1::PluginFormat format, int maxBatchSize) noexcept override;
int initialize () noexcept override { return 0; }
void terminate () noexcept override {}
size_t getWorkspaceSize (int maxBatchSize) const noexcept override { return 0; }
size_t getWorkspaceSize (int maxBatchSize) const noexcept override {
return maxBatchSize * sizeof(int);
}
int32_t enqueue (
int32_t batchSize, void const* const* inputs, void* const* outputs,
void* workspace, cudaStream_t stream) noexcept override;
int batchSize, void const* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream)
noexcept override;
size_t getSerializationSize() const noexcept override;
void serialize (void* buffer) const noexcept override;
void destroy () noexcept override { delete this; }
nvinfer1::IPluginV2* clone() const noexcept override;
void setPluginNamespace (const char* pluginNamespace) noexcept override {
m_Namespace = pluginNamespace;
}
virtual const char* getPluginNamespace () const noexcept override {
return m_Namespace.c_str();
}
private:
std::string m_Namespace {""};
uint m_NumBBoxes {0};
uint m_NumClasses {0};
uint m_NetWidth {0};
uint m_NetHeight {0};
uint m_GridSizeX {0};
uint m_GridSizeY {0};
uint m_Type {0};
uint m_NumClasses {0};
uint m_NewCoords {0};
float m_ScaleXY {0};
std::vector<float> m_Anchors;
std::vector<int> m_Mask;
std::vector<TensorInfo> m_YoloTensors;
uint64_t m_OutputSize {0};
void* p_Anchors;
void* p_Mask;
uint m_Type {0};
uint m_TopK {0};
float m_ScoreThreshold {0};
};
class YoloLayerPluginCreator : public nvinfer1::IPluginCreator
{
public:
YoloLayerPluginCreator () {}
~YoloLayerPluginCreator () {}
const char* getPluginName () const noexcept override { return YOLOLAYER_PLUGIN_NAME; }
const char* getPluginVersion () const noexcept override { return YOLOLAYER_PLUGIN_VERSION; }
const nvinfer1::PluginFieldCollection* getFieldNames() noexcept override {
@@ -153,7 +163,6 @@ private:
std::string m_Namespace {""};
};
extern uint kNUM_BBOXES;
extern uint kNUM_CLASSES;
#endif // __YOLO_PLUGINS__

112
readme.md
View File

@@ -25,7 +25,7 @@ NVIDIA DeepStream SDK 6.1 / 6.0.1 / 6.0 configuration for YOLO models
* YOLOR native support
* Models benchmarks (**outdated**)
* **GPU YOLO Decoder (moved from CPU to GPU to get better performance)** [#138](https://github.com/marcoslucianops/DeepStream-Yolo/issues/138)
* **Improved NMS** [#142](https://github.com/marcoslucianops/DeepStream-Yolo/issues/142)
* **GPU Batched NMS** [#142](https://github.com/marcoslucianops/DeepStream-Yolo/issues/142)
##
@@ -38,6 +38,7 @@ NVIDIA DeepStream SDK 6.1 / 6.0.1 / 6.0 configuration for YOLO models
* [Basic usage](#basic-usage)
* [YOLOv5 usage](#yolov5-usage)
* [YOLOR usage](#yolor-usage)
* [NMS configuration](#nms-configuration)
* [INT8 calibration](#int8-calibration)
* [Using your custom model](docs/customModels.md)
@@ -101,67 +102,7 @@ NVIDIA DeepStream SDK 6.1 / 6.0.1 / 6.0 configuration for YOLO models
### Benchmarks
```
nms-iou-threshold = 0.6
pre-cluster-threshold = 0.001 (mAP eval) / 0.25 (FPS measurement)
batch-size = 1
valid = val2017 (COCO) - 1000 random images for INT8 calibration
sample = 1920x1080 video
NOTE: Used maintain-aspect-ratio=1 in config_infer file for YOLOv4 (with letter_box=1), YOLOv5 and YOLOR models.
```
#### NVIDIA GTX 1050 4GB (Mobile)
##### YOLOR-CSP performance comparison
| | DeepStream | PyTorch |
|:---------------------:|:----------:|:-------:|
| FPS (without display) | 13.32 | 10.07 |
| FPS (with display) | 12.63 | 9.41 |
##### YOLOv5n performance comparison
| | DeepStream | TensorRTx | Ultralytics |
|:---------------------:|:----------:|:---------:|:-----------:|
| FPS (without display) | 110.25 | 87.42 | 97.19 |
| FPS (with display) | 105.62 | 73.07 | 50.37 |
<details><summary>More</summary>
<br>
| DeepStream | Precision | Resolution | IoU=0.5:0.95 | IoU=0.5 | IoU=0.75 | FPS<br />(without display) |
|:------------------:|:---------:|:----------:|:------------:|:-------:|:--------:|:--------------------------:|
| YOLOR-P6 | FP32 | 1280 | 0.478 | 0.663 | 0.519 | 5.53 |
| YOLOR-CSP-X* | FP32 | 640 | 0.473 | 0.664 | 0.513 | 7.59 |
| YOLOR-CSP-X | FP32 | 640 | 0.470 | 0.661 | 0.507 | 7.52 |
| YOLOR-CSP* | FP32 | 640 | 0.459 | 0.652 | 0.496 | 13.28 |
| YOLOR-CSP | FP32 | 640 | 0.449 | 0.639 | 0.483 | 13.32 |
| YOLOv5x6 6.0 | FP32 | 1280 | 0.504 | 0.681 | 0.547 | 2.22 |
| YOLOv5l6 6.0 | FP32 | 1280 | 0.492 | 0.670 | 0.535 | 4.05 |
| YOLOv5m6 6.0 | FP32 | 1280 | 0.463 | 0.642 | 0.504 | 7.54 |
| YOLOv5s6 6.0 | FP32 | 1280 | 0.394 | 0.572 | 0.424 | 18.64 |
| YOLOv5n6 6.0 | FP32 | 1280 | 0.294 | 0.452 | 0.314 | 26.94 |
| YOLOv5x 6.0 | FP32 | 640 | 0.469 | 0.654 | 0.509 | 8.24 |
| YOLOv5l 6.0 | FP32 | 640 | 0.450 | 0.634 | 0.487 | 14.96 |
| YOLOv5m 6.0 | FP32 | 640 | 0.415 | 0.601 | 0.448 | 28.30 |
| YOLOv5s 6.0 | FP32 | 640 | 0.334 | 0.516 | 0.355 | 63.55 |
| YOLOv5n 6.0 | FP32 | 640 | 0.250 | 0.417 | 0.260 | 110.25 |
| YOLOv4-P6 | FP32 | 1280 | 0.499 | 0.685 | 0.542 | 2.57 |
| YOLOv4-P5 | FP32 | 896 | 0.472 | 0.659 | 0.513 | 5.48 |
| YOLOv4-CSP-X-SWISH | FP32 | 640 | 0.473 | 0.664 | 0.513 | 7.51 |
| YOLOv4-CSP-SWISH | FP32 | 640 | 0.459 | 0.652 | 0.496 | 13.13 |
| YOLOv4x-MISH | FP32 | 640 | 0.459 | 0.650 | 0.495 | 7.53 |
| YOLOv4-CSP | FP32 | 640 | 0.440 | 0.632 | 0.474 | 13.19 |
| YOLOv4 | FP32 | 608 | 0.498 | 0.740 | 0.549 | 12.18 |
| YOLOv4-Tiny | FP32 | 416 | 0.215 | 0.403 | 0.206 | 201.20 |
| YOLOv3-SPP | FP32 | 608 | 0.411 | 0.686 | 0.433 | 12.22 |
| YOLOv3-Tiny-PRN | FP32 | 416 | 0.167 | 0.382 | 0.125 | 277.14 |
| YOLOv3 | FP32 | 608 | 0.377 | 0.672 | 0.385 | 12.51 |
| YOLOv3-Tiny | FP32 | 416 | 0.095 | 0.203 | 0.079 | 218.42 |
| YOLOv2 | FP32 | 608 | 0.286 | 0.541 | 0.273 | 25.28 |
| YOLOv2-Tiny | FP32 | 416 | 0.102 | 0.258 | 0.061 | 231.36 |
</details>
New tests comming soon.
##
@@ -221,9 +162,11 @@ wget https://us.download.nvidia.com/tesla/510.47.03/NVIDIA-Linux-x86_64-510.47.0
* Run
```
sudo sh NVIDIA-Linux-x86_64-510.47.03.run --silent --disable-nouveau
sudo sh NVIDIA-Linux-x86_64-510.47.03.run --silent --disable-nouveau --dkms --install-libglvnd
```
**NOTE**: This step will disable the nouveau drivers.
* Reboot
```
@@ -233,7 +176,7 @@ sudo reboot
* Install
```
sudo sh NVIDIA-Linux-x86_64-510.47.03.run --silent --dkms --install-libglvnd
sudo sh NVIDIA-Linux-x86_64-510.47.03.run --silent --disable-nouveau --dkms --install-libglvnd
```
**NOTE**: If you are using a laptop with NVIDIA Optimius, run
@@ -326,7 +269,7 @@ sudo apt install libssl1.0.0 libgstreamer1.0-0 gstreamer1.0-tools gstreamer1.0-p
sudo apt-get install linux-headers-$(uname -r)
```
**NOTE**: Install DKMS only if you are using the default Ubuntu kernel
**NOTE**: Install DKMS only if you are using the default Ubuntu kernel.
```
sudo apt-get install dkms
@@ -369,9 +312,11 @@ wget https://us.download.nvidia.com/tesla/470.129.06/NVIDIA-Linux-x86_64-470.129
* Run
```
sudo sh NVIDIA-Linux-x86_64-470.129.06.run --silent --disable-nouveau
sudo sh NVIDIA-Linux-x86_64-470.129.06.run --silent --disable-nouveau --dkms --install-libglvnd
```
**NOTE**: This step will disable the nouveau drivers.
* Reboot
```
@@ -381,7 +326,7 @@ sudo reboot
* Install
```
sudo sh NVIDIA-Linux-x86_64-470.129.06.run --silent --dkms --install-libglvnd
sudo sh NVIDIA-Linux-x86_64-470.129.06.run --silent --disable-nouveau --dkms --install-libglvnd
```
**NOTE**: If you are using a laptop with NVIDIA Optimius, run
@@ -519,11 +464,6 @@ network-mode=0
# Number of classes in label file
num-detected-classes=80
...
[class-attrs-all]
# IOU threshold
nms-iou-threshold=0.45
# Score threshold
pre-cluster-threshold=0.25
```
#### 5. Run
@@ -542,6 +482,7 @@ gpu-id=0
gie-unique-id=1
nvbuf-memory-type=0
config-file=config_infer_primary_yoloV2.txt
...
```
##
@@ -618,11 +559,6 @@ network-mode=0
# Number of classes in label file
num-detected-classes=80
...
[class-attrs-all]
# IOU threshold
nms-iou-threshold=0.45
# Score threshold
pre-cluster-threshold=0.25
```
#### 8. Change the deepstream_app_config.txt file
@@ -749,11 +685,6 @@ network-mode=0
# Number of classes in label file
num-detected-classes=80
...
[class-attrs-all]
# IOU threshold
nms-iou-threshold=0.5
# Score threshold
pre-cluster-threshold=0.25
```
#### 8. Change the deepstream_app_config.txt file
@@ -776,6 +707,23 @@ deepstream-app -c deepstream_app_config.txt
##
### NMS Configuration
To change the `iou-threshold`, `score-threshold` and `topk` values, modify the `config_nms.txt` file and regenerate the model engine file.
**NOTE**: Lower `topk` values will result in more performance.
**NOTE**: Make sure to set cluster-mode=4 and pre-cluster-threshold=0 in config_infer file.
```
[property]
iou-threshold=0.45
score-threshold=0.25
topk=300
```
##
### INT8 calibration
#### 1. Install OpenCV

View File

@@ -144,6 +144,7 @@ with open(cfg_file, "w") as c:
c.write("width=%d\n" % model_width)
c.write("height=%d\n" % model_height)
c.write("channels=%d\n" % model_channels)
c.write("letter_box=1\n")
nc = 0
depth_multiple = 0
width_multiple = 0