GPU Batched NMS
@@ -14,12 +14,11 @@ interval=0
 gie-unique-id=1
 process-mode=1
 network-type=0
-cluster-mode=2
+cluster-mode=4
 maintain-aspect-ratio=0
 parse-bbox-func-name=NvDsInferParseYolo
 custom-lib-path=nvdsinfer_custom_impl_Yolo/libnvdsinfer_custom_impl_Yolo.so
 engine-create-func-name=NvDsInferYoloCudaEngineGet
 
 [class-attrs-all]
-nms-iou-threshold=0.45
-pre-cluster-threshold=0.25
+pre-cluster-threshold=0
@@ -14,12 +14,11 @@ interval=0
 gie-unique-id=1
 process-mode=1
 network-type=0
-cluster-mode=2
+cluster-mode=4
 maintain-aspect-ratio=0
 parse-bbox-func-name=NvDsInferParseYolo
 custom-lib-path=nvdsinfer_custom_impl_Yolo/libnvdsinfer_custom_impl_Yolo.so
 engine-create-func-name=NvDsInferYoloCudaEngineGet
 
 [class-attrs-all]
-nms-iou-threshold=0.45
-pre-cluster-threshold=0.25
+pre-cluster-threshold=0
@@ -14,12 +14,11 @@ interval=0
 gie-unique-id=1
 process-mode=1
 network-type=0
-cluster-mode=2
+cluster-mode=4
 maintain-aspect-ratio=1
 parse-bbox-func-name=NvDsInferParseYolo
 custom-lib-path=nvdsinfer_custom_impl_Yolo/libnvdsinfer_custom_impl_Yolo.so
 engine-create-func-name=NvDsInferYoloCudaEngineGet
 
 [class-attrs-all]
-nms-iou-threshold=0.45
-pre-cluster-threshold=0.25
+pre-cluster-threshold=0
@@ -14,12 +14,11 @@ interval=0
 gie-unique-id=1
 process-mode=1
 network-type=0
-cluster-mode=2
+cluster-mode=4
 maintain-aspect-ratio=1
 parse-bbox-func-name=NvDsInferParseYolo
 custom-lib-path=nvdsinfer_custom_impl_Yolo/libnvdsinfer_custom_impl_Yolo.so
 engine-create-func-name=NvDsInferYoloCudaEngineGet
 
 [class-attrs-all]
-nms-iou-threshold=0.5
-pre-cluster-threshold=0.25
+pre-cluster-threshold=0
config_nms.txt (new file, 4 lines)
@@ -0,0 +1,4 @@
+[property]
+iou-threshold=0.45
+score-threshold=0.25
+topk=300
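The keys in this new file are read by the parseConfigNMSBlocks() routine added later in this diff and feed the BatchedNMS plugin. As a minimal sketch of that key=value parsing, assuming a standalone helper (parseNmsConfig and NmsConfig are hypothetical names, not functions from this repository):

```cpp
// Hedged sketch: read config_nms.txt into the three NMS parameters.
#include <cassert>
#include <fstream>
#include <map>
#include <string>

struct NmsConfig { float iouThreshold; float scoreThreshold; unsigned int topK; };

inline NmsConfig parseNmsConfig(const std::string& path)
{
    std::ifstream file(path);
    assert(file.good() && "config_nms.txt not found next to the .wts file");

    std::map<std::string, std::string> kv;
    std::string line;
    while (std::getline(file, line)) {
        if (line.empty() || line[0] == '[' || line[0] == '#')
            continue;  // skip the [property] header and comments
        std::size_t eq = line.find('=');
        if (eq != std::string::npos)
            kv[line.substr(0, eq)] = line.substr(eq + 1);
    }

    NmsConfig cfg;
    cfg.iouThreshold = std::stof(kv.at("iou-threshold"));
    cfg.scoreThreshold = std::stof(kv.at("score-threshold"));
    cfg.topK = static_cast<unsigned int>(std::stoul(kv.at("topk")));
    return cfg;
}
```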
@@ -272,24 +272,6 @@ interval=0
 
 ##
 
-#### nms-iou-threshold
-
-```
-# IOU threshold
-nms-iou-threshold=0.6
-```
-
-##
-
-#### pre-cluster-threshold
-
-```
-# Socre threshold
-pre-cluster-threshold=0.25
-```
-
-##
-
 ### Testing model
 
 ```
@@ -55,7 +55,6 @@ SRCFILES:= nvdsinfer_yolo_engine.cpp \
 layers/convolutional_layer.cpp \
 layers/implicit_layer.cpp \
 layers/channels_layer.cpp \
-layers/dropout_layer.cpp \
 layers/shortcut_layer.cpp \
 layers/route_layer.cpp \
 layers/upsample_layer.cpp \
@@ -67,7 +66,8 @@ SRCFILES:= nvdsinfer_yolo_engine.cpp \
 yoloForward.cu \
 yoloForward_v2.cu \
 yoloForward_nc.cu \
-yoloForward_r.cu
+yoloForward_r.cu \
+sortDetections.cu
 
 ifeq ($(OPENCV), 1)
 SRCFILES+= calibrator.cpp
@@ -1,15 +0,0 @@
-/*
- * Created by Marcos Luciano
- * https://www.github.com/marcoslucianops
- */
-
-#include "dropout_layer.h"
-
-nvinfer1::ILayer* dropoutLayer(
-    float probability,
-    nvinfer1::ITensor* input,
-    nvinfer1::INetworkDefinition* network)
-{
-    nvinfer1::ILayer* output;
-    return output;
-}
@@ -1,16 +0,0 @@
-/*
- * Created by Marcos Luciano
- * https://www.github.com/marcoslucianops
- */
-
-#ifndef __DROPOUT_LAYER_H__
-#define __DROPOUT_LAYER_H__
-
-#include "NvInfer.h"
-
-nvinfer1::ILayer* dropoutLayer(
-    float probability,
-    nvinfer1::ITensor* input,
-    nvinfer1::INetworkDefinition* network);
-
-#endif
@@ -63,15 +63,13 @@ static bool getYoloNetworkInfo (NetworkInfo &networkInfo, const NvDsInferContext
 
     if (networkInfo.configFilePath.empty() ||
         networkInfo.wtsFilePath.empty()) {
-        std::cerr << "YOLO config file or weights file is not specified"
-            << std::endl;
+        std::cerr << "YOLO config file or weights file is not specified\n" << std::endl;
         return false;
     }
 
     if (!fileExists(networkInfo.configFilePath) ||
         !fileExists(networkInfo.wtsFilePath)) {
-        std::cerr << "YOLO config file or weights file is not exist"
-            << std::endl;
+        std::cerr << "YOLO config file or weights file is not exist\n" << std::endl;
         return false;
     }
 
@@ -38,15 +38,15 @@ extern "C" bool NvDsInferParseYolo(
     std::vector<NvDsInferParseObjectInfo>& objectList);
 
 static NvDsInferParseObjectInfo convertBBox(
-    const float& bx, const float& by, const float& bw,
-    const float& bh, const uint& netW, const uint& netH)
+    const float& bx1, const float& by1, const float& bx2,
+    const float& by2, const uint& netW, const uint& netH)
 {
     NvDsInferParseObjectInfo b;
 
-    float x1 = bx - bw / 2;
-    float y1 = by - bh / 2;
-    float x2 = x1 + bw;
-    float y2 = y1 + bh;
+    float x1 = bx1;
+    float y1 = by1;
+    float x2 = bx2;
+    float y2 = by2;
 
     x1 = clamp(x1, 0, netW);
     y1 = clamp(y1, 0, netH);
@@ -62,11 +62,11 @@ static NvDsInferParseObjectInfo convertBBox(
 }
 
 static void addBBoxProposal(
-    const float bx, const float by, const float bw, const float bh,
+    const float bx1, const float by1, const float bx2, const float by2,
     const uint& netW, const uint& netH, const int maxIndex,
     const float maxProb, std::vector<NvDsInferParseObjectInfo>& binfo)
 {
-    NvDsInferParseObjectInfo bbi = convertBBox(bx, by, bw, bh, netW, netH);
+    NvDsInferParseObjectInfo bbi = convertBBox(bx1, by1, bx2, by2, netW, netH);
     if (bbi.width < 1 || bbi.height < 1) return;
 
     bbi.detectionConfidence = maxProb;
@@ -75,34 +75,25 @@ static void addBBoxProposal(
 }
 
 static std::vector<NvDsInferParseObjectInfo> decodeYoloTensor(
-    const float* detections,
-    const uint gridSizeW, const uint gridSizeH, const uint numBBoxes,
-    const uint numOutputClasses, const uint& netW, const uint& netH)
+    const int* counts, const float* boxes,
+    const float* scores, const float* classes,
+    const uint& netW, const uint& netH)
 {
     std::vector<NvDsInferParseObjectInfo> binfo;
-    for (uint y = 0; y < gridSizeH; ++y) {
-        for (uint x = 0; x < gridSizeW; ++x) {
-            for (uint b = 0; b < numBBoxes; ++b)
-            {
-                const int numGridCells = gridSizeH * gridSizeW;
-                const int bbindex = y * gridSizeW + x;
-
-                const float bx
-                    = detections[bbindex + numGridCells * (b * (5 + numOutputClasses) + 0)];
-                const float by
-                    = detections[bbindex + numGridCells * (b * (5 + numOutputClasses) + 1)];
-                const float bw
-                    = detections[bbindex + numGridCells * (b * (5 + numOutputClasses) + 2)];
-                const float bh
-                    = detections[bbindex + numGridCells * (b * (5 + numOutputClasses) + 3)];
-                const float maxProb
-                    = detections[bbindex + numGridCells * (b * (5 + numOutputClasses) + 4)];
-                const int maxIndex
-                    = detections[bbindex + numGridCells * (b * (5 + numOutputClasses) + 5)];
-
-                addBBoxProposal(bx, by, bw, bh, netW, netH, maxIndex, maxProb, binfo);
-            }
-        }
+
+    uint numBoxes = counts[0];
+    for (uint b = 0; b < numBoxes; ++b)
+    {
+        float bx1 = boxes[b * 4 + 0];
+        float by1 = boxes[b * 4 + 1];
+        float bx2 = boxes[b * 4 + 2];
+        float by2 = boxes[b * 4 + 3];
+
+        float maxProb = scores[b];
+        int maxIndex = classes[b];
+
+        addBBoxProposal(bx1, by1, bx2, by2, netW, netH, maxIndex, maxProb, binfo);
     }
     return binfo;
 }
@@ -112,7 +103,6 @@ static bool NvDsInferParseCustomYolo(
     NvDsInferNetworkInfo const& networkInfo,
     NvDsInferParseDetectionParams const& detectionParams,
     std::vector<NvDsInferParseObjectInfo>& objectList,
-    const uint &numBBoxes,
     const uint &numClasses)
 {
     if (outputLayersInfo.empty())
@@ -130,18 +120,17 @@ static bool NvDsInferParseCustomYolo(
 
     std::vector<NvDsInferParseObjectInfo> objects;
 
-    for (uint idx = 0; idx < outputLayersInfo.size(); ++idx)
+    for (uint idx = 0; idx < outputLayersInfo.size() / 4; ++idx)
     {
-        const NvDsInferLayerInfo &layer = outputLayersInfo[idx];
-        assert(layer.inferDims.numDims == 3);
-        const uint gridSizeH = layer.inferDims.d[1];
-        const uint gridSizeW = layer.inferDims.d[2];
+        const NvDsInferLayerInfo &counts = outputLayersInfo[idx * 4 + 0];
+        const NvDsInferLayerInfo &boxes = outputLayersInfo[idx * 4 + 1];
+        const NvDsInferLayerInfo &scores = outputLayersInfo[idx * 4 + 2];
+        const NvDsInferLayerInfo &classes = outputLayersInfo[idx * 4 + 3];
 
         std::vector<NvDsInferParseObjectInfo> outObjs =
             decodeYoloTensor(
-                (const float*)(layer.buffer),
-                gridSizeW, gridSizeH, numBBoxes, numClasses,
+                (const int*)(counts.buffer), (const float*)(boxes.buffer),
+                (const float*)(scores.buffer), (const float*)(classes.buffer),
                 networkInfo.width, networkInfo.height);
 
         objects.insert(objects.end(), outObjs.begin(), outObjs.end());
@@ -158,11 +147,10 @@ extern "C" bool NvDsInferParseYolo(
     NvDsInferParseDetectionParams const& detectionParams,
     std::vector<NvDsInferParseObjectInfo>& objectList)
 {
-    uint numBBoxes = kNUM_BBOXES;
-    uint numClasses = kNUM_CLASSES;
+    int num_classes = kNUM_CLASSES;
 
     return NvDsInferParseCustomYolo (
-        outputLayersInfo, networkInfo, detectionParams, objectList, numBBoxes, numClasses);
+        outputLayersInfo, networkInfo, detectionParams, objectList, num_classes);
 }
 
 CHECK_CUSTOM_PARSE_FUNC_PROTOTYPE(NvDsInferParseYolo);
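The rewritten parser above no longer decodes raw grid outputs; it expects the engine to expose detections in groups of four bound tensors (counts, boxes, scores, classes), the layout produced by the BatchedNMS stage added in this commit. A hedged sketch of that per-frame layout (names and the fixed group-of-four assumption mirror the diff; topK comes from config_nms.txt):

```cpp
// Hedged sketch of the per-frame output layout the new parser consumes.
struct BatchedNmsOutputs {
    const int*   numDetections; // 1 int: number of valid boxes in this frame
    const float* boxes;         // topK * 4 floats: x1, y1, x2, y2 in network pixels
    const float* scores;        // topK floats, already sorted by the NMS plugin
    const float* classes;       // topK floats holding integer class ids
};

inline int validBoxes(const BatchedNmsOutputs& out) { return out.numDetections[0]; }
```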
nvdsinfer_custom_impl_Yolo/sortDetections.cu (new file, 84 lines)
@@ -0,0 +1,84 @@
+/*
+ * Created by Marcos Luciano
+ * https://www.github.com/marcoslucianops
+ */
+
+#include <cub/cub.cuh>
+
+__global__ void sortOutput(
+    int* d_indexes, float* d_scores, float* d_boxes, int* d_classes, float* bboxData, float* scoreData,
+    const uint numOutputClasses)
+{
+    uint x_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+    int index = d_indexes[x_id];
+    int maxIndex = d_classes[index];
+    bboxData[x_id * 4 + 0] = d_boxes[index * 4 + 0];
+    bboxData[x_id * 4 + 1] = d_boxes[index * 4 + 1];
+    bboxData[x_id * 4 + 2] = d_boxes[index * 4 + 2];
+    bboxData[x_id * 4 + 3] = d_boxes[index * 4 + 3];
+    scoreData[x_id * numOutputClasses + maxIndex] = d_scores[x_id] - 1.f;
+}
+
+cudaError_t sortDetections(
+    void* d_indexes, void* d_scores, void* d_boxes, void* d_classes, void* bboxData, void* scoreData, void* countData,
+    const uint& batchSize, uint64_t& outputSize, uint& topK, const uint& numOutputClasses, cudaStream_t stream);
+
+cudaError_t sortDetections(
+    void* d_indexes, void* d_scores, void* d_boxes, void* d_classes, void* bboxData, void* scoreData, void* countData,
+    const uint& batchSize, uint64_t& outputSize, uint& topK, const uint& numOutputClasses, cudaStream_t stream)
+{
+    for (unsigned int batch = 0; batch < batchSize; ++batch)
+    {
+        int* _d_indexes = reinterpret_cast<int*>(d_indexes) + (batch * outputSize);
+        float* _d_scores = reinterpret_cast<float*>(d_scores) + (batch * outputSize);
+
+        int* _countData = reinterpret_cast<int*>(countData) + (batch);
+        int* _count = (int*)malloc(sizeof(int));
+        cudaMemcpy(_count, (int*)&_countData[0], sizeof(int), cudaMemcpyDeviceToHost);
+        int count = _count[0];
+
+        if (count == 0)
+        {
+            free(_count);
+            return cudaGetLastError();
+        }
+
+        size_t begin_bit = 0;
+        size_t end_bit = sizeof(float) * 8;
+
+        float *d_keys_out = NULL;
+        int *d_values_out = NULL;
+
+        cudaMalloc((void **)&d_keys_out, count * sizeof(float));
+        cudaMalloc((void **)&d_values_out, count * sizeof(int));
+
+        void* d_temp_storage = NULL;
+        size_t temp_storage_bytes = 0;
+
+        cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, _d_scores, d_keys_out, _d_indexes,
+            d_values_out, count, begin_bit, end_bit);
+
+        cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+        cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, _d_scores, d_keys_out, _d_indexes,
+            d_values_out, count, begin_bit, end_bit);
+
+        cudaMemcpy(_d_scores, d_keys_out, count * sizeof(float), cudaMemcpyDeviceToDevice);
+        cudaMemcpy(_d_indexes, d_values_out, count * sizeof(int), cudaMemcpyDeviceToDevice);
+
+        int threads_per_block = count < topK ? count : topK;
+
+        sortOutput<<<1, threads_per_block, 0, stream>>>(
+            _d_indexes, _d_scores, reinterpret_cast<float*>(d_boxes) + (batch * 4 * outputSize),
+            reinterpret_cast<int*>(d_classes) + (batch * outputSize), reinterpret_cast<float*>(bboxData) + (batch * topK),
+            reinterpret_cast<float*>(scoreData) + (batch * topK), numOutputClasses);
+
+        cudaFree(d_keys_out);
+        cudaFree(d_values_out);
+        cudaFree(d_temp_storage);
+
+        free(_count);
+    }
+    return cudaGetLastError();
+}
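The new file uses CUB's two-phase radix sort: the first call with a null workspace pointer only reports the required temporary-storage size, and only the second call performs the key/value sort. A minimal standalone sketch of that pattern, independent of this file's exact buffers (the helper name and arguments here are illustrative):

```cpp
// Hedged sketch of the CUB two-phase SortPairsDescending pattern used above.
#include <cub/cub.cuh>

inline cudaError_t sortScoresDescending(
    float* d_scores_in, float* d_scores_out, int* d_indexes_in, int* d_indexes_out, int count, cudaStream_t stream)
{
    void* d_temp_storage = nullptr;
    size_t temp_storage_bytes = 0;

    // Pass 1: size query only, nothing is sorted yet.
    cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes,
        d_scores_in, d_scores_out, d_indexes_in, d_indexes_out, count, 0, sizeof(float) * 8, stream);

    cudaMalloc(&d_temp_storage, temp_storage_bytes);

    // Pass 2: the actual sort (scores are the keys, indexes are the values).
    cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes,
        d_scores_in, d_scores_out, d_indexes_in, d_indexes_out, count, 0, sizeof(float) * 8, stream);

    cudaFree(d_temp_storage);
    return cudaGetLastError();
}
```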
@@ -57,7 +57,7 @@ bool fileExists(const std::string fileName, bool verbose)
 {
     if (!std::experimental::filesystem::exists(std::experimental::filesystem::path(fileName)))
     {
-        if (verbose) std::cout << "File does not exist: " << fileName << std::endl;
+        if (verbose) std::cout << "\nFile does not exist: " << fileName << std::endl;
         return false;
     }
     return true;
@@ -101,7 +101,7 @@ std::vector<float> loadWeights(const std::string weightsFilePath, const std::str
     assert(file.good());
     int32_t count;
     file >> count;
-    assert(count > 0 && "Invalid .wts file.");
+    assert(count > 0 && "\nInvalid .wts file.");
 
     uint32_t floatWeight;
     std::string name;
@@ -118,7 +118,7 @@ std::vector<float> loadWeights(const std::string weightsFilePath, const std::str
     }
 
     else {
-        std::cerr << "File " << weightsFilePath << " is not supported" << std::endl;
+        std::cerr << "\nFile " << weightsFilePath << " is not supported" << std::endl;
         std::abort();
     }
 
@@ -149,11 +149,19 @@ int getNumChannels(nvinfer1::ITensor* t)
     return d.d[0];
 }
 
-void printLayerInfo(std::string layerIndex, std::string layerName, std::string layerInput,
-    std::string layerOutput, std::string weightPtr)
+void printLayerInfo(
+    std::string layerIndex, std::string layerName, std::string layerInput, std::string layerOutput, std::string weightPtr)
 {
     std::cout << std::setw(6) << std::left << layerIndex << std::setw(24) << std::left << layerName;
     std::cout << std::setw(20) << std::left << layerInput << std::setw(20) << std::left
         << layerOutput;
     std::cout << std::setw(7) << std::left << weightPtr << std::endl;
 }
+
+std::string getAbsPath(std::string path)
+{
+    std::size_t found = path.rfind("/");
+    if (found != std::string::npos)
+        path.erase(path.begin() + found, path.end());
+    return path;
+}
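getAbsPath() strips everything after the last '/' and returns the directory that contains the given file; createEngine() later uses it to look for config_nms.txt next to the weights file. A hedged usage sketch (the declaration is repeated so the snippet is self-contained, and the example path is made up):

```cpp
#include <iostream>
#include <string>

// Declaration of the helper added in utils.h above.
std::string getAbsPath(std::string path);

int main()
{
    std::string wtsPath = "/opt/models/yolov4/yolov4.wts";  // hypothetical path
    std::cout << getAbsPath(wtsPath) + "/config_nms.txt" << std::endl;
    // prints: /opt/models/yolov4/config_nms.txt
    return 0;
}
```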
@@ -41,8 +41,8 @@ bool fileExists(const std::string fileName, bool verbose = true);
 std::vector<float> loadWeights(const std::string weightsFilePath, const std::string& networkType);
 std::string dimsToString(const nvinfer1::Dims d);
 int getNumChannels(nvinfer1::ITensor* t);
-void printLayerInfo(std::string layerIndex, std::string layerName, std::string layerInput,
-    std::string layerOutput, std::string weightPtr);
+void printLayerInfo(
+    std::string layerIndex, std::string layerName, std::string layerInput, std::string layerOutput, std::string weightPtr);
+std::string getAbsPath(std::string path);
 
 #endif
@@ -46,7 +46,12 @@ Yolo::Yolo(const NetworkInfo& networkInfo)
     m_InputC(0),
     m_InputSize(0),
     m_NumClasses(0),
-    m_LetterBox(0)
+    m_LetterBox(0),
+    m_NewCoords(0),
+    m_YoloCount(0),
+    m_IouThreshold(0),
+    m_ScoreThreshold(0),
+    m_TopK(0)
 {}
 
 Yolo::~Yolo()
@@ -61,59 +66,75 @@ nvinfer1::ICudaEngine *Yolo::createEngine (nvinfer1::IBuilder* builder, nvinfer1
     m_ConfigBlocks = parseConfigFile(m_ConfigFilePath);
     parseConfigBlocks();
 
+    std::string configNMS = getAbsPath(m_WtsFilePath) + "/config_nms.txt";
+    if (!fileExists(configNMS))
+    {
+        std::cerr << "YOLO config_nms.txt file is not specified\n" << std::endl;
+        assert(0);
+    }
+    m_ConfigNMSBlocks = parseConfigFile(configNMS);
+    parseConfigNMSBlocks();
+
     nvinfer1::INetworkDefinition *network = builder->createNetworkV2(0);
-    if (parseModel(*network) != NVDSINFER_SUCCESS) {
+    if (parseModel(*network) != NVDSINFER_SUCCESS)
+    {
         delete network;
         return nullptr;
     }
 
-    std::cout << "Building the TensorRT Engine" << std::endl;
+    std::cout << "Building the TensorRT Engine\n" << std::endl;
 
-    if (m_NumClasses != m_NumDetectedClasses) {
-        std::cout << "\nNOTE: Number of classes mismatch, make sure to set num-detected-classes=" << m_NumClasses << " in config_infer file" << std::endl;
+    if (m_NumClasses != m_NumDetectedClasses)
+    {
+        std::cout << "NOTE: Number of classes mismatch, make sure to set num-detected-classes=" << m_NumClasses
+            << " in config_infer file\n" << std::endl;
     }
-    if (m_LetterBox == 1) {
-        std::cout << "\nNOTE: letter_box is set in cfg file, make sure to set maintain-aspect-ratio=1 in config_infer file to get better accuracy" << std::endl;
+    if (m_LetterBox == 1)
+    {
+        std::cout << "NOTE: letter_box is set in cfg file, make sure to set maintain-aspect-ratio=1 in config_infer file"
+            << " to get better accuracy\n" << std::endl;
     }
-    if (m_ClusterMode != 2) {
-        std::cout << "\nNOTE: Wrong cluster-mode is set, make sure to set cluster-mode=2 in config_infer file" << std::endl;
+    if (m_ClusterMode != 4)
+    {
+        std::cout << "NOTE: Wrong cluster-mode is set, make sure to set cluster-mode=4 in config_infer file\n"
+            << std::endl;
     }
-    std::cout << "" << std::endl;
 
-    if (m_NetworkMode == "INT8" && !fileExists(m_Int8CalibPath)) {
+    if (m_NetworkMode == "INT8" && !fileExists(m_Int8CalibPath))
+    {
         assert(builder->platformHasFastInt8());
 #ifdef OPENCV
         std::string calib_image_list;
         int calib_batch_size;
-        if (getenv("INT8_CALIB_IMG_PATH")) {
+        if (getenv("INT8_CALIB_IMG_PATH"))
             calib_image_list = getenv("INT8_CALIB_IMG_PATH");
-        }
-        else {
+        else
+        {
             std::cerr << "INT8_CALIB_IMG_PATH not set" << std::endl;
             std::abort();
         }
-        if (getenv("INT8_CALIB_BATCH_SIZE")) {
+        if (getenv("INT8_CALIB_BATCH_SIZE"))
            calib_batch_size = std::stoi(getenv("INT8_CALIB_BATCH_SIZE"));
-        }
-        else {
+        else
+        {
            std::cerr << "INT8_CALIB_BATCH_SIZE not set" << std::endl;
            std::abort();
        }
-        nvinfer1::Int8EntropyCalibrator2 *calibrator = new nvinfer1::Int8EntropyCalibrator2(calib_batch_size, m_InputC, m_InputH, m_InputW, m_LetterBox, calib_image_list, m_Int8CalibPath);
+        nvinfer1::Int8EntropyCalibrator2 *calibrator = new nvinfer1::Int8EntropyCalibrator2(
+            calib_batch_size, m_InputC, m_InputH, m_InputW, m_LetterBox, calib_image_list, m_Int8CalibPath);
         config->setFlag(nvinfer1::BuilderFlag::kINT8);
         config->setInt8Calibrator(calibrator);
 #else
-        std::cerr << "OpenCV is required to run INT8 calibrator" << std::endl;
-        std::abort();
+        std::cerr << "OpenCV is required to run INT8 calibrator\n" << std::endl;
+        assert(0);
 #endif
     }
 
     nvinfer1::ICudaEngine *engine = builder->buildEngineWithConfig(*network, *config);
-    if (engine) {
+    if (engine)
         std::cout << "Building complete\n" << std::endl;
-    } else {
+    else
         std::cerr << "Building engine failed\n" << std::endl;
-    }
 
     delete network;
     return engine;
@@ -126,28 +147,30 @@ NvDsInferStatus Yolo::parseModel(nvinfer1::INetworkDefinition& network) {
     std::cout << "Building YOLO network\n" << std::endl;
     NvDsInferStatus status = buildYoloNetwork(weights, network);
 
-    if (status == NVDSINFER_SUCCESS) {
+    if (status == NVDSINFER_SUCCESS)
         std::cout << "Building YOLO network complete" << std::endl;
-    } else {
+    else
         std::cerr << "Building YOLO network failed" << std::endl;
-    }
 
     return status;
 }
 
-NvDsInferStatus Yolo::buildYoloNetwork(
-    std::vector<float>& weights, nvinfer1::INetworkDefinition& network) {
+NvDsInferStatus Yolo::buildYoloNetwork(std::vector<float>& weights, nvinfer1::INetworkDefinition& network)
+{
     int weightPtr = 0;
     int channels = m_InputC;
 
     std::string weightsType;
-    if (m_WtsFilePath.find(".weights") != std::string::npos) {
+    if (m_WtsFilePath.find(".weights") != std::string::npos)
         weightsType = "weights";
-    }
-    else {
+    else
         weightsType = "wts";
-    }
+
+    float eps = 1.0e-5;
+    if (m_NetworkType.find("yolov5") != std::string::npos)
+        eps = 1.0e-3;
+    else if (m_NetworkType.find("yolor") != std::string::npos)
+        eps = 1.0e-4;
 
     nvinfer1::ITensor* data =
         network.addInput(m_InputBlobName.c_str(), nvinfer1::DataType::kFLOAT,
@@ -157,26 +180,24 @@ NvDsInferStatus Yolo::buildYoloNetwork(
 
     nvinfer1::ITensor* previous = data;
     std::vector<nvinfer1::ITensor*> tensorOutputs;
-    uint outputTensorCount = 0;
+    std::vector<nvinfer1::ITensor*> yoloInputs;
+    uint inputYoloCount = 0;
 
-    for (uint i = 0; i < m_ConfigBlocks.size(); ++i) {
+    int modelType = -1;
+
+    for (uint i = 0; i < m_ConfigBlocks.size(); ++i)
+    {
         assert(getNumChannels(previous) == channels);
         std::string layerIndex = "(" + std::to_string(tensorOutputs.size()) + ")";
 
-        if (m_ConfigBlocks.at(i).at("type") == "net") {
+        if (m_ConfigBlocks.at(i).at("type") == "net")
             printLayerInfo("", "layer", " input", " output", "weightPtr");
-        }
 
-        else if (m_ConfigBlocks.at(i).at("type") == "convolutional") {
-            float eps = 1.0e-5;
-            if (m_NetworkType.find("yolov5") != std::string::npos) {
-                eps = 1.0e-3;
-            }
-            else if (m_NetworkType.find("yolor") != std::string::npos) {
-                eps = 1.0e-4;
-            }
+        else if (m_ConfigBlocks.at(i).at("type") == "convolutional")
+        {
             std::string inputVol = dimsToString(previous->getDimensions());
-            nvinfer1::ILayer* out = convolutionalLayer(i, m_ConfigBlocks.at(i), weights, m_TrtWeights, weightPtr, weightsType, channels, eps, previous, &network);
+            nvinfer1::ILayer* out = convolutionalLayer(
+                i, m_ConfigBlocks.at(i), weights, m_TrtWeights, weightPtr, weightsType, channels, eps, previous, &network);
             previous = out->getOutput(0);
             assert(previous != nullptr);
             channels = getNumChannels(previous);
@@ -186,14 +207,13 @@ NvDsInferStatus Yolo::buildYoloNetwork(
             printLayerInfo(layerIndex, layerType, inputVol, outputVol, std::to_string(weightPtr));
         }
 
-        else if (m_ConfigBlocks.at(i).at("type") == "implicit_add" || m_ConfigBlocks.at(i).at("type") == "implicit_mul") {
+        else if (m_ConfigBlocks.at(i).at("type") == "implicit_add" || m_ConfigBlocks.at(i).at("type") == "implicit_mul")
+        {
             std::string type;
-            if (m_ConfigBlocks.at(i).at("type") == "implicit_add") {
+            if (m_ConfigBlocks.at(i).at("type") == "implicit_add")
                 type = "add";
-            }
-            else if (m_ConfigBlocks.at(i).at("type") == "implicit_mul") {
+            else if (m_ConfigBlocks.at(i).at("type") == "implicit_mul")
                 type = "mul";
-            }
             assert(m_ConfigBlocks.at(i).find("filters") != m_ConfigBlocks.at(i).end());
             int filters = std::stoi(m_ConfigBlocks.at(i).at("filters"));
             nvinfer1::ILayer* out = implicitLayer(filters, weights, m_TrtWeights, weightPtr, &network);
@@ -206,19 +226,17 @@ NvDsInferStatus Yolo::buildYoloNetwork(
             printLayerInfo(layerIndex, layerType, " -", outputVol, std::to_string(weightPtr));
         }
 
-        else if (m_ConfigBlocks.at(i).at("type") == "shift_channels" || m_ConfigBlocks.at(i).at("type") == "control_channels") {
+        else if (m_ConfigBlocks.at(i).at("type") == "shift_channels" || m_ConfigBlocks.at(i).at("type") == "control_channels")
+        {
             std::string type;
-            if (m_ConfigBlocks.at(i).at("type") == "shift_channels") {
+            if (m_ConfigBlocks.at(i).at("type") == "shift_channels")
                 type = "shift";
-            }
-            else if (m_ConfigBlocks.at(i).at("type") == "control_channels") {
+            else if (m_ConfigBlocks.at(i).at("type") == "control_channels")
                 type = "control";
-            }
             assert(m_ConfigBlocks.at(i).find("from") != m_ConfigBlocks.at(i).end());
             int from = stoi(m_ConfigBlocks.at(i).at("from"));
-            if (from > 0) {
+            if (from > 0)
                 from = from - i + 1;
-            }
             assert((i - 2 >= 0) && (i - 2 < tensorOutputs.size()));
             assert((i + from - 1 >= 0) && (i + from - 1 < tensorOutputs.size()));
             assert(i + from - 1 < i - 2);
@@ -231,25 +249,22 @@ NvDsInferStatus Yolo::buildYoloNetwork(
             printLayerInfo(layerIndex, layerType, " -", outputVol, " -");
         }
 
-        else if (m_ConfigBlocks.at(i).at("type") == "dropout") {
+        else if (m_ConfigBlocks.at(i).at("type") == "dropout")
+        {
             // Skip dropout layer
-            assert(m_ConfigBlocks.at(i).find("probability") != m_ConfigBlocks.at(i).end());
-            /*float probability = std::stof(m_ConfigBlocks.at(i).at("probability"));
-            nvinfer1::ILayer* out = dropoutLayer(probability, previous, &network);
-            previous = out->getOutput(0);*/
             assert(previous != nullptr);
             tensorOutputs.push_back(previous);
             printLayerInfo(layerIndex, "dropout", " -", " -", " -");
         }
 
-        else if (m_ConfigBlocks.at(i).at("type") == "shortcut") {
+        else if (m_ConfigBlocks.at(i).at("type") == "shortcut")
+        {
             assert(m_ConfigBlocks.at(i).find("activation") != m_ConfigBlocks.at(i).end());
             assert(m_ConfigBlocks.at(i).find("from") != m_ConfigBlocks.at(i).end());
             std::string activation = m_ConfigBlocks.at(i).at("activation");
             int from = stoi(m_ConfigBlocks.at(i).at("from"));
-            if (from > 0) {
+            if (from > 0)
                 from = from - i + 1;
-            }
             assert((i - 2 >= 0) && (i - 2 < tensorOutputs.size()));
             assert((i + from - 1 >= 0) && (i + from - 1 < tensorOutputs.size()));
             assert(i + from - 1 < i - 2);
@@ -267,7 +282,8 @@ NvDsInferStatus Yolo::buildYoloNetwork(
             }
         }
 
-        else if (m_ConfigBlocks.at(i).at("type") == "route") {
+        else if (m_ConfigBlocks.at(i).at("type") == "route")
+        {
             assert(m_ConfigBlocks.at(i).find("layers") != m_ConfigBlocks.at(i).end());
             nvinfer1::ILayer* out = routeLayer(i, m_ConfigBlocks.at(i), tensorOutputs, &network);
             previous = out->getOutput(0);
@@ -278,7 +294,8 @@ NvDsInferStatus Yolo::buildYoloNetwork(
             printLayerInfo(layerIndex, "route", " -", outputVol, std::to_string(weightPtr));
         }
 
-        else if (m_ConfigBlocks.at(i).at("type") == "upsample") {
+        else if (m_ConfigBlocks.at(i).at("type") == "upsample")
+        {
             std::string inputVol = dimsToString(previous->getDimensions());
             nvinfer1::ILayer* out = upsampleLayer(i - 1, m_ConfigBlocks[i], previous, &network);
             previous = out->getOutput(0);
@@ -288,7 +305,8 @@ NvDsInferStatus Yolo::buildYoloNetwork(
             printLayerInfo(layerIndex, "upsample", inputVol, outputVol, " -");
         }
 
-        else if (m_ConfigBlocks.at(i).at("type") == "maxpool") {
+        else if (m_ConfigBlocks.at(i).at("type") == "maxpool")
+        {
             std::string inputVol = dimsToString(previous->getDimensions());
             nvinfer1::ILayer* out = maxpoolLayer(i, m_ConfigBlocks.at(i), previous, &network);
             previous = out->getOutput(0);
@@ -298,8 +316,10 @@ NvDsInferStatus Yolo::buildYoloNetwork(
             printLayerInfo(layerIndex, "maxpool", inputVol, outputVol, std::to_string(weightPtr));
         }
 
-        else if (m_ConfigBlocks.at(i).at("type") == "reorg") {
-            if (m_NetworkType.find("yolov5") != std::string::npos || m_NetworkType.find("yolor") != std::string::npos) {
+        else if (m_ConfigBlocks.at(i).at("type") == "reorg")
+        {
+            if (m_NetworkType.find("yolov5") != std::string::npos || m_NetworkType.find("yolor") != std::string::npos)
+            {
                 std::string inputVol = dimsToString(previous->getDimensions());
                 nvinfer1::ILayer* out = reorgV5Layer(i, previous, &network);
                 previous = out->getOutput(0);
@@ -310,7 +330,8 @@ NvDsInferStatus Yolo::buildYoloNetwork(
                 std::string layerType = "reorgV5";
                 printLayerInfo(layerIndex, layerType, inputVol, outputVol, std::to_string(weightPtr));
             }
-            else {
+            else
+            {
                 std::string inputVol = dimsToString(previous->getDimensions());
                 nvinfer1::IPluginV2* reorgPlugin = createReorgPlugin(2);
                 assert(reorgPlugin != nullptr);
@@ -328,95 +349,127 @@ NvDsInferStatus Yolo::buildYoloNetwork(
             }
         }
 
-        else if (m_ConfigBlocks.at(i).at("type") == "yolo") {
-            uint modelType = 1;
-            uint newCoords = 0;
-            float scaleXY = 1.0;
-            if (m_NetworkType.find("yolor") != std::string::npos) {
-                modelType = 2;
-            }
-            if (m_ConfigBlocks.at(i).find("new_coords") != m_ConfigBlocks.at(i).end()) {
-                newCoords = std::stoi(m_ConfigBlocks.at(i).at("new_coords"));
-            }
-            if (m_ConfigBlocks.at(i).find("scale_x_y") != m_ConfigBlocks.at(i).end()) {
-                scaleXY = std::stof(m_ConfigBlocks.at(i).at("scale_x_y"));
-            }
-            std::string layerName = "yolo_" + std::to_string(i);
-            nvinfer1::Dims prevTensorDims = previous->getDimensions();
-            TensorInfo& curYoloTensor = m_OutputTensors.at(outputTensorCount);
-            m_NumClasses = curYoloTensor.numClasses;
-            curYoloTensor.blobName = layerName;
-            nvinfer1::IPluginV2* yoloPlugin
-                = new YoloLayer(curYoloTensor.numBBoxes, curYoloTensor.numClasses, m_InputW, m_InputH,
-                    prevTensorDims.d[2], prevTensorDims.d[1], modelType, newCoords, scaleXY,
-                    curYoloTensor.anchors, curYoloTensor.mask);
-            assert(yoloPlugin != nullptr);
-            nvinfer1::IPluginV2Layer* yolo =
-                network.addPluginV2(&previous, 1, *yoloPlugin);
-            assert(yolo != nullptr);
-            yolo->setName(layerName.c_str());
-            std::string inputVol = dimsToString(previous->getDimensions());
-            previous = yolo->getOutput(0);
-            assert(previous != nullptr);
-            previous->setName(layerName.c_str());
-            std::string outputVol = dimsToString(previous->getDimensions());
-            network.markOutput(*previous);
-            channels = getNumChannels(previous);
-            tensorOutputs.push_back(yolo->getOutput(0));
-            printLayerInfo(layerIndex, "yolo", inputVol, outputVol, std::to_string(weightPtr));
-            ++outputTensorCount;
-        }
-
-        else if (m_ConfigBlocks.at(i).at("type") == "region") {
-            std::vector<int> mask;
-            std::string layerName = "region_" + std::to_string(i);
-            nvinfer1::Dims prevTensorDims = previous->getDimensions();
-            TensorInfo& curRegionTensor = m_OutputTensors.at(outputTensorCount);
-            m_NumClasses = curRegionTensor.numClasses;
-            curRegionTensor.blobName = layerName;
-            nvinfer1::IPluginV2* regionPlugin
-                = new YoloLayer(curRegionTensor.numBBoxes, curRegionTensor.numClasses, m_InputW, m_InputH,
-                    prevTensorDims.d[2], prevTensorDims.d[1], 0, 0, 1.0, curRegionTensor.anchors,
-                    mask);
-            assert(regionPlugin != nullptr);
-            nvinfer1::IPluginV2Layer* region =
-                network.addPluginV2(&previous, 1, *regionPlugin);
-            assert(region != nullptr);
-            region->setName(layerName.c_str());
+        else if (m_ConfigBlocks.at(i).at("type") == "yolo" || m_ConfigBlocks.at(i).at("type") == "region")
+        {
+            if (m_ConfigBlocks.at(i).at("type") == "yolo")
+            {
+                if (m_NetworkType.find("yolor") != std::string::npos)
+                    modelType = 2;
+                else
+                    modelType = 1;
+            }
+            else
+                modelType = 0;
+
+            std::string layerName = modelType != 0 ? "yolo_" + std::to_string(i) : "region_" + std::to_string(i);
+            nvinfer1::Dims prevTensorDims = previous->getDimensions();
+            TensorInfo& curYoloTensor = m_YoloTensors.at(inputYoloCount);
+            curYoloTensor.blobName = layerName;
+            curYoloTensor.gridSizeX = prevTensorDims.d[2];
+            curYoloTensor.gridSizeY = prevTensorDims.d[1];
             std::string inputVol = dimsToString(previous->getDimensions());
-            previous = region->getOutput(0);
-            assert(previous != nullptr);
-            previous->setName(layerName.c_str());
-            std::string outputVol = dimsToString(previous->getDimensions());
-            network.markOutput(*previous);
             channels = getNumChannels(previous);
-            tensorOutputs.push_back(region->getOutput(0));
-            printLayerInfo(layerIndex, "region", inputVol, outputVol, std::to_string(weightPtr));
-            ++outputTensorCount;
+            tensorOutputs.push_back(previous);
+            yoloInputs.push_back(previous);
+            ++inputYoloCount;
+            printLayerInfo(layerIndex, modelType != 0 ? "yolo" : "region", inputVol, " -", " -");
         }
 
         else
         {
-            std::cout << "Unsupported layer type --> \""
-                << m_ConfigBlocks.at(i).at("type") << "\"" << std::endl;
+            std::cout << "\nUnsupported layer type --> \"" << m_ConfigBlocks.at(i).at("type") << "\"" << std::endl;
             assert(0);
         }
     }
 
     if ((int)weights.size() != weightPtr)
     {
-        std::cout << "Number of unused weights left: " << weights.size() - weightPtr << std::endl;
+        std::cout << "\nNumber of unused weights left: " << weights.size() - weightPtr << std::endl;
         assert(0);
     }
 
-    std::cout << "Output YOLO blob names: " << std::endl;
-    for (auto& tensor : m_OutputTensors) {
+    if (m_YoloCount == inputYoloCount)
+    {
+        assert((modelType != -1) && "\nCould not determine model type");
+
+        nvinfer1::ITensor* yoloInputTensors[inputYoloCount];
+        uint64_t outputSize = 0;
+        for (uint j = 0; j < inputYoloCount; ++j)
+        {
+            yoloInputTensors[j] = yoloInputs[j];
+            TensorInfo& curYoloTensor = m_YoloTensors.at(j);
+            outputSize += curYoloTensor.gridSizeX * curYoloTensor.gridSizeY * curYoloTensor.numBBoxes;
+        }
+
+        if (m_TopK > outputSize) {
+            std::cout << "\ntopk > Number of outputs\nPlease change the topk to " << outputSize
+                << " or less in config_nms.txt file\n" << std::endl;
+            assert(0);
+        }
+
+        std::string layerName = "yolo";
+        nvinfer1::IPluginV2* yoloPlugin = new YoloLayer(
+            m_InputW, m_InputH, m_NumClasses, m_NewCoords, m_YoloTensors, outputSize, modelType, m_TopK,
+            m_ScoreThreshold);
+        assert(yoloPlugin != nullptr);
+        nvinfer1::IPluginV2Layer* yolo = network.addPluginV2(yoloInputTensors, inputYoloCount, *yoloPlugin);
+        assert(yolo != nullptr);
+        yolo->setName(layerName.c_str());
+        previous = yolo->getOutput(0);
+        assert(previous != nullptr);
+        previous->setName(layerName.c_str());
+        tensorOutputs.push_back(yolo->getOutput(0));
+
+        nvinfer1::ITensor* yoloTensors[] = {yolo->getOutput(0), yolo->getOutput(1)};
+        std::string outputVol = dimsToString(previous->getDimensions());
+
+        nvinfer1::plugin::NMSParameters nmsParams;
+        nmsParams.shareLocation = true;
+        nmsParams.backgroundLabelId = -1;
+        nmsParams.numClasses = m_NumClasses;
+        nmsParams.topK = m_TopK;
+        nmsParams.keepTopK = m_TopK;
+        nmsParams.scoreThreshold = m_ScoreThreshold;
+        nmsParams.iouThreshold = m_IouThreshold;
+        nmsParams.isNormalized = false;
+
+        layerName = "batchedNMS";
+        nvinfer1::IPluginV2* batchedNMS = createBatchedNMSPlugin(nmsParams);
+        nvinfer1::IPluginV2Layer* nms = network.addPluginV2(yoloTensors, 2, *batchedNMS);
+        nms->setName(layerName.c_str());
+        nvinfer1::ITensor* num_detections = nms->getOutput(0);
+        layerName = "num_detections";
+        num_detections->setName(layerName.c_str());
+        nvinfer1::ITensor* nmsed_boxes = nms->getOutput(1);
+        layerName = "nmsed_boxes";
+        nmsed_boxes->setName(layerName.c_str());
+        nvinfer1::ITensor* nmsed_scores = nms->getOutput(2);
+        layerName = "nmsed_scores";
+        nmsed_scores->setName(layerName.c_str());
+        nvinfer1::ITensor* nmsed_classes = nms->getOutput(3);
+        layerName = "nmsed_classes";
+        nmsed_classes->setName(layerName.c_str());
+        network.markOutput(*num_detections);
+        network.markOutput(*nmsed_boxes);
+        network.markOutput(*nmsed_scores);
+        network.markOutput(*nmsed_classes);
+
+        printLayerInfo("", "batched_nms", " -", outputVol, " -");
+    }
+    else {
+        std::cout << "\nError in yolo cfg file" << std::endl;
+        assert(0);
+    }
+
+    std::cout << "\nOutput YOLO blob names: " << std::endl;
+    for (auto& tensor : m_YoloTensors)
+    {
         std::cout << tensor.blobName << std::endl;
     }
 
     int nbLayers = network.getNbLayers();
-    std::cout << "Total number of YOLO layers: " << nbLayers << std::endl;
+    std::cout << "\nTotal number of YOLO layers: " << nbLayers << "\n" << std::endl;
 
     return NVDSINFER_SUCCESS;
 }
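The batchedNMS layer above comes from TensorRT's bundled BatchedNMS plugin, created through createBatchedNMSPlugin() with nvinfer1::plugin::NMSParameters. The diff does not show where the plugin library is registered; a hedged sketch of the standard registration step that such a build or a later engine deserialization normally relies on (gLogger is an assumed logger object):

```cpp
// Hedged sketch: register TensorRT's bundled plugins (including BatchedNMS_TRT)
// before building or deserializing the engine. gLogger is assumed to exist.
#include <NvInfer.h>
#include <NvInferPlugin.h>

bool registerTrtPlugins(nvinfer1::ILogger& gLogger)
{
    // Makes createBatchedNMSPlugin and the "BatchedNMS_TRT" creator available.
    return initLibNvInferPlugins(&gLogger, "");
}
```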
@@ -462,39 +515,44 @@ Yolo::parseConfigFile (const std::string cfgFilePath)
 
 void Yolo::parseConfigBlocks()
 {
-    for (auto block : m_ConfigBlocks) {
+    for (auto block : m_ConfigBlocks)
+    {
         if (block.at("type") == "net")
         {
-            assert((block.find("height") != block.end())
-                && "Missing 'height' param in network cfg");
+            assert((block.find("height") != block.end()) && "Missing 'height' param in network cfg");
             assert((block.find("width") != block.end()) && "Missing 'width' param in network cfg");
-            assert((block.find("channels") != block.end())
-                && "Missing 'channels' param in network cfg");
+            assert((block.find("channels") != block.end()) && "Missing 'channels' param in network cfg");
 
             m_InputH = std::stoul(block.at("height"));
             m_InputW = std::stoul(block.at("width"));
             m_InputC = std::stoul(block.at("channels"));
             m_InputSize = m_InputC * m_InputH * m_InputW;
 
-            if (block.find("letter_box") != block.end()) {
+            if (block.find("letter_box") != block.end())
+            {
                 m_LetterBox = std::stoul(block.at("letter_box"));
             }
-            else {
-                m_LetterBox = 0;
-            }
         }
-        else if ((block.at("type") == "region") || (block.at("type") == "yolo") || (block.at("type") == "detect"))
+        else if ((block.at("type") == "region") || (block.at("type") == "yolo"))
         {
             assert((block.find("num") != block.end())
                 && std::string("Missing 'num' param in " + block.at("type") + " layer").c_str());
             assert((block.find("classes") != block.end())
-                && std::string("Missing 'classes' param in " + block.at("type") + " layer")
-                    .c_str());
+                && std::string("Missing 'classes' param in " + block.at("type") + " layer").c_str());
             assert((block.find("anchors") != block.end())
-                && std::string("Missing 'anchors' param in " + block.at("type") + " layer")
-                    .c_str());
+                && std::string("Missing 'anchors' param in " + block.at("type") + " layer").c_str());
+
+            ++m_YoloCount;
+
+            m_NumClasses = std::stoul(block.at("classes"));
+
+            if (block.find("new_coords") != block.end())
+            {
+                m_NewCoords = std::stoul(block.at("new_coords"));
+            }
+
             TensorInfo outputTensor;
+
             std::string anchorString = block.at("anchors");
             while (!anchorString.empty())
             {
@@ -513,7 +571,8 @@ void Yolo::parseConfigBlocks()
                 }
             }
 
-            if (block.find("mask") != block.end()) {
+            if (block.find("mask") != block.end())
+            {
                 std::string maskString = block.at("mask");
                 while (!maskString.empty())
                 {
@@ -533,17 +592,41 @@ void Yolo::parseConfigBlocks()
                 }
             }
 
-            outputTensor.numBBoxes = outputTensor.mask.size() > 0
-                ? outputTensor.mask.size()
-                : std::stoul(trim(block.at("num")));
-            outputTensor.numClasses = std::stoul(block.at("classes"));
-            m_OutputTensors.push_back(outputTensor);
+            if (block.find("scale_x_y") != block.end())
+            {
+                outputTensor.scaleXY = std::stof(block.at("scale_x_y"));
+            }
+            else
+            {
+                outputTensor.scaleXY = 1.0;
+            }
+
+            outputTensor.numBBoxes
+                = outputTensor.mask.size() > 0 ? outputTensor.mask.size() : std::stoul(trim(block.at("num")));
+
+            m_YoloTensors.push_back(outputTensor);
         }
     }
 }
 
-void Yolo::destroyNetworkUtils() {
-    for (uint i = 0; i < m_TrtWeights.size(); ++i) {
+void Yolo::parseConfigNMSBlocks()
+{
+    auto block = m_ConfigNMSBlocks[0];
+
+    assert((block.at("type") == "property") && "Missing 'property' param in nms cfg");
+    assert((block.find("iou-threshold") != block.end()) && "Missing 'iou-threshold' param in nms cfg");
+    assert((block.find("score-threshold") != block.end()) && "Missing 'score-threshold' param in nms cfg");
+    assert((block.find("topk") != block.end()) && "Missing 'topk' param in nms cfg");
+
+    m_IouThreshold = std::stof(block.at("iou-threshold"));
+    m_ScoreThreshold = std::stof(block.at("score-threshold"));
+    m_TopK = std::stoul(block.at("topk"));
+}
+
+void Yolo::destroyNetworkUtils()
+{
+    for (uint i = 0; i < m_TrtWeights.size(); ++i)
+    {
         if (m_TrtWeights[i].count > 0)
             free(const_cast<void*>(m_TrtWeights[i].values));
     }
@@ -29,7 +29,6 @@
 #include "layers/convolutional_layer.h"
 #include "layers/implicit_layer.h"
 #include "layers/channels_layer.h"
-#include "layers/dropout_layer.h"
 #include "layers/shortcut_layer.h"
 #include "layers/route_layer.h"
 #include "layers/upsample_layer.h"
@@ -54,8 +53,10 @@ struct NetworkInfo
 struct TensorInfo
 {
     std::string blobName;
+    uint gridSizeX {0};
+    uint gridSizeY {0};
     uint numBBoxes {0};
-    uint numClasses {0};
+    float scaleXY;
     std::vector<float> anchors;
     std::vector<int> mask;
 };
@@ -63,12 +64,15 @@ struct TensorInfo
 class Yolo : public IModelParser {
 public:
     Yolo(const NetworkInfo& networkInfo);
+
     ~Yolo() override;
+
     bool hasFullDimsSupported() const override { return false; }
+
     const char* getModelName() const override {
-        return m_ConfigFilePath.empty() ? m_NetworkType.c_str()
-            : m_ConfigFilePath.c_str();
+        return m_ConfigFilePath.empty() ? m_NetworkType.c_str() : m_ConfigFilePath.c_str();
     }
 
     NvDsInferStatus parseModel(nvinfer1::INetworkDefinition& network) override;
 
     nvinfer1::ICudaEngine *createEngine (nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config);
@@ -90,17 +94,26 @@ protected:
|
|||||||
uint64_t m_InputSize;
|
uint64_t m_InputSize;
|
||||||
uint m_NumClasses;
|
uint m_NumClasses;
|
||||||
uint m_LetterBox;
|
uint m_LetterBox;
|
||||||
|
uint m_NewCoords;
|
||||||
|
uint m_YoloCount;
|
||||||
|
float m_IouThreshold;
|
||||||
|
float m_ScoreThreshold;
|
||||||
|
uint m_TopK;
|
||||||
|
|
||||||
std::vector<TensorInfo> m_OutputTensors;
|
std::vector<TensorInfo> m_YoloTensors;
|
||||||
std::vector<std::map<std::string, std::string>> m_ConfigBlocks;
|
std::vector<std::map<std::string, std::string>> m_ConfigBlocks;
|
||||||
|
std::vector<std::map<std::string, std::string>> m_ConfigNMSBlocks;
|
||||||
std::vector<nvinfer1::Weights> m_TrtWeights;
|
std::vector<nvinfer1::Weights> m_TrtWeights;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
NvDsInferStatus buildYoloNetwork(
|
NvDsInferStatus buildYoloNetwork(std::vector<float>& weights, nvinfer1::INetworkDefinition& network);
|
||||||
std::vector<float>& weights, nvinfer1::INetworkDefinition& network);
|
|
||||||
std::vector<std::map<std::string, std::string>> parseConfigFile(
|
std::vector<std::map<std::string, std::string>> parseConfigFile(const std::string cfgFilePath);
|
||||||
const std::string cfgFilePath);
|
|
||||||
void parseConfigBlocks();
|
void parseConfigBlocks();
|
||||||
|
|
||||||
|
void parseConfigNMSBlocks();
|
||||||
|
|
||||||
void destroyNetworkUtils();
|
void destroyNetworkUtils();
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@@ -3,18 +3,14 @@
 * https://www.github.com/marcoslucianops
 */
 
-#include <cuda.h>
-#include <cuda_runtime.h>
 #include <stdint.h>
-#include <stdio.h>
-#include <string.h>
 
 inline __device__ float sigmoidGPU(const float& x) { return 1.0f / (1.0f + __expf(-x)); }
 
 __global__ void gpuYoloLayer(
-    const float* input, float* output, const uint netWidth, const uint netHeight, const uint gridSizeX,
-    const uint gridSizeY, const uint numOutputClasses, const uint numBBoxes, const float scaleXY,
-    const float* anchors, const int* mask)
+    const float* input, int* d_indexes, float* d_scores, float* d_boxes, int* d_classes, int* countData,
+    const float scoreThreshold, const uint netWidth, const uint netHeight, const uint gridSizeX, const uint gridSizeY,
+    const uint numOutputClasses, const uint numBBoxes, const float scaleXY, const float* anchors, const int* mask)
 {
     uint x_id = blockIdx.x * blockDim.x + threadIdx.x;
     uint y_id = blockIdx.y * blockDim.y + threadIdx.y;
@@ -28,28 +24,32 @@ __global__ void gpuYoloLayer(
     const int numGridCells = gridSizeX * gridSizeY;
    const int bbindex = y_id * gridSizeX + x_id;
 
+    const float objectness
+        = sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)]);
+
+    if (objectness < scoreThreshold) return;
+
+    int count = (int)atomicAdd(&countData[0], 1);
+
     const float alpha = scaleXY;
     const float beta = -0.5 * (scaleXY - 1);
 
-    output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 0)]
+    float x
         = (sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 0)])
         * alpha + beta + x_id) * netWidth / gridSizeX;
 
-    output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 1)]
+    float y
         = (sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 1)])
         * alpha + beta + y_id) * netHeight / gridSizeY;
 
-    output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 2)]
+    float w
         = __expf(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 2)])
         * anchors[mask[z_id] * 2];
 
-    output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 3)]
+    float h
         = __expf(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 3)])
         * anchors[mask[z_id] * 2 + 1];
 
-    const float objectness
-        = sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)]);
-
     float maxProb = 0.0f;
     int maxIndex = -1;
 
@@ -65,22 +65,26 @@ __global__ void gpuYoloLayer(
         }
     }
 
-    output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)]
-        = objectness * maxProb;
-
-    output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 5)]
-        = maxIndex;
+    d_indexes[count] = count;
+    d_scores[count] = objectness * maxProb + 1.f;
+    d_boxes[count * 4 + 0] = x - 0.5 * w;
+    d_boxes[count * 4 + 1] = y - 0.5 * h;
+    d_boxes[count * 4 + 2] = x + 0.5 * w;
+    d_boxes[count * 4 + 3] = y + 0.5 * h;
+    d_classes[count] = maxIndex;
 }
 
 cudaError_t cudaYoloLayer(
-    const void* input, void* output, const uint& batchSize, const uint& netWidth, const uint& netHeight,
-    const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes,
-    uint64_t& outputSize, const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream);
+    const void* input, void* d_indexes, void* d_scores, void* d_boxes, void* d_classes, void* countData,
+    const uint& batchSize, uint64_t& inputSize, uint64_t& outputSize, const float& scoreThreshold, const uint& netWidth,
+    const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes,
+    const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream);
 
 cudaError_t cudaYoloLayer(
-    const void* input, void* output, const uint& batchSize, const uint& netWidth, const uint& netHeight,
-    const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes,
-    uint64_t& outputSize, const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream)
+    const void* input, void* d_indexes, void* d_scores, void* d_boxes, void* d_classes, void* countData,
+    const uint& batchSize, uint64_t& inputSize, uint64_t& outputSize, const float& scoreThreshold, const uint& netWidth,
+    const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes,
+    const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream)
 {
     dim3 threads_per_block(16, 16, 4);
     dim3 number_of_blocks((gridSizeX / threads_per_block.x) + 1,
@@ -90,9 +94,12 @@ cudaError_t cudaYoloLayer(
     for (unsigned int batch = 0; batch < batchSize; ++batch)
     {
         gpuYoloLayer<<<number_of_blocks, threads_per_block, 0, stream>>>(
-            reinterpret_cast<const float*>(input) + (batch * outputSize),
-            reinterpret_cast<float*>(output) + (batch * outputSize),
-            netWidth, netHeight, gridSizeX, gridSizeY, numOutputClasses, numBBoxes, scaleXY,
+            reinterpret_cast<const float*>(input) + (batch * inputSize),
+            reinterpret_cast<int*>(d_indexes) + (batch * outputSize),
+            reinterpret_cast<float*>(d_scores) + (batch * outputSize),
+            reinterpret_cast<float*>(d_boxes) + (batch * 4 * outputSize),
+            reinterpret_cast<int*>(d_classes) + (batch * outputSize), reinterpret_cast<int*>(countData) + (batch),
+            scoreThreshold, netWidth, netHeight, gridSizeX, gridSizeY, numOutputClasses, numBBoxes, scaleXY,
             reinterpret_cast<const float*>(anchors), reinterpret_cast<const int*>(mask));
     }
     return cudaGetLastError();
@@ -3,16 +3,12 @@
 * https://www.github.com/marcoslucianops
 */
 
-#include <cuda.h>
-#include <cuda_runtime.h>
 #include <stdint.h>
-#include <stdio.h>
-#include <string.h>
 
 __global__ void gpuYoloLayer_nc(
-    const float* input, float* output, const uint netWidth, const uint netHeight, const uint gridSizeX,
-    const uint gridSizeY, const uint numOutputClasses, const uint numBBoxes, const float scaleXY,
-    const float* anchors, const int* mask)
+    const float* input, int* d_indexes, float* d_scores, float* d_boxes, int* d_classes, int* countData,
+    const float scoreThreshold, const uint netWidth, const uint netHeight, const uint gridSizeX, const uint gridSizeY,
+    const uint numOutputClasses, const uint numBBoxes, const float scaleXY, const float* anchors, const int* mask)
 {
     uint x_id = blockIdx.x * blockDim.x + threadIdx.x;
     uint y_id = blockIdx.y * blockDim.y + threadIdx.y;
@@ -26,28 +22,32 @@ __global__ void gpuYoloLayer_nc(
     const int numGridCells = gridSizeX * gridSizeY;
     const int bbindex = y_id * gridSizeX + x_id;
 
+    const float objectness
+        = input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)];
+
+    if (objectness < scoreThreshold) return;
+
+    int count = (int)atomicAdd(&countData[0], 1);
+
     const float alpha = scaleXY;
     const float beta = -0.5 * (scaleXY - 1);
 
-    output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 0)]
+    float x
         = (input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 0)]
         * alpha + beta + x_id) * netWidth / gridSizeX;
 
-    output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 1)]
+    float y
         = (input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 1)]
         * alpha + beta + y_id) * netHeight / gridSizeY;
 
-    output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 2)]
+    float w
         = __powf(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 2)] * 2, 2)
         * anchors[mask[z_id] * 2];
 
-    output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 3)]
+    float h
         = __powf(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 3)] * 2, 2)
         * anchors[mask[z_id] * 2 + 1];
 
-    const float objectness
-        = input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)];
-
     float maxProb = 0.0f;
     int maxIndex = -1;
 
@@ -63,22 +63,26 @@ __global__ void gpuYoloLayer_nc(
         }
     }
 
-    output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)]
-        = objectness * maxProb;
-
-    output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 5)]
-        = maxIndex;
+    d_indexes[count] = count;
+    d_scores[count] = objectness * maxProb + 1.f;
+    d_boxes[count * 4 + 0] = x - 0.5 * w;
+    d_boxes[count * 4 + 1] = y - 0.5 * h;
+    d_boxes[count * 4 + 2] = x + 0.5 * w;
+    d_boxes[count * 4 + 3] = y + 0.5 * h;
+    d_classes[count] = maxIndex;
 }
 
 cudaError_t cudaYoloLayer_nc(
-    const void* input, void* output, const uint& batchSize, const uint& netWidth, const uint& netHeight,
-    const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes,
-    uint64_t& outputSize, const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream);
+    const void* input, void* d_indexes, void* d_scores, void* d_boxes, void* d_classes, void* countData,
+    const uint& batchSize, uint64_t& inputSize, uint64_t& outputSize, const float& scoreThreshold, const uint& netWidth,
+    const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes,
+    const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream);
 
 cudaError_t cudaYoloLayer_nc(
-    const void* input, void* output, const uint& batchSize, const uint& netWidth, const uint& netHeight,
-    const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes,
-    uint64_t& outputSize, const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream)
+    const void* input, void* d_indexes, void* d_scores, void* d_boxes, void* d_classes, void* countData,
+    const uint& batchSize, uint64_t& inputSize, uint64_t& outputSize, const float& scoreThreshold, const uint& netWidth,
+    const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes,
+    const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream)
 {
     dim3 threads_per_block(16, 16, 4);
     dim3 number_of_blocks((gridSizeX / threads_per_block.x) + 1,
@@ -88,9 +92,12 @@ cudaError_t cudaYoloLayer_nc(
     for (unsigned int batch = 0; batch < batchSize; ++batch)
     {
         gpuYoloLayer_nc<<<number_of_blocks, threads_per_block, 0, stream>>>(
-            reinterpret_cast<const float*>(input) + (batch * outputSize),
-            reinterpret_cast<float*>(output) + (batch * outputSize),
-            netWidth, netHeight, gridSizeX, gridSizeY, numOutputClasses, numBBoxes, scaleXY,
+            reinterpret_cast<const float*>(input) + (batch * inputSize),
+            reinterpret_cast<int*>(d_indexes) + (batch * outputSize),
+            reinterpret_cast<float*>(d_scores) + (batch * outputSize),
+            reinterpret_cast<float*>(d_boxes) + (batch * 4 * outputSize),
+            reinterpret_cast<int*>(d_classes) + (batch * outputSize), reinterpret_cast<int*>(countData) + (batch),
+            scoreThreshold, netWidth, netHeight, gridSizeX, gridSizeY, numOutputClasses, numBBoxes, scaleXY,
             reinterpret_cast<const float*>(anchors), reinterpret_cast<const int*>(mask));
     }
     return cudaGetLastError();
@@ -3,18 +3,14 @@
 * https://www.github.com/marcoslucianops
 */
 
-#include <cuda.h>
-#include <cuda_runtime.h>
 #include <stdint.h>
-#include <stdio.h>
-#include <string.h>
 
 inline __device__ float sigmoidGPU(const float& x) { return 1.0f / (1.0f + __expf(-x)); }
 
 __global__ void gpuYoloLayer_r(
-    const float* input, float* output, const uint netWidth, const uint netHeight, const uint gridSizeX,
-    const uint gridSizeY, const uint numOutputClasses, const uint numBBoxes, const float scaleXY,
-    const float* anchors, const int* mask)
+    const float* input, int* d_indexes, float* d_scores, float* d_boxes, int* d_classes, int* countData,
+    const float scoreThreshold, const uint netWidth, const uint netHeight, const uint gridSizeX, const uint gridSizeY,
+    const uint numOutputClasses, const uint numBBoxes, const float scaleXY, const float* anchors, const int* mask)
 {
     uint x_id = blockIdx.x * blockDim.x + threadIdx.x;
     uint y_id = blockIdx.y * blockDim.y + threadIdx.y;
@@ -28,28 +24,32 @@ __global__ void gpuYoloLayer_r(
     const int numGridCells = gridSizeX * gridSizeY;
     const int bbindex = y_id * gridSizeX + x_id;
 
+    const float objectness
+        = sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)]);
+
+    if (objectness < scoreThreshold) return;
+
+    int count = (int)atomicAdd(&countData[0], 1);
+
     const float alpha = scaleXY;
     const float beta = -0.5 * (scaleXY - 1);
 
-    output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 0)]
+    float x
         = (sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 0)])
         * alpha + beta + x_id) * netWidth / gridSizeX;
 
-    output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 1)]
+    float y
         = (sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 1)])
         * alpha + beta + y_id) * netHeight / gridSizeY;
 
-    output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 2)]
+    float w
         = __powf(sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 2)]) * 2, 2)
         * anchors[mask[z_id] * 2];
 
-    output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 3)]
+    float h
         = __powf(sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 3)]) * 2, 2)
         * anchors[mask[z_id] * 2 + 1];
 
-    const float objectness
-        = sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)]);
-
     float maxProb = 0.0f;
     int maxIndex = -1;
 
@@ -65,22 +65,26 @@ __global__ void gpuYoloLayer_r(
         }
     }
 
-    output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)]
-        = objectness * maxProb;
-
-    output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 5)]
-        = maxIndex;
+    d_indexes[count] = count;
+    d_scores[count] = objectness * maxProb + 1.f;
+    d_boxes[count * 4 + 0] = x - 0.5 * w;
+    d_boxes[count * 4 + 1] = y - 0.5 * h;
+    d_boxes[count * 4 + 2] = x + 0.5 * w;
+    d_boxes[count * 4 + 3] = y + 0.5 * h;
+    d_classes[count] = maxIndex;
 }
 
 cudaError_t cudaYoloLayer_r(
-    const void* input, void* output, const uint& batchSize, const uint& netWidth, const uint& netHeight,
-    const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes,
-    uint64_t& outputSize, const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream);
+    const void* input, void* d_indexes, void* d_scores, void* d_boxes, void* d_classes, void* countData,
+    const uint& batchSize, uint64_t& inputSize, uint64_t& outputSize, const float& scoreThreshold, const uint& netWidth,
+    const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes,
+    const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream);
 
 cudaError_t cudaYoloLayer_r(
-    const void* input, void* output, const uint& batchSize, const uint& netWidth, const uint& netHeight,
-    const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes,
-    uint64_t& outputSize, const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream)
+    const void* input, void* d_indexes, void* d_scores, void* d_boxes, void* d_classes, void* countData,
+    const uint& batchSize, uint64_t& inputSize, uint64_t& outputSize, const float& scoreThreshold, const uint& netWidth,
+    const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes,
+    const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream)
 {
     dim3 threads_per_block(16, 16, 4);
     dim3 number_of_blocks((gridSizeX / threads_per_block.x) + 1,
@@ -90,9 +94,12 @@ cudaError_t cudaYoloLayer_r(
     for (unsigned int batch = 0; batch < batchSize; ++batch)
     {
         gpuYoloLayer_r<<<number_of_blocks, threads_per_block, 0, stream>>>(
-            reinterpret_cast<const float*>(input) + (batch * outputSize),
-            reinterpret_cast<float*>(output) + (batch * outputSize),
-            netWidth, netHeight, gridSizeX, gridSizeY, numOutputClasses, numBBoxes, scaleXY,
+            reinterpret_cast<const float*>(input) + (batch * inputSize),
+            reinterpret_cast<int*>(d_indexes) + (batch * outputSize),
+            reinterpret_cast<float*>(d_scores) + (batch * outputSize),
+            reinterpret_cast<float*>(d_boxes) + (batch * 4 * outputSize),
+            reinterpret_cast<int*>(d_classes) + (batch * outputSize), reinterpret_cast<int*>(countData) + (batch),
+            scoreThreshold, netWidth, netHeight, gridSizeX, gridSizeY, numOutputClasses, numBBoxes, scaleXY,
            reinterpret_cast<const float*>(anchors), reinterpret_cast<const int*>(mask));
     }
     return cudaGetLastError();
@@ -3,17 +3,13 @@
 * https://www.github.com/marcoslucianops
 */
 
-#include <cuda.h>
-#include <cuda_runtime.h>
 #include <stdint.h>
-#include <stdio.h>
-#include <string.h>
 
 inline __device__ float sigmoidGPU(const float& x) { return 1.0f / (1.0f + __expf(-x)); }
 
 __device__ void softmaxGPU(
-    const float* input, const int bbindex, const int numGridCells, uint z_id,
-    const uint numOutputClasses, float temp, float* output)
+    const float* input, const int bbindex, const int numGridCells, uint z_id, const uint numOutputClasses, float temp,
+    float* output)
 {
     int i;
     float sum = 0;
@@ -33,9 +29,9 @@ __device__ void softmaxGPU(
 }
 
 __global__ void gpuRegionLayer(
-    const float* input, float* output, float* softmax, const uint netWidth, const uint netHeight,
-    const uint gridSizeX, const uint gridSizeY, const uint numOutputClasses, const uint numBBoxes,
-    const float* anchors)
+    const float* input, float* softmax, int* d_indexes, float* d_scores, float* d_boxes, int* d_classes, int* countData,
+    const float scoreThreshold, const uint netWidth, const uint netHeight, const uint gridSizeX, const uint gridSizeY,
+    const uint numOutputClasses, const uint numBBoxes, const float* anchors)
 {
     uint x_id = blockIdx.x * blockDim.x + threadIdx.x;
     uint y_id = blockIdx.y * blockDim.y + threadIdx.y;
@@ -49,27 +45,31 @@ __global__ void gpuRegionLayer(
     const int numGridCells = gridSizeX * gridSizeY;
     const int bbindex = y_id * gridSizeX + x_id;
 
-    output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 0)]
+    const float objectness
+        = sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)]);
+
+    if (objectness < scoreThreshold) return;
+
+    int count = (int)atomicAdd(&countData[0], 1);
+
+    float x
         = (sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 0)])
         + x_id) * netWidth / gridSizeX;
 
-    output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 1)]
+    float y
         = (sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 1)])
         + y_id) * netHeight / gridSizeY;
 
-    output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 2)]
+    float w
         = __expf(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 2)])
         * anchors[z_id * 2] * netWidth / gridSizeX;
 
-    output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 3)]
+    float h
         = __expf(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 3)])
         * anchors[z_id * 2 + 1] * netHeight / gridSizeY;
 
     softmaxGPU(input, bbindex, numGridCells, z_id, numOutputClasses, 1.0, softmax);
 
-    const float objectness
-        = sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)]);
-
     float maxProb = 0.0f;
     int maxIndex = -1;
 
@@ -85,22 +85,26 @@ __global__ void gpuRegionLayer(
         }
     }
 
-    output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)]
-        = objectness * maxProb;
-
-    output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 5)]
-        = maxIndex;
+    d_indexes[count] = count;
+    d_scores[count] = objectness * maxProb + 1.f;
+    d_boxes[count * 4 + 0] = x - 0.5 * w;
+    d_boxes[count * 4 + 1] = y - 0.5 * h;
+    d_boxes[count * 4 + 2] = x + 0.5 * w;
+    d_boxes[count * 4 + 3] = y + 0.5 * h;
+    d_classes[count] = maxIndex;
 }
 
 cudaError_t cudaRegionLayer(
-    const void* input, void* output, void* softmax, const uint& batchSize, const uint& netWidth,
-    const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses,
-    const uint& numBBoxes, uint64_t& outputSize, const void* anchors, cudaStream_t stream);
+    const void* input, void* softmax, void* d_indexes, void* d_scores, void* d_boxes, void* d_classes, void* countData,
+    const uint& batchSize, uint64_t& inputSize, uint64_t& outputSize, const float& scoreThreshold, const uint& netWidth,
+    const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes,
+    const void* anchors, cudaStream_t stream);
 
 cudaError_t cudaRegionLayer(
-    const void* input, void* output, void* softmax, const uint& batchSize, const uint& netWidth,
-    const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses,
-    const uint& numBBoxes, uint64_t& outputSize, const void* anchors, cudaStream_t stream)
+    const void* input, void* softmax, void* d_indexes, void* d_scores, void* d_boxes, void* d_classes, void* countData,
+    const uint& batchSize, uint64_t& inputSize, uint64_t& outputSize, const float& scoreThreshold, const uint& netWidth,
+    const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes,
+    const void* anchors, cudaStream_t stream)
 {
     dim3 threads_per_block(16, 16, 4);
     dim3 number_of_blocks((gridSizeX / threads_per_block.x) + 1,
@@ -110,10 +114,13 @@ cudaError_t cudaRegionLayer(
     for (unsigned int batch = 0; batch < batchSize; ++batch)
     {
         gpuRegionLayer<<<number_of_blocks, threads_per_block, 0, stream>>>(
-            reinterpret_cast<const float*>(input) + (batch * outputSize),
-            reinterpret_cast<float*>(output) + (batch * outputSize),
-            reinterpret_cast<float*>(softmax) + (batch * outputSize),
-            netWidth, netHeight, gridSizeX, gridSizeY, numOutputClasses, numBBoxes,
+            reinterpret_cast<const float*>(input) + (batch * inputSize),
+            reinterpret_cast<float*>(softmax) + (batch * inputSize),
+            reinterpret_cast<int*>(d_indexes) + (batch * outputSize),
+            reinterpret_cast<float*>(d_scores) + (batch * outputSize),
+            reinterpret_cast<float*>(d_boxes) + (batch * 4 * outputSize),
+            reinterpret_cast<int*>(d_classes) + (batch * outputSize), reinterpret_cast<int*>(countData) + (batch),
+            scoreThreshold, netWidth, netHeight, gridSizeX, gridSizeY, numOutputClasses, numBBoxes,
            reinterpret_cast<const float*>(anchors));
     }
     return cudaGetLastError();
@@ -29,7 +29,6 @@
 #include <iostream>
 #include <memory>
 
-uint kNUM_BBOXES;
 uint kNUM_CLASSES;
 
 namespace {
@@ -49,131 +48,108 @@ namespace {
 }
 
 cudaError_t cudaYoloLayer_r(
-    const void* input, void* output, const uint& batchSize, const uint& netWidth, const uint& netHeight,
-    const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes,
-    uint64_t& outputSize, const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream);
+    const void* input, void* d_indexes, void* d_scores, void* d_boxes, void* d_classes, void* countData,
+    const uint& batchSize, uint64_t& inputSize, uint64_t& outputSize, const float& scoreThreshold, const uint& netWidth,
+    const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes,
+    const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream);
 
 cudaError_t cudaYoloLayer_nc(
-    const void* input, void* output, const uint& batchSize, const uint& netWidth, const uint& netHeight,
-    const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes,
-    uint64_t& outputSize, const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream);
+    const void* input, void* d_indexes, void* d_scores, void* d_boxes, void* d_classes, void* countData,
+    const uint& batchSize, uint64_t& inputSize, uint64_t& outputSize, const float& scoreThreshold, const uint& netWidth,
+    const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes,
+    const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream);
 
 cudaError_t cudaYoloLayer(
-    const void* input, void* output, const uint& batchSize, const uint& netWidth, const uint& netHeight,
-    const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes,
-    uint64_t& outputSize, const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream);
+    const void* input, void* d_indexes, void* d_scores, void* d_boxes, void* d_classes, void* countData,
+    const uint& batchSize, uint64_t& inputSize, uint64_t& outputSize, const float& scoreThreshold, const uint& netWidth,
+    const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes,
+    const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream);
 
 cudaError_t cudaRegionLayer(
-    const void* input, void* output, void* softmax, const uint& batchSize, const uint& netWidth,
-    const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses,
-    const uint& numBBoxes, uint64_t& outputSize, const void* anchors, cudaStream_t stream);
+    const void* input, void* softmax, void* d_indexes, void* d_scores, void* d_boxes, void* d_classes, void* countData,
+    const uint& batchSize, uint64_t& inputSize, uint64_t& outputSize, const float& scoreThreshold, const uint& netWidth,
+    const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes,
+    const void* anchors, cudaStream_t stream);
+
+cudaError_t sortDetections(
+    void* d_indexes, void* d_scores, void* d_boxes, void* d_classes, void* bboxData, void* scoreData, void* countData,
+    const uint& batchSize, uint64_t& outputSize, uint& topK, const uint& numOutputClasses, cudaStream_t stream);
 
 YoloLayer::YoloLayer (const void* data, size_t length)
 {
     const char *d = static_cast<const char*>(data);
 
-    read(d, m_NumBBoxes);
-    read(d, m_NumClasses);
     read(d, m_NetWidth);
     read(d, m_NetHeight);
-    read(d, m_GridSizeX);
-    read(d, m_GridSizeY);
-    read(d, m_Type);
+    read(d, m_NumClasses);
     read(d, m_NewCoords);
-    read(d, m_ScaleXY);
     read(d, m_OutputSize);
+    read(d, m_Type);
+    read(d, m_TopK);
+    read(d, m_ScoreThreshold);
+
+    uint yoloTensorsSize;
+    read(d, yoloTensorsSize);
+    for (uint i = 0; i < yoloTensorsSize; ++i)
+    {
+        TensorInfo curYoloTensor;
+        read(d, curYoloTensor.gridSizeX);
+        read(d, curYoloTensor.gridSizeY);
+        read(d, curYoloTensor.numBBoxes);
+        read(d, curYoloTensor.scaleXY);
 
     uint anchorsSize;
     read(d, anchorsSize);
-    for (uint i = 0; i < anchorsSize; i++) {
+    for (uint j = 0; j < anchorsSize; j++)
+    {
         float result;
         read(d, result);
-        m_Anchors.push_back(result);
+        curYoloTensor.anchors.push_back(result);
     }
 
     uint maskSize;
     read(d, maskSize);
-    for (uint i = 0; i < maskSize; i++) {
+    for (uint j = 0; j < maskSize; j++)
+    {
         int result;
         read(d, result);
-        m_Mask.push_back(result);
+        curYoloTensor.mask.push_back(result);
+    }
+    m_YoloTensors.push_back(curYoloTensor);
     }
 
-    if (m_Anchors.size() > 0) {
-        float* anchors = m_Anchors.data();
-        CUDA_CHECK(cudaMallocHost(&p_Anchors, m_Anchors.size() * sizeof(float)));
-        CUDA_CHECK(cudaMemcpy(p_Anchors, anchors, m_Anchors.size() * sizeof(float), cudaMemcpyHostToDevice));
-    }
-
-    if (m_Mask.size() > 0) {
-        int* mask = m_Mask.data();
-        CUDA_CHECK(cudaMallocHost(&p_Mask, m_Mask.size() * sizeof(int)));
-        CUDA_CHECK(cudaMemcpy(p_Mask, mask, m_Mask.size() * sizeof(int), cudaMemcpyHostToDevice));
-    }
-
-    kNUM_BBOXES = m_NumBBoxes;
     kNUM_CLASSES = m_NumClasses;
 };
 
 YoloLayer::YoloLayer(
-    const uint& numBBoxes, const uint& numClasses, const uint& netWidth, const uint& netHeight,
-    const uint& gridSizeX, const uint& gridSizeY, const uint& modelType, const uint& newCoords,
-    const float& scaleXY, const std::vector<float> anchors,
-    const std::vector<int> mask) :
-    m_NumBBoxes(numBBoxes),
-    m_NumClasses(numClasses),
+    const uint& netWidth, const uint& netHeight, const uint& numClasses, const uint& newCoords,
+    const std::vector<TensorInfo>& yoloTensors, const uint64_t& outputSize, const uint& modelType, const uint& topK,
+    const float& scoreThreshold) :
     m_NetWidth(netWidth),
     m_NetHeight(netHeight),
-    m_GridSizeX(gridSizeX),
-    m_GridSizeY(gridSizeY),
-    m_Type(modelType),
+    m_NumClasses(numClasses),
     m_NewCoords(newCoords),
-    m_ScaleXY(scaleXY),
-    m_Anchors(anchors),
-    m_Mask(mask)
+    m_YoloTensors(yoloTensors),
+    m_OutputSize(outputSize),
+    m_Type(modelType),
+    m_TopK(topK),
+    m_ScoreThreshold(scoreThreshold)
 {
-    assert(m_NumBBoxes > 0);
-    assert(m_NumClasses > 0);
     assert(m_NetWidth > 0);
     assert(m_NetHeight > 0);
-    assert(m_GridSizeX > 0);
-    assert(m_GridSizeY > 0);
-
-    m_OutputSize = m_GridSizeX * m_GridSizeY * (m_NumBBoxes * (4 + 1 + m_NumClasses));
-
-    if (m_Anchors.size() > 0) {
-        float* anchors = m_Anchors.data();
-        CUDA_CHECK(cudaMallocHost(&p_Anchors, m_Anchors.size() * sizeof(float)));
-        CUDA_CHECK(cudaMemcpy(p_Anchors, anchors, m_Anchors.size() * sizeof(float), cudaMemcpyHostToDevice));
-    }
-
-    if (m_Mask.size() > 0) {
-        int* mask = m_Mask.data();
-        CUDA_CHECK(cudaMallocHost(&p_Mask, m_Mask.size() * sizeof(int)));
-        CUDA_CHECK(cudaMemcpy(p_Mask, mask, m_Mask.size() * sizeof(int), cudaMemcpyHostToDevice));
-    }
-
-    kNUM_BBOXES = m_NumBBoxes;
 
     kNUM_CLASSES = m_NumClasses;
 };
 
-YoloLayer::~YoloLayer()
-{
-    if (m_Anchors.size() > 0) {
-        CUDA_CHECK(cudaFreeHost(p_Anchors));
-    }
-    if (m_Mask.size() > 0) {
-        CUDA_CHECK(cudaFreeHost(p_Mask));
-    }
-}
-
 nvinfer1::Dims
 YoloLayer::getOutputDimensions(
     int index, const nvinfer1::Dims* inputs, int nbInputDims) noexcept
 {
-    assert(index == 0);
-    assert(nbInputDims == 1);
-    return inputs[0];
+    assert(index < 3);
+    if (index == 0) {
+        return nvinfer1::Dims3(m_TopK, 1, 4);
+    }
+    return nvinfer1::DimsHW(m_TopK, m_NumClasses);
 }
 
 bool YoloLayer::supportsFormat (
@@ -188,43 +164,116 @@ YoloLayer::configureWithFormat (
     const nvinfer1::Dims* outputDims, int nbOutputs,
     nvinfer1::DataType type, nvinfer1::PluginFormat format, int maxBatchSize) noexcept
 {
-    assert(nbInputs == 1);
+    assert(nbInputs > 0);
     assert(format == nvinfer1::PluginFormat::kLINEAR);
     assert(inputDims != nullptr);
 }
 
 int32_t YoloLayer::enqueue (
-    int32_t batchSize, void const* const* inputs, void* const* outputs, void* workspace,
+    int batchSize, void const* const* inputs, void* const* outputs, void* workspace,
     cudaStream_t stream) noexcept
 {
+    void* countData = workspace;
+    void* bboxData = outputs[0];
+    void* scoreData = outputs[1];
+
+    CUDA_CHECK(cudaMemsetAsync((int*)countData, 0, sizeof(int) * batchSize, stream));
+    CUDA_CHECK(cudaMemsetAsync((float*)bboxData, 0, sizeof(float) * m_TopK * 4 * batchSize, stream));
+    CUDA_CHECK(cudaMemsetAsync((float*)scoreData, 0, sizeof(float) * m_TopK * m_NumClasses * batchSize, stream));
+
+    void* d_indexes;
+    CUDA_CHECK(cudaMallocHost(&d_indexes, sizeof(int) * m_OutputSize * batchSize));
+    CUDA_CHECK(cudaMemsetAsync((float*)d_indexes, 0, sizeof(int) * m_OutputSize * batchSize, stream));
+
+    void* d_scores;
+    CUDA_CHECK(cudaMallocHost(&d_scores, sizeof(float) * m_OutputSize * batchSize));
+    CUDA_CHECK(cudaMemsetAsync((float*)d_scores, 0, sizeof(float) * m_OutputSize * batchSize, stream));
+
+    void* d_boxes;
+    CUDA_CHECK(cudaMallocHost(&d_boxes, sizeof(float) * m_OutputSize * 4 * batchSize));
+    CUDA_CHECK(cudaMemsetAsync((float*)d_boxes, 0, sizeof(float) * m_OutputSize * 4 * batchSize, stream));
+
+    void* d_classes;
+    CUDA_CHECK(cudaMallocHost(&d_classes, sizeof(int) * m_OutputSize * batchSize));
+    CUDA_CHECK(cudaMemsetAsync((float*)d_classes, 0, sizeof(int) * m_OutputSize * batchSize, stream));
+
+    uint yoloTensorsSize = m_YoloTensors.size();
+    for (uint i = 0; i < yoloTensorsSize; ++i)
+    {
+        TensorInfo& curYoloTensor = m_YoloTensors.at(i);
+
+        uint numBBoxes = curYoloTensor.numBBoxes;
+        float scaleXY = curYoloTensor.scaleXY;
+        uint gridSizeX = curYoloTensor.gridSizeX;
+        uint gridSizeY = curYoloTensor.gridSizeY;
+        std::vector<float> anchors = curYoloTensor.anchors;
+        std::vector<int> mask = curYoloTensor.mask;
+
+        void* v_anchors;
+        void* v_mask;
+        if (anchors.size() > 0) {
+            float* f_anchors = anchors.data();
+            CUDA_CHECK(cudaMallocHost(&v_anchors, sizeof(float) * anchors.size()));
+            CUDA_CHECK(cudaMemcpy(v_anchors, f_anchors, sizeof(float) * anchors.size(), cudaMemcpyHostToDevice));
+        }
+        if (mask.size() > 0) {
+            int* f_mask = mask.data();
+            CUDA_CHECK(cudaMallocHost(&v_mask, sizeof(int) * mask.size()));
+            CUDA_CHECK(cudaMemcpy(v_mask, f_mask, sizeof(int) * mask.size(), cudaMemcpyHostToDevice));
+        }
+
+        uint64_t inputSize = gridSizeX * gridSizeY * (numBBoxes * (4 + 1 + m_NumClasses));
+
     if (m_Type == 2) { // YOLOR incorrect param: scale_x_y = 2.0
         CUDA_CHECK(cudaYoloLayer_r(
-            inputs[0], outputs[0], batchSize, m_NetWidth, m_NetHeight, m_GridSizeX, m_GridSizeY,
-            m_NumClasses, m_NumBBoxes, m_OutputSize, 2.0, p_Anchors, p_Mask, stream));
+            inputs[i], d_indexes, d_scores, d_boxes, d_classes, countData, batchSize, inputSize, m_OutputSize,
+            m_ScoreThreshold, m_NetWidth, m_NetHeight, gridSizeX, gridSizeY, m_NumClasses, numBBoxes, 2.0, v_anchors,
+            v_mask, stream));
     }
     else if (m_Type == 1) {
         if (m_NewCoords) {
             CUDA_CHECK(cudaYoloLayer_nc(
-                inputs[0], outputs[0], batchSize, m_NetWidth, m_NetHeight, m_GridSizeX, m_GridSizeY,
-                m_NumClasses, m_NumBBoxes, m_OutputSize, m_ScaleXY, p_Anchors, p_Mask, stream));
+                inputs[i], d_indexes, d_scores, d_boxes, d_classes, countData, batchSize, inputSize, m_OutputSize,
+                m_ScoreThreshold, m_NetWidth, m_NetHeight, gridSizeX, gridSizeY, m_NumClasses, numBBoxes, scaleXY,
+                v_anchors, v_mask, stream));
         }
         else {
            CUDA_CHECK(cudaYoloLayer(
-                inputs[0], outputs[0], batchSize, m_NetWidth, m_NetHeight, m_GridSizeX, m_GridSizeY,
-                m_NumClasses, m_NumBBoxes, m_OutputSize, m_ScaleXY, p_Anchors, p_Mask, stream));
+                inputs[i], d_indexes, d_scores, d_boxes, d_classes, countData, batchSize, inputSize, m_OutputSize,
+                m_ScoreThreshold, m_NetWidth, m_NetHeight, gridSizeX, gridSizeY, m_NumClasses, numBBoxes, scaleXY,
+                v_anchors, v_mask, stream));
        }
     }
    else {
        void* softmax;
-        cudaMallocHost(&softmax, sizeof(outputs[0]));
-        cudaMemcpy(softmax, outputs[0], sizeof(outputs[0]), cudaMemcpyHostToDevice);
+        CUDA_CHECK(cudaMallocHost(&softmax, sizeof(float) * inputSize * batchSize));
+        CUDA_CHECK(cudaMemsetAsync((float*)softmax, 0, sizeof(float) * inputSize * batchSize));
 
        CUDA_CHECK(cudaRegionLayer(
-            inputs[0], outputs[0], softmax, batchSize, m_NetWidth, m_NetHeight, m_GridSizeX, m_GridSizeY,
-            m_NumClasses, m_NumBBoxes, m_OutputSize, p_Anchors, stream));
+            inputs[i], softmax, d_indexes, d_scores, d_boxes, d_classes, countData, batchSize, inputSize, m_OutputSize,
+            m_ScoreThreshold, m_NetWidth, m_NetHeight, gridSizeX, gridSizeY, m_NumClasses, numBBoxes, v_anchors,
+            stream));
 
        CUDA_CHECK(cudaFreeHost(softmax));
    }
 
+        if (anchors.size() > 0) {
+            CUDA_CHECK(cudaFreeHost(v_anchors));
+        }
+        if (mask.size() > 0) {
+            CUDA_CHECK(cudaFreeHost(v_mask));
+        }
+    }
+
+    CUDA_CHECK(sortDetections(
+        d_indexes, d_scores, d_boxes, d_classes, bboxData, scoreData, countData, batchSize, m_OutputSize, m_TopK,
+        m_NumClasses, stream));
+
+    CUDA_CHECK(cudaFreeHost(d_indexes));
+    CUDA_CHECK(cudaFreeHost(d_scores));
+    CUDA_CHECK(cudaFreeHost(d_boxes));
+    CUDA_CHECK(cudaFreeHost(d_classes));
+
     return 0;
 }
 
@@ -232,18 +281,28 @@ size_t YoloLayer::getSerializationSize() const noexcept
 {
     size_t totalSize = 0;
 
-    totalSize += sizeof(m_NumBBoxes);
-    totalSize += sizeof(m_NumClasses);
     totalSize += sizeof(m_NetWidth);
     totalSize += sizeof(m_NetHeight);
-    totalSize += sizeof(m_GridSizeX);
-    totalSize += sizeof(m_GridSizeY);
-    totalSize += sizeof(m_Type);
+    totalSize += sizeof(m_NumClasses);
     totalSize += sizeof(m_NewCoords);
-    totalSize += sizeof(m_ScaleXY);
     totalSize += sizeof(m_OutputSize);
-    totalSize += sizeof(uint) + sizeof(m_Anchors[0]) * m_Anchors.size();
-    totalSize += sizeof(uint) + sizeof(m_Mask[0]) * m_Mask.size();
+    totalSize += sizeof(m_Type);
+    totalSize += sizeof(m_TopK);
+    totalSize += sizeof(m_ScoreThreshold);
+
+    uint yoloTensorsSize = m_YoloTensors.size();
+    totalSize += sizeof(yoloTensorsSize);
+
+    for (uint i = 0; i < yoloTensorsSize; ++i)
+    {
+        const TensorInfo& curYoloTensor = m_YoloTensors.at(i);
+        totalSize += sizeof(curYoloTensor.gridSizeX);
+        totalSize += sizeof(curYoloTensor.gridSizeY);
+        totalSize += sizeof(curYoloTensor.numBBoxes);
+        totalSize += sizeof(curYoloTensor.scaleXY);
+        totalSize += sizeof(uint) + sizeof(curYoloTensor.anchors[0]) * curYoloTensor.anchors.size();
+        totalSize += sizeof(uint) + sizeof(curYoloTensor.mask[0]) * curYoloTensor.mask.size();
+    }
 
     return totalSize;
 }
@@ -252,35 +311,46 @@ void YoloLayer::serialize(void* buffer) const noexcept
 {
     char *d = static_cast<char*>(buffer);
 
-    write(d, m_NumBBoxes);
-    write(d, m_NumClasses);
     write(d, m_NetWidth);
     write(d, m_NetHeight);
-    write(d, m_GridSizeX);
-    write(d, m_GridSizeY);
-    write(d, m_Type);
+    write(d, m_NumClasses);
     write(d, m_NewCoords);
-    write(d, m_ScaleXY);
     write(d, m_OutputSize);
+    write(d, m_Type);
+    write(d, m_TopK);
+    write(d, m_ScoreThreshold);
 
-    uint anchorsSize = m_Anchors.size();
+    uint yoloTensorsSize = m_YoloTensors.size();
+    write(d, yoloTensorsSize);
+    for (uint i = 0; i < yoloTensorsSize; ++i)
+    {
+        const TensorInfo& curYoloTensor = m_YoloTensors.at(i);
+        write(d, curYoloTensor.gridSizeX);
+        write(d, curYoloTensor.gridSizeY);
+        write(d, curYoloTensor.numBBoxes);
+        write(d, curYoloTensor.scaleXY);
+
+        uint anchorsSize = curYoloTensor.anchors.size();
     write(d, anchorsSize);
-    for (uint i = 0; i < anchorsSize; i++) {
-        write(d, m_Anchors[i]);
+    for (uint j = 0; j < anchorsSize; ++j)
+    {
+        write(d, curYoloTensor.anchors[j]);
    }
 
-    uint maskSize = m_Mask.size();
+    uint maskSize = curYoloTensor.mask.size();
     write(d, maskSize);
-    for (uint i = 0; i < maskSize; i++) {
-        write(d, m_Mask[i]);
+    for (uint j = 0; j < maskSize; ++j)
+    {
+        write(d, curYoloTensor.mask[j]);
+    }
     }
 }
 
 nvinfer1::IPluginV2* YoloLayer::clone() const noexcept
 {
     return new YoloLayer (
-        m_NumBBoxes, m_NumClasses, m_NetWidth, m_NetHeight, m_GridSizeX, m_GridSizeY, m_Type,
-        m_NewCoords, m_ScaleXY, m_Anchors, m_Mask);
+        m_NetWidth, m_NetHeight, m_NumClasses, m_NewCoords, m_YoloTensors, m_OutputSize, m_Type, m_TopK,
+        m_ScoreThreshold);
 }
 
 REGISTER_TENSORRT_PLUGIN(YoloLayerPluginCreator);
|||||||
@@ -36,12 +36,14 @@
|
|||||||
|
|
||||||
#include "NvInferPlugin.h"
|
#include "NvInferPlugin.h"
|
||||||
|
|
||||||
|
#include "yolo.h"
|
||||||
|
|
||||||
#define CUDA_CHECK(status) \
|
#define CUDA_CHECK(status) \
|
||||||
{ \
|
{ \
|
||||||
if (status != 0) \
|
if (status != 0) \
|
||||||
{ \
|
{ \
|
||||||
std::cout << "CUDA failure: " << cudaGetErrorString(status) << " in file " << __FILE__ \
|
std::cout << "CUDA failure: " << cudaGetErrorString(status) << " in file " << __FILE__ << " at line " \
|
||||||
<< " at line " << __LINE__ << std::endl; \
|
<< __LINE__ << std::endl; \
|
||||||
abort(); \
|
abort(); \
|
||||||
} \
|
} \
|
||||||
}
|
}
|
||||||
@@ -56,15 +58,17 @@ class YoloLayer : public nvinfer1::IPluginV2
|
|||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
YoloLayer (const void* data, size_t length);
|
YoloLayer (const void* data, size_t length);
|
||||||
|
|
||||||
YoloLayer (
|
YoloLayer (
|
||||||
const uint& numBBoxes, const uint& numClasses, const uint& netWidth, const uint& netHeight,
|
const uint& netWidth, const uint& netHeight, const uint& numClasses, const uint& newCoords,
|
||||||
const uint& gridSizeX, const uint& gridSizeY, const uint& modelType, const uint& newCoords,
|
const std::vector<TensorInfo>& yoloTensors, const uint64_t& outputSize, const uint& modelType, const uint& topK,
|
||||||
const float& scaleXY, const std::vector<float> anchors,
|
const float& scoreThreshold);
|
||||||
const std::vector<int> mask);
|
|
||||||
~YoloLayer ();
|
|
||||||
const char* getPluginType () const noexcept override { return YOLOLAYER_PLUGIN_NAME; }
|
const char* getPluginType () const noexcept override { return YOLOLAYER_PLUGIN_NAME; }
|
||||||
|
|
||||||
const char* getPluginVersion () const noexcept override { return YOLOLAYER_PLUGIN_VERSION; }
|
const char* getPluginVersion () const noexcept override { return YOLOLAYER_PLUGIN_VERSION; }
|
||||||
int getNbOutputs () const noexcept override { return 1; }
|
|
||||||
|
int getNbOutputs () const noexcept override { return 2; }
|
||||||
|
|
||||||
nvinfer1::Dims getOutputDimensions (
|
nvinfer1::Dims getOutputDimensions (
|
||||||
int index, const nvinfer1::Dims* inputs,
|
int index, const nvinfer1::Dims* inputs,
|
||||||
@@ -74,53 +78,59 @@ public:
         nvinfer1::DataType type, nvinfer1::PluginFormat format) const noexcept override;
 
     void configureWithFormat (
-        const nvinfer1::Dims* inputDims, int nbInputs,
-        const nvinfer1::Dims* outputDims, int nbOutputs,
+        const nvinfer1::Dims* inputDims, int nbInputs, const nvinfer1::Dims* outputDims, int nbOutputs,
         nvinfer1::DataType type, nvinfer1::PluginFormat format, int maxBatchSize) noexcept override;
 
     int initialize () noexcept override { return 0; }
 
     void terminate () noexcept override {}
 
-    size_t getWorkspaceSize (int maxBatchSize) const noexcept override { return 0; }
+    size_t getWorkspaceSize (int maxBatchSize) const noexcept override {
+        return maxBatchSize * sizeof(int);
+    }
 
     int32_t enqueue (
-        int32_t batchSize, void const* const* inputs, void* const* outputs,
-        void* workspace, cudaStream_t stream) noexcept override;
+        int batchSize, void const* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream)
+        noexcept override;
 
     size_t getSerializationSize() const noexcept override;
 
     void serialize (void* buffer) const noexcept override;
 
     void destroy () noexcept override { delete this; }
 
     nvinfer1::IPluginV2* clone() const noexcept override;
 
     void setPluginNamespace (const char* pluginNamespace) noexcept override {
         m_Namespace = pluginNamespace;
     }
 
     virtual const char* getPluginNamespace () const noexcept override {
         return m_Namespace.c_str();
     }
 
 private:
     std::string m_Namespace {""};
-    uint m_NumBBoxes {0};
-    uint m_NumClasses {0};
     uint m_NetWidth {0};
     uint m_NetHeight {0};
-    uint m_GridSizeX {0};
-    uint m_GridSizeY {0};
-    uint m_Type {0};
+    uint m_NumClasses {0};
     uint m_NewCoords {0};
-    float m_ScaleXY {0};
-    std::vector<float> m_Anchors;
-    std::vector<int> m_Mask;
+    std::vector<TensorInfo> m_YoloTensors;
     uint64_t m_OutputSize {0};
-    void* p_Anchors;
-    void* p_Mask;
+    uint m_Type {0};
+    uint m_TopK {0};
+    float m_ScoreThreshold {0};
 };
 
 class YoloLayerPluginCreator : public nvinfer1::IPluginCreator
 {
 public:
     YoloLayerPluginCreator () {}
 
     ~YoloLayerPluginCreator () {}
 
     const char* getPluginName () const noexcept override { return YOLOLAYER_PLUGIN_NAME; }
 
     const char* getPluginVersion () const noexcept override { return YOLOLAYER_PLUGIN_VERSION; }
 
     const nvinfer1::PluginFieldCollection* getFieldNames() noexcept override {
@@ -153,7 +163,6 @@ private:
     std::string m_Namespace {""};
 };
 
-extern uint kNUM_BBOXES;
 extern uint kNUM_CLASSES;
 
 #endif // __YOLO_PLUGINS__
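The header changes above switch the plugin to two outputs and reserve one `int` of workspace per batch image, which points to a per-image detection counter alongside a fixed-size top-K detection buffer. The sketch below only illustrates how such buffer sizes could be computed; the number of fields per detection and all names are assumptions, not taken from the plugin.

```
// Standalone sketch: sizing a per-image counter plus a fixed top-K detection
// buffer. fieldsPerDet = 6 (x, y, w, h, score, class id) is an assumption for
// illustration; the plugin's real output layout may differ.
#include <cstddef>
#include <cstdio>

int main() {
  const int maxBatchSize = 4;           // hypothetical engine batch size
  const unsigned int topK = 300;        // matches topk in config_nms.txt
  const unsigned int fieldsPerDet = 6;  // assumed detection layout

  const std::size_t countBytes = static_cast<std::size_t>(maxBatchSize) * sizeof(int);
  const std::size_t detBytes =
      static_cast<std::size_t>(maxBatchSize) * topK * fieldsPerDet * sizeof(float);

  std::printf("per-image counters: %zu bytes, detection buffer: %zu bytes\n",
              countBytes, detBytes);
  return 0;
}
```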
112 readme.md
@@ -25,7 +25,7 @@ NVIDIA DeepStream SDK 6.1 / 6.0.1 / 6.0 configuration for YOLO models
 * YOLOR native support
 * Models benchmarks (**outdated**)
 * **GPU YOLO Decoder (moved from CPU to GPU to get better performance)** [#138](https://github.com/marcoslucianops/DeepStream-Yolo/issues/138)
-* **Improved NMS** [#142](https://github.com/marcoslucianops/DeepStream-Yolo/issues/142)
+* **GPU Batched NMS** [#142](https://github.com/marcoslucianops/DeepStream-Yolo/issues/142)
 
 ##
 
@@ -38,6 +38,7 @@ NVIDIA DeepStream SDK 6.1 / 6.0.1 / 6.0 configuration for YOLO models
 * [Basic usage](#basic-usage)
 * [YOLOv5 usage](#yolov5-usage)
 * [YOLOR usage](#yolor-usage)
+* [NMS configuration](#nms-configuration)
 * [INT8 calibration](#int8-calibration)
 * [Using your custom model](docs/customModels.md)
 
@@ -101,67 +102,7 @@ NVIDIA DeepStream SDK 6.1 / 6.0.1 / 6.0 configuration for YOLO models
 
 ### Benchmarks
 
-```
-nms-iou-threshold = 0.6
-pre-cluster-threshold = 0.001 (mAP eval) / 0.25 (FPS measurement)
-batch-size = 1
-valid = val2017 (COCO) - 1000 random images for INT8 calibration
-sample = 1920x1080 video
-NOTE: Used maintain-aspect-ratio=1 in config_infer file for YOLOv4 (with letter_box=1), YOLOv5 and YOLOR models.
-```
-
-#### NVIDIA GTX 1050 4GB (Mobile)
-
-##### YOLOR-CSP performance comparison
-
-| | DeepStream | PyTorch |
-|:---------------------:|:----------:|:-------:|
-| FPS (without display) | 13.32 | 10.07 |
-| FPS (with display) | 12.63 | 9.41 |
-
-##### YOLOv5n performance comparison
-
-| | DeepStream | TensorRTx | Ultralytics |
-|:---------------------:|:----------:|:---------:|:-----------:|
-| FPS (without display) | 110.25 | 87.42 | 97.19 |
-| FPS (with display) | 105.62 | 73.07 | 50.37 |
-
-<details><summary>More</summary>
-<br>
-
-| DeepStream | Precision | Resolution | IoU=0.5:0.95 | IoU=0.5 | IoU=0.75 | FPS<br />(without display) |
-|:------------------:|:---------:|:----------:|:------------:|:-------:|:--------:|:--------------------------:|
-| YOLOR-P6 | FP32 | 1280 | 0.478 | 0.663 | 0.519 | 5.53 |
-| YOLOR-CSP-X* | FP32 | 640 | 0.473 | 0.664 | 0.513 | 7.59 |
-| YOLOR-CSP-X | FP32 | 640 | 0.470 | 0.661 | 0.507 | 7.52 |
-| YOLOR-CSP* | FP32 | 640 | 0.459 | 0.652 | 0.496 | 13.28 |
-| YOLOR-CSP | FP32 | 640 | 0.449 | 0.639 | 0.483 | 13.32 |
-| YOLOv5x6 6.0 | FP32 | 1280 | 0.504 | 0.681 | 0.547 | 2.22 |
-| YOLOv5l6 6.0 | FP32 | 1280 | 0.492 | 0.670 | 0.535 | 4.05 |
-| YOLOv5m6 6.0 | FP32 | 1280 | 0.463 | 0.642 | 0.504 | 7.54 |
-| YOLOv5s6 6.0 | FP32 | 1280 | 0.394 | 0.572 | 0.424 | 18.64 |
-| YOLOv5n6 6.0 | FP32 | 1280 | 0.294 | 0.452 | 0.314 | 26.94 |
-| YOLOv5x 6.0 | FP32 | 640 | 0.469 | 0.654 | 0.509 | 8.24 |
-| YOLOv5l 6.0 | FP32 | 640 | 0.450 | 0.634 | 0.487 | 14.96 |
-| YOLOv5m 6.0 | FP32 | 640 | 0.415 | 0.601 | 0.448 | 28.30 |
-| YOLOv5s 6.0 | FP32 | 640 | 0.334 | 0.516 | 0.355 | 63.55 |
-| YOLOv5n 6.0 | FP32 | 640 | 0.250 | 0.417 | 0.260 | 110.25 |
-| YOLOv4-P6 | FP32 | 1280 | 0.499 | 0.685 | 0.542 | 2.57 |
-| YOLOv4-P5 | FP32 | 896 | 0.472 | 0.659 | 0.513 | 5.48 |
-| YOLOv4-CSP-X-SWISH | FP32 | 640 | 0.473 | 0.664 | 0.513 | 7.51 |
-| YOLOv4-CSP-SWISH | FP32 | 640 | 0.459 | 0.652 | 0.496 | 13.13 |
-| YOLOv4x-MISH | FP32 | 640 | 0.459 | 0.650 | 0.495 | 7.53 |
-| YOLOv4-CSP | FP32 | 640 | 0.440 | 0.632 | 0.474 | 13.19 |
-| YOLOv4 | FP32 | 608 | 0.498 | 0.740 | 0.549 | 12.18 |
-| YOLOv4-Tiny | FP32 | 416 | 0.215 | 0.403 | 0.206 | 201.20 |
-| YOLOv3-SPP | FP32 | 608 | 0.411 | 0.686 | 0.433 | 12.22 |
-| YOLOv3-Tiny-PRN | FP32 | 416 | 0.167 | 0.382 | 0.125 | 277.14 |
-| YOLOv3 | FP32 | 608 | 0.377 | 0.672 | 0.385 | 12.51 |
-| YOLOv3-Tiny | FP32 | 416 | 0.095 | 0.203 | 0.079 | 218.42 |
-| YOLOv2 | FP32 | 608 | 0.286 | 0.541 | 0.273 | 25.28 |
-| YOLOv2-Tiny | FP32 | 416 | 0.102 | 0.258 | 0.061 | 231.36 |
-
-</details>
+New tests coming soon.
 
 ##
 
@@ -221,9 +162,11 @@ wget https://us.download.nvidia.com/tesla/510.47.03/NVIDIA-Linux-x86_64-510.47.0
 * Run
 
 ```
-sudo sh NVIDIA-Linux-x86_64-510.47.03.run --silent --disable-nouveau
+sudo sh NVIDIA-Linux-x86_64-510.47.03.run --silent --disable-nouveau --dkms --install-libglvnd
 ```
 
+**NOTE**: This step will disable the nouveau drivers.
+
 * Reboot
 
 ```
@@ -233,7 +176,7 @@ sudo reboot
 * Install
 
 ```
-sudo sh NVIDIA-Linux-x86_64-510.47.03.run --silent --dkms --install-libglvnd
+sudo sh NVIDIA-Linux-x86_64-510.47.03.run --silent --disable-nouveau --dkms --install-libglvnd
 ```
 
 **NOTE**: If you are using a laptop with NVIDIA Optimus, run
@@ -326,7 +269,7 @@ sudo apt install libssl1.0.0 libgstreamer1.0-0 gstreamer1.0-tools gstreamer1.0-p
 sudo apt-get install linux-headers-$(uname -r)
 ```
 
-**NOTE**: Install DKMS only if you are using the default Ubuntu kernel
+**NOTE**: Install DKMS only if you are using the default Ubuntu kernel.
 
 ```
 sudo apt-get install dkms
@@ -369,9 +312,11 @@ wget https://us.download.nvidia.com/tesla/470.129.06/NVIDIA-Linux-x86_64-470.129
 * Run
 
 ```
-sudo sh NVIDIA-Linux-x86_64-470.129.06.run --silent --disable-nouveau
+sudo sh NVIDIA-Linux-x86_64-470.129.06.run --silent --disable-nouveau --dkms --install-libglvnd
 ```
 
+**NOTE**: This step will disable the nouveau drivers.
+
 * Reboot
 
 ```
@@ -381,7 +326,7 @@ sudo reboot
 * Install
 
 ```
-sudo sh NVIDIA-Linux-x86_64-470.129.06.run --silent --dkms --install-libglvnd
+sudo sh NVIDIA-Linux-x86_64-470.129.06.run --silent --disable-nouveau --dkms --install-libglvnd
 ```
 
 **NOTE**: If you are using a laptop with NVIDIA Optimus, run
@@ -519,11 +464,6 @@ network-mode=0
 # Number of classes in label file
 num-detected-classes=80
 ...
-[class-attrs-all]
-# IOU threshold
-nms-iou-threshold=0.45
-# Score threshold
-pre-cluster-threshold=0.25
 ```
 
 #### 5. Run
@@ -542,6 +482,7 @@ gpu-id=0
 gie-unique-id=1
 nvbuf-memory-type=0
 config-file=config_infer_primary_yoloV2.txt
+...
 ```
 
 ##
@@ -618,11 +559,6 @@ network-mode=0
 # Number of classes in label file
 num-detected-classes=80
 ...
-[class-attrs-all]
-# IOU threshold
-nms-iou-threshold=0.45
-# Score threshold
-pre-cluster-threshold=0.25
 ```
 
 #### 8. Change the deepstream_app_config.txt file
@@ -749,11 +685,6 @@ network-mode=0
 # Number of classes in label file
 num-detected-classes=80
 ...
-[class-attrs-all]
-# IOU threshold
-nms-iou-threshold=0.5
-# Score threshold
-pre-cluster-threshold=0.25
 ```
 
 #### 8. Change the deepstream_app_config.txt file
@@ -776,6 +707,23 @@ deepstream-app -c deepstream_app_config.txt
 
 ##
 
+### NMS Configuration
+
+To change the `iou-threshold`, `score-threshold` and `topk` values, modify the `config_nms.txt` file and regenerate the model engine file.
+
+**NOTE**: Lower `topk` values give better performance.
+
+**NOTE**: Make sure to set `cluster-mode=4` and `pre-cluster-threshold=0` in the config_infer file.
+
+```
+[property]
+iou-threshold=0.45
+score-threshold=0.25
+topk=300
+```
+
+##
+
 ### INT8 calibration
 
 #### 1. Install OpenCV
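The `config_nms.txt` keys added in the NMS Configuration section above are plain `key=value` pairs under a `[property]` section. As a hedged illustration of how such a file could be read (the repository's own parser may differ, and the struct and function names below are invented for the example):

```
// Minimal sketch of parsing config_nms.txt-style key=value pairs.
// Illustrative only; not the repository's actual parser.
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>

struct NmsConfig {
  float iouThreshold = 0.45f;
  float scoreThreshold = 0.25f;
  unsigned int topK = 300;
};

NmsConfig loadNmsConfig(const std::string& path) {
  NmsConfig cfg;
  std::ifstream file(path);
  std::string line;
  while (std::getline(file, line)) {
    if (line.empty() || line[0] == '[' || line[0] == '#')
      continue;  // skip section headers and comments
    std::istringstream kv(line);
    std::string key, value;
    if (std::getline(kv, key, '=') && std::getline(kv, value)) {
      if (key == "iou-threshold") cfg.iouThreshold = std::stof(value);
      else if (key == "score-threshold") cfg.scoreThreshold = std::stof(value);
      else if (key == "topk") cfg.topK = static_cast<unsigned int>(std::stoul(value));
    }
  }
  return cfg;
}

int main() {
  NmsConfig cfg = loadNmsConfig("config_nms.txt");
  std::cout << cfg.iouThreshold << " " << cfg.scoreThreshold << " " << cfg.topK << "\n";
  return 0;
}
```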
@@ -144,6 +144,7 @@ with open(cfg_file, "w") as c:
     c.write("width=%d\n" % model_width)
     c.write("height=%d\n" % model_height)
     c.write("channels=%d\n" % model_channels)
+    c.write("letter_box=1\n")
     nc = 0
     depth_multiple = 0
     width_multiple = 0