Support for YOLOv5 4.0

Added support for YOLOv5 4.0
This commit is contained in:
Marcos Luciano
2021-05-03 21:01:05 -03:00
parent 170f9cac74
commit ae849d2f8b
10 changed files with 995 additions and 2 deletions

View File

@@ -0,0 +1,52 @@
#
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
CUDA_VER?=
ifeq ($(CUDA_VER),)
$(error "CUDA_VER is not set")
endif
CC:= g++
NVCC:=/usr/local/cuda-$(CUDA_VER)/bin/nvcc
CFLAGS:= -Wall -std=c++11 -shared -fPIC -Wno-error=deprecated-declarations
CFLAGS+= -I../../includes -I/usr/local/cuda-$(CUDA_VER)/include
LIBS:= -lnvinfer_plugin -lnvinfer -lnvparsers -L/usr/local/cuda-$(CUDA_VER)/lib64 -lcudart -lcublas -lstdc++fs
LFLAGS:= -shared -Wl,--start-group $(LIBS) -Wl,--end-group
INCS:= $(wildcard *.h)
SRCFILES:= nvdsparsebbox_Yolo.cpp \
yololayer.cu
TARGET_LIB:= libnvdsinfer_custom_impl_Yolo.so
TARGET_OBJS:= $(SRCFILES:.cpp=.o)
TARGET_OBJS:= $(TARGET_OBJS:.cu=.o)
all: $(TARGET_LIB)
%.o: %.cpp $(INCS) Makefile
$(CC) -c -o $@ $(CFLAGS) $<
%.o: %.cu $(INCS) Makefile
$(NVCC) -c -o $@ --compiler-options '-fPIC' $<
$(TARGET_LIB) : $(TARGET_OBJS)
$(CC) -o $@ $(TARGET_OBJS) $(LFLAGS)
clean:
rm -rf $(TARGET_LIB)
rm -rf $(TARGET_OBJS)

View File

@@ -0,0 +1,18 @@
#ifndef TRTX_CUDA_UTILS_H_
#define TRTX_CUDA_UTILS_H_
#include <cuda_runtime_api.h>
#ifndef CUDA_CHECK
#define CUDA_CHECK(callstr)\
{\
cudaError_t error_code = callstr;\
if (error_code != cudaSuccess) {\
std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__;\
assert(0);\
}\
}
#endif // CUDA_CHECK
#endif // TRTX_CUDA_UTILS_H_

View File

@@ -0,0 +1,122 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstring>
#include <fstream>
#include <iostream>
#include <unordered_map>
#include "nvdsinfer_custom_impl.h"
#include <map>
#define kNMS_THRESH 0.45
static constexpr int LOCATIONS = 4;
struct alignas(float) Detection{
//center_x center_y w h
float bbox[LOCATIONS];
float conf; // bbox_conf * cls_conf
float class_id;
};
float iou(float lbox[4], float rbox[4]) {
float interBox[] = {
std::max(lbox[0] - lbox[2]/2.f , rbox[0] - rbox[2]/2.f), //left
std::min(lbox[0] + lbox[2]/2.f , rbox[0] + rbox[2]/2.f), //right
std::max(lbox[1] - lbox[3]/2.f , rbox[1] - rbox[3]/2.f), //top
std::min(lbox[1] + lbox[3]/2.f , rbox[1] + rbox[3]/2.f), //bottom
};
if(interBox[2] > interBox[3] || interBox[0] > interBox[1])
return 0.0f;
float interBoxS =(interBox[1]-interBox[0])*(interBox[3]-interBox[2]);
return interBoxS/(lbox[2]*lbox[3] + rbox[2]*rbox[3] -interBoxS);
}
bool cmp(Detection& a, Detection& b) {
return a.conf > b.conf;
}
void nms(std::vector<Detection>& res, float *output, float conf_thresh, float nms_thresh) {
int det_size = sizeof(Detection) / sizeof(float);
std::map<float, std::vector<Detection>> m;
for (int i = 0; i < output[0] && i < 1000; i++) {
if (output[1 + det_size * i + 4] <= conf_thresh) continue;
Detection det;
memcpy(&det, &output[1 + det_size * i], det_size * sizeof(float));
if (m.count(det.class_id) == 0) m.emplace(det.class_id, std::vector<Detection>());
m[det.class_id].push_back(det);
}
for (auto it = m.begin(); it != m.end(); it++) {
auto& dets = it->second;
std::sort(dets.begin(), dets.end(), cmp);
for (size_t m = 0; m < dets.size(); ++m) {
auto& item = dets[m];
res.push_back(item);
for (size_t n = m + 1; n < dets.size(); ++n) {
if (iou(item.bbox, dets[n].bbox) > nms_thresh) {
dets.erase(dets.begin()+n);
--n;
}
}
}
}
}
/* This is a sample bounding box parsing function for the sample YoloV5 detector model */
static bool NvDsInferParseYoloV5(
std::vector<NvDsInferLayerInfo> const& outputLayersInfo,
NvDsInferNetworkInfo const& networkInfo,
NvDsInferParseDetectionParams const& detectionParams,
std::vector<NvDsInferParseObjectInfo>& objectList)
{
const float kCONF_THRESH = detectionParams.perClassThreshold[0];
std::vector<Detection> res;
nms(res, (float*)(outputLayersInfo[0].buffer), kCONF_THRESH, kNMS_THRESH);
for(auto& r : res) {
NvDsInferParseObjectInfo oinfo;
oinfo.classId = r.class_id;
oinfo.left = static_cast<unsigned int>(r.bbox[0]-r.bbox[2]*0.5f);
oinfo.top = static_cast<unsigned int>(r.bbox[1]-r.bbox[3]*0.5f);
oinfo.width = static_cast<unsigned int>(r.bbox[2]);
oinfo.height = static_cast<unsigned int>(r.bbox[3]);
oinfo.detectionConfidence = r.conf;
objectList.push_back(oinfo);
}
return true;
}
extern "C" bool NvDsInferParseCustomYoloV5(
std::vector<NvDsInferLayerInfo> const &outputLayersInfo,
NvDsInferNetworkInfo const &networkInfo,
NvDsInferParseDetectionParams const &detectionParams,
std::vector<NvDsInferParseObjectInfo> &objectList)
{
return NvDsInferParseYoloV5(
outputLayersInfo, networkInfo, detectionParams, objectList);
}
/* Check that the custom function has been defined correctly */
CHECK_CUSTOM_PARSE_FUNC_PROTOTYPE(NvDsInferParseCustomYoloV5);

View File

@@ -0,0 +1,333 @@
#include <assert.h>
#include <vector>
#include <iostream>
#include "yololayer.h"
#include "cuda_utils.h"
namespace Tn
{
template<typename T>
void write(char*& buffer, const T& val)
{
*reinterpret_cast<T*>(buffer) = val;
buffer += sizeof(T);
}
template<typename T>
void read(const char*& buffer, T& val)
{
val = *reinterpret_cast<const T*>(buffer);
buffer += sizeof(T);
}
}
using namespace Yolo;
namespace nvinfer1
{
YoloLayerPlugin::YoloLayerPlugin(int classCount, int netWidth, int netHeight, int maxOut, const std::vector<Yolo::YoloKernel>& vYoloKernel)
{
mClassCount = classCount;
mYoloV5NetWidth = netWidth;
mYoloV5NetHeight = netHeight;
mMaxOutObject = maxOut;
mYoloKernel = vYoloKernel;
mKernelCount = vYoloKernel.size();
CUDA_CHECK(cudaMallocHost(&mAnchor, mKernelCount * sizeof(void*)));
size_t AnchorLen = sizeof(float)* CHECK_COUNT * 2;
for (int ii = 0; ii < mKernelCount; ii++)
{
CUDA_CHECK(cudaMalloc(&mAnchor[ii], AnchorLen));
const auto& yolo = mYoloKernel[ii];
CUDA_CHECK(cudaMemcpy(mAnchor[ii], yolo.anchors, AnchorLen, cudaMemcpyHostToDevice));
}
}
YoloLayerPlugin::~YoloLayerPlugin()
{
for (int ii = 0; ii < mKernelCount; ii++)
{
CUDA_CHECK(cudaFree(mAnchor[ii]));
}
CUDA_CHECK(cudaFreeHost(mAnchor));
}
// create the plugin at runtime from a byte stream
YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length)
{
using namespace Tn;
const char *d = reinterpret_cast<const char *>(data), *a = d;
read(d, mClassCount);
read(d, mThreadCount);
read(d, mKernelCount);
read(d, mYoloV5NetWidth);
read(d, mYoloV5NetHeight);
read(d, mMaxOutObject);
mYoloKernel.resize(mKernelCount);
auto kernelSize = mKernelCount * sizeof(YoloKernel);
memcpy(mYoloKernel.data(), d, kernelSize);
d += kernelSize;
CUDA_CHECK(cudaMallocHost(&mAnchor, mKernelCount * sizeof(void*)));
size_t AnchorLen = sizeof(float)* CHECK_COUNT * 2;
for (int ii = 0; ii < mKernelCount; ii++)
{
CUDA_CHECK(cudaMalloc(&mAnchor[ii], AnchorLen));
const auto& yolo = mYoloKernel[ii];
CUDA_CHECK(cudaMemcpy(mAnchor[ii], yolo.anchors, AnchorLen, cudaMemcpyHostToDevice));
}
assert(d == a + length);
}
void YoloLayerPlugin::serialize(void* buffer) const
{
using namespace Tn;
char* d = static_cast<char*>(buffer), *a = d;
write(d, mClassCount);
write(d, mThreadCount);
write(d, mKernelCount);
write(d, mYoloV5NetWidth);
write(d, mYoloV5NetHeight);
write(d, mMaxOutObject);
auto kernelSize = mKernelCount * sizeof(YoloKernel);
memcpy(d, mYoloKernel.data(), kernelSize);
d += kernelSize;
assert(d == a + getSerializationSize());
}
size_t YoloLayerPlugin::getSerializationSize() const
{
return sizeof(mClassCount) + sizeof(mThreadCount) + sizeof(mKernelCount) + sizeof(Yolo::YoloKernel) * mYoloKernel.size() + sizeof(mYoloV5NetWidth) + sizeof(mYoloV5NetHeight) + sizeof(mMaxOutObject);
}
int YoloLayerPlugin::initialize()
{
return 0;
}
Dims YoloLayerPlugin::getOutputDimensions(int index, const Dims* inputs, int nbInputDims)
{
//output the result to channel
int totalsize = mMaxOutObject * sizeof(Detection) / sizeof(float);
return Dims3(totalsize + 1, 1, 1);
}
// Set plugin namespace
void YoloLayerPlugin::setPluginNamespace(const char* pluginNamespace)
{
mPluginNamespace = pluginNamespace;
}
const char* YoloLayerPlugin::getPluginNamespace() const
{
return mPluginNamespace;
}
// Return the DataType of the plugin output at the requested index
DataType YoloLayerPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const
{
return DataType::kFLOAT;
}
// Return true if output tensor is broadcast across a batch.
bool YoloLayerPlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const
{
return false;
}
// Return true if plugin can use input that is broadcast across batch without replication.
bool YoloLayerPlugin::canBroadcastInputAcrossBatch(int inputIndex) const
{
return false;
}
void YoloLayerPlugin::configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput)
{
}
// Attach the plugin object to an execution context and grant the plugin the access to some context resource.
void YoloLayerPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator)
{
}
// Detach the plugin object from its execution context.
void YoloLayerPlugin::detachFromContext() {}
const char* YoloLayerPlugin::getPluginType() const
{
return "YoloLayer_TRT";
}
const char* YoloLayerPlugin::getPluginVersion() const
{
return "1";
}
void YoloLayerPlugin::destroy()
{
delete this;
}
// Clone the plugin
IPluginV2IOExt* YoloLayerPlugin::clone() const
{
YoloLayerPlugin* p = new YoloLayerPlugin(mClassCount, mYoloV5NetWidth, mYoloV5NetHeight, mMaxOutObject, mYoloKernel);
p->setPluginNamespace(mPluginNamespace);
return p;
}
__device__ float Logist(float data) { return 1.0f / (1.0f + expf(-data)); };
__global__ void CalDetection(const float *input, float *output, int noElements,
const int netwidth, const int netheight, int maxoutobject, int yoloWidth, int yoloHeight, const float anchors[CHECK_COUNT * 2], int classes, int outputElem)
{
int idx = threadIdx.x + blockDim.x * blockIdx.x;
if (idx >= noElements) return;
int total_grid = yoloWidth * yoloHeight;
int bnIdx = idx / total_grid;
idx = idx - total_grid * bnIdx;
int info_len_i = 5 + classes;
const float* curInput = input + bnIdx * (info_len_i * total_grid * CHECK_COUNT);
for (int k = 0; k < 3; ++k) {
float box_prob = Logist(curInput[idx + k * info_len_i * total_grid + 4 * total_grid]);
if (box_prob < IGNORE_THRESH) continue;
int class_id = 0;
float max_cls_prob = 0.0;
for (int i = 5; i < info_len_i; ++i) {
float p = Logist(curInput[idx + k * info_len_i * total_grid + i * total_grid]);
if (p > max_cls_prob) {
max_cls_prob = p;
class_id = i - 5;
}
}
float *res_count = output + bnIdx * outputElem;
int count = (int)atomicAdd(res_count, 1);
if (count >= maxoutobject) return;
char* data = (char *)res_count + sizeof(float) + count * sizeof(Detection);
Detection* det = (Detection*)(data);
int row = idx / yoloWidth;
int col = idx % yoloWidth;
//Location
// pytorch:
// y = x[i].sigmoid()
// y[..., 0:2] = (y[..., 0:2] * 2. - 0.5 + self.grid[i].to(x[i].device)) * self.stride[i] # xy
// y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i] # wh
// X: (sigmoid(tx) + cx)/FeaturemapW * netwidth
det->bbox[0] = (col - 0.5f + 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 0 * total_grid])) * netwidth / yoloWidth;
det->bbox[1] = (row - 0.5f + 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 1 * total_grid])) * netheight / yoloHeight;
// W: (Pw * e^tw) / FeaturemapW * netwidth
// v5: https://github.com/ultralytics/yolov5/issues/471
det->bbox[2] = 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 2 * total_grid]);
det->bbox[2] = det->bbox[2] * det->bbox[2] * anchors[2 * k];
det->bbox[3] = 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 3 * total_grid]);
det->bbox[3] = det->bbox[3] * det->bbox[3] * anchors[2 * k + 1];
det->conf = box_prob * max_cls_prob;
det->class_id = class_id;
}
}
void YoloLayerPlugin::forwardGpu(const float *const * inputs, float* output, cudaStream_t stream, int batchSize)
{
int outputElem = 1 + mMaxOutObject * sizeof(Detection) / sizeof(float);
for (int idx = 0; idx < batchSize; ++idx) {
CUDA_CHECK(cudaMemset(output + idx * outputElem, 0, sizeof(float)));
}
int numElem = 0;
for (unsigned int i = 0; i < mYoloKernel.size(); ++i)
{
const auto& yolo = mYoloKernel[i];
numElem = yolo.width*yolo.height*batchSize;
if (numElem < mThreadCount)
mThreadCount = numElem;
//printf("Net: %d %d \n", mYoloV5NetWidth, mYoloV5NetHeight);
CalDetection << < (yolo.width*yolo.height*batchSize + mThreadCount - 1) / mThreadCount, mThreadCount, 0, stream >> >
(inputs[i], output, numElem, mYoloV5NetWidth, mYoloV5NetHeight, mMaxOutObject, yolo.width, yolo.height, (float *)mAnchor[i], mClassCount, outputElem);
}
}
int YoloLayerPlugin::enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream)
{
forwardGpu((const float *const *)inputs, (float*)outputs[0], stream, batchSize);
return 0;
}
PluginFieldCollection YoloPluginCreator::mFC{};
std::vector<PluginField> YoloPluginCreator::mPluginAttributes;
YoloPluginCreator::YoloPluginCreator()
{
mPluginAttributes.clear();
mFC.nbFields = mPluginAttributes.size();
mFC.fields = mPluginAttributes.data();
}
const char* YoloPluginCreator::getPluginName() const
{
return "YoloLayer_TRT";
}
const char* YoloPluginCreator::getPluginVersion() const
{
return "1";
}
const PluginFieldCollection* YoloPluginCreator::getFieldNames()
{
return &mFC;
}
IPluginV2IOExt* YoloPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc)
{
int class_count = -1;
int input_w = -1;
int input_h = -1;
int max_output_object_count = -1;
std::vector<Yolo::YoloKernel> yolo_kernels(3);
const PluginField* fields = fc->fields;
for (int i = 0; i < fc->nbFields; i++) {
if (strcmp(fields[i].name, "netdata") == 0) {
assert(fields[i].type == PluginFieldType::kFLOAT32);
int *tmp = (int*)(fields[i].data);
class_count = tmp[0];
input_w = tmp[1];
input_h = tmp[2];
max_output_object_count = tmp[3];
} else if (strstr(fields[i].name, "yolodata") != NULL) {
assert(fields[i].type == PluginFieldType::kFLOAT32);
int *tmp = (int*)(fields[i].data);
YoloKernel kernel;
kernel.width = tmp[0];
kernel.height = tmp[1];
for (int j = 0; j < fields[i].length - 2; j++) {
kernel.anchors[j] = tmp[j + 2];
}
yolo_kernels[2 - (fields[i].name[8] - '1')] = kernel;
}
}
assert(class_count && input_w && input_h && max_output_object_count);
YoloLayerPlugin* obj = new YoloLayerPlugin(class_count, input_w, input_h, max_output_object_count, yolo_kernels);
obj->setPluginNamespace(mNamespace.c_str());
return obj;
}
IPluginV2IOExt* YoloPluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength)
{
// This object will be deleted when the network is destroyed, which will
// call YoloLayerPlugin::destroy()
YoloLayerPlugin* obj = new YoloLayerPlugin(serialData, serialLength);
obj->setPluginNamespace(mNamespace.c_str());
return obj;
}
}

View File

@@ -0,0 +1,137 @@
#ifndef _YOLO_LAYER_H
#define _YOLO_LAYER_H
#include <vector>
#include <string>
#include "NvInfer.h"
namespace Yolo
{
static constexpr int CHECK_COUNT = 3;
static constexpr float IGNORE_THRESH = 0.1f;
struct YoloKernel
{
int width;
int height;
float anchors[CHECK_COUNT * 2];
};
static constexpr int MAX_OUTPUT_BBOX_COUNT = 1000;
static constexpr int CLASS_NUM = 80;
static constexpr int INPUT_H = 640; // yolov5's input height and width must be divisible by 32.
static constexpr int INPUT_W = 640;
static constexpr int LOCATIONS = 4;
struct alignas(float) Detection {
//center_x center_y w h
float bbox[LOCATIONS];
float conf; // bbox_conf * cls_conf
float class_id;
};
}
namespace nvinfer1
{
class YoloLayerPlugin : public IPluginV2IOExt
{
public:
YoloLayerPlugin(int classCount, int netWidth, int netHeight, int maxOut, const std::vector<Yolo::YoloKernel>& vYoloKernel);
YoloLayerPlugin(const void* data, size_t length);
~YoloLayerPlugin();
int getNbOutputs() const override
{
return 1;
}
Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override;
int initialize() override;
virtual void terminate() override {};
virtual size_t getWorkspaceSize(int maxBatchSize) const override { return 0; }
virtual int enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) override;
virtual size_t getSerializationSize() const override;
virtual void serialize(void* buffer) const override;
bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const override {
return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT;
}
const char* getPluginType() const override;
const char* getPluginVersion() const override;
void destroy() override;
IPluginV2IOExt* clone() const override;
void setPluginNamespace(const char* pluginNamespace) override;
const char* getPluginNamespace() const override;
DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const override;
bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const override;
bool canBroadcastInputAcrossBatch(int inputIndex) const override;
void attachToContext(
cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) override;
void configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) override;
void detachFromContext() override;
private:
void forwardGpu(const float *const * inputs, float * output, cudaStream_t stream, int batchSize = 1);
int mThreadCount = 256;
const char* mPluginNamespace;
int mKernelCount;
int mClassCount;
int mYoloV5NetWidth;
int mYoloV5NetHeight;
int mMaxOutObject;
std::vector<Yolo::YoloKernel> mYoloKernel;
void** mAnchor;
};
class YoloPluginCreator : public IPluginCreator
{
public:
YoloPluginCreator();
~YoloPluginCreator() override = default;
const char* getPluginName() const override;
const char* getPluginVersion() const override;
const PluginFieldCollection* getFieldNames() override;
IPluginV2IOExt* createPlugin(const char* name, const PluginFieldCollection* fc) override;
IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData, size_t serialLength) override;
void setPluginNamespace(const char* libNamespace) override
{
mNamespace = libNamespace;
}
const char* getPluginNamespace() const override
{
return mNamespace.c_str();
}
private:
std::string mNamespace;
static PluginFieldCollection mFC;
static std::vector<PluginField> mPluginAttributes;
};
REGISTER_TENSORRT_PLUGIN(YoloPluginCreator);
};
#endif