New features

- Added support for INT8 calibration - Added support for non-square models - Updated mAP comparison between models
2021-06-18 00:30:10 -03:00
parent 312e9a448d
commit cbd9675dc2
74 changed files with 3287 additions and 700 deletions
--- a/examples/multiple_inferences/deepstream_app_config.txt
+++ b/examples/multiple_inferences/deepstream_app_config.txt
@@ -14,7 +14,7 @@ nvbuf-memory-type=0
 [source0]
 enable=1
 type=3
-uri=rtsp://192.168.1.2/Streaming/Channels/101/httppreview
+uri=file:///opt/nvidia/deepstream/deepstream-5.1/samples/streams/sample_1080p_h264.mp4
 num-sources=1
 gpu-id=0
 cudadec-memtype=0
--- a/examples/multiple_inferences/pgie/config_infer_primary.txt
+++ b/examples/multiple_inferences/pgie/config_infer_primary.txt
@@ -4,10 +4,11 @@ net-scale-factor=0.0039215697906911373
 model-color-format=0
 custom-network-config=pgie/yolo.cfg
 model-file=yolo.weights
-model-engine-file=model_b1_gpu0_fp16.engine
+model-engine-file=model_b1_gpu0_fp32.engine
 #int8-calib-file=calib.table
 labelfile-path=labels.txt
 batch-size=1
-network-mode=2
+network-mode=0
 num-detected-classes=2
 interval=0
 gie-unique-id=1
--- a/examples/multiple_inferences/pgie/nvdsinfer_custom_impl_Yolo/Makefile
+++ b/examples/multiple_inferences/pgie/nvdsinfer_custom_impl_Yolo/Makefile
@@ -27,13 +27,25 @@ CUDA_VER?=
 ifeq ($(CUDA_VER),)
  $(error "CUDA_VER is not set")
 endif
 OPENCV?=
 ifeq ($(OPENCV),)
  OPENCV=0
 endif
 CC:= g++
 NVCC:=/usr/local/cuda-$(CUDA_VER)/bin/nvcc
 CFLAGS:= -Wall -std=c++11 -shared -fPIC -Wno-error=deprecated-declarations
-CFLAGS+= -I../../../includes -I/usr/local/cuda-$(CUDA_VER)/include
+CFLAGS+= -I/opt/nvidia/deepstream/deepstream-5.1/sources/includes -I/usr/local/cuda-$(CUDA_VER)/include
-LIBS:= -lnvinfer_plugin -lnvinfer -lnvparsers -L/usr/local/cuda-$(CUDA_VER)/lib64 -lcudart -lcublas -lstdc++fs
+ifeq ($(OPENCV), 1)
 COMMON= -DOPENCV
 CFLAGS+= $(shell pkg-config --cflags opencv4 2> /dev/null || pkg-config --cflags opencv)
 LIBS+= $(shell pkg-config --libs opencv4 2> /dev/null || pkg-config --libs opencv)
 endif
 LIBS+= -lnvinfer_plugin -lnvinfer -lnvparsers -L/usr/local/cuda-$(CUDA_VER)/lib64 -lcudart -lcublas -lstdc++fs
 LFLAGS:= -shared -Wl,--start-group $(LIBS) -Wl,--end-group
 INCS:= $(wildcard *.h)
@@ -50,6 +62,11 @@ SRCFILES:= nvdsinfer_yolo_engine.cpp \
           utils.cpp \
           yolo.cpp \
           yoloForward.cu
 ifeq ($(OPENCV), 1)
 SRCFILES+= calibrator.cpp
 endif
 TARGET_LIB:= libnvdsinfer_custom_impl_Yolo.so
 TARGET_OBJS:= $(SRCFILES:.cpp=.o)
@@ -58,7 +75,7 @@ TARGET_OBJS:= $(TARGET_OBJS:.cu=.o)
 all: $(TARGET_LIB)
 %.o: %.cpp $(INCS) Makefile
-	$(CC) -c -o $@ $(CFLAGS) $<
+	$(CC) -c $(COMMON) -o $@ $(CFLAGS) $<
 %.o: %.cu $(INCS) Makefile
 	$(NVCC) -c -o $@ --compiler-options '-fPIC' $<
--- a/examples/multiple_inferences/pgie/nvdsinfer_custom_impl_Yolo/calibrator.cpp
+++ b/examples/multiple_inferences/pgie/nvdsinfer_custom_impl_Yolo/calibrator.cpp
@@ -0,0 +1,130 @@
 /*
 * Created by Marcos Luciano
 * https://www.github.com/marcoslucianops
 */
 #include "calibrator.h"
 #include <fstream>
 #include <iterator>
 namespace nvinfer1
 {
    // Builds the calibrator: stores the network input geometry, reads the list of
    // calibration image paths (one path per line) from imgPath, and allocates the
    // host and device staging buffers that hold one batch of float input data.
    //
    // batchsize      : number of images handed to TensorRT per getBatch() call.
    // channels/height/width : network input dimensions.
    // letterbox      : resize mode forwarded unchanged to prepareImage().
    // imgPath        : text file listing the calibration images.
    // calibTablePath : file used to read and write the calibration cache.
    int8EntroyCalibrator::int8EntroyCalibrator(const int &batchsize, const int &channels, const int &height, const int &width, const int &letterbox, const std::string &imgPath,
        const std::string &calibTablePath):batchSize(batchsize), inputC(channels), inputH(height), inputW(width), letterBox(letterbox), calibTablePath(calibTablePath), imageIndex(0),
        inputCount(static_cast<size_t>(batchsize) * channels * height * width), readCache(true)
    {
        // Open read-only: std::fstream defaults to in|out and fails on files without write permission.
        std::ifstream listFile(imgPath);
        if (!listFile.is_open())
        {
            std::cerr << "Failed to open INT8 calibration image list: " << imgPath << std::endl;
        }
        std::string line;
        while (std::getline(listFile, line))
        {
            // Tolerate CRLF line endings and blank lines so they do not become bogus image paths.
            if (!line.empty() && line.back() == '\r')
                line.pop_back();
            if (!line.empty())
                imgPaths.push_back(line);
        }
        batchData = new float[inputCount];
        CUDA_CHECK(cudaMalloc(&deviceInput, inputCount * sizeof(float)));
    }
    // Frees the device staging buffer and the host batch buffer allocated by the constructor.
    int8EntroyCalibrator::~int8EntroyCalibrator()
    {
        CUDA_CHECK(cudaFree(deviceInput));
        // delete[] on a null pointer is a no-op, so no explicit null test is required.
        delete[] batchData;
    }
    bool int8EntroyCalibrator::getBatch(void **bindings, const char **names, int nbBindings)
    {
        if (imageIndex + batchSize > uint(imgPaths.size()))
            return false;
        float* ptr = batchData;
        for (size_t j = imageIndex; j < imageIndex + batchSize; ++j)
        {
            cv::Mat img = cv::imread(imgPaths[j], cv::IMREAD_COLOR);
            std::vector<float>inputData = prepareImage(img, inputC, inputH, inputW, letterBox);
            int len = (int)(inputData.size());
            memcpy(ptr, inputData.data(), len * sizeof(float));
            ptr += inputData.size();
            std::cout << "Load image: " << imgPaths[j] << std::endl;
            std::cout << "Progress: " << (j + 1)*100. / imgPaths.size() << "%" << std::endl;
        }
        imageIndex += batchSize;
        CUDA_CHECK(cudaMemcpy(deviceInput, batchData, inputCount * sizeof(float), cudaMemcpyHostToDevice));
        bindings[0] = deviceInput;
        return true;
    }
    // Reloads the calibration table file into calibrationCache when readCache is set and
    // the file opened successfully. length receives the number of bytes held; the return
    // value is the cached data, or nullptr when the cache is empty.
    const void* int8EntroyCalibrator::readCalibrationCache(std::size_t &length)
    {
        calibrationCache.clear();

        std::ifstream cacheFile(calibTablePath, std::ios::binary);
        // Keep whitespace bytes: the iterator below must see every character of the file.
        cacheFile >> std::noskipws;

        const bool canRead = readCache && cacheFile.good();
        if (canRead)
        {
            std::istream_iterator<char> first(cacheFile);
            std::istream_iterator<char> last;
            calibrationCache.assign(first, last);
        }

        length = calibrationCache.size();
        if (length == 0)
            return nullptr;
        return calibrationCache.data();
    }
    // Writes length bytes starting at cache to the calibration table file in binary mode,
    // replacing any previous content of that file.
    void int8EntroyCalibrator::writeCalibrationCache(const void *cache, std::size_t length)
    {
        const char *bytes = reinterpret_cast<const char*>(cache);
        std::ofstream tableFile(calibTablePath, std::ios::binary);
        tableFile.write(bytes, length);
    }
 }
 /*
  * Converts an image into the planar float buffer fed to the INT8 calibrator.
  *
  * img        : source image, expected in BGR channel order; it is not modified.
  * input_c    : number of network input channels (1 selects grayscale, otherwise RGB).
  * input_h/w  : network input height and width in pixels.
  * letter_box : 2 keeps the aspect ratio and pads the bottom/right with black;
  *              any other value stretches the image to input_w x input_h.
  *              NOTE(review): other code in this project tests letter_box against 1 - confirm the intended value.
  *
  * Returns input_c * input_h * input_w floats scaled by 1/255, stored plane by plane.
  * Throws cv::Exception when img is empty.
  */
 std::vector<float> prepareImage(cv::Mat& img, int input_c, int input_h, int input_w, int letter_box)
 {
    // An empty image would divide by zero in the letter-box arithmetic below.
    CV_Assert(!img.empty());

    cv::Mat out;
    if (letter_box == 2)
    {
        int image_w = img.cols;
        int image_h = img.rows;
        int resize_w = 0;
        int resize_h = 0;
        int offset_bottom = 0;
        int offset_right = 0;
        if ((float)input_h / image_h > (float)input_w / image_w)
        {
            // Width is the limiting side: fill it and pad below.
            resize_w = input_w;
            resize_h = (input_w * image_h) / image_w;
            // Integer division can reach 0 for extreme aspect ratios; cv::resize needs a non-empty size.
            if (resize_h < 1)
                resize_h = 1;
            offset_bottom = input_h - resize_h;
        }
        else
        {
            // Height is the limiting side: fill it and pad on the right.
            resize_h = input_h;
            resize_w = (input_h * image_w) / image_h;
            if (resize_w < 1)
                resize_w = 1;
            offset_right = input_w - resize_w;
        }
        cv::resize(img, out, cv::Size(resize_w, resize_h), 0, 0, cv::INTER_CUBIC);
        cv::copyMakeBorder(out, out, 0, offset_bottom, 0, offset_right, cv::BORDER_CONSTANT, cv::Scalar(0, 0, 0));
    }
    else
    {
        cv::resize(img, out, cv::Size(input_w, input_h), 0, 0, cv::INTER_CUBIC);
    }

    // A 1-channel network input needs a true grayscale image; BGR2RGB would leave three
    // planes and only the first of them would be copied below.
    if (input_c == 1)
    {
        cv::cvtColor(out, out, cv::COLOR_BGR2GRAY);
    }
    else
    {
        cv::cvtColor(out, out, cv::COLOR_BGR2RGB);
    }

    // convertTo changes only the depth; the channel count stays that of the source.
    out.convertTo(out, CV_32F, 1.0 / 255.0);

    std::vector<cv::Mat> input_channels(input_c);
    cv::split(out, input_channels);

    std::vector<float> result(input_h * input_w * input_c);
    auto data = result.data();
    int channelLength = input_h * input_w;
    // cv::split resizes the vector to the image's channel count, so never index past it.
    const int planes = static_cast<int>(input_channels.size());
    for (int i = 0; i < input_c && i < planes; ++i)
    {
        memcpy(data, input_channels[i].data, channelLength * sizeof(float));
        data += channelLength;
    }
    return result;
 }
--- a/examples/multiple_inferences/pgie/nvdsinfer_custom_impl_Yolo/calibrator.h
+++ b/examples/multiple_inferences/pgie/nvdsinfer_custom_impl_Yolo/calibrator.h
@@ -0,0 +1,62 @@
 /*
 * Created by Marcos Luciano
 * https://www.github.com/marcoslucianops
 */
 #ifndef CALIBRATOR_H
 #define CALIBRATOR_H
 #include "opencv2/opencv.hpp"
 #include "cuda_runtime.h"
 #include "NvInfer.h"
 #include <vector>
 #include <string>
 /*
  * CUDA_CHECK(call): evaluates a CUDA runtime call once and, on failure, prints the
  * numeric error code, its description and the call site to stderr, then trips assert(0).
  * Only defined when no earlier header has already provided CUDA_CHECK.
  * NOTE(review): assert is compiled out when NDEBUG is defined, so in such builds a
  * failure would only be logged - confirm whether a hard abort is wanted instead.
  */
 #ifndef CUDA_CHECK
 #define CUDA_CHECK(callstr)                                                                    \
    {                                                                                          \
        cudaError_t error_code = (callstr);                                                    \
        if (error_code != cudaSuccess) {                                                       \
            std::cerr << "CUDA error " << error_code << " (" << cudaGetErrorString(error_code) \
                      << ") at " << __FILE__ << ":" << __LINE__ << std::endl;                  \
            assert(0);                                                                         \
        }                                                                                      \
    }
 #endif
 namespace nvinfer1 {
    // INT8 entropy calibrator that feeds TensorRT with batches of images listed in a text
    // file and persists the resulting calibration table to disk.
    class int8EntroyCalibrator : public nvinfer1::IInt8EntropyCalibrator2 {
    public:
        // batchsize: images per calibration batch; channels/height/width: network input
        // dimensions; letterbox: resize mode passed to prepareImage(); imgPath: text file
        // with one image path per line; calibTablePath: calibration cache file.
        int8EntroyCalibrator(const int &batchsize,
                             const int &channels,
                             const int &height,
                             const int &width,
                             const int &letterbox,
                             const std::string &imgPath,
                             const std::string &calibTablePath);
        virtual ~int8EntroyCalibrator();

        // Number of images supplied per getBatch() call.
        int getBatchSize() const override { return batchSize; }
        // Fills bindings[0] with the next device batch; returns false when images run out.
        bool getBatch(void *bindings[], const char *names[], int nbBindings) override;
        // Returns the cached calibration table (nullptr when empty) and its length.
        const void *readCalibrationCache(std::size_t &length) override;
        // Stores the calibration table produced by TensorRT in calibTablePath.
        void writeCalibrationCache(const void *ptr, std::size_t length) override;

    private:
        int batchSize{ 0 };
        int inputC{ 0 };
        int inputH{ 0 };
        int inputW{ 0 };
        int letterBox{ 0 };
        std::string calibTablePath;
        size_t imageIndex{ 0 };              // index of the next image to load
        size_t inputCount{ 0 };              // number of floats in one batch
        std::vector<std::string> imgPaths;   // calibration image paths read from the list file
        float *batchData{ nullptr };         // host staging buffer for one batch
        void  *deviceInput{ nullptr };       // device buffer handed to TensorRT
        // Defaulted so readCalibrationCache() never tests an indeterminate value.
        bool readCache{ true };
        std::vector<char> calibrationCache;
    };
 }
 std::vector<float> prepareImage(cv::Mat& img, int input_c, int input_h, int input_w, int letter_box);
 #endif //CALIBRATOR_H
--- a/examples/multiple_inferences/pgie/nvdsinfer_custom_impl_Yolo/layers/activation_layer.cpp
+++ b/examples/multiple_inferences/pgie/nvdsinfer_custom_impl_Yolo/layers/activation_layer.cpp
--- a/examples/multiple_inferences/pgie/nvdsinfer_custom_impl_Yolo/layers/activation_layer.h
+++ b/examples/multiple_inferences/pgie/nvdsinfer_custom_impl_Yolo/layers/activation_layer.h
--- a/examples/multiple_inferences/pgie/nvdsinfer_custom_impl_Yolo/layers/convolutional_layer.cpp
+++ b/examples/multiple_inferences/pgie/nvdsinfer_custom_impl_Yolo/layers/convolutional_layer.cpp
--- a/examples/multiple_inferences/pgie/nvdsinfer_custom_impl_Yolo/layers/convolutional_layer.h
+++ b/examples/multiple_inferences/pgie/nvdsinfer_custom_impl_Yolo/layers/convolutional_layer.h
--- a/examples/multiple_inferences/pgie/nvdsinfer_custom_impl_Yolo/layers/dropout_layer.cpp
+++ b/examples/multiple_inferences/pgie/nvdsinfer_custom_impl_Yolo/layers/dropout_layer.cpp
--- a/examples/multiple_inferences/pgie/nvdsinfer_custom_impl_Yolo/layers/dropout_layer.h
+++ b/examples/multiple_inferences/pgie/nvdsinfer_custom_impl_Yolo/layers/dropout_layer.h
--- a/examples/multiple_inferences/pgie/nvdsinfer_custom_impl_Yolo/layers/maxpool_layer.cpp
+++ b/examples/multiple_inferences/pgie/nvdsinfer_custom_impl_Yolo/layers/maxpool_layer.cpp
--- a/examples/multiple_inferences/pgie/nvdsinfer_custom_impl_Yolo/layers/maxpool_layer.h
+++ b/examples/multiple_inferences/pgie/nvdsinfer_custom_impl_Yolo/layers/maxpool_layer.h
--- a/examples/multiple_inferences/pgie/nvdsinfer_custom_impl_Yolo/layers/route_layer.cpp
+++ b/examples/multiple_inferences/pgie/nvdsinfer_custom_impl_Yolo/layers/route_layer.cpp
--- a/examples/multiple_inferences/pgie/nvdsinfer_custom_impl_Yolo/layers/route_layer.h
+++ b/examples/multiple_inferences/pgie/nvdsinfer_custom_impl_Yolo/layers/route_layer.h
--- a/examples/multiple_inferences/pgie/nvdsinfer_custom_impl_Yolo/layers/shortcut_layer.cpp
+++ b/examples/multiple_inferences/pgie/nvdsinfer_custom_impl_Yolo/layers/shortcut_layer.cpp
--- a/examples/multiple_inferences/pgie/nvdsinfer_custom_impl_Yolo/layers/shortcut_layer.h
+++ b/examples/multiple_inferences/pgie/nvdsinfer_custom_impl_Yolo/layers/shortcut_layer.h
--- a/examples/multiple_inferences/pgie/nvdsinfer_custom_impl_Yolo/layers/upsample_layer.cpp
+++ b/examples/multiple_inferences/pgie/nvdsinfer_custom_impl_Yolo/layers/upsample_layer.cpp
@@ -0,0 +1,24 @@
 /*
 * Created by Marcos Luciano
 * https://www.github.com/marcoslucianops
 */
 #include "upsample_layer.h"
 /*
  * Adds a nearest-neighbour resize layer that implements a Darknet "upsample" block.
  *
  * layerIdx : index used to build the layer name "upsample_<layerIdx>".
  * block    : parsed cfg block; must have type "upsample" and an integer "stride".
  * input    : tensor to upsample.
  * network  : network definition the layer is added to.
  *
  * Returns the created resize layer.
  */
 nvinfer1::ILayer* upsampleLayer(
    int layerIdx,
    std::map<std::string, std::string>& block,
    nvinfer1::ITensor* input,
    nvinfer1::INetworkDefinition* network)
 {
    assert(block.at("type") == "upsample");
    const int stride = std::stoi(block.at("stride"));
    // A zero or negative stride would produce an invalid scale factor.
    assert(stride > 0);

    nvinfer1::IResizeLayer* resize_layer = network->addResize(*input);
    assert(resize_layer != nullptr);
    resize_layer->setResizeMode(nvinfer1::ResizeMode::kNEAREST);

    // Explicit casts: an int inside a braced float initializer is a narrowing conversion.
    // The first dimension keeps scale 1 (presumably channels in a CHW tensor - TODO confirm).
    const float factor = static_cast<float>(stride);
    const float scale[3] = {1.0f, factor, factor};
    resize_layer->setScales(scale, 3);

    std::string layer_name = "upsample_" + std::to_string(layerIdx);
    resize_layer->setName(layer_name.c_str());
    return resize_layer;
 }
--- a/examples/multiple_inferences/pgie/nvdsinfer_custom_impl_Yolo/layers/upsample_layer.h
+++ b/examples/multiple_inferences/pgie/nvdsinfer_custom_impl_Yolo/layers/upsample_layer.h
@@ -15,9 +15,6 @@
 nvinfer1::ILayer* upsampleLayer(
    int layerIdx,
    std::map<std::string, std::string>& block,
    std::vector<float>& weights,
    std::vector<nvinfer1::Weights>& trtWeights,
    int& inputChannels,
    nvinfer1::ITensor* input,
    nvinfer1::INetworkDefinition* network);
--- a/examples/multiple_inferences/pgie/nvdsinfer_custom_impl_Yolo/nvdsinfer_yolo_engine.cpp
+++ b/examples/multiple_inferences/pgie/nvdsinfer_custom_impl_Yolo/nvdsinfer_yolo_engine.cpp
@@ -45,9 +45,20 @@ static bool getYoloNetworkInfo (NetworkInfo &networkInfo, const NvDsInferContext
    networkInfo.networkType     = yoloType;
    networkInfo.configFilePath  = initParams->customNetworkConfigFilePath;
    networkInfo.wtsFilePath     = initParams->modelFilePath;
    networkInfo.int8CalibPath   = initParams->int8CalibrationFilePath;
    networkInfo.deviceType      = (initParams->useDLA ? "kDLA" : "kGPU");
    networkInfo.inputBlobName   = "data";
    if(initParams->networkMode == 0) {
        networkInfo.networkMode = "FP32";
    }
    else if(initParams->networkMode == 1) {
        networkInfo.networkMode = "INT8";
    }
    else if(initParams->networkMode == 2) {
        networkInfo.networkMode = "FP16";
    }
    if (networkInfo.configFilePath.empty() ||
        networkInfo.wtsFilePath.empty()) {
        std::cerr << "YOLO config file or weights file is not specified"
--- a/examples/multiple_inferences/pgie/nvdsinfer_custom_impl_Yolo/nvdsparsebbox_Yolo.cpp
+++ b/examples/multiple_inferences/pgie/nvdsinfer_custom_impl_Yolo/nvdsparsebbox_Yolo.cpp
@@ -302,7 +302,6 @@ static bool NvDsInferParseYolo(
        const uint gridSizeH = layer.inferDims.d[1];
        const uint gridSizeW = layer.inferDims.d[2];
        const uint stride = DIVUP(networkInfo.width, gridSizeW);
        //assert(stride == DIVUP(networkInfo.height, gridSizeH));
        std::vector<NvDsInferParseObjectInfo> outObjs =
            decodeYoloTensor((const float*)(layer.buffer), masks[idx], anchors, gridSizeW, gridSizeH, stride, masks[idx].size(),
@@ -344,7 +343,6 @@ static bool NvDsInferParseYoloV2(
    const uint gridSizeH = layer.inferDims.d[1];
    const uint gridSizeW = layer.inferDims.d[2];
    const uint stride = DIVUP(networkInfo.width, gridSizeW);
    //assert(stride == DIVUP(networkInfo.height, gridSizeH));
    for (auto& anchor : anchors) {
        anchor *= stride;
    }
--- a/examples/multiple_inferences/pgie/nvdsinfer_custom_impl_Yolo/utils.cpp
+++ b/examples/multiple_inferences/pgie/nvdsinfer_custom_impl_Yolo/utils.cpp
--- a/examples/multiple_inferences/pgie/nvdsinfer_custom_impl_Yolo/utils.h
+++ b/examples/multiple_inferences/pgie/nvdsinfer_custom_impl_Yolo/utils.h
--- a/examples/multiple_inferences/pgie/nvdsinfer_custom_impl_Yolo/yolo.cpp
+++ b/examples/multiple_inferences/pgie/nvdsinfer_custom_impl_Yolo/yolo.cpp
@@ -25,6 +25,11 @@
 #include "yolo.h"
 #include "yoloPlugins.h"
 #include <stdlib.h>
 #ifdef OPENCV
 #include "calibrator.h"
 #endif
 void orderParams(std::vector<std::vector<int>> *maskVector) {
    std::vector<std::vector<int>> maskinput = *maskVector;
@@ -45,6 +50,8 @@ Yolo::Yolo(const NetworkInfo& networkInfo)
    : m_NetworkType(networkInfo.networkType), // YOLO type
      m_ConfigFilePath(networkInfo.configFilePath), // YOLO cfg
      m_WtsFilePath(networkInfo.wtsFilePath), // YOLO weights
      m_Int8CalibPath(networkInfo.int8CalibPath), // INT8 calibration path
      m_NetworkMode(networkInfo.networkMode), // FP32, INT8, FP16
      m_DeviceType(networkInfo.deviceType), // kDLA, kGPU
      m_InputBlobName(networkInfo.inputBlobName), // data
      m_InputH(0),
@@ -62,6 +69,38 @@ nvinfer1::ICudaEngine *Yolo::createEngine (nvinfer1::IBuilder* builder)
 {
    assert (builder);
    m_ConfigBlocks = parseConfigFile(m_ConfigFilePath);
    parseConfigBlocks();
    orderParams(&m_OutputMasks);
    if (m_NetworkMode == "INT8" && !fileExists(m_Int8CalibPath)) {
        assert(builder->platformHasFastInt8());
 #ifdef OPENCV
        std::string calib_image_list;
        int calib_batch_size;
        if (getenv("INT8_CALIB_IMG_PATH")) {
            calib_image_list = getenv("INT8_CALIB_IMG_PATH");
        }
        else {
            std::cerr << "INT8_CALIB_IMG_PATH not set" << std::endl;
            std::abort();
        }
        if (getenv("INT8_CALIB_BATCH_SIZE")) {
            calib_batch_size = std::stoi(getenv("INT8_CALIB_BATCH_SIZE"));
        }
        else {
            std::cerr << "INT8_CALIB_BATCH_SIZE not set" << std::endl;
            std::abort();
        }
        nvinfer1::int8EntroyCalibrator *calibrator = new nvinfer1::int8EntroyCalibrator(calib_batch_size, m_InputC, m_InputH, m_InputW, m_LetterBox, calib_image_list, m_Int8CalibPath);
        builder->setInt8Mode(true);
        builder->setInt8Calibrator(calibrator);
 #else
        std::cerr << "OpenCV is required to run INT8 calibrator" << std::endl;
        std::abort();
 #endif
    }
    std::vector<float> weights = loadWeights(m_WtsFilePath, m_NetworkType);
    std::vector<nvinfer1::Weights> trtWeights;
@@ -71,8 +110,12 @@ nvinfer1::ICudaEngine *Yolo::createEngine (nvinfer1::IBuilder* builder)
        return nullptr;
    }
    // Build the engine
    std::cout << "Building the TensorRT Engine" << std::endl;
    if (m_LetterBox == 1) {
        std::cout << "\nNOTE: letter_box is set in cfg file, make sure to set maintain-aspect-ratio=1 in config_infer file to get better accuracy\n" << std::endl;
    }
    nvinfer1::ICudaEngine * engine = builder->buildCudaEngine(*network);
    if (engine) {
        std::cout << "Building complete\n" << std::endl;
@@ -80,7 +123,6 @@ nvinfer1::ICudaEngine *Yolo::createEngine (nvinfer1::IBuilder* builder)
        std::cerr << "Building engine failed\n" << std::endl;
    }
    // destroy
    network->destroy();
    return engine;
 }
@@ -88,12 +130,7 @@ nvinfer1::ICudaEngine *Yolo::createEngine (nvinfer1::IBuilder* builder)
 NvDsInferStatus Yolo::parseModel(nvinfer1::INetworkDefinition& network) {
    destroyNetworkUtils();
    m_ConfigBlocks = parseConfigFile(m_ConfigFilePath);
    parseConfigBlocks();
    orderParams(&m_OutputMasks);
    std::vector<float> weights = loadWeights(m_WtsFilePath, m_NetworkType);
    // build yolo network
    std::cout << "Building YOLO network" << std::endl;
    NvDsInferStatus status = buildYoloNetwork(weights, network);
@@ -121,9 +158,7 @@ NvDsInferStatus Yolo::buildYoloNetwork(
    std::vector<nvinfer1::ITensor*> tensorOutputs;
    uint outputTensorCount = 0;
    // build the network using the network API
    for (uint i = 0; i < m_ConfigBlocks.size(); ++i) {
        // check if num. of channels is correct
        assert(getNumChannels(previous) == channels);
        std::string layerIndex = "(" + std::to_string(tensorOutputs.size()) + ")";
@@ -192,7 +227,7 @@ NvDsInferStatus Yolo::buildYoloNetwork(
        else if (m_ConfigBlocks.at(i).at("type") == "upsample") {
            std::string inputVol = dimsToString(previous->getDimensions());
-            nvinfer1::ILayer* out = upsampleLayer(i - 1, m_ConfigBlocks[i], weights, m_TrtWeights, channels, previous, &network);
+            nvinfer1::ILayer* out = upsampleLayer(i - 1, m_ConfigBlocks[i], previous, &network);
            previous = out->getOutput(0);
            assert(previous != nullptr);
            std::string outputVol = dimsToString(previous->getDimensions());
@@ -212,7 +247,6 @@ NvDsInferStatus Yolo::buildYoloNetwork(
        else if (m_ConfigBlocks.at(i).at("type") == "yolo") {
            nvinfer1::Dims prevTensorDims = previous->getDimensions();
            //assert(prevTensorDims.d[1] == prevTensorDims.d[2]);
            TensorInfo& curYoloTensor = m_OutputTensors.at(outputTensorCount);
            curYoloTensor.gridSizeY = prevTensorDims.d[1];
            curYoloTensor.gridSizeX = prevTensorDims.d[2];
@@ -262,7 +296,6 @@ NvDsInferStatus Yolo::buildYoloNetwork(
        //YOLOv2 support
        else if (m_ConfigBlocks.at(i).at("type") == "region") {
            nvinfer1::Dims prevTensorDims = previous->getDimensions();
            //assert(prevTensorDims.d[1] == prevTensorDims.d[2]);
            TensorInfo& curRegionTensor = m_OutputTensors.at(outputTensorCount);
            curRegionTensor.gridSizeY = prevTensorDims.d[1];
            curRegionTensor.gridSizeX = prevTensorDims.d[2];
@@ -391,8 +424,14 @@ void Yolo::parseConfigBlocks()
            m_InputH = std::stoul(block.at("height"));
            m_InputW = std::stoul(block.at("width"));
            m_InputC = std::stoul(block.at("channels"));
            //assert(m_InputW == m_InputH);
            m_InputSize = m_InputC * m_InputH * m_InputW;
            if (block.find("letter_box") != block.end()) {
                m_LetterBox = std::stoul(block.at("letter_box"));
            }
            else {
                m_LetterBox = 0;
            }
        }
        else if ((block.at("type") == "region") || (block.at("type") == "yolo"))
        {
@@ -460,7 +499,6 @@ void Yolo::parseConfigBlocks()
 }
 void Yolo::destroyNetworkUtils() {
    // deallocate the weights
    for (uint i = 0; i < m_TrtWeights.size(); ++i) {
        if (m_TrtWeights[i].count > 0)
            free(const_cast<void*>(m_TrtWeights[i].values));
--- a/examples/multiple_inferences/pgie/nvdsinfer_custom_impl_Yolo/yolo.h
+++ b/examples/multiple_inferences/pgie/nvdsinfer_custom_impl_Yolo/yolo.h
@@ -40,6 +40,8 @@ struct NetworkInfo
    std::string networkType;
    std::string configFilePath;
    std::string wtsFilePath;
    std::string int8CalibPath;
    std::string networkMode;
    std::string deviceType;
    std::string inputBlobName;
 };
@@ -76,6 +78,8 @@ protected:
    const std::string m_NetworkType;
    const std::string m_ConfigFilePath;
    const std::string m_WtsFilePath;
    const std::string m_Int8CalibPath;
    const std::string m_NetworkMode;
    const std::string m_DeviceType;
    const std::string m_InputBlobName;
    std::vector<TensorInfo> m_OutputTensors;
@@ -85,6 +89,7 @@ protected:
    uint m_InputW;
    uint m_InputC;
    uint64_t m_InputSize;
    uint m_LetterBox;
    std::vector<nvinfer1::Weights> m_TrtWeights;
--- a/examples/multiple_inferences/pgie/nvdsinfer_custom_impl_Yolo/yoloForward.cu
+++ b/examples/multiple_inferences/pgie/nvdsinfer_custom_impl_Yolo/yoloForward.cu
--- a/examples/multiple_inferences/pgie/nvdsinfer_custom_impl_Yolo/yoloPlugins.cpp
+++ b/examples/multiple_inferences/pgie/nvdsinfer_custom_impl_Yolo/yoloPlugins.cpp
--- a/examples/multiple_inferences/pgie/nvdsinfer_custom_impl_Yolo/yoloPlugins.h
+++ b/examples/multiple_inferences/pgie/nvdsinfer_custom_impl_Yolo/yoloPlugins.h
@@ -56,7 +56,7 @@ class YoloLayer : public nvinfer1::IPluginV2
 {
 public:
    YoloLayer (const void* data, size_t length);
-    YoloLayer (const uint& numBoxes, const uint& numClasses, const uint& gridSize,
+    YoloLayer (const uint& numBoxes, const uint& numClasses, const uint& gridSizeX, const uint& gridSizeY,
                const uint model_type, const uint new_coords, const float scale_x_y, const float beta_nms,
                const std::vector<float> anchors, const std::vector<std::vector<int>> mask);
    const char* getPluginType () const override { return YOLOLAYER_PLUGIN_NAME; }
@@ -96,7 +96,8 @@ public:
 private:
    uint m_NumBoxes {0};
    uint m_NumClasses {0};
-    uint m_GridSize {0};
+    uint m_GridSizeX {0};
    uint m_GridSizeY {0};
    uint64_t m_OutputSize {0};
    std::string m_Namespace {""};
--- a/examples/multiple_inferences/sgie1/config_infer_secondary1.txt
+++ b/examples/multiple_inferences/sgie1/config_infer_secondary1.txt
@@ -4,10 +4,11 @@ net-scale-factor=0.0039215697906911373
 model-color-format=0
 custom-network-config=sgie1/yolo.cfg
 model-file=yolo.weights
-model-engine-file=model_b16_gpu0_fp16.engine
+model-engine-file=model_b16_gpu0_fp32.engine
 #int8-calib-file=calib.table
 labelfile-path=labels.txt
 batch-size=16
-network-mode=2
+network-mode=0
 num-detected-classes=10
 interval=0
 gie-unique-id=2
--- a/examples/multiple_inferences/sgie1/nvdsinfer_custom_impl_Yolo/Makefile
+++ b/examples/multiple_inferences/sgie1/nvdsinfer_custom_impl_Yolo/Makefile
@@ -27,13 +27,25 @@ CUDA_VER?=
 ifeq ($(CUDA_VER),)
  $(error "CUDA_VER is not set")
 endif
 OPENCV?=
 ifeq ($(OPENCV),)
  OPENCV=0
 endif
 CC:= g++
 NVCC:=/usr/local/cuda-$(CUDA_VER)/bin/nvcc
 CFLAGS:= -Wall -std=c++11 -shared -fPIC -Wno-error=deprecated-declarations
-CFLAGS+= -I../../../includes -I/usr/local/cuda-$(CUDA_VER)/include
+CFLAGS+= -I/opt/nvidia/deepstream/deepstream-5.1/sources/includes -I/usr/local/cuda-$(CUDA_VER)/include
-LIBS:= -lnvinfer_plugin -lnvinfer -lnvparsers -L/usr/local/cuda-$(CUDA_VER)/lib64 -lcudart -lcublas -lstdc++fs
+ifeq ($(OPENCV), 1)
 COMMON= -DOPENCV
 CFLAGS+= $(shell pkg-config --cflags opencv4 2> /dev/null || pkg-config --cflags opencv)
 LIBS+= $(shell pkg-config --libs opencv4 2> /dev/null || pkg-config --libs opencv)
 endif
 LIBS+= -lnvinfer_plugin -lnvinfer -lnvparsers -L/usr/local/cuda-$(CUDA_VER)/lib64 -lcudart -lcublas -lstdc++fs
 LFLAGS:= -shared -Wl,--start-group $(LIBS) -Wl,--end-group
 INCS:= $(wildcard *.h)
@@ -50,6 +62,11 @@ SRCFILES:= nvdsinfer_yolo_engine.cpp \
           utils.cpp \
           yolo.cpp \
           yoloForward.cu
 ifeq ($(OPENCV), 1)
 SRCFILES+= calibrator.cpp
 endif
 TARGET_LIB:= libnvdsinfer_custom_impl_Yolo.so
 TARGET_OBJS:= $(SRCFILES:.cpp=.o)
@@ -58,7 +75,7 @@ TARGET_OBJS:= $(TARGET_OBJS:.cu=.o)
 all: $(TARGET_LIB)
 %.o: %.cpp $(INCS) Makefile
-	$(CC) -c -o $@ $(CFLAGS) $<
+	$(CC) -c $(COMMON) -o $@ $(CFLAGS) $<
 %.o: %.cu $(INCS) Makefile
 	$(NVCC) -c -o $@ --compiler-options '-fPIC' $<
--- a/examples/multiple_inferences/sgie1/nvdsinfer_custom_impl_Yolo/calibrator.cpp
+++ b/examples/multiple_inferences/sgie1/nvdsinfer_custom_impl_Yolo/calibrator.cpp
@@ -0,0 +1,130 @@
 /*
 * Created by Marcos Luciano
 * https://www.github.com/marcoslucianops
 */
 #include "calibrator.h"
 #include <fstream>
 #include <iterator>
 namespace nvinfer1
 {
    // Builds the calibrator: stores the network input geometry, reads the list of
    // calibration image paths (one path per line) from imgPath, and allocates the
    // host and device staging buffers that hold one batch of float input data.
    //
    // batchsize      : number of images handed to TensorRT per getBatch() call.
    // channels/height/width : network input dimensions.
    // letterbox      : resize mode forwarded unchanged to prepareImage().
    // imgPath        : text file listing the calibration images.
    // calibTablePath : file used to read and write the calibration cache.
    int8EntroyCalibrator::int8EntroyCalibrator(const int &batchsize, const int &channels, const int &height, const int &width, const int &letterbox, const std::string &imgPath,
        const std::string &calibTablePath):batchSize(batchsize), inputC(channels), inputH(height), inputW(width), letterBox(letterbox), calibTablePath(calibTablePath), imageIndex(0),
        inputCount(static_cast<size_t>(batchsize) * channels * height * width), readCache(true)
    {
        // Open read-only: std::fstream defaults to in|out and fails on files without write permission.
        std::ifstream listFile(imgPath);
        if (!listFile.is_open())
        {
            std::cerr << "Failed to open INT8 calibration image list: " << imgPath << std::endl;
        }
        std::string line;
        while (std::getline(listFile, line))
        {
            // Tolerate CRLF line endings and blank lines so they do not become bogus image paths.
            if (!line.empty() && line.back() == '\r')
                line.pop_back();
            if (!line.empty())
                imgPaths.push_back(line);
        }
        batchData = new float[inputCount];
        CUDA_CHECK(cudaMalloc(&deviceInput, inputCount * sizeof(float)));
    }
    // Frees the device staging buffer and the host batch buffer allocated by the constructor.
    int8EntroyCalibrator::~int8EntroyCalibrator()
    {
        CUDA_CHECK(cudaFree(deviceInput));
        // delete[] on a null pointer is a no-op, so no explicit null test is required.
        delete[] batchData;
    }
    bool int8EntroyCalibrator::getBatch(void **bindings, const char **names, int nbBindings)
    {
        if (imageIndex + batchSize > uint(imgPaths.size()))
            return false;
        float* ptr = batchData;
        for (size_t j = imageIndex; j < imageIndex + batchSize; ++j)
        {
            cv::Mat img = cv::imread(imgPaths[j], cv::IMREAD_COLOR);
            std::vector<float>inputData = prepareImage(img, inputC, inputH, inputW, letterBox);
            int len = (int)(inputData.size());
            memcpy(ptr, inputData.data(), len * sizeof(float));
            ptr += inputData.size();
            std::cout << "Load image: " << imgPaths[j] << std::endl;
            std::cout << "Progress: " << (j + 1)*100. / imgPaths.size() << "%" << std::endl;
        }
        imageIndex += batchSize;
        CUDA_CHECK(cudaMemcpy(deviceInput, batchData, inputCount * sizeof(float), cudaMemcpyHostToDevice));
        bindings[0] = deviceInput;
        return true;
    }
    // Reloads the calibration table file into calibrationCache when readCache is set and
    // the file opened successfully. length receives the number of bytes held; the return
    // value is the cached data, or nullptr when the cache is empty.
    const void* int8EntroyCalibrator::readCalibrationCache(std::size_t &length)
    {
        calibrationCache.clear();

        std::ifstream cacheFile(calibTablePath, std::ios::binary);
        // Keep whitespace bytes: the iterator below must see every character of the file.
        cacheFile >> std::noskipws;

        const bool canRead = readCache && cacheFile.good();
        if (canRead)
        {
            std::istream_iterator<char> first(cacheFile);
            std::istream_iterator<char> last;
            calibrationCache.assign(first, last);
        }

        length = calibrationCache.size();
        if (length == 0)
            return nullptr;
        return calibrationCache.data();
    }
    // Writes length bytes starting at cache to the calibration table file in binary mode,
    // replacing any previous content of that file.
    void int8EntroyCalibrator::writeCalibrationCache(const void *cache, std::size_t length)
    {
        const char *bytes = reinterpret_cast<const char*>(cache);
        std::ofstream tableFile(calibTablePath, std::ios::binary);
        tableFile.write(bytes, length);
    }
 }
 /*
  * Converts an image into the planar float buffer fed to the INT8 calibrator.
  *
  * img        : source image, expected in BGR channel order; it is not modified.
  * input_c    : number of network input channels (1 selects grayscale, otherwise RGB).
  * input_h/w  : network input height and width in pixels.
  * letter_box : 2 keeps the aspect ratio and pads the bottom/right with black;
  *              any other value stretches the image to input_w x input_h.
  *              NOTE(review): other code in this project tests letter_box against 1 - confirm the intended value.
  *
  * Returns input_c * input_h * input_w floats scaled by 1/255, stored plane by plane.
  * Throws cv::Exception when img is empty.
  */
 std::vector<float> prepareImage(cv::Mat& img, int input_c, int input_h, int input_w, int letter_box)
 {
    // An empty image would divide by zero in the letter-box arithmetic below.
    CV_Assert(!img.empty());

    cv::Mat out;
    if (letter_box == 2)
    {
        int image_w = img.cols;
        int image_h = img.rows;
        int resize_w = 0;
        int resize_h = 0;
        int offset_bottom = 0;
        int offset_right = 0;
        if ((float)input_h / image_h > (float)input_w / image_w)
        {
            // Width is the limiting side: fill it and pad below.
            resize_w = input_w;
            resize_h = (input_w * image_h) / image_w;
            // Integer division can reach 0 for extreme aspect ratios; cv::resize needs a non-empty size.
            if (resize_h < 1)
                resize_h = 1;
            offset_bottom = input_h - resize_h;
        }
        else
        {
            // Height is the limiting side: fill it and pad on the right.
            resize_h = input_h;
            resize_w = (input_h * image_w) / image_h;
            if (resize_w < 1)
                resize_w = 1;
            offset_right = input_w - resize_w;
        }
        cv::resize(img, out, cv::Size(resize_w, resize_h), 0, 0, cv::INTER_CUBIC);
        cv::copyMakeBorder(out, out, 0, offset_bottom, 0, offset_right, cv::BORDER_CONSTANT, cv::Scalar(0, 0, 0));
    }
    else
    {
        cv::resize(img, out, cv::Size(input_w, input_h), 0, 0, cv::INTER_CUBIC);
    }

    // A 1-channel network input needs a true grayscale image; BGR2RGB would leave three
    // planes and only the first of them would be copied below.
    if (input_c == 1)
    {
        cv::cvtColor(out, out, cv::COLOR_BGR2GRAY);
    }
    else
    {
        cv::cvtColor(out, out, cv::COLOR_BGR2RGB);
    }

    // convertTo changes only the depth; the channel count stays that of the source.
    out.convertTo(out, CV_32F, 1.0 / 255.0);

    std::vector<cv::Mat> input_channels(input_c);
    cv::split(out, input_channels);

    std::vector<float> result(input_h * input_w * input_c);
    auto data = result.data();
    int channelLength = input_h * input_w;
    // cv::split resizes the vector to the image's channel count, so never index past it.
    const int planes = static_cast<int>(input_channels.size());
    for (int i = 0; i < input_c && i < planes; ++i)
    {
        memcpy(data, input_channels[i].data, channelLength * sizeof(float));
        data += channelLength;
    }
    return result;
 }
--- a/examples/multiple_inferences/sgie1/nvdsinfer_custom_impl_Yolo/calibrator.h
+++ b/examples/multiple_inferences/sgie1/nvdsinfer_custom_impl_Yolo/calibrator.h
@@ -0,0 +1,62 @@
 /*
 * Created by Marcos Luciano
 * https://www.github.com/marcoslucianops
 */
 #ifndef CALIBRATOR_H
 #define CALIBRATOR_H
 #include "opencv2/opencv.hpp"
 #include "cuda_runtime.h"
 #include "NvInfer.h"
 #include <cstdlib>
 #include <iostream>
 #include <string>
 #include <vector>
 #ifndef CUDA_CHECK
 // Evaluates a CUDA runtime call exactly once. On failure it prints the numeric
 // error code, the cudaGetErrorString() text and the call site, then aborts.
 // std::abort() is used instead of assert(0) so that the failure is not
 // silently ignored when NDEBUG is defined. The expansion stays a plain brace
 // block so existing call sites expand the same way as before.
 #define CUDA_CHECK(callstr)                                                                    \
    {                                                                                          \
        cudaError_t cuda_check_err_ = (callstr);                                               \
        if (cuda_check_err_ != cudaSuccess) {                                                  \
            std::cerr << "CUDA error " << cuda_check_err_ << " ("                              \
                      << cudaGetErrorString(cuda_check_err_) << ") at " << __FILE__ << ":"     \
                      << __LINE__ << std::endl;                                                \
            std::abort();                                                                      \
        }                                                                                      \
    }
 #endif
 namespace nvinfer1 {
    // INT8 calibrator derived from TensorRT's IInt8EntropyCalibrator2.
    // NOTE(review): the class name spells "Entroy"; it is part of the public
    // interface, so it is left unchanged.
    class int8EntroyCalibrator : public nvinfer1::IInt8EntropyCalibrator2 {
    public:
        // The constructor body is not visible from this header; the parameter
        // meanings are presumed from their names and the matching private
        // members below - TODO confirm against calibrator.cpp.
        int8EntroyCalibrator(const int &batchsize,
                             const int &channels,
                             const int &height,
                             const int &width,
                             const int &letterbox,
                             const std::string &imgPath,
                             const std::string &calibTablePath);
        virtual ~int8EntroyCalibrator();
        // Returns the stored batchSize member.
        int getBatchSize() const override { return batchSize; }
        // Overrides of the IInt8EntropyCalibrator2 callbacks; defined out of line.
        bool getBatch(void *bindings[], const char *names[], int nbBindings) override;
        const void *readCalibrationCache(std::size_t &length) override;
        void writeCalibrationCache(const void *ptr, std::size_t length) override;
    private:
        int batchSize;
        int inputC;
        int inputH;
        int inputW;
        int letterBox;
        std::string calibTablePath;
        size_t imageIndex;
        size_t inputCount;
        std::vector<std::string> imgPaths;
        // Both buffers start out null; presumably host staging data and its
        // device copy - verify ownership and release in the implementation.
        float *batchData{ nullptr };
        void  *deviceInput{ nullptr };
        bool readCache;
        std::vector<char> calibrationCache;
    };
 }
 // Builds the float input buffer for one image; defined in the calibrator
 // implementation file. letter_box presumably selects aspect-preserving
 // padding - confirm against the definition.
 std::vector<float> prepareImage(cv::Mat& img, int input_c, int input_h, int input_w, int letter_box);
 #endif //CALIBRATOR_H
--- a/examples/multiple_inferences/sgie1/nvdsinfer_custom_impl_Yolo/layers/activation_layer.cpp
+++ b/examples/multiple_inferences/sgie1/nvdsinfer_custom_impl_Yolo/layers/activation_layer.cpp
@@ -0,0 +1,82 @@
 /*
 * Created by Marcos Luciano
 * https://www.github.com/marcoslucianops
 */
 #include "activation_layer.h"
 // Adds one TensorRT activation of `type` on `input`, names it
 // "<prefix>_<layerIdx>" and returns it. The null check happens before the
 // layer is touched.
 static nvinfer1::IActivationLayer* addNamedActivation(
    nvinfer1::INetworkDefinition* network,
    nvinfer1::ITensor* input,
    nvinfer1::ActivationType type,
    const std::string& prefix,
    int layerIdx)
 {
    nvinfer1::IActivationLayer* layer = network->addActivation(*input, type);
    assert(layer != nullptr);
    const std::string layerName = prefix + "_" + std::to_string(layerIdx);
    layer->setName(layerName.c_str());
    return layer;
 }

 // Appends the activation named by `activation` after `input`.
 //
 // Parameters:
 //   layerIdx   - index used as the suffix of every layer name created here.
 //   activation - "relu", "sigmoid"/"logistic", "tanh", "leaky", "softplus" or
 //                "mish"; any other value adds nothing.
 //   output     - layer returned unchanged when no activation is added.
 //   input      - tensor the activation is applied to.
 //   network    - network definition that receives the new layers.
 //
 // Returns the last layer added, or `output` for an unrecognised activation.
 nvinfer1::ILayer* activationLayer(
    int layerIdx,
    std::string activation,
    nvinfer1::ILayer* output,
    nvinfer1::ITensor* input,
    nvinfer1::INetworkDefinition* network)
 {
    if (activation == "relu")
    {
        output = addNamedActivation(
            network, input, nvinfer1::ActivationType::kRELU, "relu", layerIdx);
    }
    else if (activation == "sigmoid" || activation == "logistic")
    {
        output = addNamedActivation(
            network, input, nvinfer1::ActivationType::kSIGMOID, "sigmoid", layerIdx);
    }
    else if (activation == "tanh")
    {
        output = addNamedActivation(
            network, input, nvinfer1::ActivationType::kTANH, "tanh", layerIdx);
    }
    else if (activation == "leaky")
    {
        nvinfer1::IActivationLayer* leaky = addNamedActivation(
            network, input, nvinfer1::ActivationType::kLEAKY_RELU, "leaky", layerIdx);
        // The slope is set only after the helper has verified the layer exists.
        leaky->setAlpha(0.1f);
        output = leaky;
    }
    else if (activation == "softplus")
    {
        output = addNamedActivation(
            network, input, nvinfer1::ActivationType::kSOFTPLUS, "softplus", layerIdx);
    }
    else if (activation == "mish")
    {
        // mish(x) = x * tanh(softplus(x)), built from three layers.
        nvinfer1::IActivationLayer* softplus = addNamedActivation(
            network, input, nvinfer1::ActivationType::kSOFTPLUS, "softplus", layerIdx);
        nvinfer1::IActivationLayer* tanh = addNamedActivation(
            network, softplus->getOutput(0), nvinfer1::ActivationType::kTANH, "tanh", layerIdx);
        nvinfer1::IElementWiseLayer* mish = network->addElementWise(
            *tanh->getOutput(0), *input,
            nvinfer1::ElementWiseOperation::kPROD);
        assert(mish != nullptr);
        std::string mishLayerName = "mish_" + std::to_string(layerIdx);
        mish->setName(mishLayerName.c_str());
        output = mish;
    }
    return output;
 }
--- a/examples/multiple_inferences/sgie1/nvdsinfer_custom_impl_Yolo/layers/activation_layer.h
+++ b/examples/multiple_inferences/sgie1/nvdsinfer_custom_impl_Yolo/layers/activation_layer.h
@@ -0,0 +1,23 @@
 /*
 * Created by Marcos Luciano
 * https://www.github.com/marcoslucianops
 */
 #ifndef __ACTIVATION_LAYER_H__
 #define __ACTIVATION_LAYER_H__
 #include <string>
 #include <cassert>
 #include "NvInfer.h"
 #include "activation_layer.h"
 // Adds the activation named by `activation` on `input` to `network`.
 // `layerIdx` is used in the generated layer names; `output` is the layer
 // handed back when the activation name is not one the implementation handles.
 nvinfer1::ILayer* activationLayer(
    int layerIdx,
    std::string activation,
    nvinfer1::ILayer* output,
    nvinfer1::ITensor* input,
    nvinfer1::INetworkDefinition* network);
 #endif
--- a/examples/multiple_inferences/sgie1/nvdsinfer_custom_impl_Yolo/layers/convolutional_layer.cpp
+++ b/examples/multiple_inferences/sgie1/nvdsinfer_custom_impl_Yolo/layers/convolutional_layer.cpp
@@ -0,0 +1,168 @@
 /*
 * Created by Marcos Luciano
 * https://www.github.com/marcoslucianops
 */
 #include <math.h>
 #include "convolutional_layer.h"
 // Copies `count` consecutive floats from `weights`, starting at `weightPtr`,
 // into a new heap array and advances `weightPtr` past them. The caller owns
 // the returned array.
 static float* readWeightArray(const std::vector<float>& weights, int& weightPtr, int count)
 {
    assert(count >= 0 && weightPtr >= 0);
    assert(static_cast<size_t>(weightPtr) + static_cast<size_t>(count) <= weights.size());
    float* values = new float[count];
    for (int i = 0; i < count; ++i)
    {
        values[i] = weights[weightPtr];
        weightPtr++;
    }
    return values;
 }

 // Same as readWeightArray, but returns the values in a std::vector.
 static std::vector<float> readWeightVector(const std::vector<float>& weights, int& weightPtr, int count)
 {
    assert(count >= 0 && weightPtr >= 0);
    assert(static_cast<size_t>(weightPtr) + static_cast<size_t>(count) <= weights.size());
    std::vector<float> values(weights.begin() + weightPtr, weights.begin() + weightPtr + count);
    weightPtr += count;
    return values;
 }

 // Builds a convolution (optionally followed by batch normalisation) plus its
 // activation from one "convolutional" config block.
 //
 // Parameters:
 //   layerIdx      - index used as the suffix of the generated layer names.
 //   block         - config block; must contain type, filters, pad, size,
 //                   stride and activation. batch_normalize and groups are
 //                   optional.
 //   weights       - flat weight buffer the layer parameters are read from.
 //   trtWeights    - receives every nvinfer1::Weights created here, including
 //                   the heap arrays they point to.
 //   weightPtr     - read cursor into `weights`; advanced past the consumed values.
 //   inputChannels - channel count of `input`.
 //   input         - tensor the convolution is applied to.
 //   network       - network definition that receives the new layers.
 //
 // Returns the last layer of the chain (activation, batch norm or convolution).
 nvinfer1::ILayer* convolutionalLayer(
    int layerIdx,
    std::map<std::string, std::string>& block,
    std::vector<float>& weights,
    std::vector<nvinfer1::Weights>& trtWeights,
    int& weightPtr,
    int& inputChannels,
    nvinfer1::ITensor* input,
    nvinfer1::INetworkDefinition* network)
 {
    assert(block.at("type") == "convolutional");
    assert(block.find("filters") != block.end());
    assert(block.find("pad") != block.end());
    assert(block.find("size") != block.end());
    assert(block.find("stride") != block.end());
    const int filters = std::stoi(block.at("filters"));
    const int padding = std::stoi(block.at("pad"));
    const int kernelSize = std::stoi(block.at("size"));
    const int stride = std::stoi(block.at("stride"));
    const std::string activation = block.at("activation");

    bool batchNormalize = false;
    if (block.find("batch_normalize") != block.end())
    {
        batchNormalize = (block.at("batch_normalize") == "1");
    }
    // The convolution carries its own bias unless batch norm really is enabled.
    // Deciding on the mere presence of the key would drop the loaded biases
    // for a block that says batch_normalize=0.
    const int biasCount = batchNormalize ? 0 : filters;

    const bool hasGroups = block.find("groups") != block.end();
    const int groups = hasGroups ? std::stoi(block.at("groups")) : 1;
    assert(groups > 0);

    const int pad = padding ? (kernelSize - 1) / 2 : 0;
    const int convWeightCount = filters * inputChannels * kernelSize * kernelSize / groups;

    std::vector<float> bnBiases;
    std::vector<float> bnWeights;
    std::vector<float> bnRunningMean;
    std::vector<float> bnRunningVar;
    nvinfer1::Weights convWt{nvinfer1::DataType::kFLOAT, nullptr, convWeightCount};
    nvinfer1::Weights convBias{nvinfer1::DataType::kFLOAT, nullptr, biasCount};
    if (batchNormalize == false)
    {
        // Read order without batch norm: biases, then kernel weights.
        convBias.values = readWeightArray(weights, weightPtr, filters);
        trtWeights.push_back(convBias);
        convWt.values = readWeightArray(weights, weightPtr, convWeightCount);
        trtWeights.push_back(convWt);
    }
    else
    {
        // Read order with batch norm: biases, scales, running mean, running
        // variance, then kernel weights.
        bnBiases = readWeightVector(weights, weightPtr, filters);
        bnWeights = readWeightVector(weights, weightPtr, filters);
        bnRunningMean = readWeightVector(weights, weightPtr, filters);
        bnRunningVar = readWeightVector(weights, weightPtr, filters);
        // Store the standard deviation (epsilon added for numerical stability).
        for (float& variance : bnRunningVar)
        {
            variance = sqrt(variance + 1.0e-5);
        }
        convWt.values = readWeightArray(weights, weightPtr, convWeightCount);
        trtWeights.push_back(convWt);
        trtWeights.push_back(convBias);
    }
    nvinfer1::IConvolutionLayer* conv = network->addConvolution(
        *input, filters, nvinfer1::DimsHW{kernelSize, kernelSize}, convWt, convBias);
    assert(conv != nullptr);
    std::string convLayerName = "conv_" + std::to_string(layerIdx);
    conv->setName(convLayerName.c_str());
    conv->setStride(nvinfer1::DimsHW{stride, stride});
    conv->setPadding(nvinfer1::DimsHW{pad, pad});
    if (hasGroups)
    {
        conv->setNbGroups(groups);
    }
    nvinfer1::ILayer* output = conv;
    if (batchNormalize == true)
    {
        // Fold batch norm into a per-channel scale layer:
        //   y = x * (gamma / std) + (beta - mean * gamma / std)
        nvinfer1::Weights shift{nvinfer1::DataType::kFLOAT, nullptr, filters};
        nvinfer1::Weights scale{nvinfer1::DataType::kFLOAT, nullptr, filters};
        nvinfer1::Weights power{nvinfer1::DataType::kFLOAT, nullptr, filters};
        float* shiftWt = new float[filters];
        float* scaleWt = new float[filters];
        float* powerWt = new float[filters];
        for (int i = 0; i < filters; ++i)
        {
            shiftWt[i]
                = bnBiases.at(i) - ((bnRunningMean.at(i) * bnWeights.at(i)) / bnRunningVar.at(i));
            scaleWt[i] = bnWeights.at(i) / bnRunningVar.at(i);
            powerWt[i] = 1.0f;
        }
        shift.values = shiftWt;
        scale.values = scaleWt;
        power.values = powerWt;
        trtWeights.push_back(shift);
        trtWeights.push_back(scale);
        trtWeights.push_back(power);
        nvinfer1::IScaleLayer* bn = network->addScale(
            *output->getOutput(0), nvinfer1::ScaleMode::kCHANNEL, shift, scale, power);
        assert(bn != nullptr);
        std::string bnLayerName = "batch_norm_" + std::to_string(layerIdx);
        bn->setName(bnLayerName.c_str());
        output = bn;
    }
    output = activationLayer(layerIdx, activation, output, output->getOutput(0), network);
    assert(output != nullptr);
    return output;
 }
--- a/examples/multiple_inferences/sgie1/nvdsinfer_custom_impl_Yolo/layers/convolutional_layer.h
+++ b/examples/multiple_inferences/sgie1/nvdsinfer_custom_impl_Yolo/layers/convolutional_layer.h
@@ -0,0 +1,26 @@
 /*
 * Created by Marcos Luciano
 * https://www.github.com/marcoslucianops
 */
 #ifndef __CONVOLUTIONAL_LAYER_H__
 #define __CONVOLUTIONAL_LAYER_H__
 #include <map>
 #include <vector>
 #include "NvInfer.h"
 #include "activation_layer.h"
 // Builds the layers for one "convolutional" config block on top of `input`.
 // `weights` is read starting at `weightPtr`, which is advanced as values are
 // consumed; every created nvinfer1::Weights is appended to `trtWeights`.
 nvinfer1::ILayer* convolutionalLayer(
    int layerIdx,
    std::map<std::string, std::string>& block,
    std::vector<float>& weights,
    std::vector<nvinfer1::Weights>& trtWeights,
    int& weightPtr,
    int& inputChannels,
    nvinfer1::ITensor* input,
    nvinfer1::INetworkDefinition* network);
 #endif
--- a/examples/multiple_inferences/sgie1/nvdsinfer_custom_impl_Yolo/layers/dropout_layer.cpp
+++ b/examples/multiple_inferences/sgie1/nvdsinfer_custom_impl_Yolo/layers/dropout_layer.cpp
@@ -0,0 +1,15 @@
 /*
 * Created by Marcos Luciano
 * https://www.github.com/marcoslucianops
 */
 #include "dropout_layer.h"
 // Placeholder for the "dropout" block: no layer is added to the network.
 //
 // Parameters (all currently unused):
 //   probability - dropout probability from the config block.
 //   input       - tensor the layer would be applied to.
 //   network     - network definition that would receive the layer.
 //
 // Returns nullptr. Returning an initialised null pointer (rather than an
 // indeterminate one) lets callers test the result deterministically.
 nvinfer1::ILayer* dropoutLayer(
    float probability,
    nvinfer1::ITensor* input,
    nvinfer1::INetworkDefinition* network)
 {
    (void)probability;
    (void)input;
    (void)network;
    return nullptr;
 }
--- a/examples/multiple_inferences/sgie1/nvdsinfer_custom_impl_Yolo/layers/dropout_layer.h
+++ b/examples/multiple_inferences/sgie1/nvdsinfer_custom_impl_Yolo/layers/dropout_layer.h
@@ -0,0 +1,16 @@
 /*
 * Created by Marcos Luciano
 * https://www.github.com/marcoslucianops
 */
 #ifndef __DROPOUT_LAYER_H__
 #define __DROPOUT_LAYER_H__
 #include "NvInfer.h"
 // Entry point for the "dropout" config block; see the definition for what
 // (if anything) is added to `network`.
 nvinfer1::ILayer* dropoutLayer(
    float probability,
    nvinfer1::ITensor* input,
    nvinfer1::INetworkDefinition* network);
 #endif
--- a/examples/multiple_inferences/sgie1/nvdsinfer_custom_impl_Yolo/layers/maxpool_layer.cpp
+++ b/examples/multiple_inferences/sgie1/nvdsinfer_custom_impl_Yolo/layers/maxpool_layer.cpp
@@ -0,0 +1,30 @@
 /*
 * Created by Marcos Luciano
 * https://www.github.com/marcoslucianops
 */
 #include "maxpool_layer.h"
 // Creates a max-pooling layer from a "maxpool" config block.
 //
 // The block must provide "size" (square window edge) and "stride". The layer
 // is named "maxpool_<layerIdx>" and uses kSAME_UPPER padding.
 // Returns the new pooling layer.
 nvinfer1::ILayer* maxpoolLayer(
    int layerIdx,
    std::map<std::string, std::string>& block,
    nvinfer1::ITensor* input,
    nvinfer1::INetworkDefinition* network)
 {
    assert(block.at("type") == "maxpool");
    assert(block.count("size") != 0);
    assert(block.count("stride") != 0);

    const int windowEdge = std::stoi(block.at("size"));
    const int step = std::stoi(block.at("stride"));
    const nvinfer1::DimsHW window{windowEdge, windowEdge};

    nvinfer1::IPoolingLayer* layer
        = network->addPooling(*input, nvinfer1::PoolingType::kMAX, window);
    assert(layer);

    const std::string layerName = "maxpool_" + std::to_string(layerIdx);
    layer->setName(layerName.c_str());
    layer->setStride(nvinfer1::DimsHW{step, step});
    layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER);
    return layer;
 }
--- a/examples/multiple_inferences/sgie1/nvdsinfer_custom_impl_Yolo/layers/maxpool_layer.h
+++ b/examples/multiple_inferences/sgie1/nvdsinfer_custom_impl_Yolo/layers/maxpool_layer.h
@@ -0,0 +1,20 @@
 /*
 * Created by Marcos Luciano
 * https://www.github.com/marcoslucianops
 */
 #ifndef __MAXPOOL_LAYER_H__
 #define __MAXPOOL_LAYER_H__
 #include <map>
 #include <cassert>
 #include "NvInfer.h"
 // Builds a max-pooling layer on `input` from a "maxpool" config block;
 // `layerIdx` is used in the generated layer name.
 nvinfer1::ILayer* maxpoolLayer(
    int layerIdx,
    std::map<std::string, std::string>& block,
    nvinfer1::ITensor* input,
    nvinfer1::INetworkDefinition* network);
 #endif
--- a/examples/multiple_inferences/sgie1/nvdsinfer_custom_impl_Yolo/layers/route_layer.cpp
+++ b/examples/multiple_inferences/sgie1/nvdsinfer_custom_impl_Yolo/layers/route_layer.cpp
@@ -0,0 +1,63 @@
 /*
 * Created by Marcos Luciano
 * https://www.github.com/marcoslucianops
 */
 #include "route_layer.h"
 // Builds a "route" block: concatenates earlier layer outputs along axis 0 and,
 // when "groups" is present, keeps only one channel group of the result.
 //
 // Parameters:
 //   layerIdx      - index of this block; the concat layer is named
 //                   "route_<layerIdx - 1>".
 //   block         - config block; "layers" is a comma-separated list of
 //                   indices into tensorOutputs (negative values count back
 //                   from the end). Optional "groups" needs "group_id" too.
 //   tensorOutputs - outputs of the layers built so far.
 //   network       - network definition that receives the new layers.
 //
 // Returns the concatenation layer, or the slice layer when groups are used.
 nvinfer1::ILayer* routeLayer(
    int layerIdx,
    std::map<std::string, std::string>& block,
    std::vector<nvinfer1::ITensor*> tensorOutputs,
    nvinfer1::INetworkDefinition* network)
 {
    const std::string strLayers = block.at("layers");
    std::vector<int> idxLayers;
    size_t lastPos = 0, pos = 0;
    while ((pos = strLayers.find(',', lastPos)) != std::string::npos) {
        // Skip empty tokens (e.g. "1,,2") the same way a trailing one is skipped.
        std::string token = trim(strLayers.substr(lastPos, pos - lastPos));
        if (!token.empty()) {
            idxLayers.push_back (std::stoi(token));
        }
        lastPos = pos + 1;
    }
    if (lastPos < strLayers.length()) {
        std::string lastV = trim(strLayers.substr(lastPos));
        if (!lastV.empty()) {
            idxLayers.push_back (std::stoi(lastV));
        }
    }
    assert (!idxLayers.empty());

    const int outputCount = static_cast<int>(tensorOutputs.size());
    std::vector<nvinfer1::ITensor*> concatInputs;
    for (int idxLayer : idxLayers) {
        if (idxLayer < 0) {
            // Relative index: -1 is the most recent output.
            idxLayer += outputCount;
        }
        assert (idxLayer >= 0 && idxLayer < outputCount);
        concatInputs.push_back (tensorOutputs[idxLayer]);
    }

    nvinfer1::IConcatenationLayer* concat =
        network->addConcatenation(concatInputs.data(), concatInputs.size());
    assert(concat != nullptr);
    std::string concatLayerName = "route_" + std::to_string(layerIdx - 1);
    concat->setName(concatLayerName.c_str());
    concat->setAxis(0);
    nvinfer1::ILayer* output = concat;

    if (block.find("groups") != block.end()) {
        assert(block.find("group_id") != block.end());
        nvinfer1::Dims prevTensorDims = output->getOutput(0)->getDimensions();
        const int groups = std::stoi(block.at("groups"));
        const int group_id = std::stoi(block.at("group_id"));
        // Guard the division below and keep the slice inside the tensor.
        assert(groups > 0);
        assert(group_id >= 0 && group_id < groups);
        const int channelSlice = prevTensorDims.d[0] / groups;
        const int startSlice = channelSlice * group_id;
        nvinfer1::ISliceLayer* sl = network->addSlice(
            *output->getOutput(0),
            nvinfer1::Dims3{startSlice, 0, 0},
            nvinfer1::Dims3{channelSlice, prevTensorDims.d[1], prevTensorDims.d[2]},
            nvinfer1::Dims3{1, 1, 1});
        assert(sl != nullptr);
        output = sl;
    }
    return output;
 }
--- a/examples/multiple_inferences/sgie1/nvdsinfer_custom_impl_Yolo/layers/route_layer.h
+++ b/examples/multiple_inferences/sgie1/nvdsinfer_custom_impl_Yolo/layers/route_layer.h
@@ -0,0 +1,18 @@
 /*
 * Created by Marcos Luciano
 * https://www.github.com/marcoslucianops
 */
 #ifndef __ROUTE_LAYER_H__
 #define __ROUTE_LAYER_H__
 #include "NvInfer.h"
 #include "../utils.h"
 // Builds a "route" block from earlier layer outputs; `tensorOutputs` is
 // taken by value and indexed by the entries of the block's "layers" list.
 nvinfer1::ILayer* routeLayer(
    int layerIdx,
    std::map<std::string, std::string>& block,
    std::vector<nvinfer1::ITensor*> tensorOutputs,
    nvinfer1::INetworkDefinition* network);
 #endif
--- a/examples/multiple_inferences/sgie1/nvdsinfer_custom_impl_Yolo/layers/shortcut_layer.cpp
+++ b/examples/multiple_inferences/sgie1/nvdsinfer_custom_impl_Yolo/layers/shortcut_layer.cpp
@@ -0,0 +1,45 @@
 /*
 * Created by Marcos Luciano
 * https://www.github.com/marcoslucianops
 */
 #include "shortcut_layer.h"
 // Builds a residual "shortcut": element-wise sum of `input` and
 // `shortcutTensor`, followed by the requested activation.
 //
 // When the two volume descriptions differ, the shortcut tensor is first
 // sliced from the origin to the dimensions of `input`.
 // Returns the layer produced by activationLayer().
 nvinfer1::ILayer* shortcutLayer(
    int layerIdx,
    std::string activation,
    std::string inputVol,
    std::string shortcutVol,
    nvinfer1::ITensor* input,
    nvinfer1::ITensor* shortcutTensor,
    nvinfer1::INetworkDefinition* network)
 {
    nvinfer1::ITensor* addend = shortcutTensor;
    if (inputVol != shortcutVol)
    {
        nvinfer1::ISliceLayer* cropped = network->addSlice(
            *shortcutTensor,
            nvinfer1::Dims3{0, 0, 0},
            input->getDimensions(),
            nvinfer1::Dims3{1, 1, 1});
        assert(cropped != nullptr);
        addend = cropped->getOutput(0);
    }
    assert(addend != nullptr);

    nvinfer1::IElementWiseLayer* sum = network->addElementWise(
        *input, *addend,
        nvinfer1::ElementWiseOperation::kSUM);
    assert(sum != nullptr);

    nvinfer1::ILayer* activated
        = activationLayer(layerIdx, activation, sum, sum->getOutput(0), network);
    assert(activated != nullptr);
    return activated;
 }
--- a/examples/multiple_inferences/sgie1/nvdsinfer_custom_impl_Yolo/layers/shortcut_layer.h
+++ b/examples/multiple_inferences/sgie1/nvdsinfer_custom_impl_Yolo/layers/shortcut_layer.h
@@ -0,0 +1,22 @@
 /*
 * Created by Marcos Luciano
 * https://www.github.com/marcoslucianops
 */
 #ifndef __SHORTCUT_LAYER_H__
 #define __SHORTCUT_LAYER_H__
 #include "NvInfer.h"
 #include "activation_layer.h"
 // Builds a "shortcut" block that combines `input` with `shortcutTensor` and
 // applies `activation`. `inputVol` / `shortcutVol` are compared as strings to
 // decide whether the shortcut tensor must be adapted first.
 nvinfer1::ILayer* shortcutLayer(
    int layerIdx,
    std::string activation,
    std::string inputVol,
    std::string shortcutVol,
    nvinfer1::ITensor* input,
    nvinfer1::ITensor* shortcutTensor,
    nvinfer1::INetworkDefinition* network);
 #endif
--- a/examples/multiple_inferences/sgie1/nvdsinfer_custom_impl_Yolo/layers/upsample_layer.cpp
+++ b/examples/multiple_inferences/sgie1/nvdsinfer_custom_impl_Yolo/layers/upsample_layer.cpp
@@ -0,0 +1,24 @@
 /*
 * Created by Marcos Luciano
 * https://www.github.com/marcoslucianops
 */
 #include "upsample_layer.h"
 // Builds a nearest-neighbour upsample layer from an "upsample" config block.
 //
 // Parameters:
 //   layerIdx - index used in the layer name "upsample_<layerIdx>".
 //   block    - config block; must contain "stride", the integer scale factor
 //              applied to the last two dimensions.
 //   input    - tensor to upsample.
 //   network  - network definition that receives the resize layer.
 //
 // Returns the new resize layer.
 nvinfer1::ILayer* upsampleLayer(
    int layerIdx,
    std::map<std::string, std::string>& block,
    nvinfer1::ITensor* input,
    nvinfer1::INetworkDefinition* network)
 {
    assert(block.at("type") == "upsample");
    assert(block.find("stride") != block.end());
    const int stride = std::stoi(block.at("stride"));
    nvinfer1::IResizeLayer* resize_layer = network->addResize(*input);
    assert(resize_layer != nullptr);
    resize_layer->setResizeMode(nvinfer1::ResizeMode::kNEAREST);
    // Explicit cast: an int inside a braced float initializer is a narrowing
    // conversion.
    const float factor = static_cast<float>(stride);
    float scale[3] = {1.0f, factor, factor};
    resize_layer->setScales(scale, 3);
    std::string layer_name = "upsample_" + std::to_string(layerIdx);
    resize_layer->setName(layer_name.c_str());
    return resize_layer;
 }
--- a/examples/multiple_inferences/sgie1/nvdsinfer_custom_impl_Yolo/layers/upsample_layer.h
+++ b/examples/multiple_inferences/sgie1/nvdsinfer_custom_impl_Yolo/layers/upsample_layer.h
@@ -0,0 +1,21 @@
 /*
 * Created by Marcos Luciano
 * https://www.github.com/marcoslucianops
 */
 #ifndef __UPSAMPLE_LAYER_H__
 #define __UPSAMPLE_LAYER_H__
 #include <map>
 #include <vector>
 #include <cassert>
 #include "NvInfer.h"
 // Builds an upsample layer on `input` from an "upsample" config block;
 // `layerIdx` is used in the generated layer name.
 nvinfer1::ILayer* upsampleLayer(
    int layerIdx,
    std::map<std::string, std::string>& block,
    nvinfer1::ITensor* input,
    nvinfer1::INetworkDefinition* network);
 #endif
--- a/examples/multiple_inferences/sgie1/nvdsinfer_custom_impl_Yolo/nvdsinfer_yolo_engine.cpp
+++ b/examples/multiple_inferences/sgie1/nvdsinfer_custom_impl_Yolo/nvdsinfer_yolo_engine.cpp
@@ -0,0 +1,118 @@
 /*
 * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 * Edited by Marcos Luciano
 * https://www.github.com/marcoslucianops
 */
 #include "nvdsinfer_custom_impl.h"
 #include "nvdsinfer_context.h"
 #include "yoloPlugins.h"
 #include "yolo.h"
 #include <algorithm>
 #define USE_CUDA_ENGINE_GET_API 1
 // Fills `networkInfo` from the DeepStream init parameters.
 //
 // The network type is the lower-cased config path up to the first ".cfg".
 // networkMode is set to "FP32", "INT8" or "FP16" for mode values 0, 1 and 2
 // and left untouched for anything else.
 // Returns false (after logging to stderr) when the config or weights path is
 // empty or fileExists() rejects it; true otherwise.
 static bool getYoloNetworkInfo (NetworkInfo &networkInfo, const NvDsInferContextInitParams* initParams)
 {
    std::string loweredCfg = initParams->customNetworkConfigFilePath;
    for (char& ch : loweredCfg) {
        ch = std::tolower (static_cast<uint8_t>(ch));
    }

    networkInfo.networkType     = loweredCfg.substr(0, loweredCfg.find(".cfg"));
    networkInfo.configFilePath  = initParams->customNetworkConfigFilePath;
    networkInfo.wtsFilePath     = initParams->modelFilePath;
    networkInfo.int8CalibPath   = initParams->int8CalibrationFilePath;
    networkInfo.deviceType      = (initParams->useDLA ? "kDLA" : "kGPU");
    networkInfo.inputBlobName   = "data";

    switch (static_cast<int>(initParams->networkMode)) {
        case 0:
            networkInfo.networkMode = "FP32";
            break;
        case 1:
            networkInfo.networkMode = "INT8";
            break;
        case 2:
            networkInfo.networkMode = "FP16";
            break;
        default:
            break;
    }

    const bool pathsGiven = !networkInfo.configFilePath.empty()
                            && !networkInfo.wtsFilePath.empty();
    if (!pathsGiven) {
        std::cerr << "YOLO config file or weights file is not specified"
                  << std::endl;
        return false;
    }

    // Short-circuits exactly like the negated form: the weights file is only
    // probed when the config file was found.
    const bool filesPresent = fileExists(networkInfo.configFilePath)
                              && fileExists(networkInfo.wtsFilePath);
    if (!filesPresent) {
        std::cerr << "YOLO config file or weights file is not exist"
                  << std::endl;
        return false;
    }
    return true;
 }
 #if !USE_CUDA_ENGINE_GET_API
 // Creates a Yolo model parser from the init parameters, or returns nullptr
 // when getYoloNetworkInfo() rejects them.
 IModelParser* NvDsInferCreateModelParser(
    const NvDsInferContextInitParams* initParams) {
    NetworkInfo info;
    const bool valid = getYoloNetworkInfo(info, initParams);
    return valid ? new Yolo(info) : nullptr;
 }
 #else
 // C-linkage entry point that builds the YOLO engine; on success the engine is
 // handed back through `cudaEngine`. Declared ahead of its definition below.
 extern "C"
 bool NvDsInferYoloCudaEngineGet(nvinfer1::IBuilder * const builder,
        const NvDsInferContextInitParams * const initParams,
        nvinfer1::DataType dataType,
        nvinfer1::ICudaEngine *& cudaEngine);
 // Builds the YOLO engine described by `initParams` with `builder`.
 //
 // `cudaEngine` receives the result of Yolo::createEngine(). `dataType` is
 // not used here. Returns false when the network info is invalid or no engine
 // could be built (the latter is logged to stderr), true otherwise.
 extern "C"
 bool NvDsInferYoloCudaEngineGet(nvinfer1::IBuilder * const builder,
        const NvDsInferContextInitParams * const initParams,
        nvinfer1::DataType dataType,
        nvinfer1::ICudaEngine *& cudaEngine)
 {
    NetworkInfo info;
    const bool valid = getYoloNetworkInfo(info, initParams);
    if (!valid) {
      return false;
    }

    Yolo yolo(info);
    cudaEngine = yolo.createEngine (builder);
    if (cudaEngine != nullptr)
    {
        return true;
    }

    std::cerr << "Failed to build CUDA engine on "
              << info.configFilePath << std::endl;
    return false;
 }
 #endif
--- a/examples/multiple_inferences/sgie1/nvdsinfer_custom_impl_Yolo/nvdsparsebbox_Yolo.cpp
+++ b/examples/multiple_inferences/sgie1/nvdsinfer_custom_impl_Yolo/nvdsparsebbox_Yolo.cpp
@@ -0,0 +1,378 @@
 /*
 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 * Edited by Marcos Luciano
 * https://www.github.com/marcoslucianops
 */
 #include <algorithm>
 #include <cmath>
 #include <sstream>
 #include "nvdsinfer_custom_impl.h"
 #include "utils.h"
 #include "yoloPlugins.h"
 // C-linkage bounding-box parser entry point. It takes the output layers, the
 // network info and the detection parameters, and fills `objectList`.
 // The definition is outside this part of the file.
 extern "C" bool NvDsInferParseYolo(
    std::vector<NvDsInferLayerInfo> const& outputLayersInfo,
    NvDsInferNetworkInfo const& networkInfo,
    NvDsInferParseDetectionParams const& detectionParams,
    std::vector<NvDsInferParseObjectInfo>& objectList);
 static std::vector<NvDsInferParseObjectInfo>
 nonMaximumSuppression(const float nmsThresh, std::vector<NvDsInferParseObjectInfo> binfo)
 {
    auto overlap1D = [](float x1min, float x1max, float x2min, float x2max) -> float {
        if (x1min > x2min)
        {
            std::swap(x1min, x2min);
            std::swap(x1max, x2max);
        }
        return x1max < x2min ? 0 : std::min(x1max, x2max) - x2min;
    };
    auto computeIoU
        = [&overlap1D](NvDsInferParseObjectInfo& bbox1, NvDsInferParseObjectInfo& bbox2) -> float {
        float overlapX
            = overlap1D(bbox1.left, bbox1.left + bbox1.width, bbox2.left, bbox2.left + bbox2.width);
        float overlapY
            = overlap1D(bbox1.top, bbox1.top + bbox1.height, bbox2.top, bbox2.top + bbox2.height);
        float area1 = (bbox1.width) * (bbox1.height);
        float area2 = (bbox2.width) * (bbox2.height);
        float overlap2D = overlapX * overlapY;
        float u = area1 + area2 - overlap2D;
        return u == 0 ? 0 : overlap2D / u;
    };
    std::stable_sort(binfo.begin(), binfo.end(),
                     [](const NvDsInferParseObjectInfo& b1, const NvDsInferParseObjectInfo& b2) {
                         return b1.detectionConfidence > b2.detectionConfidence;
                     });
    std::vector<NvDsInferParseObjectInfo> out;
    for (auto i : binfo)
    {
        bool keep = true;
        for (auto j : out)
        {
            if (keep)
            {
                float overlap = computeIoU(i, j);
                keep = overlap <= nmsThresh;
            }
            else
                break;
        }
        if (keep) out.push_back(i);
    }
    return out;
 }
 static std::vector<NvDsInferParseObjectInfo>
 nmsAllClasses(const float nmsThresh,
        std::vector<NvDsInferParseObjectInfo>& binfo,
        const uint numClasses)
 {
    std::vector<NvDsInferParseObjectInfo> result;
    std::vector<std::vector<NvDsInferParseObjectInfo>> splitBoxes(numClasses);
    for (auto& box : binfo)
    {
        splitBoxes.at(box.classId).push_back(box);
    }
    for (auto& boxes : splitBoxes)
    {
        boxes = nonMaximumSuppression(nmsThresh, boxes);
        result.insert(result.end(), boxes.begin(), boxes.end());
    }
    return result;
 }
 // Converts a centre/size box into a left/top/width/height record.
 //
 // bx, by are multiplied by `stride` to get the centre; bw, bh are used as
 // given. The corners are clamped to [0, netW] x [0, netH] and the resulting
 // width/height are clamped to the same limits. Only left, top, width and
 // height of the returned record are assigned.
 static NvDsInferParseObjectInfo convertBBox(const float& bx, const float& by, const float& bw,
                                     const float& bh, const int& stride, const uint& netW,
                                     const uint& netH)
 {
    const float centreX = bx * stride;
    const float centreY = by * stride;
    const float rawLeft = centreX - bw / 2;
    const float rawTop = centreY - bh / 2;
    const float rawRight = rawLeft + bw;
    const float rawBottom = rawTop + bh;

    const float left = clamp(rawLeft, 0, netW);
    const float top = clamp(rawTop, 0, netH);
    const float right = clamp(rawRight, 0, netW);
    const float bottom = clamp(rawBottom, 0, netH);

    NvDsInferParseObjectInfo box;
    box.left = left;
    box.width = clamp(right - left, 0, netW);
    box.top = top;
    box.height = clamp(bottom - top, 0, netH);
    return box;
 }
 static void addBBoxProposal(const float bx, const float by, const float bw, const float bh,
                     const uint stride, const uint& netW, const uint& netH, const int maxIndex,
                     const float maxProb, std::vector<NvDsInferParseObjectInfo>& binfo)
 {
    NvDsInferParseObjectInfo bbi = convertBBox(bx, by, bw, bh, stride, netW, netH);
    if (bbi.width < 1 || bbi.height < 1) return;
    bbi.detectionConfidence = maxProb;
    bbi.classId = maxIndex;
    binfo.push_back(bbi);
 }
 // Decodes one YOLO output tensor into box proposals.
 //
 // `detections` is indexed as cell + numGridCells * (box * (5 + classes) + field),
 // where fields 0-3 are x, y, w, h, field 4 is objectness and fields 5.. are the
 // class scores. The anchor for box b is anchors[mask[b] * 2 (+1)]. A proposal is
 // emitted when objectness * best class score exceeds confThresh.
 static std::vector<NvDsInferParseObjectInfo>
 decodeYoloTensor(
    const float* detections, const std::vector<int> &mask, const std::vector<float> &anchors,
    const uint gridSizeW, const uint gridSizeH, const uint stride, const uint numBBoxes,
    const uint numOutputClasses, const uint& netW,
    const uint& netH,
    const float confThresh)
 {
    std::vector<NvDsInferParseObjectInfo> proposals;
    // Loop-invariant layout constants.
    const int numGridCells = gridSizeH * gridSizeW;
    const uint fieldsPerBox = 5 + numOutputClasses;

    for (uint y = 0; y < gridSizeH; ++y) {
        for (uint x = 0; x < gridSizeW; ++x) {
            const int bbindex = y * gridSizeW + x;
            for (uint b = 0; b < numBBoxes; ++b)
            {
                const uint boxBase = b * fieldsPerBox;
                // Reads field `k` of the current box at the current cell.
                auto field = [&](uint k) -> float {
                    return detections[bbindex + numGridCells * (boxBase + k)];
                };

                const float pw = anchors[mask[b] * 2];
                const float ph = anchors[mask[b] * 2 + 1];
                const float bx = x + field(0);
                const float by = y + field(1);
                const float bw = pw * field(2);
                const float bh = ph * field(3);
                const float objectness = field(4);

                float bestScore = 0.0f;
                int bestClass = -1;
                for (uint i = 0; i < numOutputClasses; ++i)
                {
                    const float score = field(5 + i);
                    if (score > bestScore)
                    {
                        bestScore = score;
                        bestClass = i;
                    }
                }

                const float confidence = objectness * bestScore;
                if (confidence > confThresh)
                {
                    addBBoxProposal(bx, by, bw, bh, stride, netW, netH, bestClass, confidence, proposals);
                }
            }
        }
    }
    return proposals;
 }
 static std::vector<NvDsInferParseObjectInfo>
 decodeYoloV2Tensor(
    const float* detections, const std::vector<float> &anchors,
    const uint gridSizeW, const uint gridSizeH, const uint stride, const uint numBBoxes,
    const uint numOutputClasses, const uint& netW,
    const uint& netH)
 {
    std::vector<NvDsInferParseObjectInfo> binfo;
    for (uint y = 0; y < gridSizeH; ++y) {
        for (uint x = 0; x < gridSizeW; ++x) {
            for (uint b = 0; b < numBBoxes; ++b)
            {
                const float pw = anchors[b * 2];
                const float ph = anchors[b * 2 + 1];
                const int numGridCells = gridSizeH * gridSizeW;
                const int bbindex = y * gridSizeW + x;
                const float bx
                    = x + detections[bbindex + numGridCells * (b * (5 + numOutputClasses) + 0)];
                const float by
                    = y + detections[bbindex + numGridCells * (b * (5 + numOutputClasses) + 1)];
                const float bw
                    = pw * detections[bbindex + numGridCells * (b * (5 + numOutputClasses) + 2)];
                const float bh
                    = ph * detections[bbindex + numGridCells * (b * (5 + numOutputClasses) + 3)];
                const float objectness
                    = detections[bbindex + numGridCells * (b * (5 + numOutputClasses) + 4)];
                float maxProb = 0.0f;
                int maxIndex = -1;
                for (uint i = 0; i < numOutputClasses; ++i)
                {
                    float prob
                        = (detections[bbindex
                                      + numGridCells * (b * (5 + numOutputClasses) + (5 + i))]);
                    if (prob > maxProb)
                    {
                        maxProb = prob;
                        maxIndex = i;
                    }
                }
                maxProb = objectness * maxProb;
                addBBoxProposal(bx, by, bw, bh, stride, netW, netH, maxIndex, maxProb, binfo);
            }
        }
    }
    return binfo;
 }
 static inline std::vector<const NvDsInferLayerInfo*>
 SortLayers(const std::vector<NvDsInferLayerInfo> & outputLayersInfo)
 {
    std::vector<const NvDsInferLayerInfo*> outLayers;
    for (auto const &layer : outputLayersInfo) {
        outLayers.push_back (&layer);
    }
    std::sort(outLayers.begin(), outLayers.end(),
        [](const NvDsInferLayerInfo* a, const NvDsInferLayerInfo* b) {
            return a->inferDims.d[1] < b->inferDims.d[1];
        });
    return outLayers;
 }
 static bool NvDsInferParseYolo(
    std::vector<NvDsInferLayerInfo> const& outputLayersInfo,
    NvDsInferNetworkInfo const& networkInfo,
    NvDsInferParseDetectionParams const& detectionParams,
    std::vector<NvDsInferParseObjectInfo>& objectList,
    const std::vector<float> &anchors,
    const std::vector<std::vector<int>> &masks,
    const uint &num_classes,
    const float &beta_nms)
 {
    const float kCONF_THRESH = detectionParams.perClassThreshold[0];
    const std::vector<const NvDsInferLayerInfo*> sortedLayers =
        SortLayers (outputLayersInfo);
    if (sortedLayers.size() != masks.size()) {
        std::cerr << "ERROR: YOLO output layer.size: " << sortedLayers.size()
                  << " does not match mask.size: " << masks.size() << std::endl;
        return false;
    }
    if (num_classes != detectionParams.numClassesConfigured)
    {
        std::cerr << "WARNING: Num classes mismatch. Configured: "
                  << detectionParams.numClassesConfigured
                  << ", detected by network: " << num_classes << std::endl;
    }
    std::vector<NvDsInferParseObjectInfo> objects;
    for (uint idx = 0; idx < masks.size(); ++idx) {
        const NvDsInferLayerInfo &layer = *sortedLayers[idx]; // 255 x Grid x Grid
        assert(layer.inferDims.numDims == 3);
        const uint gridSizeH = layer.inferDims.d[1];
        const uint gridSizeW = layer.inferDims.d[2];
        const uint stride = DIVUP(networkInfo.width, gridSizeW);
        std::vector<NvDsInferParseObjectInfo> outObjs =
            decodeYoloTensor((const float*)(layer.buffer), masks[idx], anchors, gridSizeW, gridSizeH, stride, masks[idx].size(),
                       num_classes, networkInfo.width, networkInfo.height, kCONF_THRESH);
        objects.insert(objects.end(), outObjs.begin(), outObjs.end());
    }
    objectList.clear();
    objectList = nmsAllClasses(beta_nms, objects, num_classes);
    return true;
 }
 static bool NvDsInferParseYoloV2(
    std::vector<NvDsInferLayerInfo> const& outputLayersInfo,
    NvDsInferNetworkInfo const& networkInfo,
    NvDsInferParseDetectionParams const& detectionParams,
    std::vector<NvDsInferParseObjectInfo>& objectList,
    std::vector<float> &anchors,
    const uint &num_classes)
 {
    if (outputLayersInfo.empty()) {
        std::cerr << "Could not find output layer in bbox parsing" << std::endl;;
        return false;
    }
    const uint kNUM_BBOXES = anchors.size() / 2;
    const NvDsInferLayerInfo &layer = outputLayersInfo[0];
    if (num_classes != detectionParams.numClassesConfigured)
    {
        std::cerr << "WARNING: Num classes mismatch. Configured: "
                  << detectionParams.numClassesConfigured
                  << ", detected by network: " << num_classes << std::endl;
    }
    assert(layer.inferDims.numDims == 3);
    const uint gridSizeH = layer.inferDims.d[1];
    const uint gridSizeW = layer.inferDims.d[2];
    const uint stride = DIVUP(networkInfo.width, gridSizeW);
    for (auto& anchor : anchors) {
        anchor *= stride;
    }
    std::vector<NvDsInferParseObjectInfo> objects =
        decodeYoloV2Tensor((const float*)(layer.buffer), anchors, gridSizeW, gridSizeH, stride, kNUM_BBOXES,
                   num_classes, networkInfo.width, networkInfo.height);
    objectList = objects;
    return true;
 }
 // Exported bounding-box parser entry point.
 // Takes fresh copies of the model constants (kMASK, kANCHORS, kNUM_CLASSES,
 // kBETA_NMS) and dispatches: models without masks go to the YOLOv2 parser,
 // all others to the mask-based parser.
 extern "C" bool NvDsInferParseYolo(
    std::vector<NvDsInferLayerInfo> const& outputLayersInfo,
    NvDsInferNetworkInfo const& networkInfo,
    NvDsInferParseDetectionParams const& detectionParams,
    std::vector<NvDsInferParseObjectInfo>& objectList)
 {
    std::vector<std::vector<int>> masks = kMASK;
    std::vector<float> anchorValues = kANCHORS;
    int classCount = kNUM_CLASSES;
    float nmsBeta = kBETA_NMS;
    if (masks.empty()) {
        return NvDsInferParseYoloV2 (outputLayersInfo, networkInfo, detectionParams, objectList, anchorValues, classCount);
    }
    return NvDsInferParseYolo (outputLayersInfo, networkInfo, detectionParams, objectList, anchorValues, masks, classCount, nmsBeta);
 }
 CHECK_CUSTOM_PARSE_FUNC_PROTOTYPE(NvDsInferParseYolo);
--- a/examples/multiple_inferences/sgie1/nvdsinfer_custom_impl_Yolo/utils.cpp
+++ b/examples/multiple_inferences/sgie1/nvdsinfer_custom_impl_Yolo/utils.cpp
@@ -0,0 +1,150 @@
 /*
 * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 * Edited by Marcos Luciano
 * https://www.github.com/marcoslucianops
 */
 #include "utils.h"
 #include <experimental/filesystem>
 #include <iomanip>
 #include <algorithm>
 #include <math.h>
 // Removes leading whitespace from `s` in place.
 static void leftTrim(std::string& s)
 {
    // The predicate takes unsigned char: passing a negative char value
    // (any non-ASCII byte where char is signed) to isspace() is undefined.
    s.erase(s.begin(), std::find_if(s.begin(), s.end(), [](unsigned char ch) { return !isspace(ch); }));
 }
 // Removes trailing whitespace from `s` in place.
 static void rightTrim(std::string& s)
 {
    // The predicate takes unsigned char: passing a negative char value
    // (any non-ASCII byte where char is signed) to isspace() is undefined.
    s.erase(std::find_if(s.rbegin(), s.rend(), [](unsigned char ch) { return !isspace(ch); }).base(), s.end());
 }
 // Returns a copy of `s` with whitespace stripped from both ends.
 std::string trim(std::string s)
 {
    // The two passes touch opposite ends, so their order does not matter.
    rightTrim(s);
    leftTrim(s);
    return s;
 }
 // Limits `val` to the closed range [minVal, maxVal].
 // The range must be well formed; this is checked with an assert only.
 float clamp(const float val, const float minVal, const float maxVal)
 {
    assert(minVal <= maxVal);
    const float raisedToMin = std::max(minVal, val);
    const float cappedAtMax = std::min(maxVal, raisedToMin);
    return cappedAtMax;
 }
 // Reports whether `fileName` exists on disk.
 // When it does not and `verbose` is set, a notice is written to stdout.
 bool fileExists(const std::string fileName, bool verbose)
 {
    namespace fs = std::experimental::filesystem;
    if (fs::exists(fs::path(fileName)))
    {
        return true;
    }
    if (verbose)
    {
        std::cout << "File does not exist: " << fileName << std::endl;
    }
    return false;
 }
 // Loads every float stored in a weights file after its header.
 //   weightsFilePath - path of the binary weights file; it must exist.
 //   networkType     - model name; "yolov2" models other than "yolov2-tiny"
 //                     have a 4 x int32 header, all others a 5 x int32 header.
 // Returns the weights in file order. Progress is reported on stdout.
 std::vector<float> loadWeights(const std::string weightsFilePath, const std::string& networkType)
 {
    assert(fileExists(weightsFilePath));
    std::cout << "\nLoading pre-trained weights" << std::endl;
    std::ifstream file(weightsFilePath, std::ios_base::binary);
    assert(file.good());
    if (networkType.find("yolov2") != std::string::npos && networkType.find("yolov2-tiny") == std::string::npos)
    {
        // Remove 4 int32 bytes of data from the stream belonging to the header
        file.ignore(4 * 4);
    }
    else
    {
        // Remove 5 int32 bytes of data from the stream belonging to the header
        file.ignore(4 * 5);
    }
    std::vector<float> weights;
    // Read straight into a float object: this avoids reinterpreting a char
    // buffer as float, and the loop ends on the first incomplete read instead
    // of testing eof() before reading.
    float weight = 0.0f;
    while (file.read(reinterpret_cast<char*>(&weight), sizeof(weight)))
    {
        weights.push_back(weight);
    }
    // After the failed read gcount() is the number of leftover bytes; anything
    // but 0 means the file is truncated. The partial value is never stored.
    assert(file.gcount() == 0);
    std::cout << "Loading weights of " << networkType << " complete"
              << std::endl;
    std::cout << "Total weights read: " << weights.size() << std::endl;
    return weights;
 }
 // Formats tensor dimensions as right-aligned, width-4 fields joined by " x".
 // At least one dimension is required (assert only).
 std::string dimsToString(const nvinfer1::Dims d)
 {
    std::stringstream text;
    assert(d.nbDims >= 1);
    const int lastIdx = d.nbDims - 1;
    int idx = 0;
    // Every dimension except the last is followed by the separator.
    while (idx < lastIdx)
    {
        text << std::setw(4) << d.d[idx] << " x";
        ++idx;
    }
    text << std::setw(4) << d.d[lastIdx];
    return text.str();
 }
 // Prints "(<nbDims>) " followed by the type name of each dimension and a
 // newline to stdout. Dimension types outside the four known values print
 // nothing.
 void displayDimType(const nvinfer1::Dims d)
 {
    // Maps a dimension type to its label; unknown values map to "".
    auto labelOf = [](const nvinfer1::DimensionType type) -> const char* {
        switch (type)
        {
        case nvinfer1::DimensionType::kSPATIAL: return "kSPATIAL ";
        case nvinfer1::DimensionType::kCHANNEL: return "kCHANNEL ";
        case nvinfer1::DimensionType::kINDEX: return "kINDEX ";
        case nvinfer1::DimensionType::kSEQUENCE: return "kSEQUENCE ";
        }
        return "";
    };
    std::cout << "(" << d.nbDims << ") ";
    for (int dim = 0; dim < d.nbDims; ++dim)
    {
        std::cout << labelOf(d.type[dim]);
    }
    std::cout << std::endl;
 }
 // Returns the first dimension of a tensor, which the callers treat as its
 // channel count. The tensor must be 3-D (assert only).
 int getNumChannels(nvinfer1::ITensor* t)
 {
    const nvinfer1::Dims tensorDims = t->getDimensions();
    assert(tensorDims.nbDims == 3);
    const int channelCount = tensorDims.d[0];
    return channelCount;
 }
 // Returns the number of elements of a 3-D tensor (d[0] * d[1] * d[2]).
 // The input must have exactly three dimensions (assert only).
 uint64_t get3DTensorVolume(nvinfer1::Dims inputDims)
 {
    assert(inputDims.nbDims == 3);
    // Widen before multiplying: the dimensions are ints, and a product formed
    // in int overflows (undefined behaviour) before it could reach uint64_t.
    return static_cast<uint64_t>(inputDims.d[0]) * static_cast<uint64_t>(inputDims.d[1])
        * static_cast<uint64_t>(inputDims.d[2]);
 }
 // Prints one row of the layer table to stdout. Columns are left-aligned and
 // padded to 6, 24, 20, 20 and 7 characters; longer values are not truncated.
 // std::cout is left in left-aligned mode afterwards.
 void printLayerInfo(std::string layerIndex, std::string layerName, std::string layerInput,
                    std::string layerOutput, std::string weightPtr)
 {
    std::cout << std::left
              << std::setw(6) << layerIndex
              << std::setw(24) << layerName
              << std::setw(20) << layerInput
              << std::setw(20) << layerOutput
              << std::setw(7) << weightPtr
              << std::endl;
 }
--- a/examples/multiple_inferences/sgie1/nvdsinfer_custom_impl_Yolo/utils.h
+++ b/examples/multiple_inferences/sgie1/nvdsinfer_custom_impl_Yolo/utils.h
@@ -0,0 +1,53 @@
 /*
 * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 * Edited by Marcos Luciano
 * https://www.github.com/marcoslucianops
 */
 #ifndef __UTILS_H__
 #define __UTILS_H__
 #include <map>
 #include <vector>
 #include <cassert>
 #include <iostream>
 #include <fstream>
 #include "NvInfer.h"
 // Marks an expression as intentionally unused to silence compiler warnings.
 #define UNUSED(expr) (void)(expr)
 // Integer ceiling division of n by d. The whole expansion is parenthesised so
 // the macro stays correct inside larger expressions such as `k * DIVUP(n, d)`.
 #define DIVUP(n, d) (((n) + (d) - 1) / (d))
 // Returns a copy of `s` with leading and trailing whitespace removed.
 std::string trim(std::string s);
 // Limits `val` to the range [minVal, maxVal]; asserts minVal <= maxVal.
 float clamp(const float val, const float minVal, const float maxVal);
 // Returns true when `fileName` exists. When it does not and `verbose` is set,
 // a "File does not exist" message is printed to stdout.
 bool fileExists(const std::string fileName, bool verbose = true);
 // Reads the floats stored after the header of a binary weights file. The
 // header length is chosen from `networkType`.
 std::vector<float> loadWeights(const std::string weightsFilePath, const std::string& networkType);
 // Formats the dimensions as width-4 fields separated by " x".
 std::string dimsToString(const nvinfer1::Dims d);
 // Prints the dimension count and the type name of each dimension to stdout.
 void displayDimType(const nvinfer1::Dims d);
 // Returns dimension 0 of the tensor; asserts that the tensor is 3-D.
 int getNumChannels(nvinfer1::ITensor* t);
 // Returns the product of the three dimensions; asserts that there are three.
 uint64_t get3DTensorVolume(nvinfer1::Dims inputDims);
 // Prints one left-aligned, fixed-width row of the layer table to stdout.
 void printLayerInfo(std::string layerIndex, std::string layerName, std::string layerInput,
                    std::string layerOutput, std::string weightPtr);
 #endif
--- a/examples/multiple_inferences/sgie1/nvdsinfer_custom_impl_Yolo/yolo.cpp
+++ b/examples/multiple_inferences/sgie1/nvdsinfer_custom_impl_Yolo/yolo.cpp
@@ -0,0 +1,507 @@
 /*
 * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 * Edited by Marcos Luciano
 * https://www.github.com/marcoslucianops
 */
 #include "yolo.h"
 #include "yoloPlugins.h"
 #include <stdlib.h>
 #ifdef OPENCV
 #include "calibrator.h"
 #endif
 // Reorders the masks so that their first elements are in descending order.
 // Uses a simple exchange sort; masks whose first elements are equal are
 // swapped as well, so their relative order is not preserved. Every mask must
 // contain at least one element.
 void orderParams(std::vector<std::vector<int>> *maskVector) {
    std::vector<std::vector<int>> &masks = *maskVector;
    const std::size_t maskCount = masks.size();
    for (std::size_t lhs = 0; lhs < maskCount; ++lhs) {
        for (std::size_t rhs = lhs + 1; rhs < maskCount; ++rhs) {
            if (masks[rhs][0] >= masks[lhs][0]) {
                masks[lhs].swap(masks[rhs]);
            }
        }
    }
 }
 // Stores the settings from `networkInfo`. The input dimensions and size start
 // at 0; createEngine() calls parseConfigBlocks() before it uses them.
 Yolo::Yolo(const NetworkInfo& networkInfo)
    : m_NetworkType(networkInfo.networkType), // model name, e.g. matched against "yolov2" by loadWeights
      m_ConfigFilePath(networkInfo.configFilePath), // path of the YOLO cfg file
      m_WtsFilePath(networkInfo.wtsFilePath), // path of the YOLO weights file
      m_Int8CalibPath(networkInfo.int8CalibPath), // path of the INT8 calibration table
      m_NetworkMode(networkInfo.networkMode), // precision: FP32, INT8 or FP16
      m_DeviceType(networkInfo.deviceType), // kDLA or kGPU
      m_InputBlobName(networkInfo.inputBlobName), // name given to the network input tensor
      m_InputH(0),
      m_InputW(0),
      m_InputC(0),
      m_InputSize(0)
 {}
 // Releases the network helper data through destroyNetworkUtils().
 Yolo::~Yolo()
 {
    destroyNetworkUtils();
 }
 // Builds a TensorRT engine for the configured YOLO model.
 // Parses the cfg file, orders the output masks, sets up INT8 calibration when
 // INT8 mode is requested and no calibration table exists yet, builds the
 // network definition and finally the engine.
 //   builder - TensorRT builder; must not be null.
 // Returns the engine, or nullptr when the network could not be created or
 // parsed, or when the engine build failed. Aborts the process when INT8
 // calibration is required but OpenCV support or the INT8_CALIB_IMG_PATH /
 // INT8_CALIB_BATCH_SIZE environment variables are missing.
 nvinfer1::ICudaEngine *Yolo::createEngine (nvinfer1::IBuilder* builder)
 {
    assert (builder);
    m_ConfigBlocks = parseConfigFile(m_ConfigFilePath);
    parseConfigBlocks();
    orderParams(&m_OutputMasks);
    if (m_NetworkMode == "INT8" && !fileExists(m_Int8CalibPath)) {
        assert(builder->platformHasFastInt8());
 #ifdef OPENCV
        std::string calib_image_list;
        int calib_batch_size;
        if (getenv("INT8_CALIB_IMG_PATH")) {
            calib_image_list = getenv("INT8_CALIB_IMG_PATH");
        }
        else {
            std::cerr << "INT8_CALIB_IMG_PATH not set" << std::endl;
            std::abort();
        }
        if (getenv("INT8_CALIB_BATCH_SIZE")) {
            calib_batch_size = std::stoi(getenv("INT8_CALIB_BATCH_SIZE"));
        }
        else {
            std::cerr << "INT8_CALIB_BATCH_SIZE not set" << std::endl;
            std::abort();
        }
        // NOTE(review): the calibrator is never deleted in this function; it
        // has to stay alive at least until buildCudaEngine() returns.
        nvinfer1::int8EntroyCalibrator *calibrator = new nvinfer1::int8EntroyCalibrator(calib_batch_size, m_InputC, m_InputH, m_InputW, m_LetterBox, calib_image_list, m_Int8CalibPath);
        builder->setInt8Mode(true);
        builder->setInt8Calibrator(calibrator);
 #else
        std::cerr << "OpenCV is required to run INT8 calibrator" << std::endl;
        std::abort();
 #endif
    }
    // The weights are loaded by parseModel(); loading them here as well only
    // read the whole weights file a second time into an unused vector.
    nvinfer1::INetworkDefinition *network = builder->createNetwork();
    if (network == nullptr) {
        std::cerr << "Creating network definition failed" << std::endl;
        return nullptr;
    }
    if (parseModel(*network) != NVDSINFER_SUCCESS) {
        network->destroy();
        return nullptr;
    }
    std::cout << "Building the TensorRT Engine" << std::endl;
    if (m_LetterBox == 1) {
        std::cout << "\nNOTE: letter_box is set in cfg file, make sure to set maintain-aspect-ratio=1 in config_infer file to get better accuracy\n" << std::endl;
    }
    nvinfer1::ICudaEngine * engine = builder->buildCudaEngine(*network);
    if (engine) {
        std::cout << "Building complete\n" << std::endl;
    } else {
        std::cerr << "Building engine failed\n" << std::endl;
    }
    network->destroy();
    return engine;
 }
 // Loads the weights file and populates `network` with the YOLO layers.
 // Any previous network helper data is released first. Returns the status
 // reported by buildYoloNetwork() and logs the outcome.
 NvDsInferStatus Yolo::parseModel(nvinfer1::INetworkDefinition& network) {
    destroyNetworkUtils();
    std::vector<float> modelWeights = loadWeights(m_WtsFilePath, m_NetworkType);
    std::cout << "Building YOLO network" << std::endl;
    const NvDsInferStatus buildStatus = buildYoloNetwork(modelWeights, network);
    if (buildStatus != NVDSINFER_SUCCESS) {
        std::cerr << "Building YOLO network failed" << std::endl;
        return buildStatus;
    }
    std::cout << "Building YOLO network complete" << std::endl;
    return buildStatus;
 }
 // Builds the TensorRT network described by m_ConfigBlocks.
 // Walks the parsed cfg blocks in order, adds the matching layer for each one
 // to `network` and feeds it from the output of the previous layer. A summary
 // row is printed for every layer.
 //   weights - flat weight array for the whole model; `weightPtr` is the
 //             running offset into it (presumably advanced by
 //             convolutionalLayer, whose signature is not visible here).
 //   network - network definition that receives the input, layers and outputs.
 // Returns NVDSINFER_SUCCESS. Every failure path is an assert, so in a build
 // with NDEBUG defined those checks are compiled out.
 NvDsInferStatus Yolo::buildYoloNetwork(
    std::vector<float>& weights, nvinfer1::INetworkDefinition& network) {
    int weightPtr = 0;
    int channels = m_InputC;
    // Network input in CHW order, named after the configured input blob.
    nvinfer1::ITensor* data =
        network.addInput(m_InputBlobName.c_str(), nvinfer1::DataType::kFLOAT,
            nvinfer1::DimsCHW{static_cast<int>(m_InputC),
                static_cast<int>(m_InputH), static_cast<int>(m_InputW)});
    assert(data != nullptr && data->getDimensions().nbDims > 0);
    nvinfer1::ITensor* previous = data;
    // Output tensor of every layer added so far. The "net" block adds no entry,
    // every other block type adds exactly one.
    std::vector<nvinfer1::ITensor*> tensorOutputs;
    // Index of the next yolo/region entry in m_OutputTensors.
    uint outputTensorCount = 0;
    for (uint i = 0; i < m_ConfigBlocks.size(); ++i) {
        assert(getNumChannels(previous) == channels);
        std::string layerIndex = "(" + std::to_string(tensorOutputs.size()) + ")";
        if (m_ConfigBlocks.at(i).at("type") == "net") {
            // No layer for the "net" block; it only prints the table header.
            printLayerInfo("", "layer", "     input", "     outup", "weightPtr");
        }
        else if (m_ConfigBlocks.at(i).at("type") == "convolutional") {
            std::string inputVol = dimsToString(previous->getDimensions());
            nvinfer1::ILayer* out = convolutionalLayer(i, m_ConfigBlocks.at(i), weights, m_TrtWeights, weightPtr, channels, previous, &network);
            previous = out->getOutput(0);
            assert(previous != nullptr);
            channels = getNumChannels(previous);
            std::string outputVol = dimsToString(previous->getDimensions());
            tensorOutputs.push_back(previous);
            std::string layerType = "conv_" + m_ConfigBlocks.at(i).at("activation");
            printLayerInfo(layerIndex, layerType, inputVol, outputVol, std::to_string(weightPtr));
        }
        else if (m_ConfigBlocks.at(i).at("type") == "dropout") {
            assert(m_ConfigBlocks.at(i).find("probability") != m_ConfigBlocks.at(i).end());
            //float probability = std::stof(m_ConfigBlocks.at(i).at("probability"));
            //nvinfer1::ILayer* out = dropoutLayer(probability, previous, &network);
            //previous = out->getOutput(0);
            // Dropout is skipped: the previous tensor is recorded again so that
            // tensorOutputs keeps one entry per block.
            assert(previous != nullptr);
            tensorOutputs.push_back(previous);
            printLayerInfo(layerIndex, "dropout", "        -", "        -", "    -");
        }
        else if (m_ConfigBlocks.at(i).at("type") == "shortcut") {
            assert(m_ConfigBlocks.at(i).find("activation") != m_ConfigBlocks.at(i).end());
            assert(m_ConfigBlocks.at(i).find("from") != m_ConfigBlocks.at(i).end());
            std::string activation = m_ConfigBlocks.at(i).at("activation");
            int from = stoi(m_ConfigBlocks.at(i).at("from"));
            // A positive `from` is rewritten so that `i + from - 1` below equals
            // the original value; a negative one is used as a relative offset.
            if (from > 0) {
                from = from - i + 1;
            }
            // NOTE(review): `i` is unsigned, so these expressions are evaluated
            // in unsigned arithmetic and the `>= 0` halves are always true.
            assert((i - 2 >= 0) && (i - 2 < tensorOutputs.size()));
            assert((i + from - 1 >= 0) && (i + from - 1 < tensorOutputs.size()));
            assert(i + from - 1 < i - 2);
            std::string inputVol = dimsToString(previous->getDimensions());
            std::string shortcutVol = dimsToString(tensorOutputs[i + from - 1]->getDimensions());
            nvinfer1::ILayer* out = shortcutLayer(i, activation, inputVol, shortcutVol, previous, tensorOutputs[i + from - 1], &network);
            previous = out->getOutput(0);
            assert(previous != nullptr);
            std::string outputVol = dimsToString(previous->getDimensions());
            tensorOutputs.push_back(previous);
            std::string layerType = "shortcut_" + m_ConfigBlocks.at(i).at("activation") + ": " + std::to_string(i + from - 1);
            printLayerInfo(layerIndex, layerType, "        -", outputVol, "    -");
            // Report the two shapes when they differ.
            if (inputVol != shortcutVol) {
                std::cout << inputVol << " +" << shortcutVol << std::endl;
            }
        }
        else if (m_ConfigBlocks.at(i).at("type") == "route") {
            assert(m_ConfigBlocks.at(i).find("layers") != m_ConfigBlocks.at(i).end());
            nvinfer1::ILayer* out = routeLayer(i, m_ConfigBlocks.at(i), tensorOutputs, &network);
            previous = out->getOutput(0);
            assert(previous != nullptr);
            channels = getNumChannels(previous);
            std::string outputVol = dimsToString(previous->getDimensions());
            tensorOutputs.push_back(previous);
            printLayerInfo(layerIndex, "route", "        -", outputVol, std::to_string(weightPtr));
        }
        else if (m_ConfigBlocks.at(i).at("type") == "upsample") {
            std::string inputVol = dimsToString(previous->getDimensions());
            // Unlike the other layer builders this one is given i - 1 as index.
            nvinfer1::ILayer* out = upsampleLayer(i - 1, m_ConfigBlocks[i], previous, &network);
            previous = out->getOutput(0);
            assert(previous != nullptr);
            std::string outputVol = dimsToString(previous->getDimensions());
            tensorOutputs.push_back(previous);
            printLayerInfo(layerIndex, "upsample", inputVol, outputVol, "    -");
        }
        else if (m_ConfigBlocks.at(i).at("type") == "maxpool") {
            std::string inputVol = dimsToString(previous->getDimensions());
            nvinfer1::ILayer* out = maxpoolLayer(i, m_ConfigBlocks.at(i), previous, &network);
            previous = out->getOutput(0);
            assert(previous != nullptr);
            std::string outputVol = dimsToString(previous->getDimensions());
            tensorOutputs.push_back(previous);
            printLayerInfo(layerIndex, "maxpool", inputVol, outputVol, std::to_string(weightPtr));
        }
        else if (m_ConfigBlocks.at(i).at("type") == "yolo") {
            // Fill in the grid size, stride and volume of this output tensor
            // from the shape of the tensor feeding the yolo layer.
            nvinfer1::Dims prevTensorDims = previous->getDimensions();
            TensorInfo& curYoloTensor = m_OutputTensors.at(outputTensorCount);
            curYoloTensor.gridSizeY = prevTensorDims.d[1];
            curYoloTensor.gridSizeX = prevTensorDims.d[2];
            // The stride is derived from the height only.
            curYoloTensor.stride = m_InputH / curYoloTensor.gridSizeY;
            m_OutputTensors.at(outputTensorCount).volume = curYoloTensor.gridSizeY
                * curYoloTensor.gridSizeX
                * (curYoloTensor.numBBoxes * (5 + curYoloTensor.numClasses));
            std::string layerName = "yolo_" + std::to_string(i);
            curYoloTensor.blobName = layerName;
            // Optional cfg keys; these values apply when a key is absent.
            int new_coords = 0;
            float scale_x_y = 1;
            float beta_nms = 0.45;
            if (m_ConfigBlocks.at(i).find("new_coords") != m_ConfigBlocks.at(i).end()) {
                new_coords = std::stoi(m_ConfigBlocks.at(i).at("new_coords"));
            }
            if (m_ConfigBlocks.at(i).find("scale_x_y") != m_ConfigBlocks.at(i).end()) {
                scale_x_y = std::stof(m_ConfigBlocks.at(i).at("scale_x_y"));
            }
            if (m_ConfigBlocks.at(i).find("beta_nms") != m_ConfigBlocks.at(i).end()) {
                beta_nms = std::stof(m_ConfigBlocks.at(i).at("beta_nms"));
            }
            // NOTE(review): the plugin object is allocated with new and not
            // deleted in this function — confirm who owns it.
            nvinfer1::IPluginV2* yoloPlugin
                = new YoloLayer(m_OutputTensors.at(outputTensorCount).numBBoxes,
                                  m_OutputTensors.at(outputTensorCount).numClasses,
                                  m_OutputTensors.at(outputTensorCount).gridSizeX,
                                  m_OutputTensors.at(outputTensorCount).gridSizeY,
                                  1, new_coords, scale_x_y, beta_nms,
                                  curYoloTensor.anchors,
                                  m_OutputMasks);
            assert(yoloPlugin != nullptr);
            nvinfer1::IPluginV2Layer* yolo =
                network.addPluginV2(&previous, 1, *yoloPlugin);
            assert(yolo != nullptr);
            yolo->setName(layerName.c_str());
            std::string inputVol = dimsToString(previous->getDimensions());
            previous = yolo->getOutput(0);
            assert(previous != nullptr);
            // The output tensor shares the layer name and becomes a network output.
            previous->setName(layerName.c_str());
            std::string outputVol = dimsToString(previous->getDimensions());
            network.markOutput(*previous);
            channels = getNumChannels(previous);
            tensorOutputs.push_back(yolo->getOutput(0));
            printLayerInfo(layerIndex, "yolo", inputVol, outputVol, std::to_string(weightPtr));
            ++outputTensorCount;
        }
        // YOLOv2 support: the "region" block is handled like "yolo", but the
        // plugin gets an empty mask list and fixed values for the other options.
        else if (m_ConfigBlocks.at(i).at("type") == "region") {
            nvinfer1::Dims prevTensorDims = previous->getDimensions();
            TensorInfo& curRegionTensor = m_OutputTensors.at(outputTensorCount);
            curRegionTensor.gridSizeY = prevTensorDims.d[1];
            curRegionTensor.gridSizeX = prevTensorDims.d[2];
            // The stride is derived from the height only.
            curRegionTensor.stride = m_InputH / curRegionTensor.gridSizeY;
            m_OutputTensors.at(outputTensorCount).volume = curRegionTensor.gridSizeY
                * curRegionTensor.gridSizeX
                * (curRegionTensor.numBBoxes * (5 + curRegionTensor.numClasses));
            std::string layerName = "region_" + std::to_string(i);
            curRegionTensor.blobName = layerName;
            std::vector<std::vector<int>> mask;
            nvinfer1::IPluginV2* regionPlugin
                = new YoloLayer(curRegionTensor.numBBoxes,
                                  curRegionTensor.numClasses,
                                  curRegionTensor.gridSizeX,
                                  curRegionTensor.gridSizeY,
                                  0, 0, 1.0, 0,
                                  curRegionTensor.anchors,
                                  mask);
            assert(regionPlugin != nullptr);
            nvinfer1::IPluginV2Layer* region =
                network.addPluginV2(&previous, 1, *regionPlugin);
            assert(region != nullptr);
            region->setName(layerName.c_str());
            std::string inputVol = dimsToString(previous->getDimensions());
            previous = region->getOutput(0);
            assert(previous != nullptr);
            previous->setName(layerName.c_str());
            std::string outputVol = dimsToString(previous->getDimensions());
            network.markOutput(*previous);
            channels = getNumChannels(previous);
            tensorOutputs.push_back(region->getOutput(0));
            printLayerInfo(layerIndex, "region", inputVol, outputVol, std::to_string(weightPtr));
            ++outputTensorCount;
        }
        else if (m_ConfigBlocks.at(i).at("type") == "reorg") {
            std::string inputVol = dimsToString(previous->getDimensions());
            // The reorg plugin is always created with the argument 2.
            nvinfer1::IPluginV2* reorgPlugin = createReorgPlugin(2);
            assert(reorgPlugin != nullptr);
            nvinfer1::IPluginV2Layer* reorg =
                network.addPluginV2(&previous, 1, *reorgPlugin);
            assert(reorg != nullptr);
            std::string layerName = "reorg_" + std::to_string(i);
            reorg->setName(layerName.c_str());
            previous = reorg->getOutput(0);
            assert(previous != nullptr);
            std::string outputVol = dimsToString(previous->getDimensions());
            channels = getNumChannels(previous);
            tensorOutputs.push_back(reorg->getOutput(0));
            printLayerInfo(layerIndex, "reorg", inputVol, outputVol, std::to_string(weightPtr));
        }
        else
        {
            std::cout << "Unsupported layer type --> \""
                      << m_ConfigBlocks.at(i).at("type") << "\"" << std::endl;
            assert(0);
        }
    }
    // The cfg and the weights file must agree: weightPtr has to end exactly at
    // the end of the weight array.
    if ((int)weights.size() != weightPtr)
    {
        std::cout << "Number of unused weights left: " << weights.size() - weightPtr << std::endl;
        assert(0);
    }
    std::cout << "Output YOLO blob names: " << std::endl;
    for (auto& tensor : m_OutputTensors) {
        std::cout << tensor.blobName << std::endl;
    }
    int nbLayers = network.getNbLayers();
    std::cout << "Total number of YOLO layers: " << nbLayers << std::endl;
    return NVDSINFER_SUCCESS;
 }
 // Parses a Darknet-style network cfg file into an ordered list of blocks.
 //
 // Each "[section]" line starts a new block whose "type" entry holds the
 // section name; every following "key=value" line is added to that block with
 // both sides trimmed. If a key is repeated inside a block the first value is
 // kept (std::map::insert does not overwrite).
 //
 // cfgFilePath: path of the cfg file; asserted to exist and to be readable.
 // Returns the blocks in file order. The block under construction when the
 // file ends is always appended, so an empty file yields one empty block.
 std::vector<std::map<std::string, std::string>>
 Yolo::parseConfigFile (const std::string cfgFilePath)
 {
    assert(fileExists(cfgFilePath));
    std::ifstream file(cfgFilePath);
    assert(file.good());

    std::string line;
    std::vector<std::map<std::string, std::string>> blocks;
    std::map<std::string, std::string> block;

    while (getline(file, line))
    {
        if (line.size() == 0) continue;

        // Trim before inspecting the first character: a whitespace-only line
        // becomes empty (front() on it would be undefined behaviour) and an
        // indented '#' comment must still be recognised as a comment.
        line = trim(line);
        if (line.empty()) continue;
        if (line.front() == '#') continue;

        if (line.front() == '[')
        {
            // A new section header closes the block collected so far.
            if (block.size() > 0)
            {
                blocks.push_back(block);
                block.clear();
            }
            std::string key = "type";
            std::string value = trim(line.substr(1, line.size() - 2));
            block.insert(std::pair<std::string, std::string>(key, value));
        }
        else
        {
            const std::string::size_type cpos = line.find('=');
            if (cpos == std::string::npos)
            {
                // Not a "key=value" pair: report it instead of storing the
                // whole line as both key and value.
                std::cout << "Ignoring malformed cfg line --> \"" << line << "\"" << std::endl;
                continue;
            }
            std::string key = trim(line.substr(0, cpos));
            std::string value = trim(line.substr(cpos + 1));
            block.insert(std::pair<std::string, std::string>(key, value));
        }
    }
    blocks.push_back(block);
    return blocks;
 }
 // Extracts network-level and output-layer parameters from m_ConfigBlocks.
 //
 // "net" block: fills m_InputH / m_InputW / m_InputC / m_InputSize and
 // m_LetterBox (0 when the cfg has no "letter_box" entry).
 // "region" / "yolo" blocks: appends one TensorInfo per block to
 // m_OutputTensors (anchors, masks, numBBoxes, numClasses) and, when the block
 // has a "mask" entry, the mask list to m_OutputMasks.
 // Other block types are ignored here.
 void Yolo::parseConfigBlocks()
 {
    // Splits a comma-separated list into trimmed tokens. A trailing comma does
    // not produce an extra token; an empty list produces no tokens.
    auto splitList = [](std::string list) -> std::vector<std::string> {
        std::vector<std::string> tokens;
        while (!list.empty())
        {
            const std::string::size_type comma = list.find_first_of(',');
            if (comma == std::string::npos)
            {
                tokens.push_back(trim(list));
                break;
            }
            tokens.push_back(trim(list.substr(0, comma)));
            list.erase(0, comma + 1);
        }
        return tokens;
    };

    // Iterate by const reference: the blocks are only read, so copying every
    // map per iteration is unnecessary.
    for (const auto& block : m_ConfigBlocks) {
        const std::string& type = block.at("type");
        if (type == "net")
        {
            assert((block.find("height") != block.end())
                   && "Missing 'height' param in network cfg");
            assert((block.find("width") != block.end()) && "Missing 'width' param in network cfg");
            assert((block.find("channels") != block.end())
                   && "Missing 'channels' param in network cfg");
            m_InputH = std::stoul(block.at("height"));
            m_InputW = std::stoul(block.at("width"));
            m_InputC = std::stoul(block.at("channels"));
            m_InputSize = m_InputC * m_InputH * m_InputW;

            const auto letterBox = block.find("letter_box");
            if (letterBox != block.end()) {
                m_LetterBox = std::stoul(letterBox->second);
            }
            else {
                m_LetterBox = 0;
            }
        }
        else if ((type == "region") || (type == "yolo"))
        {
            assert((block.find("num") != block.end())
                   && std::string("Missing 'num' param in " + type + " layer").c_str());
            assert((block.find("classes") != block.end())
                   && std::string("Missing 'classes' param in " + type + " layer")
                          .c_str());
            assert((block.find("anchors") != block.end())
                   && std::string("Missing 'anchors' param in " + type + " layer")
                          .c_str());

            TensorInfo outputTensor;
            for (const std::string& token : splitList(block.at("anchors")))
            {
                outputTensor.anchors.push_back(std::stof(token));
            }

            const auto maskEntry = block.find("mask");
            if (maskEntry != block.end()) {
                std::vector<int> pMASKS;
                for (const std::string& token : splitList(maskEntry->second))
                {
                    const int mask = std::stoul(token);
                    pMASKS.push_back(mask);
                    outputTensor.masks.push_back(mask);
                }
                m_OutputMasks.push_back(pMASKS);
            }

            // With a mask list the layer predicts one box per mask entry;
            // otherwise the cfg's "num" value is used.
            outputTensor.numBBoxes = outputTensor.masks.size() > 0
                ? outputTensor.masks.size()
                : std::stoul(trim(block.at("num")));
            outputTensor.numClasses = std::stoul(block.at("classes"));
            m_OutputTensors.push_back(outputTensor);
        }
    }
 }
 void Yolo::destroyNetworkUtils() {
    for (uint i = 0; i < m_TrtWeights.size(); ++i) {
        if (m_TrtWeights[i].count > 0)
            free(const_cast<void*>(m_TrtWeights[i].values));
    }
    m_TrtWeights.clear();
 }
--- a/examples/multiple_inferences/sgie1/nvdsinfer_custom_impl_Yolo/yolo.h
+++ b/examples/multiple_inferences/sgie1/nvdsinfer_custom_impl_Yolo/yolo.h
@@ -0,0 +1,105 @@
 /*
 * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 * Edited by Marcos Luciano
 * https://www.github.com/marcoslucianops
 */
 #ifndef _YOLO_H_
 #define _YOLO_H_
 #include "layers/convolutional_layer.h"
 #include "layers/dropout_layer.h"
 #include "layers/shortcut_layer.h"
 #include "layers/route_layer.h"
 #include "layers/upsample_layer.h"
 #include "layers/maxpool_layer.h"
 #include "nvdsinfer_custom_impl.h"
 // Plain bundle of string settings handed to the Yolo constructor.
 // The Yolo class below keeps one const m_* member per field; how each value
 // is interpreted is decided elsewhere (not visible in this header).
 struct NetworkInfo
 {
    std::string networkType;     // returned by Yolo::getModelName() when no config path is set
    std::string configFilePath;  // presumably the Darknet .cfg path -- confirm against caller
    std::string wtsFilePath;     // presumably the weights file path -- confirm against caller
    std::string int8CalibPath;   // presumably the INT8 calibration table path -- confirm against caller
    std::string networkMode;     // NOTE(review): likely the precision mode; verify accepted values
    std::string deviceType;      // NOTE(review): likely the target device; verify accepted values
    std::string inputBlobName;   // presumably the name given to the network input tensor -- confirm
 };
 // Per-output-layer description. Yolo::parseConfigBlocks() creates one entry
 // for every "region"/"yolo" cfg block and fills anchors, masks, numBBoxes and
 // numClasses; the remaining fields keep their defaults until set elsewhere.
 struct TensorInfo
 {
    std::string blobName;        // printed as the output blob name after the network is built
    uint stride{0};              // NOTE(review): presumably input size / grid size -- set outside this view
    uint gridSizeY{0};           // NOTE(review): presumably grid rows -- set outside this view
    uint gridSizeX{0};           // NOTE(review): presumably grid columns -- set outside this view
    uint numClasses{0};          // value of the block's "classes" entry
    uint numBBoxes{0};           // number of mask entries, or the block's "num" entry when there is no mask
    uint64_t volume{0};          // NOTE(review): presumably element count of the output -- confirm
    std::vector<uint> masks;     // values of the block's "mask" entry (empty when absent)
    std::vector<float> anchors;  // values of the block's "anchors" entry, in file order
    int bindingIndex{-1};        // -1 until assigned; assignment is not visible here
    float* hostBuffer{nullptr};  // nullptr until assigned; ownership is not visible here
 };
 // IModelParser implementation that builds a TensorRT network from a
 // Darknet-style cfg/weights pair. Only the declarations are visible here.
 class Yolo : public IModelParser {
 public:
    // Constructs the parser from the given settings (definition not visible here).
    Yolo(const NetworkInfo& networkInfo);
    ~Yolo() override;
    // Always reports that full-dimension networks are not supported.
    bool hasFullDimsSupported() const override { return false; }
    // Returns the config file path, or the network type when no path is set.
    const char* getModelName() const override {
        return m_ConfigFilePath.empty() ? m_NetworkType.c_str()
                                        : m_ConfigFilePath.c_str();
    }
    // Populates `network` with the model's layers (definition not visible here).
    NvDsInferStatus parseModel(nvinfer1::INetworkDefinition& network) override;
    // Builds an engine with the given builder (definition not visible here).
    nvinfer1::ICudaEngine *createEngine (nvinfer1::IBuilder* builder);
 protected:
    // One const member per NetworkInfo field.
    const std::string m_NetworkType;
    const std::string m_ConfigFilePath;
    const std::string m_WtsFilePath;
    const std::string m_Int8CalibPath;
    const std::string m_NetworkMode;
    const std::string m_DeviceType;
    const std::string m_InputBlobName;
    // One entry per "region"/"yolo" cfg block, appended by parseConfigBlocks().
    std::vector<TensorInfo> m_OutputTensors;
    // One mask list per output block that has a "mask" entry.
    std::vector<std::vector<int>> m_OutputMasks;
    // Key/value blocks in the format returned by parseConfigFile().
    std::vector<std::map<std::string, std::string>> m_ConfigBlocks;
    // Input height/width/channels read from the cfg "net" block.
    uint m_InputH;
    uint m_InputW;
    uint m_InputC;
    // m_InputC * m_InputH * m_InputW.
    uint64_t m_InputSize;
    // Value of the "net" block's "letter_box" entry, 0 when absent.
    uint m_LetterBox;
    // Weight descriptors; destroyNetworkUtils() frees the buffers of non-empty entries.
    std::vector<nvinfer1::Weights> m_TrtWeights;
 private:
    // Adds the layers described by m_ConfigBlocks to `network`, consuming `weights`.
    NvDsInferStatus buildYoloNetwork(
        std::vector<float>& weights, nvinfer1::INetworkDefinition& network);
    // Reads a cfg file into a list of key/value blocks.
    std::vector<std::map<std::string, std::string>> parseConfigFile(
        const std::string cfgFilePath);
    // Fills the input dimensions and m_OutputTensors from m_ConfigBlocks.
    void parseConfigBlocks();
    // Frees the buffers referenced by m_TrtWeights and clears the list.
    void destroyNetworkUtils();
 };
 #endif // _YOLO_H_
--- a/examples/multiple_inferences/sgie1/nvdsinfer_custom_impl_Yolo/yoloForward.cu
+++ b/examples/multiple_inferences/sgie1/nvdsinfer_custom_impl_Yolo/yoloForward.cu
@@ -0,0 +1,166 @@
 /*
 * Copyright (c) 2018-2019 NVIDIA Corporation.  All rights reserved.
 *
 * NVIDIA Corporation and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto.  Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA Corporation is strictly prohibited.
 *
 * Edited by Marcos Luciano
 * https://www.github.com/marcoslucianops
 *
 */
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <string.h>
 // Logistic sigmoid 1 / (1 + e^-x), evaluated with the fast __expf intrinsic.
 inline __device__ float sigmoidGPU(const float& x)
 {
    const float expNegX = __expf(-x);
    return 1.0f / (1.0f + expNegX);
 }
 // Decodes the raw activations of one image for a "yolo" output layer.
 //
 // Launch layout: 3D. Thread (x, y, z) handles grid column x, grid row y and
 // box z; threads outside gridSizeX x gridSizeY x numBBoxes exit immediately.
 // Data layout (input and output): for box z, attribute k (0..4 = x, y, w, h,
 // objectness, 5.. = class scores) of cell (x, y) is stored at
 //   (y * gridSizeX + x) + gridSizeX * gridSizeY * (z * (5 + numOutputClasses) + k).
 //
 // new_coords == 1: x/y are scaled as v * scale_x_y - 0.5 * (scale_x_y - 1),
 //   w/h become (2 * v)^2, objectness and class scores are copied unchanged.
 // otherwise: x/y are sigmoid(v) with the same scale/offset, w/h are exp(v),
 //   objectness and class scores are sigmoid(v).
 // All arithmetic is done in single precision.
 __global__ void gpuYoloLayer(const float* input, float* output, const uint gridSizeX, const uint gridSizeY, const uint numOutputClasses,
                                const uint numBBoxes, const uint new_coords, const float scale_x_y)
 {
    const uint x_id = blockIdx.x * blockDim.x + threadIdx.x;
    const uint y_id = blockIdx.y * blockDim.y + threadIdx.y;
    const uint z_id = blockIdx.z * blockDim.z + threadIdx.z;
    if ((x_id >= gridSizeX) || (y_id >= gridSizeY) || (z_id >= numBBoxes))
    {
        return;
    }

    const uint numGridCells = gridSizeX * gridSizeY;
    const uint bbindex = y_id * gridSizeX + x_id;
    // Index of attribute 0 for this cell/box; attribute k is numGridCells * k further on.
    const uint base = bbindex + numGridCells * (z_id * (5 + numOutputClasses));

    const float alpha = scale_x_y;
    const float beta = -0.5f * (scale_x_y - 1.0f);

    if (new_coords == 1) {
        output[base + numGridCells * 0] = input[base + numGridCells * 0] * alpha + beta;
        output[base + numGridCells * 1] = input[base + numGridCells * 1] * alpha + beta;

        // (2 * v)^2 written as an explicit float product instead of pow().
        const float w = input[base + numGridCells * 2] * 2.0f;
        output[base + numGridCells * 2] = w * w;
        const float h = input[base + numGridCells * 3] * 2.0f;
        output[base + numGridCells * 3] = h * h;

        output[base + numGridCells * 4] = input[base + numGridCells * 4];
        for (uint i = 0; i < numOutputClasses; ++i)
        {
            const uint idx = base + numGridCells * (5 + i);
            output[idx] = input[idx];
        }
    }
    else {
        output[base + numGridCells * 0] = sigmoidGPU(input[base + numGridCells * 0]) * alpha + beta;
        output[base + numGridCells * 1] = sigmoidGPU(input[base + numGridCells * 1]) * alpha + beta;
        output[base + numGridCells * 2] = __expf(input[base + numGridCells * 2]);
        output[base + numGridCells * 3] = __expf(input[base + numGridCells * 3]);
        output[base + numGridCells * 4] = sigmoidGPU(input[base + numGridCells * 4]);
        for (uint i = 0; i < numOutputClasses; ++i)
        {
            const uint idx = base + numGridCells * (5 + i);
            output[idx] = sigmoidGPU(input[idx]);
        }
    }
 }
 // Decodes the raw activations of one image for a "region" output layer.
 //
 // Launch layout: 3D. Thread (x, y, z) handles grid column x, grid row y and
 // box z; threads outside gridSizeX x gridSizeY x numBBoxes exit immediately.
 // Data layout (input and output): for box z, attribute k of cell (x, y) is at
 //   (y * gridSizeX + x) + gridSizeX * gridSizeY * (z * (5 + numOutputClasses) + k).
 //
 // x, y and objectness are passed through a sigmoid, w and h through exp, and
 // the class scores through a softmax that subtracts the largest logit before
 // exponentiating.
 __global__ void gpuRegionLayer(const float* input, float* output, const uint gridSizeX, const uint gridSizeY, const uint numOutputClasses,
                                const uint numBBoxes)
 {
    const uint x_id = blockIdx.x * blockDim.x + threadIdx.x;
    const uint y_id = blockIdx.y * blockDim.y + threadIdx.y;
    const uint z_id = blockIdx.z * blockDim.z + threadIdx.z;
    if ((x_id >= gridSizeX) || (y_id >= gridSizeY) || (z_id >= numBBoxes))
    {
        return;
    }

    const uint numGridCells = gridSizeX * gridSizeY;
    const uint bbindex = y_id * gridSizeX + x_id;
    // Index of attribute 0 for this cell/box; attribute k is numGridCells * k further on.
    const uint base = bbindex + numGridCells * (z_id * (5 + numOutputClasses));

    output[base + numGridCells * 0] = sigmoidGPU(input[base + numGridCells * 0]);
    output[base + numGridCells * 1] = sigmoidGPU(input[base + numGridCells * 1]);
    output[base + numGridCells * 2] = __expf(input[base + numGridCells * 2]);
    output[base + numGridCells * 3] = __expf(input[base + numGridCells * 3]);
    output[base + numGridCells * 4] = sigmoidGPU(input[base + numGridCells * 4]);

    const float temp = 1.0f;  // softmax temperature

    // The logits are floats: keep them as floats while searching for the
    // maximum (storing them in an int truncated the fractional part).
    float largest = -INFINITY;
    for (uint i = 0; i < numOutputClasses; ++i) {
        const float val = input[base + numGridCells * (5 + i)];
        largest = (val > largest) ? val : largest;
    }

    float sum = 0.0f;
    for (uint i = 0; i < numOutputClasses; ++i) {
        const uint idx = base + numGridCells * (5 + i);
        const float e = expf(input[idx] / temp - largest / temp);
        sum += e;
        output[idx] = e;
    }
    for (uint i = 0; i < numOutputClasses; ++i) {
        output[base + numGridCells * (5 + i)] /= sum;
    }
 }
 cudaError_t cudaYoloLayer(const void* input, void* output, const uint& batchSize, const uint& gridSizeX, const uint& gridSizeY,
                            const uint& numOutputClasses, const uint& numBBoxes,
                            uint64_t outputSize, cudaStream_t stream, const uint modelCoords, const float modelScale, const uint modelType);
 // Host-side launcher for the output-layer decode kernels.
 //
 // Enqueues one kernel per image of the batch on `stream`; image `b` starts
 // `b * outputSize` floats into `input` and `output`.
 //   modelType == 1 -> gpuYoloLayer (uses modelCoords and modelScale)
 //   modelType == 0 -> gpuRegionLayer
 // Any other modelType is rejected with cudaErrorInvalidValue instead of
 // returning success without having written the output.
 //
 // Returns the result of cudaGetLastError() after the launches; the function
 // does not synchronise the stream.
 cudaError_t cudaYoloLayer(const void* input, void* output, const uint& batchSize, const uint& gridSizeX, const uint& gridSizeY,
                            const uint& numOutputClasses, const uint& numBBoxes,
                            uint64_t outputSize, cudaStream_t stream, const uint modelCoords, const float modelScale, const uint modelType)
 {
    if ((modelType != 0) && (modelType != 1)) {
        return cudaErrorInvalidValue;
    }

    // One thread per (column, row, box). The "+ 1" may add a spare block per
    // axis; the kernels' bounds check makes those threads return immediately.
    dim3 threads_per_block(16, 16, 4);
    dim3 number_of_blocks((gridSizeX / threads_per_block.x) + 1,
                          (gridSizeY / threads_per_block.y) + 1,
                          (numBBoxes / threads_per_block.z) + 1);

    for (unsigned int batch = 0; batch < batchSize; ++batch)
    {
        const float* batchInput = reinterpret_cast<const float*>(input) + (batch * outputSize);
        float* batchOutput = reinterpret_cast<float*>(output) + (batch * outputSize);

        if (modelType == 1) {
            gpuYoloLayer<<<number_of_blocks, threads_per_block, 0, stream>>>(
                batchInput, batchOutput, gridSizeX, gridSizeY, numOutputClasses,
                numBBoxes, modelCoords, modelScale);
        }
        else {
            gpuRegionLayer<<<number_of_blocks, threads_per_block, 0, stream>>>(
                batchInput, batchOutput, gridSizeX, gridSizeY, numOutputClasses,
                numBBoxes);
        }
    }
    return cudaGetLastError();
 }
--- a/examples/multiple_inferences/sgie1/nvdsinfer_custom_impl_Yolo/yoloPlugins.cpp
+++ b/examples/multiple_inferences/sgie1/nvdsinfer_custom_impl_Yolo/yoloPlugins.cpp
@@ -0,0 +1,209 @@
 /*
 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 * Edited by Marcos Luciano
 * https://www.github.com/marcoslucianops
 */
 #include "yoloPlugins.h"
 #include "NvInferPlugin.h"
 #include <cassert>
 #include <cstring>
 #include <iostream>
 #include <memory>
 // File-scope copies of the most recently deserialised or serialised
 // YoloLayer's parameters. They are overwritten by the deserialising
 // constructor and by serialize(); their readers are outside this file.
 // NOTE(review): with several YoloLayer instances the last writer wins.
 int kNUM_CLASSES;                     // copy of m_NumClasses
 float kBETA_NMS;                      // copy of m_beta_nms
 std::vector<float> kANCHORS;          // copy of m_Anchors
 std::vector<std::vector<int>> kMASK;  // copy of m_Mask
 namespace {
 // Copies the object representation of `val` to `buffer` and advances `buffer`
 // past it. memcpy is used so the store is valid at any byte offset; the
 // destination need not be aligned for T.
 template <typename T>
 void write(char*& buffer, const T& val)
 {
    std::memcpy(buffer, &val, sizeof(T));
    buffer += sizeof(T);
 }
 // Fills `val` from the next sizeof(T) bytes of `buffer` and advances `buffer`
 // past them. Counterpart of write(); also safe for unaligned sources.
 template <typename T>
 void read(const char*& buffer, T& val)
 {
    std::memcpy(&val, buffer, sizeof(T));
    buffer += sizeof(T);
 }
 }
 // Forward declaration of the CUDA launcher called from YoloLayer::enqueue().
 // It is not defined in this file.
 cudaError_t cudaYoloLayer (
    const void* input, void* output, const uint& batchSize,
    const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses,
    const uint& numBBoxes, uint64_t outputSize, cudaStream_t stream, const uint modelCoords, const float modelScale, const uint modelType);
 // Rebuilds a layer from the byte stream produced by serialize().
 //
 // data:   start of the serialised plugin blob.
 // length: size of the blob in bytes; asserted to equal the number of bytes
 //         consumed, which flags blobs written with a different field layout.
 //
 // Fields are read in exactly the order serialize() writes them. The parsed
 // class count, NMS beta, anchors and masks are also copied to the file-scope
 // k* variables.
 YoloLayer::YoloLayer (const void* data, size_t length)
 {
    const char *d = static_cast<const char*>(data);
    read(d, m_NumBoxes);
    read(d, m_NumClasses);
    read(d, m_GridSizeX);
    read(d, m_GridSizeY);
    read(d, m_OutputSize);
    read(d, m_type);
    read(d, m_new_coords);
    read(d, m_scale_x_y);
    read(d, m_beta_nms);

    // Anchors: element count followed by the values.
    uint anchorsSize;
    read(d, anchorsSize);
    for (uint i = 0; i < anchorsSize; i++) {
        float result;
        read(d, result);
        m_Anchors.push_back(result);
    }

    // Masks: number of lists, then for each list its length and its values.
    uint maskSize;
    read(d, maskSize);
    for (uint i = 0; i < maskSize; i++) {
        uint nMask;
        read(d, nMask);
        std::vector<int> pMask;
        for (uint f = 0; f < nMask; f++) {
            int result;
            read(d, result);
            pMask.push_back(result);
        }
        m_Mask.push_back(pMask);
    }

    // The whole blob, and nothing beyond it, must have been consumed.
    assert(d == static_cast<const char*>(data) + length);

    kNUM_CLASSES = m_NumClasses;
    kBETA_NMS = m_beta_nms;
    kANCHORS = m_Anchors;
    kMASK = m_Mask;
 }
 // Builds a layer from explicit parameters.
 // Box count, class count and both grid dimensions are asserted to be
 // non-zero. m_OutputSize is set to the number of values produced per image:
 // grid cells * boxes * (4 box coordinates + 1 objectness + class scores).
 YoloLayer::YoloLayer (
    const uint& numBoxes, const uint& numClasses, const uint& gridSizeX, const uint& gridSizeY, const uint model_type, const uint new_coords, const float scale_x_y, const float beta_nms, const std::vector<float> anchors, std::vector<std::vector<int>> mask) :
    m_NumBoxes(numBoxes),
    m_NumClasses(numClasses),
    m_GridSizeX(gridSizeX),
    m_GridSizeY(gridSizeY),
    m_type(model_type),
    m_new_coords(new_coords),
    m_scale_x_y(scale_x_y),
    m_beta_nms(beta_nms),
    m_Anchors(anchors),
    m_Mask(mask)
 {
    assert(m_NumBoxes > 0);
    assert(m_NumClasses > 0);
    assert(m_GridSizeX > 0);
    assert(m_GridSizeY > 0);

    const uint valuesPerBox = 4 + 1 + m_NumClasses;
    const uint valuesPerCell = m_NumBoxes * valuesPerBox;
    m_OutputSize = m_GridSizeX * m_GridSizeY * valuesPerCell;
 }
 // The layer has a single output with the same dimensions as its single input.
 nvinfer1::Dims
 YoloLayer::getOutputDimensions(
    int index, const nvinfer1::Dims* inputs, int nbInputDims)
 {
    assert(index == 0);
    assert(nbInputDims == 1);
    const nvinfer1::Dims& inputDims = inputs[0];
    return inputDims;
 }
 // Only 32-bit float tensors in NCHW layout are accepted.
 bool YoloLayer::supportsFormat (
    nvinfer1::DataType type, nvinfer1::PluginFormat format) const {
    const bool isFloat = (type == nvinfer1::DataType::kFLOAT);
    const bool isNchw = (format == nvinfer1::PluginFormat::kNCHW);
    return isFloat && isNchw;
 }
 // Validates the configuration chosen by the builder: exactly one input, NCHW
 // layout and a non-null input dimension array. Nothing is stored.
 void
 YoloLayer::configureWithFormat (
    const nvinfer1::Dims* inputDims, int nbInputs,
    const nvinfer1::Dims* outputDims, int nbOutputs,
    nvinfer1::DataType type, nvinfer1::PluginFormat format, int maxBatchSize)
 {
    assert(nbInputs == 1);
    assert(format == nvinfer1::PluginFormat::kNCHW);
    assert(inputDims != nullptr);
 }
 // Runs the layer for `batchSize` images on `stream`.
 // Forwards the first input/output buffer and the layer parameters to
 // cudaYoloLayer(); `workspace` is not used. The launcher's status is passed
 // to the CHECK macro (defined outside this file), and 0 is returned.
 int YoloLayer::enqueue(
    int batchSize, const void* const* inputs, void** outputs, void* workspace,
    cudaStream_t stream)
 {
    CHECK(cudaYoloLayer(
              inputs[0], outputs[0], batchSize, m_GridSizeX, m_GridSizeY, m_NumClasses, m_NumBoxes,
              m_OutputSize, stream, m_new_coords, m_scale_x_y, m_type));
    return 0;
 }
 // Number of bytes serialize() writes: the fixed scalar fields, the anchor
 // count plus one float per anchor, and the mask-list count plus, for every
 // list, its length and one int per entry.
 size_t YoloLayer::getSerializationSize() const
 {
    const size_t scalarBytes = sizeof(m_NumBoxes) + sizeof(m_NumClasses) + sizeof(m_GridSizeX) + sizeof(m_GridSizeY)
            + sizeof(m_OutputSize) + sizeof(m_type) + sizeof(m_new_coords) + sizeof(m_scale_x_y) + sizeof(m_beta_nms);

    // One slot for the element count, then one per anchor.
    const size_t anchorSlots = 1 + m_Anchors.size();

    // One slot for the list count, then per list one length slot plus its entries.
    size_t maskSlots = 1;
    for (const auto& maskList : m_Mask) {
        maskSlots += 1 + maskList.size();
    }

    return scalarBytes + anchorSlots * sizeof(float) + maskSlots * sizeof(int);
 }
 void YoloLayer::serialize(void* buffer) const
 {
    char *d = static_cast<char*>(buffer);
    write(d, m_NumBoxes);
    write(d, m_NumClasses);
    write(d, m_GridSizeX);
    write(d, m_GridSizeY);
    write(d, m_OutputSize);
    write(d, m_type);
    write(d, m_new_coords);
    write(d, m_scale_x_y);
    write(d, m_beta_nms);
    uint anchorsSize = m_Anchors.size();
    write(d, anchorsSize);
    for (uint i = 0; i < anchorsSize; i++) {
        write(d, m_Anchors[i]);
    }
    uint maskSize = m_Mask.size();
    write(d, maskSize);
    for (uint i = 0; i < maskSize; i++) {
        uint pMaskSize = m_Mask[i].size();
        write(d, pMaskSize);
        for (uint f = 0; f < pMaskSize; f++) {
            write(d, m_Mask[i][f]);
        }
    }
    kNUM_CLASSES = m_NumClasses;
    kBETA_NMS = m_beta_nms;
    kANCHORS = m_Anchors;
    kMASK = m_Mask;
 }
 // Returns a new heap-allocated YoloLayer constructed from this layer's
 // parameters.
 nvinfer1::IPluginV2* YoloLayer::clone() const
 {
    nvinfer1::IPluginV2* copy = new YoloLayer (
        m_NumBoxes, m_NumClasses, m_GridSizeX, m_GridSizeY,
        m_type, m_new_coords, m_scale_x_y, m_beta_nms,
        m_Anchors, m_Mask);
    return copy;
 }
 // Registers YoloLayerPluginCreator (declared outside this file) through
 // TensorRT's registration macro.
 REGISTER_TENSORRT_PLUGIN(YoloLayerPluginCreator);
--- a/examples/multiple_inferences/sgie1/nvdsinfer_custom_impl_Yolo/yoloPlugins.h
+++ b/examples/multiple_inferences/sgie1/nvdsinfer_custom_impl_Yolo/yoloPlugins.h
@@ -56,7 +56,7 @@ class YoloLayer : public nvinfer1::IPluginV2
 {
 public:
    YoloLayer (const void* data, size_t length);
-    YoloLayer (const uint& numBoxes, const uint& numClasses, const uint& gridSize,
+    YoloLayer (const uint& numBoxes, const uint& numClasses, const uint& gridSizeX, const uint& gridSizeY,
                const uint model_type, const uint new_coords, const float scale_x_y, const float beta_nms,
                const std::vector<float> anchors, const std::vector<std::vector<int>> mask);
    const char* getPluginType () const override { return YOLOLAYER_PLUGIN_NAME; }
@@ -96,7 +96,8 @@ public:
 private:
    uint m_NumBoxes {0};
    uint m_NumClasses {0};
-    uint m_GridSize {0};
+    uint m_GridSizeX {0};
    uint m_GridSizeY {0};
    uint64_t m_OutputSize {0};
    std::string m_Namespace {""};
--- a/native/config_infer_primary.txt
+++ b/native/config_infer_primary.txt
@@ -4,10 +4,11 @@ net-scale-factor=0.0039215697906911373
 model-color-format=0
 custom-network-config=yolov4.cfg
 model-file=yolov4.weights
-model-engine-file=model_b1_gpu0_fp16.engine
+model-engine-file=model_b1_gpu0_fp32.engine
 #int8-calib-file=calib.table
 labelfile-path=labels.txt
 batch-size=1
-network-mode=2
+network-mode=0
 num-detected-classes=80
 interval=0
 gie-unique-id=1
--- a/native/config_infer_primary_yoloV2.txt
+++ b/native/config_infer_primary_yoloV2.txt
@@ -4,10 +4,11 @@ net-scale-factor=0.0039215697906911373
 model-color-format=0
 custom-network-config=yolov2.cfg
 model-file=yolov2.weights
-model-engine-file=model_b1_gpu0_fp16.engine
+model-engine-file=model_b1_gpu0_fp32.engine
 #int8-calib-file=calib.table
 labelfile-path=labels.txt
 batch-size=1
-network-mode=2
+network-mode=0
 num-detected-classes=80
 interval=0
 gie-unique-id=1
--- a/native/deepstream_app_config.txt
+++ b/native/deepstream_app_config.txt
@@ -1,6 +1,6 @@
 [application]
 enable-perf-measurement=1
-perf-measurement-interval-sec=1
+perf-measurement-interval-sec=5
 [tiled-display]
 enable=1
@@ -14,7 +14,7 @@ nvbuf-memory-type=0
 [source0]
 enable=1
 type=3
-uri=file://../../samples/streams/sample_1080p_h264.mp4
+uri=file:///opt/nvidia/deepstream/deepstream-5.1/samples/streams/sample_1080p_h264.mp4
 num-sources=1
 gpu-id=0
 cudadec-memtype=0
--- a/native/nvdsinfer_custom_impl_Yolo/Makefile
+++ b/native/nvdsinfer_custom_impl_Yolo/Makefile
@@ -27,13 +27,25 @@ CUDA_VER?=
 ifeq ($(CUDA_VER),)
  $(error "CUDA_VER is not set")
 endif
 OPENCV?=
 ifeq ($(OPENCV),)
  OPENCV=0
 endif
 CC:= g++
 NVCC:=/usr/local/cuda-$(CUDA_VER)/bin/nvcc
 CFLAGS:= -Wall -std=c++11 -shared -fPIC -Wno-error=deprecated-declarations
-CFLAGS+= -I../../includes -I/usr/local/cuda-$(CUDA_VER)/include
+CFLAGS+= -I/opt/nvidia/deepstream/deepstream-5.1/sources/includes -I/usr/local/cuda-$(CUDA_VER)/include
-LIBS:= -lnvinfer_plugin -lnvinfer -lnvparsers -L/usr/local/cuda-$(CUDA_VER)/lib64 -lcudart -lcublas -lstdc++fs
+ifeq ($(OPENCV), 1)
 COMMON= -DOPENCV
 CFLAGS+= $(shell pkg-config --cflags opencv4 2> /dev/null || pkg-config --cflags opencv)
 LIBS+= $(shell pkg-config --libs opencv4 2> /dev/null || pkg-config --libs opencv)
 endif
 LIBS+= -lnvinfer_plugin -lnvinfer -lnvparsers -L/usr/local/cuda-$(CUDA_VER)/lib64 -lcudart -lcublas -lstdc++fs
 LFLAGS:= -shared -Wl,--start-group $(LIBS) -Wl,--end-group
 INCS:= $(wildcard *.h)
@@ -50,6 +62,11 @@ SRCFILES:= nvdsinfer_yolo_engine.cpp \
           utils.cpp \
           yolo.cpp \
           yoloForward.cu
 ifeq ($(OPENCV), 1)
 SRCFILES+= calibrator.cpp
 endif
 TARGET_LIB:= libnvdsinfer_custom_impl_Yolo.so
 TARGET_OBJS:= $(SRCFILES:.cpp=.o)
@@ -58,7 +75,7 @@ TARGET_OBJS:= $(TARGET_OBJS:.cu=.o)
 all: $(TARGET_LIB)
 %.o: %.cpp $(INCS) Makefile
-	$(CC) -c -o $@ $(CFLAGS) $<
+	$(CC) -c $(COMMON) -o $@ $(CFLAGS) $<
 %.o: %.cu $(INCS) Makefile
 	$(NVCC) -c -o $@ --compiler-options '-fPIC' $<
--- a/native/nvdsinfer_custom_impl_Yolo/calibrator.cpp
+++ b/native/nvdsinfer_custom_impl_Yolo/calibrator.cpp
@@ -0,0 +1,130 @@
 /*
 * Created by Marcos Luciano
 * https://www.github.com/marcoslucianops
 */
 #include "calibrator.h"
 #include <fstream>
 #include <iterator>
 namespace nvinfer1
 {
    // Prepares the calibrator.
    // imgPath names a text file with one calibration image path per line;
    // blank lines are skipped. A host staging buffer and a device buffer large
    // enough for one batch (batchsize * channels * height * width floats) are
    // allocated. If the list cannot be opened a message is printed and the
    // calibrator simply has no images to offer.
    int8EntroyCalibrator::int8EntroyCalibrator(const int &batchsize, const int &channels, const int &height, const int &width, const int &letterbox, const std::string &imgPath,
        const std::string &calibTablePath):batchSize(batchsize), inputC(channels), inputH(height), inputW(width), letterBox(letterbox), calibTablePath(calibTablePath), imageIndex(0)
    {
        inputCount = batchsize * channels * height * width;
        std::fstream f(imgPath);
        if (f.is_open())
        {
            std::string temp;
            while (std::getline(f, temp))
            {
                // A blank line is not an image path; loading it would only fail later.
                if (temp.find_first_not_of(" \t\r\n") == std::string::npos)
                    continue;
                imgPaths.push_back(temp);
            }
        }
        else
        {
            std::cout << "Unable to open calibration image list: " << imgPath << std::endl;
        }
        batchData = new float[inputCount];
        CUDA_CHECK(cudaMalloc(&deviceInput, inputCount * sizeof(float)));
    }
    // Releases the device and host batch buffers.
    int8EntroyCalibrator::~int8EntroyCalibrator()
    {
        CUDA_CHECK(cudaFree(deviceInput));
        delete[] batchData;
    }
    // Loads the next batchSize images, preprocesses them into the host buffer,
    // copies the batch to the device and publishes it as bindings[0].
    // Returns false when fewer than batchSize images remain, or when an image
    // cannot be read (reported on stdout); true otherwise.
    bool int8EntroyCalibrator::getBatch(void **bindings, const char **names, int nbBindings)
    {
        if (imageIndex + batchSize > imgPaths.size())
            return false;
        float* ptr = batchData;
        for (size_t j = imageIndex; j < imageIndex + batchSize; ++j)
        {
            cv::Mat img = cv::imread(imgPaths[j], cv::IMREAD_COLOR);
            if (img.empty())
            {
                // cv::imread signals failure with an empty matrix; resizing it
                // would throw, so stop calibration here instead.
                std::cout << "Unable to read calibration image: " << imgPaths[j] << std::endl;
                return false;
            }
            std::vector<float>inputData = prepareImage(img, inputC, inputH, inputW, letterBox);
            int len = (int)(inputData.size());
            memcpy(ptr, inputData.data(), len * sizeof(float));
            ptr += inputData.size();
            std::cout << "Load image: " << imgPaths[j] << std::endl;
            std::cout << "Progress: " << (j + 1)*100. / imgPaths.size() << "%" << std::endl;
        }
        imageIndex += batchSize;
        CUDA_CHECK(cudaMemcpy(deviceInput, batchData, inputCount * sizeof(float), cudaMemcpyHostToDevice));
        bindings[0] = deviceInput;
        return true;
    }
    // Loads the calibration table from calibTablePath when readCache is set and
    // the file can be opened. Returns a pointer to the cached bytes and stores
    // their count in `length`, or returns nullptr with length 0.
    const void* int8EntroyCalibrator::readCalibrationCache(std::size_t &length)
    {
        calibrationCache.clear();
        std::ifstream input(calibTablePath, std::ios::binary);
        input >> std::noskipws;
        if (readCache && input.good())
        {
            std::copy(std::istream_iterator<char>(input), std::istream_iterator<char>(),
                std::back_inserter(calibrationCache));
        }
        length = calibrationCache.size();
        return length ? calibrationCache.data() : nullptr;
    }
    // Writes `length` bytes of calibration data to calibTablePath.
    void int8EntroyCalibrator::writeCalibrationCache(const void *cache, std::size_t length)
    {
        std::ofstream output(calibTablePath, std::ios::binary);
        output.write(reinterpret_cast<const char*>(cache), length);
    }
 }
 // Converts a BGR image into a planar float buffer of
 // input_c * input_h * input_w values scaled by 1/255.
 //
 // letter_box == 2: the image is resized with its aspect ratio preserved so
 // that it fits input_w x input_h, then padded with black on the bottom or on
 // the right. Any other value stretches the image to input_w x input_h.
 // The image is then converted BGR -> RGB, converted to float and split into
 // planes; the first input_c planes are copied to the result one after another.
 std::vector<float> prepareImage(cv::Mat& img, int input_c, int input_h, int input_w, int letter_box)
 {
    const bool keepAspect = (letter_box == 2);

    cv::Size target(input_w, input_h);
    int padBottom = 0;
    int padRight = 0;
    if (keepAspect)
    {
        const int srcW = img.cols;
        const int srcH = img.rows;
        const float scaleH = (float)input_h / srcH;
        const float scaleW = (float)input_w / srcW;
        if (scaleH > scaleW)
        {
            // Width is the limiting side: fill it and pad below.
            target.height = (input_w * srcH) / srcW;
            padBottom = input_h - target.height;
        }
        else
        {
            // Height is the limiting side: fill it and pad on the right.
            target.width = (input_h * srcW) / srcH;
            padRight = input_w - target.width;
        }
    }

    cv::Mat out;
    cv::resize(img, out, target, 0, 0, cv::INTER_CUBIC);
    if (keepAspect)
    {
        cv::copyMakeBorder(out, out, 0, padBottom, 0, padRight, cv::BORDER_CONSTANT, cv::Scalar(0, 0, 0));
    }

    cv::cvtColor(out, out, cv::COLOR_BGR2RGB);
    const int floatType = (input_c == 3) ? CV_32FC3 : CV_32FC1;
    out.convertTo(out, floatType, 1.0 / 255.0);

    std::vector<cv::Mat> planes(input_c);
    cv::split(out, planes);

    std::vector<float> result(input_h * input_w * input_c);
    const int planeLength = input_h * input_w;
    float* dst = result.data();
    for (int c = 0; c < input_c; ++c, dst += planeLength)
    {
        memcpy(dst, planes[c].data, planeLength * sizeof(float));
    }
    return result;
 }
--- a/native/nvdsinfer_custom_impl_Yolo/calibrator.h
+++ b/native/nvdsinfer_custom_impl_Yolo/calibrator.h
@@ -0,0 +1,62 @@
 /*
 * Created by Marcos Luciano
 * https://www.github.com/marcoslucianops
 */
 #ifndef CALIBRATOR_H
 #define CALIBRATOR_H
 #include "opencv2/opencv.hpp"
 #include "cuda_runtime.h"
 #include "NvInfer.h"
 #include <cassert>
 #include <cstdlib>
 #include <iostream>
 #include <string>
 #include <vector>
 #ifndef CUDA_CHECK
 // Evaluates a CUDA runtime call exactly once and terminates the process if
 // the call does not return cudaSuccess, after printing the numeric code, the
 // runtime's description of it, and the source location.
 //
 // - Wrapped in do { ... } while (0) so `CUDA_CHECK(x);` is a single statement
 //   and is safe inside an unbraced if/else.
 // - Uses std::abort() rather than assert(0): assert is compiled out when
 //   NDEBUG is defined, which would let execution continue after a failure.
 #define CUDA_CHECK(callstr)                                                                    \
    do {                                                                                       \
        const cudaError_t error_code = (callstr);                                              \
        if (error_code != cudaSuccess) {                                                       \
            std::cerr << "CUDA error " << error_code << " (" << cudaGetErrorString(error_code) \
                      << ") at " << __FILE__ << ":" << __LINE__ << std::endl;                  \
            std::abort();                                                                      \
        }                                                                                      \
    } while (0)
 #endif
 namespace nvinfer1
 {
    // Calibrator implementing TensorRT's IInt8EntropyCalibrator2 interface.
    // The batch/cache callbacks are only declared here; their definitions
    // live in the corresponding source file.
    class int8EntroyCalibrator : public nvinfer1::IInt8EntropyCalibrator2
    {
    public:
        // NOTE(review): the arguments presumably seed the same-named private
        // members below (batch size, input C/H/W, letterbox mode, image list
        // location, calibration table location) - confirm in the definition.
        int8EntroyCalibrator(const int &batchsize,
                             const int &channels,
                             const int &height,
                             const int &width,
                             const int &letterbox,
                             const std::string &imgPath,
                             const std::string &calibTablePath);

        virtual ~int8EntroyCalibrator();

        // Number of images per calibration batch.
        int getBatchSize() const override
        {
            return batchSize;
        }

        // IInt8EntropyCalibrator2 callbacks.
        bool getBatch(void *bindings[], const char *names[], int nbBindings) override;
        const void *readCalibrationCache(std::size_t &length) override;
        void writeCalibrationCache(const void *ptr, std::size_t length) override;

    private:
        int batchSize;                          // value reported by getBatchSize()
        int inputC;                             // presumably network input channels
        int inputH;                             // presumably network input height
        int inputW;                             // presumably network input width
        int letterBox;                          // presumably the letterbox mode used for preprocessing
        std::string calibTablePath;             // location of the calibration table file
        std::size_t imageIndex;                 // presumably the next image to consume - TODO confirm
        std::size_t inputCount;                 // presumably the element count of one batch - TODO confirm
        std::vector<std::string> imgPaths;      // presumably the calibration image files
        float *batchData = nullptr;             // presumably a host staging buffer - TODO confirm
        void  *deviceInput = nullptr;           // presumably the device-side input buffer - TODO confirm
        bool readCache;                         // presumably enables reuse of an existing table
        std::vector<char> calibrationCache;     // presumably backs readCalibrationCache()
    };
 }
 std::vector<float> prepareImage(cv::Mat& img, int input_c, int input_h, int input_w, int letter_box);
 #endif //CALIBRATOR_H
--- a/native/nvdsinfer_custom_impl_Yolo/layers/upsample_layer.cpp
+++ b/native/nvdsinfer_custom_impl_Yolo/layers/upsample_layer.cpp
@@ -8,79 +8,17 @@
 nvinfer1::ILayer* upsampleLayer(
    int layerIdx,
    std::map<std::string, std::string>& block,
    std::vector<float>& weights,
    std::vector<nvinfer1::Weights>& trtWeights,
    int& inputChannels,
    nvinfer1::ITensor* input,
    nvinfer1::INetworkDefinition* network)
 {
    assert(block.at("type") == "upsample");
    nvinfer1::Dims inpDims = input->getDimensions();
    assert(inpDims.nbDims == 3);
    assert(inpDims.d[1] == inpDims.d[2]);
    int h = inpDims.d[1];
    int w = inpDims.d[2];
    int stride = std::stoi(block.at("stride"));
-    nvinfer1::Dims preDims{3,
+    nvinfer1::IResizeLayer* resize_layer = network->addResize(*input);
-                           {1, stride * h, w},
+    resize_layer->setResizeMode(nvinfer1::ResizeMode::kNEAREST);
-                           {nvinfer1::DimensionType::kCHANNEL, nvinfer1::DimensionType::kSPATIAL,
+    float scale[3] = {1, stride, stride};
-                            nvinfer1::DimensionType::kSPATIAL}};
+    resize_layer->setScales(scale, 3);
-    int size = stride * h * w;
+    std::string layer_name = "upsample_" + std::to_string(layerIdx);
-    nvinfer1::Weights preMul{nvinfer1::DataType::kFLOAT, nullptr, size};
+    resize_layer->setName(layer_name.c_str());
-    float* preWt = new float[size];
+    return resize_layer;
    for (int i = 0, idx = 0; i < h; ++i)
    {
        for (int s = 0; s < stride; ++s)
        {
            for (int j = 0; j < w; ++j, ++idx)
            {
                preWt[idx] = (i == j) ? 1.0 : 0.0;
            }
        }
    }
    preMul.values = preWt;
    trtWeights.push_back(preMul);
    nvinfer1::IConstantLayer* preM = network->addConstant(preDims, preMul);
    assert(preM != nullptr);
    std::string preLayerName = "preMul_" + std::to_string(layerIdx);
    preM->setName(preLayerName.c_str());
    nvinfer1::Dims postDims{3,
                            {1, h, stride * w},
                            {nvinfer1::DimensionType::kCHANNEL, nvinfer1::DimensionType::kSPATIAL,
                             nvinfer1::DimensionType::kSPATIAL}};
    size = stride * h * w;
    nvinfer1::Weights postMul{nvinfer1::DataType::kFLOAT, nullptr, size};
    float* postWt = new float[size];
    for (int i = 0, idx = 0; i < h; ++i)
    {
        for (int j = 0; j < stride * w; ++j, ++idx)
        {
            postWt[idx] = (j / stride == i) ? 1.0 : 0.0;
        }
    }
    postMul.values = postWt;
    trtWeights.push_back(postMul);
    nvinfer1::IConstantLayer* post_m = network->addConstant(postDims, postMul);
    assert(post_m != nullptr);
    std::string postLayerName = "postMul_" + std::to_string(layerIdx);
    post_m->setName(postLayerName.c_str());
    nvinfer1::IMatrixMultiplyLayer* mm1
        = network->addMatrixMultiply(*preM->getOutput(0), nvinfer1::MatrixOperation::kNONE, *input,
                                     nvinfer1::MatrixOperation::kNONE);
    assert(mm1 != nullptr);
    std::string mm1LayerName = "mm1_" + std::to_string(layerIdx);
    mm1->setName(mm1LayerName.c_str());
    nvinfer1::IMatrixMultiplyLayer* mm2
        = network->addMatrixMultiply(*mm1->getOutput(0), nvinfer1::MatrixOperation::kNONE,
                                     *post_m->getOutput(0), nvinfer1::MatrixOperation::kNONE);
    assert(mm2 != nullptr);
    std::string mm2LayerName = "mm2_" + std::to_string(layerIdx);
    mm2->setName(mm2LayerName.c_str());
    return mm2;
 }
--- a/native/nvdsinfer_custom_impl_Yolo/layers/upsample_layer.h
+++ b/native/nvdsinfer_custom_impl_Yolo/layers/upsample_layer.h
@@ -15,9 +15,6 @@
 nvinfer1::ILayer* upsampleLayer(
    int layerIdx,
    std::map<std::string, std::string>& block,
    std::vector<float>& weights,
    std::vector<nvinfer1::Weights>& trtWeights,
    int& inputChannels,
    nvinfer1::ITensor* input,
    nvinfer1::INetworkDefinition* network);
--- a/native/nvdsinfer_custom_impl_Yolo/nvdsinfer_yolo_engine.cpp
+++ b/native/nvdsinfer_custom_impl_Yolo/nvdsinfer_yolo_engine.cpp
@@ -45,9 +45,20 @@ static bool getYoloNetworkInfo (NetworkInfo &networkInfo, const NvDsInferContext
    networkInfo.networkType     = yoloType;
    networkInfo.configFilePath  = initParams->customNetworkConfigFilePath;
    networkInfo.wtsFilePath     = initParams->modelFilePath;
    networkInfo.int8CalibPath   = initParams->int8CalibrationFilePath;
    networkInfo.deviceType      = (initParams->useDLA ? "kDLA" : "kGPU");
    networkInfo.inputBlobName   = "data";
    if(initParams->networkMode == 0) {
        networkInfo.networkMode = "FP32";
    }
    else if(initParams->networkMode == 1) {
        networkInfo.networkMode = "INT8";
    }
    else if(initParams->networkMode == 2) {
        networkInfo.networkMode = "FP16";
    }
    if (networkInfo.configFilePath.empty() ||
        networkInfo.wtsFilePath.empty()) {
        std::cerr << "YOLO config file or weights file is not specified"
--- a/native/nvdsinfer_custom_impl_Yolo/nvdsparsebbox_Yolo.cpp
+++ b/native/nvdsinfer_custom_impl_Yolo/nvdsparsebbox_Yolo.cpp
@@ -302,7 +302,6 @@ static bool NvDsInferParseYolo(
        const uint gridSizeH = layer.inferDims.d[1];
        const uint gridSizeW = layer.inferDims.d[2];
        const uint stride = DIVUP(networkInfo.width, gridSizeW);
        assert(stride == DIVUP(networkInfo.height, gridSizeH));
        std::vector<NvDsInferParseObjectInfo> outObjs =
            decodeYoloTensor((const float*)(layer.buffer), masks[idx], anchors, gridSizeW, gridSizeH, stride, masks[idx].size(),
@@ -344,7 +343,6 @@ static bool NvDsInferParseYoloV2(
    const uint gridSizeH = layer.inferDims.d[1];
    const uint gridSizeW = layer.inferDims.d[2];
    const uint stride = DIVUP(networkInfo.width, gridSizeW);
    assert(stride == DIVUP(networkInfo.height, gridSizeH));
    for (auto& anchor : anchors) {
        anchor *= stride;
    }
--- a/native/nvdsinfer_custom_impl_Yolo/yolo.cpp
+++ b/native/nvdsinfer_custom_impl_Yolo/yolo.cpp
@@ -25,6 +25,11 @@
 #include "yolo.h"
 #include "yoloPlugins.h"
 #include <stdlib.h>
 #ifdef OPENCV
 #include "calibrator.h"
 #endif
 void orderParams(std::vector<std::vector<int>> *maskVector) {
    std::vector<std::vector<int>> maskinput = *maskVector;
@@ -45,6 +50,8 @@ Yolo::Yolo(const NetworkInfo& networkInfo)
    : m_NetworkType(networkInfo.networkType), // YOLO type
      m_ConfigFilePath(networkInfo.configFilePath), // YOLO cfg
      m_WtsFilePath(networkInfo.wtsFilePath), // YOLO weights
      m_Int8CalibPath(networkInfo.int8CalibPath), // INT8 calibration path
      m_NetworkMode(networkInfo.networkMode), // FP32, INT8, FP16
      m_DeviceType(networkInfo.deviceType), // kDLA, kGPU
      m_InputBlobName(networkInfo.inputBlobName), // data
      m_InputH(0),
@@ -62,6 +69,38 @@ nvinfer1::ICudaEngine *Yolo::createEngine (nvinfer1::IBuilder* builder)
 {
    assert (builder);
    m_ConfigBlocks = parseConfigFile(m_ConfigFilePath);
    parseConfigBlocks();
    orderParams(&m_OutputMasks);
    if (m_NetworkMode == "INT8" && !fileExists(m_Int8CalibPath)) {
        assert(builder->platformHasFastInt8());
 #ifdef OPENCV
        std::string calib_image_list;
        int calib_batch_size;
        if (getenv("INT8_CALIB_IMG_PATH")) {
            calib_image_list = getenv("INT8_CALIB_IMG_PATH");
        }
        else {
            std::cerr << "INT8_CALIB_IMG_PATH not set" << std::endl;
            std::abort();
        }
        if (getenv("INT8_CALIB_BATCH_SIZE")) {
            calib_batch_size = std::stoi(getenv("INT8_CALIB_BATCH_SIZE"));
        }
        else {
            std::cerr << "INT8_CALIB_BATCH_SIZE not set" << std::endl;
            std::abort();
        }
        nvinfer1::int8EntroyCalibrator *calibrator = new nvinfer1::int8EntroyCalibrator(calib_batch_size, m_InputC, m_InputH, m_InputW, m_LetterBox, calib_image_list, m_Int8CalibPath);
        builder->setInt8Mode(true);
        builder->setInt8Calibrator(calibrator);
 #else
        std::cerr << "OpenCV is required to run INT8 calibrator" << std::endl;
        std::abort();
 #endif
    }
    std::vector<float> weights = loadWeights(m_WtsFilePath, m_NetworkType);
    std::vector<nvinfer1::Weights> trtWeights;
@@ -71,8 +110,12 @@ nvinfer1::ICudaEngine *Yolo::createEngine (nvinfer1::IBuilder* builder)
        return nullptr;
    }
    // Build the engine
    std::cout << "Building the TensorRT Engine" << std::endl;
    if (m_LetterBox == 1) {
        std::cout << "\nNOTE: letter_box is set in cfg file, make sure to set maintain-aspect-ratio=1 in config_infer file to get better accuracy\n" << std::endl;
    }
    nvinfer1::ICudaEngine * engine = builder->buildCudaEngine(*network);
    if (engine) {
        std::cout << "Building complete\n" << std::endl;
@@ -80,7 +123,6 @@ nvinfer1::ICudaEngine *Yolo::createEngine (nvinfer1::IBuilder* builder)
        std::cerr << "Building engine failed\n" << std::endl;
    }
    // destroy
    network->destroy();
    return engine;
 }
@@ -88,12 +130,7 @@ nvinfer1::ICudaEngine *Yolo::createEngine (nvinfer1::IBuilder* builder)
 NvDsInferStatus Yolo::parseModel(nvinfer1::INetworkDefinition& network) {
    destroyNetworkUtils();
    m_ConfigBlocks = parseConfigFile(m_ConfigFilePath);
    parseConfigBlocks();
    orderParams(&m_OutputMasks);
    std::vector<float> weights = loadWeights(m_WtsFilePath, m_NetworkType);
    // build yolo network
    std::cout << "Building YOLO network" << std::endl;
    NvDsInferStatus status = buildYoloNetwork(weights, network);
@@ -121,9 +158,7 @@ NvDsInferStatus Yolo::buildYoloNetwork(
    std::vector<nvinfer1::ITensor*> tensorOutputs;
    uint outputTensorCount = 0;
    // build the network using the network API
    for (uint i = 0; i < m_ConfigBlocks.size(); ++i) {
        // check if num. of channels is correct
        assert(getNumChannels(previous) == channels);
        std::string layerIndex = "(" + std::to_string(tensorOutputs.size()) + ")";
@@ -192,7 +227,7 @@ NvDsInferStatus Yolo::buildYoloNetwork(
        else if (m_ConfigBlocks.at(i).at("type") == "upsample") {
            std::string inputVol = dimsToString(previous->getDimensions());
-            nvinfer1::ILayer* out = upsampleLayer(i - 1, m_ConfigBlocks[i], weights, m_TrtWeights, channels, previous, &network);
+            nvinfer1::ILayer* out = upsampleLayer(i - 1, m_ConfigBlocks[i], previous, &network);
            previous = out->getOutput(0);
            assert(previous != nullptr);
            std::string outputVol = dimsToString(previous->getDimensions());
@@ -212,12 +247,12 @@ NvDsInferStatus Yolo::buildYoloNetwork(
        else if (m_ConfigBlocks.at(i).at("type") == "yolo") {
            nvinfer1::Dims prevTensorDims = previous->getDimensions();
            assert(prevTensorDims.d[1] == prevTensorDims.d[2]);
            TensorInfo& curYoloTensor = m_OutputTensors.at(outputTensorCount);
-            curYoloTensor.gridSize = prevTensorDims.d[1];
+            curYoloTensor.gridSizeY = prevTensorDims.d[1];
-            curYoloTensor.stride = m_InputW / curYoloTensor.gridSize;
+            curYoloTensor.gridSizeX = prevTensorDims.d[2];
-            m_OutputTensors.at(outputTensorCount).volume = curYoloTensor.gridSize
+            curYoloTensor.stride = m_InputH / curYoloTensor.gridSizeY;
-                * curYoloTensor.gridSize
+            m_OutputTensors.at(outputTensorCount).volume = curYoloTensor.gridSizeY
                * curYoloTensor.gridSizeX
                * (curYoloTensor.numBBoxes * (5 + curYoloTensor.numClasses));
            std::string layerName = "yolo_" + std::to_string(i);
            curYoloTensor.blobName = layerName;
@@ -236,7 +271,8 @@ NvDsInferStatus Yolo::buildYoloNetwork(
            nvinfer1::IPluginV2* yoloPlugin
                = new YoloLayer(m_OutputTensors.at(outputTensorCount).numBBoxes,
                                  m_OutputTensors.at(outputTensorCount).numClasses,
-                                  m_OutputTensors.at(outputTensorCount).gridSize,
+                                  m_OutputTensors.at(outputTensorCount).gridSizeX,
                                  m_OutputTensors.at(outputTensorCount).gridSizeY,
                                  1, new_coords, scale_x_y, beta_nms,
                                  curYoloTensor.anchors,
                                  m_OutputMasks);
@@ -260,12 +296,12 @@ NvDsInferStatus Yolo::buildYoloNetwork(
        //YOLOv2 support
        else if (m_ConfigBlocks.at(i).at("type") == "region") {
            nvinfer1::Dims prevTensorDims = previous->getDimensions();
            assert(prevTensorDims.d[1] == prevTensorDims.d[2]);
            TensorInfo& curRegionTensor = m_OutputTensors.at(outputTensorCount);
-            curRegionTensor.gridSize = prevTensorDims.d[1];
+            curRegionTensor.gridSizeY = prevTensorDims.d[1];
-            curRegionTensor.stride = m_InputW / curRegionTensor.gridSize;
+            curRegionTensor.gridSizeX = prevTensorDims.d[2];
-            m_OutputTensors.at(outputTensorCount).volume = curRegionTensor.gridSize
+            curRegionTensor.stride = m_InputH / curRegionTensor.gridSizeY;
-                * curRegionTensor.gridSize
+            m_OutputTensors.at(outputTensorCount).volume = curRegionTensor.gridSizeY
                * curRegionTensor.gridSizeX
                * (curRegionTensor.numBBoxes * (5 + curRegionTensor.numClasses));
            std::string layerName = "region_" + std::to_string(i);
            curRegionTensor.blobName = layerName;
@@ -273,7 +309,8 @@ NvDsInferStatus Yolo::buildYoloNetwork(
            nvinfer1::IPluginV2* regionPlugin
                = new YoloLayer(curRegionTensor.numBBoxes,
                                  curRegionTensor.numClasses,
-                                  curRegionTensor.gridSize,
+                                  curRegionTensor.gridSizeX,
                                  curRegionTensor.gridSizeY,
                                  0, 0, 1.0, 0,
                                  curRegionTensor.anchors,
                                  mask);
@@ -387,8 +424,14 @@ void Yolo::parseConfigBlocks()
            m_InputH = std::stoul(block.at("height"));
            m_InputW = std::stoul(block.at("width"));
            m_InputC = std::stoul(block.at("channels"));
            assert(m_InputW == m_InputH);
            m_InputSize = m_InputC * m_InputH * m_InputW;
            if (block.find("letter_box") != block.end()) {
                m_LetterBox = std::stoul(block.at("letter_box"));
            }
            else {
                m_LetterBox = 0;
            }
        }
        else if ((block.at("type") == "region") || (block.at("type") == "yolo"))
        {
@@ -456,7 +499,6 @@ void Yolo::parseConfigBlocks()
 }
 void Yolo::destroyNetworkUtils() {
    // deallocate the weights
    for (uint i = 0; i < m_TrtWeights.size(); ++i) {
        if (m_TrtWeights[i].count > 0)
            free(const_cast<void*>(m_TrtWeights[i].values));
--- a/native/nvdsinfer_custom_impl_Yolo/yolo.h
+++ b/native/nvdsinfer_custom_impl_Yolo/yolo.h
@@ -40,6 +40,8 @@ struct NetworkInfo
    std::string networkType;
    std::string configFilePath;
    std::string wtsFilePath;
    std::string int8CalibPath;
    std::string networkMode;
    std::string deviceType;
    std::string inputBlobName;
 };
@@ -48,7 +50,8 @@ struct TensorInfo
 {
    std::string blobName;
    uint stride{0};
-    uint gridSize{0};
+    uint gridSizeY{0};
    uint gridSizeX{0};
    uint numClasses{0};
    uint numBBoxes{0};
    uint64_t volume{0};
@@ -75,6 +78,8 @@ protected:
    const std::string m_NetworkType;
    const std::string m_ConfigFilePath;
    const std::string m_WtsFilePath;
    const std::string m_Int8CalibPath;
    const std::string m_NetworkMode;
    const std::string m_DeviceType;
    const std::string m_InputBlobName;
    std::vector<TensorInfo> m_OutputTensors;
@@ -84,6 +89,7 @@ protected:
    uint m_InputW;
    uint m_InputC;
    uint64_t m_InputSize;
    uint m_LetterBox;
    std::vector<nvinfer1::Weights> m_TrtWeights;
--- a/native/nvdsinfer_custom_impl_Yolo/yoloForward.cu
+++ b/native/nvdsinfer_custom_impl_Yolo/yoloForward.cu
@@ -20,20 +20,20 @@
 inline __device__ float sigmoidGPU(const float& x) { return 1.0f / (1.0f + __expf(-x)); }
-__global__ void gpuYoloLayer(const float* input, float* output, const uint gridSize, const uint numOutputClasses,
+__global__ void gpuYoloLayer(const float* input, float* output, const uint gridSizeX, const uint gridSizeY, const uint numOutputClasses,
                               const uint numBBoxes, const uint new_coords, const float scale_x_y)
 {
    uint x_id = blockIdx.x * blockDim.x + threadIdx.x;
    uint y_id = blockIdx.y * blockDim.y + threadIdx.y;
    uint z_id = blockIdx.z * blockDim.z + threadIdx.z;
-    if ((x_id >= gridSize) || (y_id >= gridSize) || (z_id >= numBBoxes))
+    if ((x_id >= gridSizeX) || (y_id >= gridSizeY) || (z_id >= numBBoxes))
    {
        return;
    }
-    const int numGridCells = gridSize * gridSize;
+    const int numGridCells = gridSizeX * gridSizeY;
-    const int bbindex = y_id * gridSize + x_id;
+    const int bbindex = y_id * gridSizeX + x_id;
    float alpha = scale_x_y;
    float beta = -0.5 * (scale_x_y - 1);
@@ -84,20 +84,20 @@ __global__ void gpuYoloLayer(const float* input, float* output, const uint gridS
    }
 }
-__global__ void gpuRegionLayer(const float* input, float* output, const uint gridSize, const uint numOutputClasses,
+__global__ void gpuRegionLayer(const float* input, float* output, const uint gridSizeX, const uint gridSizeY, const uint numOutputClasses,
                               const uint numBBoxes)
 {
    uint x_id = blockIdx.x * blockDim.x + threadIdx.x;
    uint y_id = blockIdx.y * blockDim.y + threadIdx.y;
    uint z_id = blockIdx.z * blockDim.z + threadIdx.z;
-    if ((x_id >= gridSize) || (y_id >= gridSize) || (z_id >= numBBoxes))
+    if ((x_id >= gridSizeX) || (y_id >= gridSizeY) || (z_id >= numBBoxes))
    {
        return;
    }
-    const int numGridCells = gridSize * gridSize;
+    const int numGridCells = gridSizeX * gridSizeY;
-    const int bbindex = y_id * gridSize + x_id;
+    const int bbindex = y_id * gridSizeX + x_id;
    output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 0)]
        = sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 0)]);
@@ -132,24 +132,24 @@ __global__ void gpuRegionLayer(const float* input, float* output, const uint gri
    }
 }
-cudaError_t cudaYoloLayer(const void* input, void* output, const uint& batchSize, const uint& gridSize,
+cudaError_t cudaYoloLayer(const void* input, void* output, const uint& batchSize, const uint& gridSizeX, const uint& gridSizeY,
                            const uint& numOutputClasses, const uint& numBBoxes,
                            uint64_t outputSize, cudaStream_t stream, const uint modelCoords, const float modelScale, const uint modelType);
-cudaError_t cudaYoloLayer(const void* input, void* output, const uint& batchSize, const uint& gridSize,
+cudaError_t cudaYoloLayer(const void* input, void* output, const uint& batchSize, const uint& gridSizeX, const uint& gridSizeY,
                            const uint& numOutputClasses, const uint& numBBoxes,
                            uint64_t outputSize, cudaStream_t stream, const uint modelCoords, const float modelScale, const uint modelType)
 {
    dim3 threads_per_block(16, 16, 4);
-    dim3 number_of_blocks((gridSize / threads_per_block.x) + 1,
+    dim3 number_of_blocks((gridSizeX / threads_per_block.x) + 1,
-                          (gridSize / threads_per_block.y) + 1,
+                          (gridSizeY / threads_per_block.y) + 1,
                          (numBBoxes / threads_per_block.z) + 1);
    if (modelType == 1) {
        for (unsigned int batch = 0; batch < batchSize; ++batch)
        {
            gpuYoloLayer<<<number_of_blocks, threads_per_block, 0, stream>>>(
                reinterpret_cast<const float*>(input) + (batch * outputSize),
-                reinterpret_cast<float*>(output) + (batch * outputSize), gridSize, numOutputClasses,
+                reinterpret_cast<float*>(output) + (batch * outputSize), gridSizeX, gridSizeY, numOutputClasses,
                numBBoxes, modelCoords, modelScale);
        }
    }
@@ -158,7 +158,7 @@ cudaError_t cudaYoloLayer(const void* input, void* output, const uint& batchSize
        {
            gpuRegionLayer<<<number_of_blocks, threads_per_block, 0, stream>>>(
                reinterpret_cast<const float*>(input) + (batch * outputSize),
-                reinterpret_cast<float*>(output) + (batch * outputSize), gridSize, numOutputClasses,
+                reinterpret_cast<float*>(output) + (batch * outputSize), gridSizeX, gridSizeY, numOutputClasses,
                numBBoxes);
        }
    }
--- a/native/nvdsinfer_custom_impl_Yolo/yoloPlugins.cpp
+++ b/native/nvdsinfer_custom_impl_Yolo/yoloPlugins.cpp
@@ -52,7 +52,7 @@ void read(const char*& buffer, T& val)
 cudaError_t cudaYoloLayer (
    const void* input, void* output, const uint& batchSize,
-    const uint& gridSize, const uint& numOutputClasses,
+    const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses,
    const uint& numBBoxes, uint64_t outputSize, cudaStream_t stream, const uint modelCoords, const float modelScale, const uint modelType);
 YoloLayer::YoloLayer (const void* data, size_t length)
@@ -60,7 +60,8 @@ YoloLayer::YoloLayer (const void* data, size_t length)
    const char *d = static_cast<const char*>(data);
    read(d, m_NumBoxes);
    read(d, m_NumClasses);
-    read(d, m_GridSize);
+    read(d, m_GridSizeX);
    read(d, m_GridSizeY);
    read(d, m_OutputSize);
    read(d, m_type);
@@ -94,10 +95,11 @@ YoloLayer::YoloLayer (const void* data, size_t length)
 };
 YoloLayer::YoloLayer (
-    const uint& numBoxes, const uint& numClasses, const uint& gridSize, const uint model_type, const uint new_coords, const float scale_x_y, const float beta_nms, const std::vector<float> anchors, std::vector<std::vector<int>> mask) :
+    const uint& numBoxes, const uint& numClasses, const uint& gridSizeX, const uint& gridSizeY, const uint model_type, const uint new_coords, const float scale_x_y, const float beta_nms, const std::vector<float> anchors, std::vector<std::vector<int>> mask) :
    m_NumBoxes(numBoxes),
    m_NumClasses(numClasses),
-    m_GridSize(gridSize),
+    m_GridSizeX(gridSizeX),
    m_GridSizeY(gridSizeY),
    m_type(model_type),
    m_new_coords(new_coords),
    m_scale_x_y(scale_x_y),
@@ -107,8 +109,9 @@ YoloLayer::YoloLayer (
 {
    assert(m_NumBoxes > 0);
    assert(m_NumClasses > 0);
-    assert(m_GridSize > 0);
+    assert(m_GridSizeX > 0);
-    m_OutputSize = m_GridSize * m_GridSize * (m_NumBoxes * (4 + 1 + m_NumClasses));
+    assert(m_GridSizeY > 0);
    m_OutputSize = m_GridSizeX * m_GridSizeY * (m_NumBoxes * (4 + 1 + m_NumClasses));
 };
 nvinfer1::Dims
@@ -142,7 +145,7 @@ int YoloLayer::enqueue(
    cudaStream_t stream)
 {
    CHECK(cudaYoloLayer(
-              inputs[0], outputs[0], batchSize, m_GridSize, m_NumClasses, m_NumBoxes,
+              inputs[0], outputs[0], batchSize, m_GridSizeX, m_GridSizeY, m_NumClasses, m_NumBoxes,
              m_OutputSize, stream, m_new_coords, m_scale_x_y, m_type));
    return 0;
 }
@@ -161,7 +164,7 @@ size_t YoloLayer::getSerializationSize() const
        }
    }
-    return sizeof(m_NumBoxes) + sizeof(m_NumClasses) + sizeof(m_GridSize) + sizeof(m_OutputSize) + sizeof(m_type)
+    return sizeof(m_NumBoxes) + sizeof(m_NumClasses) + sizeof(m_GridSizeX) + sizeof(m_GridSizeY) + sizeof(m_OutputSize) + sizeof(m_type)
            + sizeof(m_new_coords) + sizeof(m_scale_x_y) + sizeof(m_beta_nms) + anchorsSum * sizeof(float) + maskSum * sizeof(int);
 }
@@ -170,7 +173,8 @@ void YoloLayer::serialize(void* buffer) const
    char *d = static_cast<char*>(buffer);
    write(d, m_NumBoxes);
    write(d, m_NumClasses);
-    write(d, m_GridSize);
+    write(d, m_GridSizeX);
    write(d, m_GridSizeY);
    write(d, m_OutputSize);
    write(d, m_type);
@@ -199,7 +203,7 @@ void YoloLayer::serialize(void* buffer) const
 nvinfer1::IPluginV2* YoloLayer::clone() const
 {
-    return new YoloLayer (m_NumBoxes, m_NumClasses, m_GridSize, m_type, m_new_coords, m_scale_x_y, m_beta_nms, m_Anchors, m_Mask);
+    return new YoloLayer (m_NumBoxes, m_NumClasses, m_GridSizeX, m_GridSizeY, m_type, m_new_coords, m_scale_x_y, m_beta_nms, m_Anchors, m_Mask);
 }
 REGISTER_TENSORRT_PLUGIN(YoloLayerPluginCreator);
--- a/native/nvdsinfer_custom_impl_Yolo/yoloPlugins.h
+++ b/native/nvdsinfer_custom_impl_Yolo/yoloPlugins.h
@@ -56,7 +56,7 @@ class YoloLayer : public nvinfer1::IPluginV2
 {
 public:
    YoloLayer (const void* data, size_t length);
-    YoloLayer (const uint& numBoxes, const uint& numClasses, const uint& gridSize,
+    YoloLayer (const uint& numBoxes, const uint& numClasses, const uint& gridSizeX, const uint& gridSizeY,
                const uint model_type, const uint new_coords, const float scale_x_y, const float beta_nms,
                const std::vector<float> anchors, const std::vector<std::vector<int>> mask);
    const char* getPluginType () const override { return YOLOLAYER_PLUGIN_NAME; }
@@ -96,7 +96,8 @@ public:
 private:
    uint m_NumBoxes {0};
    uint m_NumClasses {0};
-    uint m_GridSize {0};
+    uint m_GridSizeX {0};
    uint m_GridSizeY {0};
    uint64_t m_OutputSize {0};
    std::string m_Namespace {""};
--- a/non_square/nvdsinfer_custom_impl_Yolo/Makefile
+++ b/non_square/nvdsinfer_custom_impl_Yolo/Makefile
@@ -1,71 +0,0 @@
 ################################################################################
 # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a
 # copy of this software and associated documentation files (the "Software"),
 # to deal in the Software without restriction, including without limitation
 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
 # and/or sell copies of the Software, and to permit persons to whom the
 # Software is furnished to do so, subject to the following conditions:
 #
 # The above copyright notice and this permission notice shall be included in
 # all copies or substantial portions of the Software.
 #
 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 # DEALINGS IN THE SOFTWARE.
 #
 # Edited by Marcos Luciano
 # https://www.github.com/marcoslucianops
 ################################################################################
 # CUDA_VER selects the toolkit under /usr/local/cuda-$(CUDA_VER); the build
 # stops immediately when it is empty.
 CUDA_VER?=
 ifeq ($(CUDA_VER),)
  $(error "CUDA_VER is not set")
 endif
 # Host compiler and the nvcc that belongs to the selected CUDA toolkit.
 CC:= g++
 NVCC:=/usr/local/cuda-$(CUDA_VER)/bin/nvcc
 # Flags for the .cpp sources: position-independent C++11 code, with include
 # paths for ../../includes and the CUDA toolkit headers.
 CFLAGS:= -Wall -std=c++11 -shared -fPIC -Wno-error=deprecated-declarations
 CFLAGS+= -I../../includes -I/usr/local/cuda-$(CUDA_VER)/include
 # Libraries linked into the shared object; they are wrapped in a linker group
 # (--start-group/--end-group) in LFLAGS.
 LIBS:= -lnvinfer_plugin -lnvinfer -lnvparsers -L/usr/local/cuda-$(CUDA_VER)/lib64 -lcudart -lcublas -lstdc++fs
 LFLAGS:= -shared -Wl,--start-group $(LIBS) -Wl,--end-group
 # Headers in this directory; every object file is rebuilt when one changes.
 INCS:= $(wildcard *.h)
 # Sources compiled into the library (.cpp via $(CC), .cu via $(NVCC)).
 SRCFILES:= nvdsinfer_yolo_engine.cpp \
           nvdsparsebbox_Yolo.cpp \
           yoloPlugins.cpp \
           layers/convolutional_layer.cpp \
           layers/dropout_layer.cpp \
           layers/shortcut_layer.cpp \
           layers/route_layer.cpp \
           layers/upsample_layer.cpp \
           layers/maxpool_layer.cpp \
           layers/activation_layer.cpp \
           utils.cpp \
           yolo.cpp \
           yoloForward.cu
 TARGET_LIB:= libnvdsinfer_custom_impl_Yolo.so
 # Object list: first map .cpp -> .o, then map the remaining .cu -> .o.
 TARGET_OBJS:= $(SRCFILES:.cpp=.o)
 TARGET_OBJS:= $(TARGET_OBJS:.cu=.o)
 # Default target: build the shared library.
 all: $(TARGET_LIB)
 # Compile a C++ source; also depends on the headers and on this Makefile.
 %.o: %.cpp $(INCS) Makefile
 	$(CC) -c -o $@ $(CFLAGS) $<
 # Compile a CUDA source with nvcc, forwarding -fPIC to the host compiler.
 %.o: %.cu $(INCS) Makefile
 	$(NVCC) -c -o $@ --compiler-options '-fPIC' $<
 # Link all objects into the shared library.
 $(TARGET_LIB) : $(TARGET_OBJS)
 	$(CC) -o $@  $(TARGET_OBJS) $(LFLAGS)
 # Remove the library and all object files.
 clean:
 	rm -rf $(TARGET_LIB)
 	rm -rf $(TARGET_OBJS)
--- a/non_square/nvdsinfer_custom_impl_Yolo/layers/upsample_layer.cpp
+++ b/non_square/nvdsinfer_custom_impl_Yolo/layers/upsample_layer.cpp
@@ -1,90 +0,0 @@
 /*
 * Created by Marcos Luciano
 * https://www.github.com/marcoslucianops
 */
 #include "upsample_layer.h"
 /*
 * Adds a nearest-neighbour resize layer for a darknet "upsample" block.
 *
 * layerIdx       index used to build the layer name "upsample_<layerIdx>"
 * block          parsed cfg block; must contain type == "upsample" and an
 *                integer "stride" entry (std::stoi throws if it is not numeric,
 *                block.at throws if the key is missing)
 * weights, trtWeights, inputChannels
 *                not used by this implementation; kept so the signature stays
 *                the same for callers
 * input          tensor to upsample; must have exactly 3 dimensions
 * network        network definition that receives the new layer
 *
 * Returns the created resize layer. The first dimension is left unscaled and
 * the remaining two are multiplied by stride, so non-square inputs are handled
 * without any extra work.
 *
 * This replaces an earlier implementation that built the upsample from two
 * constant matrices and two matrix-multiply layers.
 */
 nvinfer1::ILayer* upsampleLayer(
    int layerIdx,
    std::map<std::string, std::string>& block,
    std::vector<float>& weights,
    std::vector<nvinfer1::Weights>& trtWeights,
    int& inputChannels,
    nvinfer1::ITensor* input,
    nvinfer1::INetworkDefinition* network)
 {
    assert(block.at("type") == "upsample");
    nvinfer1::Dims inpDims = input->getDimensions();
    assert(inpDims.nbDims == 3);
    int stride = std::stoi(block.at("stride"));

    nvinfer1::IResizeLayer* resize_layer = network->addResize(*input);
    assert(resize_layer != nullptr);
    resize_layer->setResizeMode(nvinfer1::ResizeMode::kNEAREST);

    // Explicit conversion: an int inside a braced float initializer is a
    // narrowing conversion, which C++11 does not allow.
    const float scaleFactor = static_cast<float>(stride);
    float scale[3] = {1.0f, scaleFactor, scaleFactor};
    resize_layer->setScales(scale, 3);

    std::string layer_name = "upsample_" + std::to_string(layerIdx);
    resize_layer->setName(layer_name.c_str());
    return resize_layer;
 }
--- a/non_square/nvdsinfer_custom_impl_Yolo/yoloPlugins.h
+++ b/non_square/nvdsinfer_custom_impl_Yolo/yoloPlugins.h
@@ -1,156 +0,0 @@
 /*
 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 * Edited by Marcos Luciano
 * https://www.github.com/marcoslucianops
 */
 #ifndef __YOLO_PLUGINS__
 #define __YOLO_PLUGINS__
 #include <cassert>
 #include <cstring>
 #include <cuda_runtime_api.h>
 #include <iostream>
 #include <memory>
 #include <vector>
 #include "NvInferPlugin.h"
 /*
 * Evaluates a CUDA call once and aborts the process when it reports a failure.
 *
 * The result is stored in a local before it is tested, so the wrapped call is
 * not executed a second time while the error message is being built. The
 * do/while(0) wrapper makes the macro behave as a single statement, which keeps
 * it safe inside an unbraced if/else. Use it as: CHECK(cudaCall(...));
 */
 #define CHECK(status)                                                                              \
    do                                                                                             \
    {                                                                                              \
        const auto check_status_ = (status);                                                       \
        if (check_status_ != 0)                                                                    \
        {                                                                                          \
            std::cout << "CUDA failure: " << cudaGetErrorString(check_status_) << " in file "      \
                      << __FILE__ << " at line " << __LINE__ << std::endl;                         \
            abort();                                                                               \
        }                                                                                          \
    } while (0)
 // Identity strings reported by the plugin and its creator through
 // getPluginType / getPluginName and getPluginVersion. The unnamed namespace
 // gives every translation unit that includes this header its own copy.
 namespace {
 const char* YOLOLAYER_PLUGIN_NAME = "YoloLayer_TRT";
 const char* YOLOLAYER_PLUGIN_VERSION = "1";
 }  // namespace
 // TensorRT IPluginV2 subclass for the YOLO layer.
 // Only the trivial members are defined inline; the constructors,
 // getOutputDimensions, supportsFormat, configureWithFormat, enqueue,
 // getSerializationSize, serialize and clone are defined outside this header.
 class YoloLayer : public nvinfer1::IPluginV2
 {
 public:
    // Builds a layer from a raw byte buffer. YoloLayerPluginCreator::deserializePlugin
    // calls this with the serialized plugin data it receives.
    YoloLayer (const void* data, size_t length);
    // Builds a layer from explicit parameters.
    // NOTE(review): the arguments presumably populate the m_* members of the
    // same name below - confirm against the definition in the .cpp file.
    YoloLayer (const uint& numBoxes, const uint& numClasses, const uint& gridSizeX, const uint& gridSizeY,
                const uint model_type, const uint new_coords, const float scale_x_y, const float beta_nms,
                const std::vector<float> anchors, const std::vector<std::vector<int>> mask);
    // Plugin identity: returns the file-level name and version strings.
    const char* getPluginType () const override { return YOLOLAYER_PLUGIN_NAME; }
    const char* getPluginVersion () const override { return YOLOLAYER_PLUGIN_VERSION; }
    // The plugin always reports exactly one output.
    int getNbOutputs () const override { return 1; }
    nvinfer1::Dims getOutputDimensions (
        int index, const nvinfer1::Dims* inputs,
        int nbInputDims) override;
    bool supportsFormat (
        nvinfer1::DataType type, nvinfer1::PluginFormat format) const override;
    void configureWithFormat (
        const nvinfer1::Dims* inputDims, int nbInputs,
        const nvinfer1::Dims* outputDims, int nbOutputs,
        nvinfer1::DataType type, nvinfer1::PluginFormat format, int maxBatchSize) override;
    // initialize does no work and always returns 0; terminate is a no-op.
    int initialize () override { return 0; }
    void terminate () override {}
    // Always returns 0, i.e. no workspace is requested.
    size_t getWorkspaceSize (int maxBatchSize) const override { return 0; }
    int enqueue (
        int batchSize, const void* const* inputs, void** outputs,
        void* workspace, cudaStream_t stream) override;
    size_t getSerializationSize() const override;
    void serialize (void* buffer) const override;
    // The object deletes itself, so it must have been allocated with new.
    void destroy () override { delete this; }
    nvinfer1::IPluginV2* clone() const override;
    // Stores a copy of the namespace string passed in.
    void setPluginNamespace (const char* pluginNamespace)override {
        m_Namespace = pluginNamespace;
    }
    // Returns a pointer into m_Namespace; it is only valid while this object
    // lives and until the namespace is set again.
    virtual const char* getPluginNamespace () const override {
        return m_Namespace.c_str();
    }
 private:
    // All scalar members default to zero and the containers to empty.
    // NOTE(review): the meanings are inferred from the names (box/class counts,
    // grid width/height, output element count) - confirm in the .cpp file.
    uint m_NumBoxes {0};
    uint m_NumClasses {0};
    uint m_GridSizeX {0};
    uint m_GridSizeY {0};
    uint64_t m_OutputSize {0};
    // Namespace string managed by setPluginNamespace / getPluginNamespace.
    std::string m_Namespace {""};
    // NOTE(review): presumably filled from the model_type, new_coords,
    // scale_x_y, beta_nms, anchors and mask constructor arguments - confirm.
    uint m_type {0};
    uint m_new_coords {0};
    float m_scale_x_y {0};
    float m_beta_nms {0};
    std::vector<float> m_Anchors;
    std::vector<std::vector<int>> m_Mask;
 };
 // Plugin creator for YoloLayer.
 // Only deserialization is supported: getFieldNames and createPlugin log an
 // error to std::cerr and return nullptr.
 class YoloLayerPluginCreator : public nvinfer1::IPluginCreator
 {
 public:
    YoloLayerPluginCreator () {}
    ~YoloLayerPluginCreator () {}
    // Plugin identity: returns the file-level name and version strings.
    const char* getPluginName () const override { return YOLOLAYER_PLUGIN_NAME; }
    const char* getPluginVersion () const override { return YOLOLAYER_PLUGIN_VERSION; }
    // Not implemented: logs an error and returns nullptr.
    const nvinfer1::PluginFieldCollection* getFieldNames() override {
        std::cerr<< "YoloLayerPluginCreator::getFieldNames is not implemented" << std::endl;
        return nullptr;
    }
    // Not implemented: logs an error and returns nullptr. Both arguments are
    // ignored.
    nvinfer1::IPluginV2* createPlugin (
        const char* name, const nvinfer1::PluginFieldCollection* fc) override
    {
        // The message names this method (it used to report getFieldNames) and
        // ends with a newline so it does not run into the next log line.
        std::cerr<< "YoloLayerPluginCreator::createPlugin is not implemented" << std::endl;
        return nullptr;
    }
    // Rebuilds a YoloLayer from serialized data. The returned object is
    // allocated with new; YoloLayer::destroy() deletes it.
    nvinfer1::IPluginV2* deserializePlugin (
        const char* name, const void* serialData, size_t serialLength) override
    {
        std::cout << "Deserialize yoloLayer plugin: " << name << std::endl;
        return new YoloLayer(serialData, serialLength);
    }
    // Stores a copy of the namespace string passed in.
    void setPluginNamespace(const char* libNamespace) override {
        m_Namespace = libNamespace;
    }
    // Returns a pointer into m_Namespace; it is only valid while this object
    // lives and until the namespace is set again.
    const char* getPluginNamespace() const override {
        return m_Namespace.c_str();
    }
 private:
    std::string m_Namespace {""};
 };
 // Globals declared here and defined in another translation unit (the
 // definitions are not visible in this header).
 // NOTE(review): the names suggest class count, beta_nms value, anchors and
 // per-layer masks shared with the bounding-box parser - confirm at the definitions.
 extern int kNUM_CLASSES;
 extern float kBETA_NMS;
 extern std::vector<float> kANCHORS;
 extern std::vector<std::vector<int>> kMASK;
 #endif // __YOLO_PLUGINS__
--- a/readme.md
+++ b/readme.md
@@ -6,45 +6,46 @@ NVIDIA DeepStream SDK 5.1 configuration for YOLO models
 ### Improvements on this repository
 * Darknet CFG params parser (no need to edit nvdsparsebbox_Yolo.cpp or any other file for native models)
-* Support to new_coords, beta_nms and scale_x_y params
+* Support for new_coords, beta_nms and scale_x_y params
-* Support to new models not supported in official DeepStream SDK YOLO.
+* Support for new models not supported in official DeepStream SDK YOLO.
-* Support to layers not supported in official DeepStream SDK YOLO.
+* Support for layers not supported in official DeepStream SDK YOLO.
-* Support to activations not supported in official DeepStream SDK YOLO.
+* Support for activations not supported in official DeepStream SDK YOLO.
-* Support to Convolutional groups
+* Support for Convolutional groups
 * **Support for INT8 calibration** (not available for YOLOv5 models)
 * **Support for non square models**
 ##
 Tutorial
 * [Basic usage](#basic-usage)
 * [INT8 calibration](#int8-calibration)
 * [Configuring to your custom model](https://github.com/marcoslucianops/DeepStream-Yolo/blob/master/customModels.md)
 * [Multiple YOLO inferences](https://github.com/marcoslucianops/DeepStream-Yolo/blob/master/multipleInferences.md)
 Benchmark
 * [mAP/FPS comparison between models](#mapfps-comparison-between-models)
 TensorRT conversion
-* [Native](#native-tensorrt-conversion) (tested models below)
+* Native (tested models below)
-    * YOLOv4x-Mish
+    * [YOLOv4x-Mish](https://github.com/AlexeyAB/darknet) [[cfg](https://raw.githubusercontent.com/AlexeyAB/darknet/master/cfg/yolov4x-mish.cfg)] [[weights](https://github.com/AlexeyAB/darknet/releases/download/darknet_yolo_v4_pre/yolov4x-mish.weights)]
-    * YOLOv4-CSP
+    * [YOLOv4-CSP](https://github.com/WongKinYiu/ScaledYOLOv4/tree/yolov4-csp) [[cfg](https://raw.githubusercontent.com/AlexeyAB/darknet/master/cfg/yolov4-csp.cfg)] [[weights](https://github.com/AlexeyAB/darknet/releases/download/darknet_yolo_v4_pre/yolov4-csp.weights)]
-    * YOLOv4
+    * [YOLOv4](https://github.com/AlexeyAB/darknet) [[cfg](https://raw.githubusercontent.com/AlexeyAB/darknet/master/cfg/yolov4.cfg)] [[weights](https://github.com/AlexeyAB/darknet/releases/download/darknet_yolo_v3_optimal/yolov4.weights)]
-    * YOLOv4-Tiny
+    * [YOLOv4-Tiny](https://github.com/AlexeyAB/darknet) [[cfg](https://raw.githubusercontent.com/AlexeyAB/darknet/master/cfg/yolov4-tiny.cfg)] [[weights](https://github.com/AlexeyAB/darknet/releases/download/darknet_yolo_v4_pre/yolov4-tiny.weights)]
-    * YOLOv3-SSP
+    * [YOLOv3-SPP](https://github.com/pjreddie/darknet) [[cfg](https://raw.githubusercontent.com/pjreddie/darknet/master/cfg/yolov3-spp.cfg)] [[weights](https://pjreddie.com/media/files/yolov3-spp.weights)]
-    * YOLOv3
+    * [YOLOv3](https://github.com/pjreddie/darknet) [[cfg](https://raw.githubusercontent.com/pjreddie/darknet/master/cfg/yolov3.cfg)] [[weights](https://pjreddie.com/media/files/yolov3.weights)]
-    * YOLOv3-Tiny-PRN
+    * [YOLOv3-Tiny-PRN](https://github.com/WongKinYiu/PartialResidualNetworks) [[cfg](https://raw.githubusercontent.com/WongKinYiu/PartialResidualNetworks/master/cfg/yolov3-tiny-prn.cfg)] [[weights](https://github.com/WongKinYiu/PartialResidualNetworks/raw/master/model/yolov3-tiny-prn.weights)]
-    * YOLOv3-Tiny
+    * [YOLOv3-Tiny](https://github.com/pjreddie/darknet) [[cfg](https://raw.githubusercontent.com/pjreddie/darknet/master/cfg/yolov3-tiny.cfg)] [[weights](https://pjreddie.com/media/files/yolov3-tiny.weights)]
-    * YOLOv3-Lite
+    * [YOLOv3-Lite](https://github.com/dog-qiuqiu/MobileNet-Yolo) [[cfg](https://raw.githubusercontent.com/dog-qiuqiu/MobileNet-Yolo/master/MobileNetV2-YOLOv3-Lite/COCO/MobileNetV2-YOLOv3-Lite-coco.cfg)] [[weights](https://github.com/dog-qiuqiu/MobileNet-Yolo/raw/master/MobileNetV2-YOLOv3-Lite/COCO/MobileNetV2-YOLOv3-Lite-coco.weights)]
-    * YOLOv3-Nano
+    * [YOLOv3-Nano](https://github.com/dog-qiuqiu/MobileNet-Yolo) [[cfg](https://raw.githubusercontent.com/dog-qiuqiu/MobileNet-Yolo/master/MobileNetV2-YOLOv3-Nano/COCO/MobileNetV2-YOLOv3-Nano-coco.cfg)] [[weights](https://github.com/dog-qiuqiu/MobileNet-Yolo/raw/master/MobileNetV2-YOLOv3-Nano/COCO/MobileNetV2-YOLOv3-Nano-coco.weights)]
-    * YOLO-Fastest
+    * [YOLO-Fastest 1.1](https://github.com/dog-qiuqiu/Yolo-Fastest) [[cfg](https://raw.githubusercontent.com/dog-qiuqiu/Yolo-Fastest/master/ModelZoo/yolo-fastest-1.1_coco/yolo-fastest-1.1.cfg)] [[weights](https://github.com/dog-qiuqiu/Yolo-Fastest/raw/master/ModelZoo/yolo-fastest-1.1_coco/yolo-fastest-1.1.weights)]
-    * YOLO-Fastest-XL
+    * [YOLO-Fastest-XL 1.1](https://github.com/dog-qiuqiu/Yolo-Fastest) [[cfg](https://raw.githubusercontent.com/dog-qiuqiu/Yolo-Fastest/master/ModelZoo/yolo-fastest-1.1_coco/yolo-fastest-1.1-xl.cfg)] [[weights](https://github.com/dog-qiuqiu/Yolo-Fastest/raw/master/ModelZoo/yolo-fastest-1.1_coco/yolo-fastest-1.1-xl.weights)]
-    * YOLOv2
+    * [YOLOv2](https://github.com/pjreddie/darknet) [[cfg](https://raw.githubusercontent.com/pjreddie/darknet/master/cfg/yolov2.cfg)] [[weights](https://pjreddie.com/media/files/yolov2.weights)]
-    * YOLOv2-Tiny
+    * [YOLOv2-Tiny](https://github.com/pjreddie/darknet) [[cfg](https://raw.githubusercontent.com/pjreddie/darknet/master/cfg/yolov2-tiny.cfg)] [[weights](https://pjreddie.com/media/files/yolov2-tiny.weights)]
 * External
    * [YOLOv5 5.0](https://github.com/marcoslucianops/DeepStream-Yolo/blob/master/YOLOv5-5.0.md)
    * [YOLOv5 4.0](https://github.com/marcoslucianops/DeepStream-Yolo/blob/master/YOLOv5-4.0.md)
    * [YOLOv5 3.X (3.0/3.1)](https://github.com/marcoslucianops/DeepStream-Yolo/blob/master/YOLOv5-3.X.md)
-Request
+Benchmark
-* [Request native TensorRT conversion for your YOLO-based model](#request-native-tensorrt-conversion-for-your-yolo-based-model)
+* [mAP/FPS comparison between models](#mapfps-comparison-between-models)
 ##
@@ -55,202 +56,24 @@ Request
 ##
-### mAP/FPS comparison between models (OUTDATED)
+### Basic usage
 DeepStream SDK YOLOv4: https://youtu.be/Qi_F_IYpuFQ
 Darknet YOLOv4: https://youtu.be/AxJJ9fnJ7Xk
 <details><summary>NVIDIA GTX 1050 (4GB Mobile)</summary>
 ```
-CUDA 10.2
+git clone https://github.com/marcoslucianops/DeepStream-Yolo.git
-Driver 440.33
+cd DeepStream-Yolo/native
 TensorRT 7.2.1
 cuDNN 8.0.5
 OpenCV 3.2.0 (libopencv-dev)
 OpenCV Python 4.4.0 (opencv-python)
 PyTorch 1.7.0
 Torchvision 0.8.1
 ```
-| TensorRT        | Precision | Resolution | IoU=0.5:0.95 | IoU=0.5 | IoU=0.75 | FPS<br />(with display) | FPS<br />(without display) |
+Download the cfg and weights files for your model and move them to the DeepStream-Yolo/native folder
 |:---------------:|:---------:|:----------:|:------------:|:-------:|:--------:|:-----------------------:|:--------------------------:|
 | YOLOv5x         | FP32      | 608        | 0.406        | 0.562   | 0.441    | 7.91                    | 7.99                       |
 | YOLOv5l         | FP32      | 608        | 0.385        | 0.540   | 0.419    | 12.82                   | 12.97                      |
 | YOLOv5m         | FP32      | 608        | 0.354        | 0.507   | 0.388    | 25.09                   | 25.97                      |
 | YOLOv5s         | FP32      | 608        | 0.281        | 0.430   | 0.307    | 52.02                   | 56.21                      |
 | YOLOv4x-MISH    | FP32      | 640        | 0.454        | 0.644   | 0.491    | 7.45                    | 7.56                       |
 | YOLOv4x-MISH    | FP32      | 608        | 0.450        | 0.644   | 0.482    | 7.93                    | 8.05                       |
 | YOLOv4-CSP      | FP32      | 608        | 0.434        | 0.628   | 0.465    | 13.74                   | 14.11                      |
 | YOLOv4-CSP      | FP32      | 512        | 0.427        | 0.618   | 0.459    | 21.69                   | 22.75                      |
 | YOLOv4          | FP32      | 608        | 0.490        | 0.734   | 0.538    | 11.72                   | 12.09                      |
 | YOLOv4          | FP32      | 512        | 0.484        | 0.725   | 0.533    | 19.00                   | 19.70                      |
 | YOLOv4          | FP32      | 416        | 0.456        | 0.693   | 0.491    | 22.63                   | 23.81                      |
 | YOLOv4          | FP32      | 320        | 0.400        | 0.623   | 0.424    | 32.46                   | 35.07                      |
 | YOLOv3-SPP      | FP32      | 608        | 0.411        | 0.680   | 0.436    | 11.85                   | 12.12                      |
 | YOLOv3          | FP32      | 608        | 0.374        | 0.654   | 0.387    | 12.00                   | 12.33                      |
 | YOLOv3          | FP32      | 416        | 0.369        | 0.651   | 0.379    | 23.19                   | 24.55                      |
 | YOLOv4-Tiny     | FP32      | 416        | 0.195        | 0.382   | 0.175    | 144.55                  | 176.31                     |
 | YOLOv3-Tiny-PRN | FP32      | 416        | 0.168        | 0.369   | 0.130    | 181.71                  | 244.47                     |
 | YOLOv3-Tiny     | FP32      | 416        | 0.165        | 0.357   | 0.128    | 154.19                  | 190.42                     |
 | YOLOv3-Lite     | FP32      | 416        | 0.165        | 0.350   | 0.131    | 122.40                  | 146.19                     |
 | YOLOv3-Lite     | FP32      | 320        | 0.155        | 0.324   | 0.128    | 163.76                  | 204.21                     |
 | YOLOv3-Nano     | FP32      | 416        | 0.127        | 0.277   | 0.098    | 191.77                  | 264.59                     |
 | YOLOv3-Nano     | FP32      | 320        | 0.122        | 0.258   | 0.099    | 207.04                  | 269.89                     |
 | YOLO-Fastest    | FP32      | 416        | 0.092        | 0.213   | 0.062    | 174.26                  | 221.05                     |
 | YOLO-Fastest    | FP32      | 320        | 0.090        | 0.201   | 0.068    | 199.48                  | 258.56                     |
 | YOLO-FastestXL  | FP32      | 416        | 0.144        | 0.306   | 0.115    | 121.89                  | 145.13                     |
 | YOLO-FastestXL  | FP32      | 320        | 0.136        | 0.279   | 0.117    | 162.65                  | 199.75                     |
 | YOLOv2          | FP32      | 608        | 0.286        | 0.534   | 0.274    | 23.92                   | 25.47                      |
 | YOLOv2-Tiny     | FP32      | 416        | 0.103        | 0.251   | 0.064    | 165.01                  | 203.02                     |
 | Darknet         | Precision | Resolution | IoU=0.5:0.95 | IoU=0.5 | IoU=0.75 | FPS<br />(with display) | FPS<br />(without display) |
 |:---------------:|:---------:|:----------:|:------------:|:-------:|:--------:|:-----------------------:|:--------------------------:|
 | YOLOv4x-MISH    | FP32      | 640        | 0.495        | 0.682   | 0.538    | 5.3                     | 5.5                        |
 | YOLOv4x-MISH    | FP32      | 608        | 0.493        | 0.680   | 0.535    | 5.4                     | 5.6                        |
 | YOLOv4-CSP      | FP32      | 608        | 0.473        | 0.661   | 0.515    | 9.2                     | 9.5                        |
 | YOLOv4-CSP      | FP32      | 512        | 0.458        | 0.645   | 0.496    | 13.6                    | 14.0                       |
 | YOLOv4          | FP32      | 608        | 0.513        | 0.748   | 0.574    | 7.3                     | 7.5                        |
 | YOLOv4          | FP32      | 512        | 0.506        | 0.738   | 0.564    | 11.8                    | 12.3                       |
 | YOLOv4          | FP32      | 416        | 0.479        | 0.709   | 0.527    | 15.4                    | 15.8                       |
 | YOLOv4          | FP32      | 320        | 0.421        | 0.638   | 0.454    | 21.0                    | 21.7                       |
 | YOLOv3-SPP      | FP32      | 608        | 0.432        | 0.701   | 0.465    | 6.9                     | 7.1                        |
 | YOLOv3          | FP32      | 608        | 0.391        | 0.672   | 0.412    | 7.0                     | 7.3                        |
 | YOLOv3          | FP32      | 416        | 0.384        | 0.668   | 0.402    | 16.3                    | 16.9                       |
 | YOLOv4-Tiny     | FP32      | 416        | 0.203        | 0.388   | 0.189    | 68.0                    | 112.5                      |
 | YOLOv3-Tiny-PRN | FP32      | 416        | 0.172        | 0.378   | 0.133    | 71.6                    | 143.9                      |
 | YOLOv3-Tiny     | FP32      | 416        | 0.171        | 0.367   | 0.137    | 71.5                    | 117.9                      |
 | YOLOv3-Lite     | FP32      | 416        | 0.169        | 0.349   | 0.144    | 53.8                    | 63.4                       |
 | YOLOv3-Lite     | FP32      | 320        | 0.159        | 0.326   | 0.139    | 55.2                    | 97.5                       |
 | YOLOv3-Nano     | FP32      | 416        | 0.129        | 0.275   | 0.102    | 58.0                    | 113.1                      |
 | YOLOv3-Nano     | FP32      | 320        | 0.124        | 0.259   | 0.106    | 61.6                    | 156.8                      |
 | YOLO-Fastest    | FP32      | 416        | 0.095        | 0.213   | 0.068    | 61.7                    | 104.1                      |
 | YOLO-Fastest    | FP32      | 320        | 0.093        | 0.202   | 0.074    | 65.8                    | 143.3                      |
 | YOLO-FastestXL  | FP32      | 416        | 0.148        | 0.308   | 0.125    | 62.0                    | 75.9                       |
 | YOLO-FastestXL  | FP32      | 320        | 0.141        | 0.284   | 0.125    | 63.9                    | 112.3                      |
 | YOLOv2          | FP32      | 608        | 0.297        | 0.548   | 0.291    | 12.1                    | 12.1                       |
 | YOLOv2-Tiny     | FP32      | 416        | 0.105        | 0.255   | 0.068    | 34.5                    | 40.7                       |
 | PyTorch | Precision | Resolution | IoU=0.5:0.95 | IoU=0.5 | IoU=0.75 | FPS<br />(with output) | FPS<br />(without output) |
 |:-------:|:---------:|:----------:|:------------:|:-------:|:--------:|:----------------------:|:-------------------------:|
 | YOLOv5x | FP32      | 608        | 0.487        | 0.676   | 0.527    | 8.25                   | 9.49                      |
 | YOLOv5l | FP32      | 608        | 0.471        | 0.662   | 0.512    | 12.67                  | 15.77                     |
 | YOLOv5m | FP32      | 608        | 0.439        | 0.631   | 0.474    | 18.13                  | 24.80                     |
 | YOLOv5s | FP32      | 608        | 0.369        | 0.567   | 0.395    | 28.03                  | 49.52                     |
 <br />
 </details>
 <details><summary>NVIDIA Jetson Nano (4GB)</summary>
 ```
 JetPack 4.4.1
 CUDA 10.2
 TensorRT 7.1.3
 cuDNN 8.0
 OpenCV 4.1.1
 ```
 | TensorRT        | Precision | Resolution | IoU=0.5:0.95 | IoU=0.5 | IoU=0.75 | FPS<br />(with display) | FPS<br />(without display) |
 |:---------------:|:---------:|:----------:|:------------:|:-------:|:--------:|:-----------------------:|:--------------------------:|
 | YOLOv4          | FP32      | 416        | 0.462        | 0.694   | 0.503    | 2.97                   | 2.99                      |
 | YOLOv4          | FP16      | 416        | 0.462        | 0.694   | 0.504    | 4.89                   | 4.96                      |
 | YOLOv4          | FP32      | 320        | 0.407        | 0.625   | 0.434    |                    |                       |
 | YOLOv4          | FP16      | 320        | 0.408        | 0.625   | 0.435    |                    |                       |
 | YOLOv3          | FP32      | 416        | 0.370        | 0.664   | 0.379    |                    |                       |
 | YOLOv3          | FP16      | 416        | 0.370        | 0.664   | 0.378    |                   |                       |
 | YOLOv4-Tiny     | FP32      | 416        | 0.194        | 0.378   | 0.177    | 21.79                  | 23.23                      |
 | YOLOv4-Tiny     | FP16      | 416        | 0.194        | 0.378   | 0.177    | 24.76                  | 26.18                      |
 | YOLOv3-Tiny-PRN | FP32      | 416        | 0.163        | 0.375   | 0.120    | 23.79                  | 25.18                      |
 | YOLOv3-Tiny-PRN | FP16      | 416        | 0.163        | 0.375   | 0.119    | 26.08                  | 27.96                      |
 | YOLOv3-Tiny     | FP32      | 416        | 0.162        | 0.363   | 0.122    | 22.84                  | 24.28                      |
 | YOLOv3-Tiny     | FP16      | 416        | 0.162        | 0.363   | 0.122    | 25.47                  | 27.18                      |
 | Darknet         | Precision | Resolution | IoU=0.5:0.95 | IoU=0.5 | IoU=0.75 | FPS<br />(with display) | FPS<br />(without display) |
 |:---------------:|:---------:|:----------:|:------------:|:-------:|:--------:|:-----------------------:|:--------------------------:|
 | YOLOv4          | FP32      | 416        |              |         |          |                         |                            |
 | YOLOv4          | FP32      | 320        |              |         |          |                         |                            |
 | YOLOv3          | FP32      | 416        |              |         |          |                         |                            |
 | YOLOv4-Tiny     | FP32      | 416        |              |         |          |                         |                            |
 | YOLOv3-Tiny-PRN | FP32      | 416        |              |         |          |                         |                            |
 | YOLOv3-Tiny     | FP32      | 416        |              |         |          |                         |                            |
 | YOLOv2          | FP32      | 608        |              |         |          |                         |                            |
 | YOLOv2-Tiny     | FP32      | 416        |              |         |          |                         |                            |
 | PyTorch | Precision | Resolution | IoU=0.5:0.95 | IoU=0.5 | IoU=0.75 | FPS<br />(with output) | FPS<br />(without output) |
 |:-------:|:---------:|:----------:|:------------:|:-------:|:--------:|:----------------------:|:-------------------------:|
 | YOLOv5s | FP32      | 416        |              |         |          |                        |                           |
 | YOLOv5s | FP16      | 416        |              |         |          |                        |                           |
 <br />
 </details>
 #### DeepStream settings
 * General
 ```
 width = 1920
 height = 1080
 maintain-aspect-ratio = 0
 batch-size = 1
 ```
 * Evaluate mAP
 ```
 valid = val2017 (COCO)
 nms-iou-threshold = 0.6
 pre-cluster-threshold = 0.001 (CONF_THRESH)
 ```
 * Evaluate FPS and Demo
 ```
 nms-iou-threshold = 0.45 (NMS; changed to beta_nms when available)
 pre-cluster-threshold = 0.25 (CONF_THRESH)
 ```
 ##
 ### Native TensorRT conversion
 Run command
 ```
 sudo chmod -R 777 /opt/nvidia/deepstream/deepstream-5.1/sources/
 ```
 Download [my native folder](https://github.com/marcoslucianops/DeepStream-Yolo/tree/master/native), rename to yolo and move to your deepstream/sources folder.
 Download the cfg and weights files for your model and move them to the deepstream/sources/yolo folder.
 * [YOLOv4x-Mish](https://github.com/AlexeyAB/darknet) [[cfg](https://raw.githubusercontent.com/AlexeyAB/darknet/master/cfg/yolov4x-mish.cfg)] [[weights](https://github.com/AlexeyAB/darknet/releases/download/darknet_yolo_v4_pre/yolov4x-mish.weights)]
 * [YOLOv4-CSP](https://github.com/WongKinYiu/ScaledYOLOv4/tree/yolov4-csp) [[cfg](https://raw.githubusercontent.com/AlexeyAB/darknet/master/cfg/yolov4-csp.cfg)] [[weights](https://github.com/AlexeyAB/darknet/releases/download/darknet_yolo_v4_pre/yolov4-csp.weights)]
 * [YOLOv4](https://github.com/AlexeyAB/darknet) [[cfg](https://raw.githubusercontent.com/AlexeyAB/darknet/master/cfg/yolov4.cfg)] [[weights](https://github.com/AlexeyAB/darknet/releases/download/darknet_yolo_v3_optimal/yolov4.weights)]
 * [YOLOv4-Tiny](https://github.com/AlexeyAB/darknet) [[cfg](https://raw.githubusercontent.com/AlexeyAB/darknet/master/cfg/yolov4-tiny.cfg)] [[weights](https://github.com/AlexeyAB/darknet/releases/download/darknet_yolo_v4_pre/yolov4-tiny.weights)]
 * [YOLOv3-SPP](https://github.com/pjreddie/darknet) [[cfg](https://raw.githubusercontent.com/pjreddie/darknet/master/cfg/yolov3-spp.cfg)] [[weights](https://pjreddie.com/media/files/yolov3-spp.weights)]
 * [YOLOv3](https://github.com/pjreddie/darknet) [[cfg](https://raw.githubusercontent.com/pjreddie/darknet/master/cfg/yolov3.cfg)] [[weights](https://pjreddie.com/media/files/yolov3.weights)]
 * [YOLOv3-Tiny-PRN](https://github.com/WongKinYiu/PartialResidualNetworks) [[cfg](https://raw.githubusercontent.com/WongKinYiu/PartialResidualNetworks/master/cfg/yolov3-tiny-prn.cfg)] [[weights](https://github.com/WongKinYiu/PartialResidualNetworks/raw/master/model/yolov3-tiny-prn.weights)]
 * [YOLOv3-Tiny](https://github.com/pjreddie/darknet) [[cfg](https://raw.githubusercontent.com/pjreddie/darknet/master/cfg/yolov3-tiny.cfg)] [[weights](https://pjreddie.com/media/files/yolov3-tiny.weights)]
 * [YOLOv3-Lite](https://github.com/dog-qiuqiu/MobileNet-Yolo) [[cfg](https://raw.githubusercontent.com/dog-qiuqiu/MobileNet-Yolo/master/MobileNetV2-YOLOv3-Lite/COCO/MobileNetV2-YOLOv3-Lite-coco.cfg)] [[weights](https://github.com/dog-qiuqiu/MobileNet-Yolo/raw/master/MobileNetV2-YOLOv3-Lite/COCO/MobileNetV2-YOLOv3-Lite-coco.weights)]
 * [YOLOv3-Nano](https://github.com/dog-qiuqiu/MobileNet-Yolo) [[cfg](https://raw.githubusercontent.com/dog-qiuqiu/MobileNet-Yolo/master/MobileNetV2-YOLOv3-Nano/COCO/MobileNetV2-YOLOv3-Nano-coco.cfg)] [[weights](https://github.com/dog-qiuqiu/MobileNet-Yolo/raw/master/MobileNetV2-YOLOv3-Nano/COCO/MobileNetV2-YOLOv3-Nano-coco.weights)]
 * [YOLO-Fastest](https://github.com/dog-qiuqiu/Yolo-Fastest) [[cfg](https://raw.githubusercontent.com/dog-qiuqiu/Yolo-Fastest/master/Yolo-Fastest/COCO/yolo-fastest.cfg)] [[weights](https://github.com/dog-qiuqiu/Yolo-Fastest/raw/master/Yolo-Fastest/COCO/yolo-fastest.weights)]
 * [YOLO-Fastest-XL](https://github.com/dog-qiuqiu/Yolo-Fastest) [[cfg](https://raw.githubusercontent.com/dog-qiuqiu/Yolo-Fastest/master/Yolo-Fastest/COCO/yolo-fastest-xl.cfg)] [[weights](https://github.com/dog-qiuqiu/Yolo-Fastest/raw/master/Yolo-Fastest/COCO/yolo-fastest-xl.weights)]
 * [YOLOv2](https://github.com/pjreddie/darknet) [[cfg](https://raw.githubusercontent.com/pjreddie/darknet/master/cfg/yolov2.cfg)] [[weights](https://pjreddie.com/media/files/yolov2.weights)]
 * [YOLOv2-Tiny](https://github.com/pjreddie/darknet) [[cfg](https://raw.githubusercontent.com/pjreddie/darknet/master/cfg/yolov2-tiny.cfg)] [[weights](https://pjreddie.com/media/files/yolov2-tiny.weights)]
 Compile
 * x86 platform
 ```
 cd /opt/nvidia/deepstream/deepstream-5.1/sources/yolo
 CUDA_VER=11.1 make -C nvdsinfer_custom_impl_Yolo
 ```
 * Jetson platform
 ```
 cd /opt/nvidia/deepstream/deepstream-5.1/sources/yolo
 CUDA_VER=10.2 make -C nvdsinfer_custom_impl_Yolo
 ```
@@ -299,12 +122,131 @@ Note: config_infer_primary.txt uses cluster-mode=4 and NMS = 0.45 (via code) whe
 ##
-### Request native TensorRT conversion for your YOLO-based model
+### INT8 calibration
 To request modified files for native TensorRT conversion for use in DeepStream SDK, send me the model cfg and weights files via the Issues tab.
-<br />
+Install OpenCV
 ```
 sudo apt-get install libopencv-dev
 ```
-Note: If your model are listed in native tab, you can use [my native folder](https://github.com/marcoslucianops/DeepStream-Yolo/tree/master/native) to run your model in DeepStream.
+Compile/recompile the nvdsinfer_custom_impl_Yolo lib with OpenCV support
 * x86 platform
 ```
 cd DeepStream-Yolo/native
 CUDA_VER=11.1 OPENCV=1 make -C nvdsinfer_custom_impl_Yolo
 ```
 * Jetson platform
 ```
 cd DeepStream-Yolo/native
 CUDA_VER=10.2 OPENCV=1 make -C nvdsinfer_custom_impl_Yolo
 ```
 For the COCO dataset, download [val2017](https://drive.google.com/file/d/1gbvfn7mcsGDRZ_luJwtITL-ru2kK99aK/view?usp=sharing), extract it, and move the extracted val2017 folder to the DeepStream-Yolo/native folder
 Select 1000 random images from the COCO dataset to run the calibration
 ```
 mkdir calibration
 for jpg in $(ls -1 val2017/*.jpg | sort -R | head -1000); do \
    cp ${jpg} calibration/; \
 done
 ```
 Create the calibration.txt file with all selected images
 ```
 realpath calibration/*jpg > calibration.txt
 ```
 Set environment variables
 ```
 export INT8_CALIB_IMG_PATH=calibration.txt
 export INT8_CALIB_BATCH_SIZE=1
 ```
 Change the following lines in the config_infer_primary.txt file
 ```
 ...
 model-engine-file=model_b1_gpu0_fp32.engine
 #int8-calib-file=calib.table
 ...
 network-mode=0
 ...
 ```
 To
 ```
 ...
 model-engine-file=model_b1_gpu0_int8.engine
 int8-calib-file=calib.table
 ...
 network-mode=1
 ...
 ```
 Run
 ```
 deepstream-app -c deepstream_app_config.txt
 ```
 Note: NVIDIA recommends at least 500 images to get good accuracy. In this example I used 1000 images to get better accuracy (more images = more accuracy). Higher INT8_CALIB_BATCH_SIZE values will increase the accuracy and calibration speed. Set it according to your GPU memory. This process can take a long time. The calibration isn't available for YOLOv5 models.
 ###
 ### mAP/FPS comparison between models
 <details><summary>Open</summary>
 ```
 valid = val2017 (COCO)
 NMS = 0.45 (changed to beta_nms when used in Darknet cfg file) / 0.6 (YOLOv5 models)
 pre-cluster-threshold = 0.001 (mAP eval) / 0.25 (FPS measurement)
 batch-size = 1
 FPS measurement display width = 1920
 FPS measurement display height = 1080
 NOTE: An NVIDIA GTX 1050 (4GB Mobile) was used for evaluation. maintain-aspect-ratio=1 was used in the config_infer file for YOLOv4 (with letter_box=1) and YOLOv5 models. For INT8 calibration, 1000 random images from val2017 (COCO) and INT8_CALIB_BATCH_SIZE=1 were used.
 ```
 | TensorRT        | Precision | Resolution | IoU=0.5:0.95 | IoU=0.5 | IoU=0.75 | FPS<br />(with display) | FPS<br />(without display) |
 |:---------------:|:---------:|:----------:|:------------:|:-------:|:--------:|:-----------------------:|:--------------------------:|
 | YOLOv5x 5.0     | FP32      | 640        | 0.        | 0.   | 0.    | .                    | .                       |
 | YOLOv5l 5.0     | FP32      | 640        | 0.        | 0.   | 0.    | .                   | .                      |
 | YOLOv5m 5.0     | FP32      | 640        | 0.        | 0.   | 0.    | .                   | .                      |
 | YOLOv5s 5.0     | FP32      | 640        | 0.        | 0.   | 0.    | .                   | .                      |
 | YOLOv5s 5.0     | FP32      | 416        | 0.        | 0.   | 0.    | .                   | .                      |
 | YOLOv4x-MISH    | FP32      | 640        | 0.461        | 0.649   | 0.499    | .                    | .                       |
 | YOLOv4x-MISH    | **INT8**  | 640        | 0.443        | 0.629   | 0.479    | .                    | .                       |
 | YOLOv4x-MISH    | FP32      | 608        | 0.461        | 0.650   | 0.496    | .                    | .                       |
 | YOLOv4-CSP      | FP32      | 640        | 0.443        | 0.632   | 0.477    | .                   | .                      |
 | YOLOv4-CSP      | FP32      | 608        | 0.443        | 0.632   | 0.477    | .                   | .                      |
 | YOLOv4-CSP      | FP32      | 512        | 0.437        | 0.625   | 0.471    | .                   | .                      |
 | YOLOv4-CSP      | **INT8**  | 512        | 0.414        | 0.601   | 0.447    | .                    | .                       |
 | YOLOv4          | FP32      | 640        | 0.492        | 0.729   | 0.547    | .                   | .                      |
 | YOLOv4          | FP32      | 608        | 0.499        | 0.739   | 0.551    | .                   | .                      |
 | YOLOv4          | **INT8**  | 608        | 0.483        | 0.728   | 0.534    | .                    | .                       |
 | YOLOv4          | FP32      | 512        | 0.492        | 0.730   | 0.542    | .                   | .                      |
 | YOLOv4          | FP32      | 416        | 0.468        | 0.702   | 0.507    | .                   | .                      |
 | YOLOv3-SPP      | FP32      | 608        | 0.412        | 0.687   | 0.434    | .                   | .                      |
 | YOLOv3          | FP32      | 608        | 0.378        | 0.674   | 0.389    | .                   | .                      |
 | YOLOv3          | **INT8**  | 608        | 0.381        | 0.677   | 0.388    | .                    | .                       |
 | YOLOv3          | FP32      | 416        | 0.373        | 0.669   | 0.379    | .                   | .                      |
 | YOLOv2          | FP32      | 608        | 0.211        | 0.365   | 0.220    | .                   | .                      |
 | YOLOv2          | FP32      | 416        | 0.207        | 0.362   | 0.211    | .                   | .                      |
 | YOLOv4-Tiny     | FP32      | 416        | 0.216        | 0.403   | 0.207    | .                  | .                     |
 | YOLOv4-Tiny     | **INT8**  | 416        | 0.203        | 0.385   | 0.192    | .                  | .                     |
 | YOLOv3-Tiny-PRN | FP32      | 416        | 0.168        | 0.381   | 0.126    | .                  | .                     |
 | YOLOv3-Tiny-PRN | **INT8**  | 416        | 0.155        | 0.358   | 0.113    | .                  | .                     |
 | YOLOv3-Tiny     | FP32      | 416        | 0.096        | 0.203   | 0.080    | .                  | .                     |
 | YOLOv2-Tiny     | FP32      | 416        | 0.084        | 0.194   | 0.062    | .                  | .                     |
 | YOLOv3-Lite     | FP32      | 416        | 0.169        | 0.356   | 0.137    | .                  | .                     |
 | YOLOv3-Lite     | FP32      | 320        | 0.158        | 0.328   | 0.132    | .                  | .                     |
 | YOLOv3-Nano     | FP32      | 416        | 0.128        | 0.278   | 0.099    | .                  | .                     |
 | YOLOv3-Nano     | FP32      | 320        | 0.122        | 0.260   | 0.099    | .                  | .                     |
 | YOLO-Fastest-XL | FP32      | 416        | 0.160        | 0.342   | 0.130    | .                  | .                     |
 | YOLO-Fastest-XL | FP32      | 320        | 0.158        | 0.329   | 0.135    | .                  | .                     |
 | YOLO-Fastest    | FP32      | 416        | 0.101        | 0.230   | 0.072    | .                  | .                     |
 | YOLO-Fastest    | FP32      | 320        | 0.102        | 0.232   | 0.073    | .                  | .                     |
 </details>
 ##