diff --git a/config_infer_primary_ppyoloe.txt b/config_infer_primary_ppyoloe.txt
new file mode 100644
index 0000000..4dfc950
--- /dev/null
+++ b/config_infer_primary_ppyoloe.txt
@@ -0,0 +1,25 @@
+[property]
+gpu-id=0
+net-scale-factor=0.0173520735727919486
+offsets=123.675;116.28;103.53
+model-color-format=0
+custom-network-config=ppyoloe_crn_s_400e_coco.cfg
+model-file=ppyoloe_crn_s_400e_coco.wts
+model-engine-file=model_b1_gpu0_fp32.engine
+#int8-calib-file=calib.table
+labelfile-path=labels.txt
+batch-size=1
+network-mode=0
+num-detected-classes=80
+interval=0
+gie-unique-id=1
+process-mode=1
+network-type=0
+cluster-mode=4
+maintain-aspect-ratio=0
+parse-bbox-func-name=NvDsInferParseYolo
+custom-lib-path=nvdsinfer_custom_impl_Yolo/libnvdsinfer_custom_impl_Yolo.so
+engine-create-func-name=NvDsInferYoloCudaEngineGet
+
+[class-attrs-all]
+pre-cluster-threshold=0
diff --git a/docs/PPYOLOE.md b/docs/PPYOLOE.md
new file mode 100644
index 0000000..9e97d0c
--- /dev/null
+++ b/docs/PPYOLOE.md
@@ -0,0 +1,115 @@
+# PP-YOLOE usage
+
+* [Convert model](#convert-model)
+* [Compile the lib](#compile-the-lib)
+* [Edit the config_infer_primary_ppyoloe file](#edit-the-config_infer_primary_ppyoloe-file)
+* [Edit the deepstream_app_config file](#edit-the-deepstream_app_config-file)
+* [Testing the model](#testing-the-model)
+
+##
+
+### Convert model
+
+#### 1. Download the PaddleDetection repo and install the requirements
+
+https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.4/docs/tutorials/INSTALL.md
+
+**NOTE**: It is recommended to use Python virtualenv.
+
+#### 2. Copy conversor
+
+Copy the `gen_wts_ppyoloe.py` file from the `DeepStream-Yolo/utils` directory to the `PaddleDetection` folder.
+
+#### 3. Download the model
+
+Download the `pdparams` file from [PP-YOLOE](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.4/configs/ppyoloe) releases (example for PP-YOLOE-s)
+
+```
+wget https://paddledet.bj.bcebos.com/models/ppyoloe_crn_s_400e_coco.pdparams
+```
+
+**NOTE**: You can use your custom model, but it is important to keep the YOLO model reference (`ppyoloe_`) in your `cfg` and `weights`/`wts` filenames to generate the engine correctly.
+
+#### 4. Convert model
+
+Generate the `cfg` and `wts` files (example for PP-YOLOE-s)
+
+```
+python3 gen_wts_ppyoloe.py -w ppyoloe_crn_s_400e_coco.pdparams -c configs/ppyoloe/ppyoloe_crn_s_400e_coco.yml
+```
+
+#### 5. Copy generated files
+
+Copy the generated `cfg` and `wts` files to the `DeepStream-Yolo` folder.
+
+##
+
+### Compile the lib
+
+Open the `DeepStream-Yolo` folder and compile the lib
+
+* DeepStream 6.1 on x86 platform
+
+  ```
+  CUDA_VER=11.6 make -C nvdsinfer_custom_impl_Yolo
+  ```
+
+* DeepStream 6.0.1 / 6.0 on x86 platform
+
+  ```
+  CUDA_VER=11.4 make -C nvdsinfer_custom_impl_Yolo
+  ```
+
+* DeepStream 6.1 on Jetson platform
+
+  ```
+  CUDA_VER=11.4 make -C nvdsinfer_custom_impl_Yolo
+  ```
+
+* DeepStream 6.0.1 / 6.0 on Jetson platform
+
+  ```
+  CUDA_VER=10.2 make -C nvdsinfer_custom_impl_Yolo
+  ```
+
+##
+
+### Edit the config_infer_primary_ppyoloe file
+
+Edit the `config_infer_primary_ppyoloe.txt` file according to your model (example for PP-YOLOE-s)
+
+```
+[property]
+...
+custom-network-config=ppyoloe_crn_s_400e_coco.cfg
+model-file=ppyoloe_crn_s_400e_coco.wts
+...
+```
+
+**NOTE**: PP-YOLOE normalizes the image during preprocessing, so it is important to set the `net-scale-factor` and `offsets` according to the values used in training.
+
+Default: `mean = 0.485, 0.456, 0.406` and `std = 0.229, 0.224, 0.225`
+
+```
+net-scale-factor=0.0173520735727919486
+offsets=123.675;116.28;103.53
+```
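+
+The values above come directly from those statistics: `offsets` is `mean * 255` per channel, and `net-scale-factor` approximates `1 / (std * 255)` with the three channel stds averaged to 0.226, since DeepStream applies a single scale factor to every channel. A quick sanity check of that arithmetic (plain Python, not part of this repo):
+
+```
+mean = [0.485, 0.456, 0.406]
+std = [0.229, 0.224, 0.225]
+avg_std = sum(std) / 3                       # 0.226
+print(1.0 / (avg_std * 255.0))               # ~0.0173520735727919486 -> net-scale-factor
+print(";".join(str(round(m * 255.0, 3)) for m in mean))  # 123.675;116.28;103.53 -> offsets
+```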
+
+##
+
+### Edit the deepstream_app_config file
+
+```
+...
+[primary-gie]
+...
+config-file=config_infer_primary_ppyoloe.txt
+```
+
+##
+
+### Testing the model
+
+```
+deepstream-app -c deepstream_app_config.txt
+```
diff --git a/docs/YOLOR.md b/docs/YOLOR.md
index e111a80..efa0037 100644
--- a/docs/YOLOR.md
+++ b/docs/YOLOR.md
@@ -22,7 +22,7 @@ cd yolor
 pip3 install -r requirements.txt
 ```
 
-**NOTE**: It is recommended to use a Python virtualenv.
+**NOTE**: It is recommended to use Python virtualenv.
 
 #### 2. Copy conversor
 
diff --git a/docs/YOLOv5.md b/docs/YOLOv5.md
index 7a5da06..44a5d40 100644
--- a/docs/YOLOv5.md
+++ b/docs/YOLOv5.md
@@ -22,7 +22,7 @@ cd yolov5
 pip3 install -r requirements.txt
 ```
 
-**NOTE**: It is recommended to use a Python virtualenv.
+**NOTE**: It is recommended to use Python virtualenv.
 
 #### 2. Copy conversor
 
diff --git a/nvdsinfer_custom_impl_Yolo/Makefile b/nvdsinfer_custom_impl_Yolo/Makefile
index c5bedbc..d71080a 100644
--- a/nvdsinfer_custom_impl_Yolo/Makefile
+++ b/nvdsinfer_custom_impl_Yolo/Makefile
@@ -59,15 +59,21 @@ SRCFILES:= nvdsinfer_yolo_engine.cpp \
           layers/shortcut_layer.cpp \
           layers/route_layer.cpp \
           layers/upsample_layer.cpp \
-          layers/maxpool_layer.cpp \
+          layers/pooling_layer.cpp \
           layers/activation_layer.cpp \
-          layers/reorgv5_layer.cpp \
+          layers/reorg_layer.cpp \
+          layers/reduce_layer.cpp \
+          layers/shuffle_layer.cpp \
+          layers/softmax_layer.cpp \
+          layers/cls_layer.cpp \
+          layers/reg_layer.cpp \
           utils.cpp \
           yolo.cpp \
           yoloForward.cu \
           yoloForward_v2.cu \
           yoloForward_nc.cu \
           yoloForward_r.cu \
+          yoloForward_e.cu \
           sortDetections.cu
 
 ifeq ($(OPENCV), 1)
diff --git a/nvdsinfer_custom_impl_Yolo/layers/activation_layer.cpp b/nvdsinfer_custom_impl_Yolo/layers/activation_layer.cpp
index 500789c..139b42f 100644
--- a/nvdsinfer_custom_impl_Yolo/layers/activation_layer.cpp
+++ b/nvdsinfer_custom_impl_Yolo/layers/activation_layer.cpp
@@ -5,114 +5,113 @@
 
 #include "activation_layer.h"
 
-nvinfer1::ILayer* activationLayer(
+nvinfer1::ITensor* activationLayer(
     int layerIdx,
     std::string activation,
-    nvinfer1::ILayer* output,
     nvinfer1::ITensor* input,
     nvinfer1::INetworkDefinition* network)
 {
+    nvinfer1::ITensor* output;
+
     if (activation == "linear")
     {
-        // Pass
+        output = input;
     }
     else if (activation == "relu")
     {
-        nvinfer1::IActivationLayer* relu = network->addActivation(
-            *input, nvinfer1::ActivationType::kRELU);
+        nvinfer1::IActivationLayer* relu = network->addActivation(*input, nvinfer1::ActivationType::kRELU);
         assert(relu != nullptr);
         std::string reluLayerName = "relu_" + std::to_string(layerIdx);
         relu->setName(reluLayerName.c_str());
-        output = relu;
+        output = relu->getOutput(0);
     }
     else if (activation == "sigmoid" || activation == "logistic")
     {
-        nvinfer1::IActivationLayer* sigmoid = network->addActivation(
-            *input, nvinfer1::ActivationType::kSIGMOID);
+        nvinfer1::IActivationLayer* sigmoid = network->addActivation(*input, nvinfer1::ActivationType::kSIGMOID);
         assert(sigmoid != nullptr);
         std::string sigmoidLayerName = "sigmoid_" + std::to_string(layerIdx);
         sigmoid->setName(sigmoidLayerName.c_str());
-        output = sigmoid;
+        output = sigmoid->getOutput(0);
     }
     else if (activation == "tanh")
    {
-        nvinfer1::IActivationLayer* tanh = network->addActivation(
-            *input, nvinfer1::ActivationType::kTANH);
+        nvinfer1::IActivationLayer* tanh = network->addActivation(*input, nvinfer1::ActivationType::kTANH);
         assert(tanh != nullptr);
         std::string tanhLayerName = "tanh_" + std::to_string(layerIdx);
         tanh->setName(tanhLayerName.c_str());
-        output = tanh;
+        output = tanh->getOutput(0);
     }
     else if (activation == "leaky")
     {
-        nvinfer1::IActivationLayer* leaky = network->addActivation(
-            *input, nvinfer1::ActivationType::kLEAKY_RELU);
+        nvinfer1::IActivationLayer* leaky = network->addActivation(*input, nvinfer1::ActivationType::kLEAKY_RELU);
         assert(leaky != nullptr);
-        leaky->setAlpha(0.1);
         std::string leakyLayerName = "leaky_" + std::to_string(layerIdx);
         leaky->setName(leakyLayerName.c_str());
-        output = leaky;
+        leaky->setAlpha(0.1);
+        output = leaky->getOutput(0);
     }
     else if (activation == "softplus")
     {
-        nvinfer1::IActivationLayer* softplus = network->addActivation(
-            *input, nvinfer1::ActivationType::kSOFTPLUS);
+        nvinfer1::IActivationLayer* softplus = network->addActivation(*input, nvinfer1::ActivationType::kSOFTPLUS);
         assert(softplus != nullptr);
         std::string softplusLayerName = "softplus_" + std::to_string(layerIdx);
         softplus->setName(softplusLayerName.c_str());
-        output = softplus;
+        output = softplus->getOutput(0);
     }
     else if (activation == "mish")
     {
-        nvinfer1::IActivationLayer* softplus = network->addActivation(
-            *input, nvinfer1::ActivationType::kSOFTPLUS);
+        nvinfer1::IActivationLayer* softplus = network->addActivation(*input, nvinfer1::ActivationType::kSOFTPLUS);
         assert(softplus != nullptr);
         std::string softplusLayerName = "softplus_" + std::to_string(layerIdx);
         softplus->setName(softplusLayerName.c_str());
-        nvinfer1::IActivationLayer* tanh = network->addActivation(
-            *softplus->getOutput(0), nvinfer1::ActivationType::kTANH);
+        nvinfer1::IActivationLayer* tanh = network->addActivation(*softplus->getOutput(0), nvinfer1::ActivationType::kTANH);
         assert(tanh != nullptr);
         std::string tanhLayerName = "tanh_" + std::to_string(layerIdx);
         tanh->setName(tanhLayerName.c_str());
-        nvinfer1::IElementWiseLayer* mish = network->addElementWise(
-            *input, *tanh->getOutput(0),
-            nvinfer1::ElementWiseOperation::kPROD);
+        nvinfer1::IElementWiseLayer* mish
+            = network->addElementWise(*input, *tanh->getOutput(0), nvinfer1::ElementWiseOperation::kPROD);
         assert(mish != nullptr);
         std::string mishLayerName = "mish_" + std::to_string(layerIdx);
         mish->setName(mishLayerName.c_str());
-        output = mish;
+        output = mish->getOutput(0);
     }
     else if (activation == "silu" || activation == "swish")
     {
-        nvinfer1::IActivationLayer* sigmoid = network->addActivation(
-            *input, nvinfer1::ActivationType::kSIGMOID);
+        nvinfer1::IActivationLayer* sigmoid = network->addActivation(*input, nvinfer1::ActivationType::kSIGMOID);
        assert(sigmoid != nullptr);
         std::string sigmoidLayerName = "sigmoid_" + std::to_string(layerIdx);
         sigmoid->setName(sigmoidLayerName.c_str());
-        nvinfer1::IElementWiseLayer* silu = network->addElementWise(
-            *input, *sigmoid->getOutput(0),
-            nvinfer1::ElementWiseOperation::kPROD);
+        nvinfer1::IElementWiseLayer* silu
+            = network->addElementWise(*input, *sigmoid->getOutput(0), nvinfer1::ElementWiseOperation::kPROD);
         assert(silu != nullptr);
         std::string siluLayerName = "silu_" + std::to_string(layerIdx);
         silu->setName(siluLayerName.c_str());
-        output = silu;
+        output = silu->getOutput(0);
+    }
+    else if (activation == "hardsigmoid")
+    {
+        nvinfer1::IActivationLayer* hardsigmoid = network->addActivation(*input, nvinfer1::ActivationType::kHARD_SIGMOID);
+        assert(hardsigmoid != nullptr);
+        std::string hardsigmoidLayerName = "hardsigmoid_" + std::to_string(layerIdx);
+        hardsigmoid->setName(hardsigmoidLayerName.c_str());
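+        // TensorRT's kHARD_SIGMOID computes max(0, min(1, alpha * x + beta)), so alpha = 1/6 and
+        // beta = 0.5 give the standard hard sigmoid x / 6 + 0.5 clipped to [0, 1].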
+        hardsigmoid->setAlpha(1.0 / 6.0);
+        hardsigmoid->setBeta(0.5);
+        output = hardsigmoid->getOutput(0);
+    }
     else if (activation == "hardswish")
     {
-        nvinfer1::IActivationLayer* hard_sigmoid = network->addActivation(
-            *input, nvinfer1::ActivationType::kHARD_SIGMOID);
-        assert(hard_sigmoid != nullptr);
-        hard_sigmoid->setAlpha(1.0 / 6.0);
-        hard_sigmoid->setBeta(0.5);
-        std::string hardSigmoidLayerName = "hard_sigmoid_" + std::to_string(layerIdx);
-        hard_sigmoid->setName(hardSigmoidLayerName.c_str());
-        nvinfer1::IElementWiseLayer* hard_swish = network->addElementWise(
-            *input, *hard_sigmoid->getOutput(0),
-            nvinfer1::ElementWiseOperation::kPROD);
-        assert(hard_swish != nullptr);
-        std::string hardSwishLayerName = "hard_swish_" + std::to_string(layerIdx);
-        hard_swish->setName(hardSwishLayerName.c_str());
-        output = hard_swish;
+        nvinfer1::IActivationLayer* hardsigmoid = network->addActivation(*input, nvinfer1::ActivationType::kHARD_SIGMOID);
+        assert(hardsigmoid != nullptr);
+        std::string hardsigmoidLayerName = "hardsigmoid_" + std::to_string(layerIdx);
+        hardsigmoid->setName(hardsigmoidLayerName.c_str());
+        hardsigmoid->setAlpha(1.0 / 6.0);
+        hardsigmoid->setBeta(0.5);
+        nvinfer1::IElementWiseLayer* hardswish
+            = network->addElementWise(*input, *hardsigmoid->getOutput(0), nvinfer1::ElementWiseOperation::kPROD);
+        assert(hardswish != nullptr);
+        std::string hardswishLayerName = "hardswish_" + std::to_string(layerIdx);
+        hardswish->setName(hardswishLayerName.c_str());
+        output = hardswish->getOutput(0);
     }
     else
     {
diff --git a/nvdsinfer_custom_impl_Yolo/layers/activation_layer.h b/nvdsinfer_custom_impl_Yolo/layers/activation_layer.h
index 176a5c2..c5151ac 100644
--- a/nvdsinfer_custom_impl_Yolo/layers/activation_layer.h
+++ b/nvdsinfer_custom_impl_Yolo/layers/activation_layer.h
@@ -6,18 +6,14 @@
 #ifndef __ACTIVATION_LAYER_H__
 #define __ACTIVATION_LAYER_H__
 
-#include <iostream>
 #include <string>
 #include <cassert>
 
 #include "NvInfer.h"
 
-#include "activation_layer.h"
-
-nvinfer1::ILayer* activationLayer(
+nvinfer1::ITensor* activationLayer(
     int layerIdx,
     std::string activation,
-    nvinfer1::ILayer* output,
     nvinfer1::ITensor* input,
     nvinfer1::INetworkDefinition* network);
 
diff --git a/nvdsinfer_custom_impl_Yolo/layers/batchnorm_layer.cpp b/nvdsinfer_custom_impl_Yolo/layers/batchnorm_layer.cpp
index 063f3d1..e6828e7 100644
--- a/nvdsinfer_custom_impl_Yolo/layers/batchnorm_layer.cpp
+++ b/nvdsinfer_custom_impl_Yolo/layers/batchnorm_layer.cpp
@@ -6,7 +6,7 @@
 #include <cmath>
 #include "batchnorm_layer.h"
 
-nvinfer1::ILayer* batchnormLayer(
+nvinfer1::ITensor* batchnormLayer(
     int layerIdx,
     std::map<std::string, std::string>& block,
     std::vector<float>& weights,
@@ -17,6 +17,8 @@ nvinfer1::ILayer* batchnormLayer(
     nvinfer1::ITensor* input,
     nvinfer1::INetworkDefinition* network)
 {
+    nvinfer1::ITensor* output;
+
     assert(block.at("type") == "batchnorm");
     assert(block.find("filters") != block.end());
@@ -28,7 +30,8 @@ nvinfer1::ILayer* batchnormLayer(
     std::vector<float> bnRunningMean;
     std::vector<float> bnRunningVar;
 
-    if (weightsType == "weights") {
+    if (weightsType == "weights")
+    {
         for (int i = 0; i < filters; ++i)
         {
             bnBiases.push_back(weights[weightPtr]);
             weightPtr++;
         }
@@ -50,7 +53,8 @@ nvinfer1::ILayer* batchnormLayer(
             weightPtr++;
         }
     }
-    else {
+    else
+    {
         for (int i = 0; i < filters; ++i)
         {
             bnWeights.push_back(weights[weightPtr]);
             weightPtr++;
         }
@@ -79,35 +83,27 @@ nvinfer1::ILayer* batchnormLayer(
     nvinfer1::Weights power{nvinfer1::DataType::kFLOAT, nullptr, size};
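+    // Fold batchnorm into a single IScaleLayer: out = (in * scale + shift)^power, where
+    // scale = gamma / sqrt(var + eps) and shift = beta - mean * scale; bnRunningVar is assumed
+    // to already hold sqrt(var + eps) from the weight-loading step above.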
     float* shiftWt = new float[size];
     for (int i = 0; i < size; ++i)
-    {
-        shiftWt[i]
-            = bnBiases.at(i) - ((bnRunningMean.at(i) * bnWeights.at(i)) / bnRunningVar.at(i));
-    }
+        shiftWt[i] = bnBiases.at(i) - ((bnRunningMean.at(i) * bnWeights.at(i)) / bnRunningVar.at(i));
     shift.values = shiftWt;
 
     float* scaleWt = new float[size];
     for (int i = 0; i < size; ++i)
-    {
         scaleWt[i] = bnWeights.at(i) / bnRunningVar[i];
-    }
     scale.values = scaleWt;
 
     float* powerWt = new float[size];
     for (int i = 0; i < size; ++i)
-    {
         powerWt[i] = 1.0;
-    }
     power.values = powerWt;
 
     trtWeights.push_back(shift);
     trtWeights.push_back(scale);
     trtWeights.push_back(power);
 
-    nvinfer1::IScaleLayer* bn = network->addScale(
-        *input, nvinfer1::ScaleMode::kCHANNEL, shift, scale, power);
-    assert(bn != nullptr);
-    std::string bnLayerName = "batch_norm_" + std::to_string(layerIdx);
-    bn->setName(bnLayerName.c_str());
-    nvinfer1::ILayer* output = bn;
+    nvinfer1::IScaleLayer* batchnorm = network->addScale(*input, nvinfer1::ScaleMode::kCHANNEL, shift, scale, power);
+    assert(batchnorm != nullptr);
+    std::string batchnormLayerName = "batchnorm_" + std::to_string(layerIdx);
+    batchnorm->setName(batchnormLayerName.c_str());
+    output = batchnorm->getOutput(0);
 
-    output = activationLayer(layerIdx, activation, output, output->getOutput(0), network);
+    output = activationLayer(layerIdx, activation, output, network);
     assert(output != nullptr);
 
     return output;
diff --git a/nvdsinfer_custom_impl_Yolo/layers/batchnorm_layer.h b/nvdsinfer_custom_impl_Yolo/layers/batchnorm_layer.h
index 514b456..078b7f1 100644
--- a/nvdsinfer_custom_impl_Yolo/layers/batchnorm_layer.h
+++ b/nvdsinfer_custom_impl_Yolo/layers/batchnorm_layer.h
@@ -13,7 +13,7 @@
 
 #include "activation_layer.h"
 
-nvinfer1::ILayer* batchnormLayer(
+nvinfer1::ITensor* batchnormLayer(
     int layerIdx,
     std::map<std::string, std::string>& block,
     std::vector<float>& weights,
diff --git a/nvdsinfer_custom_impl_Yolo/layers/channels_layer.cpp b/nvdsinfer_custom_impl_Yolo/layers/channels_layer.cpp
index 42d17a1..69e183a 100644
--- a/nvdsinfer_custom_impl_Yolo/layers/channels_layer.cpp
+++ b/nvdsinfer_custom_impl_Yolo/layers/channels_layer.cpp
@@ -5,27 +5,32 @@
 
 #include "channels_layer.h"
 
-nvinfer1::ILayer* channelsLayer(
-    std::string type,
+nvinfer1::ITensor* channelsLayer(
+    int layerIdx,
+    std::map<std::string, std::string>& block,
     nvinfer1::ITensor* input,
     nvinfer1::ITensor* implicitTensor,
     nvinfer1::INetworkDefinition* network)
 {
-    nvinfer1::ILayer* output;
+    nvinfer1::ITensor* output;
 
-    if (type == "shift") {
-        nvinfer1::IElementWiseLayer* ew = network->addElementWise(
-            *input, *implicitTensor,
-            nvinfer1::ElementWiseOperation::kSUM);
-        assert(ew != nullptr);
-        output = ew;
+    assert(block.at("type") == "shift_channels" || block.at("type") == "control_channels");
+
+    if (block.at("type") == "shift_channels") {
+        nvinfer1::IElementWiseLayer* shift
+            = network->addElementWise(*input, *implicitTensor, nvinfer1::ElementWiseOperation::kSUM);
+        assert(shift != nullptr);
+        std::string shiftLayerName = "shift_channels_" + std::to_string(layerIdx);
+        shift->setName(shiftLayerName.c_str());
+        output = shift->getOutput(0);
     }
-    else if (type == "control") {
-        nvinfer1::IElementWiseLayer* ew = network->addElementWise(
-            *input, *implicitTensor,
-            nvinfer1::ElementWiseOperation::kPROD);
-        assert(ew != nullptr);
-        output = ew;
+    else if (block.at("type") == "control_channels") {
+        nvinfer1::IElementWiseLayer* control
+            = network->addElementWise(*input, *implicitTensor, nvinfer1::ElementWiseOperation::kPROD);
+        assert(control != nullptr);
+        std::string controlLayerName = "control_channels_" + std::to_string(layerIdx);
+        control->setName(controlLayerName.c_str());
+        output = control->getOutput(0);
     }
 
     return output;
diff --git a/nvdsinfer_custom_impl_Yolo/layers/channels_layer.h b/nvdsinfer_custom_impl_Yolo/layers/channels_layer.h
index b22f6b6..4db704c 100644
--- a/nvdsinfer_custom_impl_Yolo/layers/channels_layer.h
+++ b/nvdsinfer_custom_impl_Yolo/layers/channels_layer.h
@@ -11,8 +11,9 @@
 
 #include "NvInfer.h"
 
-nvinfer1::ILayer* channelsLayer(
-    std::string type,
+nvinfer1::ITensor* channelsLayer(
+    int layerIdx,
+    std::map<std::string, std::string>& block,
     nvinfer1::ITensor* input,
     nvinfer1::ITensor* implicitTensor,
     nvinfer1::INetworkDefinition* network);
diff --git a/nvdsinfer_custom_impl_Yolo/layers/cls_layer.cpp b/nvdsinfer_custom_impl_Yolo/layers/cls_layer.cpp
new file mode 100644
index 0000000..c8eed52
--- /dev/null
+++ b/nvdsinfer_custom_impl_Yolo/layers/cls_layer.cpp
@@ -0,0 +1,29 @@
+/*
+ * Created by Marcos Luciano
+ * https://www.github.com/marcoslucianops
+ */
+
+#include "cls_layer.h"
+
+nvinfer1::ITensor* clsLayer(
+    int layerIdx,
+    std::map<std::string, std::string>& block,
+    nvinfer1::ITensor* input,
+    nvinfer1::INetworkDefinition* network)
+{
+    nvinfer1::ITensor* output;
+
+    assert(block.at("type") == "cls");
+
+    nvinfer1::IShuffleLayer* shuffle = network->addShuffle(*input);
+    assert(shuffle != nullptr);
+    std::string shuffleLayerName = "shuffle_" + std::to_string(layerIdx);
+    shuffle->setName(shuffleLayerName.c_str());
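+    // cls head: swap the first two axes so the class scores come out as (anchors, classes)
+    // rather than (classes, anchors), the layout the bbox parser reads (assumed from the
+    // PP-YOLOE head export).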
+    nvinfer1::Permutation permutation;
+    permutation.order[0] = 1;
+    permutation.order[1] = 0;
+    shuffle->setFirstTranspose(permutation);
+    output = shuffle->getOutput(0);
+
+    return output;
+}
diff --git a/nvdsinfer_custom_impl_Yolo/layers/maxpool_layer.h b/nvdsinfer_custom_impl_Yolo/layers/cls_layer.h
similarity index 76%
rename from nvdsinfer_custom_impl_Yolo/layers/maxpool_layer.h
rename to nvdsinfer_custom_impl_Yolo/layers/cls_layer.h
index a0d139e..cca342b 100644
--- a/nvdsinfer_custom_impl_Yolo/layers/maxpool_layer.h
+++ b/nvdsinfer_custom_impl_Yolo/layers/cls_layer.h
@@ -3,15 +3,15 @@
  * https://www.github.com/marcoslucianops
  */
 
-#ifndef __MAXPOOL_LAYER_H__
-#define __MAXPOOL_LAYER_H__
+#ifndef __CLS_LAYER_H__
+#define __CLS_LAYER_H__
 
 #include <map>
 #include <string>
 
 #include "NvInfer.h"
 
-nvinfer1::ILayer* maxpoolLayer(
+nvinfer1::ITensor* clsLayer(
     int layerIdx,
     std::map<std::string, std::string>& block,
     nvinfer1::ITensor* input,
diff --git a/nvdsinfer_custom_impl_Yolo/layers/convolutional_layer.cpp b/nvdsinfer_custom_impl_Yolo/layers/convolutional_layer.cpp
index 6b502ad..be85379 100644
--- a/nvdsinfer_custom_impl_Yolo/layers/convolutional_layer.cpp
+++ b/nvdsinfer_custom_impl_Yolo/layers/convolutional_layer.cpp
@@ -6,7 +6,7 @@
 #include <cmath>
 #include "convolutional_layer.h"
 
-nvinfer1::ILayer* convolutionalLayer(
+nvinfer1::ITensor* convolutionalLayer(
     int layerIdx,
     std::map<std::string, std::string>& block,
     std::vector<float>& weights,
@@ -18,6 +18,8 @@ nvinfer1::ILayer* convolutionalLayer(
     nvinfer1::ITensor* input,
     nvinfer1::INetworkDefinition* network)
 {
+    nvinfer1::ITensor* output;
+
     assert(block.at("type") == "convolutional");
     assert(block.find("filters") != block.end());
     assert(block.find("pad") != block.end());
@@ -40,14 +42,10 @@ nvinfer1::ILayer* convolutionalLayer(
     int groups = 1;
     if (block.find("groups") != block.end())
-    {
         groups = std::stoi(block.at("groups"));
-    }
 
     if (block.find("bias") != block.end())
-    {
         bias = std::stoi(block.at("bias"));
-    }
 
     int pad;
     if (padding)
@@ -63,7 +61,8 @@ nvinfer1::ILayer* convolutionalLayer(
     nvinfer1::Weights convWt{nvinfer1::DataType::kFLOAT, nullptr, size};
     nvinfer1::Weights convBias{nvinfer1::DataType::kFLOAT, nullptr, bias};
 
-    if (weightsType == "weights") {
+    if (weightsType == "weights")
+    {
         if (batchNormalize == false)
         {
             float* val;
@@ -120,7 +119,8 @@ nvinfer1::ILayer* convolutionalLayer(
             trtWeights.push_back(convBias);
         }
     }
-    else {
+    else
+    {
         if (batchNormalize == false)
         {
             float* val = new float[size];
@@ -177,20 +177,18 @@ nvinfer1::ILayer* convolutionalLayer(
         }
     }
 
-    nvinfer1::IConvolutionLayer* conv = network->addConvolutionNd(
-        *input, filters, nvinfer1::DimsHW{kernelSize, kernelSize}, convWt, convBias);
+    nvinfer1::IConvolutionLayer* conv
+        = network->addConvolutionNd(*input, filters, nvinfer1::Dims{2, {kernelSize, kernelSize}}, convWt, convBias);
     assert(conv != nullptr);
     std::string convLayerName = "conv_" + std::to_string(layerIdx);
     conv->setName(convLayerName.c_str());
-    conv->setStrideNd(nvinfer1::DimsHW{stride, stride});
-    conv->setPaddingNd(nvinfer1::DimsHW{pad, pad});
+    conv->setStrideNd(nvinfer1::Dims{2, {stride, stride}});
+    conv->setPaddingNd(nvinfer1::Dims{2, {pad, pad}});
 
     if (block.find("groups") != block.end())
-    {
         conv->setNbGroups(groups);
-    }
 
-    nvinfer1::ILayer* output = conv;
+    output = conv->getOutput(0);
 
     if (batchNormalize == true)
     {
@@ -200,36 +198,28 @@ nvinfer1::ILayer* convolutionalLayer(
         nvinfer1::Weights power{nvinfer1::DataType::kFLOAT, nullptr, size};
         float* shiftWt = new float[size];
         for (int i = 0; i < size; ++i)
-        {
-            shiftWt[i]
-                = bnBiases.at(i) - ((bnRunningMean.at(i) * bnWeights.at(i)) / bnRunningVar.at(i));
-        }
+            shiftWt[i] = bnBiases.at(i) - ((bnRunningMean.at(i) * bnWeights.at(i)) / bnRunningVar.at(i));
         shift.values = shiftWt;
 
         float* scaleWt = new float[size];
         for (int i = 0; i < size; ++i)
-        {
             scaleWt[i] = bnWeights.at(i) / bnRunningVar[i];
-        }
         scale.values = scaleWt;
 
         float* powerWt = new float[size];
         for (int i = 0; i < size; ++i)
-        {
             powerWt[i] = 1.0;
-        }
         power.values = powerWt;
 
         trtWeights.push_back(shift);
         trtWeights.push_back(scale);
         trtWeights.push_back(power);
 
-        nvinfer1::IScaleLayer* bn = network->addScale(
-            *output->getOutput(0), nvinfer1::ScaleMode::kCHANNEL, shift, scale, power);
-        assert(bn != nullptr);
-        std::string bnLayerName = "batch_norm_" + std::to_string(layerIdx);
-        bn->setName(bnLayerName.c_str());
-        output = bn;
+        nvinfer1::IScaleLayer* batchnorm = network->addScale(*output, nvinfer1::ScaleMode::kCHANNEL, shift, scale, power);
+        assert(batchnorm != nullptr);
+        std::string batchnormLayerName = "batchnorm_" + std::to_string(layerIdx);
+        batchnorm->setName(batchnormLayerName.c_str());
+        output = batchnorm->getOutput(0);
     }
 
-    output = activationLayer(layerIdx, activation, output, output->getOutput(0), network);
+    output = activationLayer(layerIdx, activation, output, network);
     assert(output != nullptr);
 
     return output;
diff --git a/nvdsinfer_custom_impl_Yolo/layers/convolutional_layer.h b/nvdsinfer_custom_impl_Yolo/layers/convolutional_layer.h
index ba077cb..8df166a 100644
--- a/nvdsinfer_custom_impl_Yolo/layers/convolutional_layer.h
+++ b/nvdsinfer_custom_impl_Yolo/layers/convolutional_layer.h
@@ -13,7 +13,7 @@
 
 #include "activation_layer.h"
 
-nvinfer1::ILayer* convolutionalLayer(
+nvinfer1::ITensor* convolutionalLayer(
     int layerIdx,
     std::map<std::string, std::string>& block,
     std::vector<float>& weights,
diff --git a/nvdsinfer_custom_impl_Yolo/layers/implicit_layer.cpp b/nvdsinfer_custom_impl_Yolo/layers/implicit_layer.cpp
index 0d7e044..25ce603 100644
--- a/nvdsinfer_custom_impl_Yolo/layers/implicit_layer.cpp
+++ b/nvdsinfer_custom_impl_Yolo/layers/implicit_layer.cpp
@@ -5,17 +5,25 @@
 
 #include "implicit_layer.h"
 
-nvinfer1::ILayer* implicitLayer(
-    int channels,
+nvinfer1::ITensor* implicitLayer(
+    int layerIdx,
+    std::map<std::string, std::string>& block,
     std::vector<float>& weights,
     std::vector<nvinfer1::Weights>& trtWeights,
     int& weightPtr,
     nvinfer1::INetworkDefinition* network)
 {
-    nvinfer1::Weights convWt{nvinfer1::DataType::kFLOAT, nullptr, channels};
+    nvinfer1::ITensor* output;
 
-    float* val = new float[channels];
-    for (int i = 0; i < channels; ++i)
+    assert(block.at("type") == "implicit_add" || block.at("type") == "implicit_mul");
+    assert(block.find("filters") != block.end());
+
+    int filters = std::stoi(block.at("filters"));
+
+    nvinfer1::Weights convWt{nvinfer1::DataType::kFLOAT, nullptr, filters};
+
+    float* val = new float[filters];
+    for (int i = 0; i < filters; ++i)
     {
         val[i] = weights[weightPtr];
         weightPtr++;
@@ -23,8 +31,11 @@ nvinfer1::ILayer* implicitLayer(
     convWt.values = val;
     trtWeights.push_back(convWt);
 
-    nvinfer1::IConstantLayer* implicit = network->addConstant(nvinfer1::Dims3{static_cast<int>(channels), 1, 1}, convWt);
+    nvinfer1::IConstantLayer* implicit = network->addConstant(nvinfer1::Dims{3, {filters, 1, 1}}, convWt);
     assert(implicit != nullptr);
+    std::string implicitLayerName = block.at("type") + "_" + std::to_string(layerIdx);
+    implicit->setName(implicitLayerName.c_str());
+    output = implicit->getOutput(0);
 
-    return implicit;
+    return output;
 }
diff --git a/nvdsinfer_custom_impl_Yolo/layers/implicit_layer.h b/nvdsinfer_custom_impl_Yolo/layers/implicit_layer.h
index e34d738..a4611c9 100644
--- a/nvdsinfer_custom_impl_Yolo/layers/implicit_layer.h
+++ b/nvdsinfer_custom_impl_Yolo/layers/implicit_layer.h
@@ -12,8 +12,9 @@
 
 #include "NvInfer.h"
 
-nvinfer1::ILayer* implicitLayer(
-    int channels,
+nvinfer1::ITensor* implicitLayer(
+    int layerIdx,
+    std::map<std::string, std::string>& block,
     std::vector<float>& weights,
     std::vector<nvinfer1::Weights>& trtWeights,
     int& weightPtr,
diff --git a/nvdsinfer_custom_impl_Yolo/layers/maxpool_layer.cpp b/nvdsinfer_custom_impl_Yolo/layers/maxpool_layer.cpp
deleted file mode 100644
index 51caab9..0000000
--- a/nvdsinfer_custom_impl_Yolo/layers/maxpool_layer.cpp
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * Created by Marcos Luciano
- * https://www.github.com/marcoslucianops
- */
-
-#include "maxpool_layer.h"
-
-nvinfer1::ILayer* maxpoolLayer(
-    int layerIdx,
-    std::map<std::string, std::string>& block,
-    nvinfer1::ITensor* input,
-    nvinfer1::INetworkDefinition* network)
-{
-    assert(block.at("type") == "maxpool");
-    assert(block.find("size") != block.end());
-    assert(block.find("stride") != block.end());
-
-    int size = std::stoi(block.at("size"));
-    int stride = std::stoi(block.at("stride"));
-
-    nvinfer1::IPoolingLayer* pool
-        = network->addPoolingNd(*input, nvinfer1::PoolingType::kMAX, nvinfer1::Dims{2, {size, size}});
-    assert(pool);
-    std::string maxpoolLayerName = "maxpool_" + std::to_string(layerIdx);
-    pool->setStrideNd(nvinfer1::Dims{2, {stride, stride}});
-    pool->setPaddingNd(nvinfer1::Dims{2, {(size - 1) / 2, (size - 1) / 2}});
-    if (size == 2 && stride == 1)
-    {
-        pool->setPrePadding(nvinfer1::Dims{2, {0, 0}});
-        pool->setPostPadding(nvinfer1::Dims{2, {1, 1}});
-    }
-    pool->setName(maxpoolLayerName.c_str());
-
-    return pool;
-}
diff --git a/nvdsinfer_custom_impl_Yolo/layers/pooling_layer.cpp b/nvdsinfer_custom_impl_Yolo/layers/pooling_layer.cpp
new file mode 100644
index 0000000..9f4d59c
--- /dev/null
+++ b/nvdsinfer_custom_impl_Yolo/layers/pooling_layer.cpp
@@ -0,0 +1,57 @@
+/*
+ * Created by Marcos Luciano
+ * https://www.github.com/marcoslucianops
+ */
+
+#include "pooling_layer.h"
+
+nvinfer1::ITensor* poolingLayer(
+    int layerIdx,
+    std::map<std::string, std::string>& block,
+    nvinfer1::ITensor* input,
+    nvinfer1::INetworkDefinition* network)
+{
+    nvinfer1::ITensor* output;
+
+    assert(block.at("type") == "maxpool" || block.at("type") == "avgpool");
+
+    if (block.at("type") == "maxpool")
+    {
+        assert(block.find("size") != block.end());
+        assert(block.find("stride") != block.end());
+
+        int size = std::stoi(block.at("size"));
+        int stride = std::stoi(block.at("stride"));
+
+        nvinfer1::IPoolingLayer* maxpool
+            = network->addPoolingNd(*input, nvinfer1::PoolingType::kMAX, nvinfer1::Dims{2, {size, size}});
+        assert(maxpool != nullptr);
+        std::string maxpoolLayerName = "maxpool_" + std::to_string(layerIdx);
+        maxpool->setName(maxpoolLayerName.c_str());
+        maxpool->setStrideNd(nvinfer1::Dims{2, {stride, stride}});
+        maxpool->setPaddingNd(nvinfer1::Dims{2, {(size - 1) / 2, (size - 1) / 2}});
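+        // A size-2, stride-1 maxpool (as in yolov4-tiny) would shrink H and W with the default
+        // symmetric padding of (size - 1) / 2 = 0, so pad one pixel on the bottom/right only,
+        // keeping the output the same size as the input.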
(block.at("type") == "maxpool") + { + assert(block.find("size") != block.end()); + assert(block.find("stride") != block.end()); + + int size = std::stoi(block.at("size")); + int stride = std::stoi(block.at("stride")); + + nvinfer1::IPoolingLayer* maxpool + = network->addPoolingNd(*input, nvinfer1::PoolingType::kMAX, nvinfer1::Dims{2, {size, size}}); + assert(maxpool != nullptr); + std::string maxpoolLayerName = "maxpool_" + std::to_string(layerIdx); + maxpool->setName(maxpoolLayerName.c_str()); + maxpool->setStrideNd(nvinfer1::Dims{2, {stride, stride}}); + maxpool->setPaddingNd(nvinfer1::Dims{2, {(size - 1) / 2, (size - 1) / 2}}); + if (size == 2 && stride == 1) + { + maxpool->setPrePadding(nvinfer1::Dims{2, {0, 0}}); + maxpool->setPostPadding(nvinfer1::Dims{2, {1, 1}}); + } + output = maxpool->getOutput(0); + } + else if (block.at("type") == "avgpool") + { + nvinfer1::Dims inputDims = input->getDimensions(); + nvinfer1::IPoolingLayer* avgpool = network->addPoolingNd( + *input, nvinfer1::PoolingType::kAVERAGE, nvinfer1::Dims{2, {inputDims.d[1], inputDims.d[2]}}); + assert(avgpool != nullptr); + std::string avgpoolLayerName = "avgpool_" + std::to_string(layerIdx); + avgpool->setName(avgpoolLayerName.c_str()); + output = avgpool->getOutput(0); + } + else + { + std::cerr << "Pooling not supported: " << block.at("type") << std::endl; + std::abort(); + } + + return output; +} diff --git a/nvdsinfer_custom_impl_Yolo/layers/pooling_layer.h b/nvdsinfer_custom_impl_Yolo/layers/pooling_layer.h new file mode 100644 index 0000000..0e97f24 --- /dev/null +++ b/nvdsinfer_custom_impl_Yolo/layers/pooling_layer.h @@ -0,0 +1,21 @@ +/* + * Created by Marcos Luciano + * https://www.github.com/marcoslucianops + */ + +#ifndef __POOLING_LAYER_H__ +#define __POOLING_LAYER_H__ + +#include +#include +#include + +#include "NvInfer.h" + +nvinfer1::ITensor* poolingLayer( + int layerIdx, + std::map& block, + nvinfer1::ITensor* input, + nvinfer1::INetworkDefinition* network); + +#endif diff --git a/nvdsinfer_custom_impl_Yolo/layers/reduce_layer.cpp b/nvdsinfer_custom_impl_Yolo/layers/reduce_layer.cpp new file mode 100644 index 0000000..716848b --- /dev/null +++ b/nvdsinfer_custom_impl_Yolo/layers/reduce_layer.cpp @@ -0,0 +1,58 @@ +/* + * Created by Marcos Luciano + * https://www.github.com/marcoslucianops + */ + +#include "reduce_layer.h" + +nvinfer1::ITensor* reduceLayer( + int layerIdx, + std::map& block, + nvinfer1::ITensor* input, + nvinfer1::INetworkDefinition* network) +{ + nvinfer1::ITensor* output; + + assert(block.at("type") == "reduce"); + assert(block.find("mode") != block.end()); + assert(block.find("axes") != block.end()); + + std::string mode = block.at("mode"); + + nvinfer1::ReduceOperation operation; + if (mode == "mean") + operation = nvinfer1::ReduceOperation::kAVG; + + std::string strAxes = block.at("axes"); + std::vector axes; + size_t lastPos = 0, pos = 0; + while ((pos = strAxes.find(',', lastPos)) != std::string::npos) + { + int vL = std::stoi(trim(strAxes.substr(lastPos, pos - lastPos))); + axes.push_back(vL); + lastPos = pos + 1; + } + if (lastPos < strAxes.length()) + { + std::string lastV = trim(strAxes.substr(lastPos)); + if (!lastV.empty()) + axes.push_back(std::stoi(lastV)); + } + assert(!axes.empty()); + + uint32_t axisMask = 0; + for (int axis : axes) + axisMask |= 1 << axis; + + bool keepDims = false; + if (block.find("keep") != block.end()) + keepDims = std::stoi(block.at("keep")) == 1 ? 
+
+    bool keepDims = false;
+    if (block.find("keep") != block.end())
+        keepDims = std::stoi(block.at("keep")) == 1 ? true : false;
+
+    nvinfer1::IReduceLayer* reduce = network->addReduce(*input, operation, axisMask, keepDims);
+    assert(reduce != nullptr);
+    std::string reduceLayerName = "reduce_" + std::to_string(layerIdx);
+    reduce->setName(reduceLayerName.c_str());
+    output = reduce->getOutput(0);
+
+    return output;
+}
diff --git a/nvdsinfer_custom_impl_Yolo/layers/reduce_layer.h b/nvdsinfer_custom_impl_Yolo/layers/reduce_layer.h
new file mode 100644
index 0000000..c8330a1
--- /dev/null
+++ b/nvdsinfer_custom_impl_Yolo/layers/reduce_layer.h
@@ -0,0 +1,18 @@
+/*
+ * Created by Marcos Luciano
+ * https://www.github.com/marcoslucianops
+ */
+
+#ifndef __REDUCE_LAYER_H__
+#define __REDUCE_LAYER_H__
+
+#include "NvInfer.h"
+#include "../utils.h"
+
+nvinfer1::ITensor* reduceLayer(
+    int layerIdx,
+    std::map<std::string, std::string>& block,
+    nvinfer1::ITensor* input,
+    nvinfer1::INetworkDefinition* network);
+
+#endif
diff --git a/nvdsinfer_custom_impl_Yolo/layers/reg_layer.cpp b/nvdsinfer_custom_impl_Yolo/layers/reg_layer.cpp
new file mode 100644
index 0000000..ea9be07
--- /dev/null
+++ b/nvdsinfer_custom_impl_Yolo/layers/reg_layer.cpp
@@ -0,0 +1,113 @@
+/*
+ * Created by Marcos Luciano
+ * https://www.github.com/marcoslucianops
+ */
+
+#include "reg_layer.h"
+
+nvinfer1::ITensor* regLayer(
+    int layerIdx,
+    std::map<std::string, std::string>& block,
+    std::vector<float>& weights,
+    std::vector<nvinfer1::Weights>& trtWeights,
+    int& weightPtr,
+    nvinfer1::ITensor* input,
+    nvinfer1::INetworkDefinition* network)
+{
+    nvinfer1::ITensor* output;
+
+    assert(block.at("type") == "reg");
+
+    nvinfer1::IShuffleLayer* shuffle = network->addShuffle(*input);
+    assert(shuffle != nullptr);
+    std::string shuffleLayerName = "shuffle_" + std::to_string(layerIdx);
+    shuffle->setName(shuffleLayerName.c_str());
+    nvinfer1::Permutation permutation;
+    permutation.order[0] = 1;
+    permutation.order[1] = 0;
+    shuffle->setFirstTranspose(permutation);
+    output = shuffle->getOutput(0);
+    nvinfer1::Dims shuffleDims = output->getDimensions();
+
+    nvinfer1::ISliceLayer* sliceLt = network->addSlice(
+        *output, nvinfer1::Dims{2, {0, 0}}, nvinfer1::Dims{2, {shuffleDims.d[0], 2}}, nvinfer1::Dims{2, {1, 1}});
+    assert(sliceLt != nullptr);
+    std::string sliceLtLayerName = "slice_lt_" + std::to_string(layerIdx);
+    sliceLt->setName(sliceLtLayerName.c_str());
+    nvinfer1::ITensor* lt = sliceLt->getOutput(0);
+
+    nvinfer1::ISliceLayer* sliceRb = network->addSlice(
+        *output, nvinfer1::Dims{2, {0, 2}}, nvinfer1::Dims{2, {shuffleDims.d[0], 2}}, nvinfer1::Dims{2, {1, 1}});
+    assert(sliceRb != nullptr);
+    std::string sliceRbLayerName = "slice_rb_" + std::to_string(layerIdx);
+    sliceRb->setName(sliceRbLayerName.c_str());
+    nvinfer1::ITensor* rb = sliceRb->getOutput(0);
+
+    int channels = shuffleDims.d[0] * 2;
+    nvinfer1::Weights anchorPointsWt{nvinfer1::DataType::kFLOAT, nullptr, channels};
+    float* val = new float[channels];
+    for (int i = 0; i < channels; ++i)
+    {
+        val[i] = weights[weightPtr];
+        weightPtr++;
+    }
+    anchorPointsWt.values = val;
+    trtWeights.push_back(anchorPointsWt);
+
+    nvinfer1::IConstantLayer* anchorPoints = network->addConstant(nvinfer1::Dims{2, {shuffleDims.d[0], 2}}, anchorPointsWt);
+    assert(anchorPoints != nullptr);
+    std::string anchorPointsLayerName = "anchor_points_" + std::to_string(layerIdx);
+    anchorPoints->setName(anchorPointsLayerName.c_str());
+    nvinfer1::ITensor* anchorPointsTensor = anchorPoints->getOutput(0);
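+    // distance2bbox decode (PP-YOLOE head): lt and rb hold the predicted left-top and
+    // right-bottom distances from each anchor point, so x1y1 = anchor - lt and x2y2 = anchor + rb;
+    // the anchor points and per-anchor strides are constants packed into the wts file.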
+    nvinfer1::IElementWiseLayer* x1y1
+        = network->addElementWise(*anchorPointsTensor, *lt, nvinfer1::ElementWiseOperation::kSUB);
+    assert(x1y1 != nullptr);
+    std::string x1y1LayerName = "x1y1_" + std::to_string(layerIdx);
+    x1y1->setName(x1y1LayerName.c_str());
+    nvinfer1::ITensor* x1y1Tensor = x1y1->getOutput(0);
+
+    nvinfer1::IElementWiseLayer* x2y2
+        = network->addElementWise(*rb, *anchorPointsTensor, nvinfer1::ElementWiseOperation::kSUM);
+    assert(x2y2 != nullptr);
+    std::string x2y2LayerName = "x2y2_" + std::to_string(layerIdx);
+    x2y2->setName(x2y2LayerName.c_str());
+    nvinfer1::ITensor* x2y2Tensor = x2y2->getOutput(0);
+
+    std::vector<nvinfer1::ITensor*> concatInputs;
+    concatInputs.push_back(x1y1Tensor);
+    concatInputs.push_back(x2y2Tensor);
+
+    nvinfer1::IConcatenationLayer* concat = network->addConcatenation(concatInputs.data(), concatInputs.size());
+    assert(concat != nullptr);
+    std::string concatLayerName = "concat_" + std::to_string(layerIdx);
+    concat->setName(concatLayerName.c_str());
+    concat->setAxis(1);
+    output = concat->getOutput(0);
+
+    channels = shuffleDims.d[0];
+    nvinfer1::Weights stridePointsWt{nvinfer1::DataType::kFLOAT, nullptr, channels};
+    val = new float[channels];
+    for (int i = 0; i < channels; ++i)
+    {
+        val[i] = weights[weightPtr];
+        weightPtr++;
+    }
+    stridePointsWt.values = val;
+    trtWeights.push_back(stridePointsWt);
+
+    nvinfer1::IConstantLayer* stridePoints = network->addConstant(nvinfer1::Dims{2, {shuffleDims.d[0], 1}}, stridePointsWt);
+    assert(stridePoints != nullptr);
+    std::string stridePointsLayerName = "stride_points_" + std::to_string(layerIdx);
+    stridePoints->setName(stridePointsLayerName.c_str());
+    nvinfer1::ITensor* stridePointsTensor = stridePoints->getOutput(0);
+
+    nvinfer1::IElementWiseLayer* pred
+        = network->addElementWise(*output, *stridePointsTensor, nvinfer1::ElementWiseOperation::kPROD);
+    assert(pred != nullptr);
+    std::string predLayerName = "pred_" + std::to_string(layerIdx);
+    pred->setName(predLayerName.c_str());
+    output = pred->getOutput(0);
+
+    return output;
+}
diff --git a/nvdsinfer_custom_impl_Yolo/layers/reg_layer.h b/nvdsinfer_custom_impl_Yolo/layers/reg_layer.h
new file mode 100644
index 0000000..b8addb3
--- /dev/null
+++ b/nvdsinfer_custom_impl_Yolo/layers/reg_layer.h
@@ -0,0 +1,24 @@
+/*
+ * Created by Marcos Luciano
+ * https://www.github.com/marcoslucianops
+ */
+
+#ifndef __REG_LAYER_H__
+#define __REG_LAYER_H__
+
+#include <map>
+#include <string>
+#include <vector>
+
+#include "NvInfer.h"
+
+nvinfer1::ITensor* regLayer(
+    int layerIdx,
+    std::map<std::string, std::string>& block,
+    std::vector<float>& weights,
+    std::vector<nvinfer1::Weights>& trtWeights,
+    int& weightPtr,
+    nvinfer1::ITensor* input,
+    nvinfer1::INetworkDefinition* network);
+
+#endif
diff --git a/nvdsinfer_custom_impl_Yolo/layers/reorg_layer.cpp b/nvdsinfer_custom_impl_Yolo/layers/reorg_layer.cpp
new file mode 100644
index 0000000..c126df2
--- /dev/null
+++ b/nvdsinfer_custom_impl_Yolo/layers/reorg_layer.cpp
@@ -0,0 +1,62 @@
+/*
+ * Created by Marcos Luciano
+ * https://www.github.com/marcoslucianops
+ */
+
+#include "reorg_layer.h"
+
+nvinfer1::ITensor* reorgLayer(
+    int layerIdx,
+    std::map<std::string, std::string>& block,
+    nvinfer1::ITensor* input,
+    nvinfer1::INetworkDefinition* network)
+{
+    nvinfer1::ITensor* output;
+
+    assert(block.at("type") == "reorg");
+
+    nvinfer1::Dims inputDims = input->getDimensions();
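+    // Reorg (space-to-depth): four stride-2 slices pick the even/odd row and column phases and
+    // are concatenated on the channel axis, turning (C, H, W) into (4C, H / 2, W / 2).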
+    nvinfer1::ISliceLayer *slice1 = network->addSlice(
+        *input, nvinfer1::Dims{3, {0, 0, 0}}, nvinfer1::Dims{3, {inputDims.d[0], inputDims.d[1] / 2, inputDims.d[2] / 2}},
+        nvinfer1::Dims{3, {1, 2, 2}});
+    assert(slice1 != nullptr);
+    std::string slice1LayerName = "slice1_" + std::to_string(layerIdx);
+    slice1->setName(slice1LayerName.c_str());
+
+    nvinfer1::ISliceLayer *slice2 = network->addSlice(
+        *input, nvinfer1::Dims{3, {0, 0, 1}}, nvinfer1::Dims{3, {inputDims.d[0], inputDims.d[1] / 2, inputDims.d[2] / 2}},
+        nvinfer1::Dims{3, {1, 2, 2}});
+    assert(slice2 != nullptr);
+    std::string slice2LayerName = "slice2_" + std::to_string(layerIdx);
+    slice2->setName(slice2LayerName.c_str());
+
+    nvinfer1::ISliceLayer *slice3 = network->addSlice(
+        *input, nvinfer1::Dims{3, {0, 1, 0}}, nvinfer1::Dims{3, {inputDims.d[0], inputDims.d[1] / 2, inputDims.d[2] / 2}},
+        nvinfer1::Dims{3, {1, 2, 2}});
+    assert(slice3 != nullptr);
+    std::string slice3LayerName = "slice3_" + std::to_string(layerIdx);
+    slice3->setName(slice3LayerName.c_str());
+
+    nvinfer1::ISliceLayer *slice4 = network->addSlice(
+        *input, nvinfer1::Dims{3, {0, 1, 1}}, nvinfer1::Dims{3, {inputDims.d[0], inputDims.d[1] / 2, inputDims.d[2] / 2}},
+        nvinfer1::Dims{3, {1, 2, 2}});
+    assert(slice4 != nullptr);
+    std::string slice4LayerName = "slice4_" + std::to_string(layerIdx);
+    slice4->setName(slice4LayerName.c_str());
+
+    std::vector<nvinfer1::ITensor*> concatInputs;
+    concatInputs.push_back(slice1->getOutput(0));
+    concatInputs.push_back(slice2->getOutput(0));
+    concatInputs.push_back(slice3->getOutput(0));
+    concatInputs.push_back(slice4->getOutput(0));
+
+    nvinfer1::IConcatenationLayer* concat = network->addConcatenation(concatInputs.data(), concatInputs.size());
+    assert(concat != nullptr);
+    std::string concatLayerName = "concat_" + std::to_string(layerIdx);
+    concat->setName(concatLayerName.c_str());
+    concat->setAxis(0);
+    output = concat->getOutput(0);
+
+    return output;
+}
diff --git a/nvdsinfer_custom_impl_Yolo/layers/reorgv5_layer.h b/nvdsinfer_custom_impl_Yolo/layers/reorg_layer.h
similarity index 79%
rename from nvdsinfer_custom_impl_Yolo/layers/reorgv5_layer.h
rename to nvdsinfer_custom_impl_Yolo/layers/reorg_layer.h
index 5794d37..fca09fa 100644
--- a/nvdsinfer_custom_impl_Yolo/layers/reorgv5_layer.h
+++ b/nvdsinfer_custom_impl_Yolo/layers/reorg_layer.h
@@ -12,8 +12,9 @@
 
 #include "NvInfer.h"
 
-nvinfer1::ILayer* reorgV5Layer(
+nvinfer1::ITensor* reorgLayer(
     int layerIdx,
+    std::map<std::string, std::string>& block,
     nvinfer1::ITensor* input,
     nvinfer1::INetworkDefinition* network);
 
diff --git a/nvdsinfer_custom_impl_Yolo/layers/reorgv5_layer.cpp b/nvdsinfer_custom_impl_Yolo/layers/reorgv5_layer.cpp
deleted file mode 100644
index 2e65033..0000000
--- a/nvdsinfer_custom_impl_Yolo/layers/reorgv5_layer.cpp
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Created by Marcos Luciano
- * https://www.github.com/marcoslucianops
- */
-
-#include "reorgv5_layer.h"
-
-nvinfer1::ILayer* reorgV5Layer(
-    int layerIdx,
-    nvinfer1::ITensor* input,
-    nvinfer1::INetworkDefinition* network)
-{
-    nvinfer1::Dims prevTensorDims = input->getDimensions();
-
-    nvinfer1::ISliceLayer *slice1 = network->addSlice(
-        *input,
-        nvinfer1::Dims3{0, 0, 0},
-        nvinfer1::Dims3{prevTensorDims.d[0], prevTensorDims.d[1] / 2, prevTensorDims.d[2] / 2},
-        nvinfer1::Dims3{1, 2, 2});
-    assert(slice1 != nullptr);
-    std::string slice1LayerName = "slice1_" + std::to_string(layerIdx);
-    slice1->setName(slice1LayerName.c_str());
-
-    nvinfer1::ISliceLayer *slice2 = network->addSlice(
-        *input,
-        nvinfer1::Dims3{0, 1, 0},
-        nvinfer1::Dims3{prevTensorDims.d[0], prevTensorDims.d[1] / 2, prevTensorDims.d[2] / 2},
-        nvinfer1::Dims3{1, 2, 2});
-    assert(slice2 != nullptr);
-    std::string slice2LayerName = "slice2_" + std::to_string(layerIdx);
-    slice2->setName(slice2LayerName.c_str());
-
-    nvinfer1::ISliceLayer *slice3 = network->addSlice(
-        *input,
-        nvinfer1::Dims3{0, 0, 1},
-        nvinfer1::Dims3{prevTensorDims.d[0], prevTensorDims.d[1] / 2, prevTensorDims.d[2] / 2},
-        nvinfer1::Dims3{1, 2, 2});
-    assert(slice3 != nullptr);
-    std::string slice3LayerName = "slice3_" + std::to_string(layerIdx);
-    slice3->setName(slice3LayerName.c_str());
-
-    nvinfer1::ISliceLayer *slice4 = network->addSlice(
-        *input,
-        nvinfer1::Dims3{0, 1, 1},
-        nvinfer1::Dims3{prevTensorDims.d[0], prevTensorDims.d[1] / 2, prevTensorDims.d[2] / 2},
-        nvinfer1::Dims3{1, 2, 2});
-    assert(slice4 != nullptr);
-    std::string slice4LayerName = "slice4_" + std::to_string(layerIdx);
-    slice4->setName(slice4LayerName.c_str());
-
-    std::vector<nvinfer1::ITensor*> concatInputs;
-    concatInputs.push_back (slice1->getOutput(0));
-    concatInputs.push_back (slice2->getOutput(0));
-    concatInputs.push_back (slice3->getOutput(0));
-    concatInputs.push_back (slice4->getOutput(0));
-
-    nvinfer1::IConcatenationLayer* concat =
-        network->addConcatenation(concatInputs.data(), concatInputs.size());
-    assert(concat != nullptr);
-
-    return concat;
-}
diff --git a/nvdsinfer_custom_impl_Yolo/layers/route_layer.cpp b/nvdsinfer_custom_impl_Yolo/layers/route_layer.cpp
index 5ca4844..1fda570 100644
--- a/nvdsinfer_custom_impl_Yolo/layers/route_layer.cpp
+++ b/nvdsinfer_custom_impl_Yolo/layers/route_layer.cpp
@@ -5,58 +5,73 @@
 
 #include "route_layer.h"
 
-nvinfer1::ILayer* routeLayer(
+nvinfer1::ITensor* routeLayer(
     int layerIdx,
+    std::string& layers,
     std::map<std::string, std::string>& block,
     std::vector<nvinfer1::ITensor*> tensorOutputs,
     nvinfer1::INetworkDefinition* network)
 {
+    nvinfer1::ITensor* output;
+
+    assert(block.at("type") == "route");
+    assert(block.find("layers") != block.end());
+
     std::string strLayers = block.at("layers");
     std::vector<int> idxLayers;
     size_t lastPos = 0, pos = 0;
-    while ((pos = strLayers.find(',', lastPos)) != std::string::npos) {
+    while ((pos = strLayers.find(',', lastPos)) != std::string::npos)
+    {
         int vL = std::stoi(trim(strLayers.substr(lastPos, pos - lastPos)));
-        idxLayers.push_back (vL);
+        idxLayers.push_back(vL);
         lastPos = pos + 1;
     }
-    if (lastPos < strLayers.length()) {
+    if (lastPos < strLayers.length())
+    {
         std::string lastV = trim(strLayers.substr(lastPos));
-        if (!lastV.empty()) {
-            idxLayers.push_back (std::stoi(lastV));
-        }
+        if (!lastV.empty())
+            idxLayers.push_back(std::stoi(lastV));
     }
     assert (!idxLayers.empty());
     std::vector<nvinfer1::ITensor*> concatInputs;
-    for (int idxLayer : idxLayers) {
-        if (idxLayer < 0) {
-            idxLayer = tensorOutputs.size() + idxLayer;
-        }
-        assert (idxLayer >= 0 && idxLayer < (int)tensorOutputs.size());
-        concatInputs.push_back (tensorOutputs[idxLayer]);
+    for (uint i = 0; i < idxLayers.size(); ++i)
+    {
+        if (idxLayers[i] < 0)
+            idxLayers[i] = tensorOutputs.size() + idxLayers[i];
+        assert (idxLayers[i] >= 0 && idxLayers[i] < (int)tensorOutputs.size());
+        concatInputs.push_back(tensorOutputs[idxLayers[i]]);
+        if (i < idxLayers.size() - 1)
+            layers += std::to_string(idxLayers[i]) + ", ";
     }
+    layers += std::to_string(idxLayers[idxLayers.size() - 1]);
 
+    int axis = 0;
+    if (block.find("axis") != block.end())
+        axis = std::stoi(block.at("axis"));
+    if (axis < 0)
+        axis = concatInputs[0]->getDimensions().nbDims + axis;
+
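+    // 'layers' entries may be negative (relative) indices, resolved above against the outputs
+    // recorded so far; 'axis' defaults to 0 (channels in CHW) and negative values count back
+    // from the tensor rank, the usual concat-axis convention.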
-    nvinfer1::IConcatenationLayer* concat =
-        network->addConcatenation(concatInputs.data(), concatInputs.size());
+    nvinfer1::IConcatenationLayer* concat = network->addConcatenation(concatInputs.data(), concatInputs.size());
     assert(concat != nullptr);
-    std::string concatLayerName = "route_" + std::to_string(layerIdx - 1);
+    std::string concatLayerName = "route_" + std::to_string(layerIdx);
     concat->setName(concatLayerName.c_str());
-    concat->setAxis(0);
+    concat->setAxis(axis);
+    output = concat->getOutput(0);
 
-    nvinfer1::ILayer* output = concat;
-
-    if (block.find("groups") != block.end()) {
-        nvinfer1::Dims prevTensorDims = output->getOutput(0)->getDimensions();
+    if (block.find("groups") != block.end())
+    {
+        nvinfer1::Dims prevTensorDims = output->getDimensions();
         int groups = stoi(block.at("groups"));
         int group_id = stoi(block.at("group_id"));
         int startSlice = (prevTensorDims.d[0] / groups) * group_id;
         int channelSlice = (prevTensorDims.d[0] / groups);
-        nvinfer1::ISliceLayer* sl = network->addSlice(
-            *output->getOutput(0),
-            nvinfer1::Dims3{startSlice, 0, 0},
-            nvinfer1::Dims3{channelSlice, prevTensorDims.d[1], prevTensorDims.d[2]},
-            nvinfer1::Dims3{1, 1, 1});
-        assert(sl != nullptr);
-        output = sl;
+        nvinfer1::ISliceLayer* slice = network->addSlice(
+            *output, nvinfer1::Dims{3, {startSlice, 0, 0}},
+            nvinfer1::Dims{3, {channelSlice, prevTensorDims.d[1], prevTensorDims.d[2]}}, nvinfer1::Dims{3, {1, 1, 1}});
+        assert(slice != nullptr);
+        std::string sliceLayerName = "slice_" + std::to_string(layerIdx);
+        slice->setName(sliceLayerName.c_str());
+        output = slice->getOutput(0);
     }
 
     return output;
diff --git a/nvdsinfer_custom_impl_Yolo/layers/route_layer.h b/nvdsinfer_custom_impl_Yolo/layers/route_layer.h
index 409beee..9679365 100644
--- a/nvdsinfer_custom_impl_Yolo/layers/route_layer.h
+++ b/nvdsinfer_custom_impl_Yolo/layers/route_layer.h
@@ -9,8 +9,9 @@
 #include "NvInfer.h"
 #include "../utils.h"
 
-nvinfer1::ILayer* routeLayer(
+nvinfer1::ITensor* routeLayer(
     int layerIdx,
+    std::string& layers,
     std::map<std::string, std::string>& block,
     std::vector<nvinfer1::ITensor*> tensorOutputs,
     nvinfer1::INetworkDefinition* network);
diff --git a/nvdsinfer_custom_impl_Yolo/layers/shortcut_layer.cpp b/nvdsinfer_custom_impl_Yolo/layers/shortcut_layer.cpp
index 725d781..3e53cbe 100644
--- a/nvdsinfer_custom_impl_Yolo/layers/shortcut_layer.cpp
+++ b/nvdsinfer_custom_impl_Yolo/layers/shortcut_layer.cpp
@@ -5,40 +5,47 @@
 
 #include "shortcut_layer.h"
 
-nvinfer1::ILayer* shortcutLayer(
+nvinfer1::ITensor* shortcutLayer(
     int layerIdx,
+    std::string mode,
     std::string activation,
     std::string inputVol,
     std::string shortcutVol,
+    std::map<std::string, std::string>& block,
     nvinfer1::ITensor* input,
-    nvinfer1::ITensor* shortcutTensor,
+    nvinfer1::ITensor* shortcutInput,
     nvinfer1::INetworkDefinition* network)
 {
-    nvinfer1::ILayer* output;
-    nvinfer1::ITensor* outputTensor;
+    nvinfer1::ITensor* output;
 
-    if (inputVol != shortcutVol)
+    assert(block.at("type") == "shortcut");
+
+    nvinfer1::ElementWiseOperation operation = nvinfer1::ElementWiseOperation::kSUM;
+
+    if (mode == "mul")
+        operation = nvinfer1::ElementWiseOperation::kPROD;
+
+    if (mode == "add" && inputVol != shortcutVol)
     {
-        nvinfer1::ISliceLayer* sl = network->addSlice(
-            *shortcutTensor,
-            nvinfer1::Dims3{0, 0, 0},
-            input->getDimensions(),
-            nvinfer1::Dims3{1, 1, 1});
-        assert(sl != nullptr);
-        outputTensor = sl->getOutput(0);
-        assert(outputTensor != nullptr);
-    } else
+        nvinfer1::ISliceLayer* slice = network->addSlice(
+            *shortcutInput, nvinfer1::Dims{3, {0, 0, 0}}, input->getDimensions(), nvinfer1::Dims{3, {1, 1, 1}});
+        assert(slice != nullptr);
+        std::string sliceLayerName = "slice_" + std::to_string(layerIdx);
+        slice->setName(sliceLayerName.c_str());
+        output = slice->getOutput(0);
+    }
+    else
     {
-        outputTensor = shortcutTensor;
-        assert(outputTensor != nullptr);
+        output = shortcutInput;
     }
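+    // With mismatched volumes in add mode the shortcut input was sliced above to the main
+    // input's dimensions; now apply the element-wise op picked by 'mode' (kSUM or kPROD).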
-    nvinfer1::IElementWiseLayer* ew = network->addElementWise(
-        *input, *outputTensor,
-        nvinfer1::ElementWiseOperation::kSUM);
-    assert(ew != nullptr);
+    nvinfer1::IElementWiseLayer* shortcut = network->addElementWise(*input, *output, operation);
+    assert(shortcut != nullptr);
+    std::string shortcutLayerName = "shortcut_" + std::to_string(layerIdx);
+    shortcut->setName(shortcutLayerName.c_str());
+    output = shortcut->getOutput(0);
 
-    output = activationLayer(layerIdx, activation, ew, ew->getOutput(0), network);
+    output = activationLayer(layerIdx, activation, output, network);
     assert(output != nullptr);
 
     return output;
diff --git a/nvdsinfer_custom_impl_Yolo/layers/shortcut_layer.h b/nvdsinfer_custom_impl_Yolo/layers/shortcut_layer.h
index c564d3d..22195e8 100644
--- a/nvdsinfer_custom_impl_Yolo/layers/shortcut_layer.h
+++ b/nvdsinfer_custom_impl_Yolo/layers/shortcut_layer.h
@@ -6,17 +6,21 @@
 #ifndef __SHORTCUT_LAYER_H__
 #define __SHORTCUT_LAYER_H__
 
+#include <map>
+
 #include "NvInfer.h"
 
 #include "activation_layer.h"
 
-nvinfer1::ILayer* shortcutLayer(
+nvinfer1::ITensor* shortcutLayer(
     int layerIdx,
+    std::string mode,
     std::string activation,
     std::string inputVol,
     std::string shortcutVol,
+    std::map<std::string, std::string>& block,
     nvinfer1::ITensor* input,
-    nvinfer1::ITensor* shortcutTensor,
+    nvinfer1::ITensor* shortcut,
     nvinfer1::INetworkDefinition* network);
 
 #endif
diff --git a/nvdsinfer_custom_impl_Yolo/layers/shuffle_layer.cpp b/nvdsinfer_custom_impl_Yolo/layers/shuffle_layer.cpp
new file mode 100644
index 0000000..a967e46
--- /dev/null
+++ b/nvdsinfer_custom_impl_Yolo/layers/shuffle_layer.cpp
@@ -0,0 +1,123 @@
+/*
+ * Created by Marcos Luciano
+ * https://www.github.com/marcoslucianops
+ */
+
+#include "shuffle_layer.h"
+
+nvinfer1::ITensor* shuffleLayer(
+    int layerIdx,
+    std::string& layer,
+    std::map<std::string, std::string>& block,
+    nvinfer1::ITensor* input,
+    std::vector<nvinfer1::ITensor*> tensorOutputs,
+    nvinfer1::INetworkDefinition* network)
+{
+    nvinfer1::ITensor* output;
+
+    assert(block.at("type") == "shuffle");
+
+    nvinfer1::IShuffleLayer* shuffle = network->addShuffle(*input);
+    assert(shuffle != nullptr);
+    std::string shuffleLayerName = "shuffle_" + std::to_string(layerIdx);
+    shuffle->setName(shuffleLayerName.c_str());
+
+    if (block.find("reshape") != block.end())
+    {
+        std::string strReshape = block.at("reshape");
+        std::vector<int> reshape;
+        size_t lastPos = 0, pos = 0;
+        while ((pos = strReshape.find(',', lastPos)) != std::string::npos)
+        {
+            int vL = std::stoi(trim(strReshape.substr(lastPos, pos - lastPos)));
+            reshape.push_back(vL);
+            lastPos = pos + 1;
+        }
+        if (lastPos < strReshape.length())
+        {
+            std::string lastV = trim(strReshape.substr(lastPos));
+            if (!lastV.empty())
+                reshape.push_back(std::stoi(lastV));
+        }
+        assert(!reshape.empty());
+
+        int from = -1;
+        if (block.find("from") != block.end())
+            from = std::stoi(block.at("from"));
+
+        if (from < 0)
+            from = tensorOutputs.size() + from;
+
+        layer = std::to_string(from);
+
+        nvinfer1::Dims inputTensorDims = tensorOutputs[from]->getDimensions();
+        int32_t l = inputTensorDims.d[1] * inputTensorDims.d[2];
+
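+        // A 0 in the cfg's reshape list stands for H * W of the referenced 'from' layer's output,
+        // so a (C, H, W) tensor can be flattened to (C, H*W) without hard-coding the grid size.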
+        nvinfer1::Dims reshapeDims;
+        reshapeDims.nbDims = reshape.size();
+
+        for (uint i = 0; i < reshape.size(); ++i)
+            if (reshape[i] == 0)
+                reshapeDims.d[i] = l;
+            else
+                reshapeDims.d[i] = reshape[i];
+
+        shuffle->setReshapeDimensions(reshapeDims);
+    }
+
+    if (block.find("transpose1") != block.end())
+    {
+        std::string strTranspose1 = block.at("transpose1");
+        std::vector<int> transpose1;
+        size_t lastPos = 0, pos = 0;
+        while ((pos = strTranspose1.find(',', lastPos)) != std::string::npos)
+        {
+            int vL = std::stoi(trim(strTranspose1.substr(lastPos, pos - lastPos)));
+            transpose1.push_back(vL);
+            lastPos = pos + 1;
+        }
+        if (lastPos < strTranspose1.length())
+        {
+            std::string lastV = trim(strTranspose1.substr(lastPos));
+            if (!lastV.empty())
+                transpose1.push_back(std::stoi(lastV));
+        }
+        assert(!transpose1.empty());
+
+        nvinfer1::Permutation permutation1;
+        for (uint i = 0; i < transpose1.size(); ++i)
+            permutation1.order[i] = transpose1[i];
+
+        shuffle->setFirstTranspose(permutation1);
+    }
+
+    if (block.find("transpose2") != block.end())
+    {
+        std::string strTranspose2 = block.at("transpose2");
+        std::vector<int> transpose2;
+        size_t lastPos = 0, pos = 0;
+        while ((pos = strTranspose2.find(',', lastPos)) != std::string::npos)
+        {
+            int vL = std::stoi(trim(strTranspose2.substr(lastPos, pos - lastPos)));
+            transpose2.push_back(vL);
+            lastPos = pos + 1;
+        }
+        if (lastPos < strTranspose2.length())
+        {
+            std::string lastV = trim(strTranspose2.substr(lastPos));
+            if (!lastV.empty())
+                transpose2.push_back(std::stoi(lastV));
+        }
+        assert(!transpose2.empty());
+
+        nvinfer1::Permutation permutation2;
+        for (uint i = 0; i < transpose2.size(); ++i)
+            permutation2.order[i] = transpose2[i];
+
+        shuffle->setSecondTranspose(permutation2);
+    }
+
+    output = shuffle->getOutput(0);
+
+    return output;
+}
diff --git a/nvdsinfer_custom_impl_Yolo/layers/shuffle_layer.h b/nvdsinfer_custom_impl_Yolo/layers/shuffle_layer.h
new file mode 100644
index 0000000..53aa3ce
--- /dev/null
+++ b/nvdsinfer_custom_impl_Yolo/layers/shuffle_layer.h
@@ -0,0 +1,20 @@
+/*
+ * Created by Marcos Luciano
+ * https://www.github.com/marcoslucianops
+ */
+
+#ifndef __SHUFFLE_LAYER_H__
+#define __SHUFFLE_LAYER_H__
+
+#include "NvInfer.h"
+#include "../utils.h"
+
+nvinfer1::ITensor* shuffleLayer(
+    int layerIdx,
+    std::string& layer,
+    std::map<std::string, std::string>& block,
+    nvinfer1::ITensor* input,
+    std::vector<nvinfer1::ITensor*> tensorOutputs,
+    nvinfer1::INetworkDefinition* network);
+
+#endif
diff --git a/nvdsinfer_custom_impl_Yolo/layers/softmax_layer.cpp b/nvdsinfer_custom_impl_Yolo/layers/softmax_layer.cpp
new file mode 100644
index 0000000..cb6348d
--- /dev/null
+++ b/nvdsinfer_custom_impl_Yolo/layers/softmax_layer.cpp
@@ -0,0 +1,29 @@
+/*
+ * Created by Marcos Luciano
+ * https://www.github.com/marcoslucianops
+ */
+
+#include "softmax_layer.h"
+
+nvinfer1::ITensor* softmaxLayer(
+    int layerIdx,
+    std::map<std::string, std::string>& block,
+    nvinfer1::ITensor* input,
+    nvinfer1::INetworkDefinition* network)
+{
+    nvinfer1::ITensor* output;
+
+    assert(block.at("type") == "softmax");
+    assert(block.find("axes") != block.end());
+
+    int axes = std::stoi(block.at("axes"));
+
+    nvinfer1::ISoftMaxLayer* softmax = network->addSoftMax(*input);
+    assert(softmax != nullptr);
+    std::string softmaxLayerName = "softmax_" + std::to_string(layerIdx);
+    softmax->setName(softmaxLayerName.c_str());
+    softmax->setAxes(1 << axes);
+    output = softmax->getOutput(0);
+
+    return output;
+}
diff --git a/nvdsinfer_custom_impl_Yolo/layers/softmax_layer.h b/nvdsinfer_custom_impl_Yolo/layers/softmax_layer.h
new file mode 100644
index 0000000..0ca208e
--- /dev/null
+++ b/nvdsinfer_custom_impl_Yolo/layers/softmax_layer.h
@@ -0,0 +1,20 @@
+/*
+ * Created by Marcos Luciano
+ * https://www.github.com/marcoslucianops
+ */
+
+#ifndef __SOFTMAX_LAYER_H__
+#define __SOFTMAX_LAYER_H__
+
+#include <map>
+#include <string>
+
+#include "NvInfer.h"
+
+nvinfer1::ITensor* softmaxLayer(
+    int layerIdx,
+    std::map<std::string, std::string>& block,
+    nvinfer1::ITensor* input,
+    nvinfer1::INetworkDefinition* network);
+
+#endif
diff --git a/nvdsinfer_custom_impl_Yolo/layers/upsample_layer.cpp b/nvdsinfer_custom_impl_Yolo/layers/upsample_layer.cpp
index 066cce8..4e3614c 100644
--- a/nvdsinfer_custom_impl_Yolo/layers/upsample_layer.cpp
+++ b/nvdsinfer_custom_impl_Yolo/layers/upsample_layer.cpp
@@ -5,20 +5,28 @@
 
 #include "upsample_layer.h"
 
-nvinfer1::ILayer* upsampleLayer(
+nvinfer1::ITensor* upsampleLayer(
     int layerIdx,
     std::map<std::string, std::string>& block,
     nvinfer1::ITensor* input,
     nvinfer1::INetworkDefinition* network)
 {
+    nvinfer1::ITensor* output;
+
     assert(block.at("type") == "upsample");
+    assert(block.find("stride") != block.end());
+
     int stride = std::stoi(block.at("stride"));
 
-    nvinfer1::IResizeLayer* resize_layer = network->addResize(*input);
-    resize_layer->setResizeMode(nvinfer1::ResizeMode::kNEAREST);
     float scale[3] = {1, static_cast<float>(stride), static_cast<float>(stride)};
-    resize_layer->setScales(scale, 3);
-    std::string layer_name = "upsample_" + std::to_string(layerIdx);
-    resize_layer->setName(layer_name.c_str());
-    return resize_layer;
+
+    nvinfer1::IResizeLayer* resize = network->addResize(*input);
+    assert(resize != nullptr);
+    std::string resizeLayerName = "upsample_" + std::to_string(layerIdx);
+    resize->setName(resizeLayerName.c_str());
+    resize->setResizeMode(nvinfer1::ResizeMode::kNEAREST);
+    resize->setScales(scale, 3);
+    output = resize->getOutput(0);
+
+    return output;
 }
diff --git a/nvdsinfer_custom_impl_Yolo/layers/upsample_layer.h b/nvdsinfer_custom_impl_Yolo/layers/upsample_layer.h
index d1b7768..89e69bf 100644
--- a/nvdsinfer_custom_impl_Yolo/layers/upsample_layer.h
+++ b/nvdsinfer_custom_impl_Yolo/layers/upsample_layer.h
@@ -7,12 +7,11 @@
 #define __UPSAMPLE_LAYER_H__
 
 #include <map>
-#include <iostream>
 #include <string>
 
 #include "NvInfer.h"
 
-nvinfer1::ILayer* upsampleLayer(
+nvinfer1::ITensor* upsampleLayer(
     int layerIdx,
     std::map<std::string, std::string>& block,
     nvinfer1::ITensor* input,
diff --git a/nvdsinfer_custom_impl_Yolo/sortDetections.cu b/nvdsinfer_custom_impl_Yolo/sortDetections.cu
index cda816a..e2a2480 100644
--- a/nvdsinfer_custom_impl_Yolo/sortDetections.cu
+++ b/nvdsinfer_custom_impl_Yolo/sortDetections.cu
@@ -37,13 +37,11 @@ cudaError_t sortDetections(
         float* _d_scores = reinterpret_cast<float*>(d_scores) + (batch * outputSize);
         int* _countData = reinterpret_cast<int*>(countData) + (batch);
 
-        int* _count = (int*)malloc(sizeof(int));
-        cudaMemcpy(_count, (int*)&_countData[0], sizeof(int), cudaMemcpyDeviceToHost);
-        int count = _count[0];
+        int count;
+        cudaMemcpy(&count, _countData, sizeof(int), cudaMemcpyDeviceToHost);
 
         if (count == 0)
         {
-            free(_count);
             return cudaGetLastError();
         }
 
@@ -72,13 +70,13 @@ cudaError_t sortDetections(
 
         int _topK = count < topK ? count : topK;
count : topK; - int threads_per_block = 0; - int number_of_blocks = 4; + int threads_per_block = 16; + int number_of_blocks = 0; - if (_topK % 2 == 0 && _topK >= number_of_blocks) - threads_per_block = _topK / number_of_blocks; + if (_topK % 2 == 0 && _topK >= threads_per_block) + number_of_blocks = _topK / threads_per_block; else - threads_per_block = (_topK / number_of_blocks) + 1; + number_of_blocks = (_topK / threads_per_block) + 1; sortOutput<<<number_of_blocks, threads_per_block, 0, stream>>>( _d_indexes, _d_scores, reinterpret_cast<float*>(d_boxes) + (batch * 4 * outputSize), @@ -89,8 +87,6 @@ cudaError_t sortDetections( cudaFree(d_keys_out); cudaFree(d_values_out); cudaFree(d_temp_storage); - - free(_count); } return cudaGetLastError(); } diff --git a/nvdsinfer_custom_impl_Yolo/utils.cpp b/nvdsinfer_custom_impl_Yolo/utils.cpp index 21d8117..b6431a1 100644 --- a/nvdsinfer_custom_impl_Yolo/utils.cpp +++ b/nvdsinfer_custom_impl_Yolo/utils.cpp @@ -132,11 +132,10 @@ std::string dimsToString(const nvinfer1::Dims d) { std::stringstream s; assert(d.nbDims >= 1); + s << "["; for (int i = 0; i < d.nbDims - 1; ++i) - { - s << std::setw(4) << d.d[i] << " x"; - } - s << std::setw(4) << d.d[d.nbDims - 1]; + s << d.d[i] << ", "; + s << d.d[d.nbDims - 1] << "]"; return s.str(); } @@ -152,10 +151,9 @@ int getNumChannels(nvinfer1::ITensor* t) void printLayerInfo( std::string layerIndex, std::string layerName, std::string layerInput, std::string layerOutput, std::string weightPtr) { - std::cout << std::setw(6) << std::left << layerIndex << std::setw(24) << std::left << layerName; - std::cout << std::setw(20) << std::left << layerInput << std::setw(20) << std::left - << layerOutput; - std::cout << std::setw(7) << std::left << weightPtr << std::endl; + std::cout << std::setw(8) << std::left << layerIndex << std::setw(30) << std::left << layerName; + std::cout << std::setw(20) << std::left << layerInput << std::setw(20) << std::left << layerOutput; + std::cout << weightPtr << std::endl; } std::string getAbsPath(std::string path) diff --git a/nvdsinfer_custom_impl_Yolo/yolo.cpp b/nvdsinfer_custom_impl_Yolo/yolo.cpp index 2e7fb9f..c784250 100644 --- a/nvdsinfer_custom_impl_Yolo/yolo.cpp +++ b/nvdsinfer_custom_impl_Yolo/yolo.cpp @@ -158,7 +158,6 @@ NvDsInferStatus Yolo::parseModel(nvinfer1::INetworkDefinition& network) { NvDsInferStatus Yolo::buildYoloNetwork(std::vector<float>& weights, nvinfer1::INetworkDefinition& network) { int weightPtr = 0; - int channels = m_InputC; std::string weightsType; if (m_WtsFilePath.find(".weights") != std::string::npos) @@ -172,81 +171,64 @@ NvDsInferStatus Yolo::buildYoloNetwork(std::vector<float>& weights, nvinfer1::IN else if (m_NetworkType.find("yolor") != std::string::npos) eps = 1.0e-4; - nvinfer1::ITensor* data = - network.addInput(m_InputBlobName.c_str(), nvinfer1::DataType::kFLOAT, - nvinfer1::Dims3{static_cast<int>(m_InputC), - static_cast<int>(m_InputH), static_cast<int>(m_InputW)}); + nvinfer1::ITensor* data = network.addInput( + m_InputBlobName.c_str(), nvinfer1::DataType::kFLOAT, + nvinfer1::Dims{3, {static_cast<int>(m_InputC), static_cast<int>(m_InputH), static_cast<int>(m_InputW)}}); assert(data != nullptr && data->getDimensions().nbDims > 0); nvinfer1::ITensor* previous = data; std::vector<nvinfer1::ITensor*> tensorOutputs; - std::vector<nvinfer1::ITensor*> yoloInputs; - uint inputYoloCount = 0; + + nvinfer1::ITensor* yoloTensorInputs[m_YoloCount]; + uint yoloCountInputs = 0; int modelType = -1; for (uint i = 0; i < m_ConfigBlocks.size(); ++i) { - assert(getNumChannels(previous) == channels); std::string layerIndex = "(" + std::to_string(tensorOutputs.size()) + ")"; if
(m_ConfigBlocks.at(i).at("type") == "net") - printLayerInfo("", "layer", " input", " output", "weightPtr"); + printLayerInfo("", "Layer", "Input Shape", "Output Shape", "WeightPtr"); else if (m_ConfigBlocks.at(i).at("type") == "convolutional") { + int channels = getNumChannels(previous); std::string inputVol = dimsToString(previous->getDimensions()); - nvinfer1::ILayer* out = convolutionalLayer( + previous = convolutionalLayer( i, m_ConfigBlocks.at(i), weights, m_TrtWeights, weightPtr, weightsType, channels, eps, previous, &network); - previous = out->getOutput(0); assert(previous != nullptr); - channels = getNumChannels(previous); std::string outputVol = dimsToString(previous->getDimensions()); tensorOutputs.push_back(previous); - std::string layerType = "conv_" + m_ConfigBlocks.at(i).at("activation"); - printLayerInfo(layerIndex, layerType, inputVol, outputVol, std::to_string(weightPtr)); + std::string layerName = "conv_" + m_ConfigBlocks.at(i).at("activation"); + printLayerInfo(layerIndex, layerName, inputVol, outputVol, std::to_string(weightPtr)); } else if (m_ConfigBlocks.at(i).at("type") == "batchnorm") { std::string inputVol = dimsToString(previous->getDimensions()); - nvinfer1::ILayer* out = batchnormLayer( + previous = batchnormLayer( i, m_ConfigBlocks.at(i), weights, m_TrtWeights, weightPtr, weightsType, eps, previous, &network); - previous = out->getOutput(0); assert(previous != nullptr); - channels = getNumChannels(previous); std::string outputVol = dimsToString(previous->getDimensions()); tensorOutputs.push_back(previous); - std::string layerType = "bn_" + m_ConfigBlocks.at(i).at("activation"); - printLayerInfo(layerIndex, layerType, inputVol, outputVol, std::to_string(weightPtr)); + std::string layerName = "batchnorm_" + m_ConfigBlocks.at(i).at("activation"); + printLayerInfo(layerIndex, layerName, inputVol, outputVol, std::to_string(weightPtr)); } else if (m_ConfigBlocks.at(i).at("type") == "implicit_add" || m_ConfigBlocks.at(i).at("type") == "implicit_mul") { - std::string type; - if (m_ConfigBlocks.at(i).at("type") == "implicit_add") - type = "add"; - else if (m_ConfigBlocks.at(i).at("type") == "implicit_mul") - type = "mul"; - assert(m_ConfigBlocks.at(i).find("filters") != m_ConfigBlocks.at(i).end()); - int filters = std::stoi(m_ConfigBlocks.at(i).at("filters")); - nvinfer1::ILayer* out = implicitLayer(filters, weights, m_TrtWeights, weightPtr, &network); - previous = out->getOutput(0); + previous = implicitLayer(i, m_ConfigBlocks.at(i), weights, m_TrtWeights, weightPtr, &network); assert(previous != nullptr); - channels = getNumChannels(previous); std::string outputVol = dimsToString(previous->getDimensions()); tensorOutputs.push_back(previous); - std::string layerType = "implicit_" + type; - printLayerInfo(layerIndex, layerType, " -", outputVol, std::to_string(weightPtr)); + std::string layerName = m_ConfigBlocks.at(i).at("type"); + printLayerInfo(layerIndex, layerName, "-", outputVol, std::to_string(weightPtr)); } - else if (m_ConfigBlocks.at(i).at("type") == "shift_channels" || m_ConfigBlocks.at(i).at("type") == "control_channels") + else if (m_ConfigBlocks.at(i).at("type") == "shift_channels" || + m_ConfigBlocks.at(i).at("type") == "control_channels") { - std::string type; - if (m_ConfigBlocks.at(i).at("type") == "shift_channels") - type = "shift"; - else if (m_ConfigBlocks.at(i).at("type") == "control_channels") - type = "control"; assert(m_ConfigBlocks.at(i).find("from") != m_ConfigBlocks.at(i).end()); int from = stoi(m_ConfigBlocks.at(i).at("from")); if (from > 0) 
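Both the `shift_channels`/`control_channels` branch above and the `shortcut` branch in the next hunk normalize the cfg `from` index before indexing `tensorOutputs`. A minimal sketch of that resolution logic; the helper name `resolveFrom` is illustrative, not part of the patch:

```
// Sketch: how yolo.cpp resolves a cfg "from" index into a tensorOutputs slot.
// A positive value is an absolute block index in the cfg; it is normalized to
// a negative, relative offset before use.
int resolveFrom(int from, int i)
{
  if (from > 0)
    from = from - i + 1; // absolute cfg index -> relative offset
  return i + from - 1;   // slot read from tensorOutputs
}
// e.g. block i = 10 with from = -3 reads tensorOutputs[6]
```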
@@ -254,140 +236,193 @@ NvDsInferStatus Yolo::buildYoloNetwork(std::vector<float>& weights, nvinfer1::IN assert((i - 2 >= 0) && (i - 2 < tensorOutputs.size())); assert((i + from - 1 >= 0) && (i + from - 1 < tensorOutputs.size())); assert(i + from - 1 < i - 2); - nvinfer1::ILayer* out = channelsLayer(type, previous, tensorOutputs[i + from - 1], &network); - previous = out->getOutput(0); + + std::string inputVol = dimsToString(previous->getDimensions()); + previous = channelsLayer(i, m_ConfigBlocks.at(i), previous, tensorOutputs[i + from - 1], &network); assert(previous != nullptr); std::string outputVol = dimsToString(previous->getDimensions()); tensorOutputs.push_back(previous); - std::string layerType = type + "_channels" + ": " + std::to_string(i + from - 1); - printLayerInfo(layerIndex, layerType, " -", outputVol, " -"); - } - - else if (m_ConfigBlocks.at(i).at("type") == "dropout") - { - // Skip dropout layer - assert(previous != nullptr); - tensorOutputs.push_back(previous); - printLayerInfo(layerIndex, "dropout", " -", " -", " -"); + std::string layerName = m_ConfigBlocks.at(i).at("type") + ": " + std::to_string(i + from - 1); + printLayerInfo(layerIndex, layerName, inputVol, outputVol, "-"); } else if (m_ConfigBlocks.at(i).at("type") == "shortcut") { - assert(m_ConfigBlocks.at(i).find("activation") != m_ConfigBlocks.at(i).end()); assert(m_ConfigBlocks.at(i).find("from") != m_ConfigBlocks.at(i).end()); - std::string activation = m_ConfigBlocks.at(i).at("activation"); int from = stoi(m_ConfigBlocks.at(i).at("from")); if (from > 0) from = from - i + 1; assert((i - 2 >= 0) && (i - 2 < tensorOutputs.size())); assert((i + from - 1 >= 0) && (i + from - 1 < tensorOutputs.size())); assert(i + from - 1 < i - 2); + + std::string mode = "add"; + if (m_ConfigBlocks.at(i).find("mode") != m_ConfigBlocks.at(i).end()) + mode = m_ConfigBlocks.at(i).at("mode"); + + std::string activation = "linear"; + if (m_ConfigBlocks.at(i).find("activation") != m_ConfigBlocks.at(i).end()) + activation = m_ConfigBlocks.at(i).at("activation"); + std::string inputVol = dimsToString(previous->getDimensions()); std::string shortcutVol = dimsToString(tensorOutputs[i + from - 1]->getDimensions()); - nvinfer1::ILayer* out = shortcutLayer(i, activation, inputVol, shortcutVol, previous, tensorOutputs[i + from - 1], &network); - previous = out->getOutput(0); + previous = shortcutLayer( + i, mode, activation, inputVol, shortcutVol, m_ConfigBlocks.at(i), previous, tensorOutputs[i + from - 1], + &network); assert(previous != nullptr); std::string outputVol = dimsToString(previous->getDimensions()); tensorOutputs.push_back(previous); - std::string layerType = "shortcut_" + m_ConfigBlocks.at(i).at("activation") + ": " + std::to_string(i + from - 1); - printLayerInfo(layerIndex, layerType, " -", outputVol, " -"); - if (inputVol != shortcutVol) { + std::string layerName = "shortcut_" + mode + "_" + activation + ": " + std::to_string(i + from - 1); + printLayerInfo(layerIndex, layerName, inputVol, outputVol, "-"); + + if (mode == "add" && inputVol != shortcutVol) std::cout << inputVol << " +" << shortcutVol << std::endl; - } } else if (m_ConfigBlocks.at(i).at("type") == "route") { - assert(m_ConfigBlocks.at(i).find("layers") != m_ConfigBlocks.at(i).end()); - nvinfer1::ILayer* out = routeLayer(i, m_ConfigBlocks.at(i), tensorOutputs, &network); - previous = out->getOutput(0); + std::string layers; + previous = routeLayer(i, layers, m_ConfigBlocks.at(i), tensorOutputs, &network); assert(previous != nullptr); - channels =
getNumChannels(previous); std::string outputVol = dimsToString(previous->getDimensions()); tensorOutputs.push_back(previous); - printLayerInfo(layerIndex, "route", " -", outputVol, std::to_string(weightPtr)); + std::string layerName = "route: " + layers; + printLayerInfo(layerIndex, layerName, "-", outputVol, "-"); } else if (m_ConfigBlocks.at(i).at("type") == "upsample") { std::string inputVol = dimsToString(previous->getDimensions()); - nvinfer1::ILayer* out = upsampleLayer(i - 1, m_ConfigBlocks[i], previous, &network); - previous = out->getOutput(0); + previous = upsampleLayer(i, m_ConfigBlocks[i], previous, &network); assert(previous != nullptr); std::string outputVol = dimsToString(previous->getDimensions()); tensorOutputs.push_back(previous); - printLayerInfo(layerIndex, "upsample", inputVol, outputVol, " -"); + std::string layerName = "upsample"; + printLayerInfo(layerIndex, layerName, inputVol, outputVol, "-"); } - else if (m_ConfigBlocks.at(i).at("type") == "maxpool") + else if (m_ConfigBlocks.at(i).at("type") == "maxpool" || m_ConfigBlocks.at(i).at("type") == "avgpool") { std::string inputVol = dimsToString(previous->getDimensions()); - nvinfer1::ILayer* out = maxpoolLayer(i, m_ConfigBlocks.at(i), previous, &network); - previous = out->getOutput(0); + previous = poolingLayer(i, m_ConfigBlocks.at(i), previous, &network); assert(previous != nullptr); std::string outputVol = dimsToString(previous->getDimensions()); tensorOutputs.push_back(previous); - printLayerInfo(layerIndex, "maxpool", inputVol, outputVol, std::to_string(weightPtr)); + std::string layerName = m_ConfigBlocks.at(i).at("type"); + printLayerInfo(layerIndex, layerName, inputVol, outputVol, "-"); } else if (m_ConfigBlocks.at(i).at("type") == "reorg") { - if (m_NetworkType.find("yolov5") != std::string::npos || m_NetworkType.find("yolor") != std::string::npos) - { - std::string inputVol = dimsToString(previous->getDimensions()); - nvinfer1::ILayer* out = reorgV5Layer(i, previous, &network); - previous = out->getOutput(0); - assert(previous != nullptr); - channels = getNumChannels(previous); - std::string outputVol = dimsToString(previous->getDimensions()); - tensorOutputs.push_back(previous); - std::string layerType = "reorgV5"; - printLayerInfo(layerIndex, layerType, inputVol, outputVol, std::to_string(weightPtr)); - } - else - { - std::string inputVol = dimsToString(previous->getDimensions()); - nvinfer1::IPluginV2* reorgPlugin = createReorgPlugin(2); - assert(reorgPlugin != nullptr); - nvinfer1::IPluginV2Layer* reorg = - network.addPluginV2(&previous, 1, *reorgPlugin); - assert(reorg != nullptr); - std::string layerName = "reorg_" + std::to_string(i); - reorg->setName(layerName.c_str()); - previous = reorg->getOutput(0); - assert(previous != nullptr); - std::string outputVol = dimsToString(previous->getDimensions()); - channels = getNumChannels(previous); - tensorOutputs.push_back(reorg->getOutput(0)); - printLayerInfo(layerIndex, "reorg", inputVol, outputVol, std::to_string(weightPtr)); - } + std::string inputVol = dimsToString(previous->getDimensions()); + previous = reorgLayer(i, m_ConfigBlocks.at(i), previous, &network); + assert(previous != nullptr); + std::string outputVol = dimsToString(previous->getDimensions()); + tensorOutputs.push_back(previous); + std::string layerName = "reorg"; + printLayerInfo(layerIndex, layerName, inputVol, outputVol, "-"); + } + + else if (m_ConfigBlocks.at(i).at("type") == "reduce") + { + std::string inputVol = dimsToString(previous->getDimensions()); + previous = reduceLayer(i, 
m_ConfigBlocks.at(i), previous, &network); + assert(previous != nullptr); + std::string outputVol = dimsToString(previous->getDimensions()); + tensorOutputs.push_back(previous); + std::string layerName = "reduce"; + printLayerInfo(layerIndex, layerName, inputVol, outputVol, "-"); + } + + else if (m_ConfigBlocks.at(i).at("type") == "shuffle") + { + std::string layer; + std::string inputVol = dimsToString(previous->getDimensions()); + previous = shuffleLayer(i, layer, m_ConfigBlocks.at(i), previous, tensorOutputs, &network); + assert(previous != nullptr); + std::string outputVol = dimsToString(previous->getDimensions()); + tensorOutputs.push_back(previous); + std::string layerName = "shuffle: " + layer; + printLayerInfo(layerIndex, layerName, inputVol, outputVol, "-"); + } + + else if (m_ConfigBlocks.at(i).at("type") == "softmax") + { + std::string inputVol = dimsToString(previous->getDimensions()); + previous = softmaxLayer(i, m_ConfigBlocks.at(i), previous, &network); + assert(previous != nullptr); + std::string outputVol = dimsToString(previous->getDimensions()); + tensorOutputs.push_back(previous); + std::string layerName = "softmax"; + printLayerInfo(layerIndex, layerName, inputVol, outputVol, "-"); } else if (m_ConfigBlocks.at(i).at("type") == "yolo" || m_ConfigBlocks.at(i).at("type") == "region") { if (m_ConfigBlocks.at(i).at("type") == "yolo") - { if (m_NetworkType.find("yolor") != std::string::npos) modelType = 2; else modelType = 1; - } else modelType = 0; - std::string layerName = modelType != 0 ? "yolo_" + std::to_string(i) : "region_" + std::to_string(i); + std::string blobName = modelType != 0 ? "yolo_" + std::to_string(i) : "region_" + std::to_string(i); nvinfer1::Dims prevTensorDims = previous->getDimensions(); - TensorInfo& curYoloTensor = m_YoloTensors.at(inputYoloCount); - curYoloTensor.blobName = layerName; + TensorInfo& curYoloTensor = m_YoloTensors.at(yoloCountInputs); + curYoloTensor.blobName = blobName; curYoloTensor.gridSizeX = prevTensorDims.d[2]; curYoloTensor.gridSizeY = prevTensorDims.d[1]; std::string inputVol = dimsToString(previous->getDimensions()); - channels = getNumChannels(previous); tensorOutputs.push_back(previous); - yoloInputs.push_back(previous); - ++inputYoloCount; - printLayerInfo(layerIndex, modelType != 0 ? "yolo" : "region", inputVol, " -", " -"); + yoloTensorInputs[yoloCountInputs] = previous; + ++yoloCountInputs; + std::string layerName = modelType != 0 ? 
"yolo" : "region"; + printLayerInfo(layerIndex, layerName, inputVol, "-", "-"); + } + + else if (m_ConfigBlocks.at(i).at("type") == "cls") + { + modelType = 3; + + std::string blobName = "cls_" + std::to_string(i); + nvinfer1::Dims prevTensorDims = previous->getDimensions(); + TensorInfo& curYoloTensor = m_YoloTensors.at(yoloCountInputs); + curYoloTensor.blobName = blobName; + curYoloTensor.numBBoxes = prevTensorDims.d[1]; + m_NumClasses = prevTensorDims.d[0]; + + std::string inputVol = dimsToString(previous->getDimensions()); + previous = clsLayer(i, m_ConfigBlocks.at(i), previous, &network); + assert(previous != nullptr); + std::string outputVol = dimsToString(previous->getDimensions()); + tensorOutputs.push_back(previous); + yoloTensorInputs[yoloCountInputs] = previous; + ++yoloCountInputs; + std::string layerName = "cls"; + printLayerInfo(layerIndex, layerName, inputVol, outputVol, "-"); + } + + else if (m_ConfigBlocks.at(i).at("type") == "reg") + { + modelType = 3; + + std::string blobName = "reg_" + std::to_string(i); + nvinfer1::Dims prevTensorDims = previous->getDimensions(); + TensorInfo& curYoloTensor = m_YoloTensors.at(yoloCountInputs); + curYoloTensor.blobName = blobName; + curYoloTensor.numBBoxes = prevTensorDims.d[1]; + + std::string inputVol = dimsToString(previous->getDimensions()); + previous = regLayer(i, m_ConfigBlocks.at(i), weights, m_TrtWeights, weightPtr, previous, &network); + assert(previous != nullptr); + std::string outputVol = dimsToString(previous->getDimensions()); + tensorOutputs.push_back(previous); + yoloTensorInputs[yoloCountInputs] = previous; + ++yoloCountInputs; + std::string layerName = "reg"; + printLayerInfo(layerIndex, layerName, inputVol, outputVol, std::to_string(weightPtr)); } else @@ -403,17 +438,18 @@ NvDsInferStatus Yolo::buildYoloNetwork(std::vector& weights, nvinfer1::IN assert(0); } - if (m_YoloCount == inputYoloCount) + if (m_YoloCount == yoloCountInputs) { assert((modelType != -1) && "\nCould not determine model type"); - nvinfer1::ITensor* yoloInputTensors[inputYoloCount]; uint64_t outputSize = 0; - for (uint j = 0; j < inputYoloCount; ++j) + for (uint j = 0; j < yoloCountInputs; ++j) { - yoloInputTensors[j] = yoloInputs[j]; TensorInfo& curYoloTensor = m_YoloTensors.at(j); - outputSize += curYoloTensor.gridSizeX * curYoloTensor.gridSizeY * curYoloTensor.numBBoxes; + if (modelType == 3) + outputSize = curYoloTensor.numBBoxes; + else + outputSize += curYoloTensor.gridSizeX * curYoloTensor.gridSizeY * curYoloTensor.numBBoxes; } if (m_TopK > outputSize) { @@ -422,21 +458,15 @@ NvDsInferStatus Yolo::buildYoloNetwork(std::vector& weights, nvinfer1::IN assert(0); } - std::string layerName = "yolo"; nvinfer1::IPluginV2* yoloPlugin = new YoloLayer( - m_InputW, m_InputH, m_NumClasses, m_NewCoords, m_YoloTensors, outputSize, modelType, m_TopK, - m_ScoreThreshold); + m_InputW, m_InputH, m_NumClasses, m_NewCoords, m_YoloTensors, outputSize, modelType, m_TopK, m_ScoreThreshold); assert(yoloPlugin != nullptr); - nvinfer1::IPluginV2Layer* yolo = network.addPluginV2(yoloInputTensors, inputYoloCount, *yoloPlugin); + nvinfer1::IPluginV2Layer* yolo = network.addPluginV2(yoloTensorInputs, m_YoloCount, *yoloPlugin); assert(yolo != nullptr); - yolo->setName(layerName.c_str()); - previous = yolo->getOutput(0); - assert(previous != nullptr); - previous->setName(layerName.c_str()); - tensorOutputs.push_back(yolo->getOutput(0)); + std::string yoloLayerName = "yolo"; + yolo->setName(yoloLayerName.c_str()); - nvinfer1::ITensor* yoloTensors[] = {yolo->getOutput(0), 
yolo->getOutput(1)}; - std::string outputVol = dimsToString(previous->getDimensions()); + nvinfer1::ITensor* yoloTensorOutputs[] = {yolo->getOutput(0), yolo->getOutput(1)}; nvinfer1::plugin::NMSParameters nmsParams; nmsParams.shareLocation = true; @@ -448,28 +478,28 @@ NvDsInferStatus Yolo::buildYoloNetwork(std::vector<float>& weights, nvinfer1::IN nmsParams.iouThreshold = m_IouThreshold; nmsParams.isNormalized = false; - layerName = "batchedNMS"; + std::string nmslayerName = "batchedNMS"; nvinfer1::IPluginV2* batchedNMS = createBatchedNMSPlugin(nmsParams); - nvinfer1::IPluginV2Layer* nms = network.addPluginV2(yoloTensors, 2, *batchedNMS); - nms->setName(layerName.c_str()); + nvinfer1::IPluginV2Layer* nms = network.addPluginV2(yoloTensorOutputs, 2, *batchedNMS); + nms->setName(nmslayerName.c_str()); nvinfer1::ITensor* num_detections = nms->getOutput(0); - layerName = "num_detections"; - num_detections->setName(layerName.c_str()); + nmslayerName = "num_detections"; + num_detections->setName(nmslayerName.c_str()); nvinfer1::ITensor* nmsed_boxes = nms->getOutput(1); - layerName = "nmsed_boxes"; - nmsed_boxes->setName(layerName.c_str()); + nmslayerName = "nmsed_boxes"; + nmsed_boxes->setName(nmslayerName.c_str()); nvinfer1::ITensor* nmsed_scores = nms->getOutput(2); - layerName = "nmsed_scores"; - nmsed_scores->setName(layerName.c_str()); + nmslayerName = "nmsed_scores"; + nmsed_scores->setName(nmslayerName.c_str()); nvinfer1::ITensor* nmsed_classes = nms->getOutput(3); - layerName = "nmsed_classes"; - nmsed_classes->setName(layerName.c_str()); + nmslayerName = "nmsed_classes"; + nmsed_classes->setName(nmslayerName.c_str()); network.markOutput(*num_detections); network.markOutput(*nmsed_boxes); network.markOutput(*nmsed_scores); network.markOutput(*nmsed_classes); - printLayerInfo("", "batched_nms", " -", outputVol, " -"); + printLayerInfo("", "batched_nms", "-", "-", "-"); } else { std::cout << "\nError in yolo cfg file" << std::endl; @@ -620,6 +650,12 @@ void Yolo::parseConfigBlocks() m_YoloTensors.push_back(outputTensor); } + else if ((block.at("type") == "cls") || (block.at("type") == "reg")) + { + ++m_YoloCount; + TensorInfo outputTensor; + m_YoloTensors.push_back(outputTensor); + } } } @@ -640,9 +676,7 @@ void Yolo::parseConfigNMSBlocks() void Yolo::destroyNetworkUtils() { for (uint i = 0; i < m_TrtWeights.size(); ++i) - { if (m_TrtWeights[i].count > 0) free(const_cast<void*>(m_TrtWeights[i].values)); - } m_TrtWeights.clear(); } diff --git a/nvdsinfer_custom_impl_Yolo/yolo.h b/nvdsinfer_custom_impl_Yolo/yolo.h index 038a2ec..1dd75f5 100644 --- a/nvdsinfer_custom_impl_Yolo/yolo.h +++ b/nvdsinfer_custom_impl_Yolo/yolo.h @@ -33,8 +33,13 @@ #include "layers/shortcut_layer.h" #include "layers/route_layer.h" #include "layers/upsample_layer.h" -#include "layers/maxpool_layer.h" -#include "layers/reorgv5_layer.h" +#include "layers/pooling_layer.h" +#include "layers/reorg_layer.h" +#include "layers/reduce_layer.h" +#include "layers/shuffle_layer.h" +#include "layers/softmax_layer.h" +#include "layers/cls_layer.h" +#include "layers/reg_layer.h" #include "nvdsinfer_custom_impl.h" diff --git a/nvdsinfer_custom_impl_Yolo/yoloForward.cu b/nvdsinfer_custom_impl_Yolo/yoloForward.cu index fb03854..a3d106f 100644 --- a/nvdsinfer_custom_impl_Yolo/yoloForward.cu +++ b/nvdsinfer_custom_impl_Yolo/yoloForward.cu @@ -28,7 +28,7 @@ __global__ void gpuYoloLayer( if (objectness < scoreThreshold) return; - int count = (int)atomicAdd(&countData[0], 1); + int count = (int)atomicAdd(countData, 1); const float alpha = scaleXY; const
float beta = -0.5 * (scaleXY - 1); diff --git a/nvdsinfer_custom_impl_Yolo/yoloForward_e.cu b/nvdsinfer_custom_impl_Yolo/yoloForward_e.cu new file mode 100644 index 0000000..2d34d39 --- /dev/null +++ b/nvdsinfer_custom_impl_Yolo/yoloForward_e.cu @@ -0,0 +1,73 @@ +/* + * Created by Marcos Luciano + * https://www.github.com/marcoslucianops + */ + +#include <cuda_runtime.h> +#include <cstdint> + +__global__ void gpuYoloLayer_e( + const float* cls, const float* reg, int* d_indexes, float* d_scores, float* d_boxes, int* d_classes, int* countData, + const float scoreThreshold, const uint netWidth, const uint netHeight, const uint numOutputClasses, + const uint64_t outputSize) +{ + uint x_id = blockIdx.x * blockDim.x + threadIdx.x; + + if (x_id >= outputSize) + return; + + float maxProb = 0.0f; + int maxIndex = -1; + + for (uint i = 0; i < numOutputClasses; ++i) + { + float prob + = cls[x_id * numOutputClasses + i]; + + if (prob > maxProb) + { + maxProb = prob; + maxIndex = i; + } + } + + if (maxProb < scoreThreshold) + return; + + int count = (int)atomicAdd(countData, 1); + + d_indexes[count] = count; + d_scores[count] = maxProb + 1.f; + d_boxes[count * 4 + 0] = reg[x_id * 4 + 0]; + d_boxes[count * 4 + 1] = reg[x_id * 4 + 1]; + d_boxes[count * 4 + 2] = reg[x_id * 4 + 2]; + d_boxes[count * 4 + 3] = reg[x_id * 4 + 3]; + d_classes[count] = maxIndex; +} + +cudaError_t cudaYoloLayer_e( + const void* cls, const void* reg, void* d_indexes, void* d_scores, void* d_boxes, void* d_classes, void* countData, + const uint& batchSize, uint64_t& outputSize, const float& scoreThreshold, const uint& netWidth, const uint& netHeight, + const uint& numOutputClasses, cudaStream_t stream); + +cudaError_t cudaYoloLayer_e( + const void* cls, const void* reg, void* d_indexes, void* d_scores, void* d_boxes, void* d_classes, void* countData, + const uint& batchSize, uint64_t& outputSize, const float& scoreThreshold, const uint& netWidth, const uint& netHeight, + const uint& numOutputClasses, cudaStream_t stream) +{ + int threads_per_block = 16; + int number_of_blocks = 525; + + for (unsigned int batch = 0; batch < batchSize; ++batch) + { + gpuYoloLayer_e<<<number_of_blocks, threads_per_block, 0, stream>>>( + reinterpret_cast<const float*>(cls) + (batch * numOutputClasses * outputSize), + reinterpret_cast<const float*>(reg) + (batch * 4 * outputSize), + reinterpret_cast<int*>(d_indexes) + (batch * outputSize), + reinterpret_cast<float*>(d_scores) + (batch * outputSize), + reinterpret_cast<float*>(d_boxes) + (batch * 4 * outputSize), + reinterpret_cast<int*>(d_classes) + (batch * outputSize), reinterpret_cast<int*>(countData) + (batch), + scoreThreshold, netWidth, netHeight, numOutputClasses, outputSize); + } + return cudaGetLastError(); +} diff --git a/nvdsinfer_custom_impl_Yolo/yoloForward_nc.cu b/nvdsinfer_custom_impl_Yolo/yoloForward_nc.cu index ff850c3..9b81910 100644 --- a/nvdsinfer_custom_impl_Yolo/yoloForward_nc.cu +++ b/nvdsinfer_custom_impl_Yolo/yoloForward_nc.cu @@ -26,7 +26,7 @@ __global__ void gpuYoloLayer_nc( if (objectness < scoreThreshold) return; - int count = (int)atomicAdd(&countData[0], 1); + int count = (int)atomicAdd(countData, 1); const float alpha = scaleXY; const float beta = -0.5 * (scaleXY - 1); diff --git a/nvdsinfer_custom_impl_Yolo/yoloForward_r.cu b/nvdsinfer_custom_impl_Yolo/yoloForward_r.cu index 46bda23..cbc1aaf 100644 --- a/nvdsinfer_custom_impl_Yolo/yoloForward_r.cu +++ b/nvdsinfer_custom_impl_Yolo/yoloForward_r.cu @@ -28,7 +28,7 @@ __global__ void gpuYoloLayer_r( if (objectness < scoreThreshold) return; - int count = (int)atomicAdd(&countData[0], 1); + int count = (int)atomicAdd(countData, 1); const float alpha = scaleXY;
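`cudaYoloLayer_e` above launches with a hard-coded 16 × 525 = 8400 threads, which matches the 80·80 + 40·40 + 20·20 predictions of a 640×640 PP-YOLOE model. A hedged sketch of deriving the grid from `outputSize` instead (the helper is illustrative, not what the patch does):

```
#include <cstdint>

// Sketch: size-agnostic launch math for gpuYoloLayer_e. 16 * 525 = 8400 is the
// prediction count of a 640x640 PP-YOLOE model; other input sizes would need a
// different block count.
static int blocksFor(uint64_t outputSize, int threadsPerBlock = 16)
{
  return static_cast<int>((outputSize + threadsPerBlock - 1) / threadsPerBlock);
}
// usage: gpuYoloLayer_e<<<blocksFor(outputSize), 16, 0, stream>>>(...);
```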
const float beta = -0.5 * (scaleXY - 1); diff --git a/nvdsinfer_custom_impl_Yolo/yoloForward_v2.cu b/nvdsinfer_custom_impl_Yolo/yoloForward_v2.cu index a1e0cbc..43c57d9 100644 --- a/nvdsinfer_custom_impl_Yolo/yoloForward_v2.cu +++ b/nvdsinfer_custom_impl_Yolo/yoloForward_v2.cu @@ -49,7 +49,7 @@ __global__ void gpuRegionLayer( if (objectness < scoreThreshold) return; - int count = (int)atomicAdd(&countData[0], 1); + int count = (int)atomicAdd(countData, 1); float x = (sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 0)]) diff --git a/nvdsinfer_custom_impl_Yolo/yoloPlugins.cpp b/nvdsinfer_custom_impl_Yolo/yoloPlugins.cpp index 504e68c..75dfe25 100644 --- a/nvdsinfer_custom_impl_Yolo/yoloPlugins.cpp +++ b/nvdsinfer_custom_impl_Yolo/yoloPlugins.cpp @@ -47,6 +47,11 @@ namespace { } } +cudaError_t cudaYoloLayer_e( + const void* cls, const void* reg, void* d_indexes, void* d_scores, void* d_boxes, void* d_classes, void* countData, + const uint& batchSize, uint64_t& outputSize, const float& scoreThreshold, const uint& netWidth, const uint& netHeight, + const uint& numOutputClasses, cudaStream_t stream); + cudaError_t cudaYoloLayer_r( const void* input, void* d_indexes, void* d_scores, void* d_boxes, void* d_classes, void* countData, const uint& batchSize, uint64_t& inputSize, uint64_t& outputSize, const float& scoreThreshold, const uint& netWidth, @@ -88,34 +93,36 @@ YoloLayer::YoloLayer (const void* data, size_t length) read(d, m_TopK); read(d, m_ScoreThreshold); - uint yoloTensorsSize; - read(d, yoloTensorsSize); - for (uint i = 0; i < yoloTensorsSize; ++i) - { - TensorInfo curYoloTensor; - read(d, curYoloTensor.gridSizeX); - read(d, curYoloTensor.gridSizeY); - read(d, curYoloTensor.numBBoxes); - read(d, curYoloTensor.scaleXY); - - uint anchorsSize; - read(d, anchorsSize); - for (uint j = 0; j < anchorsSize; j++) + if (m_Type != 3) { + uint yoloTensorsSize; + read(d, yoloTensorsSize); + for (uint i = 0; i < yoloTensorsSize; ++i) { - float result; - read(d, result); - curYoloTensor.anchors.push_back(result); - } + TensorInfo curYoloTensor; + read(d, curYoloTensor.gridSizeX); + read(d, curYoloTensor.gridSizeY); + read(d, curYoloTensor.numBBoxes); + read(d, curYoloTensor.scaleXY); - uint maskSize; - read(d, maskSize); - for (uint j = 0; j < maskSize; j++) - { - int result; - read(d, result); - curYoloTensor.mask.push_back(result); + uint anchorsSize; + read(d, anchorsSize); + for (uint j = 0; j < anchorsSize; j++) + { + float result; + read(d, result); + curYoloTensor.anchors.push_back(result); + } + + uint maskSize; + read(d, maskSize); + for (uint j = 0; j < maskSize; j++) + { + int result; + read(d, result); + curYoloTensor.mask.push_back(result); + } + m_YoloTensors.push_back(curYoloTensor); } - m_YoloTensors.push_back(curYoloTensor); } kNUM_CLASSES = m_NumClasses; @@ -147,9 +154,9 @@ YoloLayer::getOutputDimensions( { assert(index < 3); if (index == 0) { - return nvinfer1::Dims3(m_TopK, 1, 4); + return nvinfer1::Dims{3, {static_cast<int>(m_TopK), 1, 4}}; } - return nvinfer1::DimsHW(m_TopK, m_NumClasses); + return nvinfer1::Dims{2, {static_cast<int>(m_TopK), static_cast<int>(m_NumClasses)}}; } bool YoloLayer::supportsFormat ( @@ -173,95 +180,106 @@ int32_t YoloLayer::enqueue ( int batchSize, void const* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) noexcept { - void* countData = workspace; void* bboxData = outputs[0]; void* scoreData = outputs[1]; - CUDA_CHECK(cudaMemsetAsync((int*)countData, 0, sizeof(int) * batchSize, stream));
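The `enqueue` rewrite that follows moves the plugin's scratch buffers from pinned host memory (`cudaMallocHost`) to device memory (`cudaMalloc`), and allocates `countData` itself now that `getWorkspaceSize` returns 0. A minimal sketch of the per-call pattern, with error checking elided and an illustrative helper name:

```
#include <cuda_runtime.h>
#include <cstddef>

// Sketch: per-enqueue device scratch buffer, zeroed asynchronously on the
// plugin's stream and freed by the caller once the decode kernels finish.
static float* allocScratch(size_t count, cudaStream_t stream)
{
  float* d = nullptr;
  cudaMalloc(&d, count * sizeof(float));          // device, not pinned host
  cudaMemsetAsync(d, 0, count * sizeof(float), stream);
  return d;                                       // caller: cudaFree(d)
}
```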
CUDA_CHECK(cudaMemsetAsync((float*)bboxData, 0, sizeof(float) * m_TopK * 4 * batchSize, stream)); CUDA_CHECK(cudaMemsetAsync((float*)scoreData, 0, sizeof(float) * m_TopK * m_NumClasses * batchSize, stream)); + void* countData; + CUDA_CHECK(cudaMalloc(&countData, sizeof(int) * batchSize)); + CUDA_CHECK(cudaMemsetAsync((int*)countData, 0, sizeof(int) * batchSize, stream)); + void* d_indexes; - CUDA_CHECK(cudaMallocHost(&d_indexes, sizeof(int) * m_OutputSize * batchSize)); - CUDA_CHECK(cudaMemsetAsync((float*)d_indexes, 0, sizeof(int) * m_OutputSize * batchSize, stream)); + CUDA_CHECK(cudaMalloc(&d_indexes, sizeof(int) * m_OutputSize * batchSize)); + CUDA_CHECK(cudaMemsetAsync((int*)d_indexes, 0, sizeof(int) * m_OutputSize * batchSize, stream)); void* d_scores; - CUDA_CHECK(cudaMallocHost(&d_scores, sizeof(float) * m_OutputSize * batchSize)); + CUDA_CHECK(cudaMalloc(&d_scores, sizeof(float) * m_OutputSize * batchSize)); CUDA_CHECK(cudaMemsetAsync((float*)d_scores, 0, sizeof(float) * m_OutputSize * batchSize, stream)); void* d_boxes; - CUDA_CHECK(cudaMallocHost(&d_boxes, sizeof(float) * m_OutputSize * 4 * batchSize)); + CUDA_CHECK(cudaMalloc(&d_boxes, sizeof(float) * m_OutputSize * 4 * batchSize)); CUDA_CHECK(cudaMemsetAsync((float*)d_boxes, 0, sizeof(float) * m_OutputSize * 4 * batchSize, stream)); void* d_classes; - CUDA_CHECK(cudaMallocHost(&d_classes, sizeof(int) * m_OutputSize * batchSize)); + CUDA_CHECK(cudaMalloc(&d_classes, sizeof(int) * m_OutputSize * batchSize)); CUDA_CHECK(cudaMemsetAsync((float*)d_classes, 0, sizeof(int) * m_OutputSize * batchSize, stream)); - uint yoloTensorsSize = m_YoloTensors.size(); - for (uint i = 0; i < yoloTensorsSize; ++i) + if (m_Type == 3) { - TensorInfo& curYoloTensor = m_YoloTensors.at(i); + CUDA_CHECK(cudaYoloLayer_e( + inputs[0], inputs[1], d_indexes, d_scores, d_boxes, d_classes, countData, batchSize, m_OutputSize, + m_ScoreThreshold, m_NetWidth, m_NetHeight, m_NumClasses, stream)); + } + else + { + uint yoloTensorsSize = m_YoloTensors.size(); + for (uint i = 0; i < yoloTensorsSize; ++i) + { + TensorInfo& curYoloTensor = m_YoloTensors.at(i); - uint numBBoxes = curYoloTensor.numBBoxes; - float scaleXY = curYoloTensor.scaleXY; - uint gridSizeX = curYoloTensor.gridSizeX; - uint gridSizeY = curYoloTensor.gridSizeY; - std::vector<float> anchors = curYoloTensor.anchors; - std::vector<int> mask = curYoloTensor.mask; + uint numBBoxes = curYoloTensor.numBBoxes; + float scaleXY = curYoloTensor.scaleXY; + uint gridSizeX = curYoloTensor.gridSizeX; + uint gridSizeY = curYoloTensor.gridSizeY; + std::vector<float> anchors = curYoloTensor.anchors; + std::vector<int> mask = curYoloTensor.mask; - void* v_anchors; - void* v_mask; - if (anchors.size() > 0) { - float* f_anchors = anchors.data(); - CUDA_CHECK(cudaMallocHost(&v_anchors, sizeof(float) * anchors.size())); - CUDA_CHECK(cudaMemcpy(v_anchors, f_anchors, sizeof(float) * anchors.size(), cudaMemcpyHostToDevice)); - } - if (mask.size() > 0) { - int* f_mask = mask.data(); - CUDA_CHECK(cudaMallocHost(&v_mask, sizeof(int) * mask.size())); - CUDA_CHECK(cudaMemcpy(v_mask, f_mask, sizeof(int) * mask.size(), cudaMemcpyHostToDevice)); - } + void* v_anchors; + void* v_mask; + if (anchors.size() > 0) { + float* f_anchors = anchors.data(); + CUDA_CHECK(cudaMalloc(&v_anchors, sizeof(float) * anchors.size())); + CUDA_CHECK(cudaMemcpy(v_anchors, f_anchors, sizeof(float) * anchors.size(), cudaMemcpyHostToDevice)); + } + if (mask.size() > 0) { + int* f_mask = mask.data(); + CUDA_CHECK(cudaMalloc(&v_mask, sizeof(int) * mask.size())); +
CUDA_CHECK(cudaMemcpy(v_mask, f_mask, sizeof(int) * mask.size(), cudaMemcpyHostToDevice)); + } - uint64_t inputSize = gridSizeX * gridSizeY * (numBBoxes * (4 + 1 + m_NumClasses)); + uint64_t inputSize = gridSizeX * gridSizeY * (numBBoxes * (4 + 1 + m_NumClasses)); - if (m_Type == 2) { // YOLOR incorrect param: scale_x_y = 2.0 - CUDA_CHECK(cudaYoloLayer_r( - inputs[i], d_indexes, d_scores, d_boxes, d_classes, countData, batchSize, inputSize, m_OutputSize, - m_ScoreThreshold, m_NetWidth, m_NetHeight, gridSizeX, gridSizeY, m_NumClasses, numBBoxes, 2.0, v_anchors, - v_mask, stream)); - } - else if (m_Type == 1) { - if (m_NewCoords) { - CUDA_CHECK(cudaYoloLayer_nc( + if (m_Type == 2) { // YOLOR incorrect param: scale_x_y = 2.0 + CUDA_CHECK(cudaYoloLayer_r( inputs[i], d_indexes, d_scores, d_boxes, d_classes, countData, batchSize, inputSize, m_OutputSize, - m_ScoreThreshold, m_NetWidth, m_NetHeight, gridSizeX, gridSizeY, m_NumClasses, numBBoxes, scaleXY, - v_anchors, v_mask, stream)); + m_ScoreThreshold, m_NetWidth, m_NetHeight, gridSizeX, gridSizeY, m_NumClasses, numBBoxes, 2.0, v_anchors, + v_mask, stream)); + } + else if (m_Type == 1) { + if (m_NewCoords) { + CUDA_CHECK(cudaYoloLayer_nc( + inputs[i], d_indexes, d_scores, d_boxes, d_classes, countData, batchSize, inputSize, m_OutputSize, + m_ScoreThreshold, m_NetWidth, m_NetHeight, gridSizeX, gridSizeY, m_NumClasses, numBBoxes, scaleXY, + v_anchors, v_mask, stream)); + } + else { + CUDA_CHECK(cudaYoloLayer( + inputs[i], d_indexes, d_scores, d_boxes, d_classes, countData, batchSize, inputSize, m_OutputSize, + m_ScoreThreshold, m_NetWidth, m_NetHeight, gridSizeX, gridSizeY, m_NumClasses, numBBoxes, scaleXY, + v_anchors, v_mask, stream)); + } } else { - CUDA_CHECK(cudaYoloLayer( - inputs[i], d_indexes, d_scores, d_boxes, d_classes, countData, batchSize, inputSize, m_OutputSize, - m_ScoreThreshold, m_NetWidth, m_NetHeight, gridSizeX, gridSizeY, m_NumClasses, numBBoxes, scaleXY, - v_anchors, v_mask, stream)); + void* softmax; + CUDA_CHECK(cudaMalloc(&softmax, sizeof(float) * inputSize * batchSize)); + CUDA_CHECK(cudaMemsetAsync((float*)softmax, 0, sizeof(float) * inputSize * batchSize)); + + CUDA_CHECK(cudaRegionLayer( + inputs[i], softmax, d_indexes, d_scores, d_boxes, d_classes, countData, batchSize, inputSize, m_OutputSize, + m_ScoreThreshold, m_NetWidth, m_NetHeight, gridSizeX, gridSizeY, m_NumClasses, numBBoxes, v_anchors, + stream)); + + CUDA_CHECK(cudaFree(softmax)); } - } - else { - void* softmax; - CUDA_CHECK(cudaMallocHost(&softmax, sizeof(float) * inputSize * batchSize)); - CUDA_CHECK(cudaMemsetAsync((float*)softmax, 0, sizeof(float) * inputSize * batchSize)); - CUDA_CHECK(cudaRegionLayer( - inputs[i], softmax, d_indexes, d_scores, d_boxes, d_classes, countData, batchSize, inputSize, m_OutputSize, - m_ScoreThreshold, m_NetWidth, m_NetHeight, gridSizeX, gridSizeY, m_NumClasses, numBBoxes, v_anchors, - stream)); - - CUDA_CHECK(cudaFreeHost(softmax)); - } - - if (anchors.size() > 0) { - CUDA_CHECK(cudaFreeHost(v_anchors)); - } - if (mask.size() > 0) { - CUDA_CHECK(cudaFreeHost(v_mask)); + if (anchors.size() > 0) { + CUDA_CHECK(cudaFree(v_anchors)); + } + if (mask.size() > 0) { + CUDA_CHECK(cudaFree(v_mask)); + } } } @@ -269,10 +287,11 @@ int32_t YoloLayer::enqueue ( d_indexes, d_scores, d_boxes, d_classes, bboxData, scoreData, countData, batchSize, m_OutputSize, m_TopK, m_NumClasses, stream)); - CUDA_CHECK(cudaFreeHost(d_indexes)); - CUDA_CHECK(cudaFreeHost(d_scores)); - CUDA_CHECK(cudaFreeHost(d_boxes)); - 
CUDA_CHECK(cudaFreeHost(d_classes)); + CUDA_CHECK(cudaFree(countData)); + CUDA_CHECK(cudaFree(d_indexes)); + CUDA_CHECK(cudaFree(d_scores)); + CUDA_CHECK(cudaFree(d_boxes)); + CUDA_CHECK(cudaFree(d_classes)); return 0; } @@ -290,18 +309,20 @@ size_t YoloLayer::getSerializationSize() const noexcept totalSize += sizeof(m_TopK); totalSize += sizeof(m_ScoreThreshold); - uint yoloTensorsSize = m_YoloTensors.size(); - totalSize += sizeof(yoloTensorsSize); + if (m_Type != 3) { + uint yoloTensorsSize = m_YoloTensors.size(); + totalSize += sizeof(yoloTensorsSize); - for (uint i = 0; i < yoloTensorsSize; ++i) - { - const TensorInfo& curYoloTensor = m_YoloTensors.at(i); - totalSize += sizeof(curYoloTensor.gridSizeX); - totalSize += sizeof(curYoloTensor.gridSizeY); - totalSize += sizeof(curYoloTensor.numBBoxes); - totalSize += sizeof(curYoloTensor.scaleXY); - totalSize += sizeof(uint) + sizeof(curYoloTensor.anchors[0]) * curYoloTensor.anchors.size(); - totalSize += sizeof(uint) + sizeof(curYoloTensor.mask[0]) * curYoloTensor.mask.size(); + for (uint i = 0; i < yoloTensorsSize; ++i) + { + const TensorInfo& curYoloTensor = m_YoloTensors.at(i); + totalSize += sizeof(curYoloTensor.gridSizeX); + totalSize += sizeof(curYoloTensor.gridSizeY); + totalSize += sizeof(curYoloTensor.numBBoxes); + totalSize += sizeof(curYoloTensor.scaleXY); + totalSize += sizeof(uint) + sizeof(curYoloTensor.anchors[0]) * curYoloTensor.anchors.size(); + totalSize += sizeof(uint) + sizeof(curYoloTensor.mask[0]) * curYoloTensor.mask.size(); + } } return totalSize; @@ -320,28 +341,30 @@ void YoloLayer::serialize(void* buffer) const noexcept write(d, m_TopK); write(d, m_ScoreThreshold); - uint yoloTensorsSize = m_YoloTensors.size(); - write(d, yoloTensorsSize); - for (uint i = 0; i < yoloTensorsSize; ++i) - { - const TensorInfo& curYoloTensor = m_YoloTensors.at(i); - write(d, curYoloTensor.gridSizeX); - write(d, curYoloTensor.gridSizeY); - write(d, curYoloTensor.numBBoxes); - write(d, curYoloTensor.scaleXY); - - uint anchorsSize = curYoloTensor.anchors.size(); - write(d, anchorsSize); - for (uint j = 0; j < anchorsSize; ++j) + if (m_Type != 3) { + uint yoloTensorsSize = m_YoloTensors.size(); + write(d, yoloTensorsSize); + for (uint i = 0; i < yoloTensorsSize; ++i) { - write(d, curYoloTensor.anchors[j]); - } + const TensorInfo& curYoloTensor = m_YoloTensors.at(i); + write(d, curYoloTensor.gridSizeX); + write(d, curYoloTensor.gridSizeY); + write(d, curYoloTensor.numBBoxes); + write(d, curYoloTensor.scaleXY); - uint maskSize = curYoloTensor.mask.size(); - write(d, maskSize); - for (uint j = 0; j < maskSize; ++j) - { - write(d, curYoloTensor.mask[j]); + uint anchorsSize = curYoloTensor.anchors.size(); + write(d, anchorsSize); + for (uint j = 0; j < anchorsSize; ++j) + { + write(d, curYoloTensor.anchors[j]); + } + + uint maskSize = curYoloTensor.mask.size(); + write(d, maskSize); + for (uint j = 0; j < maskSize; ++j) + { + write(d, curYoloTensor.mask[j]); + } } } } diff --git a/nvdsinfer_custom_impl_Yolo/yoloPlugins.h b/nvdsinfer_custom_impl_Yolo/yoloPlugins.h index e7ae524..2948e2d 100644 --- a/nvdsinfer_custom_impl_Yolo/yoloPlugins.h +++ b/nvdsinfer_custom_impl_Yolo/yoloPlugins.h @@ -85,9 +85,7 @@ public: void terminate () noexcept override {} - size_t getWorkspaceSize (int maxBatchSize) const noexcept override { - return maxBatchSize * sizeof(int); - } + size_t getWorkspaceSize (int maxBatchSize) const noexcept override { return 0; } int32_t enqueue ( int batchSize, void const* const* inputs, void* const* outputs, void* workspace, 
cudaStream_t stream) diff --git a/readme.md b/readme.md index 1916a8c..e38a7e1 100644 --- a/readme.md +++ b/readme.md @@ -7,7 +7,6 @@ NVIDIA DeepStream SDK 6.1 / 6.0.1 / 6.0 configuration for YOLO models * Models benchmarks * DeepStream tutorials * YOLOX support -* PP-YOLO support * YOLOv6 support * YOLOv7 support * Dynamic batch-size @@ -23,18 +22,18 @@ NVIDIA DeepStream SDK 6.1 / 6.0.1 / 6.0 configuration for YOLO models * Support for INT8 calibration * Support for non square models * New documentation for multiple models -* **YOLOv5 >= 2.0 support** -* **YOLOR support** +* YOLOv5 support +* YOLOR support * **GPU YOLO Decoder** [#138](https://github.com/marcoslucianops/DeepStream-Yolo/issues/138) * **GPU Batched NMS** [#142](https://github.com/marcoslucianops/DeepStream-Yolo/issues/142) -* **New YOLOv5 conversion** +* **PP-YOLOE support** ## ### Getting started * [Requirements](#requirements) -* [Tested models](#tested-models) +* [Supported models](#supported-models) * [Benchmarks](#benchmarks) * [dGPU installation](#dgpu-installation) * [Basic usage](#basic-usage) @@ -42,6 +41,7 @@ NVIDIA DeepStream SDK 6.1 / 6.0.1 / 6.0 configuration for YOLO models * [INT8 calibration](#int8-calibration) * [YOLOv5 usage](docs/YOLOv5.md) * [YOLOR usage](docs/YOLOR.md) +* [PP-YOLOE usage](docs/PPYOLOE.md) * [Using your custom model](docs/customModels.md) * [Multiple YOLO GIEs](docs/multipleGIEs.md) @@ -81,23 +81,14 @@ NVIDIA DeepStream SDK 6.1 / 6.0.1 / 6.0 configuration for YOLO models * [NVIDIA DeepStream SDK 6.0.1 / 6.0](https://developer.nvidia.com/embedded/deepstream-on-jetson-downloads-archived) * [DeepStream-Yolo](https://github.com/marcoslucianops/DeepStream-Yolo) -### For YOLOv5 and YOLOR - -#### x86 platform - -* [PyTorch >= 1.7.0](https://pytorch.org/get-started/locally/) - -#### Jetson platform - -* [PyTorch >= 1.7.0](https://forums.developer.nvidia.com/t/pytorch-for-jetson-version-1-11-now-available/72048) - ## -### Tested models +### Supported models * [Darknet YOLO](https://github.com/AlexeyAB/darknet) * [YOLOv5 >= 2.0](https://github.com/ultralytics/yolov5) * [YOLOR](https://github.com/WongKinYiu/yolor) +* [PP-YOLOE](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.4/configs/ppyoloe) * [MobileNet-YOLO](https://github.com/dog-qiuqiu/MobileNet-Yolo) * [YOLO-Fastest](https://github.com/dog-qiuqiu/Yolo-Fastest) diff --git a/utils/gen_wts_ppyoloe.py b/utils/gen_wts_ppyoloe.py new file mode 100644 index 0000000..85fe20d --- /dev/null +++ b/utils/gen_wts_ppyoloe.py @@ -0,0 +1,437 @@ +import os +import struct +import paddle +import numpy as np +from ppdet.core.workspace import load_config, merge_config +from ppdet.utils.check import check_gpu, check_version, check_config +from ppdet.utils.cli import ArgsParser +from ppdet.engine import Trainer +from ppdet.slim import build_slim_model + +class Layers(object): + def __init__(self, size, fw, fc, letter_box): + self.blocks = [0 for _ in range(300)] + self.current = -1 + + self.backbone_outs = [] + self.neck_fpn_feats = [] + self.neck_pan_feats = [] + self.yolo_head_cls = [] + self.yolo_head_reg = [] + + self.width = size[0] if len(size) == 1 else size[1] + self.height = size[0] + self.letter_box = letter_box + + self.fw = fw + self.fc = fc + self.wc = 0 + + self.net() + + def ConvBNLayer(self, child): + self.current += 1 + + self.convolutional(child, act='swish') + + def CSPResStage(self, child, ret): + self.current += 1 + + if child.conv_down is not None: + self.convolutional(child.conv_down, act='swish') +
self.convolutional(child.conv1, act='swish') + self.route('-2') + self.convolutional(child.conv2, act='swish') + idx = -3 + for m in child.blocks: + self.convolutional(m.conv1, act='swish') + self.convolutional(m.conv2, act='swish') + self.shortcut(-3) + idx -= 3 + self.route('%d, -1' % idx) + if child.attn is not None: + self.reduce((1, 2), mode='mean', keepdim=True) + self.convolutional(child.attn.fc, act='hardsigmoid') + self.shortcut(-3, ew='mul') + self.convolutional(child.conv3, act='swish') + if ret is True: + self.backbone_outs.append(self.current) + + def CSPStage(self, child, stage): + self.current += 1 + + self.convolutional(child.conv1, act='swish') + self.route('-2') + self.convolutional(child.conv2, act='swish') + idx = -3 + for m in child.convs: + if m.__class__.__name__ == 'BasicBlock': + self.convolutional(m.conv1, act='swish') + self.convolutional(m.conv2, act='swish') + idx -= 2 + elif m.__class__.__name__ == 'SPP': + self.maxpool(m.pool0) + self.route('-2') + self.maxpool(m.pool1) + self.route('-4') + self.maxpool(m.pool2) + self.route('-6, -5, -3, -1') + self.convolutional(m.conv, act='swish') + idx -= 7 + self.route('%d, -1' % idx) + self.convolutional(child.conv3, act='swish') + if stage == 'fpn': + self.neck_fpn_feats.append(self.current) + elif stage == 'pan': + self.neck_pan_feats.append(self.current) + + def Concat(self, route): + self.current += 1 + + r = self.get_route(route) + self.route('-1, %d' % r) + + def Upsample(self): + self.current += 1 + + self.upsample() + + def AvgPool2d(self, route=None): + self.current += 1 + + if route is not None: + r = self.get_route(route) + self.route('%d' % r) + self.avgpool() + + def ESEAttn(self, child, route=0): + self.current += 1 + + if route < 0: + self.route('%d' % route) + self.convolutional(child.fc, act='sigmoid') + self.shortcut(route - 3, ew='mul') + self.convolutional(child.conv, act='swish') + if route == 0: + self.shortcut(-5) + + def Conv2D(self, child, act='linear'): + self.current += 1 + + self.convolutional(child, act=act) + + def Shuffle(self, reshape=None, transpose1=None, transpose2=None, route=None, output=''): + self.current += 1 + + r = 0 + if route is not None: + r = self.get_route(route) + self.shuffle(reshape=reshape, transpose1=transpose1, transpose2=transpose2, route=r) + if output == 'cls': + self.yolo_head_cls.append(self.current) + elif output == 'reg': + self.yolo_head_reg.append(self.current) + + def SoftMax(self, axes): + self.current += 1 + + self.softmax(axes) + + def Detect(self, output): + self.current += 1 + + routes = self.yolo_head_cls if output == 'cls' else self.yolo_head_reg + + for i, route in enumerate(routes): + routes[i] = self.get_route(route) + self.route(str(routes)[1:-1], axis=-1) + self.yolo(output) + + def net(self): + lb = 'letter_box=1\n' if self.letter_box else '' + + self.fc.write('[net]\n' + + 'width=%d\n' % self.width + + 'height=%d\n' % self.height + + 'channels=3\n' + + lb) + + def convolutional(self, cv, act='linear', detect=False): + self.blocks[self.current] += 1 + + self.get_state_dict(cv.state_dict()) + + if cv.__class__.__name__ == 'Conv2D': + filters = cv._out_channels + size = cv._kernel_size + stride = cv._stride + pad = cv._padding + groups = cv._groups + bias = cv.bias + bn = False + else: + filters = cv.conv._out_channels + size = cv.conv._kernel_size + stride = cv.conv._stride + pad = cv.conv._padding + groups = cv.conv._groups + bias = cv.conv.bias + bn = True if hasattr(cv, 'bn') else False + + if detect: + act = 'logistic' + + b = 
'batch_normalize=1\n' if bn is True else '' + g = 'groups=%d\n' % groups if groups > 1 else '' + w = 'bias=0\n' if bias is None and bn is False else '' + + self.fc.write('\n[convolutional]\n' + + b + + 'filters=%d\n' % filters + + 'size=%s\n' % self.get_value(size) + + 'stride=%s\n' % self.get_value(stride) + + 'pad=%s\n' % self.get_value(pad) + + g + + w + + 'activation=%s\n' % act) + + def route(self, layers, axis=0): + self.blocks[self.current] += 1 + + a = 'axis=%d\n' % axis if axis != 0 else '' + + self.fc.write('\n[route]\n' + + 'layers=%s\n' % layers + + a) + + def shortcut(self, r, ew='add', act='linear'): + self.blocks[self.current] += 1 + + m = 'mode=mul\n' if ew == 'mul' else '' + + self.fc.write('\n[shortcut]\n' + + 'from=%d\n' % r + + m + + 'activation=%s\n' % act) + + def reduce(self, dim, mode='mean', keepdim=False): + self.blocks[self.current] += 1 + + self.fc.write('\n[reduce]\n' + + 'mode=%s\n' % mode + + 'axes=%s\n' % str(dim)[1:-1] + + 'keep=%d\n' % keepdim) + + def maxpool(self, m): + self.blocks[self.current] += 1 + + stride = m.stride + size = m.ksize + mode = m.ceil_mode + + m = 'maxpool_up' if mode else 'maxpool' + + self.fc.write('\n[%s]\n' % m + + 'stride=%d\n' % stride + + 'size=%d\n' % size) + + def upsample(self): + self.blocks[self.current] += 1 + + stride = 2 + + self.fc.write('\n[upsample]\n' + + 'stride=%d\n' % stride) + + def avgpool(self): + self.blocks[self.current] += 1 + + self.fc.write('\n[avgpool]\n') + + def shuffle(self, reshape=None, transpose1=None, transpose2=None, route=None): + self.blocks[self.current] += 1 + + r = 'reshape=%s\n' % str(reshape)[1:-1] if reshape is not None else '' + t1 = 'transpose1=%s\n' % str(transpose1)[1:-1] if transpose1 is not None else '' + t2 = 'transpose2=%s\n' % str(transpose2)[1:-1] if transpose2 is not None else '' + f = 'from=%d\n' % route if route is not None else '' + + self.fc.write('\n[shuffle]\n' + + r + + t1 + + t2 + + f) + + def softmax(self, axes): + self.blocks[self.current] += 1 + + self.fc.write('\n[softmax]\n' + + 'axes=%d\n' % axes) + + def yolo(self, output): + self.blocks[self.current] += 1 + + self.fc.write('\n[%s]\n' % output) + + def get_state_dict(self, state_dict): + for k, v in state_dict.items(): + vr = v.reshape([-1]).numpy() + self.fw.write('{} {} '.format(k, len(vr))) + for vv in vr: + self.fw.write(' ') + self.fw.write(struct.pack('>f', float(vv)).hex()) + self.fw.write('\n') + self.wc += 1 + + def get_anchors(self, anchor_points, stride_tensor): + vr = anchor_points.numpy() + self.fw.write('{} {} '.format('anchor_points', len(vr))) + for vv in vr: + self.fw.write(' ') + self.fw.write(struct.pack('>f', float(vv)).hex()) + self.fw.write('\n') + self.wc += 1 + vr = stride_tensor.numpy() + self.fw.write('{} {} '.format('stride_tensor', len(vr))) + for vv in vr: + self.fw.write(' ') + self.fw.write(struct.pack('>f', float(vv)).hex()) + self.fw.write('\n') + self.wc += 1 + + def get_value(self, key): + if type(key) == int: + return key + return key[0] if key[0] == key[1] else str(key)[1:-1] + + def get_route(self, n): + r = 0 + for i, b in enumerate(self.blocks): + if i <= n: + r += b + else: + break + return r - 1 + + +def export_model(): + paddle.set_device('cpu') + + FLAGS = parse_args() + + cfg = load_config(FLAGS.config) + + FLAGS.opt['weights'] = FLAGS.weights + FLAGS.opt['exclude_nms'] = True + + if 'norm_type' in cfg and cfg['norm_type'] == 'sync_bn': + FLAGS.opt['norm_type'] = 'bn' + merge_config(FLAGS.opt) + + if FLAGS.slim_config: + cfg = build_slim_model(cfg, FLAGS.slim_config, 
mode='test') + + merge_config(FLAGS.opt) + check_config(cfg) + check_gpu(cfg.use_gpu) + check_version() + + trainer = Trainer(cfg, mode='test') + trainer.load_weights(cfg.weights) + + trainer.model.eval() + + if not os.path.exists('.tmp'): + os.makedirs('.tmp') + + static_model, _ = trainer._get_infer_cfg_and_input_spec('.tmp') + + os.system('rm -r .tmp') + + return cfg, static_model + + +def parse_args(): + parser = ArgsParser() + parser.add_argument('-w', '--weights', required=True, type=str, help='Input weights (.pdparams) file path (required)') + parser.add_argument('--slim_config', default=None, type=str, help='Slim configuration file of slim method') + args = parser.parse_args() + return args + + +cfg, model = export_model() + +model_name = cfg.filename +inference_size = (cfg.eval_height, cfg.eval_width) +letter_box = False + +for sample_transforms in cfg['EvalReader']['sample_transforms']: + if 'Resize' in sample_transforms: + letter_box = sample_transforms['Resize']['keep_ratio'] + +backbone = cfg[cfg.architecture]['backbone'] +neck = cfg[cfg.architecture]['neck'] +yolo_head = cfg[cfg.architecture]['yolo_head'] + +wts_file = model_name + '.wts' if 'ppyoloe' in model_name else 'ppyoloe_' + model_name + '.wts' +cfg_file = model_name + '.cfg' if 'ppyoloe' in model_name else 'ppyoloe_' + model_name + '.cfg' + +with open(wts_file, 'w') as fw, open(cfg_file, 'w') as fc: + layers = Layers(inference_size, fw, fc, letter_box) + + if backbone == 'CSPResNet': + layers.fc.write('\n# CSPResNet\n') + + for child in model.backbone.stem: + layers.ConvBNLayer(child) + for i, child in enumerate(model.backbone.stages): + ret = True if i in model.backbone.return_idx else False + layers.CSPResStage(child, ret) + else: + raise SystemExit('Model not supported') + + if neck == 'CustomCSPPAN': + layers.fc.write('\n# CustomCSPPAN\n') + + blocks = layers.backbone_outs[::-1] + for i, block in enumerate(blocks): + if i > 0: + layers.Concat(block) + layers.CSPStage(model.neck.fpn_stages[i][0], 'fpn') + if i < model.neck.num_blocks - 1: + layers.ConvBNLayer(model.neck.fpn_routes[i]) + layers.Upsample() + layers.neck_pan_feats = [layers.neck_fpn_feats[-1], ] + for i in reversed(range(model.neck.num_blocks - 1)): + layers.ConvBNLayer(model.neck.pan_routes[i]) + layers.Concat(layers.neck_fpn_feats[i]) + layers.CSPStage(model.neck.pan_stages[i][0], 'pan') + layers.neck_pan_feats = layers.neck_pan_feats[::-1] + else: + raise SystemExit('Model not supported') + + if yolo_head == 'PPYOLOEHead': + layers.fc.write('\n# PPYOLOEHead\n') + + for i, feat in enumerate(layers.neck_pan_feats): + if i > 0: + layers.AvgPool2d(route=feat) + else: + layers.AvgPool2d() + layers.ESEAttn(model.yolo_head.stem_cls[i]) + layers.Conv2D(model.yolo_head.pred_cls[i], act='sigmoid') + layers.Shuffle(reshape=[model.yolo_head.num_classes, 0], route=feat, output='cls') + layers.ESEAttn(model.yolo_head.stem_reg[i], route=-7) + layers.Conv2D(model.yolo_head.pred_reg[i]) + layers.Shuffle(reshape=[4, model.yolo_head.reg_max + 1, 0], transpose2=[1, 0, 2], route=feat) + layers.SoftMax(0) + layers.Conv2D(model.yolo_head.proj_conv) + layers.Shuffle(reshape=[4, 0], route=feat, output='reg') + layers.Detect('cls') + layers.Detect('reg') + layers.get_anchors(model.yolo_head.anchor_points.reshape([-1]), model.yolo_head.stride_tensor) + + else: + raise SystemExit('Model not supported') + +os.system('echo "%d" | cat - %s > temp && mv temp %s' % (layers.wc, wts_file, wts_file)) diff --git a/utils/gen_wts_yolor.py b/utils/gen_wts_yolor.py index cfc6043..ffaf427 
100644 --- a/utils/gen_wts_yolor.py +++ b/utils/gen_wts_yolor.py @@ -9,11 +9,11 @@ from models.models import Darknet def parse_args(): parser = argparse.ArgumentParser(description='PyTorch YOLOR conversion (main branch)') parser.add_argument('-w', '--weights', required=True, help='Input weights (.pt) file path (required)') - parser.add_argument('-c', '--cfg', help='Input cfg (.cfg) file path') + parser.add_argument('-c', '--cfg', default='', help='Input cfg (.cfg) file path') args = parser.parse_args() if not os.path.isfile(args.weights): raise SystemExit('Invalid weights file') - if not os.path.isfile(args.cfg): + if args.cfg != '' and not os.path.isfile(args.cfg): raise SystemExit('Invalid cfg file') return args.weights, args.cfg
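For reference, `gen_wts_ppyoloe.py` writes each tensor as a line of `name count hex hex ...`, where every value is hex-encoded big-endian IEEE-754 (`struct.pack('>f', v).hex()`), and the final `echo | cat` command prepends the total tensor count. A hedged sketch of decoding one hex word on the C++ side (the helper name is illustrative):

```
#include <cstdint>
#include <cstring>
#include <string>

// Sketch: decode one 8-character hex word of the wts format back into a float.
// std::stoul parses the big-endian hex into the IEEE-754 bit pattern, which is
// then reinterpreted as a float (endianness-agnostic on the host side).
static float wtsHexToFloat(const std::string& hex8)
{
  uint32_t bits = static_cast<uint32_t>(std::stoul(hex8, nullptr, 16));
  float f;
  std::memcpy(&f, &bits, sizeof(f));
  return f;
}
// e.g. wtsHexToFloat("3f800000") == 1.0f
```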