diff --git a/native/nvdsinfer_custom_impl_Yolo/yolo.cpp b/native/nvdsinfer_custom_impl_Yolo/yolo.cpp index e5a17c5..d3596c6 100644 --- a/native/nvdsinfer_custom_impl_Yolo/yolo.cpp +++ b/native/nvdsinfer_custom_impl_Yolo/yolo.cpp @@ -237,7 +237,7 @@ NvDsInferStatus Yolo::buildYoloNetwork( = new YoloLayer(m_OutputTensors.at(outputTensorCount).numBBoxes, m_OutputTensors.at(outputTensorCount).numClasses, m_OutputTensors.at(outputTensorCount).gridSize, - 'y', new_coords, scale_x_y, beta_nms, + 1, new_coords, scale_x_y, beta_nms, curYoloTensor.anchors, m_OutputMasks); assert(yoloPlugin != nullptr); @@ -274,7 +274,7 @@ NvDsInferStatus Yolo::buildYoloNetwork( = new YoloLayer(curRegionTensor.numBBoxes, curRegionTensor.numClasses, curRegionTensor.gridSize, - 'r', 0, 1.0, 0, + 0, 0, 1.0, 0, curRegionTensor.anchors, mask); assert(regionPlugin != nullptr); diff --git a/native/nvdsinfer_custom_impl_Yolo/yoloForward.cu b/native/nvdsinfer_custom_impl_Yolo/yoloForward.cu index c7dcc97..a9310a5 100644 --- a/native/nvdsinfer_custom_impl_Yolo/yoloForward.cu +++ b/native/nvdsinfer_custom_impl_Yolo/yoloForward.cu @@ -21,7 +21,7 @@ inline __device__ float sigmoidGPU(const float& x) { return 1.0f / (1.0f + __expf(-x)); } __global__ void gpuYoloLayer(const float* input, float* output, const uint gridSize, const uint numOutputClasses, - const uint numBBoxes, const uint new_coords, const float scale_x_y, char type) + const uint numBBoxes, const uint new_coords, const float scale_x_y) { uint x_id = blockIdx.x * blockDim.x + threadIdx.x; uint y_id = blockIdx.y * blockDim.y + threadIdx.y; @@ -38,53 +38,29 @@ __global__ void gpuYoloLayer(const float* input, float* output, const uint gridS float alpha = scale_x_y; float beta = -0.5 * (scale_x_y - 1); - if (type == 'y') { - if (new_coords == 1) { - output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 0)] - = input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 0)] * alpha + beta; + if (new_coords == 1) { + output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 0)] + = input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 0)] * alpha + beta; - output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 1)] - = input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 1)] * alpha + beta; + output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 1)] + = input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 1)] * alpha + beta; - output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 2)] - = pow(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 2)] * 2, 2); + output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 2)] + = pow(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 2)] * 2, 2); - output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 3)] - = pow(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 3)] * 2, 2); + output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 3)] + = pow(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 3)] * 2, 2); - output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)] - = input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)]; + output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)] + = input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)]; - for (uint i = 0; i < numOutputClasses; ++i) - { - output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + (5 + i))] - = input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + (5 + i))]; - } - } - else { - output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 0)] - = sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 0)]) * alpha + beta; - - output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 1)] - = sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 1)]) * alpha + beta; - - output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 2)] - = __expf(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 2)]); - - output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 3)] - = __expf(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 3)]); - - output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)] - = sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)]); - - for (uint i = 0; i < numOutputClasses; ++i) - { - output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + (5 + i))] - = sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + (5 + i))]); - } + for (uint i = 0; i < numOutputClasses; ++i) + { + output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + (5 + i))] + = input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + (5 + i))]; } } - else if (type == 'r') { + else { output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 0)] = sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 0)]) * alpha + beta; @@ -100,43 +76,91 @@ __global__ void gpuYoloLayer(const float* input, float* output, const uint gridS output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)] = sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)]); - float temp = 1.0; - int i; - float sum = 0; - float largest = -INFINITY; - for(i = 0; i < numOutputClasses; ++i){ - int val = input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + (5 + i))]; - largest = (val>largest) ? val : largest; - } - for(i = 0; i < numOutputClasses; ++i){ - float e = exp(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + (5 + i))] / temp - largest / temp); - sum += e; - output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + (5 + i))] = e; - } - for(i = 0; i < numOutputClasses; ++i){ - output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + (5 + i))] /= sum; + for (uint i = 0; i < numOutputClasses; ++i) + { + output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + (5 + i))] + = sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + (5 + i))]); } } } +__global__ void gpuRegionLayer(const float* input, float* output, const uint gridSize, const uint numOutputClasses, + const uint numBBoxes) +{ + uint x_id = blockIdx.x * blockDim.x + threadIdx.x; + uint y_id = blockIdx.y * blockDim.y + threadIdx.y; + uint z_id = blockIdx.z * blockDim.z + threadIdx.z; + + if ((x_id >= gridSize) || (y_id >= gridSize) || (z_id >= numBBoxes)) + { + return; + } + + const int numGridCells = gridSize * gridSize; + const int bbindex = y_id * gridSize + x_id; + + output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 0)] + = sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 0)]); + + output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 1)] + = sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 1)]); + + output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 2)] + = __expf(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 2)]); + + output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 3)] + = __expf(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 3)]); + + output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)] + = sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)]); + + float temp = 1.0; + int i; + float sum = 0; + float largest = -INFINITY; + for(i = 0; i < numOutputClasses; ++i){ + int val = input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + (5 + i))]; + largest = (val>largest) ? val : largest; + } + for(i = 0; i < numOutputClasses; ++i){ + float e = exp(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + (5 + i))] / temp - largest / temp); + sum += e; + output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + (5 + i))] = e; + } + for(i = 0; i < numOutputClasses; ++i){ + output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + (5 + i))] /= sum; + } +} + cudaError_t cudaYoloLayer(const void* input, void* output, const uint& batchSize, const uint& gridSize, const uint& numOutputClasses, const uint& numBBoxes, - uint64_t outputSize, cudaStream_t stream, const uint new_coords, const float scale_x_y, char type); + uint64_t outputSize, cudaStream_t stream, const uint modelCoords, const float modelScale, const uint modelType); cudaError_t cudaYoloLayer(const void* input, void* output, const uint& batchSize, const uint& gridSize, const uint& numOutputClasses, const uint& numBBoxes, - uint64_t outputSize, cudaStream_t stream, const uint new_coords, const float scale_x_y, char type) + uint64_t outputSize, cudaStream_t stream, const uint modelCoords, const float modelScale, const uint modelType) { dim3 threads_per_block(16, 16, 4); dim3 number_of_blocks((gridSize / threads_per_block.x) + 1, (gridSize / threads_per_block.y) + 1, (numBBoxes / threads_per_block.z) + 1); - for (unsigned int batch = 0; batch < batchSize; ++batch) - { - gpuYoloLayer<<>>( - reinterpret_cast(input) + (batch * outputSize), - reinterpret_cast(output) + (batch * outputSize), gridSize, numOutputClasses, - numBBoxes, new_coords, scale_x_y, type); + if (modelType == 1) { + for (unsigned int batch = 0; batch < batchSize; ++batch) + { + gpuYoloLayer<<>>( + reinterpret_cast(input) + (batch * outputSize), + reinterpret_cast(output) + (batch * outputSize), gridSize, numOutputClasses, + numBBoxes, modelCoords, modelScale); + } + } + else if (modelType == 0) { + for (unsigned int batch = 0; batch < batchSize; ++batch) + { + gpuRegionLayer<<>>( + reinterpret_cast(input) + (batch * outputSize), + reinterpret_cast(output) + (batch * outputSize), gridSize, numOutputClasses, + numBBoxes); + } } return cudaGetLastError(); } diff --git a/native/nvdsinfer_custom_impl_Yolo/yoloPlugins.cpp b/native/nvdsinfer_custom_impl_Yolo/yoloPlugins.cpp index df3edbb..c7184e0 100644 --- a/native/nvdsinfer_custom_impl_Yolo/yoloPlugins.cpp +++ b/native/nvdsinfer_custom_impl_Yolo/yoloPlugins.cpp @@ -53,7 +53,7 @@ void read(const char*& buffer, T& val) cudaError_t cudaYoloLayer ( const void* input, void* output, const uint& batchSize, const uint& gridSize, const uint& numOutputClasses, - const uint& numBBoxes, uint64_t outputSize, cudaStream_t stream, const uint new_coords, const float scale_x_y, char type); + const uint& numBBoxes, uint64_t outputSize, cudaStream_t stream, const uint modelCoords, const float modelScale, const uint modelType); YoloLayer::YoloLayer (const void* data, size_t length) { @@ -63,7 +63,7 @@ YoloLayer::YoloLayer (const void* data, size_t length) read(d, m_GridSize); read(d, m_OutputSize); - read(d, m_Type); + read(d, m_type); read(d, m_new_coords); read(d, m_scale_x_y); read(d, m_beta_nms); @@ -94,11 +94,11 @@ YoloLayer::YoloLayer (const void* data, size_t length) }; YoloLayer::YoloLayer ( - const uint& numBoxes, const uint& numClasses, const uint& gridSize, char type, int new_coords, float scale_x_y, float beta_nms, std::vector anchors, std::vector> mask) : + const uint& numBoxes, const uint& numClasses, const uint& gridSize, const uint model_type, const uint new_coords, const float scale_x_y, const float beta_nms, const std::vector anchors, std::vector> mask) : m_NumBoxes(numBoxes), m_NumClasses(numClasses), m_GridSize(gridSize), - m_Type(type), + m_type(model_type), m_new_coords(new_coords), m_scale_x_y(scale_x_y), m_beta_nms(beta_nms), @@ -143,7 +143,7 @@ int YoloLayer::enqueue( { CHECK(cudaYoloLayer( inputs[0], outputs[0], batchSize, m_GridSize, m_NumClasses, m_NumBoxes, - m_OutputSize, stream, m_new_coords, m_scale_x_y, m_Type)); + m_OutputSize, stream, m_new_coords, m_scale_x_y, m_type)); return 0; } @@ -161,7 +161,7 @@ size_t YoloLayer::getSerializationSize() const } } - return sizeof(m_NumBoxes) + sizeof(m_NumClasses) + sizeof(m_GridSize) + sizeof(m_OutputSize) + sizeof(m_Type) + return sizeof(m_NumBoxes) + sizeof(m_NumClasses) + sizeof(m_GridSize) + sizeof(m_OutputSize) + sizeof(m_type) + sizeof(m_new_coords) + sizeof(m_scale_x_y) + sizeof(m_beta_nms) + anchorsSum * sizeof(float) + maskSum * sizeof(int); } @@ -173,7 +173,7 @@ void YoloLayer::serialize(void* buffer) const write(d, m_GridSize); write(d, m_OutputSize); - write(d, m_Type); + write(d, m_type); write(d, m_new_coords); write(d, m_scale_x_y); write(d, m_beta_nms); @@ -199,7 +199,7 @@ void YoloLayer::serialize(void* buffer) const nvinfer1::IPluginV2* YoloLayer::clone() const { - return new YoloLayer (m_NumBoxes, m_NumClasses, m_GridSize, m_Type, m_new_coords, m_scale_x_y, m_beta_nms, m_Anchors, m_Mask); + return new YoloLayer (m_NumBoxes, m_NumClasses, m_GridSize, m_type, m_new_coords, m_scale_x_y, m_beta_nms, m_Anchors, m_Mask); } REGISTER_TENSORRT_PLUGIN(YoloLayerPluginCreator); \ No newline at end of file diff --git a/native/nvdsinfer_custom_impl_Yolo/yoloPlugins.h b/native/nvdsinfer_custom_impl_Yolo/yoloPlugins.h index 5af1679..ebf5661 100644 --- a/native/nvdsinfer_custom_impl_Yolo/yoloPlugins.h +++ b/native/nvdsinfer_custom_impl_Yolo/yoloPlugins.h @@ -57,8 +57,8 @@ class YoloLayer : public nvinfer1::IPluginV2 public: YoloLayer (const void* data, size_t length); YoloLayer (const uint& numBoxes, const uint& numClasses, const uint& gridSize, - char type, int new_coords, float scale_x_y, float beta_nms, - std::vector anchors, std::vector> mask); + const uint model_type, const uint new_coords, const float scale_x_y, const float beta_nms, + const std::vector anchors, const std::vector> mask); const char* getPluginType () const override { return YOLOLAYER_PLUGIN_NAME; } const char* getPluginVersion () const override { return YOLOLAYER_PLUGIN_VERSION; } int getNbOutputs () const override { return 1; } @@ -100,7 +100,7 @@ private: uint64_t m_OutputSize {0}; std::string m_Namespace {""}; - char m_Type; + uint m_type {0}; uint m_new_coords {0}; float m_scale_x_y {0}; float m_beta_nms {0};