Update Benchmarks + Add YOLOv7-u6 + Fixes
@@ -73,22 +73,22 @@ addBBoxProposal(const float bx1, const float by1, const float bx2, const float b
 }

 static std::vector<NvDsInferParseObjectInfo>
-decodeTensorYolo(const float* detection, const uint& outputSize, const uint& count, const uint& netW, const uint& netH,
+decodeTensorYolo(const float* detection, const uint& outputSize, const uint& netW, const uint& netH,
     const std::vector<float>& preclusterThreshold)
 {
   std::vector<NvDsInferParseObjectInfo> binfo;

   for (uint b = 0; b < outputSize; ++b) {
-    float maxProb = count == 6 ? detection[b * count + 4] : detection[b * count + 4] * detection[b * count + 6];
-    int maxIndex = (int) detection[b * count + 5];
+    float maxProb = detection[b * 6 + 4];
+    int maxIndex = (int) detection[b * 6 + 5];

     if (maxProb < preclusterThreshold[maxIndex])
       continue;

-    float bxc = detection[b * count + 0];
-    float byc = detection[b * count + 1];
-    float bw = detection[b * count + 2];
-    float bh = detection[b * count + 3];
+    float bxc = detection[b * 6 + 0];
+    float byc = detection[b * 6 + 1];
+    float bw = detection[b * 6 + 2];
+    float bh = detection[b * 6 + 3];

     float bx1 = bxc - bw / 2;
     float by1 = byc - bh / 2;
@@ -102,22 +102,22 @@ decodeTensorYolo(const float* detection, const uint& outputSize, const uint& cou
 }

 static std::vector<NvDsInferParseObjectInfo>
-decodeTensorYoloE(const float* detection, const uint& outputSize, const uint& count, const uint& netW, const uint& netH,
+decodeTensorYoloE(const float* detection, const uint& outputSize, const uint& netW, const uint& netH,
     const std::vector<float>& preclusterThreshold)
 {
   std::vector<NvDsInferParseObjectInfo> binfo;

   for (uint b = 0; b < outputSize; ++b) {
-    float maxProb = count == 6 ? detection[b * count + 4] : detection[b * count + 4] * detection[b * count + 6];
-    int maxIndex = (int) detection[b * count + 5];
+    float maxProb = detection[b * 6 + 4];
+    int maxIndex = (int) detection[b * 6 + 5];

     if (maxProb < preclusterThreshold[maxIndex])
       continue;

-    float bx1 = detection[b * count + 0];
-    float by1 = detection[b * count + 1];
-    float bx2 = detection[b * count + 2];
-    float by2 = detection[b * count + 3];
+    float bx1 = detection[b * 6 + 0];
+    float by1 = detection[b * 6 + 1];
+    float bx2 = detection[b * 6 + 2];
+    float by2 = detection[b * 6 + 3];

     addBBoxProposal(bx1, by1, bx2, by2, netW, netH, maxIndex, maxProb, binfo);
   }
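After this change each detection row in the plugin output carries six floats, [xc, yc, w, h, confidence, classId], with objectness already folded into the confidence, so the parsers no longer need the per-row count dimension. A minimal host-side sketch of reading that layout, following the center-format variant used by decodeTensorYolo (helper names here are illustrative, not part of the repo):

#include <vector>

struct Box { float x1, y1, x2, y2, conf; int cls; };

// Decode one output tensor of `rows` detections, each 6 floats wide.
static std::vector<Box> decodeRows(const float* det, unsigned rows, float thresh)
{
  std::vector<Box> out;
  for (unsigned b = 0; b < rows; ++b) {
    const float* r = det + b * 6;   // fixed 6-float stride per detection
    float conf = r[4];              // objectness is already multiplied in by the kernel
    if (conf < thresh)
      continue;
    float xc = r[0], yc = r[1], w = r[2], h = r[3];
    out.push_back({xc - w / 2, yc - h / 2, xc + w / 2, yc + h / 2, conf, (int) r[5]});
  }
  return out;
}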
@@ -139,9 +139,8 @@ NvDsInferParseCustomYolo(std::vector<NvDsInferLayerInfo> const& outputLayersInfo
   const NvDsInferLayerInfo& layer = outputLayersInfo[0];

   const uint outputSize = layer.inferDims.d[0];
-  const uint count = layer.inferDims.d[1];

-  std::vector<NvDsInferParseObjectInfo> outObjs = decodeTensorYolo((const float*) (layer.buffer), outputSize, count,
+  std::vector<NvDsInferParseObjectInfo> outObjs = decodeTensorYolo((const float*) (layer.buffer), outputSize,
       networkInfo.width, networkInfo.height, detectionParams.perClassPreclusterThreshold);

   objects.insert(objects.end(), outObjs.begin(), outObjs.end());
@@ -165,9 +164,8 @@ NvDsInferParseCustomYoloE(std::vector<NvDsInferLayerInfo> const& outputLayersInf
   const NvDsInferLayerInfo& layer = outputLayersInfo[0];

   const uint outputSize = layer.inferDims.d[0];
-  const uint count = layer.inferDims.d[1];

-  std::vector<NvDsInferParseObjectInfo> outObjs = decodeTensorYoloE((const float*) (layer.buffer), outputSize, count,
+  std::vector<NvDsInferParseObjectInfo> outObjs = decodeTensorYoloE((const float*) (layer.buffer), outputSize,
       networkInfo.width, networkInfo.height, detectionParams.perClassPreclusterThreshold);

   objects.insert(objects.end(), outObjs.begin(), outObjs.end());
@@ -50,13 +50,12 @@ __global__ void gpuYoloLayer(const float* input, float* output, int* count, cons

   int _count = (int)atomicAdd(count, 1);

-  output[_count * 7 + 0] = xc;
-  output[_count * 7 + 1] = yc;
-  output[_count * 7 + 2] = w;
-  output[_count * 7 + 3] = h;
-  output[_count * 7 + 4] = maxProb;
-  output[_count * 7 + 5] = maxIndex;
-  output[_count * 7 + 6] = objectness;
+  output[_count * 6 + 0] = xc;
+  output[_count * 6 + 1] = yc;
+  output[_count * 6 + 2] = w;
+  output[_count * 6 + 3] = h;
+  output[_count * 6 + 4] = maxProb * objectness;
+  output[_count * 6 + 5] = maxIndex;
 }

 cudaError_t cudaYoloLayer(const void* input, void* output, void* count, const uint& batchSize, uint64_t& inputSize,
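The same write pattern repeats in gpuYoloLayer_nc and gpuRegionLayer below. A compact sketch of that pattern (an assumed helper, not code from the repo): a thread that survives the threshold reserves one row with atomicAdd and stores six floats, fusing objectness into the confidence.

// Assumed device helper illustrating the shared 6-float write pattern.
static __device__ void storeDetection(float* output, int* count, float xc, float yc, float w, float h,
    float maxProb, int maxIndex, float objectness)
{
  int row = (int) atomicAdd(count, 1);         // compact detections into consecutive rows
  output[row * 6 + 0] = xc;
  output[row * 6 + 1] = yc;
  output[row * 6 + 2] = w;
  output[row * 6 + 3] = h;
  output[row * 6 + 4] = maxProb * objectness;  // single fused confidence value
  output[row * 6 + 5] = maxIndex;
}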
@@ -76,7 +75,7 @@ cudaError_t cudaYoloLayer(const void* input, void* output, void* count, const ui
   for (unsigned int batch = 0; batch < batchSize; ++batch) {
     gpuYoloLayer<<<number_of_blocks, threads_per_block, 0, stream>>>(
         reinterpret_cast<const float*> (input) + (batch * inputSize),
-        reinterpret_cast<float*> (output) + (batch * 7 * outputSize),
+        reinterpret_cast<float*> (output) + (batch * 6 * outputSize),
         reinterpret_cast<int*> (count) + (batch),
         netWidth, netHeight, gridSizeX, gridSizeY, numOutputClasses, numBBoxes, scaleXY,
         reinterpret_cast<const float*> (anchors), reinterpret_cast<const int*> (mask));
@@ -47,13 +47,12 @@ __global__ void gpuYoloLayer_nc(const float* input, float* output, int* count, c

   int _count = (int)atomicAdd(count, 1);

-  output[_count * 7 + 0] = xc;
-  output[_count * 7 + 1] = yc;
-  output[_count * 7 + 2] = w;
-  output[_count * 7 + 3] = h;
-  output[_count * 7 + 4] = maxProb;
-  output[_count * 7 + 5] = maxIndex;
-  output[_count * 7 + 6] = objectness;
+  output[_count * 6 + 0] = xc;
+  output[_count * 6 + 1] = yc;
+  output[_count * 6 + 2] = w;
+  output[_count * 6 + 3] = h;
+  output[_count * 6 + 4] = maxProb * objectness;
+  output[_count * 6 + 5] = maxIndex;
 }

 cudaError_t cudaYoloLayer_nc(const void* input, void* output, void* count, const uint& batchSize, uint64_t& inputSize,
@@ -73,7 +72,7 @@ cudaError_t cudaYoloLayer_nc(const void* input, void* output, void* count, const
   for (unsigned int batch = 0; batch < batchSize; ++batch) {
     gpuYoloLayer_nc<<<number_of_blocks, threads_per_block, 0, stream>>>(
         reinterpret_cast<const float*> (input) + (batch * inputSize),
-        reinterpret_cast<float*> (output) + (batch * 7 * outputSize),
+        reinterpret_cast<float*> (output) + (batch * 6 * outputSize),
         reinterpret_cast<int*> (count) + (batch),
         netWidth, netHeight, gridSizeX, gridSizeY, numOutputClasses, numBBoxes, scaleXY,
         reinterpret_cast<const float*> (anchors), reinterpret_cast<const int*> (mask));
@@ -68,13 +68,12 @@ __global__ void gpuRegionLayer(const float* input, float* softmax, float* output

   int _count = (int)atomicAdd(count, 1);

-  output[_count * 7 + 0] = xc;
-  output[_count * 7 + 1] = yc;
-  output[_count * 7 + 2] = w;
-  output[_count * 7 + 3] = h;
-  output[_count * 7 + 4] = maxProb;
-  output[_count * 7 + 5] = maxIndex;
-  output[_count * 7 + 6] = objectness;
+  output[_count * 6 + 0] = xc;
+  output[_count * 6 + 1] = yc;
+  output[_count * 6 + 2] = w;
+  output[_count * 6 + 3] = h;
+  output[_count * 6 + 4] = maxProb * objectness;
+  output[_count * 6 + 5] = maxIndex;
 }

 cudaError_t cudaRegionLayer(const void* input, void* softmax, void* output, void* count, const uint& batchSize,
@@ -93,7 +92,7 @@ cudaError_t cudaRegionLayer(const void* input, void* softmax, void* output, void
     gpuRegionLayer<<<number_of_blocks, threads_per_block, 0, stream>>>(
         reinterpret_cast<const float*> (input) + (batch * inputSize),
         reinterpret_cast<float*> (softmax) + (batch * inputSize),
-        reinterpret_cast<float*> (output) + (batch * 7 * outputSize),
+        reinterpret_cast<float*> (output) + (batch * 6 * outputSize),
         reinterpret_cast<int*> (count) + (batch),
         netWidth, netHeight, gridSizeX, gridSizeY, numOutputClasses, numBBoxes,
         reinterpret_cast<const float*> (anchors));
@@ -103,7 +103,7 @@ nvinfer1::Dims
 YoloLayer::getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) noexcept
 {
   assert(index == 0);
-  return nvinfer1::Dims{2, {static_cast<int>(m_OutputSize), 7}};
+  return nvinfer1::Dims{2, {static_cast<int>(m_OutputSize), 6}};
 }

 bool
@@ -125,7 +125,7 @@ YoloLayer::enqueue(int batchSize, void const* const* inputs, void* const* output
     noexcept
 {
   void* output = outputs[0];
-  CUDA_CHECK(cudaMemsetAsync((float*) output, 0, sizeof(float) * m_OutputSize * 7 * batchSize, stream));
+  CUDA_CHECK(cudaMemsetAsync((float*) output, 0, sizeof(float) * m_OutputSize * 6 * batchSize, stream));

   void* count = workspace;
   CUDA_CHECK(cudaMemsetAsync((int*) count, 0, sizeof(int) * batchSize, stream));
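A small consistency sketch, assuming the standard TensorRT flow where the binding size follows getOutputDimensions(): the {m_OutputSize, 6} shape reported above implies the same number of floats that enqueue() now zeroes per batch element (the helper below is illustrative, not part of the repo).

#include <cstddef>

// Illustrative only: bytes of the YOLO output binding under the new 6-float layout.
static std::size_t yoloBindingBytes(std::size_t outputSize, std::size_t batchSize)
{
  return sizeof(float) * outputSize * 6 * batchSize;  // matches the cudaMemsetAsync above
}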