diff --git a/README.md b/README.md
index 536117d..39ef41a 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,7 @@
 NVIDIA DeepStream SDK 6.2 / 6.1.1 / 6.1 / 6.0.1 / 6.0 / 5.1 configuration for YOLO models
 --------------------------------------------------------------------------------------------------
-### Important: please generate the ONNX model and the TensorRT engine again with the updated files
+### Important: please export the ONNX model with the new export file, generate the TensorRT engine again with the updated files, and use the new config_infer_primary file according to your model
 --------------------------------------------------------------------------------------------------

 ### Future updates
@@ -19,11 +19,14 @@ NVIDIA DeepStream SDK 6.2 / 6.1.1 / 6.1 / 6.0.1 / 6.0 / 5.1 configuration for Y
 * Support for INT8 calibration
 * Support for non square models
 * Models benchmarks
-* **Support for Darknet YOLO models (YOLOv4, etc) using cfg and weights conversion with GPU post-processing**
-* **Support for YOLO-NAS, PPYOLOE+, PPYOLOE, DAMO-YOLO, YOLOX, YOLOR, YOLOv8, YOLOv7, YOLOv6 and YOLOv5 using ONNX conversion with GPU post-processing**
-* **GPU bbox parser (it is slightly slower than CPU bbox parser on V100 GPU tests)**
-* **Dynamic batch-size for ONNX exported models (YOLO-NAS, PPYOLOE+, PPYOLOE, DAMO-YOLO, YOLOX, YOLOR, YOLOv8, YOLOv7, YOLOv6 and YOLOv5)**
+* Support for Darknet models (YOLOv4, etc.) using cfg and weights conversion with GPU post-processing
+* Support for YOLO-NAS, PPYOLOE+, PPYOLOE, DAMO-YOLO, YOLOX, YOLOR, YOLOv8, YOLOv7, YOLOv6 and YOLOv5 using ONNX conversion with GPU post-processing
+* GPU bbox parser (it is slightly slower than CPU bbox parser on V100 GPU tests)
 * **Support for DeepStream 5.1**
+* **Custom ONNX model parser (`NvDsInferYoloCudaEngineGet`)**
+* **Dynamic batch-size for Darknet and ONNX exported models**
+* **INT8 calibration (PTQ) for Darknet and ONNX exported models**
+* **New output structure (fixes wrong output on DeepStream < 6.2) - you need to export the ONNX model with the new export file, generate the TensorRT engine again with the updated files, and use the new config_infer_primary file according to your model**

 ##
@@ -31,12 +34,12 @@ NVIDIA DeepStream SDK 6.2 / 6.1.1 / 6.1 / 6.0.1 / 6.0 / 5.1 configuration for Y

 * [Requirements](#requirements)
 * [Supported models](#supported-models)
-* [Benchmarks](#benchmarks)
-* [dGPU installation](#dgpu-installation)
+* [Benchmarks](docs/benchmarks.md)
+* [dGPU installation](docs/dGPUInstalation.md)
 * [Basic usage](#basic-usage)
 * [Docker usage](#docker-usage)
 * [NMS configuration](#nms-configuration)
-* [INT8 calibration](#int8-calibration)
+* [INT8 calibration](docs/INT8Calibration.md)
 * [YOLOv5 usage](docs/YOLOv5.md)
 * [YOLOv6 usage](docs/YOLOv6.md)
 * [YOLOv7 usage](docs/YOLOv7.md)
@@ -137,7 +140,7 @@ NVIDIA DeepStream SDK 6.2 / 6.1.1 / 6.1 / 6.0.1 / 6.0 / 5.1 configuration for Y

 ### Supported models

-* [Darknet YOLO](https://github.com/AlexeyAB/darknet)
+* [Darknet](https://github.com/AlexeyAB/darknet)
 * [MobileNet-YOLO](https://github.com/dog-qiuqiu/MobileNet-Yolo)
 * [YOLO-Fastest](https://github.com/dog-qiuqiu/Yolo-Fastest)
 * [YOLOv5](https://github.com/ultralytics/yolov5)
@@ -152,784 +155,6 @@ NVIDIA DeepStream SDK 6.2 / 6.1.1 / 6.1 / 6.0.1 / 6.0 / 5.1 configuration for Y

 ##

-### Benchmarks
-
-#### Config
-
-```
-board = NVIDIA Tesla V100 16GB (AWS: p3.2xlarge)
-batch-size = 1
-eval = val2017 (COCO)
-sample = 1920x1080 video
-```
-
-**NOTE**: Used maintain-aspect-ratio=1 in config_infer file
for Darknet (with letter_box=1) and PyTorch models. - -#### NMS config - -- Eval - -``` -nms-iou-threshold = 0.6 (Darknet) / 0.65 (YOLOv5, YOLOv6, YOLOv7, YOLOR and YOLOX) / 0.7 (Paddle, YOLO-NAS, DAMO-YOLO, YOLOv8 and YOLOv7-u6) -pre-cluster-threshold = 0.001 -topk = 300 -``` - -- Test - -``` -nms-iou-threshold = 0.45 -pre-cluster-threshold = 0.25 -topk = 300 -``` - -#### Results - -**NOTE**: * = PyTorch. - -**NOTE**: ** = The YOLOv4 is trained with the trainvalno5k set, so the mAP is high on val2017 test. - -**NOTE**: star = DAMO-YOLO model trained with distillation. - -**NOTE**: The V100 GPU decoder max out at 625-635 FPS on DeepStream even using lighter models. - -**NOTE**: The GPU bbox parser is a bit slower than CPU bbox parser on V100 GPU tests. - -| DeepStream | Precision | Resolution | IoU=0.5:0.95 | IoU=0.5 | IoU=0.75 | FPS
(without display) | -|:------------------:|:---------:|:----------:|:------------:|:-------:|:--------:|:--------------------------:| -| YOLO-NAS L | FP16 | 640 | 0.484 | 0.658 | 0.532 | 235.27 | -| YOLO-NAS M | FP16 | 640 | 0.480 | 0.651 | 0.524 | 287.39 | -| YOLO-NAS S | FP16 | 640 | 0.442 | 0.614 | 0.485 | 478.52 | -| PP-YOLOE+_x | FP16 | 640 | 0.528 | 0.705 | 0.579 | 121.17 | -| PP-YOLOE+_l | FP16 | 640 | 0.511 | 0.686 | 0.557 | 191.82 | -| PP-YOLOE+_m | FP16 | 640 | 0.483 | 0.658 | 0.528 | 264.39 | -| PP-YOLOE+_s | FP16 | 640 | 0.424 | 0.594 | 0.464 | 476.13 | -| PP-YOLOE-s (400) | FP16 | 640 | 0.423 | 0.589 | 0.463 | 461.23 | -| DAMO-YOLO-L star | FP16 | 640 | 0.502 | 0.674 | 0.551 | 176.93 | -| DAMO-YOLO-M star | FP16 | 640 | 0.485 | 0.656 | 0.530 | 242.24 | -| DAMO-YOLO-S star | FP16 | 640 | 0.460 | 0.631 | 0.502 | 385.09 | -| DAMO-YOLO-S | FP16 | 640 | 0.445 | 0.611 | 0.486 | 378.68 | -| DAMO-YOLO-T star | FP16 | 640 | 0.419 | 0.586 | 0.455 | 492.24 | -| DAMO-YOLO-Nl | FP16 | 416 | 0.392 | 0.559 | 0.423 | 483.73 | -| DAMO-YOLO-Nm | FP16 | 416 | 0.371 | 0.532 | 0.402 | 555.94 | -| DAMO-YOLO-Ns | FP16 | 416 | 0.312 | 0.460 | 0.335 | 627.67 | -| YOLOX-x | FP16 | 640 | 0.447 | 0.616 | 0.483 | 125.40 | -| YOLOX-l | FP16 | 640 | 0.430 | 0.598 | 0.466 | 193.10 | -| YOLOX-m | FP16 | 640 | 0.397 | 0.566 | 0.431 | 298.61 | -| YOLOX-s | FP16 | 640 | 0.335 | 0.502 | 0.365 | 522.05 | -| YOLOX-s legacy | FP16 | 640 | 0.375 | 0.569 | 0.407 | 518.52 | -| YOLOX-Darknet | FP16 | 640 | 0.414 | 0.595 | 0.453 | 212.88 | -| YOLOX-Tiny | FP16 | 640 | 0.274 | 0.427 | 0.292 | 633.95 | -| YOLOX-Nano | FP16 | 640 | 0.212 | 0.342 | 0.222 | 633.04 | -| YOLOv8x | FP16 | 640 | 0.499 | 0.669 | 0.545 | 130.49 | -| YOLOv8l | FP16 | 640 | 0.491 | 0.660 | 0.535 | 180.75 | -| YOLOv8m | FP16 | 640 | 0.468 | 0.637 | 0.510 | 278.08 | -| YOLOv8s | FP16 | 640 | 0.415 | 0.578 | 0.453 | 493.45 | -| YOLOv8n | FP16 | 640 | 0.343 | 0.492 | 0.373 | 627.43 | -| YOLOv7-u6 | FP16 | 640 | 0.484 | 0.652 | 0.530 | 193.54 | -| YOLOv7x* | FP16 | 640 | 0.496 | 0.679 | 0.536 | 155.07 | -| YOLOv7* | FP16 | 640 | 0.476 | 0.660 | 0.518 | 226.01 | -| YOLOv7-Tiny Leaky* | FP16 | 640 | 0.345 | 0.516 | 0.372 | 626.23 | -| YOLOv7-Tiny Leaky* | FP16 | 416 | 0.328 | 0.493 | 0.349 | 633.90 | -| YOLOv6-L 4.0 | FP16 | 640 | 0.490 | 0.671 | 0.535 | 178.41 | -| YOLOv6-M 4.0 | FP16 | 640 | 0.460 | 0.635 | 0.502 | 293.39 | -| YOLOv6-S 4.0 | FP16 | 640 | 0.416 | 0.585 | 0.453 | 513.90 | -| YOLOv6-N 4.0 | FP16 | 640 | 0.349 | 0.503 | 0.378 | 633.37 | -| YOLOv5x 7.0 | FP16 | 640 | 0.471 | 0.652 | 0.513 | 149.93 | -| YOLOv5l 7.0 | FP16 | 640 | 0.455 | 0.637 | 0.497 | 235.55 | -| YOLOv5m 7.0 | FP16 | 640 | 0.421 | 0.604 | 0.459 | 351.69 | -| YOLOv5s 7.0 | FP16 | 640 | 0.344 | 0.529 | 0.372 | 618.13 | -| YOLOv5n 7.0 | FP16 | 640 | 0.247 | 0.414 | 0.257 | 629.66 | - -## - -### dGPU installation - -To install the DeepStream on dGPU (x86 platform), without docker, we need to do some steps to prepare the computer. - -
DeepStream 6.2 - -#### 1. Disable Secure Boot in BIOS - -#### 2. Install dependencies - -``` -sudo apt-get update -sudo apt-get install gcc make git libtool autoconf autogen pkg-config cmake -sudo apt-get install python3 python3-dev python3-pip -sudo apt-get install dkms -sudo apt install libssl1.1 libgstreamer1.0-0 gstreamer1.0-tools gstreamer1.0-plugins-good gstreamer1.0-plugins-bad gstreamer1.0-plugins-ugly gstreamer1.0-libav libgstreamer-plugins-base1.0-dev libgstrtspserver-1.0-0 libjansson4 libyaml-cpp-dev libjsoncpp-dev protobuf-compiler -sudo apt-get install linux-headers-$(uname -r) -``` - -**NOTE**: Purge all NVIDIA driver, CUDA, etc (replace $CUDA_PATH to your CUDA path) - -``` -sudo nvidia-uninstall -sudo $CUDA_PATH/bin/cuda-uninstaller -sudo apt-get remove --purge '*nvidia*' -sudo apt-get remove --purge '*cuda*' -sudo apt-get remove --purge '*cudnn*' -sudo apt-get remove --purge '*tensorrt*' -sudo apt autoremove --purge && sudo apt autoclean && sudo apt clean -``` - -#### 3. Install CUDA Keyring - -``` -wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb -sudo dpkg -i cuda-keyring_1.0-1_all.deb -sudo apt-get update -``` - -#### 4. Download and install NVIDIA Driver - -
TITAN, GeForce RTX / GTX series and RTX / Quadro series
- -- Download - - ``` - wget https://us.download.nvidia.com/XFree86/Linux-x86_64/525.105.17/NVIDIA-Linux-x86_64-525.105.17.run - ``` - -
Laptop - -* Run - - ``` - sudo sh NVIDIA-Linux-x86_64-525.105.17.run --no-cc-version-check --silent --disable-nouveau --dkms --install-libglvnd - ``` - - **NOTE**: This step will disable the nouveau drivers. - -* Reboot - - ``` - sudo reboot - ``` - -* Install - - ``` - sudo sh NVIDIA-Linux-x86_64-525.105.17.run --no-cc-version-check --silent --disable-nouveau --dkms --install-libglvnd - ``` - -**NOTE**: If you are using a laptop with NVIDIA Optimius, run - -``` -sudo apt-get install nvidia-prime -sudo prime-select nvidia -``` - -
- -
Desktop - -* Run - - ``` - sudo sh NVIDIA-Linux-x86_64-525.105.17.run --no-cc-version-check --silent --disable-nouveau --dkms --install-libglvnd --run-nvidia-xconfig - ``` - - **NOTE**: This step will disable the nouveau drivers. - -* Reboot - - ``` - sudo reboot - ``` - -* Install - - ``` - sudo sh NVIDIA-Linux-x86_64-525.105.17.run --no-cc-version-check --silent --disable-nouveau --dkms --install-libglvnd --run-nvidia-xconfig - ``` - -
- -
- -
Data center / Tesla series
- - - Download - - ``` - wget https://us.download.nvidia.com/XFree86/Linux-x86_64/525.105.17/NVIDIA-Linux-x86_64-525.105.17.run - ``` - - * Run - - ``` - sudo sh NVIDIA-Linux-x86_64-525.105.17.run --no-cc-version-check --silent --disable-nouveau --dkms --install-libglvnd --run-nvidia-xconfig - ``` - -
- -#### 5. Download and install CUDA - -``` -wget https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run -sudo sh cuda_11.8.0_520.61.05_linux.run --silent --toolkit -``` - -* Export environment variables - - ``` - echo $'export PATH=/usr/local/cuda-11.8/bin${PATH:+:${PATH}}\nexport LD_LIBRARY_PATH=/usr/local/cuda-11.8/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}' >> ~/.bashrc && source ~/.bashrc - ``` - -#### 6. Install TensorRT - -``` -sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub -sudo add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/ /" -sudo apt-get update -sudo apt-get install libnvinfer8=8.5.2-1+cuda11.8 libnvinfer-plugin8=8.5.2-1+cuda11.8 libnvparsers8=8.5.2-1+cuda11.8 libnvonnxparsers8=8.5.2-1+cuda11.8 libnvinfer-bin=8.5.2-1+cuda11.8 libnvinfer-dev=8.5.2-1+cuda11.8 libnvinfer-plugin-dev=8.5.2-1+cuda11.8 libnvparsers-dev=8.5.2-1+cuda11.8 libnvonnxparsers-dev=8.5.2-1+cuda11.8 libnvinfer-samples=8.5.2-1+cuda11.8 libcudnn8=8.7.0.84-1+cuda11.8 libcudnn8-dev=8.7.0.84-1+cuda11.8 python3-libnvinfer=8.5.2-1+cuda11.8 python3-libnvinfer-dev=8.5.2-1+cuda11.8 -sudo apt-mark hold libnvinfer* libnvparsers* libnvonnxparsers* libcudnn8* python3-libnvinfer* tensorrt -``` - -#### 7. Download from [NVIDIA website](https://developer.nvidia.com/deepstream-getting-started) and install the DeepStream SDK - -DeepStream 6.2 for Servers and Workstations (.deb) - -``` -sudo apt-get install ./deepstream-6.2_6.2.0-1_amd64.deb -rm ${HOME}/.cache/gstreamer-1.0/registry.x86_64.bin -sudo ln -snf /usr/local/cuda-11.8 /usr/local/cuda -``` - -#### 8. Reboot the computer - -``` -sudo reboot -``` - -
- -
DeepStream 6.1.1 - -#### 1. Disable Secure Boot in BIOS - -#### 2. Install dependencies - -``` -sudo apt-get update -sudo apt-get install gcc make git libtool autoconf autogen pkg-config cmake -sudo apt-get install python3 python3-dev python3-pip -sudo apt-get install dkms -sudo apt-get install libssl1.1 libgstreamer1.0-0 gstreamer1.0-tools gstreamer1.0-plugins-good gstreamer1.0-plugins-bad gstreamer1.0-plugins-ugly gstreamer1.0-libav libgstreamer-plugins-base1.0-dev libgstrtspserver-1.0-0 libjansson4 libyaml-cpp-dev -sudo apt-get install linux-headers-$(uname -r) -``` - -**NOTE**: Purge all NVIDIA driver, CUDA, etc (replace $CUDA_PATH to your CUDA path) - -``` -sudo nvidia-uninstall -sudo $CUDA_PATH/bin/cuda-uninstaller -sudo apt-get remove --purge '*nvidia*' -sudo apt-get remove --purge '*cuda*' -sudo apt-get remove --purge '*cudnn*' -sudo apt-get remove --purge '*tensorrt*' -sudo apt autoremove --purge && sudo apt autoclean && sudo apt clean -``` - -#### 3. Install CUDA Keyring - -``` -wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb -sudo dpkg -i cuda-keyring_1.0-1_all.deb -sudo apt-get update -``` - -#### 4. Download and install NVIDIA Driver - -
TITAN, GeForce RTX / GTX series and RTX / Quadro series
- -- Download - - ``` - wget https://us.download.nvidia.com/XFree86/Linux-x86_64/515.65.01/NVIDIA-Linux-x86_64-515.65.01.run - ``` - -
Laptop - -* Run - - ``` - sudo sh NVIDIA-Linux-x86_64-515.65.01.run --silent --disable-nouveau --dkms --install-libglvnd - ``` - - **NOTE**: This step will disable the nouveau drivers. - -* Reboot - - ``` - sudo reboot - ``` - -* Install - - ``` - sudo sh NVIDIA-Linux-x86_64-515.65.01.run --silent --disable-nouveau --dkms --install-libglvnd - ``` - -**NOTE**: If you are using a laptop with NVIDIA Optimius, run - -``` -sudo apt-get install nvidia-prime -sudo prime-select nvidia -``` - -
- -
Desktop - -* Run - - ``` - sudo sh NVIDIA-Linux-x86_64-515.65.01.run --silent --disable-nouveau --dkms --install-libglvnd --run-nvidia-xconfig - ``` - - **NOTE**: This step will disable the nouveau drivers. - -* Reboot - - ``` - sudo reboot - ``` - -* Install - - ``` - sudo sh NVIDIA-Linux-x86_64-515.65.01.run --silent --disable-nouveau --dkms --install-libglvnd --run-nvidia-xconfig - ``` - -
- -
- -
Data center / Tesla series
- - - Download - - ``` - wget https://us.download.nvidia.com/tesla/515.65.01/NVIDIA-Linux-x86_64-515.65.01.run - ``` - - * Run - - ``` - sudo sh NVIDIA-Linux-x86_64-515.65.01.run --silent --disable-nouveau --dkms --install-libglvnd --run-nvidia-xconfig - ``` - -
- -#### 5. Download and install CUDA - -``` -wget https://developer.download.nvidia.com/compute/cuda/11.7.1/local_installers/cuda_11.7.1_515.65.01_linux.run -sudo sh cuda_11.7.1_515.65.01_linux.run --silent --toolkit -``` - -* Export environment variables - - ``` - echo $'export PATH=/usr/local/cuda-11.7/bin${PATH:+:${PATH}}\nexport LD_LIBRARY_PATH=/usr/local/cuda-11.7/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}' >> ~/.bashrc && source ~/.bashrc - ``` - -#### 6. Download from [NVIDIA website](https://developer.nvidia.com/nvidia-tensorrt-8x-download) and install the TensorRT - -TensorRT 8.4 GA for Ubuntu 20.04 and CUDA 11.0, 11.1, 11.2, 11.3, 11.4, 11.5, 11.6 and 11.7 DEB local repo Package - -``` -sudo dpkg -i nv-tensorrt-repo-ubuntu2004-cuda11.6-trt8.4.1.5-ga-20220604_1-1_amd64.deb -sudo apt-key add /var/nv-tensorrt-repo-ubuntu2004-cuda11.6-trt8.4.1.5-ga-20220604/9a60d8bf.pub -sudo apt-get update -sudo apt-get install libnvinfer8=8.4.1-1+cuda11.6 libnvinfer-plugin8=8.4.1-1+cuda11.6 libnvparsers8=8.4.1-1+cuda11.6 libnvonnxparsers8=8.4.1-1+cuda11.6 libnvinfer-bin=8.4.1-1+cuda11.6 libnvinfer-dev=8.4.1-1+cuda11.6 libnvinfer-plugin-dev=8.4.1-1+cuda11.6 libnvparsers-dev=8.4.1-1+cuda11.6 libnvonnxparsers-dev=8.4.1-1+cuda11.6 libnvinfer-samples=8.4.1-1+cuda11.6 libcudnn8=8.4.1.50-1+cuda11.6 libcudnn8-dev=8.4.1.50-1+cuda11.6 python3-libnvinfer=8.4.1-1+cuda11.6 python3-libnvinfer-dev=8.4.1-1+cuda11.6 -sudo apt-mark hold libnvinfer* libnvparsers* libnvonnxparsers* libcudnn8* tensorrt -``` - -#### 7. Download from [NVIDIA website](https://developer.nvidia.com/deepstream-getting-started) and install the DeepStream SDK - -DeepStream 6.1.1 for Servers and Workstations (.deb) - -``` -sudo apt-get install ./deepstream-6.1_6.1.1-1_amd64.deb -rm ${HOME}/.cache/gstreamer-1.0/registry.x86_64.bin -sudo ln -snf /usr/local/cuda-11.7 /usr/local/cuda -``` - -#### 8. Reboot the computer - -``` -sudo reboot -``` - -
- -
DeepStream 6.1 - -#### 1. Disable Secure Boot in BIOS - -#### 2. Install dependencies - -``` -sudo apt-get update -sudo apt-get install gcc make git libtool autoconf autogen pkg-config cmake -sudo apt-get install python3 python3-dev python3-pip -sudo apt-get install dkms -sudo apt-get install libssl1.1 libgstreamer1.0-0 gstreamer1.0-tools gstreamer1.0-plugins-good gstreamer1.0-plugins-bad gstreamer1.0-plugins-ugly gstreamer1.0-libav libgstrtspserver-1.0-0 libjansson4 libyaml-cpp-dev -sudo apt-get install linux-headers-$(uname -r) -``` - -**NOTE**: Purge all NVIDIA driver, CUDA, etc (replace $CUDA_PATH to your CUDA path) - -``` -sudo nvidia-uninstall -sudo $CUDA_PATH/bin/cuda-uninstaller -sudo apt-get remove --purge '*nvidia*' -sudo apt-get remove --purge '*cuda*' -sudo apt-get remove --purge '*cudnn*' -sudo apt-get remove --purge '*tensorrt*' -sudo apt autoremove --purge && sudo apt autoclean && sudo apt clean -``` - -#### 3. Install CUDA Keyring - -``` -wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb -sudo dpkg -i cuda-keyring_1.0-1_all.deb -sudo apt-get update -``` - -#### 4. Download and install NVIDIA Driver - -
TITAN, GeForce RTX / GTX series and RTX / Quadro series
- -- Download - - ``` - wget https://us.download.nvidia.com/XFree86/Linux-x86_64/510.47.03/NVIDIA-Linux-x86_64-510.47.03.run - ``` - -
Laptop - -* Run - - ``` - sudo sh NVIDIA-Linux-x86_64-510.47.03.run --silent --disable-nouveau --dkms --install-libglvnd - ``` - - **NOTE**: This step will disable the nouveau drivers. - -* Reboot - - ``` - sudo reboot - ``` - -* Install - - ``` - sudo sh NVIDIA-Linux-x86_64-510.47.03.run --silent --disable-nouveau --dkms --install-libglvnd - ``` - -**NOTE**: If you are using a laptop with NVIDIA Optimius, run - -``` -sudo apt-get install nvidia-prime -sudo prime-select nvidia -``` - -
- -
Desktop - -* Run - - ``` - sudo sh NVIDIA-Linux-x86_64-510.47.03.run --silent --disable-nouveau --dkms --install-libglvnd --run-nvidia-xconfig - ``` - - **NOTE**: This step will disable the nouveau drivers. - -* Reboot - - ``` - sudo reboot - ``` - -* Install - - ``` - sudo sh NVIDIA-Linux-x86_64-510.47.03.run --silent --disable-nouveau --dkms --install-libglvnd --run-nvidia-xconfig - ``` - -
- -
- -
Data center / Tesla series
- - - Download - - ``` - wget https://us.download.nvidia.com/tesla/510.47.03/NVIDIA-Linux-x86_64-510.47.03.run - ``` - - * Run - - ``` - sudo sh NVIDIA-Linux-x86_64-510.47.03.run --silent --disable-nouveau --dkms --install-libglvnd --run-nvidia-xconfig - ``` - -
- -#### 5. Download and install CUDA - -``` -wget https://developer.download.nvidia.com/compute/cuda/11.6.1/local_installers/cuda_11.6.1_510.47.03_linux.run -sudo sh cuda_11.6.1_510.47.03_linux.run --silent --toolkit -``` - -* Export environment variables - - ``` - echo $'export PATH=/usr/local/cuda-11.6/bin${PATH:+:${PATH}}\nexport LD_LIBRARY_PATH=/usr/local/cuda-11.6/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}' >> ~/.bashrc && source ~/.bashrc - ``` - -#### 6. Download from [NVIDIA website](https://developer.nvidia.com/nvidia-tensorrt-8x-download) and install the TensorRT - -TensorRT 8.2 GA Update 4 for Ubuntu 20.04 and CUDA 11.0, 11.1, 11.2, 11.3, 11.4 and 11.5 DEB local repo Package - -``` -sudo dpkg -i nv-tensorrt-repo-ubuntu2004-cuda11.4-trt8.2.5.1-ga-20220505_1-1_amd64.deb -sudo apt-key add /var/nv-tensorrt-repo-ubuntu2004-cuda11.4-trt8.2.5.1-ga-20220505/82307095.pub -sudo apt-get update -sudo apt-get install libnvinfer8=8.2.5-1+cuda11.4 libnvinfer-plugin8=8.2.5-1+cuda11.4 libnvparsers8=8.2.5-1+cuda11.4 libnvonnxparsers8=8.2.5-1+cuda11.4 libnvinfer-bin=8.2.5-1+cuda11.4 libnvinfer-dev=8.2.5-1+cuda11.4 libnvinfer-plugin-dev=8.2.5-1+cuda11.4 libnvparsers-dev=8.2.5-1+cuda11.4 libnvonnxparsers-dev=8.2.5-1+cuda11.4 libnvinfer-samples=8.2.5-1+cuda11.4 libnvinfer-doc=8.2.5-1+cuda11.4 libcudnn8-dev=8.4.0.27-1+cuda11.6 libcudnn8=8.4.0.27-1+cuda11.6 -sudo apt-mark hold libnvinfer* libnvparsers* libnvonnxparsers* libcudnn8* tensorrt -``` - -#### 7. Download from [NVIDIA website](https://developer.nvidia.com/deepstream-sdk-download-tesla-archived) and install the DeepStream SDK - -DeepStream 6.1 for Servers and Workstations (.deb) - -``` -sudo apt-get install ./deepstream-6.1_6.1.0-1_amd64.deb -rm ${HOME}/.cache/gstreamer-1.0/registry.x86_64.bin -sudo ln -snf /usr/local/cuda-11.6 /usr/local/cuda -``` - -#### 8. Reboot the computer - -``` -sudo reboot -``` - -
- -
DeepStream 6.0.1 / 6.0 - -#### 1. Disable Secure Boot in BIOS - -
If you are using a laptop with newer Intel/AMD processors and your Graphics in Settings->Details->About tab is llvmpipe, please update the kernel. - -``` -wget https://kernel.ubuntu.com/~kernel-ppa/mainline/v5.11/amd64/linux-headers-5.11.0-051100_5.11.0-051100.202102142330_all.deb -wget https://kernel.ubuntu.com/~kernel-ppa/mainline/v5.11/amd64/linux-headers-5.11.0-051100-generic_5.11.0-051100.202102142330_amd64.deb -wget https://kernel.ubuntu.com/~kernel-ppa/mainline/v5.11/amd64/linux-image-unsigned-5.11.0-051100-generic_5.11.0-051100.202102142330_amd64.deb -wget https://kernel.ubuntu.com/~kernel-ppa/mainline/v5.11/amd64/linux-modules-5.11.0-051100-generic_5.11.0-051100.202102142330_amd64.deb -sudo dpkg -i *.deb -sudo reboot -``` - -
- -#### 2. Install dependencies - -``` -sudo apt-get update -sudo apt-get install gcc make git libtool autoconf autogen pkg-config cmake -sudo apt-get install python3 python3-dev python3-pip -sudo apt-get install libssl1.0.0 libgstreamer1.0-0 gstreamer1.0-tools gstreamer1.0-plugins-good gstreamer1.0-plugins-bad gstreamer1.0-plugins-ugly gstreamer1.0-libav libgstrtspserver-1.0-0 libjansson4 -sudo apt-get install linux-headers-$(uname -r) -``` - -**NOTE**: Install DKMS only if you are using the default Ubuntu kernel - -``` -sudo apt-get install dkms -``` - -**NOTE**: Purge all NVIDIA driver, CUDA, etc (replace $CUDA_PATH to your CUDA path) - -``` -sudo nvidia-uninstall -sudo $CUDA_PATH/bin/cuda-uninstaller -sudo apt-get remove --purge '*nvidia*' -sudo apt-get remove --purge '*cuda*' -sudo apt-get remove --purge '*cudnn*' -sudo apt-get remove --purge '*tensorrt*' -sudo apt autoremove --purge && sudo apt autoclean && sudo apt clean -``` - -#### 3. Install CUDA Keyring - -``` -wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-keyring_1.0-1_all.deb -sudo dpkg -i cuda-keyring_1.0-1_all.deb -sudo apt-get update -``` - -#### 4. Download and install NVIDIA Driver - -
TITAN, GeForce RTX / GTX series and RTX / Quadro series
- -- Download - - ``` - wget https://us.download.nvidia.com/XFree86/Linux-x86_64/470.129.06/NVIDIA-Linux-x86_64-470.129.06.run - ``` - -
Laptop - -* Run - - ``` - sudo sh NVIDIA-Linux-x86_64-470.129.06.run --silent --disable-nouveau --dkms --install-libglvnd - ``` - - **NOTE**: This step will disable the nouveau drivers. - - **NOTE**: Remove --dkms flag if you installed the 5.11.0 kernel. - -* Reboot - - ``` - sudo reboot - ``` - -* Install - - ``` - sudo sh NVIDIA-Linux-x86_64-470.129.06.run --silent --disable-nouveau --dkms --install-libglvnd - ``` - - **NOTE**: Remove --dkms flag if you installed the 5.11.0 kernel. - -**NOTE**: If you are using a laptop with NVIDIA Optimius, run - -``` -sudo apt-get install nvidia-prime -sudo prime-select nvidia -``` - -
- -
Desktop - -* Run - - ``` - sudo sh NVIDIA-Linux-x86_64-470.129.06.run --silent --disable-nouveau --dkms --install-libglvnd --run-nvidia-xconfig - ``` - - **NOTE**: This step will disable the nouveau drivers. - - **NOTE**: Remove --dkms flag if you installed the 5.11.0 kernel. - -* Reboot - - ``` - sudo reboot - ``` - -* Install - - ``` - sudo sh NVIDIA-Linux-x86_64-470.129.06.run --silent --disable-nouveau --dkms --install-libglvnd --run-nvidia-xconfig - ``` - - **NOTE**: Remove --dkms flag if you installed the 5.11.0 kernel. - -
- -
- -
Data center / Tesla series
- - - Download - - ``` - wget https://us.download.nvidia.com/tesla/470.129.06/NVIDIA-Linux-x86_64-470.129.06.run - ``` - - * Run - - ``` - sudo sh NVIDIA-Linux-x86_64-470.129.06.run --silent --disable-nouveau --dkms --install-libglvnd --run-nvidia-xconfig - ``` - - **NOTE**: Remove --dkms flag if you installed the 5.11.0 kernel. - -
- -#### 5. Download and install CUDA - -``` -wget https://developer.download.nvidia.com/compute/cuda/11.4.1/local_installers/cuda_11.4.1_470.57.02_linux.run -sudo sh cuda_11.4.1_470.57.02_linux.run --silent --toolkit -``` - -* Export environment variables - - ``` - echo $'export PATH=/usr/local/cuda-11.4/bin${PATH:+:${PATH}}\nexport LD_LIBRARY_PATH=/usr/local/cuda-11.4/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}' >> ~/.bashrc && source ~/.bashrc - ``` - -#### 6. Download from [NVIDIA website](https://developer.nvidia.com/nvidia-tensorrt-8x-download) and install the TensorRT - -TensorRT 8.0.1 GA for Ubuntu 18.04 and CUDA 11.3 DEB local repo package - -``` -sudo dpkg -i nv-tensorrt-repo-ubuntu1804-cuda11.3-trt8.0.1.6-ga-20210626_1-1_amd64.deb -sudo apt-key add /var/nv-tensorrt-repo-ubuntu1804-cuda11.3-trt8.0.1.6-ga-20210626/7fa2af80.pub -sudo apt-get update -sudo apt-get install libnvinfer8=8.0.1-1+cuda11.3 libnvinfer-plugin8=8.0.1-1+cuda11.3 libnvparsers8=8.0.1-1+cuda11.3 libnvonnxparsers8=8.0.1-1+cuda11.3 libnvinfer-bin=8.0.1-1+cuda11.3 libnvinfer-dev=8.0.1-1+cuda11.3 libnvinfer-plugin-dev=8.0.1-1+cuda11.3 libnvparsers-dev=8.0.1-1+cuda11.3 libnvonnxparsers-dev=8.0.1-1+cuda11.3 libnvinfer-samples=8.0.1-1+cuda11.3 libnvinfer-doc=8.0.1-1+cuda11.3 libcudnn8-dev=8.2.1.32-1+cuda11.3 libcudnn8=8.2.1.32-1+cuda11.3 -sudo apt-mark hold libnvinfer* libnvparsers* libnvonnxparsers* libcudnn8* tensorrt -``` - -#### 7. Download from [NVIDIA website](https://developer.nvidia.com/deepstream-sdk-download-tesla-archived) and install the DeepStream SDK - -* DeepStream 6.0.1 for Servers and Workstations (.deb) - - ``` - sudo apt-get install ./deepstream-6.0_6.0.1-1_amd64.deb - ``` - -* DeepStream 6.0 for Servers and Workstations (.deb) - - ``` - sudo apt-get install ./deepstream-6.0_6.0.0-1_amd64.deb - ``` - -* Run - - ``` - rm ${HOME}/.cache/gstreamer-1.0/registry.x86_64.bin - sudo ln -snf /usr/local/cuda-11.4 /usr/local/cuda - ``` - -#### 8. Reboot the computer - -``` -sudo reboot -``` - -
- -## - ### Basic usage #### 1. Download the repo @@ -970,7 +195,7 @@ cd DeepStream-Yolo * DeepStream 5.1 on x86 platform ``` - CUDA_VER=11.1 LEGACY=1 make -C nvdsinfer_custom_impl_Yolo + CUDA_VER=11.1 make -C nvdsinfer_custom_impl_Yolo ``` * DeepStream 6.2 / 6.1.1 / 6.1 on Jetson platform @@ -979,18 +204,12 @@ cd DeepStream-Yolo CUDA_VER=11.4 make -C nvdsinfer_custom_impl_Yolo ``` -* DeepStream 6.0.1 / 6.0 on Jetson platform +* DeepStream 6.0.1 / 6.0 / 5.1 on Jetson platform ``` CUDA_VER=10.2 make -C nvdsinfer_custom_impl_Yolo ``` -* DeepStream 5.1 on Jetson platform - - ``` - CUDA_VER=10.2 LEGACY=1 make -C nvdsinfer_custom_impl_Yolo - ``` - #### 4. Edit the `config_infer_primary.txt` file according to your model (example for YOLOv4) ``` @@ -1001,6 +220,14 @@ model-file=yolov4.weights ... ``` +**NOTE**: By default, the dynamic batch-size is set. To use implicit batch-size, uncomment the line + +``` +... +force-implicit-batch-dim=1 +... +``` + #### 5. Run ``` @@ -1066,125 +293,6 @@ topk=300 ## -### INT8 calibration - -**NOTE**: For now, Only for Darknet YOLO model. - -#### 1. Install OpenCV - -``` -sudo apt-get install libopencv-dev -``` - -#### 2. Compile/recompile the `nvdsinfer_custom_impl_Yolo` lib with OpenCV support - -* DeepStream 6.2 on x86 platform - - ``` - CUDA_VER=11.8 OPENCV=1 make -C nvdsinfer_custom_impl_Yolo - ``` - -* DeepStream 6.1.1 on x86 platform - - ``` - CUDA_VER=11.7 OPENCV=1 make -C nvdsinfer_custom_impl_Yolo - ``` - -* DeepStream 6.1 on x86 platform - - ``` - CUDA_VER=11.6 OPENCV=1 make -C nvdsinfer_custom_impl_Yolo - ``` - -* DeepStream 6.0.1 / 6.0 on x86 platform - - ``` - CUDA_VER=11.4 OPENCV=1 make -C nvdsinfer_custom_impl_Yolo - ``` - -* DeepStream 5.1 on x86 platform - - ``` - CUDA_VER=11.1 OPENCV=1 LEGACY=1 make -C nvdsinfer_custom_impl_Yolo - ``` - -* DeepStream 6.2 / 6.1.1 / 6.1 on Jetson platform - - ``` - CUDA_VER=11.4 OPENCV=1 make -C nvdsinfer_custom_impl_Yolo - ``` - -* DeepStream 6.0.1 / 6.0 on Jetson platform - - ``` - CUDA_VER=10.2 OPENCV=1 make -C nvdsinfer_custom_impl_Yolo - ``` - -* DeepStream 5.1 on Jetson platform - - ``` - CUDA_VER=10.2 OPENCV=1 LEGACY=1 make -C nvdsinfer_custom_impl_Yolo - ``` - -#### 3. For COCO dataset, download the [val2017](https://drive.google.com/file/d/1gbvfn7mcsGDRZ_luJwtITL-ru2kK99aK/view?usp=sharing), extract, and move to DeepStream-Yolo folder - -* Select 1000 random images from COCO dataset to run calibration - - ``` - mkdir calibration - ``` - - ``` - for jpg in $(ls -1 val2017/*.jpg | sort -R | head -1000); do \ - cp ${jpg} calibration/; \ - done - ``` - -* Create the `calibration.txt` file with all selected images - - ``` - realpath calibration/*jpg > calibration.txt - ``` - -* Set environment variables - - ``` - export INT8_CALIB_IMG_PATH=calibration.txt - export INT8_CALIB_BATCH_SIZE=1 - ``` - -* Edit the `config_infer` file - - ``` - ... - model-engine-file=model_b1_gpu0_fp32.engine - #int8-calib-file=calib.table - ... - network-mode=0 - ... - ``` - - To - - ``` - ... - model-engine-file=model_b1_gpu0_int8.engine - int8-calib-file=calib.table - ... - network-mode=1 - ... - ``` - -* Run - - ``` - deepstream-app -c deepstream_app_config.txt - ``` - -**NOTE**: NVIDIA recommends at least 500 images to get a good accuracy. On this example, I recommend to use 1000 images to get better accuracy (more images = more accuracy). Higher `INT8_CALIB_BATCH_SIZE` values will result in more accuracy and faster calibration speed. Set it according to you GPU memory. This process may take a long time. 
- -## - ### Extract metadata You can get metadata from DeepStream using Python and C/C++. For C/C++, you can edit the `deepstream-app` or `deepstream-test` codes. For Python, your can install and edit [deepstream_python_apps](https://github.com/NVIDIA-AI-IOT/deepstream_python_apps). diff --git a/config_infer_primary.txt b/config_infer_primary.txt index fa5788d..c0552b8 100644 --- a/config_infer_primary.txt +++ b/config_infer_primary.txt @@ -17,7 +17,9 @@ network-type=0 cluster-mode=2 maintain-aspect-ratio=0 symmetric-padding=1 +#force-implicit-batch-dim=1 parse-bbox-func-name=NvDsInferParseYolo +#parse-bbox-func-name=NvDsInferParseYoloCuda custom-lib-path=nvdsinfer_custom_impl_Yolo/libnvdsinfer_custom_impl_Yolo.so engine-create-func-name=NvDsInferYoloCudaEngineGet diff --git a/config_infer_primary_damoyolo.txt b/config_infer_primary_damoyolo.txt index 6ab6541..7d2a2f7 100644 --- a/config_infer_primary_damoyolo.txt +++ b/config_infer_primary_damoyolo.txt @@ -3,7 +3,7 @@ gpu-id=0 net-scale-factor=1 model-color-format=0 onnx-file=damoyolo_tinynasL25_S.onnx -model-engine-file=damoyolo_tinynasL25_S.onnx_b1_gpu0_fp32.engine +model-engine-file=model_b1_gpu0_fp32.engine #int8-calib-file=calib.table labelfile-path=labels.txt batch-size=1 @@ -15,8 +15,11 @@ process-mode=1 network-type=0 cluster-mode=2 maintain-aspect-ratio=0 +#force-implicit-batch-dim=1 parse-bbox-func-name=NvDsInferParseYoloE +#parse-bbox-func-name=NvDsInferParseYoloECuda custom-lib-path=nvdsinfer_custom_impl_Yolo/libnvdsinfer_custom_impl_Yolo.so +engine-create-func-name=NvDsInferYoloCudaEngineGet [class-attrs-all] nms-iou-threshold=0.45 diff --git a/config_infer_primary_ppyoloe.txt b/config_infer_primary_ppyoloe.txt index 4060360..7c3c642 100644 --- a/config_infer_primary_ppyoloe.txt +++ b/config_infer_primary_ppyoloe.txt @@ -4,7 +4,7 @@ net-scale-factor=0.0173520735727919486 offsets=123.675;116.28;103.53 model-color-format=0 onnx-file=ppyoloe_crn_s_400e_coco.onnx -model-engine-file=ppyoloe_crn_s_400e_coco.onnx_b1_gpu0_fp32.engine +model-engine-file=model_b1_gpu0_fp32.engine #int8-calib-file=calib.table labelfile-path=labels.txt batch-size=1 @@ -16,8 +16,11 @@ process-mode=1 network-type=0 cluster-mode=2 maintain-aspect-ratio=0 +#force-implicit-batch-dim=1 parse-bbox-func-name=NvDsInferParseYoloE +#parse-bbox-func-name=NvDsInferParseYoloECuda custom-lib-path=nvdsinfer_custom_impl_Yolo/libnvdsinfer_custom_impl_Yolo.so +engine-create-func-name=NvDsInferYoloCudaEngineGet [class-attrs-all] nms-iou-threshold=0.45 diff --git a/config_infer_primary_ppyoloe_plus.txt b/config_infer_primary_ppyoloe_plus.txt index 5b5b172..090939b 100644 --- a/config_infer_primary_ppyoloe_plus.txt +++ b/config_infer_primary_ppyoloe_plus.txt @@ -3,7 +3,7 @@ gpu-id=0 net-scale-factor=0.0039215697906911373 model-color-format=0 onnx-file=ppyoloe_plus_crn_s_80e_coco.onnx -model-engine-file=ppyoloe_plus_crn_s_80e_coco.onnx_b1_gpu0_fp32.engine +model-engine-file=model_b1_gpu0_fp32.engine #int8-calib-file=calib.table labelfile-path=labels.txt batch-size=1 @@ -15,8 +15,11 @@ process-mode=1 network-type=0 cluster-mode=2 maintain-aspect-ratio=0 +#force-implicit-batch-dim=1 parse-bbox-func-name=NvDsInferParseYoloE +#parse-bbox-func-name=NvDsInferParseYoloECuda custom-lib-path=nvdsinfer_custom_impl_Yolo/libnvdsinfer_custom_impl_Yolo.so +engine-create-func-name=NvDsInferYoloCudaEngineGet [class-attrs-all] nms-iou-threshold=0.45 diff --git a/config_infer_primary_yoloV2.txt b/config_infer_primary_yoloV2.txt index 7d9ce07..220614f 100644 --- a/config_infer_primary_yoloV2.txt 
+++ b/config_infer_primary_yoloV2.txt @@ -16,7 +16,9 @@ process-mode=1 network-type=0 cluster-mode=2 maintain-aspect-ratio=0 +#force-implicit-batch-dim=1 parse-bbox-func-name=NvDsInferParseYolo +#parse-bbox-func-name=NvDsInferParseYoloCuda custom-lib-path=nvdsinfer_custom_impl_Yolo/libnvdsinfer_custom_impl_Yolo.so engine-create-func-name=NvDsInferYoloCudaEngineGet diff --git a/config_infer_primary_yoloV5.txt b/config_infer_primary_yoloV5.txt index f294ef6..89919b3 100644 --- a/config_infer_primary_yoloV5.txt +++ b/config_infer_primary_yoloV5.txt @@ -3,7 +3,7 @@ gpu-id=0 net-scale-factor=0.0039215697906911373 model-color-format=0 onnx-file=yolov5s.onnx -model-engine-file=yolov5s.onnx_b1_gpu0_fp32.engine +model-engine-file=model_b1_gpu0_fp32.engine #int8-calib-file=calib.table labelfile-path=labels.txt batch-size=1 @@ -16,8 +16,11 @@ network-type=0 cluster-mode=2 maintain-aspect-ratio=1 symmetric-padding=1 +#force-implicit-batch-dim=1 parse-bbox-func-name=NvDsInferParseYolo +#parse-bbox-func-name=NvDsInferParseYoloCuda custom-lib-path=nvdsinfer_custom_impl_Yolo/libnvdsinfer_custom_impl_Yolo.so +engine-create-func-name=NvDsInferYoloCudaEngineGet [class-attrs-all] nms-iou-threshold=0.45 diff --git a/config_infer_primary_yoloV6.txt b/config_infer_primary_yoloV6.txt index 98a487c..6aa6912 100644 --- a/config_infer_primary_yoloV6.txt +++ b/config_infer_primary_yoloV6.txt @@ -3,7 +3,7 @@ gpu-id=0 net-scale-factor=0.0039215697906911373 model-color-format=0 onnx-file=yolov6s.onnx -model-engine-file=yolov6s.onnx_b1_gpu0_fp32.engine +model-engine-file=model_b1_gpu0_fp32.engine #int8-calib-file=calib.table labelfile-path=labels.txt batch-size=1 @@ -16,8 +16,11 @@ network-type=0 cluster-mode=2 maintain-aspect-ratio=1 symmetric-padding=1 +#force-implicit-batch-dim=1 parse-bbox-func-name=NvDsInferParseYolo +#parse-bbox-func-name=NvDsInferParseYoloCuda custom-lib-path=nvdsinfer_custom_impl_Yolo/libnvdsinfer_custom_impl_Yolo.so +engine-create-func-name=NvDsInferYoloCudaEngineGet [class-attrs-all] nms-iou-threshold=0.45 diff --git a/config_infer_primary_yoloV7.txt b/config_infer_primary_yoloV7.txt index 1a16f1d..06906d2 100644 --- a/config_infer_primary_yoloV7.txt +++ b/config_infer_primary_yoloV7.txt @@ -3,7 +3,7 @@ gpu-id=0 net-scale-factor=0.0039215697906911373 model-color-format=0 onnx-file=yolov7.onnx -model-engine-file=yolov7.onnx_b1_gpu0_fp32.engine +model-engine-file=model_b1_gpu0_fp32.engine #int8-calib-file=calib.table labelfile-path=labels.txt batch-size=1 @@ -16,8 +16,11 @@ network-type=0 cluster-mode=2 maintain-aspect-ratio=1 symmetric-padding=1 +#force-implicit-batch-dim=1 parse-bbox-func-name=NvDsInferParseYolo +#parse-bbox-func-name=NvDsInferParseYoloCuda custom-lib-path=nvdsinfer_custom_impl_Yolo/libnvdsinfer_custom_impl_Yolo.so +engine-create-func-name=NvDsInferYoloCudaEngineGet [class-attrs-all] nms-iou-threshold=0.45 diff --git a/config_infer_primary_yoloV8.txt b/config_infer_primary_yoloV8.txt index 25fabd4..79cf321 100644 --- a/config_infer_primary_yoloV8.txt +++ b/config_infer_primary_yoloV8.txt @@ -3,7 +3,7 @@ gpu-id=0 net-scale-factor=0.0039215697906911373 model-color-format=0 onnx-file=yolov8s.onnx -model-engine-file=yolov8s.onnx_b1_gpu0_fp32.engine +model-engine-file=model_b1_gpu0_fp32.engine #int8-calib-file=calib.table labelfile-path=labels.txt batch-size=1 @@ -16,8 +16,11 @@ network-type=0 cluster-mode=2 maintain-aspect-ratio=1 symmetric-padding=1 +#force-implicit-batch-dim=1 parse-bbox-func-name=NvDsInferParseYolo +#parse-bbox-func-name=NvDsInferParseYoloCuda 
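+# note: NvDsInferParseYolo is the CPU bbox parser; the commented
+# NvDsInferParseYoloCuda line selects the GPU bbox parser (slightly
+# slower on V100 per the benchmark notes)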
custom-lib-path=nvdsinfer_custom_impl_Yolo/libnvdsinfer_custom_impl_Yolo.so +engine-create-func-name=NvDsInferYoloCudaEngineGet [class-attrs-all] nms-iou-threshold=0.45 diff --git a/config_infer_primary_yolonas.txt b/config_infer_primary_yolonas.txt index fdf55b6..925c848 100644 --- a/config_infer_primary_yolonas.txt +++ b/config_infer_primary_yolonas.txt @@ -3,7 +3,7 @@ gpu-id=0 net-scale-factor=0.0039215697906911373 model-color-format=0 onnx-file=yolo_nas_s_coco.onnx -model-engine-file=yolo_nas_s_coco.onnx_b1_gpu0_fp32.engine +model-engine-file=model_b1_gpu0_fp32.engine #int8-calib-file=calib.table labelfile-path=labels.txt batch-size=1 @@ -16,8 +16,11 @@ network-type=0 cluster-mode=2 maintain-aspect-ratio=1 symmetric-padding=0 +#force-implicit-batch-dim=1 parse-bbox-func-name=NvDsInferParseYoloE +#parse-bbox-func-name=NvDsInferParseYoloECuda custom-lib-path=nvdsinfer_custom_impl_Yolo/libnvdsinfer_custom_impl_Yolo.so +engine-create-func-name=NvDsInferYoloCudaEngineGet [class-attrs-all] nms-iou-threshold=0.45 diff --git a/config_infer_primary_yolor.txt b/config_infer_primary_yolor.txt index 4883e34..2f5732b 100644 --- a/config_infer_primary_yolor.txt +++ b/config_infer_primary_yolor.txt @@ -3,7 +3,7 @@ gpu-id=0 net-scale-factor=0.0039215697906911373 model-color-format=0 onnx-file=yolor_csp.onnx -model-engine-file=yolor_csp.onnx_b1_gpu0_fp32.engine +model-engine-file=model_b1_gpu0_fp32.engine #int8-calib-file=calib.table labelfile-path=labels.txt batch-size=1 @@ -16,8 +16,11 @@ network-type=0 cluster-mode=2 maintain-aspect-ratio=1 symmetric-padding=1 +#force-implicit-batch-dim=1 parse-bbox-func-name=NvDsInferParseYolo +#parse-bbox-func-name=NvDsInferParseYoloCuda custom-lib-path=nvdsinfer_custom_impl_Yolo/libnvdsinfer_custom_impl_Yolo.so +engine-create-func-name=NvDsInferYoloCudaEngineGet [class-attrs-all] nms-iou-threshold=0.45 diff --git a/config_infer_primary_yolox.txt b/config_infer_primary_yolox.txt index 99888fe..d50618b 100644 --- a/config_infer_primary_yolox.txt +++ b/config_infer_primary_yolox.txt @@ -3,7 +3,7 @@ gpu-id=0 net-scale-factor=1 model-color-format=0 onnx-file=yolox_s.onnx -model-engine-file=yolox_s.onnx_b1_gpu0_fp32.engine +model-engine-file=model_b1_gpu0_fp32.engine #int8-calib-file=calib.table labelfile-path=labels.txt batch-size=1 @@ -16,8 +16,11 @@ network-type=0 cluster-mode=2 maintain-aspect-ratio=1 symmetric-padding=0 +#force-implicit-batch-dim=1 parse-bbox-func-name=NvDsInferParseYolo +#parse-bbox-func-name=NvDsInferParseYoloCuda custom-lib-path=nvdsinfer_custom_impl_Yolo/libnvdsinfer_custom_impl_Yolo.so +engine-create-func-name=NvDsInferYoloCudaEngineGet [class-attrs-all] nms-iou-threshold=0.45 diff --git a/config_infer_primary_yolox_legacy.txt b/config_infer_primary_yolox_legacy.txt index cc3c3b6..9aefdd9 100644 --- a/config_infer_primary_yolox_legacy.txt +++ b/config_infer_primary_yolox_legacy.txt @@ -4,7 +4,7 @@ net-scale-factor=0.0173520735727919486 offsets=123.675;116.28;103.53 model-color-format=0 onnx-file=yolox_s.onnx -model-engine-file=yolox_s.onnx_b1_gpu0_fp32.engine +model-engine-file=model_b1_gpu0_fp32.engine #int8-calib-file=calib.table labelfile-path=labels.txt batch-size=1 @@ -17,8 +17,11 @@ network-type=0 cluster-mode=2 maintain-aspect-ratio=1 symmetric-padding=0 +#force-implicit-batch-dim=1 parse-bbox-func-name=NvDsInferParseYolo +#parse-bbox-func-name=NvDsInferParseYoloCuda custom-lib-path=nvdsinfer_custom_impl_Yolo/libnvdsinfer_custom_impl_Yolo.so +engine-create-func-name=NvDsInferYoloCudaEngineGet [class-attrs-all] nms-iou-threshold=0.45 
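All of the config_infer changes above follow one pattern. As a quick reference, here is a minimal sketch of the affected `[property]` keys for an ONNX-exported model (the `model.onnx` name is a placeholder; every key and value below is taken from the diffs above):

```
[property]
# exported ONNX model (placeholder file name)
onnx-file=model.onnx
# engine file now uses the generic name instead of one derived from the ONNX file
model-engine-file=model_b1_gpu0_fp32.engine
# dynamic batch-size is the default; uncomment to build an implicit-batch engine
#force-implicit-batch-dim=1
# CPU bbox parser by default; the commented line selects the CUDA parser
parse-bbox-func-name=NvDsInferParseYolo
#parse-bbox-func-name=NvDsInferParseYoloCuda
custom-lib-path=nvdsinfer_custom_impl_Yolo/libnvdsinfer_custom_impl_Yolo.so
# custom ONNX model parser introduced by this change
engine-create-func-name=NvDsInferYoloCudaEngineGet
```

Models parsed with `NvDsInferParseYoloE` (PP-YOLOE, DAMO-YOLO, YOLO-NAS) follow the same pattern with the `NvDsInferParseYoloE` / `NvDsInferParseYoloECuda` function names.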
diff --git a/docs/DAMOYOLO.md b/docs/DAMOYOLO.md index e55b49a..4deb7c4 100644 --- a/docs/DAMOYOLO.md +++ b/docs/DAMOYOLO.md @@ -43,6 +43,24 @@ Generate the ONNX model file (example for DAMO-YOLO-S*) python3 export_damoyolo.py -w damoyolo_tinynasL25_S_477.pth -c configs/damoyolo_tinynasL25_S.py --simplify --dynamic ``` +**NOTE**: To simplify the ONNX model + +``` +--simplify +``` + +**NOTE**: To use dynamic batch-size + +``` +--dynamic +``` + +**NOTE**: To use implicit batch-size (example for batch-size = 4) + +``` +--batch 4 +``` + **NOTE**: If you are using DeepStream 5.1, use opset 11 or lower. ``` @@ -107,7 +125,7 @@ Open the `DeepStream-Yolo` folder and compile the lib * DeepStream 5.1 on x86 platform ``` - CUDA_VER=11.1 LEGACY=1 make -C nvdsinfer_custom_impl_Yolo + CUDA_VER=11.1 make -C nvdsinfer_custom_impl_Yolo ``` * DeepStream 6.2 / 6.1.1 / 6.1 on Jetson platform @@ -116,18 +134,12 @@ Open the `DeepStream-Yolo` folder and compile the lib CUDA_VER=11.4 make -C nvdsinfer_custom_impl_Yolo ``` -* DeepStream 6.0.1 / 6.0 on Jetson platform +* DeepStream 6.0.1 / 6.0 / 5.1 on Jetson platform ``` CUDA_VER=10.2 make -C nvdsinfer_custom_impl_Yolo ``` -* DeepStream 5.1 on Jetson platform - - ``` - CUDA_VER=10.2 LEGACY=1 make -C nvdsinfer_custom_impl_Yolo - ``` - ## ### Edit the config_infer_primary_damoyolo file @@ -138,7 +150,6 @@ Edit the `config_infer_primary_damoyolo.txt` file according to your model (examp [property] ... onnx-file=damoyolo_tinynasL25_S.onnx -model-engine-file=damoyolo_tinynasL25_S.onnx_b1_gpu0_fp32.engine ... num-detected-classes=80 ... @@ -149,7 +160,17 @@ parse-bbox-func-name=NvDsInferParseYoloE **NOTE**: The **DAMO-YOLO** do not resize the input with padding. To get better accuracy, use ``` +... maintain-aspect-ratio=0 +... +``` + +**NOTE**: By default, the dynamic batch-size is set. To use implicit batch-size, uncomment the line + +``` +... +force-implicit-batch-dim=1 +... ``` ## diff --git a/docs/INT8Calibration.md b/docs/INT8Calibration.md new file mode 100644 index 0000000..5b40e22 --- /dev/null +++ b/docs/INT8Calibration.md @@ -0,0 +1,108 @@ +# INT8 calibration (PTQ) + +### 1. Install OpenCV + +``` +sudo apt-get install libopencv-dev +``` + +### 2. Compile/recompile the `nvdsinfer_custom_impl_Yolo` lib with OpenCV support + +* DeepStream 6.2 on x86 platform + + ``` + CUDA_VER=11.8 OPENCV=1 make -C nvdsinfer_custom_impl_Yolo + ``` + +* DeepStream 6.1.1 on x86 platform + + ``` + CUDA_VER=11.7 OPENCV=1 make -C nvdsinfer_custom_impl_Yolo + ``` + +* DeepStream 6.1 on x86 platform + + ``` + CUDA_VER=11.6 OPENCV=1 make -C nvdsinfer_custom_impl_Yolo + ``` + +* DeepStream 6.0.1 / 6.0 on x86 platform + + ``` + CUDA_VER=11.4 OPENCV=1 make -C nvdsinfer_custom_impl_Yolo + ``` + +* DeepStream 5.1 on x86 platform + + ``` + CUDA_VER=11.1 OPENCV=1 make -C nvdsinfer_custom_impl_Yolo + ``` + +* DeepStream 6.2 / 6.1.1 / 6.1 on Jetson platform + + ``` + CUDA_VER=11.4 OPENCV=1 make -C nvdsinfer_custom_impl_Yolo + ``` + +* DeepStream 6.0.1 / 6.0 / 5.1 on Jetson platform + + ``` + CUDA_VER=10.2 OPENCV=1 make -C nvdsinfer_custom_impl_Yolo + ``` + +### 3. 
For the COCO dataset, download [val2017](https://drive.google.com/file/d/1gbvfn7mcsGDRZ_luJwtITL-ru2kK99aK/view?usp=sharing), extract it, and move it to the DeepStream-Yolo folder
+
+* Select 1000 random images from the COCO dataset to run calibration
+
+  ```
+  mkdir calibration
+  ```
+
+  ```
+  for jpg in $(ls -1 val2017/*.jpg | sort -R | head -1000); do \
+      cp ${jpg} calibration/; \
+  done
+  ```
+
+* Create the `calibration.txt` file with all selected images
+
+  ```
+  realpath calibration/*jpg > calibration.txt
+  ```
+
+* Set environment variables
+
+  ```
+  export INT8_CALIB_IMG_PATH=calibration.txt
+  export INT8_CALIB_BATCH_SIZE=1
+  ```
+
+* Edit the `config_infer` file
+
+  ```
+  ...
+  model-engine-file=model_b1_gpu0_fp32.engine
+  #int8-calib-file=calib.table
+  ...
+  network-mode=0
+  ...
+  ```
+
+  To
+
+  ```
+  ...
+  model-engine-file=model_b1_gpu0_int8.engine
+  int8-calib-file=calib.table
+  ...
+  network-mode=1
+  ...
+  ```
+
+* Run
+
+  ```
+  deepstream-app -c deepstream_app_config.txt
+  ```
+
+**NOTE**: NVIDIA recommends at least 500 images to get good accuracy. In this example, I recommend using 1000 images to get better accuracy (more images = more accuracy). Higher `INT8_CALIB_BATCH_SIZE` values will result in more accuracy and faster calibration speed. Set it according to your GPU memory. This process may take a long time.
diff --git a/docs/PPYOLOE.md b/docs/PPYOLOE.md
index cd61bf1..f9a8277 100644
--- a/docs/PPYOLOE.md
+++ b/docs/PPYOLOE.md
@@ -38,7 +38,25 @@ Generate the ONNX model file (example for PP-YOLOE+_s)

 ```
 pip3 install onnx onnxsim onnxruntime
-python3 export_ppyoloe.py -w ppyoloe_plus_crn_s_80e_coco.pdparams -c configs/ppyoloe/ppyoloe_plus_crn_s_80e_coco.yml --simplify
+python3 export_ppyoloe.py -w ppyoloe_plus_crn_s_80e_coco.pdparams -c configs/ppyoloe/ppyoloe_plus_crn_s_80e_coco.yml --simplify --dynamic
+```
+
+**NOTE**: To simplify the ONNX model
+
+```
+--simplify
+```
+
+**NOTE**: To use dynamic batch-size
+
+```
+--dynamic
+```
+
+**NOTE**: To use implicit batch-size (example for batch-size = 4)
+
+```
+--batch 4
 ```

 **NOTE**: If you are using DeepStream 5.1, use opset 12 or lower. The default opset is 11.
@@ -84,7 +102,7 @@ Open the `DeepStream-Yolo` folder and compile the lib

 * DeepStream 5.1 on x86 platform

   ```
-  CUDA_VER=11.1 LEGACY=1 make -C nvdsinfer_custom_impl_Yolo
+  CUDA_VER=11.1 make -C nvdsinfer_custom_impl_Yolo
   ```

 * DeepStream 6.2 / 6.1.1 / 6.1 on Jetson platform
@@ -93,18 +111,12 @@ Open the `DeepStream-Yolo` folder and compile the lib

   CUDA_VER=11.4 make -C nvdsinfer_custom_impl_Yolo
   ```

-* DeepStream 6.0.1 / 6.0 on Jetson platform
+* DeepStream 6.0.1 / 6.0 / 5.1 on Jetson platform

   ```
   CUDA_VER=10.2 make -C nvdsinfer_custom_impl_Yolo
   ```

-* DeepStream 5.1 on Jetson platform
-
-  ```
-  CUDA_VER=10.2 LEGACY=1 make -C nvdsinfer_custom_impl_Yolo
-  ```
-
 ##

 ### Edit the config_infer_primary_ppyoloe_plus file
@@ -115,7 +127,6 @@ Edit the `config_infer_primary_ppyoloe_plus.txt` file according to your model (e

 [property]
 ...
 onnx-file=ppyoloe_plus_crn_s_80e_coco.onnx
-model-engine-file=ppyoloe_plus_crn_s_80e_coco.onnx_b1_gpu0_fp32.engine
 ...
 num-detected-classes=80
 ...
@@ -128,13 +139,17 @@ parse-bbox-func-name=NvDsInferParseYoloE

 **NOTE**: The **PP-YOLOE+ and PP-YOLOE legacy** do not resize the input with padding. To get better accuracy, use

 ```
+...
 maintain-aspect-ratio=0
+...
 ```

 **NOTE**: The **PP-YOLOE+** uses zero mean normalization on the image preprocess. It is important to change the `net-scale-factor` according to the trained values.

 ```
+...
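+# note: 0.0039215697906911373 ~ 1/255, i.e. pixel values are scaled
+# to [0,1]; zero mean, so no offsets entry is needed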
net-scale-factor=0.0039215697906911373 +... ``` **NOTE**: The **PP-YOLOE legacy** uses normalization on the image preprocess. It is important to change the `net-scale-factor` and `offsets` according to the trained values. @@ -142,8 +157,18 @@ net-scale-factor=0.0039215697906911373 Default: `mean = 0.485, 0.456, 0.406` and `std = 0.229, 0.224, 0.225` ``` +... net-scale-factor=0.0173520735727919486 offsets=123.675;116.28;103.53 +... +``` + +**NOTE**: By default, the dynamic batch-size is set. To use implicit batch-size, uncomment the line + +``` +... +force-implicit-batch-dim=1 +... ``` ## diff --git a/docs/YOLONAS.md b/docs/YOLONAS.md index a28172e..49c25e9 100644 --- a/docs/YOLONAS.md +++ b/docs/YOLONAS.md @@ -46,6 +46,24 @@ Generate the ONNX model file (example for YOLO-NAS S) python3 export_yolonas.py -m yolo_nas_s -w yolo_nas_s_coco.pth --simplify --dynamic ``` +**NOTE**: To simplify the ONNX model + +``` +--simplify +``` + +**NOTE**: To use dynamic batch-size + +``` +--dynamic +``` + +**NOTE**: To use implicit batch-size (example for batch-size = 4) + +``` +--batch 4 +``` + **NOTE**: If you are using DeepStream 5.1, use opset 12 or lower. The default opset is 14. ``` @@ -128,7 +146,7 @@ Open the `DeepStream-Yolo` folder and compile the lib * DeepStream 5.1 on x86 platform ``` - CUDA_VER=11.1 LEGACY=1 make -C nvdsinfer_custom_impl_Yolo + CUDA_VER=11.1 make -C nvdsinfer_custom_impl_Yolo ``` * DeepStream 6.2 / 6.1.1 / 6.1 on Jetson platform @@ -137,18 +155,12 @@ Open the `DeepStream-Yolo` folder and compile the lib CUDA_VER=11.4 make -C nvdsinfer_custom_impl_Yolo ``` -* DeepStream 6.0.1 / 6.0 on Jetson platform +* DeepStream 6.0.1 / 6.0 / 5.1 on Jetson platform ``` CUDA_VER=10.2 make -C nvdsinfer_custom_impl_Yolo ``` -* DeepStream 5.1 on Jetson platform - - ``` - CUDA_VER=10.2 LEGACY=1 make -C nvdsinfer_custom_impl_Yolo - ``` - ## ### Edit the config_infer_primary_yolonas file @@ -159,7 +171,6 @@ Edit the `config_infer_primary_yolonas.txt` file according to your model (exampl [property] ... onnx-file=yolo_nas_s_coco.onnx -model-engine-file=yolo_nas_s_coco.onnx_b1_gpu0_fp32.engine ... num-detected-classes=80 ... @@ -170,8 +181,18 @@ parse-bbox-func-name=NvDsInferParseYoloE **NOTE**: The **YOLO-NAS** resizes the input with left/top padding. To get better accuracy, use ``` +... maintain-aspect-ratio=1 symmetric-padding=0 +... +``` + +**NOTE**: By default, the dynamic batch-size is set. To use implicit batch-size, uncomment the line + +``` +... +force-implicit-batch-dim=1 +... ``` ## diff --git a/docs/YOLOR.md b/docs/YOLOR.md index bfa28c7..a6cb481 100644 --- a/docs/YOLOR.md +++ b/docs/YOLOR.md @@ -55,6 +55,24 @@ Generate the ONNX model file python3 export_yolor.py -w yolor-p6.pt --simplify --dynamic ``` +**NOTE**: To simplify the ONNX model + +``` +--simplify +``` + +**NOTE**: To use dynamic batch-size + +``` +--dynamic +``` + +**NOTE**: To use implicit batch-size (example for batch-size = 4) + +``` +--batch 4 +``` + **NOTE**: If you are using DeepStream 5.1, use opset 12 or lower. The default opset is 12. 
``` @@ -125,7 +143,7 @@ Open the `DeepStream-Yolo` folder and compile the lib * DeepStream 5.1 on x86 platform ``` - CUDA_VER=11.1 LEGACY=1 make -C nvdsinfer_custom_impl_Yolo + CUDA_VER=11.1 make -C nvdsinfer_custom_impl_Yolo ``` * DeepStream 6.2 / 6.1.1 / 6.1 on Jetson platform @@ -134,18 +152,12 @@ Open the `DeepStream-Yolo` folder and compile the lib CUDA_VER=11.4 make -C nvdsinfer_custom_impl_Yolo ``` -* DeepStream 6.0.1 / 6.0 on Jetson platform +* DeepStream 6.0.1 / 6.0 / 5.1 on Jetson platform ``` CUDA_VER=10.2 make -C nvdsinfer_custom_impl_Yolo ``` -* DeepStream 5.1 on Jetson platform - - ``` - CUDA_VER=10.2 LEGACY=1 make -C nvdsinfer_custom_impl_Yolo - ``` - ## ### Edit the config_infer_primary_yolor file @@ -156,7 +168,6 @@ Edit the `config_infer_primary_yolor.txt` file according to your model (example [property] ... onnx-file=yolor_csp.onnx -model-engine-file=yolor_csp.onnx_b1_gpu0_fp32.engine ... num-detected-classes=80 ... @@ -167,8 +178,18 @@ parse-bbox-func-name=NvDsInferParseYolo **NOTE**: The **YOLOR** resizes the input with center padding. To get better accuracy, use ``` +... maintain-aspect-ratio=1 symmetric-padding=1 +... +``` + +**NOTE**: By default, the dynamic batch-size is set. To use implicit batch-size, uncomment the line + +``` +... +force-implicit-batch-dim=1 +... ``` ## diff --git a/docs/YOLOX.md b/docs/YOLOX.md index 2b719e9..c33c567 100644 --- a/docs/YOLOX.md +++ b/docs/YOLOX.md @@ -46,6 +46,24 @@ Generate the ONNX model file (example for YOLOX-s) python3 export_yolox.py -w yolox_s.pth -c exps/default/yolox_s.py --simplify --dynamic ``` +**NOTE**: To simplify the ONNX model + +``` +--simplify +``` + +**NOTE**: To use dynamic batch-size + +``` +--dynamic +``` + +**NOTE**: To use implicit batch-size (example for batch-size = 4) + +``` +--batch 4 +``` + **NOTE**: If you are using DeepStream 5.1, use opset 12 or lower. The default opset is 11. ``` @@ -89,7 +107,7 @@ Open the `DeepStream-Yolo` folder and compile the lib * DeepStream 5.1 on x86 platform ``` - CUDA_VER=11.1 LEGACY=1 make -C nvdsinfer_custom_impl_Yolo + CUDA_VER=11.1 make -C nvdsinfer_custom_impl_Yolo ``` * DeepStream 6.2 / 6.1.1 / 6.1 on Jetson platform @@ -98,18 +116,12 @@ Open the `DeepStream-Yolo` folder and compile the lib CUDA_VER=11.4 make -C nvdsinfer_custom_impl_Yolo ``` -* DeepStream 6.0.1 / 6.0 on Jetson platform +* DeepStream 6.0.1 / 6.0 / 5.1 on Jetson platform ``` CUDA_VER=10.2 make -C nvdsinfer_custom_impl_Yolo ``` -* DeepStream 5.1 on Jetson platform - - ``` - CUDA_VER=10.2 LEGACY=1 make -C nvdsinfer_custom_impl_Yolo - ``` - ## ### Edit the config_infer_primary_yolox file @@ -120,7 +132,6 @@ Edit the `config_infer_primary_yolox.txt` file according to your model (example [property] ... onnx-file=yolox_s.onnx -model-engine-file=yolox_s.onnx_b1_gpu0_fp32.engine ... num-detected-classes=80 ... @@ -133,14 +144,18 @@ parse-bbox-func-name=NvDsInferParseYolo **NOTE**: The **YOLOX and YOLOX legacy** resize the input with left/top padding. To get better accuracy, use ``` +... maintain-aspect-ratio=1 symmetric-padding=0 +... ``` **NOTE**: The **YOLOX** uses no normalization on the image preprocess. It is important to change the `net-scale-factor` according to the trained values. ``` +... net-scale-factor=1 +... ``` **NOTE**: The **YOLOX legacy** uses normalization on the image preprocess. It is important to change the `net-scale-factor` and `offsets` according to the trained values. 
@@ -148,8 +163,18 @@ net-scale-factor=1 Default: `mean = 0.485, 0.456, 0.406` and `std = 0.229, 0.224, 0.225` ``` +... net-scale-factor=0.0173520735727919486 offsets=123.675;116.28;103.53 +... +``` + +**NOTE**: By default, the dynamic batch-size is set. To use implicit batch-size, uncomment the line + +``` +... +force-implicit-batch-dim=1 +... ``` ## diff --git a/docs/YOLOv5.md b/docs/YOLOv5.md index 66a0309..aabfad4 100644 --- a/docs/YOLOv5.md +++ b/docs/YOLOv5.md @@ -47,6 +47,24 @@ Generate the ONNX model file (example for YOLOv5s) python3 export_yoloV5.py -w yolov5s.pt --simplify --dynamic ``` +**NOTE**: To simplify the ONNX model + +``` +--simplify +``` + +**NOTE**: To use dynamic batch-size + +``` +--dynamic +``` + +**NOTE**: To use implicit batch-size (example for batch-size = 4) + +``` +--batch 4 +``` + **NOTE**: If you are using DeepStream 5.1, use opset 12 or lower. The default opset is 17. ``` @@ -117,7 +135,7 @@ Open the `DeepStream-Yolo` folder and compile the lib * DeepStream 5.1 on x86 platform ``` - CUDA_VER=11.1 LEGACY=1 make -C nvdsinfer_custom_impl_Yolo + CUDA_VER=11.1 make -C nvdsinfer_custom_impl_Yolo ``` * DeepStream 6.2 / 6.1.1 / 6.1 on Jetson platform @@ -126,18 +144,12 @@ Open the `DeepStream-Yolo` folder and compile the lib CUDA_VER=11.4 make -C nvdsinfer_custom_impl_Yolo ``` -* DeepStream 6.0.1 / 6.0 on Jetson platform +* DeepStream 6.0.1 / 6.0 / 5.1 on Jetson platform ``` CUDA_VER=10.2 make -C nvdsinfer_custom_impl_Yolo ``` -* DeepStream 5.1 on Jetson platform - - ``` - CUDA_VER=10.2 LEGACY=1 make -C nvdsinfer_custom_impl_Yolo - ``` - ## ### Edit the config_infer_primary_yoloV5 file @@ -148,7 +160,6 @@ Edit the `config_infer_primary_yoloV5.txt` file according to your model (example [property] ... onnx-file=yolov5s.onnx -model-engine-file=yolov5s.onnx_b1_gpu0_fp32.engine ... num-detected-classes=80 ... @@ -159,8 +170,18 @@ parse-bbox-func-name=NvDsInferParseYolo **NOTE**: The **YOLOv5** resizes the input with center padding. To get better accuracy, use ``` +... maintain-aspect-ratio=1 symmetric-padding=1 +... +``` + +**NOTE**: By default, the dynamic batch-size is set. To use implicit batch-size, uncomment the line + +``` +... +force-implicit-batch-dim=1 +... ``` ## diff --git a/docs/YOLOv6.md b/docs/YOLOv6.md index 38d70f1..680b69e 100644 --- a/docs/YOLOv6.md +++ b/docs/YOLOv6.md @@ -47,6 +47,24 @@ Generate the ONNX model file (example for YOLOv6-S 4.0) python3 export_yoloV6.py -w yolov6s.pt --simplify --dynamic ``` +**NOTE**: To simplify the ONNX model + +``` +--simplify +``` + +**NOTE**: To use dynamic batch-size + +``` +--dynamic +``` + +**NOTE**: To use implicit batch-size (example for batch-size = 4) + +``` +--batch 4 +``` + **NOTE**: If you are using DeepStream 5.1, use opset 12 or lower. The default opset is 13. 
``` @@ -117,7 +135,7 @@ Open the `DeepStream-Yolo` folder and compile the lib * DeepStream 5.1 on x86 platform ``` - CUDA_VER=11.1 LEGACY=1 make -C nvdsinfer_custom_impl_Yolo + CUDA_VER=11.1 make -C nvdsinfer_custom_impl_Yolo ``` * DeepStream 6.2 / 6.1.1 / 6.1 on Jetson platform @@ -126,18 +144,12 @@ Open the `DeepStream-Yolo` folder and compile the lib CUDA_VER=11.4 make -C nvdsinfer_custom_impl_Yolo ``` -* DeepStream 6.0.1 / 6.0 on Jetson platform +* DeepStream 6.0.1 / 6.0 / 5.1 on Jetson platform ``` CUDA_VER=10.2 make -C nvdsinfer_custom_impl_Yolo ``` -* DeepStream 5.1 on Jetson platform - - ``` - CUDA_VER=10.2 LEGACY=1 make -C nvdsinfer_custom_impl_Yolo - ``` - ## ### Edit the config_infer_primary_yoloV6 file @@ -148,7 +160,6 @@ Edit the `config_infer_primary_yoloV6.txt` file according to your model (example [property] ... onnx-file=yolov6s.onnx -model-engine-file=yolov6s.onnx_b1_gpu0_fp32.engine ... num-detected-classes=80 ... @@ -159,8 +170,18 @@ parse-bbox-func-name=NvDsInferParseYolo **NOTE**: The **YOLOv6** resizes the input with center padding. To get better accuracy, use ``` +... maintain-aspect-ratio=1 symmetric-padding=1 +... +``` + +**NOTE**: By default, the dynamic batch-size is set. To use implicit batch-size, uncomment the line + +``` +... +force-implicit-batch-dim=1 +... ``` ## diff --git a/docs/YOLOv7.md b/docs/YOLOv7.md index d75cde4..99cc4f8 100644 --- a/docs/YOLOv7.md +++ b/docs/YOLOv7.md @@ -49,6 +49,24 @@ Generate the ONNX model file (example for YOLOv7) python3 export_yoloV7.py -w yolov7.pt --simplify --dynamic ``` +**NOTE**: To simplify the ONNX model + +``` +--simplify +``` + +**NOTE**: To use dynamic batch-size + +``` +--dynamic +``` + +**NOTE**: To use implicit batch-size (example for batch-size = 4) + +``` +--batch 4 +``` + **NOTE**: If you are using DeepStream 5.1, use opset 12 or lower. The default opset is 12. ``` @@ -119,7 +137,7 @@ Open the `DeepStream-Yolo` folder and compile the lib * DeepStream 5.1 on x86 platform ``` - CUDA_VER=11.1 LEGACY=1 make -C nvdsinfer_custom_impl_Yolo + CUDA_VER=11.1 make -C nvdsinfer_custom_impl_Yolo ``` * DeepStream 6.2 / 6.1.1 / 6.1 on Jetson platform @@ -128,18 +146,12 @@ Open the `DeepStream-Yolo` folder and compile the lib CUDA_VER=11.4 make -C nvdsinfer_custom_impl_Yolo ``` -* DeepStream 6.0.1 / 6.0 on Jetson platform +* DeepStream 6.0.1 / 6.0 / 5.1 on Jetson platform ``` CUDA_VER=10.2 make -C nvdsinfer_custom_impl_Yolo ``` -* DeepStream 5.1 on Jetson platform - - ``` - CUDA_VER=10.2 LEGACY=1 make -C nvdsinfer_custom_impl_Yolo - ``` - ## ### Edit the config_infer_primary_yoloV7 file @@ -150,7 +162,6 @@ Edit the `config_infer_primary_yoloV7.txt` file according to your model (example [property] ... onnx-file=yolov7.onnx -model-engine-file=yolov7.onnx_b1_gpu0_fp32.engine ... num-detected-classes=80 ... @@ -161,8 +172,18 @@ parse-bbox-func-name=NvDsInferParseYolo **NOTE**: The **YOLOv7** resizes the input with center padding. To get better accuracy, use ``` +... maintain-aspect-ratio=1 symmetric-padding=1 +... +``` + +**NOTE**: By default, the dynamic batch-size is set. To use implicit batch-size, uncomment the line + +``` +... +force-implicit-batch-dim=1 +... 
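+# note: with implicit batch-size the engine is built for the fixed
+# batch-size set in this config (e.g. matching --batch 4 at export)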
``` ## diff --git a/docs/YOLOv8.md b/docs/YOLOv8.md index 64955d7..67147da 100644 --- a/docs/YOLOv8.md +++ b/docs/YOLOv8.md @@ -46,6 +46,24 @@ Generate the ONNX model file (example for YOLOv8s) python3 export_yoloV8.py -w yolov8s.pt --simplify --dynamic ``` +**NOTE**: To simplify the ONNX model + +``` +--simplify +``` + +**NOTE**: To use dynamic batch-size + +``` +--dynamic +``` + +**NOTE**: To use implicit batch-size (example for batch-size = 4) + +``` +--batch 4 +``` + **NOTE**: If you are using DeepStream 5.1, use opset 12 or lower. The default opset is 16. ``` @@ -110,7 +128,7 @@ Open the `DeepStream-Yolo` folder and compile the lib * DeepStream 5.1 on x86 platform ``` - CUDA_VER=11.1 LEGACY=1 make -C nvdsinfer_custom_impl_Yolo + CUDA_VER=11.1 make -C nvdsinfer_custom_impl_Yolo ``` * DeepStream 6.2 / 6.1.1 / 6.1 on Jetson platform @@ -119,18 +137,12 @@ Open the `DeepStream-Yolo` folder and compile the lib CUDA_VER=11.4 make -C nvdsinfer_custom_impl_Yolo ``` -* DeepStream 6.0.1 / 6.0 on Jetson platform +* DeepStream 6.0.1 / 6.0 / 5.1 on Jetson platform ``` CUDA_VER=10.2 make -C nvdsinfer_custom_impl_Yolo ``` -* DeepStream 5.1 on Jetson platform - - ``` - CUDA_VER=10.2 LEGACY=1 make -C nvdsinfer_custom_impl_Yolo - ``` - ## ### Edit the config_infer_primary_yoloV8 file @@ -141,7 +153,6 @@ Edit the `config_infer_primary_yoloV8.txt` file according to your model (example [property] ... onnx-file=yolov8s.onnx -model-engine-file=yolov8s.onnx_b1_gpu0_fp32.engine ... num-detected-classes=80 ... @@ -152,8 +163,18 @@ parse-bbox-func-name=NvDsInferParseYolo **NOTE**: The **YOLOv8** resizes the input with center padding. To get better accuracy, use ``` +... maintain-aspect-ratio=1 symmetric-padding=1 +... +``` + +**NOTE**: By default, the dynamic batch-size is set. To use implicit batch-size, uncomment the line + +``` +... +force-implicit-batch-dim=1 +... ``` ## diff --git a/docs/benchmarks.md b/docs/benchmarks.md new file mode 100644 index 0000000..7719569 --- /dev/null +++ b/docs/benchmarks.md @@ -0,0 +1,88 @@ +# Benchmarks + +### Config + +``` +board = NVIDIA Tesla V100 16GB (AWS: p3.2xlarge) +batch-size = 1 +eval = val2017 (COCO) +sample = 1920x1080 video +``` + +**NOTE**: Used maintain-aspect-ratio=1 in config_infer file for Darknet (with letter_box=1) and PyTorch models. + +### NMS config + +- Eval + +``` +nms-iou-threshold = 0.6 (Darknet) / 0.65 (YOLOv5, YOLOv6, YOLOv7, YOLOR and YOLOX) / 0.7 (Paddle, YOLO-NAS, DAMO-YOLO, YOLOv8 and YOLOv7-u6) +pre-cluster-threshold = 0.001 +topk = 300 +``` + +- Test + +``` +nms-iou-threshold = 0.45 +pre-cluster-threshold = 0.25 +topk = 300 +``` + +### Results + +**NOTE**: * = PyTorch. + +**NOTE**: ** = The YOLOv4 is trained with the trainvalno5k set, so the mAP is high on val2017 test. + +**NOTE**: star = DAMO-YOLO model trained with distillation. + +**NOTE**: The V100 GPU decoder max out at 625-635 FPS on DeepStream even using lighter models. + +**NOTE**: The GPU bbox parser is a bit slower than CPU bbox parser on V100 GPU tests. + +| DeepStream | Precision | Resolution | IoU=0.5:0.95 | IoU=0.5 | IoU=0.75 | FPS
(without display) | +|:------------------:|:---------:|:----------:|:------------:|:-------:|:--------:|:--------------------------:| +| YOLO-NAS L | FP16 | 640 | 0.484 | 0.658 | 0.532 | 235.27 | +| YOLO-NAS M | FP16 | 640 | 0.480 | 0.651 | 0.524 | 287.39 | +| YOLO-NAS S | FP16 | 640 | 0.442 | 0.614 | 0.485 | 478.52 | +| PP-YOLOE+_x | FP16 | 640 | 0.528 | 0.705 | 0.579 | 121.17 | +| PP-YOLOE+_l | FP16 | 640 | 0.511 | 0.686 | 0.557 | 191.82 | +| PP-YOLOE+_m | FP16 | 640 | 0.483 | 0.658 | 0.528 | 264.39 | +| PP-YOLOE+_s | FP16 | 640 | 0.424 | 0.594 | 0.464 | 476.13 | +| PP-YOLOE-s (400) | FP16 | 640 | 0.423 | 0.589 | 0.463 | 461.23 | +| DAMO-YOLO-L star | FP16 | 640 | 0.502 | 0.674 | 0.551 | 176.93 | +| DAMO-YOLO-M star | FP16 | 640 | 0.485 | 0.656 | 0.530 | 242.24 | +| DAMO-YOLO-S star | FP16 | 640 | 0.460 | 0.631 | 0.502 | 385.09 | +| DAMO-YOLO-S | FP16 | 640 | 0.445 | 0.611 | 0.486 | 378.68 | +| DAMO-YOLO-T star | FP16 | 640 | 0.419 | 0.586 | 0.455 | 492.24 | +| DAMO-YOLO-Nl | FP16 | 416 | 0.392 | 0.559 | 0.423 | 483.73 | +| DAMO-YOLO-Nm | FP16 | 416 | 0.371 | 0.532 | 0.402 | 555.94 | +| DAMO-YOLO-Ns | FP16 | 416 | 0.312 | 0.460 | 0.335 | 627.67 | +| YOLOX-x | FP16 | 640 | 0.447 | 0.616 | 0.483 | 125.40 | +| YOLOX-l | FP16 | 640 | 0.430 | 0.598 | 0.466 | 193.10 | +| YOLOX-m | FP16 | 640 | 0.397 | 0.566 | 0.431 | 298.61 | +| YOLOX-s | FP16 | 640 | 0.335 | 0.502 | 0.365 | 522.05 | +| YOLOX-s legacy | FP16 | 640 | 0.375 | 0.569 | 0.407 | 518.52 | +| YOLOX-Darknet | FP16 | 640 | 0.414 | 0.595 | 0.453 | 212.88 | +| YOLOX-Tiny | FP16 | 640 | 0.274 | 0.427 | 0.292 | 633.95 | +| YOLOX-Nano | FP16 | 640 | 0.212 | 0.342 | 0.222 | 633.04 | +| YOLOv8x | FP16 | 640 | 0.499 | 0.669 | 0.545 | 130.49 | +| YOLOv8l | FP16 | 640 | 0.491 | 0.660 | 0.535 | 180.75 | +| YOLOv8m | FP16 | 640 | 0.468 | 0.637 | 0.510 | 278.08 | +| YOLOv8s | FP16 | 640 | 0.415 | 0.578 | 0.453 | 493.45 | +| YOLOv8n | FP16 | 640 | 0.343 | 0.492 | 0.373 | 627.43 | +| YOLOv7-u6 | FP16 | 640 | 0.484 | 0.652 | 0.530 | 193.54 | +| YOLOv7x* | FP16 | 640 | 0.496 | 0.679 | 0.536 | 155.07 | +| YOLOv7* | FP16 | 640 | 0.476 | 0.660 | 0.518 | 226.01 | +| YOLOv7-Tiny Leaky* | FP16 | 640 | 0.345 | 0.516 | 0.372 | 626.23 | +| YOLOv7-Tiny Leaky* | FP16 | 416 | 0.328 | 0.493 | 0.349 | 633.90 | +| YOLOv6-L 4.0 | FP16 | 640 | 0.490 | 0.671 | 0.535 | 178.41 | +| YOLOv6-M 4.0 | FP16 | 640 | 0.460 | 0.635 | 0.502 | 293.39 | +| YOLOv6-S 4.0 | FP16 | 640 | 0.416 | 0.585 | 0.453 | 513.90 | +| YOLOv6-N 4.0 | FP16 | 640 | 0.349 | 0.503 | 0.378 | 633.37 | +| YOLOv5x 7.0 | FP16 | 640 | 0.471 | 0.652 | 0.513 | 149.93 | +| YOLOv5l 7.0 | FP16 | 640 | 0.455 | 0.637 | 0.497 | 235.55 | +| YOLOv5m 7.0 | FP16 | 640 | 0.421 | 0.604 | 0.459 | 351.69 | +| YOLOv5s 7.0 | FP16 | 640 | 0.344 | 0.529 | 0.372 | 618.13 | +| YOLOv5n 7.0 | FP16 | 640 | 0.247 | 0.414 | 0.257 | 629.66 | diff --git a/docs/dGPUInstalation.md b/docs/dGPUInstalation.md new file mode 100644 index 0000000..115d71b --- /dev/null +++ b/docs/dGPUInstalation.md @@ -0,0 +1,684 @@ +# dGPU installation + +To install the DeepStream on dGPU (x86 platform), without docker, we need to do some steps to prepare the computer. + +
+## DeepStream 6.2
+
+### 1. Disable Secure Boot in BIOS
+
+### 2. Install dependencies
+
+```
+sudo apt-get update
+sudo apt-get install gcc make git libtool autoconf autogen pkg-config cmake
+sudo apt-get install python3 python3-dev python3-pip
+sudo apt-get install dkms
+sudo apt-get install libssl1.1 libgstreamer1.0-0 gstreamer1.0-tools gstreamer1.0-plugins-good gstreamer1.0-plugins-bad gstreamer1.0-plugins-ugly gstreamer1.0-libav libgstreamer-plugins-base1.0-dev libgstrtspserver-1.0-0 libjansson4 libyaml-cpp-dev libjsoncpp-dev protobuf-compiler
+sudo apt-get install linux-headers-$(uname -r)
+```
+
+**NOTE**: Purge all NVIDIA drivers, CUDA, etc. (replace $CUDA_PATH with your CUDA path)
+
+```
+sudo nvidia-uninstall
+sudo $CUDA_PATH/bin/cuda-uninstaller
+sudo apt-get remove --purge '*nvidia*'
+sudo apt-get remove --purge '*cuda*'
+sudo apt-get remove --purge '*cudnn*'
+sudo apt-get remove --purge '*tensorrt*'
+sudo apt autoremove --purge && sudo apt autoclean && sudo apt clean
+```
+
+### 3. Install CUDA Keyring
+
+```
+wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb
+sudo dpkg -i cuda-keyring_1.0-1_all.deb
+sudo apt-get update
+```
+
+### 4. Download and install NVIDIA Driver
+
+#### TITAN, GeForce RTX / GTX series and RTX / Quadro series
+ +- Download + + ``` + wget https://us.download.nvidia.com/XFree86/Linux-x86_64/525.105.17/NVIDIA-Linux-x86_64-525.105.17.run + ``` + +
+##### Laptop
+
+* Run
+
+  ```
+  sudo sh NVIDIA-Linux-x86_64-525.105.17.run --no-cc-version-check --silent --disable-nouveau --dkms --install-libglvnd
+  ```
+
+  **NOTE**: This step will disable the nouveau drivers.
+
+* Reboot
+
+  ```
+  sudo reboot
+  ```
+
+* Install
+
+  ```
+  sudo sh NVIDIA-Linux-x86_64-525.105.17.run --no-cc-version-check --silent --disable-nouveau --dkms --install-libglvnd
+  ```
+
+**NOTE**: If you are using a laptop with NVIDIA Optimus, run
+
+```
+sudo apt-get install nvidia-prime
+sudo prime-select nvidia
+```
+
+##### Desktop
+
+* Run
+
+  ```
+  sudo sh NVIDIA-Linux-x86_64-525.105.17.run --no-cc-version-check --silent --disable-nouveau --dkms --install-libglvnd --run-nvidia-xconfig
+  ```
+
+  **NOTE**: This step will disable the nouveau drivers.
+
+* Reboot
+
+  ```
+  sudo reboot
+  ```
+
+* Install
+
+  ```
+  sudo sh NVIDIA-Linux-x86_64-525.105.17.run --no-cc-version-check --silent --disable-nouveau --dkms --install-libglvnd --run-nvidia-xconfig
+  ```
+
+#### Data center / Tesla series
+
+- Download
+
+  ```
+  wget https://us.download.nvidia.com/XFree86/Linux-x86_64/525.105.17/NVIDIA-Linux-x86_64-525.105.17.run
+  ```
+
+* Run
+
+  ```
+  sudo sh NVIDIA-Linux-x86_64-525.105.17.run --no-cc-version-check --silent --disable-nouveau --dkms --install-libglvnd --run-nvidia-xconfig
+  ```
+
+ +### 5. Download and install CUDA + +``` +wget https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run +sudo sh cuda_11.8.0_520.61.05_linux.run --silent --toolkit +``` + +* Export environment variables + + ``` + echo $'export PATH=/usr/local/cuda-11.8/bin${PATH:+:${PATH}}\nexport LD_LIBRARY_PATH=/usr/local/cuda-11.8/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}' >> ~/.bashrc && source ~/.bashrc + ``` + +### 6. Install TensorRT + +``` +sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub +sudo add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/ /" +sudo apt-get update +sudo apt-get install libnvinfer8=8.5.2-1+cuda11.8 libnvinfer-plugin8=8.5.2-1+cuda11.8 libnvparsers8=8.5.2-1+cuda11.8 libnvonnxparsers8=8.5.2-1+cuda11.8 libnvinfer-bin=8.5.2-1+cuda11.8 libnvinfer-dev=8.5.2-1+cuda11.8 libnvinfer-plugin-dev=8.5.2-1+cuda11.8 libnvparsers-dev=8.5.2-1+cuda11.8 libnvonnxparsers-dev=8.5.2-1+cuda11.8 libnvinfer-samples=8.5.2-1+cuda11.8 libcudnn8=8.7.0.84-1+cuda11.8 libcudnn8-dev=8.7.0.84-1+cuda11.8 python3-libnvinfer=8.5.2-1+cuda11.8 python3-libnvinfer-dev=8.5.2-1+cuda11.8 +sudo apt-mark hold libnvinfer* libnvparsers* libnvonnxparsers* libcudnn8* python3-libnvinfer* tensorrt +``` + +### 7. Download from [NVIDIA website](https://developer.nvidia.com/deepstream-getting-started) and install the DeepStream SDK + +DeepStream 6.2 for Servers and Workstations (.deb) + +``` +sudo apt-get install ./deepstream-6.2_6.2.0-1_amd64.deb +rm ${HOME}/.cache/gstreamer-1.0/registry.x86_64.bin +sudo ln -snf /usr/local/cuda-11.8 /usr/local/cuda +``` + +### 8. Reboot the computer + +``` +sudo reboot +``` + +
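+**NOTE**: Before building any engine, you can confirm which TensorRT version DeepStream will use (a minimal check, assuming the python3-libnvinfer package from step 6 is installed):
+
+```
+# Python: prints the TensorRT version reported by the bindings;
+# expect 8.5.2.x for the packages pinned in step 6
+import tensorrt
+print(tensorrt.__version__)
+```
+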
+## DeepStream 6.1.1
+
+### 1. Disable Secure Boot in BIOS
+
+### 2. Install dependencies
+
+```
+sudo apt-get update
+sudo apt-get install gcc make git libtool autoconf autogen pkg-config cmake
+sudo apt-get install python3 python3-dev python3-pip
+sudo apt-get install dkms
+sudo apt-get install libssl1.1 libgstreamer1.0-0 gstreamer1.0-tools gstreamer1.0-plugins-good gstreamer1.0-plugins-bad gstreamer1.0-plugins-ugly gstreamer1.0-libav libgstreamer-plugins-base1.0-dev libgstrtspserver-1.0-0 libjansson4 libyaml-cpp-dev
+sudo apt-get install linux-headers-$(uname -r)
+```
+
+**NOTE**: Purge all NVIDIA drivers, CUDA, etc. (replace $CUDA_PATH with your CUDA path)
+
+```
+sudo nvidia-uninstall
+sudo $CUDA_PATH/bin/cuda-uninstaller
+sudo apt-get remove --purge '*nvidia*'
+sudo apt-get remove --purge '*cuda*'
+sudo apt-get remove --purge '*cudnn*'
+sudo apt-get remove --purge '*tensorrt*'
+sudo apt autoremove --purge && sudo apt autoclean && sudo apt clean
+```
+
+### 3. Install CUDA Keyring
+
+```
+wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb
+sudo dpkg -i cuda-keyring_1.0-1_all.deb
+sudo apt-get update
+```
+
+### 4. Download and install NVIDIA Driver
+
+#### TITAN, GeForce RTX / GTX series and RTX / Quadro series
+ +- Download + + ``` + wget https://us.download.nvidia.com/XFree86/Linux-x86_64/515.65.01/NVIDIA-Linux-x86_64-515.65.01.run + ``` + +
+##### Laptop
+
+* Run
+
+  ```
+  sudo sh NVIDIA-Linux-x86_64-515.65.01.run --silent --disable-nouveau --dkms --install-libglvnd
+  ```
+
+  **NOTE**: This step will disable the nouveau drivers.
+
+* Reboot
+
+  ```
+  sudo reboot
+  ```
+
+* Install
+
+  ```
+  sudo sh NVIDIA-Linux-x86_64-515.65.01.run --silent --disable-nouveau --dkms --install-libglvnd
+  ```
+
+**NOTE**: If you are using a laptop with NVIDIA Optimus, run
+
+```
+sudo apt-get install nvidia-prime
+sudo prime-select nvidia
+```
+
+##### Desktop
+
+* Run
+
+  ```
+  sudo sh NVIDIA-Linux-x86_64-515.65.01.run --silent --disable-nouveau --dkms --install-libglvnd --run-nvidia-xconfig
+  ```
+
+  **NOTE**: This step will disable the nouveau drivers.
+
+* Reboot
+
+  ```
+  sudo reboot
+  ```
+
+* Install
+
+  ```
+  sudo sh NVIDIA-Linux-x86_64-515.65.01.run --silent --disable-nouveau --dkms --install-libglvnd --run-nvidia-xconfig
+  ```
+
+#### Data center / Tesla series
+
+- Download
+
+  ```
+  wget https://us.download.nvidia.com/tesla/515.65.01/NVIDIA-Linux-x86_64-515.65.01.run
+  ```
+
+* Run
+
+  ```
+  sudo sh NVIDIA-Linux-x86_64-515.65.01.run --silent --disable-nouveau --dkms --install-libglvnd --run-nvidia-xconfig
+  ```
+
+ +### 5. Download and install CUDA + +``` +wget https://developer.download.nvidia.com/compute/cuda/11.7.1/local_installers/cuda_11.7.1_515.65.01_linux.run +sudo sh cuda_11.7.1_515.65.01_linux.run --silent --toolkit +``` + +* Export environment variables + + ``` + echo $'export PATH=/usr/local/cuda-11.7/bin${PATH:+:${PATH}}\nexport LD_LIBRARY_PATH=/usr/local/cuda-11.7/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}' >> ~/.bashrc && source ~/.bashrc + ``` + +### 6. Download from [NVIDIA website](https://developer.nvidia.com/nvidia-tensorrt-8x-download) and install the TensorRT + +TensorRT 8.4 GA for Ubuntu 20.04 and CUDA 11.0, 11.1, 11.2, 11.3, 11.4, 11.5, 11.6 and 11.7 DEB local repo Package + +``` +sudo dpkg -i nv-tensorrt-repo-ubuntu2004-cuda11.6-trt8.4.1.5-ga-20220604_1-1_amd64.deb +sudo apt-key add /var/nv-tensorrt-repo-ubuntu2004-cuda11.6-trt8.4.1.5-ga-20220604/9a60d8bf.pub +sudo apt-get update +sudo apt-get install libnvinfer8=8.4.1-1+cuda11.6 libnvinfer-plugin8=8.4.1-1+cuda11.6 libnvparsers8=8.4.1-1+cuda11.6 libnvonnxparsers8=8.4.1-1+cuda11.6 libnvinfer-bin=8.4.1-1+cuda11.6 libnvinfer-dev=8.4.1-1+cuda11.6 libnvinfer-plugin-dev=8.4.1-1+cuda11.6 libnvparsers-dev=8.4.1-1+cuda11.6 libnvonnxparsers-dev=8.4.1-1+cuda11.6 libnvinfer-samples=8.4.1-1+cuda11.6 libcudnn8=8.4.1.50-1+cuda11.6 libcudnn8-dev=8.4.1.50-1+cuda11.6 python3-libnvinfer=8.4.1-1+cuda11.6 python3-libnvinfer-dev=8.4.1-1+cuda11.6 +sudo apt-mark hold libnvinfer* libnvparsers* libnvonnxparsers* libcudnn8* tensorrt +``` + +### 7. Download from [NVIDIA website](https://developer.nvidia.com/deepstream-getting-started) and install the DeepStream SDK + +DeepStream 6.1.1 for Servers and Workstations (.deb) + +``` +sudo apt-get install ./deepstream-6.1_6.1.1-1_amd64.deb +rm ${HOME}/.cache/gstreamer-1.0/registry.x86_64.bin +sudo ln -snf /usr/local/cuda-11.7 /usr/local/cuda +``` + +### 8. Reboot the computer + +``` +sudo reboot +``` + +
+## DeepStream 6.1
+
+### 1. Disable Secure Boot in BIOS
+
+### 2. Install dependencies
+
+```
+sudo apt-get update
+sudo apt-get install gcc make git libtool autoconf autogen pkg-config cmake
+sudo apt-get install python3 python3-dev python3-pip
+sudo apt-get install dkms
+sudo apt-get install libssl1.1 libgstreamer1.0-0 gstreamer1.0-tools gstreamer1.0-plugins-good gstreamer1.0-plugins-bad gstreamer1.0-plugins-ugly gstreamer1.0-libav libgstrtspserver-1.0-0 libjansson4 libyaml-cpp-dev
+sudo apt-get install linux-headers-$(uname -r)
+```
+
+**NOTE**: Purge all NVIDIA drivers, CUDA, etc. (replace $CUDA_PATH with your CUDA path)
+
+```
+sudo nvidia-uninstall
+sudo $CUDA_PATH/bin/cuda-uninstaller
+sudo apt-get remove --purge '*nvidia*'
+sudo apt-get remove --purge '*cuda*'
+sudo apt-get remove --purge '*cudnn*'
+sudo apt-get remove --purge '*tensorrt*'
+sudo apt autoremove --purge && sudo apt autoclean && sudo apt clean
+```
+
+### 3. Install CUDA Keyring
+
+```
+wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb
+sudo dpkg -i cuda-keyring_1.0-1_all.deb
+sudo apt-get update
+```
+
+### 4. Download and install NVIDIA Driver
+
+#### TITAN, GeForce RTX / GTX series and RTX / Quadro series
+ +- Download + + ``` + wget https://us.download.nvidia.com/XFree86/Linux-x86_64/510.47.03/NVIDIA-Linux-x86_64-510.47.03.run + ``` + +
+##### Laptop
+
+* Run
+
+  ```
+  sudo sh NVIDIA-Linux-x86_64-510.47.03.run --silent --disable-nouveau --dkms --install-libglvnd
+  ```
+
+  **NOTE**: This step will disable the nouveau drivers.
+
+* Reboot
+
+  ```
+  sudo reboot
+  ```
+
+* Install
+
+  ```
+  sudo sh NVIDIA-Linux-x86_64-510.47.03.run --silent --disable-nouveau --dkms --install-libglvnd
+  ```
+
+**NOTE**: If you are using a laptop with NVIDIA Optimus, run
+
+```
+sudo apt-get install nvidia-prime
+sudo prime-select nvidia
+```
+
+##### Desktop
+
+* Run
+
+  ```
+  sudo sh NVIDIA-Linux-x86_64-510.47.03.run --silent --disable-nouveau --dkms --install-libglvnd --run-nvidia-xconfig
+  ```
+
+  **NOTE**: This step will disable the nouveau drivers.
+
+* Reboot
+
+  ```
+  sudo reboot
+  ```
+
+* Install
+
+  ```
+  sudo sh NVIDIA-Linux-x86_64-510.47.03.run --silent --disable-nouveau --dkms --install-libglvnd --run-nvidia-xconfig
+  ```
+
+#### Data center / Tesla series
+
+- Download
+
+  ```
+  wget https://us.download.nvidia.com/tesla/510.47.03/NVIDIA-Linux-x86_64-510.47.03.run
+  ```
+
+* Run
+
+  ```
+  sudo sh NVIDIA-Linux-x86_64-510.47.03.run --silent --disable-nouveau --dkms --install-libglvnd --run-nvidia-xconfig
+  ```
+
+ +### 5. Download and install CUDA + +``` +wget https://developer.download.nvidia.com/compute/cuda/11.6.1/local_installers/cuda_11.6.1_510.47.03_linux.run +sudo sh cuda_11.6.1_510.47.03_linux.run --silent --toolkit +``` + +* Export environment variables + + ``` + echo $'export PATH=/usr/local/cuda-11.6/bin${PATH:+:${PATH}}\nexport LD_LIBRARY_PATH=/usr/local/cuda-11.6/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}' >> ~/.bashrc && source ~/.bashrc + ``` + +### 6. Download from [NVIDIA website](https://developer.nvidia.com/nvidia-tensorrt-8x-download) and install the TensorRT + +TensorRT 8.2 GA Update 4 for Ubuntu 20.04 and CUDA 11.0, 11.1, 11.2, 11.3, 11.4 and 11.5 DEB local repo Package + +``` +sudo dpkg -i nv-tensorrt-repo-ubuntu2004-cuda11.4-trt8.2.5.1-ga-20220505_1-1_amd64.deb +sudo apt-key add /var/nv-tensorrt-repo-ubuntu2004-cuda11.4-trt8.2.5.1-ga-20220505/82307095.pub +sudo apt-get update +sudo apt-get install libnvinfer8=8.2.5-1+cuda11.4 libnvinfer-plugin8=8.2.5-1+cuda11.4 libnvparsers8=8.2.5-1+cuda11.4 libnvonnxparsers8=8.2.5-1+cuda11.4 libnvinfer-bin=8.2.5-1+cuda11.4 libnvinfer-dev=8.2.5-1+cuda11.4 libnvinfer-plugin-dev=8.2.5-1+cuda11.4 libnvparsers-dev=8.2.5-1+cuda11.4 libnvonnxparsers-dev=8.2.5-1+cuda11.4 libnvinfer-samples=8.2.5-1+cuda11.4 libnvinfer-doc=8.2.5-1+cuda11.4 libcudnn8-dev=8.4.0.27-1+cuda11.6 libcudnn8=8.4.0.27-1+cuda11.6 +sudo apt-mark hold libnvinfer* libnvparsers* libnvonnxparsers* libcudnn8* tensorrt +``` + +### 7. Download from [NVIDIA website](https://developer.nvidia.com/deepstream-sdk-download-tesla-archived) and install the DeepStream SDK + +DeepStream 6.1 for Servers and Workstations (.deb) + +``` +sudo apt-get install ./deepstream-6.1_6.1.0-1_amd64.deb +rm ${HOME}/.cache/gstreamer-1.0/registry.x86_64.bin +sudo ln -snf /usr/local/cuda-11.6 /usr/local/cuda +``` + +### 8. Reboot the computer + +``` +sudo reboot +``` + +
+## DeepStream 6.0.1 / 6.0
+
+### 1. Disable Secure Boot in BIOS
+
+**NOTE**: If you are using a laptop with a newer Intel/AMD processor and the Graphics entry in the Settings->Details->About tab shows llvmpipe, update the kernel
+
+```
+wget https://kernel.ubuntu.com/~kernel-ppa/mainline/v5.11/amd64/linux-headers-5.11.0-051100_5.11.0-051100.202102142330_all.deb
+wget https://kernel.ubuntu.com/~kernel-ppa/mainline/v5.11/amd64/linux-headers-5.11.0-051100-generic_5.11.0-051100.202102142330_amd64.deb
+wget https://kernel.ubuntu.com/~kernel-ppa/mainline/v5.11/amd64/linux-image-unsigned-5.11.0-051100-generic_5.11.0-051100.202102142330_amd64.deb
+wget https://kernel.ubuntu.com/~kernel-ppa/mainline/v5.11/amd64/linux-modules-5.11.0-051100-generic_5.11.0-051100.202102142330_amd64.deb
+sudo dpkg -i *.deb
+sudo reboot
+```
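+
+**NOTE**: After the reboot, you can confirm the 5.11.0 kernel is active (a minimal check, assuming Python 3 is available):
+
+```
+# Python: prints the running kernel release; expect it to start with 5.11.0-051100
+import platform
+print(platform.release())
+```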
+
+### 2. Install dependencies
+
+```
+sudo apt-get update
+sudo apt-get install gcc make git libtool autoconf autogen pkg-config cmake
+sudo apt-get install python3 python3-dev python3-pip
+sudo apt-get install libssl1.0.0 libgstreamer1.0-0 gstreamer1.0-tools gstreamer1.0-plugins-good gstreamer1.0-plugins-bad gstreamer1.0-plugins-ugly gstreamer1.0-libav libgstrtspserver-1.0-0 libjansson4
+sudo apt-get install linux-headers-$(uname -r)
+```
+
+**NOTE**: Install DKMS only if you are using the default Ubuntu kernel
+
+```
+sudo apt-get install dkms
+```
+
+**NOTE**: Purge all NVIDIA drivers, CUDA, etc. (replace $CUDA_PATH with your CUDA path)
+
+```
+sudo nvidia-uninstall
+sudo $CUDA_PATH/bin/cuda-uninstaller
+sudo apt-get remove --purge '*nvidia*'
+sudo apt-get remove --purge '*cuda*'
+sudo apt-get remove --purge '*cudnn*'
+sudo apt-get remove --purge '*tensorrt*'
+sudo apt autoremove --purge && sudo apt autoclean && sudo apt clean
+```
+
+### 3. Install CUDA Keyring
+
+```
+wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-keyring_1.0-1_all.deb
+sudo dpkg -i cuda-keyring_1.0-1_all.deb
+sudo apt-get update
+```
+
+### 4. Download and install NVIDIA Driver
+
+#### TITAN, GeForce RTX / GTX series and RTX / Quadro series
+ +- Download + + ``` + wget https://us.download.nvidia.com/XFree86/Linux-x86_64/470.129.06/NVIDIA-Linux-x86_64-470.129.06.run + ``` + +
+##### Laptop
+
+* Run
+
+  ```
+  sudo sh NVIDIA-Linux-x86_64-470.129.06.run --silent --disable-nouveau --dkms --install-libglvnd
+  ```
+
+  **NOTE**: This step will disable the nouveau drivers.
+
+  **NOTE**: Remove the --dkms flag if you installed the 5.11.0 kernel.
+
+* Reboot
+
+  ```
+  sudo reboot
+  ```
+
+* Install
+
+  ```
+  sudo sh NVIDIA-Linux-x86_64-470.129.06.run --silent --disable-nouveau --dkms --install-libglvnd
+  ```
+
+  **NOTE**: Remove the --dkms flag if you installed the 5.11.0 kernel.
+
+**NOTE**: If you are using a laptop with NVIDIA Optimus, run
+
+```
+sudo apt-get install nvidia-prime
+sudo prime-select nvidia
+```
+
+##### Desktop
+
+* Run
+
+  ```
+  sudo sh NVIDIA-Linux-x86_64-470.129.06.run --silent --disable-nouveau --dkms --install-libglvnd --run-nvidia-xconfig
+  ```
+
+  **NOTE**: This step will disable the nouveau drivers.
+
+  **NOTE**: Remove the --dkms flag if you installed the 5.11.0 kernel.
+
+* Reboot
+
+  ```
+  sudo reboot
+  ```
+
+* Install
+
+  ```
+  sudo sh NVIDIA-Linux-x86_64-470.129.06.run --silent --disable-nouveau --dkms --install-libglvnd --run-nvidia-xconfig
+  ```
+
+  **NOTE**: Remove the --dkms flag if you installed the 5.11.0 kernel.
+
+#### Data center / Tesla series
+
+- Download
+
+  ```
+  wget https://us.download.nvidia.com/tesla/470.129.06/NVIDIA-Linux-x86_64-470.129.06.run
+  ```
+
+* Run
+
+  ```
+  sudo sh NVIDIA-Linux-x86_64-470.129.06.run --silent --disable-nouveau --dkms --install-libglvnd --run-nvidia-xconfig
+  ```
+
+  **NOTE**: Remove the --dkms flag if you installed the 5.11.0 kernel.
+
+ +### 5. Download and install CUDA + +``` +wget https://developer.download.nvidia.com/compute/cuda/11.4.1/local_installers/cuda_11.4.1_470.57.02_linux.run +sudo sh cuda_11.4.1_470.57.02_linux.run --silent --toolkit +``` + +* Export environment variables + + ``` + echo $'export PATH=/usr/local/cuda-11.4/bin${PATH:+:${PATH}}\nexport LD_LIBRARY_PATH=/usr/local/cuda-11.4/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}' >> ~/.bashrc && source ~/.bashrc + ``` + +### 6. Download from [NVIDIA website](https://developer.nvidia.com/nvidia-tensorrt-8x-download) and install the TensorRT + +TensorRT 8.0.1 GA for Ubuntu 18.04 and CUDA 11.3 DEB local repo package + +``` +sudo dpkg -i nv-tensorrt-repo-ubuntu1804-cuda11.3-trt8.0.1.6-ga-20210626_1-1_amd64.deb +sudo apt-key add /var/nv-tensorrt-repo-ubuntu1804-cuda11.3-trt8.0.1.6-ga-20210626/7fa2af80.pub +sudo apt-get update +sudo apt-get install libnvinfer8=8.0.1-1+cuda11.3 libnvinfer-plugin8=8.0.1-1+cuda11.3 libnvparsers8=8.0.1-1+cuda11.3 libnvonnxparsers8=8.0.1-1+cuda11.3 libnvinfer-bin=8.0.1-1+cuda11.3 libnvinfer-dev=8.0.1-1+cuda11.3 libnvinfer-plugin-dev=8.0.1-1+cuda11.3 libnvparsers-dev=8.0.1-1+cuda11.3 libnvonnxparsers-dev=8.0.1-1+cuda11.3 libnvinfer-samples=8.0.1-1+cuda11.3 libnvinfer-doc=8.0.1-1+cuda11.3 libcudnn8-dev=8.2.1.32-1+cuda11.3 libcudnn8=8.2.1.32-1+cuda11.3 +sudo apt-mark hold libnvinfer* libnvparsers* libnvonnxparsers* libcudnn8* tensorrt +``` + +### 7. Download from [NVIDIA website](https://developer.nvidia.com/deepstream-sdk-download-tesla-archived) and install the DeepStream SDK + +* DeepStream 6.0.1 for Servers and Workstations (.deb) + + ``` + sudo apt-get install ./deepstream-6.0_6.0.1-1_amd64.deb + ``` + +* DeepStream 6.0 for Servers and Workstations (.deb) + + ``` + sudo apt-get install ./deepstream-6.0_6.0.0-1_amd64.deb + ``` + +* Run + + ``` + rm ${HOME}/.cache/gstreamer-1.0/registry.x86_64.bin + sudo ln -snf /usr/local/cuda-11.4 /usr/local/cuda + ``` + +### 8. Reboot the computer + +``` +sudo reboot +``` + +
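The calibrator changes below preprocess calibration images with the same net-scale-factor and offsets used at inference time, instead of the old letterbox path. For reference, a minimal sketch (using the default mean/std values quoted in the config_infer notes above) of how those normalization constants map to the config keys:

```
# DeepStream applies y = net-scale-factor * (x - offsets) per channel,
# so ImageNet mean/std normalization maps to the config values as follows
mean = [0.485, 0.456, 0.406]
std = [0.229, 0.224, 0.225]

offsets = [m * 255 for m in mean]            # 123.675;116.28;103.53
net_scale_factor = 1 / (sum(std) / 3 * 255)  # ~0.0173520735727919486
print(net_scale_factor, offsets)
```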
diff --git a/nvdsinfer_custom_impl_Yolo/Makefile b/nvdsinfer_custom_impl_Yolo/Makefile
index f22527f..fa06b68 100644
--- a/nvdsinfer_custom_impl_Yolo/Makefile
+++ b/nvdsinfer_custom_impl_Yolo/Makefile
@@ -33,9 +33,9 @@ ifeq ($(OPENCV),)
   OPENCV=0
 endif
 
-LEGACY?=
-ifeq ($(LEGACY),)
-  LEGACY=0
+GRAPH?=
+ifeq ($(GRAPH),)
+  GRAPH=0
 endif
 
 CC:= g++
@@ -50,13 +50,13 @@ ifeq ($(OPENCV), 1)
   LIBS+= $(shell pkg-config --libs opencv4 2> /dev/null || pkg-config --libs opencv)
 endif
 
-ifeq ($(LEGACY), 1)
-  COMMON+= -DLEGACY
+ifeq ($(GRAPH), 1)
+  COMMON+= -DGRAPH
 endif
 
 CUFLAGS:= -I/opt/nvidia/deepstream/deepstream/sources/includes -I/usr/local/cuda-$(CUDA_VER)/include
 
-LIBS+= -lnvinfer_plugin -lnvinfer -lnvparsers -L/usr/local/cuda-$(CUDA_VER)/lib64 -lcudart -lcublas -lstdc++fs
+LIBS+= -lnvinfer_plugin -lnvinfer -lnvparsers -lnvonnxparser -L/usr/local/cuda-$(CUDA_VER)/lib64 -lcudart -lcublas -lstdc++fs
 LFLAGS:= -shared -Wl,--start-group $(LIBS) -Wl,--end-group
 
 INCS:= $(wildcard *.h)
diff --git a/nvdsinfer_custom_impl_Yolo/calibrator.cpp b/nvdsinfer_custom_impl_Yolo/calibrator.cpp
index c445de7..2eba320 100644
--- a/nvdsinfer_custom_impl_Yolo/calibrator.cpp
+++ b/nvdsinfer_custom_impl_Yolo/calibrator.cpp
@@ -8,17 +8,18 @@
 #include
 #include
 
-Int8EntropyCalibrator2::Int8EntropyCalibrator2(const int& batchsize, const int& channels, const int& height,
-    const int& width, const int& letterbox, const std::string& imgPath,
-    const std::string& calibTablePath) : batchSize(batchsize), inputC(channels), inputH(height), inputW(width),
-    letterBox(letterbox), calibTablePath(calibTablePath), imageIndex(0)
+Int8EntropyCalibrator2::Int8EntropyCalibrator2(const int& batchSize, const int& channels, const int& height, const int& width,
+    const float& scaleFactor, const float* offsets, const std::string& imgPath, const std::string& calibTablePath) :
+    batchSize(batchSize), inputC(channels), inputH(height), inputW(width), scaleFactor(scaleFactor), offsets(offsets),
+    calibTablePath(calibTablePath), imageIndex(0)
 {
-  inputCount = batchsize * channels * height * width;
+  inputCount = batchSize * channels * height * width;
   std::fstream f(imgPath);
   if (f.is_open()) {
     std::string temp;
-    while (std::getline(f, temp))
+    while (std::getline(f, temp)) {
       imgPaths.push_back(temp);
+    }
   }
   batchData = new float[inputCount];
   CUDA_CHECK(cudaMalloc(&deviceInput, inputCount * sizeof(float)));
@@ -27,8 +28,9 @@ Int8EntropyCalibrator2::Int8EntropyCalibrator2(const int& batchsize, const int&
 Int8EntropyCalibrator2::~Int8EntropyCalibrator2()
 {
   CUDA_CHECK(cudaFree(deviceInput));
-  if (batchData)
+  if (batchData) {
     delete[] batchData;
+  }
 }
 
 int
@@ -40,24 +42,33 @@ Int8EntropyCalibrator2::getBatchSize() const noexcept
 bool
 Int8EntropyCalibrator2::getBatch(void** bindings, const char** names, int nbBindings) noexcept
 {
-  if (imageIndex + batchSize > uint(imgPaths.size()))
+  if (imageIndex + batchSize > uint(imgPaths.size())) {
     return false;
+  }
 
   float* ptr = batchData;
   for (size_t i = imageIndex; i < imageIndex + batchSize; ++i) {
-    cv::Mat img = cv::imread(imgPaths[i], cv::IMREAD_COLOR);
-    std::vector<float> inputData = prepareImage(img, inputC, inputH, inputW, letterBox);
+    cv::Mat img = cv::imread(imgPaths[i]);
+    if (img.empty()) {
+      std::cerr << "Failed to read image for calibration" << std::endl;
+      return false;
+    }
+
+    std::vector<float> inputData = prepareImage(img, inputC, inputH, inputW, scaleFactor, offsets);
 
-    int len = (int) (inputData.size());
+    size_t len = inputData.size();
     memcpy(ptr, inputData.data(), len * sizeof(float));
-    ptr += inputData.size();
+    ptr += len;
+
std::cout << "Load image: " << imgPaths[i] << std::endl; - std::cout << "Progress: " << (i + 1)*100. / imgPaths.size() << "%" << std::endl; + std::cout << "Progress: " << (i + 1) * 100. / imgPaths.size() << "%" << std::endl; } + imageIndex += batchSize; + CUDA_CHECK(cudaMemcpy(deviceInput, batchData, inputCount * sizeof(float), cudaMemcpyHostToDevice)); bindings[0] = deviceInput; + return true; } @@ -67,8 +78,9 @@ Int8EntropyCalibrator2::readCalibrationCache(std::size_t &length) noexcept calibrationCache.clear(); std::ifstream input(calibTablePath, std::ios::binary); input >> std::noskipws; - if (readCache && input.good()) + if (readCache && input.good()) { std::copy(std::istream_iterator(input), std::istream_iterator(), std::back_inserter(calibrationCache)); + } length = calibrationCache.size(); return length ? calibrationCache.data() : nullptr; } @@ -81,43 +93,24 @@ Int8EntropyCalibrator2::writeCalibrationCache(const void* cache, std::size_t len } std::vector -prepareImage(cv::Mat& img, int input_c, int input_h, int input_w, int letter_box) +prepareImage(cv::Mat& img, int input_c, int input_h, int input_w, float scaleFactor, const float* offsets) { cv::Mat out; + + cv::cvtColor(img, out, cv::COLOR_BGR2RGB); + int image_w = img.cols; int image_h = img.rows; - if (image_w != input_w || image_h != input_h) { - if (letter_box == 1) { - float ratio_w = (float) image_w / (float) input_w; - float ratio_h = (float) image_h / (float) input_h; - if (ratio_w > ratio_h) { - int new_width = input_w * ratio_h; - int x = (image_w - new_width) / 2; - cv::Rect roi(abs(x), 0, new_width, image_h); - out = img(roi); - } - else if (ratio_w < ratio_h) { - int new_height = input_h * ratio_w; - int y = (image_h - new_height) / 2; - cv::Rect roi(0, abs(y), image_w, new_height); - out = img(roi); - } - else - out = img; - cv::resize(out, out, cv::Size(input_w, input_h), 0, 0, cv::INTER_CUBIC); - } - else { - cv::resize(img, out, cv::Size(input_w, input_h), 0, 0, cv::INTER_CUBIC); - } - cv::cvtColor(out, out, cv::COLOR_BGR2RGB); - } - else - cv::cvtColor(img, out, cv::COLOR_BGR2RGB); - if (input_c == 3) - out.convertTo(out, CV_32FC3, 1.0 / 255.0); - else - out.convertTo(out, CV_32FC1, 1.0 / 255.0); + if (image_w != input_w || image_h != input_h) { + float resizeFactor = std::max(input_w / (float) image_w, input_h / (float) img.rows); + cv::resize(out, out, cv::Size(0, 0), resizeFactor, resizeFactor, cv::INTER_CUBIC); + cv::Rect crop(cv::Point(0.5 * (out.cols - input_w), 0.5 * (out.rows - input_h)), cv::Size(input_w, input_h)); + out = out(crop); + } + + out.convertTo(out, CV_32F, scaleFactor); + cv::subtract(out, cv::Scalar(offsets[2] / 255, offsets[1] / 255, offsets[0] / 255), out, cv::noArray(), -1); std::vector input_channels(input_c); cv::split(out, input_channels); diff --git a/nvdsinfer_custom_impl_Yolo/calibrator.h b/nvdsinfer_custom_impl_Yolo/calibrator.h index e390a76..1a92100 100644 --- a/nvdsinfer_custom_impl_Yolo/calibrator.h +++ b/nvdsinfer_custom_impl_Yolo/calibrator.h @@ -22,8 +22,8 @@ class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2 { public: - Int8EntropyCalibrator2(const int& batchsize, const int& channels, const int& height, const int& width, - const int& letterbox, const std::string& imgPath, const std::string& calibTablePath); + Int8EntropyCalibrator2(const int& batchSize, const int& channels, const int& height, const int& width, + const float& scaleFactor, const float* offsets, const std::string& imgPath, const std::string& calibTablePath); virtual 
~Int8EntropyCalibrator2(); @@ -41,6 +41,8 @@ class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2 { int inputH; int inputW; int letterBox; + float scaleFactor; + const float* offsets; std::string calibTablePath; size_t imageIndex; size_t inputCount; @@ -51,6 +53,7 @@ class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2 { std::vector calibrationCache; }; -std::vector prepareImage(cv::Mat& img, int input_c, int input_h, int input_w, int letter_box); +std::vector prepareImage(cv::Mat& img, int input_c, int input_h, int input_w, float scaleFactor, + const float* offsets); #endif //CALIBRATOR_H diff --git a/nvdsinfer_custom_impl_Yolo/layers/implicit_layer.cpp b/nvdsinfer_custom_impl_Yolo/layers/implicit_layer.cpp index b02eba9..6c3eb23 100644 --- a/nvdsinfer_custom_impl_Yolo/layers/implicit_layer.cpp +++ b/nvdsinfer_custom_impl_Yolo/layers/implicit_layer.cpp @@ -28,7 +28,7 @@ implicitLayer(int layerIdx, std::map& block, std::vect convWt.values = val; trtWeights.push_back(convWt); - nvinfer1::IConstantLayer* implicit = network->addConstant(nvinfer1::Dims{3, {filters, 1, 1}}, convWt); + nvinfer1::IConstantLayer* implicit = network->addConstant(nvinfer1::Dims{4, {1, filters, 1, 1}}, convWt); assert(implicit != nullptr); std::string implicitLayerName = block.at("type") + "_" + std::to_string(layerIdx); implicit->setName(implicitLayerName.c_str()); diff --git a/nvdsinfer_custom_impl_Yolo/layers/reorg_layer.cpp b/nvdsinfer_custom_impl_Yolo/layers/reorg_layer.cpp index 7404776..890d31d 100644 --- a/nvdsinfer_custom_impl_Yolo/layers/reorg_layer.cpp +++ b/nvdsinfer_custom_impl_Yolo/layers/reorg_layer.cpp @@ -14,46 +14,100 @@ reorgLayer(int layerIdx, std::map& block, nvinfer1::IT { nvinfer1::ITensor* output; - assert(block.at("type") == "reorg3d"); + assert(block.at("type") == "reorg" || block.at("type") == "reorg3d"); + + int stride = 1; + if(block.find("stride") != block.end()) { + stride = std::stoi(block.at("stride")); + } nvinfer1::Dims inputDims = input->getDimensions(); - nvinfer1::ISliceLayer *slice1 = network->addSlice(*input, nvinfer1::Dims{3, {0, 0, 0}}, - nvinfer1::Dims{3, {inputDims.d[0], inputDims.d[1] / 2, inputDims.d[2] / 2}}, nvinfer1::Dims{3, {1, 2, 2}}); - assert(slice1 != nullptr); - std::string slice1LayerName = "slice1_" + std::to_string(layerIdx); - slice1->setName(slice1LayerName.c_str()); + if (block.at("type") == "reorg3d") { + nvinfer1::ISliceLayer* slice1 = network->addSlice(*input, nvinfer1::Dims{4, {0, 0, 0, 0}}, + nvinfer1::Dims{4, {inputDims.d[0], inputDims.d[1], inputDims.d[2] / stride, inputDims.d[3] / stride}}, + nvinfer1::Dims{4, {1, 1, stride, stride}}); + assert(slice1 != nullptr); + std::string slice1LayerName = "slice1_" + std::to_string(layerIdx); + slice1->setName(slice1LayerName.c_str()); - nvinfer1::ISliceLayer *slice2 = network->addSlice(*input, nvinfer1::Dims{3, {0, 1, 0}}, - nvinfer1::Dims{3, {inputDims.d[0], inputDims.d[1] / 2, inputDims.d[2] / 2}}, nvinfer1::Dims{3, {1, 2, 2}}); - assert(slice2 != nullptr); - std::string slice2LayerName = "slice2_" + std::to_string(layerIdx); - slice2->setName(slice2LayerName.c_str()); + nvinfer1::ISliceLayer* slice2 = network->addSlice(*input, nvinfer1::Dims{4, {0, 0, 0, 1}}, + nvinfer1::Dims{4, {inputDims.d[0], inputDims.d[1], inputDims.d[2] / stride, inputDims.d[3] / stride}}, + nvinfer1::Dims{4, {1, 1, stride, stride}}); + assert(slice2 != nullptr); + std::string slice2LayerName = "slice2_" + std::to_string(layerIdx); + slice2->setName(slice2LayerName.c_str()); - 
nvinfer1::ISliceLayer *slice3 = network->addSlice(*input, nvinfer1::Dims{3, {0, 0, 1}}, - nvinfer1::Dims{3, {inputDims.d[0], inputDims.d[1] / 2, inputDims.d[2] / 2}}, nvinfer1::Dims{3, {1, 2, 2}}); - assert(slice3 != nullptr); - std::string slice3LayerName = "slice3_" + std::to_string(layerIdx); - slice3->setName(slice3LayerName.c_str()); + nvinfer1::ISliceLayer* slice3 = network->addSlice(*input, nvinfer1::Dims{4, {0, 0, 1, 0}}, + nvinfer1::Dims{4, {inputDims.d[0], inputDims.d[1], inputDims.d[2] / stride, inputDims.d[3] / stride}}, + nvinfer1::Dims{4, {1, 1, stride, stride}}); + assert(slice3 != nullptr); + std::string slice3LayerName = "slice3_" + std::to_string(layerIdx); + slice3->setName(slice3LayerName.c_str()); - nvinfer1::ISliceLayer *slice4 = network->addSlice(*input, nvinfer1::Dims{3, {0, 1, 1}}, - nvinfer1::Dims{3, {inputDims.d[0], inputDims.d[1] / 2, inputDims.d[2] / 2}}, nvinfer1::Dims{3, {1, 2, 2}}); - assert(slice4 != nullptr); - std::string slice4LayerName = "slice4_" + std::to_string(layerIdx); - slice4->setName(slice4LayerName.c_str()); + nvinfer1::ISliceLayer* slice4 = network->addSlice(*input, nvinfer1::Dims{4, {0, 0, 1, 1}}, + nvinfer1::Dims{4, {inputDims.d[0], inputDims.d[1], inputDims.d[2] / stride, inputDims.d[3] / stride}}, + nvinfer1::Dims{4, {1, 1, stride, stride}}); + assert(slice4 != nullptr); + std::string slice4LayerName = "slice4_" + std::to_string(layerIdx); + slice4->setName(slice4LayerName.c_str()); - std::vector concatInputs; - concatInputs.push_back(slice1->getOutput(0)); - concatInputs.push_back(slice2->getOutput(0)); - concatInputs.push_back(slice3->getOutput(0)); - concatInputs.push_back(slice4->getOutput(0)); + std::vector concatInputs; + concatInputs.push_back(slice1->getOutput(0)); + concatInputs.push_back(slice2->getOutput(0)); + concatInputs.push_back(slice3->getOutput(0)); + concatInputs.push_back(slice4->getOutput(0)); - nvinfer1::IConcatenationLayer* concat = network->addConcatenation(concatInputs.data(), concatInputs.size()); - assert(concat != nullptr); - std::string concatLayerName = "concat_" + std::to_string(layerIdx); - concat->setName(concatLayerName.c_str()); - concat->setAxis(0); - output = concat->getOutput(0); + nvinfer1::IConcatenationLayer* concat = network->addConcatenation(concatInputs.data(), concatInputs.size()); + assert(concat != nullptr); + std::string concatLayerName = "concat_" + std::to_string(layerIdx); + concat->setName(concatLayerName.c_str()); + concat->setAxis(0); + output = concat->getOutput(0); + } + else { + nvinfer1::IShuffleLayer* shuffle1 = network->addShuffle(*input); + assert(shuffle1 != nullptr); + std::string shuffle1LayerName = "shuffle1_" + std::to_string(layerIdx); + shuffle1->setName(shuffle1LayerName.c_str()); + nvinfer1::Dims reshapeDims1{6, {inputDims.d[0], inputDims.d[1] / (stride * stride), inputDims.d[2], stride, + inputDims.d[3], stride}}; + shuffle1->setReshapeDimensions(reshapeDims1); + nvinfer1::Permutation permutation1{{0, 1, 2, 4, 3, 5}}; + shuffle1->setSecondTranspose(permutation1); + output = shuffle1->getOutput(0); + + nvinfer1::IShuffleLayer* shuffle2 = network->addShuffle(*output); + assert(shuffle2 != nullptr); + std::string shuffle2LayerName = "shuffle2_" + std::to_string(layerIdx); + shuffle2->setName(shuffle2LayerName.c_str()); + nvinfer1::Dims reshapeDims2{4, {inputDims.d[0], inputDims.d[1] / (stride * stride), inputDims.d[2] * inputDims.d[3], + stride * stride}}; + shuffle2->setReshapeDimensions(reshapeDims2); + nvinfer1::Permutation permutation2{{0, 1, 3, 2}}; + 
shuffle2->setSecondTranspose(permutation2); + output = shuffle2->getOutput(0); + + nvinfer1::IShuffleLayer* shuffle3 = network->addShuffle(*output); + assert(shuffle3 != nullptr); + std::string shuffle3LayerName = "shuffle3_" + std::to_string(layerIdx); + shuffle3->setName(shuffle3LayerName.c_str()); + nvinfer1::Dims reshapeDims3{4, {inputDims.d[0], inputDims.d[1] / (stride * stride), stride * stride, + inputDims.d[2] * inputDims.d[3]}}; + shuffle3->setReshapeDimensions(reshapeDims3); + nvinfer1::Permutation permutation3{{0, 2, 1, 3}}; + shuffle3->setSecondTranspose(permutation3); + output = shuffle3->getOutput(0); + + nvinfer1::IShuffleLayer* shuffle4 = network->addShuffle(*output); + assert(shuffle4 != nullptr); + std::string shuffle4LayerName = "shuffle4_" + std::to_string(layerIdx); + shuffle4->setName(shuffle4LayerName.c_str()); + nvinfer1::Dims reshapeDims4{4, {inputDims.d[0], inputDims.d[1] * stride * stride, inputDims.d[2] / stride, + inputDims.d[3] / stride}}; + shuffle4->setReshapeDimensions(reshapeDims4); + output = shuffle4->getOutput(0); + } return output; } diff --git a/nvdsinfer_custom_impl_Yolo/layers/route_layer.cpp b/nvdsinfer_custom_impl_Yolo/layers/route_layer.cpp index 2222841..a3925a0 100644 --- a/nvdsinfer_custom_impl_Yolo/layers/route_layer.cpp +++ b/nvdsinfer_custom_impl_Yolo/layers/route_layer.cpp @@ -24,29 +24,36 @@ routeLayer(int layerIdx, std::string& layers, std::map } if (lastPos < strLayers.length()) { std::string lastV = trim(strLayers.substr(lastPos)); - if (!lastV.empty()) + if (!lastV.empty()) { idxLayers.push_back(std::stoi(lastV)); + } } - assert (!idxLayers.empty()); + assert(!idxLayers.empty()); std::vector concatInputs; for (uint i = 0; i < idxLayers.size(); ++i) { - if (idxLayers[i] < 0) + if (idxLayers[i] < 0) { idxLayers[i] = tensorOutputs.size() + idxLayers[i]; - assert (idxLayers[i] >= 0 && idxLayers[i] < (int)tensorOutputs.size()); + } + assert(idxLayers[i] >= 0 && idxLayers[i] < (int)tensorOutputs.size()); concatInputs.push_back(tensorOutputs[idxLayers[i]]); - if (i < idxLayers.size() - 1) + if (i < idxLayers.size() - 1) { layers += std::to_string(idxLayers[i]) + ", "; + } } layers += std::to_string(idxLayers[idxLayers.size() - 1]); - if (concatInputs.size() == 1) + if (concatInputs.size() == 1) { output = concatInputs[0]; + } else { - int axis = 0; - if (block.find("axis") != block.end()) - axis = std::stoi(block.at("axis")); - if (axis < 0) - axis = concatInputs[0]->getDimensions().nbDims + axis; + int axis = 1; + if (block.find("axis") != block.end()) { + axis += std::stoi(block.at("axis")); + std::cout << axis << std::endl; + } + if (axis < 0) { + axis += concatInputs[0]->getDimensions().nbDims; + } nvinfer1::IConcatenationLayer* concat = network->addConcatenation(concatInputs.data(), concatInputs.size()); assert(concat != nullptr); @@ -60,10 +67,11 @@ routeLayer(int layerIdx, std::string& layers, std::map nvinfer1::Dims prevTensorDims = output->getDimensions(); int groups = stoi(block.at("groups")); int group_id = stoi(block.at("group_id")); - int startSlice = (prevTensorDims.d[0] / groups) * group_id; - int channelSlice = (prevTensorDims.d[0] / groups); - nvinfer1::ISliceLayer* slice = network->addSlice(*output, nvinfer1::Dims{3, {startSlice, 0, 0}}, - nvinfer1::Dims{3, {channelSlice, prevTensorDims.d[1], prevTensorDims.d[2]}}, nvinfer1::Dims{3, {1, 1, 1}}); + int startSlice = (prevTensorDims.d[1] / groups) * group_id; + int channelSlice = (prevTensorDims.d[1] / groups); + nvinfer1::ISliceLayer* slice = network->addSlice(*output, 
nvinfer1::Dims{4, {0, startSlice, 0, 0}}, + nvinfer1::Dims{4, {prevTensorDims.d[0], channelSlice, prevTensorDims.d[2], prevTensorDims.d[3]}}, + nvinfer1::Dims{4, {1, 1, 1, 1}}); assert(slice != nullptr); std::string sliceLayerName = "slice_" + std::to_string(layerIdx); slice->setName(sliceLayerName.c_str()); diff --git a/nvdsinfer_custom_impl_Yolo/layers/shortcut_layer.cpp b/nvdsinfer_custom_impl_Yolo/layers/shortcut_layer.cpp index 3e58e72..2ce4759 100644 --- a/nvdsinfer_custom_impl_Yolo/layers/shortcut_layer.cpp +++ b/nvdsinfer_custom_impl_Yolo/layers/shortcut_layer.cpp @@ -17,8 +17,8 @@ shortcutLayer(int layerIdx, std::string activation, std::string inputVol, std::s assert(block.at("type") == "shortcut"); if (inputVol != shortcutVol) { - nvinfer1::ISliceLayer* slice = network->addSlice(*shortcutInput, nvinfer1::Dims{3, {0, 0, 0}}, input->getDimensions(), - nvinfer1::Dims{3, {1, 1, 1}}); + nvinfer1::ISliceLayer* slice = network->addSlice(*shortcutInput, nvinfer1::Dims{4, {0, 0, 0, 0}}, input->getDimensions(), + nvinfer1::Dims{4, {1, 1, 1, 1}}); assert(slice != nullptr); std::string sliceLayerName = "slice_" + std::to_string(layerIdx); slice->setName(sliceLayerName.c_str()); diff --git a/nvdsinfer_custom_impl_Yolo/layers/upsample_layer.cpp b/nvdsinfer_custom_impl_Yolo/layers/upsample_layer.cpp index e5e1caa..db05c83 100644 --- a/nvdsinfer_custom_impl_Yolo/layers/upsample_layer.cpp +++ b/nvdsinfer_custom_impl_Yolo/layers/upsample_layer.cpp @@ -18,14 +18,14 @@ upsampleLayer(int layerIdx, std::map& block, nvinfer1: int stride = std::stoi(block.at("stride")); - float scale[3] = {1, static_cast(stride), static_cast(stride)}; + float scale[4] = {1, 1, static_cast(stride), static_cast(stride)}; nvinfer1::IResizeLayer* resize = network->addResize(*input); assert(resize != nullptr); std::string resizeLayerName = "upsample_" + std::to_string(layerIdx); resize->setName(resizeLayerName.c_str()); resize->setResizeMode(nvinfer1::ResizeMode::kNEAREST); - resize->setScales(scale, 3); + resize->setScales(scale, 4); output = resize->getOutput(0); return output; diff --git a/nvdsinfer_custom_impl_Yolo/nvdsinfer_yolo_engine.cpp b/nvdsinfer_custom_impl_Yolo/nvdsinfer_yolo_engine.cpp index 63175c3..9180c41 100644 --- a/nvdsinfer_custom_impl_Yolo/nvdsinfer_yolo_engine.cpp +++ b/nvdsinfer_custom_impl_Yolo/nvdsinfer_yolo_engine.cpp @@ -35,39 +35,56 @@ static bool getYoloNetworkInfo(NetworkInfo& networkInfo, const NvDsInferContextInitParams* initParams) { - std::string yoloCfg = initParams->customNetworkConfigFilePath; - std::string yoloType; + std::string onnxWtsFilePath = initParams->onnxFilePath; + std::string darknetWtsFilePath = initParams->modelFilePath; + std::string darknetCfgFilePath = initParams->customNetworkConfigFilePath; - std::transform(yoloCfg.begin(), yoloCfg.end(), yoloCfg.begin(), [] (uint8_t c) { + std::string yoloType = onnxWtsFilePath != "" ? "onnx" : "darknet"; + std::string modelName = yoloType == "onnx" ? 
+ onnxWtsFilePath.substr(0, onnxWtsFilePath.find(".onnx")).substr(onnxWtsFilePath.rfind("/") + 1) : + darknetWtsFilePath.substr(0, darknetWtsFilePath.find(".weights")).substr(darknetWtsFilePath.rfind("/") + 1); + + std::transform(modelName.begin(), modelName.end(), modelName.begin(), [] (uint8_t c) { return std::tolower(c); }); - yoloType = yoloCfg.substr(0, yoloCfg.find(".cfg")); - networkInfo.inputBlobName = "input"; networkInfo.networkType = yoloType; - networkInfo.configFilePath = initParams->customNetworkConfigFilePath; - networkInfo.wtsFilePath = initParams->modelFilePath; + networkInfo.modelName = modelName; + networkInfo.onnxWtsFilePath = onnxWtsFilePath; + networkInfo.darknetWtsFilePath = darknetWtsFilePath; + networkInfo.darknetCfgFilePath = darknetCfgFilePath; + networkInfo.batchSize = initParams->maxBatchSize; + networkInfo.implicitBatch = initParams->forceImplicitBatchDimension; networkInfo.int8CalibPath = initParams->int8CalibrationFilePath; - networkInfo.deviceType = (initParams->useDLA ? "kDLA" : "kGPU"); + networkInfo.deviceType = initParams->useDLA ? "kDLA" : "kGPU"; networkInfo.numDetectedClasses = initParams->numDetectedClasses; networkInfo.clusterMode = initParams->clusterMode; + networkInfo.scaleFactor = initParams->networkScaleFactor; + networkInfo.offsets = initParams->offsets; - if (initParams->networkMode == 0) + if (initParams->networkMode == NvDsInferNetworkMode_FP32) networkInfo.networkMode = "FP32"; - else if (initParams->networkMode == 1) + else if (initParams->networkMode == NvDsInferNetworkMode_INT8) networkInfo.networkMode = "INT8"; - else if (initParams->networkMode == 2) + else if (initParams->networkMode == NvDsInferNetworkMode_FP16) networkInfo.networkMode = "FP16"; - if (networkInfo.configFilePath.empty() || networkInfo.wtsFilePath.empty()) { - std::cerr << "YOLO config file or weights file is not specified\n" << std::endl; - return false; + if (yoloType == "onnx") { + if (!fileExists(networkInfo.onnxWtsFilePath)) { + std::cerr << "ONNX model file does not exist\n" << std::endl; + return false; + } } - - if (!fileExists(networkInfo.configFilePath) || !fileExists(networkInfo.wtsFilePath)) { - std::cerr << "YOLO config file or weights file is not exist\n" << std::endl; - return false; + else { + if (!fileExists(networkInfo.darknetWtsFilePath)) { + std::cerr << "Darknet weights file does not exist\n" << std::endl; + return false; + } + else if (!fileExists(networkInfo.darknetCfgFilePath)) { + std::cerr << "Darknet cfg file does not exist\n" << std::endl; + return false; + } } return true; @@ -99,7 +116,7 @@ NvDsInferYoloCudaEngineGet(nvinfer1::IBuilder* const builder, nvinfer1::IBuilder Yolo yolo(networkInfo); cudaEngine = yolo.createEngine(builder, builderConfig); if (cudaEngine == nullptr) { - std::cerr << "Failed to build CUDA engine on " << networkInfo.configFilePath << std::endl; + std::cerr << "Failed to build CUDA engine" << std::endl; return false; } diff --git a/nvdsinfer_custom_impl_Yolo/nvdsinitinputlayers_Yolo.cpp b/nvdsinfer_custom_impl_Yolo/nvdsinitinputlayers_Yolo.cpp index 2742dfb..ae5b1f0 100644 --- a/nvdsinfer_custom_impl_Yolo/nvdsinitinputlayers_Yolo.cpp +++ b/nvdsinfer_custom_impl_Yolo/nvdsinitinputlayers_Yolo.cpp @@ -26,10 +26,10 @@ #include "nvdsinfer_custom_impl.h" bool -NvDsInferInitializeInputLayers(std::vector const &inputLayersInfo, - NvDsInferNetworkInfo const &networkInfo, unsigned int maxBatchSize) +NvDsInferInitializeInputLayers(std::vector const& inputLayersInfo, + NvDsInferNetworkInfo const& networkInfo, unsigned int 
maxBatchSize) { - float *scaleFactor = (float *) inputLayersInfo[0].buffer; + float* scaleFactor = (float*) inputLayersInfo[0].buffer; for (unsigned int i = 0; i < maxBatchSize; i++) { scaleFactor[i * 2 + 0] = 1.0; scaleFactor[i * 2 + 1] = 1.0; diff --git a/nvdsinfer_custom_impl_Yolo/nvdsparsebbox_Yolo.cpp b/nvdsinfer_custom_impl_Yolo/nvdsparsebbox_Yolo.cpp index 624cbee..21308b8 100644 --- a/nvdsinfer_custom_impl_Yolo/nvdsparsebbox_Yolo.cpp +++ b/nvdsinfer_custom_impl_Yolo/nvdsparsebbox_Yolo.cpp @@ -73,22 +73,22 @@ addBBoxProposal(const float bx1, const float by1, const float bx2, const float b } static std::vector -decodeTensorYolo(const float* detection, const uint& outputSize, const uint& netW, const uint& netH, - const std::vector& preclusterThreshold) +decodeTensorYolo(const float* boxes, const float* scores, const int* classes, const uint& outputSize, const uint& netW, + const uint& netH, const std::vector& preclusterThreshold) { std::vector binfo; for (uint b = 0; b < outputSize; ++b) { - float maxProb = detection[b * 6 + 4]; - int maxIndex = (int) detection[b * 6 + 5]; + float maxProb = scores[b]; + int maxIndex = classes[b]; if (maxProb < preclusterThreshold[maxIndex]) continue; - float bxc = detection[b * 6 + 0]; - float byc = detection[b * 6 + 1]; - float bw = detection[b * 6 + 2]; - float bh = detection[b * 6 + 3]; + float bxc = boxes[b * 4 + 0]; + float byc = boxes[b * 4 + 1]; + float bw = boxes[b * 4 + 2]; + float bh = boxes[b * 4 + 3]; float bx1 = bxc - bw / 2; float by1 = byc - bh / 2; @@ -102,22 +102,22 @@ decodeTensorYolo(const float* detection, const uint& outputSize, const uint& net } static std::vector -decodeTensorYoloE(const float* detection, const uint& outputSize, const uint& netW, const uint& netH, - const std::vector& preclusterThreshold) +decodeTensorYoloE(const float* boxes, const float* scores, const int* classes, const uint& outputSize, const uint& netW, + const uint& netH, const std::vector& preclusterThreshold) { std::vector binfo; for (uint b = 0; b < outputSize; ++b) { - float maxProb = detection[b * 6 + 4]; - int maxIndex = (int) detection[b * 6 + 5]; + float maxProb = scores[b]; + int maxIndex = classes[b]; if (maxProb < preclusterThreshold[maxIndex]) continue; - float bx1 = detection[b * 6 + 0]; - float by1 = detection[b * 6 + 1]; - float bx2 = detection[b * 6 + 2]; - float by2 = detection[b * 6 + 3]; + float bx1 = boxes[b * 4 + 0]; + float by1 = boxes[b * 4 + 1]; + float bx2 = boxes[b * 4 + 2]; + float by2 = boxes[b * 4 + 3]; addBBoxProposal(bx1, by1, bx2, by2, netW, netH, maxIndex, maxProb, binfo); } @@ -136,12 +136,27 @@ NvDsInferParseCustomYolo(std::vector const& outputLayersInfo std::vector objects; - const NvDsInferLayerInfo& layer = outputLayersInfo[0]; + NvDsInferLayerInfo* boxes; + NvDsInferLayerInfo* scores; + NvDsInferLayerInfo* classes; - const uint outputSize = layer.inferDims.d[0]; + for (uint i = 0; i < 3; ++i) { + if (outputLayersInfo[i].dataType == NvDsInferDataType::INT32) { + classes = (NvDsInferLayerInfo*) &outputLayersInfo[i]; + } + else if (outputLayersInfo[i].inferDims.d[1] == 4) { + boxes = (NvDsInferLayerInfo*) &outputLayersInfo[i]; + } + else { + scores = (NvDsInferLayerInfo*) &outputLayersInfo[i]; + } + } - std::vector outObjs = decodeTensorYolo((const float*) (layer.buffer), outputSize, - networkInfo.width, networkInfo.height, detectionParams.perClassPreclusterThreshold); + const uint outputSize = boxes->inferDims.d[0]; + + std::vector outObjs = decodeTensorYolo((const float*) (boxes->buffer), + (const float*) 
(scores->buffer), (const int*) (classes->buffer), outputSize, networkInfo.width, networkInfo.height, + detectionParams.perClassPreclusterThreshold); objects.insert(objects.end(), outObjs.begin(), outObjs.end()); @@ -161,12 +176,27 @@ NvDsInferParseCustomYoloE(std::vector const& outputLayersInf std::vector objects; - const NvDsInferLayerInfo& layer = outputLayersInfo[0]; + NvDsInferLayerInfo* boxes; + NvDsInferLayerInfo* scores; + NvDsInferLayerInfo* classes; - const uint outputSize = layer.inferDims.d[0]; + for (uint i = 0; i < 3; ++i) { + if (outputLayersInfo[i].dataType == NvDsInferDataType::INT32) { + classes = (NvDsInferLayerInfo*) &outputLayersInfo[i]; + } + else if (outputLayersInfo[i].inferDims.d[1] == 4) { + boxes = (NvDsInferLayerInfo*) &outputLayersInfo[i]; + } + else { + scores = (NvDsInferLayerInfo*) &outputLayersInfo[i]; + } + } - std::vector outObjs = decodeTensorYoloE((const float*) (layer.buffer), outputSize, - networkInfo.width, networkInfo.height, detectionParams.perClassPreclusterThreshold); + const uint outputSize = boxes->inferDims.d[0]; + + std::vector outObjs = decodeTensorYoloE((const float*) (boxes->buffer), + (const float*) (scores->buffer), (const int*) (classes->buffer), outputSize, networkInfo.width, networkInfo.height, + detectionParams.perClassPreclusterThreshold); objects.insert(objects.end(), outObjs.begin(), outObjs.end()); diff --git a/nvdsinfer_custom_impl_Yolo/nvdsparsebbox_Yolo_cuda.cu b/nvdsinfer_custom_impl_Yolo/nvdsparsebbox_Yolo_cuda.cu index ffff5ee..ab07d65 100644 --- a/nvdsinfer_custom_impl_Yolo/nvdsparsebbox_Yolo_cuda.cu +++ b/nvdsinfer_custom_impl_Yolo/nvdsparsebbox_Yolo_cuda.cu @@ -30,33 +30,33 @@ #include "nvdsinfer_custom_impl.h" extern "C" bool -NvDsInferParseYolo_cuda(std::vector const& outputLayersInfo, NvDsInferNetworkInfo const& networkInfo, +NvDsInferParseYoloCuda(std::vector const& outputLayersInfo, NvDsInferNetworkInfo const& networkInfo, NvDsInferParseDetectionParams const& detectionParams, std::vector& objectList); extern "C" bool -NvDsInferParseYoloE_cuda(std::vector const& outputLayersInfo, NvDsInferNetworkInfo const& networkInfo, +NvDsInferParseYoloECuda(std::vector const& outputLayersInfo, NvDsInferNetworkInfo const& networkInfo, NvDsInferParseDetectionParams const& detectionParams, std::vector& objectList); -__global__ void decodeTensorYolo_cuda(NvDsInferParseObjectInfo *binfo, float* input, int outputSize, int netW, int netH, - float minPreclusterThreshold) +__global__ void decodeTensorYoloCuda(NvDsInferParseObjectInfo *binfo, float* boxes, float* scores, int* classes, + int outputSize, int netW, int netH, float minPreclusterThreshold) { int x_id = blockIdx.x * blockDim.x + threadIdx.x; if (x_id >= outputSize) return; - float maxProb = input[x_id * 6 + 4]; - int maxIndex = (int) input[x_id * 6 + 5]; + float maxProb = scores[x_id]; + int maxIndex = classes[x_id]; if (maxProb < minPreclusterThreshold) { binfo[x_id].detectionConfidence = 0.0; return; } - float bxc = input[x_id * 6 + 0]; - float byc = input[x_id * 6 + 1]; - float bw = input[x_id * 6 + 2]; - float bh = input[x_id * 6 + 3]; + float bxc = boxes[x_id * 4 + 0]; + float byc = boxes[x_id * 4 + 1]; + float bw = boxes[x_id * 4 + 2]; + float bh = boxes[x_id * 4 + 3]; float x0 = bxc - bw / 2; float y0 = byc - bh / 2; @@ -76,26 +76,26 @@ __global__ void decodeTensorYolo_cuda(NvDsInferParseObjectInfo *binfo, float* in binfo[x_id].classId = maxIndex; } -__global__ void decodeTensorYoloE_cuda(NvDsInferParseObjectInfo *binfo, float* input, int outputSize, int netW, int netH, - 
float minPreclusterThreshold) +__global__ void decodeTensorYoloECuda(NvDsInferParseObjectInfo *binfo, float* boxes, float* scores, int* classes, + int outputSize, int netW, int netH, float minPreclusterThreshold) { int x_id = blockIdx.x * blockDim.x + threadIdx.x; if (x_id >= outputSize) return; - float maxProb = input[x_id * 6 + 4]; - int maxIndex = (int) input[x_id * 6 + 5]; + float maxProb = scores[x_id]; + int maxIndex = classes[x_id]; if (maxProb < minPreclusterThreshold) { binfo[x_id].detectionConfidence = 0.0; return; } - float x0 = input[x_id * 6 + 0]; - float y0 = input[x_id * 6 + 1]; - float x1 = input[x_id * 6 + 2]; - float y1 = input[x_id * 6 + 3]; + float x0 = boxes[x_id * 4 + 0]; + float y0 = boxes[x_id * 4 + 1]; + float x1 = boxes[x_id * 4 + 2]; + float y1 = boxes[x_id * 4 + 3]; x0 = fminf(float(netW), fmaxf(float(0.0), x0)); y0 = fminf(float(netH), fmaxf(float(0.0), y0)); @@ -110,7 +110,7 @@ __global__ void decodeTensorYoloE_cuda(NvDsInferParseObjectInfo *binfo, float* i binfo[x_id].classId = maxIndex; } -static bool NvDsInferParseCustomYolo_cuda(std::vector const& outputLayersInfo, +static bool NvDsInferParseCustomYoloCuda(std::vector const& outputLayersInfo, NvDsInferNetworkInfo const& networkInfo, NvDsInferParseDetectionParams const& detectionParams, std::vector& objectList) { @@ -119,9 +119,23 @@ static bool NvDsInferParseCustomYolo_cuda(std::vector const& return false; } - const NvDsInferLayerInfo &layer = outputLayersInfo[0]; + NvDsInferLayerInfo* boxes; + NvDsInferLayerInfo* scores; + NvDsInferLayerInfo* classes; - const int outputSize = layer.inferDims.d[0]; + for (uint i = 0; i < 3; ++i) { + if (outputLayersInfo[i].dataType == NvDsInferDataType::INT32) { + classes = (NvDsInferLayerInfo*) &outputLayersInfo[i]; + } + else if (outputLayersInfo[i].inferDims.d[1] == 4) { + boxes = (NvDsInferLayerInfo*) &outputLayersInfo[i]; + } + else { + scores = (NvDsInferLayerInfo*) &outputLayersInfo[i]; + } + } + + const int outputSize = boxes->inferDims.d[0]; thrust::device_vector objects(outputSize); @@ -131,9 +145,9 @@ static bool NvDsInferParseCustomYolo_cuda(std::vector const& int threads_per_block = 1024; int number_of_blocks = ((outputSize - 1) / threads_per_block) + 1; - decodeTensorYolo_cuda<<>>( - thrust::raw_pointer_cast(objects.data()), (float*) layer.buffer, outputSize, networkInfo.width, networkInfo.height, - minPreclusterThreshold); + decodeTensorYoloCuda<<>>( + thrust::raw_pointer_cast(objects.data()), (float*) (boxes->buffer), (float*) (scores->buffer), + (int*) (classes->buffer), outputSize, networkInfo.width, networkInfo.height, minPreclusterThreshold); objectList.resize(outputSize); thrust::copy(objects.begin(), objects.end(), objectList.begin()); @@ -141,7 +155,7 @@ static bool NvDsInferParseCustomYolo_cuda(std::vector const& return true; } -static bool NvDsInferParseCustomYoloE_cuda(std::vector const& outputLayersInfo, +static bool NvDsInferParseCustomYoloECuda(std::vector const& outputLayersInfo, NvDsInferNetworkInfo const& networkInfo, NvDsInferParseDetectionParams const& detectionParams, std::vector& objectList) { @@ -150,9 +164,23 @@ static bool NvDsInferParseCustomYoloE_cuda(std::vector const return false; } - const NvDsInferLayerInfo &layer = outputLayersInfo[0]; + NvDsInferLayerInfo* boxes; + NvDsInferLayerInfo* scores; + NvDsInferLayerInfo* classes; - const int outputSize = layer.inferDims.d[0]; + for (uint i = 0; i < 3; ++i) { + if (outputLayersInfo[i].dataType == NvDsInferDataType::INT32) { + classes = (NvDsInferLayerInfo*) &outputLayersInfo[i]; + } + 
else if (outputLayersInfo[i].inferDims.d[1] == 4) { + boxes = (NvDsInferLayerInfo*) &outputLayersInfo[i]; + } + else { + scores = (NvDsInferLayerInfo*) &outputLayersInfo[i]; + } + } + + const int outputSize = boxes->inferDims.d[0]; thrust::device_vector objects(outputSize); @@ -162,9 +190,9 @@ static bool NvDsInferParseCustomYoloE_cuda(std::vector const int threads_per_block = 1024; int number_of_blocks = ((outputSize - 1) / threads_per_block) + 1; - decodeTensorYoloE_cuda<<>>( - thrust::raw_pointer_cast(objects.data()), (float*) layer.buffer, outputSize, networkInfo.width, networkInfo.height, - minPreclusterThreshold); + decodeTensorYoloECuda<<>>( + thrust::raw_pointer_cast(objects.data()), (float*) (boxes->buffer), (float*) (scores->buffer), + (int*) (classes->buffer), outputSize, networkInfo.width, networkInfo.height, minPreclusterThreshold); objectList.resize(outputSize); thrust::copy(objects.begin(), objects.end(), objectList.begin()); @@ -173,18 +201,18 @@ static bool NvDsInferParseCustomYoloE_cuda(std::vector const } extern "C" bool -NvDsInferParseYolo_cuda(std::vector const& outputLayersInfo, NvDsInferNetworkInfo const& networkInfo, +NvDsInferParseYoloCuda(std::vector const& outputLayersInfo, NvDsInferNetworkInfo const& networkInfo, NvDsInferParseDetectionParams const& detectionParams, std::vector& objectList) { - return NvDsInferParseCustomYolo_cuda(outputLayersInfo, networkInfo, detectionParams, objectList); + return NvDsInferParseCustomYoloCuda(outputLayersInfo, networkInfo, detectionParams, objectList); } extern "C" bool -NvDsInferParseYoloE_cuda(std::vector const& outputLayersInfo, NvDsInferNetworkInfo const& networkInfo, +NvDsInferParseYoloECuda(std::vector const& outputLayersInfo, NvDsInferNetworkInfo const& networkInfo, NvDsInferParseDetectionParams const& detectionParams, std::vector& objectList) { - return NvDsInferParseCustomYoloE_cuda(outputLayersInfo, networkInfo, detectionParams, objectList); + return NvDsInferParseCustomYoloECuda(outputLayersInfo, networkInfo, detectionParams, objectList); } -CHECK_CUSTOM_PARSE_FUNC_PROTOTYPE(NvDsInferParseYolo_cuda); -CHECK_CUSTOM_PARSE_FUNC_PROTOTYPE(NvDsInferParseYoloE_cuda); +CHECK_CUSTOM_PARSE_FUNC_PROTOTYPE(NvDsInferParseYoloCuda); +CHECK_CUSTOM_PARSE_FUNC_PROTOTYPE(NvDsInferParseYoloECuda); diff --git a/nvdsinfer_custom_impl_Yolo/utils.cpp b/nvdsinfer_custom_impl_Yolo/utils.cpp index b3ff68f..96f6070 100644 --- a/nvdsinfer_custom_impl_Yolo/utils.cpp +++ b/nvdsinfer_custom_impl_Yolo/utils.cpp @@ -60,15 +60,16 @@ bool fileExists(const std::string fileName, bool verbose) { if (!std::experimental::filesystem::exists(std::experimental::filesystem::path(fileName))) { - if (verbose) + if (verbose) { std::cout << "\nFile does not exist: " << fileName << std::endl; + } return false; } return true; } std::vector -loadWeights(const std::string weightsFilePath, const std::string& networkType) +loadWeights(const std::string weightsFilePath, const std::string& modelName) { assert(fileExists(weightsFilePath)); std::cout << "\nLoading pre-trained weights" << std::endl; @@ -80,7 +81,7 @@ loadWeights(const std::string weightsFilePath, const std::string& networkType) assert(file.good()); std::string line; - if (networkType.find("yolov2") != std::string::npos && networkType.find("yolov2-tiny") == std::string::npos) { + if (modelName.find("yolov2") != std::string::npos && modelName.find("yolov2-tiny") == std::string::npos) { // Remove 4 int32 bytes of data from the stream belonging to the header file.ignore(4 * 4); } @@ -94,8 +95,9 @@ 
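**NOTE**: `loadWeights` consumes a raw Darknet `.weights` stream: an int32 header followed by float32 data until EOF. A hedged Python sketch of the same read; the 4-int header for non-tiny yolov2 matches the branch this hunk edits, and the 5-int header for other models is an assumption based on the standard Darknet format:

```
import struct

def load_darknet_weights(path, model_name):
    """Raw Darknet .weights: int32 header, then float32 weights until EOF."""
    header_ints = 4 if 'yolov2' in model_name and 'yolov2-tiny' not in model_name else 5
    with open(path, 'rb') as f:
        f.read(4 * header_ints)  # discard the header, like file.ignore() above
        blob = f.read()
    count = len(blob) // 4
    return list(struct.unpack('<%df' % count, blob[:count * 4]))
```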
loadWeights(const std::string weightsFilePath, const std::string& networkType) file.read(floatWeight, 4); assert(file.gcount() == 4); weights.push_back(*reinterpret_cast(floatWeight)); - if (file.peek() == std::istream::traits_type::eof()) + if (file.peek() == std::istream::traits_type::eof()) { break; + } } } else { @@ -103,7 +105,7 @@ loadWeights(const std::string weightsFilePath, const std::string& networkType) assert(0); } - std::cout << "Loading weights of " << networkType << " complete" << std::endl; + std::cout << "Loading weights of " << modelName << " complete" << std::endl; std::cout << "Total weights read: " << weights.size() << std::endl; return weights; @@ -116,8 +118,9 @@ dimsToString(const nvinfer1::Dims d) std::stringstream s; s << "["; - for (int i = 0; i < d.nbDims - 1; ++i) + for (int i = 1; i < d.nbDims - 1; ++i) { s << d.d[i] << ", "; + } s << d.d[d.nbDims - 1] << "]"; return s.str(); @@ -127,16 +130,15 @@ int getNumChannels(nvinfer1::ITensor* t) { nvinfer1::Dims d = t->getDimensions(); - assert(d.nbDims == 3); - - return d.d[0]; + assert(d.nbDims == 4); + return d.d[1]; } void printLayerInfo(std::string layerIndex, std::string layerName, std::string layerInput, std::string layerOutput, std::string weightPtr) { - std::cout << std::setw(8) << std::left << layerIndex << std::setw(30) << std::left << layerName; - std::cout << std::setw(20) << std::left << layerInput << std::setw(20) << std::left << layerOutput; + std::cout << std::setw(7) << std::left << layerIndex << std::setw(40) << std::left << layerName; + std::cout << std::setw(19) << std::left << layerInput << std::setw(19) << std::left << layerOutput; std::cout << weightPtr << std::endl; } diff --git a/nvdsinfer_custom_impl_Yolo/utils.h b/nvdsinfer_custom_impl_Yolo/utils.h index f50f954..af7e202 100644 --- a/nvdsinfer_custom_impl_Yolo/utils.h +++ b/nvdsinfer_custom_impl_Yolo/utils.h @@ -40,7 +40,7 @@ float clamp(const float val, const float minVal, const float maxVal); bool fileExists(const std::string fileName, bool verbose = true); -std::vector loadWeights(const std::string weightsFilePath, const std::string& networkType); +std::vector loadWeights(const std::string weightsFilePath, const std::string& modelName); std::string dimsToString(const nvinfer1::Dims d); diff --git a/nvdsinfer_custom_impl_Yolo/yolo.cpp b/nvdsinfer_custom_impl_Yolo/yolo.cpp index 3f39d59..1bb203d 100644 --- a/nvdsinfer_custom_impl_Yolo/yolo.cpp +++ b/nvdsinfer_custom_impl_Yolo/yolo.cpp @@ -23,6 +23,8 @@ * https://www.github.com/marcoslucianops */ +#include "NvOnnxParser.h" + #include "yolo.h" #include "yoloPlugins.h" @@ -31,11 +33,14 @@ #endif Yolo::Yolo(const NetworkInfo& networkInfo) : m_InputBlobName(networkInfo.inputBlobName), - m_NetworkType(networkInfo.networkType), m_ConfigFilePath(networkInfo.configFilePath), - m_WtsFilePath(networkInfo.wtsFilePath), m_Int8CalibPath(networkInfo.int8CalibPath), m_DeviceType(networkInfo.deviceType), - m_NumDetectedClasses(networkInfo.numDetectedClasses), m_ClusterMode(networkInfo.clusterMode), - m_NetworkMode(networkInfo.networkMode), m_InputH(0), m_InputW(0), m_InputC(0), m_InputSize(0), m_NumClasses(0), - m_LetterBox(0), m_NewCoords(0), m_YoloCount(0) + m_NetworkType(networkInfo.networkType), m_ModelName(networkInfo.modelName), + m_OnnxWtsFilePath(networkInfo.onnxWtsFilePath), m_DarknetWtsFilePath(networkInfo.darknetWtsFilePath), + m_DarknetCfgFilePath(networkInfo.darknetCfgFilePath), m_BatchSize(networkInfo.batchSize), + m_ImplicitBatch(networkInfo.implicitBatch), 
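**NOTE**: With the move to explicit batch, every tensor is 4-D NCHW, which is why `getNumChannels` now asserts `nbDims == 4` and reads `d[1]`, and `dimsToString` starts printing at index 1 to hide the batch dimension. A tiny sketch of the layout:

```
dims = (-1, 3, 640, 640)         # explicit-batch NCHW; batch is -1 when dynamic
channels = dims[1]               # what getNumChannels now returns
print(channels, list(dims[1:]))  # 3 [3, 640, 640], what dimsToString now prints
```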
m_Int8CalibPath(networkInfo.int8CalibPath), + m_DeviceType(networkInfo.deviceType), m_NumDetectedClasses(networkInfo.numDetectedClasses), + m_ClusterMode(networkInfo.clusterMode), m_NetworkMode(networkInfo.networkMode), m_ScaleFactor(networkInfo.scaleFactor), + m_Offsets(networkInfo.offsets), m_InputC(0), m_InputH(0), m_InputW(0), m_InputSize(0), m_NumClasses(0), m_LetterBox(0), + m_NewCoords(0), m_YoloCount(0) { } @@ -47,74 +52,175 @@ Yolo::~Yolo() nvinfer1::ICudaEngine* Yolo::createEngine(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config) { - assert (builder); + assert(builder); - m_ConfigBlocks = parseConfigFile(m_ConfigFilePath); - parseConfigBlocks(); + nvinfer1::NetworkDefinitionCreationFlags flags = + (1U << static_cast(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH)); - nvinfer1::INetworkDefinition *network = builder->createNetworkV2(0); - if (parseModel(*network) != NVDSINFER_SUCCESS) { + nvinfer1::INetworkDefinition* network = builder->createNetworkV2(flags); + assert(network); -#ifdef LEGACY - network->destroy(); + nvonnxparser::IParser* parser; + + if (m_NetworkType == "onnx") { + parser = nvonnxparser::createParser(*network, *builder->getLogger()); + if (!parser->parseFromFile(m_OnnxWtsFilePath.c_str(), static_cast(nvinfer1::ILogger::Severity::kWARNING))) { + std::cerr << "\nCould not parse the ONNX model\n" << std::endl; + +#if NV_TENSORRT_MAJOR >= 8 + delete parser; + delete network; #else - delete network; + parser->destroy(); + network->destroy(); #endif - return nullptr; + return nullptr; + } + m_InputC = network->getInput(0)->getDimensions().d[1]; + m_InputH = network->getInput(0)->getDimensions().d[2]; + m_InputW = network->getInput(0)->getDimensions().d[3]; + } + else { + m_ConfigBlocks = parseConfigFile(m_DarknetCfgFilePath); + parseConfigBlocks(); + if (parseModel(*network) != NVDSINFER_SUCCESS) { + +#if NV_TENSORRT_MAJOR >= 8 + delete network; +#else + network->destroy(); +#endif + + return nullptr; + } } - std::cout << "Building the TensorRT Engine\n" << std::endl; - - if (m_NumClasses != m_NumDetectedClasses) { - std::cout << "NOTE: Number of classes mismatch, make sure to set num-detected-classes=" << m_NumClasses - << " in config_infer file\n" << std::endl; + if (!m_ImplicitBatch && network->getInput(0)->getDimensions().d[0] == -1) { + nvinfer1::IOptimizationProfile* profile = builder->createOptimizationProfile(); + assert(profile); + for (int32_t i = 0; i < network->getNbInputs(); ++i) { + nvinfer1::ITensor* input = network->getInput(i); + nvinfer1::Dims inputDims = input->getDimensions(); + nvinfer1::Dims dims = inputDims; + dims.d[0] = 1; + profile->setDimensions(input->getName(), nvinfer1::OptProfileSelector::kMIN, dims); + dims.d[0] = m_BatchSize; + profile->setDimensions(input->getName(), nvinfer1::OptProfileSelector::kOPT, dims); + dims.d[0] = m_BatchSize; + profile->setDimensions(input->getName(), nvinfer1::OptProfileSelector::kMAX, dims); + } + config->addOptimizationProfile(profile); } - if (m_LetterBox == 1) { - std::cout << "NOTE: letter_box is set in cfg file, make sure to set maintain-aspect-ratio=1 in config_infer file" - << " to get better accuracy\n" << std::endl; + + std::cout << "\nBuilding the TensorRT Engine\n" << std::endl; + + if (m_NetworkType == "darknet") { + if (m_NumClasses != m_NumDetectedClasses) { + std::cout << "NOTE: Number of classes mismatch, make sure to set num-detected-classes=" << m_NumClasses + << " in config_infer file\n" << std::endl; + } + if (m_LetterBox == 1) { + std::cout << "NOTE: letter_box is 
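**NOTE**: The optimization profile above is what makes `--dynamic` ONNX exports buildable: min is pinned to batch 1 and opt/max to the configured batch size. A sketch of the equivalent setup in the TensorRT Python API; the `model.onnx` name, the 640x640 shape, and the batch of 4 are assumptions:

```
import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(TRT_LOGGER)
network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
parser = trt.OnnxParser(network, TRT_LOGGER)

with open('model.onnx', 'rb') as f:  # assumed file name
    assert parser.parse(f.read())

config = builder.create_builder_config()
if network.get_input(0).shape[0] == -1:  # model exported with --dynamic
    profile = builder.create_optimization_profile()
    name = network.get_input(0).name
    profile.set_shape(name, (1, 3, 640, 640), (4, 3, 640, 640), (4, 3, 640, 640))
    config.add_optimization_profile(profile)
```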
set in cfg file, make sure to set maintain-aspect-ratio=1 in config_infer file" + << " to get better accuracy\n" << std::endl; + } } if (m_ClusterMode != 2) { std::cout << "NOTE: Wrong cluster-mode is set, make sure to set cluster-mode=2 in config_infer file\n" << std::endl; } - if (m_NetworkMode == "INT8" && !fileExists(m_Int8CalibPath)) { + if (m_NetworkMode == "FP16") { + assert(builder->platformHasFastFp16()); + config->setFlag(nvinfer1::BuilderFlag::kFP16); + } + else if (m_NetworkMode == "INT8") { assert(builder->platformHasFastInt8()); -#ifdef OPENCV - std::string calib_image_list; - int calib_batch_size; - if (getenv("INT8_CALIB_IMG_PATH")) - calib_image_list = getenv("INT8_CALIB_IMG_PATH"); - else { - std::cerr << "INT8_CALIB_IMG_PATH not set" << std::endl; - assert(0); - } - if (getenv("INT8_CALIB_BATCH_SIZE")) - calib_batch_size = std::stoi(getenv("INT8_CALIB_BATCH_SIZE")); - else { - std::cerr << "INT8_CALIB_BATCH_SIZE not set" << std::endl; - assert(0); - } - nvinfer1::IInt8EntropyCalibrator2 *calibrator = new Int8EntropyCalibrator2(calib_batch_size, m_InputC, m_InputH, - m_InputW, m_LetterBox, calib_image_list, m_Int8CalibPath); config->setFlag(nvinfer1::BuilderFlag::kINT8); - config->setInt8Calibrator(calibrator); + if (m_Int8CalibPath != "" && !fileExists(m_Int8CalibPath)) { + +#ifdef OPENCV + std::string calib_image_list; + int calib_batch_size; + if (getenv("INT8_CALIB_IMG_PATH")) { + calib_image_list = getenv("INT8_CALIB_IMG_PATH"); + } + else { + std::cerr << "INT8_CALIB_IMG_PATH not set" << std::endl; + assert(0); + } + if (getenv("INT8_CALIB_BATCH_SIZE")) { + calib_batch_size = std::stoi(getenv("INT8_CALIB_BATCH_SIZE")); + } + else { + std::cerr << "INT8_CALIB_BATCH_SIZE not set" << std::endl; + assert(0); + } + nvinfer1::IInt8EntropyCalibrator2* calibrator = new Int8EntropyCalibrator2(calib_batch_size, m_InputC, m_InputH, + m_InputW, m_ScaleFactor, m_Offsets, calib_image_list, m_Int8CalibPath); + config->setInt8Calibrator(calibrator); #else - std::cerr << "OpenCV is required to run INT8 calibrator\n" << std::endl; - assert(0); + std::cerr << "OpenCV is required to run INT8 calibrator\n" << std::endl; + +#if NV_TENSORRT_MAJOR >= 8 + if (m_NetworkType == "onnx") { + delete parser; + } + delete network; +#else + if (m_NetworkType == "onnx") { + parser->destroy(); + } + network->destroy(); #endif + + return nullptr; +#endif + + } } - nvinfer1::ICudaEngine *engine = builder->buildEngineWithConfig(*network, *config); - if (engine) - std::cout << "Building complete\n" << std::endl; - else - std::cerr << "Building engine failed\n" << std::endl; +#ifdef GRAPH + config->setProfilingVerbosity(nvinfer1::ProfilingVerbosity::kDETAILED); +#endif -#ifdef LEGACY - network->destroy(); + nvinfer1::ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config); + if (engine) { + std::cout << "Building complete\n" << std::endl; + } + else { + std::cerr << "Building engine failed\n" << std::endl; + } + +#ifdef GRAPH + nvinfer1::IExecutionContext *context = engine->createExecutionContext(); + nvinfer1::IEngineInspector *inpector = engine->createEngineInspector(); + inpector->setExecutionContext(context); + std::ofstream graph; + graph.open("graph.json"); + graph << inpector->getEngineInformation(nvinfer1::LayerInformationFormat::kJSON); + graph.close(); + std::cout << "Network graph saved to graph.json\n" << std::endl; + +#if NV_TENSORRT_MAJOR >= 8 + delete inpector; + delete context; #else - delete network; + inpector->destroy(); + context->destroy(); +#endif + +#endif + +#if 
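**NOTE**: The `GRAPH` build writes `graph.json` through `IEngineInspector` (TensorRT 8.2+), which only reports full per-layer detail when profiling verbosity is `kDETAILED`, hence the `setProfilingVerbosity` call above. A Python sketch of the same dump, assuming an already-built `engine`:

```
import tensorrt as trt

def dump_graph(engine, path='graph.json'):
    """Write the engine's layer graph as JSON, like the GRAPH build flag does."""
    inspector = engine.create_engine_inspector()
    inspector.execution_context = engine.create_execution_context()
    with open(path, 'w') as f:
        f.write(inspector.get_engine_information(trt.LayerInformationFormat.JSON))
```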
NV_TENSORRT_MAJOR >= 8 + if (m_NetworkType == "onnx") { + delete parser; + } + delete network; +#else + if (m_NetworkType == "onnx") { + parser->destroy(); + } + network->destroy(); #endif return engine; @@ -124,14 +230,16 @@ NvDsInferStatus Yolo::parseModel(nvinfer1::INetworkDefinition& network) { destroyNetworkUtils(); - std::vector weights = loadWeights(m_WtsFilePath, m_NetworkType); + std::vector weights = loadWeights(m_DarknetWtsFilePath, m_ModelName); std::cout << "Building YOLO network\n" << std::endl; NvDsInferStatus status = buildYoloNetwork(weights, network); - if (status == NVDSINFER_SUCCESS) + if (status == NVDSINFER_SUCCESS) { std::cout << "Building YOLO network complete" << std::endl; - else + } + else { std::cerr << "Building YOLO network failed" << std::endl; + } return status; } @@ -141,8 +249,11 @@ Yolo::buildYoloNetwork(std::vector& weights, nvinfer1::INetworkDefinition { int weightPtr = 0; + uint batchSize = m_ImplicitBatch ? m_BatchSize : -1; + nvinfer1::ITensor* data = network.addInput(m_InputBlobName.c_str(), nvinfer1::DataType::kFLOAT, - nvinfer1::Dims{3, {static_cast(m_InputC), static_cast(m_InputH), static_cast(m_InputW)}}); + nvinfer1::Dims{4, {static_cast(batchSize), static_cast(m_InputC), static_cast(m_InputH), + static_cast(m_InputW)}}); assert(data != nullptr && data->getDimensions().nbDims > 0); nvinfer1::ITensor* previous = data; @@ -287,28 +398,13 @@ Yolo::buildYoloNetwork(std::vector& weights, nvinfer1::INetworkDefinition std::string layerName = m_ConfigBlocks.at(i).at("type"); printLayerInfo(layerIndex, layerName, inputVol, outputVol, "-"); } - else if (m_ConfigBlocks.at(i).at("type") == "reorg3d") { + else if (m_ConfigBlocks.at(i).at("type") == "reorg" || m_ConfigBlocks.at(i).at("type") == "reorg3d") { std::string inputVol = dimsToString(previous->getDimensions()); previous = reorgLayer(i, m_ConfigBlocks.at(i), previous, &network); assert(previous != nullptr); std::string outputVol = dimsToString(previous->getDimensions()); tensorOutputs.push_back(previous); - std::string layerName = "reorg3d"; - printLayerInfo(layerIndex, layerName, inputVol, outputVol, "-"); - } - else if (m_ConfigBlocks.at(i).at("type") == "reorg") { - std::string inputVol = dimsToString(previous->getDimensions()); - nvinfer1::IPluginV2* reorgPlugin = createReorgPlugin(2); - assert(reorgPlugin != nullptr); - nvinfer1::IPluginV2Layer* reorg = network.addPluginV2(&previous, 1, *reorgPlugin); - assert(reorg != nullptr); - std::string reorglayerName = "reorg_" + std::to_string(i); - reorg->setName(reorglayerName.c_str()); - previous = reorg->getOutput(0); - assert(previous != nullptr); - std::string outputVol = dimsToString(previous->getDimensions()); - tensorOutputs.push_back(previous); - std::string layerName = "reorg"; + std::string layerName = m_ConfigBlocks.at(i).at("type"); printLayerInfo(layerIndex, layerName, inputVol, outputVol, "-"); } else if (m_ConfigBlocks.at(i).at("type") == "yolo" || m_ConfigBlocks.at(i).at("type") == "region") { @@ -317,9 +413,8 @@ Yolo::buildYoloNetwork(std::vector& weights, nvinfer1::INetworkDefinition nvinfer1::Dims prevTensorDims = previous->getDimensions(); TensorInfo& curYoloTensor = m_YoloTensors.at(yoloCountInputs); curYoloTensor.blobName = blobName; - curYoloTensor.gridSizeX = prevTensorDims.d[2]; - curYoloTensor.gridSizeY = prevTensorDims.d[1]; - + curYoloTensor.gridSizeY = prevTensorDims.d[2]; + curYoloTensor.gridSizeX = prevTensorDims.d[3]; std::string inputVol = dimsToString(previous->getDimensions()); tensorOutputs.push_back(previous); 
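**NOTE**: `reorg` and `reorg3d` now share the `reorgLayer` code path instead of the old `createReorgPlugin` route. Functionally the Darknet reorg is a stride-2 space-to-depth; a numpy sketch of one common ordering (Darknet's exact channel order differs slightly, so treat this as illustrative):

```
import numpy as np

def reorg(x, stride=2):
    """Space-to-depth on NCHW: (N, C, H, W) -> (N, C*s*s, H//s, W//s)."""
    n, c, h, w = x.shape
    x = x.reshape(n, c, h // stride, stride, w // stride, stride)
    x = x.transpose(0, 3, 5, 1, 2, 4)
    return x.reshape(n, c * stride * stride, h // stride, w // stride)

print(reorg(np.zeros((1, 64, 26, 26))).shape)  # (1, 256, 13, 13)
```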
yoloTensorInputs[yoloCountInputs] = previous; @@ -345,10 +440,10 @@ Yolo::buildYoloNetwork(std::vector& weights, nvinfer1::INetworkDefinition uint64_t outputSize = 0; for (uint j = 0; j < yoloCountInputs; ++j) { TensorInfo& curYoloTensor = m_YoloTensors.at(j); - outputSize += curYoloTensor.gridSizeX * curYoloTensor.gridSizeY * curYoloTensor.numBBoxes; + outputSize += curYoloTensor.numBBoxes * curYoloTensor.gridSizeY * curYoloTensor.gridSizeX; } - nvinfer1::IPluginV2* yoloPlugin = new YoloLayer(m_InputW, m_InputH, m_NumClasses, m_NewCoords, m_YoloTensors, + nvinfer1::IPluginV2DynamicExt* yoloPlugin = new YoloLayer(m_InputW, m_InputH, m_NumClasses, m_NewCoords, m_YoloTensors, outputSize); assert(yoloPlugin != nullptr); nvinfer1::IPluginV2Layer* yolo = network.addPluginV2(yoloTensorInputs, m_YoloCount, *yoloPlugin); @@ -356,10 +451,19 @@ Yolo::buildYoloNetwork(std::vector& weights, nvinfer1::INetworkDefinition std::string yoloLayerName = "yolo"; yolo->setName(yoloLayerName.c_str()); - nvinfer1::ITensor* outputYolo = yolo->getOutput(0); - std::string outputYoloLayerName = "output"; - outputYolo->setName(outputYoloLayerName.c_str()); - network.markOutput(*outputYolo); + std::string outputlayerName; + nvinfer1::ITensor* detection_boxes = yolo->getOutput(0); + outputlayerName = "boxes"; + detection_boxes->setName(outputlayerName.c_str()); + nvinfer1::ITensor* detection_scores = yolo->getOutput(1); + outputlayerName = "scores"; + detection_scores->setName(outputlayerName.c_str()); + nvinfer1::ITensor* detection_classes = yolo->getOutput(2); + outputlayerName = "classes"; + detection_classes->setName(outputlayerName.c_str()); + network.markOutput(*detection_boxes); + network.markOutput(*detection_scores); + network.markOutput(*detection_classes); } else { std::cerr << "\nError in yolo cfg file" << std::endl; diff --git a/nvdsinfer_custom_impl_Yolo/yolo.h b/nvdsinfer_custom_impl_Yolo/yolo.h index 5ce4ed3..095aa9a 100644 --- a/nvdsinfer_custom_impl_Yolo/yolo.h +++ b/nvdsinfer_custom_impl_Yolo/yolo.h @@ -45,13 +45,19 @@ struct NetworkInfo { std::string inputBlobName; std::string networkType; - std::string configFilePath; - std::string wtsFilePath; + std::string modelName; + std::string onnxWtsFilePath; + std::string darknetWtsFilePath; + std::string darknetCfgFilePath; + uint batchSize; + int implicitBatch; std::string int8CalibPath; std::string deviceType; uint numDetectedClasses; int clusterMode; std::string networkMode; + float scaleFactor; + const float* offsets; }; struct TensorInfo @@ -74,7 +80,8 @@ class Yolo : public IModelParser { bool hasFullDimsSupported() const override { return false; } const char* getModelName() const override { - return m_ConfigFilePath.empty() ? m_NetworkType.c_str() : m_ConfigFilePath.c_str(); + return m_NetworkType == "onnx" ? 
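**NOTE**: The network now marks three named outputs (`boxes`, `scores`, `classes`) instead of the single concatenated `output`, which is what the new config_infer files and bbox parsers expect. A quick structural check on an exported model; the file name is an assumption:

```
import onnx

model = onnx.load('yolov8s.onnx')  # assumed file name
for out in model.graph.output:
    dims = [d.dim_param or d.dim_value for d in out.type.tensor_type.shape.dim]
    print(out.name, dims)
# expected: boxes [batch, N, 4], scores [batch, N, 1], classes [batch, N, 1]
```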
m_OnnxWtsFilePath.substr(0, m_OnnxWtsFilePath.find(".onnx")).c_str() : + m_DarknetCfgFilePath.substr(0, m_DarknetCfgFilePath.find(".cfg")).c_str(); } NvDsInferStatus parseModel(nvinfer1::INetworkDefinition& network) override; @@ -84,17 +91,23 @@ class Yolo : public IModelParser { protected: const std::string m_InputBlobName; const std::string m_NetworkType; - const std::string m_ConfigFilePath; - const std::string m_WtsFilePath; + const std::string m_ModelName; + const std::string m_OnnxWtsFilePath; + const std::string m_DarknetWtsFilePath; + const std::string m_DarknetCfgFilePath; + const uint m_BatchSize; + const int m_ImplicitBatch; const std::string m_Int8CalibPath; const std::string m_DeviceType; const uint m_NumDetectedClasses; const int m_ClusterMode; const std::string m_NetworkMode; + const float m_ScaleFactor; + const float* m_Offsets; + uint m_InputC; uint m_InputH; uint m_InputW; - uint m_InputC; uint64_t m_InputSize; uint m_NumClasses; uint m_LetterBox; diff --git a/nvdsinfer_custom_impl_Yolo/yoloForward.cu b/nvdsinfer_custom_impl_Yolo/yoloForward.cu index 98fa2ff..be6ebcb 100644 --- a/nvdsinfer_custom_impl_Yolo/yoloForward.cu +++ b/nvdsinfer_custom_impl_Yolo/yoloForward.cu @@ -4,13 +4,12 @@ */ #include -#include inline __device__ float sigmoidGPU(const float& x) { return 1.0f / (1.0f + __expf(-x)); } -__global__ void gpuYoloLayer(const float* input, float* output, int* count, const uint netWidth, const uint netHeight, - const uint gridSizeX, const uint gridSizeY, const uint numOutputClasses, const uint numBBoxes, const float scaleXY, - const float* anchors, const int* mask) +__global__ void gpuYoloLayer(const float* input, float* boxes, float* scores, int* classes, const uint netWidth, + const uint netHeight, const uint gridSizeX, const uint gridSizeY, const uint numOutputClasses, const uint numBBoxes, + const uint64_t lastInputSize, const float scaleXY, const float* anchors, const int* mask) { uint x_id = blockIdx.x * blockDim.x + threadIdx.x; uint y_id = blockIdx.y * blockDim.y + threadIdx.y; @@ -22,8 +21,6 @@ __global__ void gpuYoloLayer(const float* input, float* output, int* count, cons const int numGridCells = gridSizeX * gridSizeY; const int bbindex = y_id * gridSizeX + x_id; - const float objectness = sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)]); - const float alpha = scaleXY; const float beta = -0.5 * (scaleXY - 1); @@ -37,6 +34,8 @@ __global__ void gpuYoloLayer(const float* input, float* output, int* count, cons float h = __expf(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 3)]) * anchors[mask[z_id] * 2 + 1]; + const float objectness = sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)]); + float maxProb = 0.0f; int maxIndex = -1; @@ -48,25 +47,25 @@ __global__ void gpuYoloLayer(const float* input, float* output, int* count, cons } } - int _count = (int)atomicAdd(count, 1); + int count = z_id * gridSizeX * gridSizeY + y_id * gridSizeY + x_id + lastInputSize; - output[_count * 6 + 0] = xc; - output[_count * 6 + 1] = yc; - output[_count * 6 + 2] = w; - output[_count * 6 + 3] = h; - output[_count * 6 + 4] = maxProb * objectness; - output[_count * 6 + 5] = maxIndex; + boxes[count * 4 + 0] = xc; + boxes[count * 4 + 1] = yc; + boxes[count * 4 + 2] = w; + boxes[count * 4 + 3] = h; + scores[count] = maxProb * objectness; + classes[count] = maxIndex; } -cudaError_t cudaYoloLayer(const void* input, void* output, void* count, const uint& batchSize, uint64_t& inputSize, - uint64_t& outputSize, const 
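**NOTE**: Dropping the `atomicAdd` counter makes output slots deterministic: each anchor/cell pair owns index `z * gridY * gridX + y * gridX + x`, shifted by `lastInputSize` past the earlier YOLO heads (the source writes `y_id * gridSizeY`, which coincides with `y_id * gridSizeX` only on square grids). A sketch of the arithmetic, with head sizes assumed for a 640x640 input:

```
def slot(z, y, x, grid_x, grid_y, offset):
    """Fixed output slot for anchor z at cell (y, x), shifted past earlier heads."""
    return z * grid_y * grid_x + y * grid_x + x + offset

offset = 0
for grid_y, grid_x, anchors in [(80, 80, 3), (40, 40, 3), (20, 20, 3)]:  # assumed heads
    print(slot(anchors - 1, grid_y - 1, grid_x - 1, grid_x, grid_y, offset))  # last slot per head
    offset += anchors * grid_y * grid_x
print(offset)  # 25200, the plugin's outputSize for these heads
```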
uint& netWidth, const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, - const uint& numOutputClasses, const uint& numBBoxes, const float& scaleXY, const void* anchors, const void* mask, - cudaStream_t stream); +cudaError_t cudaYoloLayer(const void* input, void* boxes, void* scores, void* classes, const uint& batchSize, + const uint64_t& inputSize, const uint64_t& outputSize, const uint64_t& lastInputSize, const uint& netWidth, + const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes, + const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream); -cudaError_t cudaYoloLayer(const void* input, void* output, void* count, const uint& batchSize, uint64_t& inputSize, - uint64_t& outputSize, const uint& netWidth, const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, - const uint& numOutputClasses, const uint& numBBoxes, const float& scaleXY, const void* anchors, const void* mask, - cudaStream_t stream) +cudaError_t cudaYoloLayer(const void* input, void* boxes, void* scores, void* classes, const uint& batchSize, + const uint64_t& inputSize, const uint64_t& outputSize, const uint64_t& lastInputSize, const uint& netWidth, + const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes, + const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream) { dim3 threads_per_block(16, 16, 4); dim3 number_of_blocks((gridSizeX / threads_per_block.x) + 1, (gridSizeY / threads_per_block.y) + 1, @@ -75,9 +74,10 @@ cudaError_t cudaYoloLayer(const void* input, void* output, void* count, const ui for (unsigned int batch = 0; batch < batchSize; ++batch) { gpuYoloLayer<<>>( reinterpret_cast (input) + (batch * inputSize), - reinterpret_cast (output) + (batch * 6 * outputSize), - reinterpret_cast (count) + (batch), - netWidth, netHeight, gridSizeX, gridSizeY, numOutputClasses, numBBoxes, scaleXY, + reinterpret_cast (boxes) + (batch * 4 * outputSize), + reinterpret_cast (scores) + (batch * 1 * outputSize), + reinterpret_cast (classes) + (batch * 1 * outputSize), + netWidth, netHeight, gridSizeX, gridSizeY, numOutputClasses, numBBoxes, lastInputSize, scaleXY, reinterpret_cast (anchors), reinterpret_cast (mask)); } return cudaGetLastError(); diff --git a/nvdsinfer_custom_impl_Yolo/yoloForward_nc.cu b/nvdsinfer_custom_impl_Yolo/yoloForward_nc.cu index e3cbc7f..03c01f0 100644 --- a/nvdsinfer_custom_impl_Yolo/yoloForward_nc.cu +++ b/nvdsinfer_custom_impl_Yolo/yoloForward_nc.cu @@ -5,9 +5,9 @@ #include -__global__ void gpuYoloLayer_nc(const float* input, float* output, int* count, const uint netWidth, const uint netHeight, - const uint gridSizeX, const uint gridSizeY, const uint numOutputClasses, const uint numBBoxes, const float scaleXY, - const float* anchors, const int* mask) +__global__ void gpuYoloLayer_nc(const float* input, float* boxes, float* scores, int* classes, const uint netWidth, + const uint netHeight, const uint gridSizeX, const uint gridSizeY, const uint numOutputClasses, const uint numBBoxes, + const uint64_t lastInputSize, const float scaleXY, const float* anchors, const int* mask) { uint x_id = blockIdx.x * blockDim.x + threadIdx.x; uint y_id = blockIdx.y * blockDim.y + threadIdx.y; @@ -19,8 +19,6 @@ __global__ void gpuYoloLayer_nc(const float* input, float* output, int* count, c const int numGridCells = gridSizeX * gridSizeY; const int bbindex = y_id * gridSizeX + x_id; - const float objectness = input[bbindex + 
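**NOTE**: With one buffer per output, the launcher above advances `boxes` by `4 * outputSize` floats per batch item and `scores`/`classes` by `outputSize` elements. The same layout expressed as numpy views, with sizes assumed:

```
import numpy as np

batch, output_size = 2, 25200  # assumed sizes
boxes = np.zeros(batch * 4 * output_size, np.float32).reshape(batch, output_size, 4)
scores = np.zeros(batch * output_size, np.float32).reshape(batch, output_size, 1)

# Per-batch pointer offsets from the launcher, as element strides:
print(boxes.strides[0] // 4, scores.strides[0] // 4)  # 100800 25200
```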
numGridCells * (z_id * (5 + numOutputClasses) + 4)]; - const float alpha = scaleXY; const float beta = -0.5 * (scaleXY - 1); @@ -34,6 +32,8 @@ __global__ void gpuYoloLayer_nc(const float* input, float* output, int* count, c float h = __powf(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 3)] * 2, 2) * anchors[mask[z_id] * 2 + 1]; + const float objectness = input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)]; + float maxProb = 0.0f; int maxIndex = -1; @@ -45,25 +45,25 @@ __global__ void gpuYoloLayer_nc(const float* input, float* output, int* count, c } } - int _count = (int)atomicAdd(count, 1); + int count = z_id * gridSizeX * gridSizeY + y_id * gridSizeY + x_id + lastInputSize; - output[_count * 6 + 0] = xc; - output[_count * 6 + 1] = yc; - output[_count * 6 + 2] = w; - output[_count * 6 + 3] = h; - output[_count * 6 + 4] = maxProb * objectness; - output[_count * 6 + 5] = maxIndex; + boxes[count * 4 + 0] = xc; + boxes[count * 4 + 1] = yc; + boxes[count * 4 + 2] = w; + boxes[count * 4 + 3] = h; + scores[count] = maxProb * objectness; + classes[count] = maxIndex; } -cudaError_t cudaYoloLayer_nc(const void* input, void* output, void* count, const uint& batchSize, uint64_t& inputSize, - uint64_t& outputSize, const uint& netWidth, const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, - const uint& numOutputClasses, const uint& numBBoxes, const float& scaleXY, const void* anchors, const void* mask, - cudaStream_t stream); +cudaError_t cudaYoloLayer_nc(const void* input, void* boxes, void* scores, void* classes, const uint& batchSize, + const uint64_t& inputSize, const uint64_t& outputSize, const uint64_t& lastInputSize, const uint& netWidth, + const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes, + const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream); -cudaError_t cudaYoloLayer_nc(const void* input, void* output, void* count, const uint& batchSize, uint64_t& inputSize, - uint64_t& outputSize, const uint& netWidth, const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, - const uint& numOutputClasses, const uint& numBBoxes, const float& scaleXY, const void* anchors, const void* mask, - cudaStream_t stream) +cudaError_t cudaYoloLayer_nc(const void* input, void* boxes, void* scores, void* classes, const uint& batchSize, + const uint64_t& inputSize, const uint64_t& outputSize, const uint64_t& lastInputSize, const uint& netWidth, + const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes, + const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream) { dim3 threads_per_block(16, 16, 4); dim3 number_of_blocks((gridSizeX / threads_per_block.x) + 1, (gridSizeY / threads_per_block.y) + 1, @@ -72,9 +72,10 @@ cudaError_t cudaYoloLayer_nc(const void* input, void* output, void* count, const for (unsigned int batch = 0; batch < batchSize; ++batch) { gpuYoloLayer_nc<<>>( reinterpret_cast (input) + (batch * inputSize), - reinterpret_cast (output) + (batch * 6 * outputSize), - reinterpret_cast (count) + (batch), - netWidth, netHeight, gridSizeX, gridSizeY, numOutputClasses, numBBoxes, scaleXY, + reinterpret_cast (boxes) + (batch * 4 * outputSize), + reinterpret_cast (scores) + (batch * 1 * outputSize), + reinterpret_cast (classes) + (batch * 1 * outputSize), + netWidth, netHeight, gridSizeX, gridSizeY, numOutputClasses, numBBoxes, lastInputSize, scaleXY, reinterpret_cast 
(anchors), reinterpret_cast (mask)); } return cudaGetLastError(); diff --git a/nvdsinfer_custom_impl_Yolo/yoloForward_v2.cu b/nvdsinfer_custom_impl_Yolo/yoloForward_v2.cu index c13a1f0..c5ebc4a 100644 --- a/nvdsinfer_custom_impl_Yolo/yoloForward_v2.cu +++ b/nvdsinfer_custom_impl_Yolo/yoloForward_v2.cu @@ -27,9 +27,9 @@ __device__ void softmaxGPU(const float* input, const int bbindex, const int numG } } -__global__ void gpuRegionLayer(const float* input, float* softmax, float* output, int* count, const uint netWidth, - const uint netHeight, const uint gridSizeX, const uint gridSizeY, const uint numOutputClasses, const uint numBBoxes, - const float* anchors) +__global__ void gpuRegionLayer(const float* input, float* softmax, float* boxes, float* scores, int* classes, + const uint netWidth, const uint netHeight, const uint gridSizeX, const uint gridSizeY, const uint numOutputClasses, + const uint numBBoxes, const uint64_t lastInputSize, const float* anchors) { uint x_id = blockIdx.x * blockDim.x + threadIdx.x; uint y_id = blockIdx.y * blockDim.y + threadIdx.y; @@ -41,8 +41,6 @@ __global__ void gpuRegionLayer(const float* input, float* softmax, float* output const int numGridCells = gridSizeX * gridSizeY; const int bbindex = y_id * gridSizeX + x_id; - const float objectness = sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)]); - float xc = (sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 0)]) + x_id) * netWidth / gridSizeX; float yc = (sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 1)]) + y_id) * netHeight / gridSizeY; @@ -53,6 +51,8 @@ __global__ void gpuRegionLayer(const float* input, float* softmax, float* output float h = __expf(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 3)]) * anchors[z_id * 2 + 1] * netHeight / gridSizeY; + const float objectness = sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)]); + softmaxGPU(input, bbindex, numGridCells, z_id, numOutputClasses, 1.0, softmax); float maxProb = 0.0f; @@ -66,23 +66,25 @@ __global__ void gpuRegionLayer(const float* input, float* softmax, float* output } } - int _count = (int)atomicAdd(count, 1); + int count = z_id * gridSizeX * gridSizeY + y_id * gridSizeY + x_id + lastInputSize; - output[_count * 6 + 0] = xc; - output[_count * 6 + 1] = yc; - output[_count * 6 + 2] = w; - output[_count * 6 + 3] = h; - output[_count * 6 + 4] = maxProb * objectness; - output[_count * 6 + 5] = maxIndex; + boxes[count * 4 + 0] = xc; + boxes[count * 4 + 1] = yc; + boxes[count * 4 + 2] = w; + boxes[count * 4 + 3] = h; + scores[count] = maxProb * objectness; + classes[count] = maxIndex; } -cudaError_t cudaRegionLayer(const void* input, void* softmax, void* output, void* count, const uint& batchSize, - uint64_t& inputSize, uint64_t& outputSize, const uint& netWidth, const uint& netHeight, const uint& gridSizeX, - const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes, const void* anchors, cudaStream_t stream); +cudaError_t cudaRegionLayer(const void* input, void* softmax, void* boxes, void* scores, void* classes, + const uint& batchSize, const uint64_t& inputSize, const uint64_t& outputSize, const uint64_t& lastInputSize, + const uint& netWidth, const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, + const uint& numBBoxes, const void* anchors, cudaStream_t stream); -cudaError_t cudaRegionLayer(const void* input, void* softmax, void* output, void* count, const uint& 
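**NOTE**: The region (YOLOv2) kernel combines sigmoid box offsets, exponential anchor scaling, a softmax over the class logits, and sigmoid objectness. A numpy sketch decoding a single prediction under the same formulas; the 13x13 grid, 416x416 input, and anchor values are assumptions:

```
import numpy as np

def decode_region_cell(p, anchor, cell, grid, net):
    """Decode one 'region' prediction p = [tx, ty, tw, th, to, class logits...]."""
    sig = lambda v: 1.0 / (1.0 + np.exp(-v))
    (gx, gy), (nw, nh) = grid, net
    x, y = cell
    xc = (sig(p[0]) + x) * nw / gx
    yc = (sig(p[1]) + y) * nh / gy
    w = np.exp(p[2]) * anchor[0] * nw / gx  # region anchors are in grid units
    h = np.exp(p[3]) * anchor[1] * nh / gy
    e = np.exp(p[5:] - p[5:].max())
    probs = e / e.sum()                     # what softmaxGPU computes per cell
    cls = int(probs.argmax())
    return (xc, yc, w, h), sig(p[4]) * probs[cls], cls

box, score, cls = decode_region_cell(np.zeros(25), (1.3, 1.7), (6, 6), (13, 13), (416, 416))
```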
batchSize, - uint64_t& inputSize, uint64_t& outputSize, const uint& netWidth, const uint& netHeight, const uint& gridSizeX, - const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes, const void* anchors, cudaStream_t stream) +cudaError_t cudaRegionLayer(const void* input, void* softmax, void* boxes, void* scores, void* classes, + const uint& batchSize, const uint64_t& inputSize, const uint64_t& outputSize, const uint64_t& lastInputSize, + const uint& netWidth, const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, + const uint& numBBoxes, const void* anchors, cudaStream_t stream) { dim3 threads_per_block(16, 16, 4); dim3 number_of_blocks((gridSizeX / threads_per_block.x) + 1, (gridSizeY / threads_per_block.y) + 1, @@ -92,9 +94,10 @@ cudaError_t cudaRegionLayer(const void* input, void* softmax, void* output, void gpuRegionLayer<<>>( reinterpret_cast (input) + (batch * inputSize), reinterpret_cast (softmax) + (batch * inputSize), - reinterpret_cast (output) + (batch * 6 * outputSize), - reinterpret_cast (count) + (batch), - netWidth, netHeight, gridSizeX, gridSizeY, numOutputClasses, numBBoxes, + reinterpret_cast (boxes) + (batch * 4 * outputSize), + reinterpret_cast (scores) + (batch * 1 * outputSize), + reinterpret_cast (classes) + (batch * 1 * outputSize), + netWidth, netHeight, gridSizeX, gridSizeY, numOutputClasses, numBBoxes, lastInputSize, reinterpret_cast (anchors)); } return cudaGetLastError(); diff --git a/nvdsinfer_custom_impl_Yolo/yoloPlugins.cpp b/nvdsinfer_custom_impl_Yolo/yoloPlugins.cpp index 4f54a1f..5c921b9 100644 --- a/nvdsinfer_custom_impl_Yolo/yoloPlugins.cpp +++ b/nvdsinfer_custom_impl_Yolo/yoloPlugins.cpp @@ -38,19 +38,20 @@ namespace { } } -cudaError_t cudaYoloLayer_nc(const void* input, void* output, void* count, const uint& batchSize, uint64_t& inputSize, - uint64_t& outputSize, const uint& netWidth, const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, - const uint& numOutputClasses, const uint& numBBoxes, const float& scaleXY, const void* anchors, const void* mask, - cudaStream_t stream); +cudaError_t cudaYoloLayer_nc(const void* input, void* boxes, void* scores, void* classes, const uint& batchSize, + const uint64_t& inputSize, const uint64_t& outputSize, const uint64_t& lastInputSize, const uint& netWidth, + const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes, + const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream); -cudaError_t cudaYoloLayer(const void* input, void* output, void* count, const uint& batchSize, uint64_t& inputSize, - uint64_t& outputSize, const uint& netWidth, const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, - const uint& numOutputClasses, const uint& numBBoxes, const float& scaleXY, const void* anchors, const void* mask, - cudaStream_t stream); +cudaError_t cudaYoloLayer(const void* input, void* boxes, void* scores, void* classes, const uint& batchSize, + const uint64_t& inputSize, const uint64_t& outputSize, const uint64_t& lastInputSize, const uint& netWidth, + const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes, + const float& scaleXY, const void* anchors, const void* mask, cudaStream_t stream); -cudaError_t cudaRegionLayer(const void* input, void* softmax, void* output, void* count, const uint& batchSize, - uint64_t& inputSize, uint64_t& outputSize, const uint& netWidth, const uint& 
netHeight, const uint& gridSizeX, - const uint& gridSizeY, const uint& numOutputClasses, const uint& numBBoxes, const void* anchors, cudaStream_t stream); +cudaError_t cudaRegionLayer(const void* input, void* softmax, void* boxes, void* scores, void* classes, + const uint& batchSize, const uint64_t& inputSize, const uint64_t& outputSize, const uint64_t& lastInputSize, + const uint& netWidth, const uint& netHeight, const uint& gridSizeX, const uint& gridSizeY, const uint& numOutputClasses, + const uint& numBBoxes, const void* anchors, cudaStream_t stream); YoloLayer::YoloLayer(const void* data, size_t length) { const char* d = static_cast(data); @@ -99,96 +100,10 @@ YoloLayer::YoloLayer(const uint& netWidth, const uint& netHeight, const uint& nu assert(m_NetHeight > 0); }; -nvinfer1::Dims -YoloLayer::getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) noexcept +nvinfer1::IPluginV2DynamicExt* +YoloLayer::clone() const noexcept { - assert(index == 0); - return nvinfer1::Dims{2, {static_cast(m_OutputSize), 6}}; -} - -bool -YoloLayer::supportsFormat(nvinfer1::DataType type, nvinfer1::PluginFormat format) const noexcept { - return (type == nvinfer1::DataType::kFLOAT && format == nvinfer1::PluginFormat::kLINEAR); -} - -void -YoloLayer::configureWithFormat(const nvinfer1::Dims* inputDims, int nbInputs, const nvinfer1::Dims* outputDims, - int nbOutputs, nvinfer1::DataType type, nvinfer1::PluginFormat format, int maxBatchSize) noexcept -{ - assert(nbInputs > 0); - assert(format == nvinfer1::PluginFormat::kLINEAR); - assert(inputDims != nullptr); -} - -#ifdef LEGACY -int -YoloLayer::enqueue(int batchSize, const void* const* inputs, void** outputs, void* workspace, cudaStream_t stream) -#else -int32_t -YoloLayer::enqueue(int batchSize, void const* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) - noexcept -#endif -{ - void* output = outputs[0]; - CUDA_CHECK(cudaMemsetAsync((float*) output, 0, sizeof(float) * m_OutputSize * 6 * batchSize, stream)); - - void* count = workspace; - CUDA_CHECK(cudaMemsetAsync((int*) count, 0, sizeof(int) * batchSize, stream)); - - uint yoloTensorsSize = m_YoloTensors.size(); - for (uint i = 0; i < yoloTensorsSize; ++i) { - TensorInfo& curYoloTensor = m_YoloTensors.at(i); - - uint numBBoxes = curYoloTensor.numBBoxes; - float scaleXY = curYoloTensor.scaleXY; - uint gridSizeX = curYoloTensor.gridSizeX; - uint gridSizeY = curYoloTensor.gridSizeY; - std::vector anchors = curYoloTensor.anchors; - std::vector mask = curYoloTensor.mask; - - void* v_anchors; - void* v_mask; - if (anchors.size() > 0) { - CUDA_CHECK(cudaMalloc(&v_anchors, sizeof(float) * anchors.size())); - CUDA_CHECK(cudaMemcpyAsync(v_anchors, anchors.data(), sizeof(float) * anchors.size(), cudaMemcpyHostToDevice, stream)); - } - if (mask.size() > 0) { - CUDA_CHECK(cudaMalloc(&v_mask, sizeof(int) * mask.size())); - CUDA_CHECK(cudaMemcpyAsync(v_mask, mask.data(), sizeof(int) * mask.size(), cudaMemcpyHostToDevice, stream)); - } - - uint64_t inputSize = gridSizeX * gridSizeY * (numBBoxes * (4 + 1 + m_NumClasses)); - - if (mask.size() > 0) { - if (m_NewCoords) { - CUDA_CHECK(cudaYoloLayer_nc(inputs[i], output, count, batchSize, inputSize, m_OutputSize, m_NetWidth, m_NetHeight, - gridSizeX, gridSizeY, m_NumClasses, numBBoxes, scaleXY, v_anchors, v_mask, stream)); - } - else { - CUDA_CHECK(cudaYoloLayer(inputs[i], output, count, batchSize, inputSize, m_OutputSize, m_NetWidth, m_NetHeight, - gridSizeX, gridSizeY, m_NumClasses, numBBoxes, scaleXY, v_anchors, v_mask, 
stream)); - } - } - else { - void* softmax; - CUDA_CHECK(cudaMalloc(&softmax, sizeof(float) * inputSize * batchSize)); - CUDA_CHECK(cudaMemsetAsync((float*)softmax, 0, sizeof(float) * inputSize * batchSize, stream)); - - CUDA_CHECK(cudaRegionLayer(inputs[i], softmax, output, count, batchSize, inputSize, m_OutputSize, m_NetWidth, - m_NetHeight, gridSizeX, gridSizeY, m_NumClasses, numBBoxes, v_anchors, stream)); - - CUDA_CHECK(cudaFree(softmax)); - } - - if (anchors.size() > 0) { - CUDA_CHECK(cudaFree(v_anchors)); - } - if (mask.size() > 0) { - CUDA_CHECK(cudaFree(v_mask)); - } - } - - return 0; + return new YoloLayer(m_NetWidth, m_NetHeight, m_NumClasses, m_NewCoords, m_YoloTensors, m_OutputSize); } size_t @@ -250,10 +165,113 @@ YoloLayer::serialize(void* buffer) const noexcept } } -nvinfer1::IPluginV2* -YoloLayer::clone() const noexcept +nvinfer1::DimsExprs +YoloLayer::getOutputDimensions(INT index, const nvinfer1::DimsExprs* inputs, INT nbInputDims, + nvinfer1::IExprBuilder& exprBuilder)noexcept { - return new YoloLayer(m_NetWidth, m_NetHeight, m_NumClasses, m_NewCoords, m_YoloTensors, m_OutputSize); + assert(index < 3); + if (index == 0) { + return nvinfer1::DimsExprs{3, {inputs->d[0], exprBuilder.constant(static_cast(m_OutputSize)), + exprBuilder.constant(4)}}; + } + return nvinfer1::DimsExprs{3, {inputs->d[0], exprBuilder.constant(static_cast(m_OutputSize)), + exprBuilder.constant(1)}}; +} + +bool +YoloLayer::supportsFormatCombination(INT pos, const nvinfer1::PluginTensorDesc* inOut, INT nbInputs, INT nbOutputs) noexcept +{ + return inOut[pos].format == nvinfer1::TensorFormat::kLINEAR && (inOut[pos].type == nvinfer1::DataType::kFLOAT || + inOut[pos].type == nvinfer1::DataType::kINT32); +} + +nvinfer1::DataType +YoloLayer::getOutputDataType(INT index, const nvinfer1::DataType* inputTypes, INT nbInputs) const noexcept +{ + assert(index < 3); + if (index == 2) { + return nvinfer1::DataType::kINT32; + } + return nvinfer1::DataType::kFLOAT; +} + +void +YoloLayer::configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, INT nbInput, + const nvinfer1::DynamicPluginTensorDesc* out, INT nbOutput) noexcept +{ + assert(nbInput > 0); + assert(in->desc.format == nvinfer1::PluginFormat::kLINEAR); + assert(in->desc.dims.d != nullptr); +} + +INT +YoloLayer::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, const nvinfer1::PluginTensorDesc* outputDesc, + void const* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) noexcept +{ + INT batchSize = inputDesc[0].dims.d[0]; + + void* boxes = outputs[0]; + void* scores = outputs[1]; + void* classes = outputs[2]; + + uint64_t lastInputSize = 0; + + uint yoloTensorsSize = m_YoloTensors.size(); + for (uint i = 0; i < yoloTensorsSize; ++i) { + TensorInfo& curYoloTensor = m_YoloTensors.at(i); + + const uint numBBoxes = curYoloTensor.numBBoxes; + const float scaleXY = curYoloTensor.scaleXY; + const uint gridSizeX = curYoloTensor.gridSizeX; + const uint gridSizeY = curYoloTensor.gridSizeY; + const std::vector anchors = curYoloTensor.anchors; + const std::vector mask = curYoloTensor.mask; + + void* v_anchors; + void* v_mask; + if (anchors.size() > 0) { + CUDA_CHECK(cudaMalloc(&v_anchors, sizeof(float) * anchors.size())); + CUDA_CHECK(cudaMemcpyAsync(v_anchors, anchors.data(), sizeof(float) * anchors.size(), cudaMemcpyHostToDevice, stream)); + } + if (mask.size() > 0) { + CUDA_CHECK(cudaMalloc(&v_mask, sizeof(int) * mask.size())); + CUDA_CHECK(cudaMemcpyAsync(v_mask, mask.data(), sizeof(int) * mask.size(), cudaMemcpyHostToDevice, 
stream)); + } + + const uint64_t inputSize = (numBBoxes * (4 + 1 + m_NumClasses)) * gridSizeY * gridSizeX; + + if (mask.size() > 0) { + if (m_NewCoords) { + CUDA_CHECK(cudaYoloLayer_nc(inputs[i], boxes, scores, classes, batchSize, inputSize, m_OutputSize, lastInputSize, + m_NetWidth, m_NetHeight, gridSizeX, gridSizeY, m_NumClasses, numBBoxes, scaleXY, v_anchors, v_mask, stream)); + } + else { + CUDA_CHECK(cudaYoloLayer(inputs[i], boxes, scores, classes, batchSize, inputSize, m_OutputSize, lastInputSize, + m_NetWidth, m_NetHeight, gridSizeX, gridSizeY, m_NumClasses, numBBoxes, scaleXY, v_anchors, v_mask, stream)); + } + } + else { + void* softmax; + CUDA_CHECK(cudaMalloc(&softmax, sizeof(float) * inputSize * batchSize)); + CUDA_CHECK(cudaMemsetAsync((float*)softmax, 0, sizeof(float) * inputSize * batchSize, stream)); + + CUDA_CHECK(cudaRegionLayer(inputs[i], softmax, boxes, scores, classes, batchSize, inputSize, m_OutputSize, + lastInputSize, m_NetWidth, m_NetHeight, gridSizeX, gridSizeY, m_NumClasses, numBBoxes, v_anchors, stream)); + + CUDA_CHECK(cudaFree(softmax)); + } + + if (anchors.size() > 0) { + CUDA_CHECK(cudaFree(v_anchors)); + } + if (mask.size() > 0) { + CUDA_CHECK(cudaFree(v_mask)); + } + + lastInputSize += numBBoxes * gridSizeY * gridSizeX; + } + + return 0; } REGISTER_TENSORRT_PLUGIN(YoloLayerPluginCreator); diff --git a/nvdsinfer_custom_impl_Yolo/yoloPlugins.h b/nvdsinfer_custom_impl_Yolo/yoloPlugins.h index e586b7e..8cee9ff 100644 --- a/nvdsinfer_custom_impl_Yolo/yoloPlugins.h +++ b/nvdsinfer_custom_impl_Yolo/yoloPlugins.h @@ -38,57 +38,68 @@ } \ } +#if NV_TENSORRT_MAJOR >= 8 + #define INT int32_t +#else + #define INT int +#endif + namespace { const char* YOLOLAYER_PLUGIN_VERSION {"1"}; const char* YOLOLAYER_PLUGIN_NAME {"YoloLayer_TRT"}; } // namespace -class YoloLayer : public nvinfer1::IPluginV2 { +class YoloLayer : public nvinfer1::IPluginV2DynamicExt { public: YoloLayer(const void* data, size_t length); YoloLayer(const uint& netWidth, const uint& netHeight, const uint& numClasses, const uint& newCoords, const std::vector& yoloTensors, const uint64_t& outputSize); - const char* getPluginType() const noexcept override { return YOLOLAYER_PLUGIN_NAME; } - - const char* getPluginVersion() const noexcept override { return YOLOLAYER_PLUGIN_VERSION; } - - int getNbOutputs() const noexcept override { return 1; } - - nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) noexcept override; - - bool supportsFormat(nvinfer1::DataType type, nvinfer1::PluginFormat format) const noexcept override; - - void configureWithFormat(const nvinfer1::Dims* inputDims, int nbInputs, const nvinfer1::Dims* outputDims, int nbOutputs, - nvinfer1::DataType type, nvinfer1::PluginFormat format, int maxBatchSize) noexcept override; + nvinfer1::IPluginV2DynamicExt* clone() const noexcept override; int initialize() noexcept override { return 0; } void terminate() noexcept override {} - size_t getWorkspaceSize(int maxBatchSize) const noexcept override { - return maxBatchSize * sizeof(int); - } - -#ifdef LEGACY - int enqueue(int batchSize, const void* const* inputs, void** outputs, void* workspace, cudaStream_t stream) override; -#else - int32_t enqueue(int batchSize, void const* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) - noexcept override; -#endif + void destroy() noexcept override { delete this; } size_t getSerializationSize() const noexcept override; void serialize(void* buffer) const noexcept override; - void destroy() noexcept 
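**NOTE**: `REGISTER_TENSORRT_PLUGIN(YoloLayerPluginCreator)` runs when the library is loaded, so anything deserializing the engine outside DeepStream must load the `.so` first, otherwise `YoloLayer_TRT` cannot be resolved. A Python sketch; the library path is an assumption:

```
import ctypes
import tensorrt as trt

# Loading the custom library runs REGISTER_TENSORRT_PLUGIN(YoloLayerPluginCreator),
# making YoloLayer_TRT available before engine deserialization.
ctypes.CDLL('./nvdsinfer_custom_impl_Yolo/libnvdsinfer_custom_impl_Yolo.so')

registry = trt.get_plugin_registry()
print([c.name for c in registry.plugin_creator_list])  # should include YoloLayer_TRT
```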
override { delete this; } + int getNbOutputs() const noexcept override { return 3; } - nvinfer1::IPluginV2* clone() const noexcept override; + nvinfer1::DimsExprs getOutputDimensions(INT index, const nvinfer1::DimsExprs* inputs, INT nbInputDims, + nvinfer1::IExprBuilder& exprBuilder) noexcept override; + + size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, INT nbInputs, + const nvinfer1::PluginTensorDesc* outputs, INT nbOutputs) const noexcept override { return 0; } + + bool supportsFormatCombination(INT pos, const nvinfer1::PluginTensorDesc* inOut, INT nbInputs, INT nbOutputs) noexcept + override; + + const char* getPluginType() const noexcept override { return YOLOLAYER_PLUGIN_NAME; } + + const char* getPluginVersion() const noexcept override { return YOLOLAYER_PLUGIN_VERSION; } void setPluginNamespace(const char* pluginNamespace) noexcept override { m_Namespace = pluginNamespace; } - virtual const char* getPluginNamespace() const noexcept override { return m_Namespace.c_str(); } + const char* getPluginNamespace() const noexcept override { return m_Namespace.c_str(); } + + nvinfer1::DataType getOutputDataType(INT index, const nvinfer1::DataType* inputTypes, INT nbInputs) const noexcept + override; + + void attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, nvinfer1::IGpuAllocator* gpuAllocator) + noexcept override {} + + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, INT nbInput, + const nvinfer1::DynamicPluginTensorDesc* out, INT nbOutput) noexcept override; + + void detachFromContext() noexcept override {} + + INT enqueue(const nvinfer1::PluginTensorDesc* inputDesc, const nvinfer1::PluginTensorDesc* outputDesc, + void const* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) noexcept override; private: std::string m_Namespace {""}; @@ -115,12 +126,14 @@ class YoloLayerPluginCreator : public nvinfer1::IPluginCreator { return nullptr; } - nvinfer1::IPluginV2* createPlugin(const char* name, const nvinfer1::PluginFieldCollection* fc) noexcept override { + nvinfer1::IPluginV2DynamicExt* createPlugin(const char* name, const nvinfer1::PluginFieldCollection* fc) noexcept + override { std::cerr<< "YoloLayerPluginCreator::getFieldNames is not implemented"; return nullptr; } - nvinfer1::IPluginV2* deserializePlugin(const char* name, const void* serialData, size_t serialLength) noexcept override { + nvinfer1::IPluginV2DynamicExt* deserializePlugin(const char* name, const void* serialData, size_t serialLength) noexcept + override { std::cout << "Deserialize yoloLayer plugin: " << name << std::endl; return new YoloLayer(serialData, serialLength); } diff --git a/utils/export_damoyolo.py b/utils/export_damoyolo.py index 01f69ea..1735df8 100644 --- a/utils/export_damoyolo.py +++ b/utils/export_damoyolo.py @@ -18,7 +18,7 @@ class DeepStreamOutput(nn.Module): def forward(self, x): boxes = x[1] scores, classes = torch.max(x[0], 2, keepdim=True) - return torch.cat((boxes, scores, classes.float()), dim=2) + return boxes, scores, classes def suppress_warnings(): @@ -65,21 +65,27 @@ def main(args): img_size = args.size * 2 if len(args.size) == 1 else args.size - onnx_input_im = torch.zeros(1, 3, *img_size).to(device) + onnx_input_im = torch.zeros(args.batch, 3, *img_size).to(device) onnx_output_file = cfg.miscs['exp_name'] + '.onnx' dynamic_axes = { 'input': { 0: 'batch' }, - 'output': { + 'boxes': { + 0: 'batch' + }, + 'scores': { + 0: 'batch' + }, + 'classes': { 0: 'batch' } } print('Exporting the model to ONNX') 
torch.onnx.export(model, onnx_input_im, onnx_output_file, verbose=False, opset_version=args.opset, - do_constant_folding=True, input_names=['input'], output_names=['output'], + do_constant_folding=True, input_names=['input'], output_names=['boxes', 'scores', 'classes'], dynamic_axes=dynamic_axes if args.dynamic else None) if args.simplify: @@ -100,11 +106,14 @@ def parse_args(): parser.add_argument('--opset', type=int, default=11, help='ONNX opset version') parser.add_argument('--simplify', action='store_true', help='ONNX simplify model') parser.add_argument('--dynamic', action='store_true', help='Dynamic batch-size') + parser.add_argument('--batch', type=int, default=1, help='Implicit batch-size') args = parser.parse_args() if not os.path.isfile(args.weights): raise SystemExit('Invalid weights file') if not os.path.isfile(args.config): raise SystemExit('Invalid config file') + if args.dynamic and args.batch > 1: + raise SystemExit('Cannot set dynamic batch-size and implicit batch-size at same time') return args diff --git a/utils/export_ppyoloe.py b/utils/export_ppyoloe.py index a03c26d..eee6236 100644 --- a/utils/export_ppyoloe.py +++ b/utils/export_ppyoloe.py @@ -19,8 +19,8 @@ class DeepStreamOutput(nn.Layer): boxes = x['bbox'] x['bbox_num'] = x['bbox_num'].transpose([0, 2, 1]) scores = paddle.max(x['bbox_num'], 2, keepdim=True) - classes = paddle.cast(paddle.argmax(x['bbox_num'], 2, keepdim=True), dtype='float32') - return paddle.concat((boxes, scores, classes), axis=2) + classes = paddle.argmax(x['bbox_num'], 2, keepdim=True) + return boxes, scores, classes def ppyoloe_export(FLAGS): @@ -65,8 +65,8 @@ def main(FLAGS): img_size = [cfg.eval_height, cfg.eval_width] onnx_input_im = {} - onnx_input_im['image'] = paddle.static.InputSpec(shape=[None, 3, *img_size], dtype='float32', name='image') - onnx_input_im['scale_factor'] = paddle.static.InputSpec(shape=[None, 2], dtype='float32', name='scale_factor') + onnx_input_im['image'] = paddle.static.InputSpec(shape=[FLAGS.batch, 3, *img_size], dtype='float32', name='image') + onnx_input_im['scale_factor'] = paddle.static.InputSpec(shape=[FLAGS.batch, 2], dtype='float32', name='scale_factor') onnx_output_file = cfg.filename + '.onnx' print('\nExporting the model to ONNX\n') @@ -88,7 +88,15 @@ def parse_args(): parser.add_argument('--slim_config', default=None, type=str, help='Slim configuration file of slim method') parser.add_argument('--opset', type=int, default=11, help='ONNX opset version') parser.add_argument('--simplify', action='store_true', help='ONNX simplify model') + parser.add_argument('--dynamic', action='store_true', help='Dynamic batch-size') + parser.add_argument('--batch', type=int, default=1, help='Implicit batch-size') args = parser.parse_args() + if not os.path.isfile(args.weights): + raise SystemExit('\nInvalid weights file') + if args.dynamic and args.batch > 1: + raise SystemExit('\nCannot set dynamic batch-size and implicit batch-size at same time') + elif args.dynamic: + args.batch = None return args diff --git a/utils/export_yoloV5.py b/utils/export_yoloV5.py index f5ca1d1..6d1d345 100644 --- a/utils/export_yoloV5.py +++ b/utils/export_yoloV5.py @@ -19,7 +19,8 @@ class DeepStreamOutput(nn.Module): boxes = x[:, :, :4] objectness = x[:, :, 4:5] scores, classes = torch.max(x[:, :, 5:], 2, keepdim=True) - return torch.cat((boxes, scores * objectness, classes.float()), dim=2) + scores *= objectness + return boxes, scores, classes def suppress_warnings(): @@ -63,21 +64,27 @@ def main(args): if img_size == [640, 640] and 
args.p6:
         img_size = [1280] * 2
 
-    onnx_input_im = torch.zeros(1, 3, *img_size).to(device)
+    onnx_input_im = torch.zeros(args.batch, 3, *img_size).to(device)
     onnx_output_file = os.path.basename(args.weights).split('.pt')[0] + '.onnx'
 
     dynamic_axes = {
         'input': {
             0: 'batch'
         },
-        'output': {
+        'boxes': {
+            0: 'batch'
+        },
+        'scores': {
+            0: 'batch'
+        },
+        'classes': {
             0: 'batch'
         }
     }
 
     print('\nExporting the model to ONNX')
     torch.onnx.export(model, onnx_input_im, onnx_output_file, verbose=False, opset_version=args.opset,
-                      do_constant_folding=True, input_names=['input'], output_names=['output'],
+                      do_constant_folding=True, input_names=['input'], output_names=['boxes', 'scores', 'classes'],
                       dynamic_axes=dynamic_axes if args.dynamic else None)
 
     if args.simplify:
@@ -98,9 +105,12 @@ def parse_args():
     parser.add_argument('--opset', type=int, default=17, help='ONNX opset version')
     parser.add_argument('--simplify', action='store_true', help='ONNX simplify model')
     parser.add_argument('--dynamic', action='store_true', help='Dynamic batch-size')
+    parser.add_argument('--batch', type=int, default=1, help='Implicit batch-size')
     args = parser.parse_args()
     if not os.path.isfile(args.weights):
         raise SystemExit('Invalid weights file')
+    if args.dynamic and args.batch > 1:
+        raise SystemExit('Cannot set dynamic batch-size and implicit batch-size at same time')
     return args
 
 
diff --git a/utils/export_yoloV6.py b/utils/export_yoloV6.py
index 6daeb27..22950f2 100644
--- a/utils/export_yoloV6.py
+++ b/utils/export_yoloV6.py
@@ -23,7 +23,8 @@ class DeepStreamOutput(nn.Module):
         boxes = x[:, :, :4]
         objectness = x[:, :, 4:5]
         scores, classes = torch.max(x[:, :, 5:], 2, keepdim=True)
-        return torch.cat((boxes, scores * objectness, classes.float()), dim=2)
+        scores *= objectness
+        return boxes, scores, classes
 
 
 def suppress_warnings():
@@ -66,21 +67,27 @@ def main(args):
     if img_size == [640, 640] and args.p6:
         img_size = [1280] * 2
 
-    onnx_input_im = torch.zeros(1, 3, *img_size).to(device)
+    onnx_input_im = torch.zeros(args.batch, 3, *img_size).to(device)
     onnx_output_file = os.path.basename(args.weights).split('.pt')[0] + '.onnx'
 
     dynamic_axes = {
         'input': {
             0: 'batch'
         },
-        'output': {
+        'boxes': {
+            0: 'batch'
+        },
+        'scores': {
+            0: 'batch'
+        },
+        'classes': {
             0: 'batch'
         }
     }
 
     print('\nExporting the model to ONNX')
     torch.onnx.export(model, onnx_input_im, onnx_output_file, verbose=False, opset_version=args.opset,
-                      do_constant_folding=True, input_names=['input'], output_names=['output'],
+                      do_constant_folding=True, input_names=['input'], output_names=['boxes', 'scores', 'classes'],
                       dynamic_axes=dynamic_axes if args.dynamic else None)
 
     if args.simplify:
@@ -101,9 +108,12 @@ def parse_args():
     parser.add_argument('--opset', type=int, default=13, help='ONNX opset version')
     parser.add_argument('--simplify', action='store_true', help='ONNX simplify model')
     parser.add_argument('--dynamic', action='store_true', help='Dynamic batch-size')
+    parser.add_argument('--batch', type=int, default=1, help='Implicit batch-size')
     args = parser.parse_args()
     if not os.path.isfile(args.weights):
         raise SystemExit('Invalid weights file')
+    if args.dynamic and args.batch > 1:
+        raise SystemExit('Cannot set dynamic batch-size and implicit batch-size at same time')
     return args
 
 
diff --git a/utils/export_yoloV7.py b/utils/export_yoloV7.py
index 1dca872..24affb2 100644
--- a/utils/export_yoloV7.py
+++ b/utils/export_yoloV7.py
@@ -19,7 +19,8 @@ class DeepStreamOutput(nn.Module):
         boxes = x[:, :, :4]
         objectness = x[:, :, 4:5]
         scores, classes = torch.max(x[:, :, 5:], 2, keepdim=True)
-        return torch.cat((boxes, scores * objectness, classes.float()), dim=2)
+        scores *= objectness
+        return boxes, scores, classes
 
 
 def suppress_warnings():
@@ -67,21 +68,27 @@ def main(args):
     if img_size == [640, 640] and args.p6:
         img_size = [1280] * 2
 
-    onnx_input_im = torch.zeros(1, 3, *img_size).to(device)
+    onnx_input_im = torch.zeros(args.batch, 3, *img_size).to(device)
     onnx_output_file = os.path.basename(args.weights).split('.pt')[0] + '.onnx'
 
     dynamic_axes = {
         'input': {
             0: 'batch'
         },
-        'output': {
+        'boxes': {
+            0: 'batch'
+        },
+        'scores': {
+            0: 'batch'
+        },
+        'classes': {
             0: 'batch'
         }
     }
 
     print('\nExporting the model to ONNX')
     torch.onnx.export(model, onnx_input_im, onnx_output_file, verbose=False, opset_version=args.opset,
-                      do_constant_folding=True, input_names=['input'], output_names=['output'],
+                      do_constant_folding=True, input_names=['input'], output_names=['boxes', 'scores', 'classes'],
                       dynamic_axes=dynamic_axes if args.dynamic else None)
 
     if args.simplify:
@@ -102,9 +109,12 @@ def parse_args():
     parser.add_argument('--opset', type=int, default=12, help='ONNX opset version')
     parser.add_argument('--simplify', action='store_true', help='ONNX simplify model')
     parser.add_argument('--dynamic', action='store_true', help='Dynamic batch-size')
+    parser.add_argument('--batch', type=int, default=1, help='Implicit batch-size')
     args = parser.parse_args()
     if not os.path.isfile(args.weights):
         raise SystemExit('Invalid weights file')
+    if args.dynamic and args.batch > 1:
+        raise SystemExit('Cannot set dynamic batch-size and implicit batch-size at same time')
     return args
 
 
diff --git a/utils/export_yoloV7_u6.py b/utils/export_yoloV7_u6.py
index d0a1864..e8a4849 100644
--- a/utils/export_yoloV7_u6.py
+++ b/utils/export_yoloV7_u6.py
@@ -18,7 +18,7 @@ class DeepStreamOutput(nn.Module):
         x = x.transpose(1, 2)
         boxes = x[:, :, :4]
         scores, classes = torch.max(x[:, :, 4:], 2, keepdim=True)
-        return torch.cat((boxes, scores, classes.float()), dim=2)
+        return boxes, scores, classes
 
 
 def suppress_warnings():
@@ -59,21 +59,27 @@ def main(args):
     img_size = args.size * 2 if len(args.size) == 1 else args.size
 
-    onnx_input_im = torch.zeros(1, 3, *img_size).to(device)
+    onnx_input_im = torch.zeros(args.batch, 3, *img_size).to(device)
     onnx_output_file = os.path.basename(args.weights).split('.pt')[0] + '.onnx'
 
     dynamic_axes = {
         'input': {
             0: 'batch'
         },
-        'output': {
+        'boxes': {
+            0: 'batch'
+        },
+        'scores': {
+            0: 'batch'
+        },
+        'classes': {
             0: 'batch'
         }
     }
 
     print('\nExporting the model to ONNX')
     torch.onnx.export(model, onnx_input_im, onnx_output_file, verbose=False, opset_version=args.opset,
-                      do_constant_folding=True, input_names=['input'], output_names=['output'],
+                      do_constant_folding=True, input_names=['input'], output_names=['boxes', 'scores', 'classes'],
                       dynamic_axes=dynamic_axes if args.dynamic else None)
 
     if args.simplify:
@@ -93,9 +99,12 @@ def parse_args():
     parser.add_argument('--opset', type=int, default=12, help='ONNX opset version')
     parser.add_argument('--simplify', action='store_true', help='ONNX simplify model')
     parser.add_argument('--dynamic', action='store_true', help='Dynamic batch-size')
+    parser.add_argument('--batch', type=int, default=1, help='Implicit batch-size')
     args = parser.parse_args()
     if not os.path.isfile(args.weights):
         raise SystemExit('Invalid weights file')
+    if args.dynamic and args.batch > 1:
+        raise SystemExit('Cannot set dynamic batch-size and implicit batch-size at same time')
     return args
 
 
diff --git a/utils/export_yoloV8.py b/utils/export_yoloV8.py
index fe441ce..757b524 100644
--- a/utils/export_yoloV8.py
+++ b/utils/export_yoloV8.py
@@ -19,7 +19,7 @@ class DeepStreamOutput(nn.Module):
         x = x.transpose(1, 2)
         boxes = x[:, :, :4]
         scores, classes = torch.max(x[:, :, 4:], 2, keepdim=True)
-        return torch.cat((boxes, scores, classes.float()), dim=2)
+        return boxes, scores, classes
 
 
 def suppress_warnings():
@@ -67,21 +67,27 @@ def main(args):
     img_size = args.size * 2 if len(args.size) == 1 else args.size
 
-    onnx_input_im = torch.zeros(1, 3, *img_size).to(device)
+    onnx_input_im = torch.zeros(args.batch, 3, *img_size).to(device)
     onnx_output_file = os.path.basename(args.weights).split('.pt')[0] + '.onnx'
 
     dynamic_axes = {
         'input': {
             0: 'batch'
         },
-        'output': {
+        'boxes': {
+            0: 'batch'
+        },
+        'scores': {
+            0: 'batch'
+        },
+        'classes': {
             0: 'batch'
         }
     }
 
     print('\nExporting the model to ONNX')
     torch.onnx.export(model, onnx_input_im, onnx_output_file, verbose=False, opset_version=args.opset,
-                      do_constant_folding=True, input_names=['input'], output_names=['output'],
+                      do_constant_folding=True, input_names=['input'], output_names=['boxes', 'scores', 'classes'],
                       dynamic_axes=dynamic_axes if args.dynamic else None)
 
     if args.simplify:
@@ -101,9 +107,12 @@ def parse_args():
     parser.add_argument('--opset', type=int, default=16, help='ONNX opset version')
     parser.add_argument('--simplify', action='store_true', help='ONNX simplify model')
     parser.add_argument('--dynamic', action='store_true', help='Dynamic batch-size')
+    parser.add_argument('--batch', type=int, default=1, help='Implicit batch-size')
     args = parser.parse_args()
     if not os.path.isfile(args.weights):
         raise SystemExit('Invalid weights file')
+    if args.dynamic and args.batch > 1:
+        raise SystemExit('Cannot set dynamic batch-size and implicit batch-size at same time')
     return args
 
 
diff --git a/utils/export_yolonas.py b/utils/export_yolonas.py
index df7983d..327443a 100644
--- a/utils/export_yolonas.py
+++ b/utils/export_yolonas.py
@@ -15,7 +15,7 @@ class DeepStreamOutput(nn.Module):
     def forward(self, x):
         boxes = x[0]
         scores, classes = torch.max(x[1], 2, keepdim=True)
-        return torch.cat((boxes, scores, classes.float()), dim=2)
+        return boxes, scores, classes
 
 
 def suppress_warnings():
@@ -46,21 +46,27 @@ def main(args):
     img_size = args.size * 2 if len(args.size) == 1 else args.size
 
-    onnx_input_im = torch.zeros(1, 3, *img_size).to(device)
+    onnx_input_im = torch.zeros(args.batch, 3, *img_size).to(device)
     onnx_output_file = os.path.basename(args.weights).split('.pt')[0] + '.onnx'
 
     dynamic_axes = {
         'input': {
             0: 'batch'
         },
-        'output': {
+        'boxes': {
+            0: 'batch'
+        },
+        'scores': {
+            0: 'batch'
+        },
+        'classes': {
             0: 'batch'
         }
     }
 
     print('\nExporting the model to ONNX')
     torch.onnx.export(model, onnx_input_im, onnx_output_file, verbose=False, opset_version=args.opset,
-                      do_constant_folding=True, input_names=['input'], output_names=['output'],
+                      do_constant_folding=True, input_names=['input'], output_names=['boxes', 'scores', 'classes'],
                       dynamic_axes=dynamic_axes if args.dynamic else None)
 
     if args.simplify:
@@ -82,11 +88,14 @@ def parse_args():
     parser.add_argument('--opset', type=int, default=14, help='ONNX opset version')
     parser.add_argument('--simplify', action='store_true', help='ONNX simplify model')
     parser.add_argument('--dynamic', action='store_true', help='Dynamic batch-size')
+    parser.add_argument('--batch', type=int, default=1, help='Implicit batch-size')
     args = parser.parse_args()
     if args.model == '':
         raise SystemExit('Invalid model name')
     if not os.path.isfile(args.weights):
         raise SystemExit('Invalid weights file')
+    if args.dynamic and args.batch > 1:
+        raise SystemExit('Cannot set dynamic batch-size and implicit batch-size at same time')
     return args
 
 
diff --git a/utils/export_yolor.py b/utils/export_yolor.py
index 86fefec..aa5785f 100644
--- a/utils/export_yolor.py
+++ b/utils/export_yolor.py
@@ -16,7 +16,8 @@ class DeepStreamOutput(nn.Module):
         boxes = x[:, :, :4]
         objectness = x[:, :, 4:5]
         scores, classes = torch.max(x[:, :, 5:], 2, keepdim=True)
-        return torch.cat((boxes, scores * objectness, classes.float()), dim=2)
+        scores *= objectness
+        return boxes, scores, classes
 
 
 def suppress_warnings():
@@ -79,21 +80,27 @@ def main(args):
     if img_size == [640, 640] and args.p6:
         img_size = [1280] * 2
 
-    onnx_input_im = torch.zeros(1, 3, *img_size).to(device)
+    onnx_input_im = torch.zeros(args.batch, 3, *img_size).to(device)
     onnx_output_file = os.path.basename(args.weights).split('.pt')[0] + '.onnx'
 
     dynamic_axes = {
         'input': {
             0: 'batch'
         },
-        'output': {
+        'boxes': {
+            0: 'batch'
+        },
+        'scores': {
+            0: 'batch'
+        },
+        'classes': {
             0: 'batch'
         }
     }
 
     print('\nExporting the model to ONNX')
     torch.onnx.export(model, onnx_input_im, onnx_output_file, verbose=False, opset_version=args.opset,
-                      do_constant_folding=True, input_names=['input'], output_names=['output'],
+                      do_constant_folding=True, input_names=['input'], output_names=['boxes', 'scores', 'classes'],
                       dynamic_axes=dynamic_axes if args.dynamic else None)
 
     if args.simplify:
@@ -115,9 +122,12 @@ def parse_args():
     parser.add_argument('--opset', type=int, default=12, help='ONNX opset version')
     parser.add_argument('--simplify', action='store_true', help='ONNX simplify model')
     parser.add_argument('--dynamic', action='store_true', help='Dynamic batch-size')
+    parser.add_argument('--batch', type=int, default=1, help='Implicit batch-size')
     args = parser.parse_args()
     if not os.path.isfile(args.weights):
         raise SystemExit('Invalid weights file')
+    if args.dynamic and args.batch > 1:
+        raise SystemExit('Cannot set dynamic batch-size and implicit batch-size at same time')
     return args
 
 
diff --git a/utils/export_yolox.py b/utils/export_yolox.py
index 75d2f0a..2f3d441 100644
--- a/utils/export_yolox.py
+++ b/utils/export_yolox.py
@@ -18,7 +18,8 @@ class DeepStreamOutput(nn.Module):
         boxes = x[:, :, :4]
         objectness = x[:, :, 4:5]
         scores, classes = torch.max(x[:, :, 5:], 2, keepdim=True)
-        return torch.cat((boxes, scores * objectness, classes.float()), dim=2)
+        scores *= objectness
+        return boxes, scores, classes
 
 
 def suppress_warnings():
@@ -54,21 +55,27 @@ def main(args):
     img_size = [exp.input_size[1], exp.input_size[0]]
 
-    onnx_input_im = torch.zeros(1, 3, *img_size).to(device)
+    onnx_input_im = torch.zeros(args.batch, 3, *img_size).to(device)
     onnx_output_file = os.path.basename(args.weights).split('.pt')[0] + '.onnx'
 
     dynamic_axes = {
         'input': {
             0: 'batch'
         },
-        'output': {
+        'boxes': {
+            0: 'batch'
+        },
+        'scores': {
+            0: 'batch'
+        },
+        'classes': {
             0: 'batch'
         }
     }
 
     print('Exporting the model to ONNX')
     torch.onnx.export(model, onnx_input_im, onnx_output_file, verbose=False, opset_version=args.opset,
-                      do_constant_folding=True, input_names=['input'], output_names=['output'],
+                      do_constant_folding=True, input_names=['input'], output_names=['boxes', 'scores', 'classes'],
                       dynamic_axes=dynamic_axes if args.dynamic else None)
 
     if args.simplify:
@@ -88,11 +95,14 @@ def parse_args():
     parser.add_argument('--opset', type=int, default=11, help='ONNX opset version')
     parser.add_argument('--simplify', action='store_true', help='ONNX simplify model')
    parser.add_argument('--dynamic', action='store_true', help='Dynamic batch-size')
+    parser.add_argument('--batch', type=int, default=1, help='Implicit batch-size')
     args = parser.parse_args()
     if not os.path.isfile(args.weights):
         raise SystemExit('Invalid weights file')
     if not os.path.isfile(args.exp):
         raise SystemExit('Invalid exp file')
+    if args.dynamic and args.batch > 1:
+        raise SystemExit('Cannot set dynamic batch-size and implicit batch-size at same time')
     return args
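
The same three changes repeat in every exporter above: the `DeepStreamOutput` wrapper returns `boxes`, `scores` and `classes` as separate tensors instead of one concatenated `output`, the dummy input is built from `args.batch` instead of a hard-coded batch of 1, and the ONNX graph names the three outputs, each with its own dynamic batch axis when `--dynamic` is set. Below is a minimal, self-contained sketch of that pattern; `DummyYolo` is a hypothetical stand-in for a real detector head and is not part of this patch:

```python
import torch
import torch.nn as nn


class DeepStreamOutput(nn.Module):
    # Split the raw [batch, anchors, 4 + 1 + num_classes] prediction into the
    # three named tensors the updated DeepStream parser expects.
    def forward(self, x):
        boxes = x[:, :, :4]        # x, y, w, h
        objectness = x[:, :, 4:5]  # objectness confidence
        scores, classes = torch.max(x[:, :, 5:], 2, keepdim=True)
        scores *= objectness       # final score = class confidence * objectness
        return boxes, scores, classes


class DummyYolo(nn.Module):
    # Hypothetical detector head: maps any image to [batch, 100, 85] predictions.
    def forward(self, x):
        return x.mean(dim=(1, 2, 3)).view(-1, 1, 1).expand(-1, 100, 85)


model = nn.Sequential(DummyYolo(), DeepStreamOutput())
onnx_input_im = torch.zeros(1, 3, 640, 640)  # batch of 1, as with the default --batch

dynamic_axes = {
    'input': {0: 'batch'},
    'boxes': {0: 'batch'},
    'scores': {0: 'batch'},
    'classes': {0: 'batch'}
}

torch.onnx.export(model, onnx_input_im, 'model.onnx', verbose=False, opset_version=12,
                  do_constant_folding=True, input_names=['input'],
                  output_names=['boxes', 'scores', 'classes'], dynamic_axes=dynamic_axes)
```

In the real scripts the two batch modes are mutually exclusive: pass `--dynamic` to export with symbolic batch axes (the batch size is then chosen when the TensorRT engine is built), or `--batch N` to bake a fixed batch size into the dummy input; the new check in `parse_args` rejects setting both at once.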