CUDA Basics
For deep learning model inference, why do we need to learn TensorRT? Because we need acceleration and C++ deployment.
And why do we also need to learn CUDA programming? Because some pre- and post-processing steps need CUDA kernels to run in parallel for speed, for example anchor decoding, NMS and other post-processing.
1: CUDA Programming: Basic Steps
Broadly speaking, the workflow has 5 steps:
1. The CPU allocates memory on the GPU: cudaMalloc;
2. The CPU sends data to the GPU: cudaMemcpy, cudaMemcpyAsync;
3. The CPU launches a kernel on the GPU, a piece of code you write yourself that runs on every thread;
4. The CPU copies data back from the GPU: cudaMemcpy, cudaMemcpyAsync;
5. The CPU frees the memory on the GPU.
The key is step 3: whether you can write a suitable kernel determines whether the problem gets solved correctly and efficiently.
So we need to learn the basics of CUDA programming.
The other steps are plain API calls;
step 3 launches a CUDA kernel that performs the desired computation on the device, and that kernel you have to write yourself.
Even with modest goals, you should at least be able to read CUDA code and modify someone else's code to finish your own task; a minimal sketch of the five steps is given after the links below.
For a short but more detailed introduction, the following links are recommended:
https://zhuanlan.zhihu.com/p/34587739
https://blog.csdn.net/hujingshuang/article/details/53097222
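As a quick orientation, here is a minimal sketch of the five steps above. The kernel, sizes and names are placeholders chosen for illustration; this is not code from this post:
#include <cuda_runtime.h>
// a toy kernel: step 3, the part you write yourself
__global__ void double_elements(float* data, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) data[i] *= 2.0f;
}
int main() {
    const int n = 1024;
    float host[1024] = {0};
    float* dev = nullptr;
    cudaMalloc(&dev, n * sizeof(float));                              // 1. allocate GPU memory
    cudaMemcpy(dev, host, n * sizeof(float), cudaMemcpyHostToDevice); // 2. copy host -> device
    double_elements<<<(n + 255) / 256, 256>>>(dev, n);                // 3. launch the kernel
    cudaMemcpy(host, dev, n * sizeof(float), cudaMemcpyDeviceToHost); // 4. copy device -> host
    cudaFree(dev);                                                    // 5. free GPU memory
    return 0;
}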
1.1 CUDA Programming: Kernel Functions
Formulas for computing the thread index
A grid can contain multiple blocks, and the blocks can be organized in one, two or three dimensions. A block contains multiple threads, which can likewise be organized in one, two or three dimensions.
Every thread in CUDA has a unique ID. This ID depends on how the grid and the block are partitioned; below are the formulas for the global thread index under the different grid/block layouts.
dim3 grid(1, 1, 1), block(4, 4, 1); // set the launch parameters
With the block set up like this, the corresponding thread layout is shown in the figure below:
In CUDA the built-in variables can be used to obtain the thread ID and block ID:
threadIdx.[x, y, z]: the index of the thread within its block
blockIdx.[x, y, z]: the index of the block within the grid
blockDim.[x, y, z]: the block dimensions, i.e. the number of threads along each direction of a block
gridDim.[x, y, z]: the grid dimensions, i.e. the number of blocks along each direction of the grid
Example 1: dim3 grid(1, 1, 1), block(4, 4, 1);
// Example 1: code executed by each thread
__global__ void vector_add(float* vec1, float* vec2, float* vecres, int length) {
int tid = threadIdx.y * blockDim.x + threadIdx.x; // uses threadIdx.y, threadIdx.x and blockDim.x
if (tid < length) {
vecres[tid] = vec1[tid] + vec2[tid];
}
}
Example 2: dim3 grid(2, 2, 1), block(2, 2, 1);
// Example 2: code executed by each thread
__global__ void vector_add(float* vec1, float* vec2, float* vecres, int length) {
// (linear index of the block) * (threads per block) + (thread offset inside the block, row * width + column)
int tid = (blockIdx.y * gridDim.x + blockIdx.x) * (blockDim.x * blockDim.y) + threadIdx.y * blockDim.x + threadIdx.x;
if (tid < length) {
vecres[tid] = vec1[tid] + vec2[tid];
}
}
1. Grid is 1-D, block is 1-D
int threadId = blockIdx.x *blockDim.x + threadIdx.x;
2. Grid is 1-D, block is 3-D
int threadId = blockIdx.x * blockDim.x * blockDim.y * blockDim.z
+ threadIdx.z * blockDim.y * blockDim.x
+ threadIdx.y * blockDim.x + threadIdx.x;
3. Grid is 2-D, block is 1-D
int blockId = blockIdx.y * gridDim.x + blockIdx.x;
int threadId = blockId * blockDim.x + threadIdx.x;
4. Grid is 2-D, block is 2-D (used in the sketch after this list)
int blockId = blockIdx.x + blockIdx.y * gridDim.x;
int threadId = blockId * (blockDim.x * blockDim.y)
+ (threadIdx.y * blockDim.x) + threadIdx.x;
5. Grid is 2-D, block is 3-D
int blockId = blockIdx.x + blockIdx.y * gridDim.x;
int threadId = blockId * (blockDim.x * blockDim.y * blockDim.z)
+ (threadIdx.z * (blockDim.x * blockDim.y))
+ (threadIdx.y * blockDim.x) + threadIdx.x;
6. Grid is 3-D, block is 3-D
int blockId = blockIdx.x + blockIdx.y * gridDim.x
+ gridDim.x * gridDim.y * blockIdx.z;
int threadId = blockId * (blockDim.x * blockDim.y * blockDim.z)
+ (threadIdx.z * (blockDim.x * blockDim.y))
+ (threadIdx.y * blockDim.x) + threadIdx.x;
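To tie these formulas to something runnable, here is a small self-contained sketch that uses case 4, a 2-D grid with 2-D blocks, and lets every thread write its own global index. The sizes and names are mine, chosen only for illustration:
#include <cstdio>
#include <cuda_runtime.h>

// Case 4: 2-D grid, 2-D block. Each thread writes its own global index.
__global__ void write_global_index(int* out, int length) {
    int blockId  = blockIdx.x + blockIdx.y * gridDim.x;
    int threadId = blockId * (blockDim.x * blockDim.y)
                 + threadIdx.y * blockDim.x + threadIdx.x;
    if (threadId < length) out[threadId] = threadId;
}

int main() {
    const int length = 64;                       // 4 blocks * 16 threads
    int* d_out = nullptr;
    cudaMalloc(&d_out, length * sizeof(int));
    dim3 grid(2, 2, 1), block(4, 4, 1);
    write_global_index<<<grid, block>>>(d_out, length);
    int h_out[64];
    cudaMemcpy(h_out, d_out, length * sizeof(int), cudaMemcpyDeviceToHost);
    printf("h_out[63] = %d\n", h_out[63]);       // expected: 63
    cudaFree(d_out);
    return 0;
}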
1.2 CUDA Programming: Kernel Code
Function qualifiers:
__global__: runs only on the GPU; can be called from the CPU or from the GPU
__device__: runs only on the GPU; can only be called from the GPU
Variable qualifiers:
__device__: resides in the GPU's global memory; all threads in the same grid can access it directly, while the CPU side must go through runtime library calls to access it
__managed__: accessible from both the CPU and the GPU
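As a small illustration of these qualifiers (a sketch with made-up names and data, not code from the original post), the following combines a __managed__ counter with a __device__ helper called from a __global__ kernel:
#include <cstdio>
#include <cuda_runtime.h>

__managed__ int counter = 0;                // visible to both host and device

__device__ float square(float v) {          // callable only from device code
    return v * v;
}

__global__ void accumulate(const float* in, float* out, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        out[i] = square(in[i]);
        atomicAdd(&counter, 1);             // count processed elements
    }
}

int main() {
    const int n = 8;
    float h_in[n] = {1, 2, 3, 4, 5, 6, 7, 8}, h_out[n];
    float *d_in, *d_out;
    cudaMalloc(&d_in, n * sizeof(float));
    cudaMalloc(&d_out, n * sizeof(float));
    cudaMemcpy(d_in, h_in, n * sizeof(float), cudaMemcpyHostToDevice);
    accumulate<<<1, n>>>(d_in, d_out, n);
    cudaMemcpy(h_out, d_out, n * sizeof(float), cudaMemcpyDeviceToHost);
    cudaDeviceSynchronize();                // make sure the managed counter is visible on the host
    printf("counter = %d, out[7] = %f\n", counter, h_out[7]);   // 8, 64.0
    cudaFree(d_in); cudaFree(d_out);
    return 0;
}
The post's own complete example (vector addition on the GPU) follows: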
#include <iostream>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#include <device_launch_parameters.h>
__global__ void addCuda(float* x, float* y, float* z, int n){
int index = threadIdx.x+blockIdx.x*blockDim.x;
int stride = blockDim.x*gridDim.x;
for(int i=index; i<n; i+=stride){
z[i] = x[i] + y[i];
}
}
int main(){
int N =20;
int nBytes = N*sizeof(float);
float *x, *y, *z;
x = (float*)malloc(nBytes);
y = (float*)malloc(nBytes);
z = (float*)malloc(nBytes);
for(int i=0; i<N; i++){
x[i] = 10.0;
y[i] = 20.0;
}
float *dx, *dy, *dz;
cudaMalloc((void**)&dx, nBytes);
cudaMalloc((void**)&dy, nBytes);
cudaMalloc((void**)&dz, nBytes);
// x and y hold the source data; copy them to the GPU for the computation
cudaMemcpy((void*)dx, (void*)x, nBytes, cudaMemcpyHostToDevice);
cudaMemcpy((void*)dy, (void*)y, nBytes, cudaMemcpyHostToDevice);
dim3 blockSize(256);
dim3 gridsize((N + blockSize.x - 1) / blockSize.x);
// the result computed on the GPU is stored in dz
addCuda<<<gridsize, blockSize>>>(dx, dy, dz, N);
// dz holds the result; copy it back into z and then check how far z deviates from the expected value
cudaMemcpy((void*)z, (void*)dz, nBytes, cudaMemcpyDeviceToHost);
float maxError = 0.0;
for (int i = 0; i < N; i++)
maxError = fmax(maxError, fabs(z[i] - 30.0));
std::cout << "最大误差: " << maxError << std::endl;
cudaFree(dx);
cudaFree(dy);
cudaFree(dz);
free(x);
free(y);
free(z);
return 0;
}
// CUDA programming details
// test.cu
std::string classesFile = "coco.names";
std::ifstream ifs(classesFile.c_str());
std::string line;
std::cout<<"labels:";
while (getline(ifs, line)){
std::cout<<line<<" ";
labels.push_back(line);
}
//when writing this in a .cu file, the braces around the loop body need to be written out, otherwise it errors out
//when writing it in a .cpp file, they are not required
2: CUDA Programming for Deep Learning Model Inference
Deployment environment:
TensorRT 7.0.0.11, CUDA 10
Path of the official TRT model-conversion code:
./TensorRT-7.0.0.11/samples/python/yolov3_onnx
Generating the ONNX model:
The official Python script does the conversion; if you need the 416 input scale, only a small tweak to the code is required.
The resulting ONNX model simply takes an image as input and outputs the heads (three output scales), without anchor decoding or any other wrapping:
yolov3_to_onnx.py
Generating the TRT engine:
The official Python script does the conversion as well; converting to INT8 likewise only needs a small tweak to the code. FP16 is recommended:
onnx_to_tensorrt.py
2.1 CUDA Programming: YOLOv3 Post-processing on the CPU
As the doInference function in the code below shows,
the pipeline has three parts: input processing, TRT engine inference, and post-processing.
Of these, only the TRT inference runs on CUDA, and NVIDIA has already implemented it; we just call the API.
The input only needs to be copied to the GPU, and the output only needs to be copied back to the CPU to be decoded.
// run forward inference
#include <algorithm>
#include <opencv2/opencv.hpp>
#include <opencv2/dnn.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>
#include <assert.h>
#include <cmath>
#include <cuda_runtime_api.h>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <sstream>
#include <sys/stat.h>
#include <time.h>
#include "NvInfer.h"
#include "NvOnnxParser.h"
#include "argsParser.h"
#include "logger.h"
#include "common.h"
using namespace std;
using namespace nvinfer1;
using namespace nvonnxparser;
using namespace cv;
string onnxFile = "yolov3_416_office_fp16.onnx";
string engineFile = "yolov3_416_office_fp16.trt";
vector<string> labels;
const int Classes = 80;
vector<vector<int>> output_shape = {{1, 15 + 3 * Classes, 13, 13}, {1, 15 + 3 * Classes, 26, 26}, {1, 15 + 3 * Classes, 52, 52}};
vector<vector<int>> g_masks = {{6, 7, 8}, {3, 4, 5}, {0, 1, 2}};
vector<vector<int>> g_anchors = {{10, 13}, {16, 30}, {33, 23}, {30, 61}, {62, 45}, {59, 119}, {116, 90}, {156, 198}, {373, 326}};
float *merge(float *out1, float *out2, int bsize_out1, int bsize_out2)
{
}
vector<string> split(const string &str, char delim)
{
}
void drawPred(int classId, float conf, int left, int top, int right, int bottom, cv::Mat &frame)
{
}
vector<cv::Rect> postProcess(cv::Mat &image, float *output)
{
}
vector<float> prepareImage(cv::Mat &img)
{
}
bool readTrtFile(const std::string &engineFile,
IHostMemory *&trtModelStream)
{
}
bool onnxToTRTModel(const std::string &modelFile, // name of the onnx file
const std::string &filename, // name of the TensorRT engine file
IHostMemory *&trtModelStream) // output buffer for the
{
// create the builder
//TensorRT 7.0 needs to be written this way
nvinfer1::IBuilder *builder = nvinfer1::createInferBuilder(gLogger.getTRTLogger());
const auto explicitBatch = 1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
//nvinfer1::INetworkDefinition* network = builder->createNetwork();
nvinfer1::INetworkDefinition *network = builder->createNetworkV2(explicitBatch);
}
void doInference(IHostMemory *trtModelStream)
{
*******
*******
// create the inference engine
// read the input data into the buffer management objects
assert(engine->getNbBindings() == 4);
void *buffers[4];
std::vector<int64_t> bufferSize;
int nbBindings = engine->getNbBindings();
bufferSize.resize(nbBindings);
for (int i = 0; i < nbBindings; ++i){
nvinfer1::Dims dims = engine->getBindingDimensions(i);
nvinfer1::DataType dtype = engine->getBindingDataType(i);
int64_t totalSize = volume(dims) * 1 * getElementSize(dtype);
bufferSize[i] = totalSize;
std::cout << "bufferSize[i]:" << bufferSize[i] << std::endl;
CHECK(cudaMalloc(&buffers[i], totalSize));
}
// create a CUDA stream to run this inference
cudaStream_t stream;
CHECK(cudaStreamCreate(&stream));
vector<cv::Mat> inputImgs;
vector<DetectionRes> outputs;
int outSize1 = bufferSize[1] / sizeof(float);
int outSize2 = bufferSize[2] / sizeof(float);
int outSize3 = bufferSize[3] / sizeof(float);
float *out1 = new float[outSize1];
float *out2 = new float[outSize2];
float *out3 = new float[outSize3];
int index = 1, batchCount = 0;
cv::Mat img = cv::imread("/home/ting/dog.jpg");
inputImgs.push_back(img);
auto t_start_pre = std::chrono::high_resolution_clock::now();
vector<float> curInput = prepareImage(img);
// asynchronously copy the data from the host input buffer to the device input buffer
CHECK(cudaMemcpyAsync(buffers[0], curInput.data(), bufferSize[0], cudaMemcpyHostToDevice, stream));
// run inference
auto t_start = std::chrono::high_resolution_clock::now();
// change in version 7.0: use executeV2
//context->execute(BATCH_SIZE, buffers);
context->executeV2((void **) buffers);
auto t_end = std::chrono::high_resolution_clock::now();
float total = std::chrono::duration<float, std::milli>(t_end - t_start).count();
std::cout << "Inference take: " << total << " ms." << endl;
CHECK(cudaMemcpyAsync(out1, buffers[1], bufferSize[1], cudaMemcpyDeviceToHost, stream));
CHECK(cudaMemcpyAsync(out2, buffers[2], bufferSize[2], cudaMemcpyDeviceToHost, stream));
CHECK(cudaMemcpyAsync(out3, buffers[3], bufferSize[3], cudaMemcpyDeviceToHost, stream));
cudaStreamSynchronize(stream);
// mind the order when merging the heads
float *out1_out2 = new float[outSize1 + outSize2];
out1_out2 = merge(out1, out2, outSize1, outSize2);
float *out = new float[outSize1 + outSize2 + outSize3];
out = merge(out1_out2, out3, outSize1 + outSize2, outSize3);
// postprocess
auto t_start_post = std::chrono::high_resolution_clock::now();
auto boxes = postProcess(img, out);
auto t_end_post = std::chrono::high_resolution_clock::now();
float total_post = std::chrono::duration<float, std::milli>(t_end_post - t_start_post).count();
std::cout << "Postprocess take: " << total_post << " ms." << endl;
// release the stream and the buffers
cudaStreamDestroy(stream);
CHECK(cudaFree(buffers[0]));
CHECK(cudaFree(buffers[1]));
CHECK(cudaFree(buffers[2]));
CHECK(cudaFree(buffers[3]));
// destroy the engine
context->destroy();
engine->destroy();
runtime->destroy();
cv::imshow("result", img);
cv::imwrite("result.jpg", img);
waitKey(0);
}
int main()
{
string classesFile = "/home/ting/coco.names";
std::ifstream ifs(classesFile.c_str());
string line;
while (getline(ifs, line))
labels.push_back(line);
std::cout <<"labels.size:"<<labels.size()<<std::endl;
IHostMemory *trtModelStream{nullptr};
fstream existEngine;
existEngine.open(engineFile, ios::in);
if (existEngine){
readTrtFile(engineFile, trtModelStream);
assert(trtModelStream != nullptr);
}
else{
std::cout << "onnxToTRTModel" << std::endl;
onnxToTRTModel(onnxFile, engineFile, trtModelStream);
assert(trtModelStream != nullptr);
}
//do inference
doInference(trtModelStream);
return 0;
}
Anchor decoding explained
The decoding step (and its visualization) is essentially drawing the prior (anchor) boxes and then adjusting them to obtain the final predicted boxes.
The code below walks through it, using the 13x13 feature map as the example:
#adjusting the prior boxes is exactly what decoding means
#the DecodeBox class adjusts the prior boxes; it decodes one feature map at a time
class DecodeBox(nn.Module):
def __init__(self, anchors, num_classes, img_size):
super(DecodeBox, self).__init__()
self.anchors = anchors
self.num_anchors = len(anchors)
self.num_classes = num_classes
self.bbox_attrs = 5 + num_classes
self.img_size = img_size
def forward(self, input):
"""
拿到预测结果以后,就放进这个forward函数,
这里的input的shape是batchsize, 3x(1+4+num_classes), 13, 13
3x(1+4+num_classes)分析:
3是代表3个先验框,1代表先验框内部是否包含有物体,4表示先验框的调整参数,num_classes表示先验框内部物体的种类
"""
#number of images in the batch
batch_size = input.size(0)
#width and height of the feature map; in our example both are 13
input_height = input.size(2)
input_width = input.size(3)
# compute the stride, i.e. the input image size divided by the feature map size; the stride is also called the receptive field here
"""
The stride is how many input pixels each feature point corresponds to.
In our example the input image is divided into a 13x13 grid; the image is 416 pixels wide, so each feature point corresponds to 416/13 = 32 pixels. (Width and height are computed separately.)
So the stride is 32 in both height and width.
"""
stride_h = self.img_size[1] / input_height
stride_w = self.img_size[0] / input_width
# scale down to the feature-map coordinate system
"""
Here the anchor sizes are rescaled to the feature map.
"""
scaled_anchors = [(anchor_width / stride_w, anchor_height / stride_h) for anchor_width, anchor_height in self.anchors]
# resize the predictions: transpose the channels and reshape
"""
batchsize, 3x(5+num_classes), 13, 13 -> batchsize, 3, 13, 13, (5+num_classes)
self.num_anchors below is 3, i.e. the number of prior boxes,
and self.bbox_attrs is 5+num_classes
"""
prediction = input.view(batch_size, self.num_anchors,
self.bbox_attrs, input_height, input_width).permute(0, 1, 3, 4, 2).contiguous()
#the following steps extract the prior-box adjustment parameters
# center adjustment parameters; the prior-box centers are the intersections of the grid lines
#the center offset is the distance between the prior-box center and the predicted-box center
#applying a sigmoid keeps the value between 0 and 1, so each prior-box center can only shift towards the lower-right inside its own grid cell
x = torch.sigmoid(prediction[..., 0])
y = torch.sigmoid(prediction[..., 1])
# width/height adjustment parameters: they scale the prior box to the size of the predicted box
w = prediction[..., 2] # Width
h = prediction[..., 3] # Height
# objectness confidence: is there an object in the box
conf = torch.sigmoid(prediction[..., 4])
# class confidences
pred_cls = torch.sigmoid(prediction[..., 5:]) # Cls pred.
FloatTensor = torch.cuda.FloatTensor if x.is_cuda else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if x.is_cuda else torch.LongTensor
# now generate the grid, i.e. the prior boxes
#first the prior-box centers, which are the grid intersection points; the shape is batch_size, 3, 13, 13 (a 13x13 grid with three prior boxes per cell)
grid_x = torch.linspace(0, input_width - 1, input_width).repeat(input_width, 1).repeat(
batch_size * self.num_anchors, 1, 1).view(x.shape).type(FloatTensor)
grid_y = torch.linspace(0, input_height - 1, input_height).repeat(input_height, 1).t().repeat(
batch_size * self.num_anchors, 1, 1).view(y.shape).type(FloatTensor)
# generate the prior-box widths and heights
anchor_w = FloatTensor(scaled_anchors).index_select(1, LongTensor([0]))
anchor_h = FloatTensor(scaled_anchors).index_select(1, LongTensor([1]))
anchor_w = anchor_w.repeat(batch_size, 1).repeat(1, 1, input_height * input_width).view(w.shape)
anchor_h = anchor_h.repeat(batch_size, 1).repeat(1, 1, input_height * input_width).view(h.shape)
# compute the adjusted prior-box centers and sizes
pred_boxes = FloatTensor(prediction[..., :4].shape)
#adjust the prior-box centers
pred_boxes[..., 0] = x.data + grid_x
pred_boxes[..., 1] = y.data + grid_y
#adjust the prior-box widths and heights
pred_boxes[..., 2] = torch.exp(w.data) * anchor_w
pred_boxes[..., 3] = torch.exp(h.data) * anchor_h
# scale the output back to the 416x416 input resolution
_scale = torch.Tensor([stride_w, stride_h] * 2).type(FloatTensor)
output = torch.cat((pred_boxes.view(batch_size, -1, 4) * _scale,
conf.view(batch_size, -1, 1), pred_cls.view(batch_size, -1, self.num_classes)), -1)
return output.data
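For the CPU post-processing path of this section, the same decode can also be written as a plain C++ loop. The sketch below is only an illustration under assumptions: the names, the DecodedBox struct and the channel-planar head layout are mine, not the postProcess implementation from this post:
#include <cmath>
#include <utility>
#include <vector>

struct DecodedBox { float x, y, w, h, conf; int cls; };

static inline float sigmoidf(float v) { return 1.f / (1.f + std::exp(-v)); }

// Decode one head assumed to be laid out channel-planar:
// head[(a * (5 + classes) + c) * grid * grid + cy * grid + cx]
std::vector<DecodedBox> decode_head_cpu(const float* head, int grid, int stride,
                                        const std::vector<std::pair<float, float>>& anchors, // anchor w,h in input pixels
                                        int classes, float confThresh)
{
    std::vector<DecodedBox> boxes;
    int attrs = 5 + classes, plane = grid * grid;
    for (size_t a = 0; a < anchors.size(); ++a) {
        const float* p = head + a * attrs * plane;
        for (int cy = 0; cy < grid; ++cy) {
            for (int cx = 0; cx < grid; ++cx) {
                int cell = cy * grid + cx;
                float obj = sigmoidf(p[4 * plane + cell]);
                if (obj < confThresh) continue;
                DecodedBox b;
                b.x = (sigmoidf(p[0 * plane + cell]) + cx) * stride;     // box center x in input pixels
                b.y = (sigmoidf(p[1 * plane + cell]) + cy) * stride;     // box center y
                b.w = std::exp(p[2 * plane + cell]) * anchors[a].first;  // box width
                b.h = std::exp(p[3 * plane + cell]) * anchors[a].second; // box height
                b.conf = obj;
                b.cls = 0;
                for (int c = 1; c < classes; ++c)                        // pick the best class by raw score
                    if (p[(5 + c) * plane + cell] > p[(5 + b.cls) * plane + cell]) b.cls = c;
                boxes.push_back(b);
            }
        }
    }
    return boxes;
}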
2.2 CUDA Programming: YOLOv3 Image Input Processing
Main API used: cudaMemcpyAsync
For the input, in short, the image data is copied onto the GPU.
The prepareImage function is shown below; its goal is to resize the original image to 416*416, but not by a plain direct resize:
vector<float> prepareImage(cv::Mat &img)
{
int c = 3;
int h = DETECT_WIDTH; //net h
int w = DETECT_WIDTH; //net w
float scale = min(float(w) / img.cols, float(h) / img.rows);
auto scaleSize = cv::Size(img.cols * scale, img.rows * scale);
cv::Mat rgb;
cv::cvtColor(img, rgb, CV_BGR2RGB);
cv::Mat resized;
cv::resize(rgb, resized, scaleSize, 0, 0, INTER_CUBIC);
cv::Mat cropped(h, w, CV_8UC3, 127);
Rect rect((w - scaleSize.width) / 2, (h - scaleSize.height) / 2, scaleSize.width, scaleSize.height);
resized.copyTo(cropped(rect));
cv::Mat img_float;
cropped.convertTo(img_float, CV_32FC3, 1.f / 255.0);
vector<Mat> input_channels(c);
cv::split(img_float, input_channels);
vector<float> result(h * w * c);
auto data = result.data();
int channelLength = h * w;
for (int i = 0; i < c; ++i)
{
memcpy(data, input_channels[i].data, channelLength * sizeof(float));
data += channelLength;
}
return result;
}
The image processing is illustrated in the figure below;
it is not a real result, just a sketch of the basic crop-and-resize principle.
Main API used: cudaMemcpyAsync
For the post-processing,
in short, the results produced by TensorRT are copied off the GPU and then parsed.
As the code below shows, out is the concatenation of the three output heads:
// run inference
auto t_start = std::chrono::high_resolution_clock::now();
tmt.sstart("execute");
context->execute(BATCH_SIZE, buffers);
auto t_end = std::chrono::high_resolution_clock::now();
float total = std::chrono::duration<float, std::milli>(t_end - t_start).count();
std::cout << "Inference take: " << total << " ms." << endl;
CHECK(cudaMemcpyAsync(out1, buffers[1], bufferSize[1], cudaMemcpyDeviceToHost, stream));
CHECK(cudaMemcpyAsync(out2, buffers[2], bufferSize[2], cudaMemcpyDeviceToHost, stream));
CHECK(cudaMemcpyAsync(out3, buffers[3], bufferSize[3], cudaMemcpyDeviceToHost, stream));
cudaStreamSynchronize(stream);
// mind the order when merging the heads
float *out1_out2 = new float[outSize1 + outSize2];
out1_out2 = merge(out1, out2, outSize1, outSize2);
float *out = new float[outSize1 + outSize2 + outSize3];
out = merge(out1_out2, out3, outSize1 + outSize2, outSize3);
// postprocess
auto t_start_post = std::chrono::high_resolution_clock::now();
tmt.sstart("postProcess");
auto boxes = postProcess(img, out);
With the above, we have completed a C++ TensorRT inference pipeline for a deep learning model whose CUDA programming amounts to nothing more than simple cudaMemcpy calls.
2.3 CUDA Programming: YOLOv3 Post-processing on the GPU
If you do not let the code query the TRT engine's binding sizes automatically,
you need to size the cudaMalloc calls yourself from your model's inputs and outputs,
i.e. from the anchors, the input image, the output feature maps and the number of classes.
Broadly, the procedure again has 5 steps (a pseudocode sketch follows, and a more concrete decode-kernel sketch is given after it):
1. cudaMalloc the memory the GPU computation needs;
2. choose the number of threads;
3. write the kernel, then launch it to do the computation;
4. copy the final result back to the CPU;
5. free the memory, and the computation is done.
For example:
for a model trained on the 80 classes of the COCO dataset,
with a 416×416 input image,
the output feature maps are 13×13, 26×26 and 52×52;
if the TRT engine is FP32, each element is a 4-byte float:
bufferSize[0]: 2076672 = 416×416×3×4
bufferSize[1]: 172380 = 13×13×3×(4+1+80)×4
bufferSize[2]: 689520 = 26×26×3×(4+1+80)×4
bufferSize[3]: 2758080 = 52×52×3×(4+1+80)×4
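These numbers can be reproduced with a few lines of arithmetic (a small sketch; the constant names are mine, not from the post):
#include <cstdio>

int main() {
    const int classes = 80, anchorsPerCell = 3, elemSize = 4;   // FP32 -> 4 bytes per element
    const int inputSize = 416 * 416 * 3 * elemSize;             // bufferSize[0]
    const int gridSizes[3] = {13, 26, 52};
    printf("bufferSize[0] = %d\n", inputSize);                  // 2076672
    for (int i = 0; i < 3; ++i) {
        int g = gridSizes[i];
        int headSize = g * g * anchorsPerCell * (4 + 1 + classes) * elemSize;
        printf("bufferSize[%d] = %d\n", i + 1, headSize);       // 172380 / 689520 / 2758080
    }
    return 0;
}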
// the GPU version of the post-processing code is not provided here for now;
// the idea can be seen from the sketch below
// 1: first cudaMalloc the memory the GPU computation needs
cudaMalloc((void**) &sorted_filtered_box, size * sizeof(float));
cudaMalloc((void**) &sorted_box_for_nms, size * sizeof(float));
// 2: then choose the number of threads
num_threads_ = dim3(32);
num_blocks = dim3((100 + 31) / 32, 100);
sort_boxes_by_kernel<<<num_blocks, num_threads_>>>(sorted_filtered_box, sorted_box_for_nms);
// 3: write the kernel and launch it to do the computation
__global__ void sort_boxes_by_kernel(float* sorted_filtered_box, float* sorted_box_for_nms)
{
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if(tid < filter_count)
{
int sort_index = indexes[tid];
}
}
// 4: finally copy the final result back to the CPU
cudaMemcpy(Output[i], sorted_box_for_nms, size * sizeof(float), cudaMemcpyDeviceToHost);
// 5: free the memory, and the computation is done
cudaFree(sorted_filtered_box);
cudaFree(sorted_box_for_nms);
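To make step 3 more concrete, here is a hedged sketch of what a GPU decode kernel could look like, mirroring the sigmoid/exp transform of the Python DecodeBox above. The memory layout (channel-planar head, one thread per anchor-cell pair) and every name in it are assumptions for illustration, not the original implementation:
#include <cuda_runtime.h>

// One thread per (anchor, cell). The head is assumed to be laid out channel-planar:
// head[(a * (5 + classes) + c) * gridSize * gridSize + cell]
__global__ void decode_head_kernel(const float* head, const float* anchors, // anchor w,h in pixels, 2 values per anchor
                                   float* boxes,                            // out: x, y, w, h per (anchor, cell)
                                   int gridSize, int numAnchors, int numClasses, int stride)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    int total = numAnchors * gridSize * gridSize;
    if (tid >= total) return;

    int a    = tid / (gridSize * gridSize);       // which anchor
    int cell = tid % (gridSize * gridSize);       // which grid cell
    int cx   = cell % gridSize;
    int cy   = cell / gridSize;

    int attrs = 5 + numClasses;
    int plane = gridSize * gridSize;
    const float* p = head + a * attrs * plane + cell;

    float tx = p[0 * plane], ty = p[1 * plane];
    float tw = p[2 * plane], th = p[3 * plane];

    float* out = boxes + tid * 4;
    out[0] = (1.f / (1.f + expf(-tx)) + cx) * stride;   // box center x in input pixels
    out[1] = (1.f / (1.f + expf(-ty)) + cy) * stride;   // box center y
    out[2] = expf(tw) * anchors[a * 2 + 0];             // box width
    out[3] = expf(th) * anchors[a * 2 + 1];             // box height
}

// Host-side launch sketch for the 13x13 head of a 416x416, 80-class model.
void decode_13x13(const float* d_head, const float* d_anchors, float* d_boxes)
{
    int gridSize = 13, numAnchors = 3, numClasses = 80, stride = 32;
    int total = numAnchors * gridSize * gridSize;
    int threads = 256;
    int blocks = (total + threads - 1) / threads;
    decode_head_kernel<<<blocks, threads>>>(d_head, d_anchors, d_boxes,
                                            gridSize, numAnchors, numClasses, stride);
}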
The effect after TensorRT acceleration is shown below:
accuracy is preserved, densely packed vehicles are detected, and pedestrians are detected as well.
Addendum:
a TensorRT version issue:
Assertion failed: !_importer_ctx.network()->hasImplicitBatchDimension() && "This version of the ONNX parser only supports TensorRT INetworkDefinitions with an explicit batch dimension. Please ensure the network was created using the EXPLICIT_BATCH Networ
// tensorrt 6.0 5.0
// auto network = UniquePtr<nvinfer1::INetworkDefinition>(builder->createNetwork());
// if (!network)
// {
// return false;
// }
// tensorrt 7.0
const auto explicitBatch = 1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
auto network = UniquePtr<nvinfer1::INetworkDefinition>(builder->createNetworkV2(explicitBatch));
With the above, we have completed a C++ TensorRT inference pipeline for a deep learning model that requires only this small amount of CUDA programming.
…
…
These notes were written while learning CUDA at the beginner stage.
If you need inference implemented with TensorFlow or OpenCV, see the other blog post:
https://blog.csdn.net/nh54zyt/article/details/110630891
If you need the complete code, leave a comment and we can discuss and learn together.
Thanks!
3 Onnx-simplifier
https://github.com/daquexian/onnx-simplifier
Our solution
ONNX Simplifier is presented to simplify the ONNX model. It infers the whole computation graph and then replaces the redundant operators with their constant outputs.
Web version
We have published ONNX Simplifier on https://convertmodel.com. It works out of the box and doesn’t need any installation. Just open the webpage, choose ONNX as the output format, check the onnx simplifier and then select your model to simplify. Note that the web version is in its very early stage, if the web version doesn’t work well for you, you can install the Python version following the instructions below.
Python version
pip3 install -U pip && pip3 install onnx-simplifier
Then
python3 -m onnxsim input_onnx_model output_onnx_model
For more functions like skipping optimization and setting input shape manually (when input shape is dynamic itself), try the following command for help message
python3 -m onnxsim -h
python -m onnxsim ${INPUT_ONNX_MODEL} ${OUTPUT_ONNX_MODEL}
4 Converting a Model with onnx-tensorrt
Versions: TensorRT 7.0, onnx-tensorrt 7.0, onnx==1.6
https://github.com/onnx/onnx-tensorrt
git clone https://github.com/onnx/onnx-tensorrt.git
cd onnx-tensorrt
git checkout 7.0
cd third_party
git clone https://github.com/onnx/onnx.git
cd onnx
git checkout v1.6.0
cd ../..
mkdir build && cd build
cmake .. -DTENSORRT_ROOT=<path_to_trt> && make -j
cmake .. -DTENSORRT_ROOT=/home/t/TensorRT-7.0.0.11 && make -j
[ 86%] Building CXX object CMakeFiles/nvonnxparser.dir/ShapedWeights.cpp.o
[ 87%] Building CXX object CMakeFiles/nvonnxparser.dir/onnx2trt_utils.cpp.o
[ 89%] Building CXX object CMakeFiles/nvonnxparser_static.dir/NvOnnxParser.cpp.o
[ 90%] Linking CXX static library libonnx.a
[ 90%] Built target onnx
[ 91%] Linking CXX static library libnvonnxparser_static.a
[ 91%] Built target nvonnxparser_static
Scanning dependencies of target onnx2trt
Scanning dependencies of target getSupportedAPITest
[ 94%] Building CXX object CMakeFiles/getSupportedAPITest.dir/getSupportedAPITest.cpp.o
[ 94%] Building CXX object CMakeFiles/getSupportedAPITest.dir/ModelImporter.cpp.o
[ 95%] Building CXX object CMakeFiles/onnx2trt.dir/main.cpp.o
[ 97%] Linking CXX shared library libnvonnxparser.so
[ 97%] Built target nvonnxparser
[ 98%] Linking CXX executable onnx2trt
[ 98%] Built target onnx2trt
[100%] Linking CXX executable getSupportedAPITest
[100%] Built target getSupportedAPITest
./onnx2trt ../../yolox_x.onnx -o my_engine.trt