本文整理自我之前在另一个博客上写的项目总结,可以直接作为参考使用,但过程中遇到的问题可能仍需要自己解决,比如某些层可能不被支持。我在把 PyTorch 模型转成 ONNX、再把 ONNX 转成 TRT 的时候就遇到过一个问题:TRT 中不能进行广播操作,所以只能退而求其次,改用权重为固定值的深度可分离 1x1 卷积来代替。文中的转化程序是 TensorRT 源码里面提供的。
Key Words:模型转换、onnx->TRT、caffe->TRT
Beijing, 2020
- TensorRT转化模型程序
- 编译依赖
- TensorRT转化模型用法
- TensorRT转化onnx模型为engine文件
- TensorRT转化caffe模型为engine文件
- engine对象的一些属性
#include <algorithm>
#include <chrono>
#include <cmath>
#include <cuda_runtime_api.h>
#include <fstream>
#include <functional>
#include <iostream>
#include <iterator>
#include <map>
#include <random>
#include <sstream>
#include <string.h>
#include <sys/stat.h>
#include <time.h>
#include <vector>
#include "NvOnnxParser.h"
#include "NvCaffeParser.h"
#include "NvInfer.h"
#include "NvInferPlugin.h"
#include "NvUffParser.h"
#include "buffers.h"
#include "common.h"
#include "logger.h"
using namespace nvinfer1;
using namespace nvcaffeparser1;
using namespace nvuffparser;
using namespace nvonnxparser;
// Name string for this sample. It is not referenced in the visible part of the
// file; presumably it tags log/test output -- TODO confirm against the omitted main().
const std::string gSampleName = "TensorRT.BuildEngineFromOtherDLFrame";
/// Command-line configuration for the converter. The single global instance is
/// `gParams`. Field descriptions follow the option text printed by printUsage().
struct Params
{
    std::string deployFile{};                          ///< --deploy: Caffe deploy file.
    std::string modelFile{};                           ///< --model: Caffe model file; empty means no model is passed to the parser.
    std::string engine{};                              ///< Not referenced in the visible code -- TODO confirm purpose.
    std::string saveEngine{};                          ///< --saveEngine: file to write the serialized engine to.
    std::string loadEngine{};                          ///< --loadEngine: file to read a serialized engine from.
    std::string calibrationCache{"CalibrationTable"};  ///< --calib: INT8 calibration cache file.
    std::string uffFile{};                             ///< --uff: UFF model file.
    std::string onnxModelFile{};                       ///< --onnx: ONNX model file.
    std::vector<std::string> inputs{};                 ///< Not referenced in the visible code -- TODO confirm purpose.
    std::vector<std::string> outputs{};                ///< --output: output blob names (may be given several times).
    std::vector<std::pair<std::string, Dims3>> uffInputs{}; ///< --uffInput: input name plus C,H,W dimensions.
    int device{0};                                     ///< --device: CUDA device index.
    int batchSize{1};                                  ///< --batch: batch size.
    int workspaceSize{16};                             ///< --workspace: builder workspace size in megabytes.
    int iterations{10};                                ///< --iterations: number of timing iterations.
    int avgRuns{10};                                   ///< --avgRuns: runs averaged per iteration.
    int useDLACore{-1};                                ///< --useDLACore: DLA core index; negative means DLA is not used.
    bool safeMode{false};                              ///< --safe: restrict to safety-flow functionality.
    bool fp16{false};                                  ///< --fp16: permit 16-bit kernels.
    bool int8{false};                                  ///< --int8: run in INT8 mode.
    bool verbose{false};                               ///< --verbose: verbose logging.
    bool allowGPUFallback{false};                      ///< --allowGPUFallback: let layers unsupported on DLA run on GPU.
    float pct{99};                                     ///< --percentile: percentile (0..100) reported per iteration.
    bool useSpinWait{false};                           ///< --useSpinWait: actively wait for work completion.
    bool dumpOutput{false};                            ///< --dumpOutput: dump output tensors at the end of the test.
    bool help{false};                                  ///< -h / --help: print usage.
} gParams;
/// Returns the product of the first `dims.nbDims` extents in `dims.d`
/// (1 when `nbDims` is 0).
inline int volume(Dims dims)
{
    return std::accumulate(dims.d, dims.d + dims.nbDims, 1, std::multiplies<int>());
}
// Network input name -> input dimensions. Filled by the *ToTRTModel() builders
// after parsing and read by RndInt8Calibrator to size its random input buffers.
std::map<std::string, Dims3> gInputDimensions;
/// Splits `s` on every occurrence of `delim`.
///
/// Empty fields between consecutive delimiters are kept. A trailing delimiter
/// does not produce a final empty field, and an empty `s` yields an empty vector.
///
/// @param s      Text to split.
/// @param delim  Separator character.
/// @return The fields in order of appearance.
std::vector<std::string> split(const std::string& s, char delim)
{
    std::vector<std::string> res;
    // The stream must actually be loaded with `s`; a default-constructed
    // stringstream is empty and would make the loop a no-op.
    std::stringstream ss(s);
    std::string item;
    while (std::getline(ss, item, delim))
    {
        res.push_back(item);
    }
    return res;
}
/// Returns the `percentage`-th percentile of `times`.
///
/// `times` is sorted in place (ascending) as a side effect when a value is
/// returned. 0 selects the minimum and 100 the maximum.
///
/// @param percentage  Requested percentile.
/// @param times       Samples; reordered by this call.
/// @return The selected sample, or +infinity when `times` is empty or the
///         number of samples to exclude falls outside [0, times.size()].
float percentile(float percentage, std::vector<float>& times)
{
    // Guard the empty case explicitly: it would otherwise pass the range check
    // below (0 <= 0 <= 0) and index times[0] on an empty vector.
    if (times.empty())
    {
        return std::numeric_limits<float>::infinity();
    }

    int all = static_cast<int>(times.size());
    // Number of slowest samples that lie above the requested percentile.
    int exclude = static_cast<int>((1 - percentage / 100) * all);
    if (0 <= exclude && exclude <= all)
    {
        std::sort(times.begin(), times.end());
        // Excluding everything would give index -1; clamp to the minimum.
        return times[all == exclude ? 0 : all - 1 - exclude];
    }
    return std::numeric_limits<float>::infinity();
}
class RndInt8Calibrator : public IInt8EntropyCalibrator2
RndInt8Calibrator(int totalSamples, std::string cacheFile)
: mTotalSamples(totalSamples)
, mCurrentSample(0)
, mCacheFile(cacheFile)
std::default_random_engine generator;
std::uniform_real_distribution<float> distribution(-1.0F, 1.0F);
for (auto& elem : gInputDimensions)
int elemCount = volume(elem.second);
std::vector<float> rnd_data(elemCount);
for (auto& val : rnd_data)
val = distribution(generator);
void* data;
CHECK(cudaMalloc(&data, elemCount * sizeof(float)));
CHECK(cudaMemcpy(data, &rnd_data[0], elemCount * sizeof(float), cudaMemcpyHostToDevice));
mInputDeviceBuffers.insert(std::make_pair(elem.first, data));
for (auto& elem : mInputDeviceBuffers)
int getBatchSize() const override
return 1;
bool getBatch(void* bindings[], const char* names[], int nbBindings) override
if (mCurrentSample >= mTotalSamples)
return false;
for (int i = 0; i < nbBindings; ++i)
bindings[i] = mInputDeviceBuffers[names[i]];
return true;
const void* readCalibrationCache(size_t& length) override
std::ifstream input(mCacheFile, std::ios::binary);
input >> std::noskipws;
if (input.good())
std::copy(std::istream_iterator<char>(input), std::istream_iterator<char>(), std::back_inserter(mCalibrationCache));
length = mCalibrationCache.size();
return length ? &mCalibrationCache[0] : nullptr;
virtual void writeCalibrationCache(const void*, size_t) override
int mTotalSamples;
int mCurrentSample;
std::string mCacheFile;
std::map<std::string, void*> mInputDeviceBuffers;
std::vector<char> mCalibrationCache;
/// Applies the global options in `gParams` to `builder`.
///
/// @param builder     Builder to configure.
/// @param calibrator  Calibrator attached when INT8 mode is requested; it must
///                    outlive the engine build.
void configureBuilder(IBuilder* builder, RndInt8Calibrator& calibrator)
{
    // The maximum batch size must be set here; otherwise it defaults to 1 and
    // running a larger batch would fail.
    builder->setMaxBatchSize(gParams.batchSize);
    // workspaceSize is given in megabytes; convert to bytes.
    builder->setMaxWorkspaceSize(static_cast<size_t>(gParams.workspaceSize) << 20);
    builder->setFp16Mode(gParams.fp16);

    if (gParams.int8)
    {
        builder->setInt8Mode(true);
        builder->setInt8Calibrator(&calibrator);
    }

    if (gParams.safeMode)
    {
        builder->setEngineCapability(gParams.useDLACore >= 0 ? EngineCapability::kSAFE_DLA : EngineCapability::kSAFE_GPU);
    }
}
/// Builds a TensorRT engine from the Caffe deploy/model files in `gParams`.
///
/// Records every network input in `gInputDimensions`, marks the blobs named in
/// `gParams.outputs` as network outputs, then builds the engine.
///
/// @return The built engine (owned by the caller), or nullptr on any failure.
ICudaEngine* caffeToTRTModel()
{
    // create the builder
    IBuilder* builder = createInferBuilder(gLogger.getTRTLogger());
    if (builder == nullptr)
    {
        return nullptr;
    }

    INetworkDefinition* network = builder->createNetwork();
    ICaffeParser* parser = createCaffeParser();

    // Build-time objects are only needed until the engine exists; release them
    // on every exit path so that failures do not leak.
    auto releaseAll = [&]() {
        if (parser != nullptr)
        {
            parser->destroy();
        }
        if (network != nullptr)
        {
            network->destroy();
        }
        builder->destroy();
    };

    if (network == nullptr || parser == nullptr)
    {
        releaseAll();
        return nullptr;
    }

    // parse the caffe model to populate the network, then set the outputs
    const IBlobNameToTensor* blobNameToTensor = parser->parse(gParams.deployFile.c_str(),
                                                              gParams.modelFile.empty() ? nullptr : gParams.modelFile.c_str(),
                                                              *network,
                                                              DataType::kFLOAT);
    if (!blobNameToTensor)
    {
        releaseAll();
        return nullptr;
    }

    for (int i = 0, n = network->getNbInputs(); i < n; i++)
    {
        Dims3 dims = static_cast<Dims3&&>(network->getInput(i)->getDimensions());
        gInputDimensions.insert(std::make_pair(network->getInput(i)->getName(), dims));
        gLogInfo << "Input \"" << network->getInput(i)->getName() << "\": " << dims.d[0] << "x" << dims.d[1] << "x" << dims.d[2] << std::endl;
    }

    // specify which tensors are outputs
    for (auto& s : gParams.outputs)
    {
        auto* tensor = blobNameToTensor->find(s.c_str());
        if (tensor == nullptr)
        {
            gLogError << "could not find output blob " << s << std::endl;
            releaseAll();
            return nullptr;
        }
        network->markOutput(*tensor);
    }

    for (int i = 0, n = network->getNbOutputs(); i < n; i++)
    {
        Dims3 dims = static_cast<Dims3&&>(network->getOutput(i)->getDimensions());
        gLogInfo << "Output \"" << network->getOutput(i)->getName() << "\": " << dims.d[0] << "x" << dims.d[1] << "x"
                 << dims.d[2] << std::endl;
    }

    // Build the engine
    RndInt8Calibrator calibrator(1, gParams.calibrationCache);
    configureBuilder(builder, calibrator);
    samplesCommon::enableDLA(builder, gParams.useDLACore, gParams.allowGPUFallback);

    ICudaEngine* engine = builder->buildCudaEngine(*network);
    if (engine == nullptr)
    {
        gLogError << "could not build engine" << std::endl;
    }

    releaseAll();
    return engine;
}
/// Builds a TensorRT engine from the UFF file in `gParams.uffFile`.
///
/// Registers the outputs in `gParams.outputs` and the inputs in
/// `gParams.uffInputs` (NCHW order) with the parser, parses the model, records
/// every network input in `gInputDimensions`, then builds the engine.
///
/// @return The built engine (owned by the caller), or nullptr on any failure.
ICudaEngine* uffToTRTModel()
{
    // create the builder
    IBuilder* builder = createInferBuilder(gLogger.getTRTLogger());
    if (builder == nullptr)
    {
        return nullptr;
    }

    INetworkDefinition* network = builder->createNetwork();
    IUffParser* parser = createUffParser();

    // Build-time objects are only needed until the engine exists; release them
    // on every exit path so that failures do not leak.
    auto releaseAll = [&]() {
        if (parser != nullptr)
        {
            parser->destroy();
        }
        if (network != nullptr)
        {
            network->destroy();
        }
        builder->destroy();
    };

    if (network == nullptr || parser == nullptr)
    {
        releaseAll();
        return nullptr;
    }

    // specify which tensors are outputs
    for (auto& s : gParams.outputs)
    {
        if (!parser->registerOutput(s.c_str()))
        {
            gLogError << "Failed to register output " << s << std::endl;
            releaseAll();
            return nullptr;
        }
    }

    // specify which tensors are inputs (and their dimensions)
    for (auto& s : gParams.uffInputs)
    {
        if (!parser->registerInput(s.first.c_str(), s.second, UffInputOrder::kNCHW))
        {
            gLogError << "Failed to register input " << s.first << std::endl;
            releaseAll();
            return nullptr;
        }
    }

    // parse the UFF model to populate the network
    if (!parser->parse(gParams.uffFile.c_str(), *network))
    {
        releaseAll();
        return nullptr;
    }

    for (int i = 0, n = network->getNbInputs(); i < n; i++)
    {
        Dims3 dims = static_cast<Dims3&&>(network->getInput(i)->getDimensions());
        gInputDimensions.insert(std::make_pair(network->getInput(i)->getName(), dims));
    }

    // Build the engine
    RndInt8Calibrator calibrator(1, gParams.calibrationCache);
    configureBuilder(builder, calibrator);
    // NOTE(review): unlike the Caffe path, gParams.allowGPUFallback is not
    // forwarded here -- confirm whether that is intentional.
    samplesCommon::enableDLA(builder, gParams.useDLACore);

    ICudaEngine* engine = builder->buildCudaEngine(*network);
    if (engine == nullptr)
    {
        gLogError << "could not build engine" << std::endl;
    }

    releaseAll();
    return engine;
}
/// Builds a TensorRT engine from the ONNX file in `gParams.onnxModelFile`.
///
/// Parses the model into a fresh network, records every network input in
/// `gInputDimensions`, then builds the engine.
///
/// @return The built engine (owned by the caller), or nullptr on any failure.
ICudaEngine* onnxToTRTModel()
{
    // create the builder
    IBuilder* builder = createInferBuilder(gLogger.getTRTLogger());
    if (builder == nullptr)
    {
        return nullptr;
    }

    nvinfer1::INetworkDefinition* network = builder->createNetwork();
    if (network == nullptr)
    {
        builder->destroy();
        return nullptr;
    }

    // parse the onnx model to populate the network, then set the outputs
    IParser* parser = nvonnxparser::createParser(*network, gLogger.getTRTLogger());

    // Build-time objects are only needed until the engine exists; release them
    // on every exit path so that failures do not leak.
    auto releaseAll = [&]() {
        if (parser != nullptr)
        {
            parser->destroy();
        }
        network->destroy();
        builder->destroy();
    };

    if (parser == nullptr)
    {
        releaseAll();
        return nullptr;
    }

    if (!parser->parseFromFile(gParams.onnxModelFile.c_str(), static_cast<int>(gLogger.getReportableSeverity())))
    {
        gLogError << "failed to parse onnx file" << std::endl;
        releaseAll();
        return nullptr;
    }

    for (int i = 0, n = network->getNbInputs(); i < n; i++)
    {
        Dims3 dims = static_cast<Dims3&&>(network->getInput(i)->getDimensions());
        gInputDimensions.insert(std::make_pair(network->getInput(i)->getName(), dims));
    }

    // Build the engine
    RndInt8Calibrator calibrator(1, gParams.calibrationCache);
    configureBuilder(builder, calibrator);
    // NOTE(review): unlike the Caffe path, gParams.allowGPUFallback is not
    // forwarded here -- confirm whether that is intentional.
    samplesCommon::enableDLA(builder, gParams.useDLACore);

    ICudaEngine* engine = builder->buildCudaEngine(*network);
    if (engine == nullptr)
    {
        gLogError << "could not build engine" << std::endl;
    }

    releaseAll();
    return engine;
}
/// Times inference on `engine` and optionally dumps its output tensors.
///
/// Runs `gParams.iterations` iterations of `gParams.avgRuns` enqueues each and
/// logs, per iteration, the average GPU time, the average host wall time and
/// the `gParams.pct` percentile of the GPU times. When `gParams.dumpOutput` is
/// set, every non-input binding is dumped afterwards.
///
/// @param engine  Engine to run; not destroyed by this function.
void doInference(ICudaEngine& engine)
{
    IExecutionContext* context = engine.createExecutionContext();
    if (context == nullptr)
    {
        gLogError << "could not create execution context" << std::endl;
        return;
    }

    // Use an aliasing shared_ptr since we don't want engine to be deleted when bufferManager goes out of scope.
    std::shared_ptr<ICudaEngine> emptyPtr{};
    std::shared_ptr<ICudaEngine> aliasPtr(emptyPtr, &engine);
    samplesCommon::BufferManager bufferManager(aliasPtr, gParams.batchSize);
    std::vector<void*> buffers = bufferManager.getDeviceBindings();

    // The stream must be created before anything is recorded or enqueued on it.
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    cudaEvent_t start, end;
    unsigned int cudaEventFlags = gParams.useSpinWait ? cudaEventDefault : cudaEventBlockingSync;
    CHECK(cudaEventCreateWithFlags(&start, cudaEventFlags));
    CHECK(cudaEventCreateWithFlags(&end, cudaEventFlags));

    std::vector<float> times(gParams.avgRuns);
    for (int j = 0; j < gParams.iterations; j++)
    {
        float totalGpu{0}, totalHost{0}; // GPU and Host timers
        for (int i = 0; i < gParams.avgRuns; i++)
        {
            auto tStart = std::chrono::high_resolution_clock::now();
            cudaEventRecord(start, stream);
            context->enqueue(gParams.batchSize, buffers.data(), stream, nullptr);
            cudaEventRecord(end, stream);
            // Wait for the work to finish: the elapsed time is undefined (and
            // the host timer meaningless) until `end` has completed.
            cudaEventSynchronize(end);

            auto tEnd = std::chrono::high_resolution_clock::now();
            totalHost += std::chrono::duration<float, std::milli>(tEnd - tStart).count();
            float ms;
            cudaEventElapsedTime(&ms, start, end);
            times[i] = ms;
            totalGpu += ms;
        }

        totalGpu /= gParams.avgRuns;
        totalHost /= gParams.avgRuns;
        gLogInfo << "Average over " << gParams.avgRuns << " runs is " << totalGpu << " ms (host walltime is " << totalHost
                 << " ms, " << static_cast<int>(gParams.pct) << "% percentile time is " << percentile(gParams.pct, times) << ")." << std::endl;
    }

    if (gParams.dumpOutput)
    {
        // Bring the results back to the host buffers before printing them.
        bufferManager.copyOutputToHost();
        int nbBindings = engine.getNbBindings();
        for (int i = 0; i < nbBindings; i++)
        {
            if (!engine.bindingIsInput(i))
            {
                const char* tensorName = engine.getBindingName(i);
                gLogInfo << "Dumping output tensor " << tensorName << ":" << std::endl;
                bufferManager.dumpBuffer(gLogInfo, tensorName);
            }
        }
    }

    cudaStreamDestroy(stream);
    cudaEventDestroy(start);
    cudaEventDestroy(end);
    context->destroy();
}
static void printUsage()
printf("Mandatory params:n");
printf(" --deploy=<file> Caffe deploy filen");
printf(" OR --uff=<file> UFF filen");
printf(" OR --onnx=<file> ONNX Model filen");
printf(" OR --loadEngine=<file> Load a saved enginen");
printf("nMandatory params for UFF:n");
printf(" --uffInput=<name>,C,H,W Input blob name and its dimensions for UFF parser (can be specified multiple times)n");
printf(" --output=<name> Output blob name (can be specified multiple times)n");
printf("nMandatory params for Caffe:n");
printf(" --output=<name> Output blob name (can be specified multiple times)n");
printf("nOptional params:n");
printf(" --model=<file> Caffe model file (default = no model, random weights used)n");
printf(" --batch=N Set batch size (default = %d)n", gParams.batchSize);
printf(" --device=N Set cuda device to N (default = %d)n", gParams.device);
printf(" --iterations=N Run N iterations (default = %d)n", gParams.iterations);
printf(" --avgRuns=N Set avgRuns to N - perf is measured as an average of avgRuns (default=%d)n", gParams.avgRuns);
printf(" --percentile=P For each iteration, report the percentile time at P percentage (0<=P<=100, with 0 representing min, and 100 representing max; default = %.1f%%)n", gParams.pct);
printf(" --workspace=N Set workspace size in megabytes (default = %d)n", gParams.workspaceSize);
printf(" --safe Only test the functionality available in safety restricted flows.n");
printf(" --fp16 Run in fp16 mode (default = false). Permits 16-bit kernelsn");
printf(" --int8 Run in int8 mode (default = false). Currently no support for ONNX model.n");
printf(" --verbose Use verbose logging (default = false)n");
printf(" --saveEngine=<file> Save a serialized engine to file.n");
printf(" --loadEngine=<file> Load a serialized engine from file.n");
printf(" --calib=<file> Read INT8 calibration cache file. Currently no support for ONNX model.n");
printf(" --useDLACore=N Specify a DLA engine for layers that support DLA. Value can range from 0 to n-1, where n is the number of DLA engines on the platform.n");
printf(" --allowGPUFallback If --useDLACore flag is present and if a layer can't run on DLA, then run on GPU. n");
printf(" --useSpinWait Actively wait for work completion. This option may decrease multi-process synchronization time at the cost of additional CPU usage. (default = false)n");
printf(" --dumpOutput Dump outputs at end of test. n");
printf(" -h, --help Print usagen");
bool parseString(const char* arg, const char* name, std::string& value)
size_t n = strlen(name);
bool match = arg[0] == '-' && arg[1] == '-' && !strncmp(arg + 2, name, n) && arg[n + 2] == '=';
if (match)
value = arg + n + 3;
gLogInfo << name << ": " << value << std::endl;
return match;
template<typename T>
bool parseAtoi(const char* arg, const char* name, T& value)
size_t n = strlen(name);
bool match = arg[0] == '-' && arg[1] == '-' && !strncmp(arg + 2, name, n) && arg[n + 2] == '=';
if (match)
value = static_cast<T>(atoi(arg + n + 3));
gLogInfo << name << ": " << value << std::endl;
return match;
bool parseInt(const char* arg, const char* name, int& value)
return parseAtoi<int>(arg, name, value);
bool parseUnsigned(const char* arg, const char* name, unsigned int& value)
return parseAtoi<unsigned int>(arg, name, value);
// parse a boolean option of the form --name, or optionally, -letter.
bool parseBool(const char* arg, const char* name, bool& value, char letter = '