YOLOV5加速之TensorRT Python版API构建模型

269 阅读 0 评论 178 点赞

我是靠谱客的博主炙热啤酒，这篇文章主要介绍YOLOV5加速之TensorRT Python版API构建模型，现在分享给大家，希望可以做个参考。

看到几篇把 YOLOv5 转换到 TensorRT 的文章，基本都是用 C++ API 构建网络。实际上 TensorRT 也提供了 Python 版本的 API，我自己试着实现了一下，同样可以使用，效果与 C++ 版本一致。下面贴出代码：

from collections import OrderedDict
import tensorrt as trt
import torch
from numpy import ceil
import numpy as np
# Logger handed to trt.Builder; only warnings and more severe messages are reported.
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
# Name given to the network input tensor.
INPUT_BLOB_NAME = "data"
# Intended name for the final output tensor; currently only referenced from
# commented-out code in the build script.
OUTPUT_BLOB_NAME = "prob"
# Input height/width. focus() uses these to size its slices, so they must
# match the input shape declared when the network is built.
INPUT_H = 640
INPUT_W = 320
# Number of object classes; each detection conv emits 3 * (CLASS_NUM + 5) channels.
CLASS_NUM = 6
def get_width(x, gw, divisor=8):
    """Scale a channel count by the width multiple and round up to a multiple of ``divisor``.

    Args:
        x: Base channel count.
        gw: Width multiple applied to ``x``.
        divisor: The result is rounded up to the nearest multiple of this value.

    Returns:
        The scaled channel count as an ``int``.
    """
    scaled = x * gw
    groups = int(ceil(scaled / divisor))
    return groups * divisor
def get_depth(x, gd):
    """Scale a block repeat count by the depth multiple.

    Args:
        x: Base number of repeats.
        gd: Depth multiple applied to ``x``.

    Returns:
        ``1`` when ``x`` is 1; otherwise ``round(x * gd)`` clamped to at least 1.
    """
    if x == 1:
        return 1
    # Python's round() already rounds exact halves to the nearest even integer.
    # The manual "subtract one on an even half" correction is only needed in
    # languages whose round() rounds halves away from zero; applying it here
    # as well under-counted by one (e.g. 9 * 0.5 gave 3 instead of 4).
    return max(round(x * gd), 1)
def autopad(k, p=None):
    """Return the padding for kernel ``k``, defaulting to half the kernel size.

    Args:
        k: Kernel size, either an ``int`` or an iterable of per-axis sizes.
        p: Explicit padding; returned unchanged when it is not ``None``.

    Returns:
        ``p`` if given, otherwise ``k // 2`` (element-wise as a list when
        ``k`` is not an ``int``).
    """
    if p is not None:
        return p
    if isinstance(k, int):
        return k // 2
    return [side // 2 for side in k]
def addBatchNorm2D(network, weights, input, layer_name, eps=1e-3):
    """Add an inference-mode batch norm to ``network`` as a per-channel scale layer.

    The normalisation ``gamma * (x - mean) / sqrt(var + eps) + beta`` is folded
    into ``x * scale + shift`` and emitted with ``ScaleMode.CHANNEL``.

    Args:
        network: TensorRT network definition the layer is added to.
        weights: Mapping from parameter name to tensor; must contain
            ``<layer_name>.weight``, ``.bias``, ``.running_mean`` and
            ``.running_var``.
        input: Input tensor of the layer.
        layer_name: Key prefix of the batch-norm parameters in ``weights``.
        eps: Epsilon added to the variance. A state dict does not store this
            value, so it has to match the trained model; YOLOv5 configures its
            BatchNorm2d layers with 1e-3 (the PyTorch default of 1e-5 gives
            slightly different activations).

    Returns:
        The scale layer that was added.
    """
    gamma = weights[layer_name + '.weight'].numpy()
    beta = weights[layer_name + '.bias'].numpy()
    mean = weights[layer_name + '.running_mean'].numpy()
    var = weights[layer_name + '.running_var'].numpy()
    std = np.sqrt(var + eps)
    scale = gamma / std
    shift = beta - mean * scale
    # add_scale takes (input, mode, shift, scale); power is left at its default.
    return network.add_scale(input, trt.ScaleMode.CHANNEL, shift, scale)
def convBlock(network, weights, input, out_channel, ksize, s, g, layer_name):
    """Add a bias-free convolution, batch norm and SiLU (x * sigmoid(x)) activation.

    Args:
        network: TensorRT network definition the layers are added to.
        weights: Mapping from parameter name to tensor; the kernel is read from
            ``<layer_name>.conv.weight`` and the batch norm from ``<layer_name>.bn.*``.
        input: Input tensor.
        out_channel: Number of output feature maps.
        ksize: Square kernel size; padding is ``autopad(ksize)``.
        s: Stride applied on both spatial axes.
        g: Number of convolution groups.
        layer_name: Key prefix of this block in ``weights``.

    Returns:
        The element-wise product layer that ends the block.
    """
    kernel = weights[layer_name + '.conv.weight'].numpy()
    conv = network.add_convolution_nd(input, out_channel, (ksize, ksize), kernel)
    pad = autopad(ksize)
    conv.stride_nd = (s, s)
    conv.padding_nd = (pad, pad)
    conv.num_groups = g

    normed = addBatchNorm2D(network, weights, conv.get_output(0), layer_name + ".bn").get_output(0)
    gate = network.add_activation(normed, trt.ActivationType.SIGMOID)
    return network.add_elementwise(normed, gate.get_output(0), trt.ElementWiseOperation.PROD)
def focus(network, weights, input, in_channel, out_channel, ksize, layer_name):
    """Add the Focus stem: four stride-2 slices, concatenated and fed to a conv block.

    Each slice has shape ``(in_channel, INPUT_H // 2, INPUT_W // 2)`` and uses a
    stride of ``(1, 2, 2)``; the slices differ only in their start offset on the
    last two axes. Assumes ``input`` is laid out as (channel, height, width)
    with no batch axis -- TODO confirm against the network creation flags.

    Args:
        network: TensorRT network definition the layers are added to.
        weights: Mapping from parameter name to tensor.
        input: Input tensor.
        in_channel: Channel count of ``input``.
        out_channel: Number of output feature maps of the conv block.
        ksize: Kernel size of the conv block.
        layer_name: Key prefix; the conv block reads ``<layer_name>.conv.*``.

    Returns:
        The final layer of the conv block.
    """
    patch_shape = (in_channel, INPUT_H // 2, INPUT_W // 2)
    step = (1, 2, 2)
    offsets = ((0, 0, 0), (0, 1, 0), (0, 0, 1), (0, 1, 1))
    patches = [
        network.add_slice(input, start, patch_shape, step).get_output(0)
        for start in offsets
    ]
    stacked = network.add_concatenation(patches)
    return convBlock(network, weights, stacked.get_output(0), out_channel, ksize, 1, 1, layer_name + '.conv')
def bottleneck(network, weights, input, c1, c2, shortcut, g, e, layer_name):
    """Add a bottleneck: 1x1 conv block, 3x3 conv block and an optional residual add.

    Args:
        network: TensorRT network definition the layers are added to.
        weights: Mapping from parameter name to tensor.
        input: Input tensor.
        c1: Channel count of ``input``.
        c2: Channel count of the output.
        shortcut: Whether to add ``input`` to the result (only done when ``c1 == c2``).
        g: Number of groups for the 3x3 convolution.
        e: Expansion ratio; the hidden width is ``int(c2 * e)``.
        layer_name: Key prefix; reads ``<layer_name>.cv1.*`` and ``<layer_name>.cv2.*``.

    Returns:
        The element-wise sum layer when the shortcut is used, otherwise the
        final layer of the second conv block.
    """
    hidden = int(c2 * e)
    squeeze = convBlock(network, weights, input, hidden, 1, 1, 1, layer_name + '.cv1')
    expand = convBlock(network, weights, squeeze.get_output(0), c2, 3, 1, g, layer_name + '.cv2')
    if not (shortcut and c1 == c2):
        return expand
    return network.add_elementwise(input, expand.get_output(0), trt.ElementWiseOperation.SUM)
def C3(network, weights, input, c1, c2, n, shortcut, g, e, layer_name):
    """Add a C3 block: two parallel 1x1 conv blocks, ``n`` bottlenecks on one
    branch, concatenation, and a closing 1x1 conv block.

    Args:
        network: TensorRT network definition the layers are added to.
        weights: Mapping from parameter name to tensor.
        input: Input tensor.
        c1: Channel count of ``input`` (not used by the body).
        c2: Channel count of the output.
        n: Number of chained bottlenecks.
        shortcut: Forwarded to every bottleneck.
        g: Group count forwarded to every bottleneck.
        e: Expansion ratio; the hidden width is ``int(c2 * e)``.
        layer_name: Key prefix; reads ``.cv1``, ``.cv2``, ``.cv3`` and ``.m.<i>``.

    Returns:
        The final layer of the closing conv block.
    """
    hidden = int(c2 * e)
    main_branch = convBlock(network, weights, input, hidden, 1, 1, 1, layer_name + '.cv1')
    side_branch = convBlock(network, weights, input, hidden, 1, 1, 1, layer_name + '.cv2')

    stream = main_branch.get_output(0)
    for idx in range(n):
        block_name = layer_name + '.m.' + str(idx)
        stream = bottleneck(network, weights, stream, hidden, hidden, shortcut, g, 1.0, block_name).get_output(0)

    merged = network.add_concatenation([stream, side_branch.get_output(0)])
    return convBlock(network, weights, merged.get_output(0), c2, 1, 1, 1, layer_name + '.cv3')
def SPP(network, weights, input, c1, c2, k1, k2, k3, layer_name):
    """Add a spatial-pyramid-pooling block.

    A 1x1 conv block halves the channels, three stride-1 max-pools with kernel
    sizes ``k1``, ``k2`` and ``k3`` (padding ``k // 2``) run on its output, and
    the conv output plus the three pooled tensors are concatenated and passed
    through a second 1x1 conv block.

    Args:
        network: TensorRT network definition the layers are added to.
        weights: Mapping from parameter name to tensor.
        input: Input tensor.
        c1: Channel count of ``input``; the hidden width is ``c1 // 2``.
        c2: Channel count of the output.
        k1: Kernel size of the first max-pool.
        k2: Kernel size of the second max-pool.
        k3: Kernel size of the third max-pool.
        layer_name: Key prefix; reads ``<layer_name>.cv1.*`` and ``<layer_name>.cv2.*``.

    Returns:
        The final layer of the second conv block.
    """
    hidden = c1 // 2
    reduced = convBlock(network, weights, input, hidden, 1, 1, 1, layer_name + ".cv1").get_output(0)

    branches = [reduced]
    for k in (k1, k2, k3):
        pool = network.add_pooling_nd(reduced, trt.PoolingType.MAX, (k, k))
        pool.padding_nd = (k // 2, k // 2)
        pool.stride_nd = (1, 1)
        branches.append(pool.get_output(0))

    merged = network.add_concatenation(branches)
    return convBlock(network, weights, merged.get_output(0), c2, 1, 1, 1, layer_name + '.cv2')
def addYoLoLayer(network, weights, layer_name, input):
    """Placeholder for the YOLO decode layer; currently adds nothing.

    Args:
        network: TensorRT network definition.
        weights: Mapping from parameter name to tensor (unused).
        layer_name: Key prefix of the detection layer (unused).
        input: Detection tensors/layers to decode (unused).

    Returns:
        ``network`` itself, unchanged.
    """
    # TODO: implement box decoding; until then the network is returned as-is.
    return network
# ---------------------------------------------------------------------------
# Build script: load the YOLOv5-L checkpoint, recreate the network layer by
# layer with the TensorRT Python API, build an engine and serialize it to disk.
# ---------------------------------------------------------------------------
pt_file = "/yolov5l.pt"
# NOTE(review): torch.load unpickles arbitrary Python objects -- only load
# checkpoints that come from a trusted source.
model = torch.load(pt_file, map_location="cpu")['model']
if hasattr(model, "state_dict"):
    # The checkpoint stores a full nn.Module rather than a plain state dict:
    # convert it to FP32 and take its parameters.
    model = model.float().state_dict()

# Strip a leading `module.` from the keys and make sure every floating-point
# tensor is FP32 before it is handed to TensorRT as a numpy array.
new_state_dictBA = OrderedDict()
for k, v in model.items():
    name = k[7:] if k.startswith('module.') else k
    if torch.is_tensor(v) and v.is_floating_point():
        v = v.float()
    new_state_dictBA[name] = v
model_weights = new_state_dictBA

with trt.Builder(TRT_LOGGER) as builder, builder.create_network() as network:
    # Width / depth multiples (1.0 / 1.0 corresponds to the layer sizes of the
    # checkpoint loaded above).
    gw = 1.
    gd = 1.
    # Use the shared constants so the input shape always agrees with the slice
    # sizes computed in focus().
    input_tensor = network.add_input(name=INPUT_BLOB_NAME, dtype=trt.float32, shape=(3, INPUT_H, INPUT_W))

    # ---- backbone (model.0 - model.9) ----
    focus0 = focus(network, model_weights, input_tensor, 3, get_width(64, gw), 3, "model.0")
    conv1 = convBlock(network, model_weights, focus0.get_output(0), get_width(128, gw), 3, 2, 1, "model.1")
    bottleneck_CSP2 = C3(network, model_weights, conv1.get_output(0),
                         get_width(128, gw), get_width(128, gw),
                         get_depth(3, gd), True, 1, 0.5, "model.2")
    conv3 = convBlock(network, model_weights, bottleneck_CSP2.get_output(0), get_width(256, gw), 3, 2, 1, "model.3")
    bottleneck_csp4 = C3(network, model_weights, conv3.get_output(0), get_width(256, gw), get_width(256, gw),
                         get_depth(9, gd), True, 1, 0.5, "model.4")
    conv5 = convBlock(network, model_weights, bottleneck_csp4.get_output(0), get_width(512, gw), 3, 2, 1, "model.5")
    bottleneck_csp6 = C3(network, model_weights, conv5.get_output(0), get_width(512, gw), get_width(512, gw),
                         get_depth(9, gd), True, 1, 0.5, "model.6")
    conv7 = convBlock(network, model_weights, bottleneck_csp6.get_output(0), get_width(1024, gw), 3, 2, 1, "model.7")
    spp8 = SPP(network, model_weights, conv7.get_output(0), get_width(1024, gw), get_width(1024, gw),
               5, 9, 13, "model.8")
    bottleneck_csp9 = C3(network, model_weights, spp8.get_output(0), get_width(1024, gw), get_width(1024, gw),
                         get_depth(3, gd), False, 1, 0.5, "model.9")

    # ---- head (model.10 - model.23) ----
    conv10 = convBlock(network, model_weights, bottleneck_csp9.get_output(0), get_width(512, gw), 1, 1, 1, "model.10")
    # Nearest-neighbour upsample to the shape of the model.6 output so the two
    # tensors can be concatenated.
    upsample11 = network.add_resize(conv10.get_output(0))
    upsample11.resize_mode = trt.ResizeMode.NEAREST
    upsample11.shape = bottleneck_csp6.get_output(0).shape
    cat12 = network.add_concatenation([upsample11.get_output(0), bottleneck_csp6.get_output(0)])
    bottleneck_csp13 = C3(network, model_weights, cat12.get_output(0), get_width(1024, gw), get_width(512, gw),
                          get_depth(3, gd), False, 1, 0.5, "model.13")
    conv14 = convBlock(network, model_weights, bottleneck_csp13.get_output(0), get_width(256, gw), 1, 1, 1, "model.14")
    # Same as above, matched to the model.4 output.
    upsample15 = network.add_resize(conv14.get_output(0))
    upsample15.resize_mode = trt.ResizeMode.NEAREST
    upsample15.shape = bottleneck_csp4.get_output(0).shape
    cat16 = network.add_concatenation([upsample15.get_output(0), bottleneck_csp4.get_output(0)])
    bottleneck_csp17 = C3(network, model_weights, cat16.get_output(0), get_width(512, gw), get_width(256, gw),
                          get_depth(3, gd), False, 1, 0.5, "model.17")

    # ---- detection convolutions (model.24.m.*), 3 * (CLASS_NUM + 5) channels each ----
    det0 = network.add_convolution_nd(bottleneck_csp17.get_output(0), int(3 * (CLASS_NUM + 5)), (1, 1),
                                      model_weights["model.24.m.0.weight"].numpy(),
                                      model_weights["model.24.m.0.bias"].numpy())
    conv18 = convBlock(network, model_weights, bottleneck_csp17.get_output(0), get_width(256, gw), 3, 2, 1, "model.18")
    cat19 = network.add_concatenation([conv18.get_output(0), conv14.get_output(0)])
    bottleneck_csp20 = C3(network, model_weights, cat19.get_output(0), get_width(512, gw), get_width(512, gw),
                          get_depth(3, gd), False, 1, 0.5, "model.20")
    det1 = network.add_convolution_nd(bottleneck_csp20.get_output(0), int(3 * (CLASS_NUM + 5)), (1, 1),
                                      model_weights["model.24.m.1.weight"].numpy(),
                                      model_weights["model.24.m.1.bias"].numpy())
    conv21 = convBlock(network, model_weights, bottleneck_csp20.get_output(0), get_width(512, gw), 3, 2, 1, "model.21")
    cat22 = network.add_concatenation([conv21.get_output(0), conv10.get_output(0)])
    bottleneck_csp23 = C3(network, model_weights, cat22.get_output(0), get_width(1024, gw), get_width(1024, gw),
                          get_depth(3, gd), False, 1, 0.5, "model.23")
    det2 = network.add_convolution_nd(bottleneck_csp23.get_output(0), int(3 * (CLASS_NUM + 5)), (1, 1),
                                      model_weights["model.24.m.2.weight"].numpy(),
                                      model_weights["model.24.m.2.bias"].numpy())

    # NOTE(review): the YOLO decode layer (addYoLoLayer) is still a stub, and
    # only det0 is marked as a network output. det1 and det2 are constructed
    # but not exposed; uncomment the lines below to export all three heads.
    network.mark_output(det0.get_output(0))
    # network.mark_output(det1.get_output(0))
    # network.mark_output(det2.get_output(0))

    builder.max_batch_size = 144
    config = builder.create_builder_config()
    engine = builder.build_engine(network, config)
    if engine is None:
        # A failed build yields None; fail with a clear message instead of an
        # AttributeError on engine.serialize().
        raise RuntimeError("TensorRT failed to build the engine; check the TensorRT log output.")
    binary_model = engine.serialize()
    with open('./yolov5l.engine', 'wb') as fp:
        fp.write(binary_model)

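代码暂时还没有封装，网络构建和引擎序列化部分可以正常运行，后期会继续维护。需要注意：目前 YOLO 解码层（addYoLoLayer）尚未实现，并且只把 det0 标记为了网络输出，det1、det2 对应的 mark_output 仍处于注释状态。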