onnxruntime推理
使用mmdeploy导出onnx模型:
from mmdeploy.apis import torch2onnx
from mmdeploy.backend.sdk.export_info import export2SDK
# Export script: converts the Mask2Former checkpoint to ONNX via mmdeploy and
# then dumps the SDK pipeline information into the same work directory.

# Sample image passed to torch2onnx as its first argument.
img = './bus.jpg'
# Output directory shared by both calls below.
work_dir = './work_dir/onnx/mask2former'
# File name of the exported model. The inference script in this document loads
# './work_dir/onnx/mask2former/end2end.onnx', so the file presumably ends up
# inside work_dir -- confirm against the mmdeploy documentation.
save_file = './end2end.onnx'
# NOTE(review): the deploy config name says "panoptic-seg" / "maskformer" while
# the model config and checkpoint are Mask2Former -- confirm this pairing is
# the intended one.
deploy_cfg = './configs/mmdet/panoptic-seg/panoptic-seg_maskformer_onnxruntime_dynamic.py'
model_cfg = '../mmdetection-3.3.0/configs/mask2former/mask2former_r50_8xb2-lsj-50e_coco.py'
model_checkpoint = '../checkpoints/mask2former_r50_8xb2-lsj-50e_coco_20220506_191028-41b088b6.pth'
# The export runs on the CPU.
device = 'cpu'
# 1. convert model to onnx
torch2onnx(img, work_dir, save_file, deploy_cfg, model_cfg, model_checkpoint, device)
# 2. extract pipeline info for sdk use (dump-info)
export2SDK(deploy_cfg, model_cfg, work_dir, pth=model_checkpoint, device=device)
自行编写python推理脚本,目前SDK尚未支持:
import cv2
import random
import numpy as np
import onnxruntime
# Size of the label space; every label below num_things_classes is kept by the
# "is_thing" filter in the post-processing code.
num_classes = 80
num_things_classes = 80
# Minimum (class score * mask score) an instance needs in order to be drawn.
object_mask_thr = 0.5
# Scale handed to resize_keep_ratio; its max/min are used as the long/short
# edge limits.
resize_shape = (1333, 800)
# One random 3-tuple of values in [0, 255] per class. The drawing code visible
# in this script picks its own random colours and does not read this table.
palette = [
    tuple(np.random.randint(0, 256) for _ in range(3))
    for _ in range(num_classes)
]
def resize_keep_ratio(image, img_scale):
    """Resize ``image`` with a single scale factor so it fits ``img_scale``.

    Args:
        image: array whose first two dimensions are (height, width).
        img_scale: pair of numbers; the larger bounds the long edge of the
            result and the smaller bounds the short edge.

    Returns:
        The array returned by ``cv2.resize`` for the rounded target size.
    """
    src_h, src_w = image.shape[:2]
    long_limit, short_limit = max(img_scale), min(img_scale)
    # The tighter of the two limits decides the common scale factor.
    ratio = min(long_limit / max(src_h, src_w), short_limit / min(src_h, src_w))
    # "+ 0.5" before truncation rounds to the nearest integer size.
    dst_size = (int(src_w * float(ratio) + 0.5), int(src_h * float(ratio) + 0.5))
    return cv2.resize(image, dst_size)
def draw_binary_masks(img, binary_masks, alphas=0.5):
    """Blend every mask onto ``img`` in a random colour and save the result.

    Args:
        img: image array handed to the OpenCV calls below (the calling script
            passes the array returned by ``cv2.imread``).
        binary_masks: array of masks indexed along its first axis; it is
            converted to uint8 and scaled so that set pixels become 255.
        alphas: weight of the coloured overlay in the blend; the current
            image gets ``1 - alphas``.

    Returns:
        None. The blended image is written to ``result.jpg``; with no masks
        the unmodified image is written.
    """
    binary_masks = binary_masks.astype('uint8') * 255
    for binary_mask in binary_masks:
        # random.randint includes both end points, so the upper bound has to
        # be 255: a value of 256 does not fit into a uint8 image.
        color = [random.randint(0, 255) for _ in range(3)]
        rgb = np.zeros_like(img)
        rgb[...] = color
        # Colour inside the mask, original pixels outside of it.
        rgb = cv2.bitwise_and(rgb, rgb, mask=binary_mask)
        img_complement = cv2.bitwise_and(img, img, mask=cv2.bitwise_not(binary_mask))
        rgb = rgb + img_complement
        img = cv2.addWeighted(img, 1 - alphas, rgb, alphas, 0)
    cv2.imwrite("result.jpg", img)
def softmax_last_axis(logits):
    """Return the softmax of ``logits`` along its last axis.

    The per-row maximum is subtracted before exponentiation so that large
    logits cannot overflow to ``inf`` and turn the scores into ``nan``.
    """
    shifted = logits - np.max(logits, axis=-1, keepdims=True)
    exp = np.exp(shifted)
    return exp / exp.sum(axis=-1, keepdims=True)


def select_instances(mask_cls, mask_pred, num_classes, num_things_classes,
                     score_thr, max_per_image=100):
    """Turn per-query class logits and mask logits into instance masks.

    Args:
        mask_cls: array of shape (num_queries, num_classes + 1); the last
            column is dropped after the softmax.
        mask_pred: array of shape (num_queries, H, W) holding mask logits.
        num_classes: number of class columns kept per query.
        num_things_classes: labels below this value are kept.
        score_thr: minimum (class score * mask score) of a kept instance.
        max_per_image: number of best (query, class) pairs considered.

    Returns:
        Tuple ``(labels, scores, masks)`` of the kept instances, where
        ``masks`` is a boolean array of shape (num_kept, H, W).
    """
    scores = softmax_last_axis(mask_cls)[:, :-1]
    labels = np.tile(np.arange(num_classes), mask_cls.shape[0])

    # Best (query, class) pairs, highest score first.
    flat_scores = scores.ravel()
    top_indices = np.argsort(flat_scores)[-max_per_image:][::-1]
    scores_per_image = flat_scores[top_indices]
    labels_per_image = labels[top_indices]
    # A flat index encodes (query, class); recover the query to pick its mask.
    mask_pred = mask_pred[top_indices // num_classes]

    is_thing = labels_per_image < num_things_classes
    scores_per_image = scores_per_image[is_thing]
    labels_per_image = labels_per_image[is_thing]
    mask_pred = mask_pred[is_thing]

    mask_binary = mask_pred > 0
    # Only positive logits contribute to the mask score, and for those
    # sigmoid(x) == 1 / (1 + exp(-|x|)); using |x| keeps exp() from
    # overflowing on strongly negative logits.
    mask_sigmoid = 1 / (1 + np.exp(-np.abs(mask_pred)))
    # Mean foreground probability of each mask.
    mask_scores = (mask_sigmoid * mask_binary).sum(axis=(1, 2)) / (
        mask_binary.sum(axis=(1, 2)) + 1e-6)

    det_scores = scores_per_image * mask_scores
    keep = det_scores > score_thr
    return labels_per_image[keep], det_scores[keep], mask_binary[keep]


if __name__ == "__main__":
    import ctypes

    image = cv2.imread('bus.jpg')
    if image is None:
        raise FileNotFoundError("could not read 'bus.jpg'")

    image_resize = resize_keep_ratio(image, resize_shape)
    resized_h, resized_w = image_resize.shape[:2]
    # Pad the bottom/right edges up to the next multiple of 32.
    pad_x, pad_y = (-resized_w) % 32, (-resized_h) % 32
    image_pad = cv2.copyMakeBorder(image_resize, 0, pad_y, 0, pad_x, cv2.BORDER_CONSTANT, value=0)

    # BGR -> RGB and HWC -> CHW, then per-channel normalisation.
    blob = image_pad[:, :, ::-1].transpose(2, 0, 1).astype(np.float32)
    mean = np.array([123.675, 116.28, 103.53], dtype=np.float32).reshape(3, 1, 1)
    std = np.array([58.395, 57.12, 57.375], dtype=np.float32).reshape(3, 1, 1)
    blob = np.ascontiguousarray(np.expand_dims((blob - mean) / std, axis=0))

    # Load the ONNX Runtime shared library before registering the mmdeploy
    # custom-op library (presumably so the latter can resolve its symbols).
    ctypes.CDLL('/home/tfy/document/mmdeploy-1.3.1/onnxruntime-linux-x64-1.14.1/lib/libonnxruntime.so')
    session_options = onnxruntime.SessionOptions()
    session_options.register_custom_ops_library('/home/tfy/document/mmdeploy-1.3.1/mmdeploy/lib/libmmdeploy_onnxruntime_ops.so')
    onnx_session = onnxruntime.InferenceSession('./work_dir/onnx/mask2former/end2end.onnx', session_options, providers=['CPUExecutionProvider'])

    # Every model input receives the same preprocessed tensor.
    inputs = {node.name: blob for node in onnx_session.get_inputs()}
    outputs = onnx_session.run(None, inputs)

    # Drop the batch dimension and crop the padded area off the mask logits.
    mask_cls = outputs[0][0]
    mask_logits = outputs[1][0][:, :resized_h, :resized_w]

    # Bilinear upsampling of every query mask to the original image size.
    mask_pred = np.zeros((mask_logits.shape[0], image.shape[0], image.shape[1]))
    for i in range(mask_pred.shape[0]):
        mask_pred[i] = cv2.resize(mask_logits[i], dsize=(image.shape[1], image.shape[0]), interpolation=cv2.INTER_LINEAR)

    _, _, instance_masks = select_instances(
        mask_cls, mask_pred, num_classes, num_things_classes, object_mask_thr)
    draw_binary_masks(image, instance_masks)
tensorrt推理
使用trtexec转换模型:
trtexec.exe --onnx=end2end.onnx --saveEngine=end2end.engine --plugins=mmdeploy_tensorrt_ops.dll
自行编写python推理脚本,目前SDK尚未支持:
import cv2
import random
import numpy as np
import tensorrt as trt
import pycuda.autoinit
import pycuda.driver as cuda
# Size of the label space; every label below num_things_classes is kept by the
# "is_thing" filter in the post-processing code.
num_classes = 80
num_things_classes = 80
# Minimum (class score * mask score) an instance needs in order to be drawn.
object_mask_thr = 0.5
# Scale handed to resize_keep_ratio; its max/min are used as the long/short
# edge limits.
resize_shape = (1333, 800)
# One random 3-tuple of values in [0, 255] per class. The drawing code visible
# in this script picks its own random colours and does not read this table.
palette = [
    tuple(np.random.randint(0, 256) for _ in range(3))
    for _ in range(num_classes)
]
def resize_keep_ratio(image, img_scale):
    """Resize ``image`` with a single scale factor so it fits ``img_scale``.

    Args:
        image: array whose first two dimensions are (height, width).
        img_scale: pair of numbers; the larger bounds the long edge of the
            result and the smaller bounds the short edge.

    Returns:
        The array returned by ``cv2.resize`` for the rounded target size.
    """
    src_h, src_w = image.shape[:2]
    long_limit, short_limit = max(img_scale), min(img_scale)
    # The tighter of the two limits decides the common scale factor.
    ratio = min(long_limit / max(src_h, src_w), short_limit / min(src_h, src_w))
    # "+ 0.5" before truncation rounds to the nearest integer size.
    dst_size = (int(src_w * float(ratio) + 0.5), int(src_h * float(ratio) + 0.5))
    return cv2.resize(image, dst_size)
def draw_binary_masks(img, binary_masks, alphas=0.5):
    """Blend every mask onto ``img`` in a random colour and save the result.

    Args:
        img: image array handed to the OpenCV calls below (the calling script
            passes the array returned by ``cv2.imread``).
        binary_masks: array of masks indexed along its first axis; it is
            converted to uint8 and scaled so that set pixels become 255.
        alphas: weight of the coloured overlay in the blend; the current
            image gets ``1 - alphas``.

    Returns:
        None. The blended image is written to ``result.jpg``; with no masks
        the unmodified image is written.
    """
    binary_masks = binary_masks.astype('uint8') * 255
    for binary_mask in binary_masks:
        # random.randint includes both end points, so the upper bound has to
        # be 255: a value of 256 does not fit into a uint8 image.
        color = [random.randint(0, 255) for _ in range(3)]
        rgb = np.zeros_like(img)
        rgb[...] = color
        # Colour inside the mask, original pixels outside of it.
        rgb = cv2.bitwise_and(rgb, rgb, mask=binary_mask)
        img_complement = cv2.bitwise_and(img, img, mask=cv2.bitwise_not(binary_mask))
        rgb = rgb + img_complement
        img = cv2.addWeighted(img, 1 - alphas, rgb, alphas, 0)
    cv2.imwrite("result.jpg", img)
def softmax_last_axis(logits):
    """Return the softmax of ``logits`` along its last axis.

    The per-row maximum is subtracted before exponentiation so that large
    logits cannot overflow to ``inf`` and turn the scores into ``nan``.
    """
    shifted = logits - np.max(logits, axis=-1, keepdims=True)
    exp = np.exp(shifted)
    return exp / exp.sum(axis=-1, keepdims=True)


def select_instances(mask_cls, mask_pred, num_classes, num_things_classes,
                     score_thr, max_per_image=100):
    """Turn per-query class logits and mask logits into instance masks.

    Args:
        mask_cls: array of shape (num_queries, num_classes + 1); the last
            column is dropped after the softmax.
        mask_pred: array of shape (num_queries, H, W) holding mask logits.
        num_classes: number of class columns kept per query.
        num_things_classes: labels below this value are kept.
        score_thr: minimum (class score * mask score) of a kept instance.
        max_per_image: number of best (query, class) pairs considered.

    Returns:
        Tuple ``(labels, scores, masks)`` of the kept instances, where
        ``masks`` is a boolean array of shape (num_kept, H, W).
    """
    scores = softmax_last_axis(mask_cls)[:, :-1]
    labels = np.tile(np.arange(num_classes), mask_cls.shape[0])

    # Best (query, class) pairs, highest score first.
    flat_scores = scores.ravel()
    top_indices = np.argsort(flat_scores)[-max_per_image:][::-1]
    scores_per_image = flat_scores[top_indices]
    labels_per_image = labels[top_indices]
    # A flat index encodes (query, class); recover the query to pick its mask.
    mask_pred = mask_pred[top_indices // num_classes]

    is_thing = labels_per_image < num_things_classes
    scores_per_image = scores_per_image[is_thing]
    labels_per_image = labels_per_image[is_thing]
    mask_pred = mask_pred[is_thing]

    mask_binary = mask_pred > 0
    # Only positive logits contribute to the mask score, and for those
    # sigmoid(x) == 1 / (1 + exp(-|x|)); using |x| keeps exp() from
    # overflowing on strongly negative logits.
    mask_sigmoid = 1 / (1 + np.exp(-np.abs(mask_pred)))
    # Mean foreground probability of each mask.
    mask_scores = (mask_sigmoid * mask_binary).sum(axis=(1, 2)) / (
        mask_binary.sum(axis=(1, 2)) + 1e-6)

    det_scores = scores_per_image * mask_scores
    keep = det_scores > score_thr
    return labels_per_image[keep], det_scores[keep], mask_binary[keep]


if __name__ == "__main__":
    # ctypes is needed for the plugin library below and is not part of this
    # script's import list, so it is imported here.
    import ctypes

    logger = trt.Logger(trt.Logger.WARNING)
    # Load the mmdeploy TensorRT plugin library before deserialising the engine.
    ctypes.CDLL('E:/vscode_workspace/mmdeploy-1.3.1/mmdeploy/lib/mmdeploy_tensorrt_ops.dll')
    with open("E:/vscode_workspace/mmdeploy-1.3.1/work_dir/trt/mask2former/end2end.engine", "rb") as f, trt.Runtime(logger) as runtime:
        engine = runtime.deserialize_cuda_engine(f.read())
    # A single execution context is used for shape queries and for inference.
    context = engine.create_execution_context()

    image = cv2.imread('bus.jpg')
    if image is None:
        raise FileNotFoundError("could not read 'bus.jpg'")

    image_resize = resize_keep_ratio(image, resize_shape)
    resized_h, resized_w = image_resize.shape[:2]
    # Pad the bottom/right edges up to the next multiple of 32.
    pad_x, pad_y = (-resized_w) % 32, (-resized_h) % 32
    image_pad = cv2.copyMakeBorder(image_resize, 0, pad_y, 0, pad_x, cv2.BORDER_CONSTANT, value=0)

    # BGR -> RGB and HWC -> CHW, then per-channel normalisation.
    blob = image_pad[:, :, ::-1].transpose(2, 0, 1).astype(np.float32)
    mean = np.array([123.675, 116.28, 103.53], dtype=np.float32).reshape(3, 1, 1)
    std = np.array([58.395, 57.12, 57.375], dtype=np.float32).reshape(3, 1, 1)
    blob = np.ascontiguousarray(np.expand_dims((blob - mean) / std, axis=0))

    # An engine built with dynamic input dimensions reports -1 until the
    # actual input shape has been set on the context.
    if -1 in tuple(context.get_binding_shape(0)):
        if not context.set_binding_shape(0, blob.shape):
            raise RuntimeError(f"engine rejected input shape {blob.shape}")
    input_shape = tuple(context.get_binding_shape(0))
    if input_shape != blob.shape:
        raise ValueError(f"engine expects input shape {input_shape}, got {blob.shape}")

    # Binding 0 is treated as the input, bindings 1 and 2 as the class and
    # mask outputs.
    h_input = cuda.pagelocked_empty(trt.volume(context.get_binding_shape(0)), dtype=np.float32)
    h_output0 = cuda.pagelocked_empty(trt.volume(context.get_binding_shape(1)), dtype=np.float32)
    h_output1 = cuda.pagelocked_empty(trt.volume(context.get_binding_shape(2)), dtype=np.float32)
    d_input = cuda.mem_alloc(h_input.nbytes)
    d_output0 = cuda.mem_alloc(h_output0.nbytes)
    d_output1 = cuda.mem_alloc(h_output1.nbytes)
    stream = cuda.Stream()

    # Fill the page-locked host buffer with the preprocessed image; without
    # this step the engine would run on uninitialised memory.
    np.copyto(h_input, blob.ravel())

    cuda.memcpy_htod_async(d_input, h_input, stream)
    context.execute_async_v2(bindings=[int(d_input), int(d_output0), int(d_output1)], stream_handle=stream.handle)
    cuda.memcpy_dtoh_async(h_output0, d_output0, stream)
    cuda.memcpy_dtoh_async(h_output1, d_output1, stream)
    stream.synchronize()

    # The engine comes from the same ONNX graph whose outputs the ONNX Runtime
    # script indexes with [0], so the leading batch dimension is dropped here
    # as well before cropping the padded area off the mask logits.
    mask_cls = h_output0.reshape(tuple(context.get_binding_shape(1)))[0]
    mask_logits = h_output1.reshape(tuple(context.get_binding_shape(2)))[0]
    mask_logits = mask_logits[:, :resized_h, :resized_w]

    # Bilinear upsampling of every query mask to the original image size.
    mask_pred = np.zeros((mask_logits.shape[0], image.shape[0], image.shape[1]))
    for i in range(mask_pred.shape[0]):
        mask_pred[i] = cv2.resize(mask_logits[i], dsize=(image.shape[1], image.shape[0]), interpolation=cv2.INTER_LINEAR)

    _, _, instance_masks = select_instances(
        mask_cls, mask_pred, num_classes, num_things_classes, object_mask_thr)
    draw_binary_masks(image, instance_masks)