|
- #!/usr/bin/env python3
- # -*- coding: utf-8 -*-
- # @Created : 2021/04/21
- # @Author : Koala
- # @FileName: run.py
-
- import os
- from typing import List
- import pycuda.driver as cuda
- import pycuda.autoinit
- import tensorrt as trt
- import numpy as np
- import torch
- import torchvision
- import cv2
- from utils import xywh2xyxy, box_iou
-
# Module-wide TensorRT logger (INFO severity); handed to the runtime, builder
# and ONNX parser created by TensorRTRun.
TRT_LOGGER = trt.Logger(trt.Logger.INFO)

# Public API of this module: only the inference wrapper is exported.
__all__ = ['TensorRTRun']
-
-
class HostDeviceMem:
    """Pair a host-side buffer with its device-side counterpart.

    Attributes:
        host: Host buffer object (stored as given).
        device: Device buffer object (stored as given).
        shape: Optional shape associated with the buffers; ``None`` when
            not supplied.
    """

    def __init__(self, host_mem, device_mem, shape=None):
        self.host, self.device, self.shape = host_mem, device_mem, shape

    def __str__(self):
        # Same layout for str() and repr(): host first, then device.
        return f"Host:\n{self.host}\nDevice:\n{self.device}"

    __repr__ = __str__
-
-
class TensorRTRun:
    """Run an anchor-based detector through a TensorRT engine.

    Typical use: ``load_engine()`` or ``build_engine(onnx_path)``, then
    ``prepare()``, then call the instance with one or more images.

    The anchor table and box decoding follow the YOLOv5 convention
    (NOTE(review): confirm against the exported model).
    """

    def __init__(self, engine_file_path=''):
        """Store the engine path and set up decoding constants.

        Args:
            engine_file_path: Path used by ``load_engine`` to read a
                serialized engine and by ``build_engine`` to write one.
                An empty string disables saving in ``build_engine``.
        """
        self.engine_file_path = engine_file_path
        self.engine = None
        self.inputs = []
        self.outputs = []
        self.bindings = []
        self.stream = None
        self.context = None

        # stride -> anchor (w, h) pairs in pixels, one row per output head.
        anchors = {
            8: [(10, 13), (16, 30), (33, 23)],
            16: [(30, 61), (62, 45), (59, 119)],
            32: [(116, 90), (156, 198), (373, 326)],
        }
        self.strides = list(anchors.keys())
        # Anchors are stored in grid units (pixels divided by stride).
        self.anchors = torch.tensor(list(anchors.values())).float() / torch.tensor(self.strides).view(-1, 1, 1)
        self.image_size = (640, 640)
        # Placeholder grids; rebuilt lazily once the real head shape is known.
        self.grid = [torch.zeros((1, 1, 1, 1, 1)) for _ in range(len(self.strides))]
        # Per-image letterbox scale and original (H, W) of the last batch.
        self._scales = []
        self._shapes = []

    def load_engine(self):
        """Deserialize the engine stored at ``self.engine_file_path``.

        Raises:
            AssertionError: If the engine file does not exist.
            RuntimeError: If TensorRT fails to deserialize the file.
        """
        assert os.path.isfile(self.engine_file_path), \
            f"Engine file '{self.engine_file_path}' does not exist"
        with open(self.engine_file_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
            self.engine = runtime.deserialize_cuda_engine(f.read())
        if self.engine is None:
            raise RuntimeError(f"Failed to deserialize engine from '{self.engine_file_path}'")
        print(f'Load {self.engine_file_path}')

    def build_engine(self, onnx_path, fp16=True, max_batch_size=1):
        """Build an engine from an ONNX file and optionally save it.

        Args:
            onnx_path: Path of the ONNX model to parse.
            fp16: Enable FP16 mode on the builder.
            max_batch_size: Maximum batch size configured on the builder.

        Raises:
            AssertionError: If ``onnx_path`` is not an existing file.
            RuntimeError: If the ONNX file cannot be parsed or the engine
                cannot be built.
        """
        # NOTE(review): max_workspace_size / fp16_mode / build_cuda_engine are
        # legacy builder attributes; newer TensorRT releases moved them to
        # IBuilderConfig. Confirm against the installed TensorRT version.
        EXPLICIT_BATCH = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)

        assert os.path.isfile(onnx_path), f"ONNX file '{onnx_path}' is not existed"
        with trt.Builder(TRT_LOGGER) as builder, \
                builder.create_network(EXPLICIT_BATCH) as network, \
                trt.OnnxParser(network, TRT_LOGGER) as parser:
            builder.max_workspace_size = 1 << 31  # 2GiB
            builder.max_batch_size = max_batch_size
            builder.fp16_mode = fp16

            print('Loading ONNX file from path {}...'.format(onnx_path))
            with open(onnx_path, 'rb') as model:
                print('Beginning ONNX file parsing')
                if not parser.parse(model.read()):
                    print('ERROR: Failed to parse the ONNX file.')
                    for error in range(parser.num_errors):
                        print('parser.get_error(error)', parser.get_error(error))
                    # Building from a half-parsed network cannot succeed; stop here.
                    raise RuntimeError(f"Failed to parse ONNX file '{onnx_path}'")
            print('Completed parsing of ONNX file')

            self.engine = builder.build_cuda_engine(network)
            if self.engine is None:
                raise RuntimeError(f"Failed to build engine from '{onnx_path}'")
            print('Completed build engine')
            if self.engine_file_path:
                with open(self.engine_file_path, "wb") as f:
                    f.write(self.engine.serialize())
                print(f'save trt to {self.engine_file_path}!!')

    def prepare(self):
        """Allocate host/device buffers for every binding and create the context.

        Safe to call more than once: previously recorded buffers and
        bindings are discarded first.

        Raises:
            AssertionError: If no engine has been loaded or built yet.
        """
        assert self.engine is not None, "Please load/build engine before run prepare"
        # Start clean so a repeated call does not duplicate bindings.
        self.inputs = []
        self.outputs = []
        self.bindings = []
        self.stream = cuda.Stream()
        for binding in self.engine:
            shape = self.engine.get_binding_shape(binding)
            size = trt.volume(shape) * self.engine.max_batch_size
            dtype = trt.nptype(self.engine.get_binding_dtype(binding))
            # Page-locked host buffer plus a device buffer of the same byte size.
            host_mem = cuda.pagelocked_empty(size, dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)
            self.bindings.append(int(device_mem))
            # Record the buffer shape with the batch dimension set to the maximum.
            shape[0] = self.engine.max_batch_size
            mem = HostDeviceMem(host_mem, device_mem, shape)
            if self.engine.binding_is_input(binding):
                self.inputs.append(mem)
                # The last two dims of the input binding give the network (H, W).
                self.image_size = shape[-2:]
            else:
                self.outputs.append(mem)
        self.context = self.engine.create_execution_context()

    def __call__(self, *images: np.ndarray):
        """Run detection on one or more images.

        Args:
            *images: Images as ``H x W x C`` arrays; the channel axis is
                reversed during pre-processing (BGR input assumed).

        Returns:
            The list produced by ``post_process``: one ``n x 6`` array
            (x1, y1, x2, y2, conf, cls) per image, in original image pixels.

        Raises:
            AssertionError: If ``prepare`` has not been called or the batch
                exceeds the engine's maximum batch size.
        """
        assert self.context is not None, "Please run prepare before inference"
        # Pre-process the whole batch in ONE call: pre_process resets the
        # per-image scale list each time it runs, so calling it per image
        # would keep only the last image's scale.
        batch = self.pre_process(*images)
        batch_size = batch.shape[0]
        assert batch_size <= self.engine.max_batch_size
        np.copyto(self.inputs[0].host[:batch.size], batch.ravel())
        # Host -> device, inference, device -> host, all queued on one stream.
        for inp in self.inputs:
            cuda.memcpy_htod_async(inp.device, inp.host, self.stream)
        self.context.execute_async(batch_size=batch_size, bindings=self.bindings,
                                   stream_handle=self.stream.handle)
        for out in self.outputs:
            cuda.memcpy_dtoh_async(out.host, out.device, self.stream)
        self.stream.synchronize()
        # Keep only the rows that belong to the images actually submitted.
        outputs = [out.host.reshape(out.shape)[:batch_size] for out in self.outputs]
        return self.post_process(outputs)

    def pre_process(self, *args: np.ndarray):
        """Letterbox images to ``self.image_size`` and build the input batch.

        Each image is resized keeping its aspect ratio, padded with the
        value 114 on the bottom/right, converted from ``H x W x C`` to
        ``C x H x W`` with the channel order reversed, and scaled to [0, 1].
        The per-image scale and original ``(H, W)`` are stored in
        ``self._scales`` / ``self._shapes`` (replacing any earlier values).

        Args:
            *args: Images as ``H x W x C`` arrays.

        Returns:
            A contiguous float32 array of shape ``(len(args), C, H, W)``.
        """
        target_h, target_w = self.image_size[0], self.image_size[1]
        self._scales.clear()
        self._shapes.clear()
        images = []
        for img in args:
            size = img.shape[:2]  # HxW
            scale = min(target_h / size[0], target_w / size[1])
            self._scales.append(scale)
            self._shapes.append(size)
            new_w, new_h = int(round(size[1] * scale)), int(round(size[0] * scale))
            # Compare width with width and height with height so an image
            # that is already the right size is not resized needlessly.
            if (new_w, new_h) != (size[1], size[0]):
                img = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_LINEAR)
            pad_w, pad_h = target_w - new_w, target_h - new_h
            if pad_w != 0 or pad_h != 0:
                img = cv2.copyMakeBorder(img, 0, pad_h, 0, pad_w, cv2.BORDER_CONSTANT,
                                         value=(114, 114, 114))
            images.append(img)

        batch = np.stack(images, 0)
        batch = batch[:, :, :, ::-1].transpose(0, 3, 1, 2)  # BGR to RGB, NHWC to NCHW
        batch = np.ascontiguousarray(batch)
        return batch.astype(np.float32) / 255.

    def _decode(self, outputs):
        """Turn raw head outputs into one ``(N, boxes, C)`` tensor of xywh predictions."""
        decoded = []
        for i, stride in enumerate(self.strides):
            x = torch.from_numpy(outputs[i])  # type: torch.Tensor
            N, A, H, W, C = x.shape
            # Rebuild the cell-offset grid only when the head resolution changes.
            if self.grid[i].shape[2:4] != x.shape[2:4]:
                yv, xv = torch.meshgrid([torch.arange(H, device=x.device),
                                         torch.arange(W, device=x.device)])
                self.grid[i] = torch.stack((xv, yv), 2).view((1, 1, H, W, 2)).float()
            y = x.sigmoid()
            y[..., 0:2] = (y[..., 0:2] * 2. - 0.5 + self.grid[i]) * stride  # xy
            y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * (self.anchors[i, None, :, None, None, :] * stride)  # wh
            decoded.append(y.view(N, -1, C))
        return torch.cat(decoded, 1).float()

    @torch.no_grad()
    def post_process(self, outputs, conf_thres=0.25, iou_thres=0.45, agnostic=False,
                     multi_label=False, merge=False):
        """Decode head outputs and apply per-image non-maximum suppression.

        Args:
            outputs: One array per stride, each shaped ``(N, A, H, W, C)``
                with ``C = 5 + number of classes``.
            conf_thres: Minimum objectness and final confidence to keep a box.
            iou_thres: IoU threshold for NMS.
            agnostic: If true, NMS ignores class labels.
            multi_label: If true (and more than one class exists), a box may
                yield one detection per class above ``conf_thres``.
            merge: If true, replace kept boxes by a score-weighted mean of
                their overlapping boxes.

        Returns:
            A list with one ``n x 6`` numpy array per image holding
            (x1, y1, x2, y2, conf, cls); box coordinates are divided by the
            scale recorded by ``pre_process`` for that image. Images with no
            detections get an empty ``0 x 6`` array.
        """
        prediction = self._decode(outputs)

        nc = prediction.shape[2] - 5  # number of classes
        candidates = prediction[..., 4] > conf_thres

        max_wh = 4096  # (pixels) class offset used to separate classes in NMS
        max_det = 300  # maximum number of detections per image
        max_nms = 30000  # maximum number of boxes into torchvision.ops.nms()
        redundant = True  # merge-NMS only: require redundant detections
        multi_label = bool(multi_label) and nc > 1

        # One independent empty array per image (no shared object between slots).
        output = [np.zeros((0, 6)) for _ in range(prediction.shape[0])]
        for xi, x in enumerate(prediction):  # image index, image inference
            x = x[candidates[xi]]  # keep boxes passing the objectness threshold
            if not x.shape[0]:
                continue
            x[:, 5:] *= x[:, 4:5]  # conf = obj_conf * cls_conf
            box = xywh2xyxy(x[:, :4])

            # Detections matrix nx6 (xyxy, conf, cls)
            if multi_label:
                i, j = (x[:, 5:] > conf_thres).nonzero(as_tuple=False).T
                x = torch.cat((box[i], x[i, j + 5, None], j[:, None].float()), 1)
            else:  # best class only
                conf, j = x[:, 5:].max(1, keepdim=True)
                x = torch.cat((box, conf, j.float()), 1)[conf.view(-1) > conf_thres]

            n = x.shape[0]  # number of boxes
            if not n:
                continue
            if n > max_nms:  # keep only the most confident boxes
                x = x[x[:, 4].argsort(descending=True)[:max_nms]]

            # Batched NMS: offset boxes by class so different classes never overlap.
            c = x[:, 5:6] * (0 if agnostic else max_wh)
            boxes, scores = x[:, :4] + c, x[:, 4]
            i = torchvision.ops.nms(boxes, scores, iou_thres)
            if i.shape[0] > max_det:  # limit detections
                i = i[:max_det]
            if merge and (1 < n < 3E3):  # Merge NMS (boxes merged using weighted mean)
                # update boxes as boxes(i,4) = weights(i,n) * boxes(n,4)
                iou = box_iou(boxes[i], boxes) > iou_thres  # iou matrix
                weights = iou * scores[None]  # box weights
                x[i, :4] = torch.mm(weights, x[:, :4]).float() / weights.sum(1, keepdim=True)
                if redundant:
                    i = i[iou.sum(1) > 1]  # require redundancy
            output[xi] = x[i].cpu().numpy()
            # Map boxes back from letterboxed to original image coordinates.
            output[xi][:, :4] /= self._scales[xi]

        return output
|