|
- from math import floor
-
- import numpy as np
- from threading import Thread
- from collections import namedtuple
- import torch
- import torch.nn.functional as F
- from torch import sigmoid
-
- import training.models as mdl
- from appSiamFC.app_utils import get_sequence, make_gaussian_map
- import utils.image_utils as imutils
- from utils.tensor_conv import numpy_to_torch_var, torch_var_to_numpy
-
# Run inference on the GPU when one is available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Prefer the fast libjpeg-turbo decode path when the library is present.
FLAG = 'fast' if imutils.LIBJPEG_TURBO_PRESENT else 'safe'
img_read_fcn = imutils.get_decode_jpeg_fcn(flag=FLAG)
img_resize_fcn = imutils.get_resize_fcn(flag='fast')
-
# One unit of work handed from the producer thread to the consumer (display):
# the similarity score map, the current frame, the reference crop, a validity
# flag, the frame's name/path, and the (possibly None) bounding box.
BufferElement = namedtuple(
    'BufferElement',
    ['score_map', 'img', 'ref_img', 'visible', 'name', 'bbox'],
)
-
-
- class ProducerThread(Thread):
- """
- """
-
- def __init__(self, seq, buffer, dataset_path, model_path, set_type='train',
- max_res=800, branch_arch='alexnet', ctx_mode='max'):
- """
- Args:
- seq: (int) The number of the sequence according to the get_sequence
- function, which mirrors the indexing of the ImageNetVID class.
- buffer: (queue.Queue) The data buffer between the producerThread and
- the consumer application (the display). The elements stored in
- this buffer are defined by the BufferElement namedtuple.
- dataset_path: (string) The path to the root of the ImageNet dataset.
- model_path: (string) The path to the models .pth.tar file containing
- the model's weights.
- set_type: (string) The subset of the ImageNet VID dataset, can be
- 'train' or 'val'.
- max_res: (int) The maximum resolution in pixels. If any dimension
- of the image exceeds this value, the final image published by
- the producer is resized (keeping the aspect ratio). Used to
- balance the load between the consumer (main) thread and the
- producer.
- branch_arch: (string) The architecture of the branch of the siamese
- net. Might be: 'alexnet', 'vgg11_5c'.
- ctx_mode: (string) The strategy used to define the context region
- around the target, using the bounding box dimensions. The 'max'
- mode uses the biggest dimension, while the 'mean' mode uses the
- mean of the dimensions.
- """
- super(ProducerThread, self).__init__(daemon=True)
- self.frames, self.bboxes_norm, self.valid_frames, self.vid_dims = (
- get_sequence(seq, dataset_path, set_type=set_type))
- self.idx = 0
- self.seq_size = len(self.frames)
- self.buffer = buffer
- # TODO put the model info inside the checkpoint file.
- if branch_arch == 'alexnet':
- self.net = mdl.SiameseNet(mdl.BaselineEmbeddingNet(), stride=4)
- elif branch_arch == 'vgg11_5c':
- self.net = mdl.SiameseNet(mdl.VGG11EmbeddingNet_5c(), stride=4)
- elif branch_arch == "vgg16_8c":
- self.net = mdl.SiameseNet(mdl.VGG16EmbeddingNet_8c(), stride=4)
- checkpoint = torch.load(model_path, map_location=lambda storage, loc: storage)
- self.net.load_state_dict(checkpoint['state_dict'])
- # Tuple of (H, w), the dimensions to which the image will be resized.
- self.resize_dims = None
- self.net = self.net.to(device)
- self.net.eval()
- self.ref, self.ref_emb = self.make_ref(ctx_mode=ctx_mode)
-
- @torch.no_grad()
- def run(self):
- """ The main loop of the Thread. It processes sequentially each frame
- of the specified sequence and publishes the results to the main thread
- through their shared buffer. When it finishes all the frames it sends
- a signal to the buffer indicating it is done and waits for the main
- thread to finish (because daemon=True it dies along with the main thread).
- """
- while self.idx < self.seq_size:
- dims = self.vid_dims
- if self.resize_dims is not None:
- img = img_read_fcn(self.frames[self.idx])
- img = img_resize_fcn(img, self.resize_dims, interp='bilinear')
- dims = self.resize_dims
- if self.valid_frames[self.idx]:
- bbox = self.denorm_bbox(self.bboxes_norm[self.idx], dims)
- else:
- bbox = None
- score_map = self.make_score_map(img)
- data = BufferElement(score_map,
- img,
- self.ref,
- self.valid_frames[self.idx],
- self.frames[self.idx],
- bbox)
- self.buffer.put(data)
- self.idx += 1
- print("ProducerThread finished publishing the data")
- # Publish a None to sinalize to the consumer that the stream has finished
- self.buffer.put(None)
-
- def denorm_bbox(self, bbox_norm, img_dims):
- """ Denormalizes the bounding box, taking it from its relative values to
- the pixel values in the full image with dimension img_dims.
-
- Args:
- bbox_norm: (list) The normalized bounding boxes, with 4 values that
- represent respectively, the x and y dimensions of the upper-left
- corner, and the width and height of the bounding boxes. All values
- are normalized by the full image's dimensions, so they are
- invariant to resizes of the image.
- img_dims: (tuple) The dimensions of the current image, in the form
- (Height, Width).
- Returns:
- bbox: (tuple) The bounding box in pixel terms, corresponding to the
- correct dimensions for an image with the given dimensions.
- """
- bbox = bbox_norm[:]
- bbox[0] = int(bbox[0]*img_dims[1])
- bbox[1] = int(bbox[1]*img_dims[0])
- bbox[2] = int(floor(bbox[2]*img_dims[1]))
- bbox[3] = int(floor(bbox[3]*img_dims[0]))
- return tuple(bbox)
-
- @torch.no_grad()
- def make_ref(self, ctx_mode='max'):
- """ Extracts the reference image and its embedding.
-
- Args:
- ctx_mode: (str) The method used to define the context region around
- the target, options are ['max', 'mean'], where 'max' simply takes
- the largest of the two dimensions of the bounding box and mean
- takes the mean.
- """
- # Get the first valid frame index
- ref_idx = self.valid_frames.index(True)
- ref_frame = img_read_fcn(self.frames[ref_idx])
- bbox = self.denorm_bbox(self.bboxes_norm[ref_idx], self.vid_dims)
- if ctx_mode == 'max':
- ctx_size = max(bbox[2], bbox[3])
- elif ctx_mode == 'mean':
- ctx_size = int((bbox[2] + bbox[3])/2)
- # It resizes the image so that the reference image has dimensions 127x127
- if ctx_size != 127:
|