|
- from math import floor
-
- import numpy as np
- from threading import Thread
- from collections import namedtuple
- import torch
- import torch.nn.functional as F
- from torch import sigmoid
-
- import training.models as mdl
- from appSiamFC.app_utils import get_sequence, make_gaussian_map
- import utils.image_utils as imutils
- from utils.tensor_conv import numpy_to_torch_var, torch_var_to_numpy
-
# Run inference on the GPU when one is available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Prefer the fast libjpeg-turbo decode path when the library is present.
FLAG = 'fast' if imutils.LIBJPEG_TURBO_PRESENT else 'safe'
img_read_fcn = imutils.get_decode_jpeg_fcn(flag=FLAG)
img_resize_fcn = imutils.get_resize_fcn(flag='fast')
-
# One unit of work handed from the producer thread to the consumer (display):
# the similarity score map, the current frame, the reference crop, a validity
# flag, the frame's name/path, and the (possibly None) bounding box.
BufferElement = namedtuple(
    'BufferElement',
    ['score_map', 'img', 'ref_img', 'visible', 'name', 'bbox'],
)
-
-
- class ProducerThread(Thread):
- """
- """
-
- def __init__(self, seq, buffer, dataset_path, model_path, set_type='train',
- max_res=800, branch_arch='alexnet', ctx_mode='max'):
- """
- Args:
- seq: (int) The number of the sequence according to the get_sequence
- function, which mirrors the indexing of the ImageNetVID class.
- buffer: (queue.Queue) The data buffer between the producerThread and
- the consumer application (the display). The elements stored in
- this buffer are defined by the BufferElement namedtuple.
- dataset_path: (string) The path to the root of the ImageNet dataset.
- model_path: (string) The path to the models .pth.tar file containing
- the model's weights.
- set_type: (string) The subset of the ImageNet VID dataset, can be
- 'train' or 'val'.
- max_res: (int) The maximum resolution in pixels. If any dimension
- of the image exceeds this value, the final image published by
- the producer is resized (keeping the aspect ratio). Used to
- balance the load between the consumer (main) thread and the
- producer.
- branch_arch: (string) The architecture of the branch of the siamese
- net. Might be: 'alexnet', 'vgg11_5c'.
- ctx_mode: (string) The strategy used to define the context region
- around the target, using the bounding box dimensions. The 'max'
- mode uses the biggest dimension, while the 'mean' mode uses the
- mean of the dimensions.
- """
- super(ProducerThread, self).__init__(daemon=True)
- self.frames, self.bboxes_norm, self.valid_frames, self.vid_dims = (
- get_sequence(seq, dataset_path, set_type=set_type))
- self.idx = 0
- self.seq_size = len(self.frames)
- self.buffer = buffer
- # TODO put the model info inside the checkpoint file.
- if branch_arch == 'alexnet':
- self.net = mdl.SiameseNet(mdl.BaselineEmbeddingNet(), stride=4)
- elif branch_arch == 'vgg11_5c':
- self.net = mdl.SiameseNet(mdl.VGG11EmbeddingNet_5c(), stride=4)
- elif branch_arch == "vgg16_8c":
- self.net = mdl.SiameseNet(mdl.VGG16EmbeddingNet_8c(), stride=4)
- checkpoint = torch.load(model_path, map_location=lambda storage, loc: storage)
- self.net.load_state_dict(checkpoint['state_dict'])
- # Tuple of (H, w), the dimensions to which the image will be resized.
- self.resize_dims = None
- self.net = self.net.to(device)
- self.net.eval()
- self.ref, self.ref_emb = self.make_ref(ctx_mode=ctx_mode)
-
- @torch.no_grad()
- def run(self):
- """ The main loop of the Thread. It processes sequentially each frame
- of the specified sequence and publishes the results to the main thread
- through their shared buffer. When it finishes all the frames it sends
- a signal to the buffer indicating it is done and waits for the main
- thread to finish (because daemon=True it dies along with the main thread).
- """
- while self.idx < self.seq_size:
- dims = self.vid_dims
- if self.resize_dims is not None:
- img = img_read_fcn(self.frames[self.idx])
- img = img_resize_fcn(img, self.resize_dims, interp='bilinear')
- dims = self.resize_dims
- if self.valid_frames[self.idx]:
- bbox = self.denorm_bbox(self.bboxes_norm[self.idx], dims)
- else:
- bbox = None
- score_map = self.make_score_map(img)
- data = BufferElement(score_map,
- img,
- self.ref,
- self.valid_frames[self.idx],
- self.frames[self.idx],
- bbox)
- self.buffer.put(data)
- self.idx += 1
- print("ProducerThread finished publishing the data")
- # Publish a None to sinalize to the consumer that the stream has finished
- self.buffer.put(None)
-
- def denorm_bbox(self, bbox_norm, img_dims):
- """ Denormalizes the bounding box, taking it from its relative values to
- the pixel values in the full image with dimension img_dims.
-
- Args:
- bbox_norm: (list) The normalized bounding boxes, with 4 values that
- represent respectively, the x and y dimensions of the upper-left
- corner, and the width and height of the bounding boxes. All values
- are normalized by the full image's dimensions, so they are
- invariant to resizes of the image.
- img_dims: (tuple) The dimensions of the current image, in the form
- (Height, Width).
- Returns:
- bbox: (tuple) The bounding box in pixel terms, corresponding to the
- correct dimensions for an image with the given dimensions.
- """
- bbox = bbox_norm[:]
- bbox[0] = int(bbox[0]*img_dims[1])
- bbox[1] = int(bbox[1]*img_dims[0])
- bbox[2] = int(floor(bbox[2]*img_dims[1]))
- bbox[3] = int(floor(bbox[3]*img_dims[0]))
- return tuple(bbox)
-
- @torch.no_grad()
- def make_ref(self, ctx_mode='max'):
- """ Extracts the reference image and its embedding.
-
- Args:
- ctx_mode: (str) The method used to define the context region around
- the target, options are ['max', 'mean'], where 'max' simply takes
- the largest of the two dimensions of the bounding box and mean
- takes the mean.
- """
- # Get the first valid frame index
- ref_idx = self.valid_frames.index(True)
- ref_frame = img_read_fcn(self.frames[ref_idx])
- bbox = self.denorm_bbox(self.bboxes_norm[ref_idx], self.vid_dims)
- if ctx_mode == 'max':
- ctx_size = max(bbox[2], bbox[3])
- elif ctx_mode == 'mean':
- ctx_size = int((bbox[2] + bbox[3])/2)
- # It resizes the image so that the reference image has dimensions 127x127
- if ctx_size != 127:
|