|
- # Copyright 2022 Huawei Technologies Co., Ltd
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- # ============================================================================
- """Auxiliary utils."""
- import os
-
- import numpy as np
- from mindspore import Tensor
- from mindspore import dtype as mstype
- from mindspore import nn
- from mindspore import numpy as msnp
- from mindspore import ops
- from mindspore.ops import functional as F
-
-
def mkdir_if_missing(directory):
    """
    Create ``directory`` together with any missing parent directories.

    Nothing happens when the directory is already present.
    """
    os.makedirs(name=directory, exist_ok=True)
-
-
def xyxy2xywh(x):
    """
    Convert bounding box format from [x1, y1, x2, y2] to [x, y, w, h],
    where x, y are coordinates of center and (x1, y1), (x2, y2)
    are two opposite corners of the box.

    Args:
        x (np.array): Boxes stored along the last axis as [x1, y1, x2, y2, ...].
            Any number of leading dimensions is accepted.

    Returns:
        y (np.array): Array of the same shape as ``x`` with [x, y, w, h] in the
            first four slots of the last axis; remaining slots are zero.
            Floating inputs keep their dtype, any other dtype gives float64.
    """
    x = np.asarray(x)
    # Integer inputs would truncate half-pixel centers on assignment,
    # so the output buffer is always floating point.
    out_dtype = x.dtype if np.issubdtype(x.dtype, np.floating) else np.float64
    y = np.zeros_like(x, dtype=out_dtype)
    y[..., 0] = (x[..., 0] + x[..., 2]) / 2  # x center
    y[..., 1] = (x[..., 1] + x[..., 3]) / 2  # y center
    y[..., 2] = x[..., 2] - x[..., 0]  # width
    y[..., 3] = x[..., 3] - x[..., 1]  # height
    return y
-
-
def xywh2xyxy(x):
    """
    Convert bounding box format from [x, y, w, h] to [x1, y1, x2, y2],
    where x, y are coordinates of center, (x1, y1) is the corner with the
    smaller coordinates and (x2, y2) the corner with the larger ones.

    Args:
        x (np.array): Boxes stored along the last axis as [x, y, w, h, ...].
            Any number of leading dimensions is accepted.

    Returns:
        y (np.array): Array of the same shape as ``x`` with [x1, y1, x2, y2] in
            the first four slots of the last axis; remaining slots are zero.
            Floating inputs keep their dtype, any other dtype gives float64.
    """
    x = np.asarray(x)
    # Integer inputs would truncate the half-width/half-height offsets
    # on assignment, so the output buffer is always floating point.
    out_dtype = x.dtype if np.issubdtype(x.dtype, np.floating) else np.float64
    y = np.zeros_like(x, dtype=out_dtype)
    half_w = x[..., 2] / 2
    half_h = x[..., 3] / 2
    y[..., 0] = x[..., 0] - half_w  # x1
    y[..., 1] = x[..., 1] - half_h  # y1
    y[..., 2] = x[..., 0] + half_w  # x2
    y[..., 3] = x[..., 1] + half_h  # y2
    return y
-
-
def scale_coords(img_size, coords, img0_shape):
    """
    Rescale x1, y1, x2, y2 from the padded network input back to the original image.

    Args:
        img_size (tuple): Network input size as (width, height).
        coords (np.array): Float array whose first four columns are x1, y1, x2, y2
            in network-input pixels. It is modified in place.
        img0_shape (tuple): Original image shape as (height, width, ...).

    Returns:
        coords (np.array): The same array object, with the first four columns
            shifted by the padding, divided by the resize gain and clamped
            to be non-negative. Other columns are left untouched.
    """
    gain_w = float(img_size[0]) / img0_shape[1]
    gain_h = float(img_size[1]) / img0_shape[0]
    gain = min(gain_w, gain_h)  # the smaller ratio is used for both axes
    pad_x = (img_size[0] - img0_shape[1] * gain) / 2  # width padding
    pad_y = (img_size[1] - img0_shape[0] * gain) / 2  # height padding
    coords[:, [0, 2]] -= pad_x
    coords[:, [1, 3]] -= pad_y
    coords[:, 0:4] /= gain
    # Only a lower bound is applied. Using the data maximum as an upper bound
    # fails on empty input and, when every value is negative, makes
    # a_max < a_min so the clip returns the negative maximum instead of 0.
    coords[:, :4] = np.clip(coords[:, :4], 0, None)
    return coords
-
-
class SoftmaxCE(nn.Cell):
    """
    Masked variant of ops.SoftmaxCrossEntropyWithLogits:
        1) Labels equal to the given ignore index do not contribute to the loss.
        2) Logits and one-hot labels are reshaped to (n, C).
        3) The mean is taken over the non-ignored elements only.
    """
    def __init__(self):
        super().__init__()
        # Operations and constants used in construct
        self.soft_ce = ops.SoftmaxCrossEntropyWithLogits()
        self.expand_dim = ops.ExpandDims()
        self.transpose = ops.Transpose()
        self.reshape = ops.Reshape()
        self.one_hot = ops.OneHot()
        self.sum = ops.ReduceSum()
        self.one = Tensor(1, mstype.float32)
        self.zero = Tensor(0, mstype.float32)

        # Small constant that keeps the mean defined when every label is ignored
        self.eps = Tensor(1e-16, dtype=mstype.float32)

    def construct(self, logits, labels, ignore_index):
        """
        Calculate the masked mean softmax loss between logits and labels.

        Labels equal to ``ignore_index`` get weight 0, all others weight 1.
        """
        num_classes = F.shape(logits)[-1]

        # 1.0 where the label is valid, 0.0 where it must be ignored
        valid = (labels != ignore_index).astype('float32')

        # Flatten logits and one-hot targets to (n, C)
        flat_logits = self.reshape(logits, (-1, num_classes))
        targets = self.one_hot(labels.flatten(), num_classes, self.one, self.zero)
        targets = self.reshape(targets, (-1, num_classes))

        per_item_loss, _ = self.soft_ce(flat_logits, targets)

        # Masked sum divided by the number of valid labels
        masked_total = self.sum(per_item_loss * valid.reshape(per_item_loss.shape))
        return masked_total / (self.sum(valid) + self.eps)
-
-
def build_targets_thres(target, anchor_wh, na, ngh, ngw, k_max):
    """
    Build grid of targets confidence mask, bbox delta and id with thresholds.

    Args:
        target (np_array): Targets, one row per box. Column 1 is the id and
            columns 2-5 are (xc, yc, w, h), which are multiplied by the map
            size here. May contain zero rows.
        anchor_wh (np_array): Resized anchors for map size.
        na (int): Number of anchors.
        ngh (int): Map height.
        ngw (int): Map width.
        k_max (int): Limitation of max detections per image.

    Returns:
        tconf (np_array): Mask with bg (0), gt (1) and ign (-1) indices. Shape (na, ngh, ngw).
        tbox (np_array): Targets delta bbox values. Shape (na, ngh, ngw, 4).
        tid (np_array): Grid with id for every cell. Shape (na, ngh, ngw).
        t_indices (np_array): Flattened (ngh * ngw) positions of cells that have
            a foreground anchor with a non-negative id, truncated or
            zero-padded to length k_max. Shape (k_max,).

    """
    id_thresh = 0.5
    fg_thresh = 0.5
    bg_thresh = 0.4

    bg_id = -1  # Background id

    tbox = np.zeros((na, ngh, ngw, 4), dtype=np.float32)  # Fill grid with zeros bbox cords
    tconf = np.zeros((na, ngh, ngw), dtype=np.int32)  # Fill grid with zeros confidence
    tid = np.full((na, ngh, ngw), bg_id, dtype=np.int32)  # Fill grid with background id
    t_indices = np.zeros(k_max)

    # Without ground truths every cell is background; the IoU matching below
    # cannot run on an empty target list, so return the defaults directly.
    if len(target) == 0:
        return tconf, tbox, tid, t_indices

    t_id = target[:, 1].copy().astype(np.int32)

    # Convert relative cords for map size
    gxy, gwh = target[:, 2:4].copy(), target[:, 4:6].copy()
    gxy[:, 0] = gxy[:, 0] * ngw
    gxy[:, 1] = gxy[:, 1] * ngh
    gwh[:, 0] = gwh[:, 0] * ngw
    gwh[:, 1] = gwh[:, 1] * ngh
    gxy[:, 0] = np.clip(gxy[:, 0], a_min=0, a_max=ngw - 1)
    gxy[:, 1] = np.clip(gxy[:, 1], a_min=0, a_max=ngh - 1)

    gt_boxes = np.concatenate((gxy, gwh), axis=1)  # Shape (num of targets, 4), 4 is (xc, yc, w, h)

    # Apply anchor to each cell of the grid
    anchor_mesh = generate_anchor(ngh, ngw, anchor_wh)  # Shape (na, 4, ngh, ngw)
    anchor_list = anchor_mesh.transpose(0, 2, 3, 1).reshape(-1, 4)  # Shape (na x ngh x ngw, 4)

    # Compute anchor iou with ground truths bboxes
    iou_pdist = bbox_iou(anchor_list, gt_boxes)  # Shape (na x ngh x ngw, Ng)
    max_gt_index = iou_pdist.argmax(axis=1)  # Shape (na x ngh x ngw)
    iou_max = iou_pdist.max(axis=1)  # Shape (na x ngh x ngw)

    iou_map = iou_max.reshape(na, ngh, ngw)
    gt_index_map = max_gt_index.reshape(na, ngh, ngw)

    # Fill tconf by thresholds
    id_index = iou_map > id_thresh
    fg_index = iou_map > fg_thresh
    bg_index = iou_map < bg_thresh
    ign_index = (iou_map < fg_thresh) & (iou_map > bg_thresh)  # Search unclear cells
    tconf[fg_index] = 1
    tconf[bg_index] = 0
    tconf[ign_index] = -1  # Index to ignore unclear cells

    # Take ground truths with mask
    gt_index = gt_index_map[fg_index]
    gt_box_list = gt_boxes[gt_index]
    gt_id_list = t_id[gt_index_map[id_index]]
    if np.sum(fg_index) > 0:
        tid[id_index] = gt_id_list
        fg_anchor_list = anchor_list.reshape((na, ngh, ngw, 4))[fg_index]
        delta_target = encode_delta(gt_box_list, fg_anchor_list)
        tbox[fg_index] = delta_target

    # Indices of cells with detections
    tconf_max = tconf.max(0)
    tid_max = tid.max(0)
    indices = np.where((tconf_max.flatten() > 0) & (tid_max.flatten() >= 0))[0]

    # Keep at most k_max indices; the remaining slots stay zero
    n_kept = min(k_max, len(indices))
    t_indices[:n_kept] = indices[:n_kept]

    return tconf, tbox, tid, t_indices
-
-
def bbox_iou(box1, box2, x1y1x2y2=False):
    """
    Returns the pairwise IoU between two sets of bounding boxes.

    Args:
        box1 (np.array): First set of boxes. Shape (n, 4).
        box2 (np.array): Second set of boxes. Shape (m, 4).
        x1y1x2y2 (bool): If True boxes are (x1, y1, x2, y2),
            otherwise (xc, yc, w, h).

    Returns:
        iou (np.array): IoU of every box1/box2 pair. Shape (n, m).
    """
    if x1y1x2y2:
        # Get the coordinates of bounding boxes
        b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3]
        b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3]
    else:
        # Transform from center and width to exact coordinates
        b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2
        b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2
        b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2
        b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2

    # Get the coordinates of the intersection rectangle, shape (n, m)
    inter_rect_x1 = np.maximum(b1_x1[:, None], b2_x1)
    inter_rect_y1 = np.maximum(b1_y1[:, None], b2_y1)
    inter_rect_x2 = np.minimum(b1_x2[:, None], b2_x2)
    inter_rect_y2 = np.minimum(b1_y2[:, None], b2_y2)

    # Intersection area. Negative extents mean "no overlap" and are clamped
    # to zero with a plain lower bound: clipping against the data maximum
    # breaks when that maximum is itself negative (two negative extents
    # would multiply into a positive area) and raises on empty input.
    inter_w = np.maximum(inter_rect_x2 - inter_rect_x1, 0)
    inter_h = np.maximum(inter_rect_y2 - inter_rect_y1, 0)
    inter_area = inter_w * inter_h

    # Union Area
    b1_area = ((b1_x2 - b1_x1) * (b1_y2 - b1_y1))[:, None]  # Shape (n, 1)
    b2_area = ((b2_x2 - b2_x1) * (b2_y2 - b2_y1))[None, :]  # Shape (1, m)

    # The small constant prevents division by zero for degenerate boxes
    return inter_area / (b1_area + b2_area - inter_area + 1e-16)
-
-
def generate_anchor(ngh, ngw, anchor_wh):
    """
    Place every anchor shape on every cell of the grid.

    Returns an array of shape (na, 4, ngh, ngw) whose channels are
    (cell x index, cell y index, anchor width, anchor height).
    """
    na = len(anchor_wh)
    rows, cols = np.meshgrid(np.arange(ngh), np.arange(ngw), indexing='ij')

    # Cell coordinates, shared by all anchors: Shape na, 2, ngh, ngw
    cell_xy = np.stack([cols, rows], axis=0).astype(np.float32)
    cell_xy = np.broadcast_to(cell_xy[np.newaxis], (na,) + cell_xy.shape)

    # Anchor sizes, shared by all cells: Shape na, 2, ngh, ngw
    sizes = np.expand_dims(np.expand_dims(anchor_wh, -1), -1)
    sizes = np.broadcast_to(sizes, sizes.shape[:2] + (ngh, ngw))

    # Shape na, 4, ngh, ngw
    return np.concatenate((cell_xy, sizes), axis=1)
-
-
def encode_delta(gt_box_list, fg_anchor_list):
    """
    Encode ground truth boxes as deltas relative to their anchors.

    Both inputs hold (x, y, w, h) in their first four columns. The result has
    shape (N, 4) with columns (dx, dy, dw, dh): the center offset divided by
    the anchor size and the log of the size ratio.
    """
    anchor_xy, anchor_wh = fg_anchor_list[:, 0:2], fg_anchor_list[:, 2:4]
    gt_xy, gt_wh = gt_box_list[:, 0:2], gt_box_list[:, 2:4]

    center_delta = (gt_xy - anchor_xy) / anchor_wh
    size_delta = np.log(gt_wh / anchor_wh)

    return np.concatenate([center_delta, size_delta], axis=1)
-
-
def create_grids(anchors, img_size, ngw):
    """
    Resize anchors according to image size and feature map size.

    Note:
        The stride is derived from the width only (img_size[0] / ngw), which
        presumes the feature map keeps the aspect ratio of the image.

    Returns:
        anchor_vec (np.array): Anchors divided by the stride.
        stride (float): Image width divided by feature map width.
    """
    stride = img_size[0] / ngw
    return np.array(anchors) / stride, stride
-
-
def build_thresholds(
        labels,
        anchor_vec_s,
        anchor_vec_m,
        anchor_vec_b,
        k_max,
):
    """
    Build targets for the three feature map sizes.

    Each level uses 4 anchors; the map sizes (height, width) are
    19x34, 38x68 and 76x136. The result is a tuple with one
    build_targets_thres output per level, in that order.
    """
    num_anchors = 4
    levels = (
        (anchor_vec_s, 19, 34),
        (anchor_vec_m, 38, 68),
        (anchor_vec_b, 76, 136),
    )

    return tuple(
        build_targets_thres(labels, anchor_vec, num_anchors, map_h, map_w, k_max)
        for anchor_vec, map_h, map_w in levels
    )
-
-
def create_anchors_vec(anchors, img_size=(1088, 608)):
    """
    Create anchor vectors and strides for every feature map size.

    Anchors 8-11 go to the map of width 34, anchors 4-7 to the map of
    width 68 and anchors 0-3 to the map of width 136.

    Returns:
        anchors (tuple): Resized anchors for the three maps, in the order above.
        strides (tuple): Matching strides.
    """
    plan = (
        (anchors[8:12], 34),
        (anchors[4:8], 68),
        (anchors[0:4], 136),
    )
    per_level = [create_grids(group, img_size, map_w) for group, map_w in plan]

    anchor_vecs = tuple(vec for vec, _ in per_level)
    strides = tuple(stride for _, stride in per_level)

    return anchor_vecs, strides
-
-
class DecodeDeltaMap(nn.Cell):
    """
    Turns predicted deltas into boxes.

    The network predicts offsets relative to base anchors; this cell
    combines them with the anchors placed on every grid cell.
    """
    def __init__(self):
        super().__init__()
        self.exp = ops.operations.Exp()
        self.stack0 = ops.Stack(axis=0)
        self.stack1 = ops.Stack(axis=1)
        self.expand_dims = ops.ExpandDims()
        self.reshape = ops.Reshape()
        self.concat = ops.Concat(axis=2)

    def construct(self, delta_map, anchors):
        """
        Decode bbox deltas and combine them with the anchors.

        ``delta_map`` has shape (nb, na, ngh, ngw, 4); the result has the
        same shape and holds (x, y, w, h) per anchor and cell.
        """
        nb, na, ngh, ngw, _ = delta_map.shape
        anchors = anchors.astype('float32')

        grid_y, grid_x = msnp.meshgrid(msnp.arange(ngh), msnp.arange(ngw), indexing='ij')

        # Cell coordinates repeated for every image and anchor
        cell_xy = self.stack0([grid_x, grid_y]).astype('float32')  # Shape (2, ngh, ngw)
        cell_xy = msnp.tile(self.expand_dims(cell_xy, 0), (nb, na, 1, 1, 1))  # Shape (nb, na, 2, ngh, ngw)

        # Anchor sizes repeated for every image and cell
        anchor_wh = self.expand_dims(self.expand_dims(anchors, -1), -1)  # Shape (na, 2, 1, 1)
        anchor_wh = msnp.tile(anchor_wh, (nb, 1, 1, ngh, ngw))  # Shape (nb, na, 2, ngh, ngw)

        anchor_boxes = self.concat((cell_xy, anchor_wh))  # Shape (nb, na, 4, ngh, ngw)
        anchor_boxes = anchor_boxes.transpose(0, 1, 3, 4, 2)
        anchor_boxes = anchor_boxes.reshape(-1, 4)

        deltas = delta_map.reshape(-1, 4)

        anchor_x, anchor_y = anchor_boxes[:, 0], anchor_boxes[:, 1]
        anchor_w, anchor_h = anchor_boxes[:, 2], anchor_boxes[:, 3]

        # Centers are shifted by delta * anchor size, sizes are scaled by exp(delta)
        box_x = anchor_w * deltas[:, 0] + anchor_x
        box_y = anchor_h * deltas[:, 1] + anchor_y
        box_w = anchor_w * self.exp(deltas[:, 2])
        box_h = anchor_h * self.exp(deltas[:, 3])

        boxes = self.stack1([box_x, box_y, box_w, box_h])

        return boxes.reshape(nb, na, ngh, ngw, 4)
-
-
def non_max_suppression(prediction, conf_thres=0.5, nms_thres=0.4):
    """
    Drop detections whose object confidence is not above 'conf_thres',
    then apply Non-Maximum Suppression to the remaining ones.

    Args:
        prediction (np.array): All predictions from model output, one entry
            per image. Columns 0-3 are (center x, center y, width, height)
            and column 4 is the object confidence.
        conf_thres (float): Threshold for confidence.
        nms_thres (float): Threshold for iou into nms.

    Returns:
        output (list): One entry per image: None when nothing passes the
            confidence threshold, otherwise the kept rows with the first
            four columns converted to (x1, y1, x2, y2).
    """
    output = [None] * len(prediction)

    for image_i, pred in enumerate(prediction):
        # Keep only rows above the confidence threshold (this makes a copy)
        pred = pred[pred[:, 4] > conf_thres]

        # If none are remaining => process next image
        if pred.shape[0] == 0:
            continue

        # From (center x, center y, width, height) to (x1, y1, x2, y2)
        pred[:, :4] = xywh2xyxy(pred[:, :4])

        # Non-maximum suppression on (x1, y1, x2, y2, confidence)
        kept_rows = nms(pred[:, :5], nms_thres)
        det_max = pred[kept_rows]

        if det_max.size > 0:
            output[image_i] = det_max

    return output
-
-
def nms(dets, thresh):
    """
    Greedy non-maximum suppression.

    ``dets`` holds (x1, y1, x2, y2, score) per row. Boxes are visited in
    order of decreasing score; a box is dropped when its overlap with an
    already kept box exceeds ``thresh``. Returns the list of kept row indices.
    """
    x1, y1, x2, y2, scores = (dets[:, col] for col in range(5))

    # Extents are treated as inclusive pixel ranges, hence the +1
    areas = (x2 - x1 + 1) * (y2 - y1 + 1)
    order = scores.argsort()[::-1]

    keep = []
    while order.size > 0:
        best, rest = order[0], order[1:]
        keep.append(best)

        # Overlap of the best box with every remaining candidate
        overlap_w = np.maximum(0.0, np.minimum(x2[best], x2[rest]) - np.maximum(x1[best], x1[rest]) + 1)
        overlap_h = np.maximum(0.0, np.minimum(y2[best], y2[rest]) - np.maximum(y1[best], y1[rest]) + 1)
        inter = overlap_w * overlap_h
        iou = inter / (areas[best] + areas[rest] - inter)

        # Continue with the candidates that overlap little enough
        order = rest[iou <= thresh]

    return keep
-
-
def ap_per_class(tp, conf, pred_cls, target_cls):
    """
    Computes per-class average precision, recall and precision.
    Method originally from https://github.com/rafaelpadilla/Object-Detection-Metrics.

    Args:
        tp (list): True positives.
        conf (list): Objectness value from 0-1.
        pred_cls (np.array): Predicted object classes.
        target_cls (np.array): True object classes.

    Returns:
        ap (np.array): The average precision as computed in py-faster-rcnn.
        unique classes (np.array): Classes found in predictions or targets, as int32.
        r (np.array): Recall.
        p (np.array): Precision.
    """
    tp, conf, pred_cls, target_cls = (
        np.array(item) for item in (tp, conf, pred_cls, target_cls)
    )

    # Highest objectness first
    ranking = np.argsort(-conf)
    tp, conf, pred_cls = tp[ranking], conf[ranking], pred_cls[ranking]

    unique_classes = np.unique(np.concatenate((pred_cls, target_cls), 0))

    ap, p, r = [], [], []
    for c in unique_classes:
        is_pred = pred_cls == c
        n_gt = sum(target_cls == c)  # Number of ground truth objects
        n_p = sum(is_pred)  # Number of predicted objects

        if n_p == 0 and n_gt == 0:
            continue

        # A class missing on either side scores zero everywhere
        if n_p == 0 or n_gt == 0:
            ap.append(0)
            r.append(0)
            p.append(0)
            continue

        # Running counts of true and false positives
        tpc = np.cumsum(tp[is_pred])
        fpc = np.cumsum(1 - tp[is_pred])

        recall_curve = tpc / (n_gt + 1e-16)
        precision_curve = tpc / (tpc + fpc)

        # The final point of each curve is reported as the class value
        r.append(tpc[-1] / (n_gt + 1e-16))
        p.append(tpc[-1] / (tpc[-1] + fpc[-1]))

        ap.append(compute_ap(recall_curve, precision_curve))

    return np.array(ap), unique_classes.astype('int32'), np.array(r), np.array(p)
-
-
def compute_ap(recall, precision):
    """
    Computes the average precision, given the recall and precision curves.
    Code originally from https://github.com/rbgirshick/py-faster-rcnn.

    Args:
        recall (list): The recall curve.
        precision (list): The precision curve.

    Returns:
        ap (np.array): The average precision as computed in py-faster-rcnn.
    """
    # Sentinel values at both ends of the curves
    mrec = np.concatenate(([0.], recall, [1.]))
    mpre = np.concatenate(([0.], precision, [0.]))

    # Precision envelope: running maximum taken from the right
    mpre = np.maximum.accumulate(mpre[::-1])[::-1]

    # Positions where recall changes value
    steps = np.flatnonzero(mrec[1:] != mrec[:-1])

    # Area under the PR curve: sum of (\Delta recall) * prec
    return np.sum((mrec[steps + 1] - mrec[steps]) * mpre[steps + 1])
|