windandwine
/
YOLO_v3_tensorflow

 
			
							# coding: utf-8

from __future__ import division, print_function
import sys
from utils.data_aug import *
import random
import tensorflow as tf
from setting import train_args as args

"""
数据解析相关
"""

PY_VERSION = sys.version_info[0]
iter_cnt = 0


def parse_line(line):
    """
    解析每行(COCO数据集格式)
    :param line: 格式: line_idx File_name x1 y1 w1 h1 label x2 y2 w2 h2 label x3 y3 w3 h3 label ...
    :return:
        line_idx:行数,
        pic_path: 图片路径
        boxes: [N, 4], N 是GT数量, 4为[x_min, y_min, x_max, y_max]
        labels: [N]. 类别id.
        img_width, img_height: 图片大小
    """
    if 'str' not in str(type(line)):
        line = line.decode()
    s = line.strip().split(' ')
    assert len(s) > 8, '一个图片至少有一个bbox, 检查标注'
    line_idx = int(s[0])
    pic_path = s[1]
    img_width = int(s[2])
    img_height = int(s[3])
    s = s[4:]
    assert len(s) % 5 == 0, 'bbox至少有5个值, 4-坐标和1-类别, 检查标注'
    box_cnt = len(s) // 5
    boxes = []
    labels = []
    for i in range(box_cnt):
        label, x_min, y_min, x_max, y_max \
            = int(s[i * 5]), float(s[i * 5 + 1]), float(s[i * 5 + 2]), float(s[i * 5 + 3]), float(s[i * 5 + 4])
        boxes.append([x_min, y_min, x_max, y_max])
        labels.append(label)
    boxes = np.asarray(boxes, np.float32)
    labels = np.asarray(labels, np.int64)
    return line_idx, pic_path, boxes, labels, img_width, img_height


def process_box(boxes, labels, img_size, class_num, anchors):
    """
    生成 y_true label, 也就是gt在三种不同维度维度上的feature_maps
    :param boxes: float32, [N, 5] x_min, y_min, x_max, y_mix, mixup_weight(混合程度).
    :param labels: int64 [N] shape
    :param img_size:
    :param class_num: int64 num.
    :param anchors: [9, 2] float32
    :return:
    """
    anchors_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
    # 改变boxes格式: [N, 2], 得到中心点相对值坐标(x_center, y_center), 缩放无影响
    box_centers = (boxes[:, 0:2] + boxes[:, 2:4]) / 2
    # (width, height)
    box_sizes = boxes[:, 2:4] - boxes[:, 0:2]

    # 416*416举例, 416/31=13: [13, 13, 3, 5+num_class+1]=[13, 13, 3, 86]
    # 5是坐标和类别标签, 1是mix_up weight(混合程度)
    y_true_13 = np.zeros((img_size[1] // 32, img_size[0] // 32, 3, 6 + class_num), np.float32)
    y_true_26 = np.zeros((img_size[1] // 16, img_size[0] // 16, 3, 6 + class_num), np.float32)
    y_true_52 = np.zeros((img_size[1] // 8, img_size[0] // 8, 3, 6 + class_num), np.float32)

    # mix_up weight默认为1.
    y_true_13[..., -1] = 1.
    y_true_26[..., -1] = 1.
    y_true_52[..., -1] = 1.

    y_true = [y_true_13, y_true_26, y_true_52]

    # [N, 2]-->[N, 1, 2]
    box_sizes = np.expand_dims(box_sizes, 1)
    # 广播: [N, 1, 2] & [9, 2] ==> [N, 9, 2]
    mins = np.maximum(- box_sizes / 2, - anchors / 2)
    maxs = np.minimum(box_sizes / 2, anchors / 2)
    # [N, 9, 2]
    whs = maxs - mins
    # [N, 9] IoU
    iou = (whs[:, :, 0] * whs[:, :, 1]) / (
                box_sizes[:, :, 0] * box_sizes[:, :, 1] + anchors[:, 0] * anchors[:, 1] - whs[:, :, 0] * whs[:, :,
                                                                                                         1] + 1e-10)
    # [N]
    best_match_idx = np.argmax(iou, axis=1)

    ratio_dict = {1.: 8., 2.: 16., 3.: 32.}
    for i, idx in enumerate(best_match_idx):
        # idx: 0,1,2 ==> 2; 3,4,5 ==> 1; 6,7,8 ==> 0
        feature_map_group = 2 - idx // 3
        # scale ratio: 0,1,2 ==> 8; 3,4,5 ==> 16; 6,7,8 ==> 32
        ratio = ratio_dict[np.ceil((idx + 1) / 3.)]
        x = int(np.floor(box_centers[i, 0] / ratio))
        y = int(np.floor(box_centers[i, 1] / ratio))
        k = anchors_mask[feature_map_group].index(idx)
        c = labels[i]
        y_true[feature_map_group][y, x, k, :2] = box_centers[i]
        y_true[feature_map_group][y, x, k, 2:4] = box_sizes[i]
        y_true[feature_map_group][y, x, k, 4] = 1.
        y_true[feature_map_group][y, x, k, 5 + c] = 1.
        y_true[feature_map_group][y, x, k, -1] = boxes[i, -1]

    return y_true_13, y_true_26, y_true_52


def parse_data(line, class_num, img_size, anchors, mode, use_letterbox_resize):
    """
    解析每行数据到y_true
    :param line:
    :param class_num:
    :param img_size:  [width, height]
    :param anchors:
    :param mode: is training
    :param use_letterbox_resize: 是否应用 letterbox resize
    :return:
    """
    # 如果一条，则直接解析
    if not isinstance(line, list):
        img_idx, pic_path, boxes, labels, _, _ = parse_line(line)
        img = cv2.imread(pic_path)
        # expand the 2nd dimension, mix up weight default to 1.
        boxes = np.concatenate((boxes, np.full(shape=(boxes.shape[0], 1), fill_value=1., dtype=np.float32)), axis=-1)
    # 如果两条，则mix up混合
    else:
        _, pic_path1, boxes1, labels1, _, _ = parse_line(line[0])
        img1 = cv2.imread(pic_path1)
        img_idx, pic_path2, boxes2, labels2, _, _ = parse_line(line[1])
        img2 = cv2.imread(pic_path2)

        img, boxes = mix_up(img1, img2, boxes1, boxes2)
        labels = np.concatenate((labels1, labels2))

    # 如果train, 对解析到的img应用各种tricks(bbox随之调整)
    if mode == 'train':
        # 扭曲图片
        img = random_color_distort(img)

        # 50%几率，应用随机放大
        if np.random.uniform(0, 1) > 0.5:
            img, boxes = random_expand(img, boxes, 4)

        # 随机裁剪
        h, w, _ = img.shape
        boxes, crop = random_crop_with_constraints(boxes, (w, h))
        x0, y0, w, h = crop
        img = img[y0: y0 + h, x0: x0 + w]

        # 调整图片大小
        h, w, _ = img.shape
        interp = np.random.randint(0, 5)
        img, boxes = resize_with_bbox(
            img, boxes, img_size[0], img_size[1], interp=interp, letterbox=use_letterbox_resize
        )

        # 随机滑动
        h, w, _ = img.shape
        img, boxes = random_flip(img, boxes, px=0.5)
    # 否则直接将图片调整到输入大小
    else:
        img, boxes = resize_with_bbox(
            img, boxes, img_size[0], img_size[1], interp=1, letterbox=use_letterbox_resize
        )

    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB).astype(np.float32)
    img = img / 255.  # v3要求值归一化[0, 255]-->[0, 1]
    # 得到在三种维度上的gt
    y_true_13, y_true_26, y_true_52 = process_box(boxes, labels, img_size, class_num, anchors)
    return img_idx, img, y_true_13, y_true_26, y_true_52


def get_batch_data(batch_line, class_num, img_size, anchors, mode, multi_scale=False, mix_up=False,
                   letterbox_resize=True, interval=10):
    """
    获得批数据(imgs和labels)
    :param batch_line: batch数量的line
    :param class_num: 类别数
    :param img_size: 416*416
    :param anchors: anchors. shape=[9, 2].
    :param mode: train或val.如果是train, 应用data augmentation
    :param multi_scale: 是否multi_scale training, 图片大小 32*[10->20]也就是[320, 320]->[640, 640]，mode=train有效
    :param mix_up:
    :param letterbox_resize: 是否letterbox resize, i.e., keep the original aspect ratio in the resized image.
    :param interval: change the scale of image every interval batches. Note that it's indeterministic because of the multi threading.
    :return:
    """
    global iter_cnt
    # multi_scale 训练
    if multi_scale and mode == 'train':
        random.seed(iter_cnt // interval)
        random_img_size = [[x * 32, x * 32] for x in range(10, 20)]
        img_size = random.sample(random_img_size, 1)[0]
    iter_cnt += 1

    img_idx_batch, img_batch, y_true_13_batch, y_true_26_batch, y_true_52_batch = [], [], [], [], []

    # train且mix up
    if mix_up and mode == 'train':
        mix_lines = []
        batch_line = batch_line.tolist()
        for idx, line in enumerate(batch_line):
            if np.random.uniform(0, 1) < 0.5:
                # 在当前batch中获取另一个line
                mix_lines.append([line, random.sample(batch_line[:idx] + batch_line[idx + 1:], 1)[0]])
            else:
                mix_lines.append(line)
        batch_line = mix_lines

    for line in batch_line:
        img_idx, img, y_true_13, y_true_26, y_true_52 = parse_data(
            line, class_num, img_size, anchors, mode, letterbox_resize
        )
        img_idx_batch.append(img_idx)
        img_batch.append(img)
        y_true_13_batch.append(y_true_13)
        y_true_26_batch.append(y_true_26)
        y_true_52_batch.append(y_true_52)

    img_idx_batch, img_batch, y_true_13_batch, y_true_26_batch, y_true_52_batch =\
        np.asarray(img_idx_batch, np.int64), \
        np.asarray(img_batch), \
        np.asarray(y_true_13_batch), \
        np.asarray(y_true_26_batch), \
        np.asarray(y_true_52_batch)

    return img_idx_batch, img_batch, y_true_13_batch, y_true_26_batch, y_true_52_batch


def build_train_dataset():
    """
    构建验证数据
    :return:
    """
    train_dataset = tf.data.TextLineDataset(args.train_file)
    train_dataset = train_dataset.shuffle(args.train_img_cnt)  # 先随机重排
    train_dataset = train_dataset.batch(args.batch_size)  # 分批
    train_dataset = train_dataset.map(
        lambda x: tf.py_func(
            get_batch_data,
            inp=[x, args.class_num, args.img_size, args.anchors, 'train',
                 args.multi_scale_train, args.use_mix_up, args.letterbox_resize],
            Tout=[tf.int64, tf.float32, tf.float32, tf.float32, tf.float32]),
        num_parallel_calls=args.num_threads
    )
    train_dataset = train_dataset.prefetch(args.prefetech_buffer)  # 每次取5
    return train_dataset


def build_val_dataset():
    """
    构建验证数据集
    :return:
    """
    val_dataset = tf.data.TextLineDataset(args.val_file)
    val_dataset = val_dataset.batch(1)  # 一批一个
    val_dataset = val_dataset.map(
        lambda x: tf.py_func(
            get_batch_data,
            inp=[x, args.class_num, args.img_size, args.anchors,
                 'val', False, False, args.letterbox_resize],
            Tout=[tf.int64, tf.float32, tf.float32, tf.float32, tf.float32]),
        num_parallel_calls=args.num_threads
    )
    val_dataset = val_dataset.prefetch(args.prefetech_buffer)
    return val_dataset


def create_iterator():
    """
    创建迭代器
    :return:
    """
    print('\n\033[32m----------- Begin building dataset  -----------\n')
    train_dataset = build_train_dataset()  # 训练集
    val_dataset = build_val_dataset()  # 验证集
    iterator = tf.data.Iterator.from_structure(train_dataset.output_types, train_dataset.output_shapes)
    train_init_op = iterator.make_initializer(train_dataset)
    val_init_op = iterator.make_initializer(val_dataset)

    # 获得一条数据
    image_ids, image, y_true_13, y_true_26, y_true_52 = iterator.get_next()
    y_true = [y_true_13, y_true_26, y_true_52]

    # 如果丢失了shape,则手动设置
    image_ids.set_shape([None])
    image.set_shape([None, None, None, 3])
    for y in y_true:
        y.set_shape([None, None, None, None, None])
    print('\n\033[32m----------- Finish building dataset  -----------\n')
    return train_init_op, val_init_op, image_ids, image, y_true