|
- # -*- coding: utf-8 -*-
-
- import numpy as np
- import tensorflow as tf
-
- slim = tf.contrib.slim
-
- _BATCH_NORM_DECAY = 0.9
- _BATCH_NORM_EPSILON = 1e-05
- _LEAKY_RELU = 0.1
-
- _ANCHORS = [(10, 13), (16, 30), (33, 23),
- (30, 61), (62, 45), (59, 119),
- (116, 90), (156, 198), (373, 326)]
-
-
# Alternative anchor set for the bird/person dataset (kept for reference):
# _ANCHORS = [(18, 18), (53, 64), (55, 121),
#             (121, 97), (89, 200), (145, 347),
#             (258, 236), (260, 473), (436, 347)]
# Same values as a flat list:
# 18,18, 53,64, 55,121, 121,97, 89,200, 145,347, 258,236, 260,473, 436,347
-
def darknet53(inputs):
    """
    Builds Darknet-53 model.

    The network is a 3x3 stem convolution followed by five stages. Each
    stage is a stride-2 3x3 convolution and then a number of residual
    blocks whose bottleneck width is half the stage width.

    Args:
        inputs: feature tensor fed to the first convolution.

    Returns:
        A tuple (route_1, route_2, outputs): the outputs of the 256-filter
        stage, of the 512-filter stage, and of the final 1024-filter stage.
    """
    # (stage width, number of residual blocks)
    stages = ((64, 1), (128, 2), (256, 8), (512, 8), (1024, 4))

    net = _conv2d_fixed_padding(inputs, 32, 3)

    stage_outputs = []
    for width, num_blocks in stages:
        net = _conv2d_fixed_padding(net, width, 3, strides=2)
        for _ in range(num_blocks):
            net = _darknet53_block(net, width // 2)
        stage_outputs.append(net)

    # The 256- and 512-filter stages are tapped for the finer detection heads.
    route_1 = stage_outputs[2]
    route_2 = stage_outputs[3]
    return route_1, route_2, net
-
-
def _conv2d_fixed_padding(inputs, filters, kernel_size, strides=1):
    """
    Applies slim.conv2d, padding explicitly when the convolution is strided.

    For strides > 1 the input is first padded by _fixed_padding. The
    convolution uses 'SAME' padding only when strides == 1 and 'VALID'
    otherwise.

    Args:
        inputs: input feature tensor.
        filters: number of output filters.
        kernel_size: convolution kernel size.
        strides: convolution stride.

    Returns:
        The output tensor of slim.conv2d.
    """
    padding = 'SAME' if strides == 1 else 'VALID'
    if strides > 1:
        inputs = _fixed_padding(inputs, kernel_size)
    return slim.conv2d(inputs, filters, kernel_size,
                       stride=strides, padding=padding)
-
-
def _darknet53_block(inputs, filters):
    """
    Residual block: 1x1 conv to `filters`, 3x3 conv to `2 * filters`,
    then element-wise addition with the block input.

    The addition requires the input to be shape-compatible with the
    `2 * filters`-channel convolution output.
    """
    residual = _conv2d_fixed_padding(inputs, filters, 1)
    residual = _conv2d_fixed_padding(residual, filters * 2, 3)
    return residual + inputs
-
-
def _bottleneck_layer(inputs, filters):
    """
    Dense-block layer: a 1x1 convolution to `4 * filters` channels followed
    by a 3x3 convolution down to `filters` channels.
    """
    expanded = _conv2d_fixed_padding(inputs, 4 * filters, 1)
    return _conv2d_fixed_padding(expanded, filters, 3)
-
-
def _transition_layer(inputs, data_format):
    """
    Halves the channel count with a 1x1 convolution, then applies a 2x2
    max-pool with stride 2.

    Args:
        inputs: input feature tensor with a statically known channel size.
        data_format: 'NCHW' or 'NHWC'; selects which axis holds channels.

    Returns:
        The pooled tensor.
    """
    # Original author's note here read "152x152x80" -- presumably the input
    # size at the first call; not verified.
    channel_axis = 1 if data_format == 'NCHW' else -1
    input_filters = inputs.get_shape().as_list()[channel_axis]

    reduced = _conv2d_fixed_padding(inputs, input_filters // 2, 1)
    return slim.max_pool2d(reduced, [2, 2], stride=2)
-
-
def _dense_block(inputs, filters, nb_layers, data_format):
    """
    Densely connected block.

    The first bottleneck layer reads `inputs` directly. Every later layer
    reads the channel-wise concatenation of the block input and all
    previous layer outputs.

    Args:
        inputs: input feature tensor.
        filters: output channels produced by each bottleneck layer.
        nb_layers: number of bottleneck layers in the block.
        data_format: 'NCHW' or 'NHWC'; selects the concatenation axis.

    Returns:
        The concatenation of the block input and every layer output.
    """
    channel_axis = 1 if data_format == 'NCHW' else 3

    features = [inputs, _bottleneck_layer(inputs, filters)]
    for _ in range(nb_layers - 1):
        stacked = tf.concat(features, axis=channel_axis)
        features.append(_bottleneck_layer(stacked, filters))

    return tf.concat(features, axis=channel_axis)
-
-
def DensenetForYolo(inputs, data_format='NHWC'):
    """
    Build Densenet model.

    Two stride-2 convolutions (7x7 then 3x3) are followed by three
    (dense block, transition layer) pairs and one closing dense block.

    Args:
        inputs: input feature tensor.
        data_format: 'NCHW' or 'NHWC'.

    Returns:
        A tuple (route_1, route_2, outputs): the outputs of the first and
        second transition layers, and the output of the closing dense block.
    """
    net = _conv2d_fixed_padding(inputs, 32, 7, strides=2)
    net = _conv2d_fixed_padding(net, 64, 3, strides=2)

    transition_outputs = []
    for _ in range(3):
        net = _dense_block(net, 16, 4, data_format)
        net = _transition_layer(net, data_format)
        transition_outputs.append(net)

    route_1 = transition_outputs[0]
    route_2 = transition_outputs[1]

    # NOTE(review): this closing dense block is placed after the loop, as in
    # the usual DenseNet layout -- confirm against the original indentation.
    net = _dense_block(net, 16, 4, data_format)

    return route_1, route_2, net
-
-
@tf.contrib.framework.add_arg_scope
def _fixed_padding(inputs, kernel_size, *args, mode='CONSTANT', **kwargs):
    """
    Pads the input along the spatial dimensions independently of input size.

    Args:
        inputs: A tensor of size [batch, channels, height_in, width_in] or
            [batch, height_in, width_in, channels] depending on data_format.
        kernel_size: The kernel to be used in the conv2d or max_pool2d
            operation. Should be a positive integer.
        *args: Accepted and ignored.
        mode: The mode for tf.pad.
        **kwargs: Extra keyword arguments, normally injected by
            slim.arg_scope. Only 'data_format' ('NHWC' or 'NCHW') is read;
            it defaults to 'NHWC' when absent. Other keys are ignored.

    Returns:
        A tensor with the same format as the input with the data either
        intact (if kernel_size == 1) or padded (if kernel_size > 1).
    """
    pad_total = kernel_size - 1
    pad_beg = pad_total // 2
    pad_end = pad_total - pad_beg

    # Fall back to NHWC instead of raising KeyError when the function is
    # called outside an arg_scope that supplies data_format.
    data_format = kwargs.get('data_format', 'NHWC')

    if data_format == 'NCHW':
        paddings = [[0, 0], [0, 0],
                    [pad_beg, pad_end], [pad_beg, pad_end]]
    else:
        paddings = [[0, 0], [pad_beg, pad_end],
                    [pad_beg, pad_end], [0, 0]]

    return tf.pad(inputs, paddings, mode=mode)
-
-
def _yolo_block(inputs, filters):
    """
    Stack of alternating 1x1 (`filters`) and 3x3 (`2 * filters`)
    convolutions, six convolutions in total.

    Returns:
        A tuple (route, outputs): the output of the fifth convolution (the
        last 1x1) and the output of the final 3x3 convolution.
    """
    net = inputs
    for _ in range(2):
        net = _conv2d_fixed_padding(net, filters, 1)
        net = _conv2d_fixed_padding(net, filters * 2, 3)

    route = _conv2d_fixed_padding(net, filters, 1)
    outputs = _conv2d_fixed_padding(route, filters * 2, 3)
    return route, outputs
-
-
- def _get_size(shape, data_format):
- if len(shape) == 4:
- shape = shape[1:]
- return shape[1:3] if data_format == 'NCHW' else shape[0:2]
-
-
def _detection_layer(inputs, num_classes, anchors, img_size, data_format):
    """
    Builds one detection head and decodes its raw output into boxes.

    A 1x1 convolution (no normalisation, no activation, zero-initialised
    bias) produces `len(anchors) * (5 + num_classes)` channels. These are
    reshaped to [batch, cells * anchors, 5 + num_classes] and decoded.

    Args:
        inputs: feature tensor with statically known spatial size.
        num_classes: number of predicted classes.
        anchors: sequence of two-element anchors for this head, taken as
            (width, height) to match the (x, y) order of the box outputs.
        img_size: (height, width) of the network input, as passed by the
            model builders in this file.
        data_format: 'NCHW' or 'NHWC'.

    Returns:
        Tensor of shape [batch, cells * anchors, 5 + num_classes] laid out
        as box centre (2), box size (2), confidence (1), class scores.
        Centres and sizes are scaled by the head's stride.
    """
    num_anchors = len(anchors)
    bbox_attrs = 5 + num_classes

    predictions = slim.conv2d(inputs, num_anchors * bbox_attrs, 1,
                              stride=1, normalizer_fn=None,
                              activation_fn=None,
                              biases_initializer=tf.zeros_initializer())

    shape = predictions.get_shape().as_list()
    # _get_size returns the spatial dims in (height, width) order.
    grid_h, grid_w = _get_size(shape, data_format)
    dim = grid_h * grid_w

    if data_format == 'NCHW':
        # Move channels last so both layouts share the reshape below.
        predictions = tf.reshape(
            predictions, [-1, num_anchors * bbox_attrs, dim])
        predictions = tf.transpose(predictions, [0, 2, 1])

    predictions = tf.reshape(predictions, [-1, num_anchors * dim, bbox_attrs])

    # Stride in (x, y) order so it lines up with the (x, y) box centres and
    # the anchors. The previous (height, width) order was only correct when
    # both strides were equal.
    stride = (img_size[1] // grid_w, img_size[0] // grid_h)

    anchors = [(anchor[0] / stride[0], anchor[1] / stride[1])
               for anchor in anchors]

    box_centers, box_sizes, confidence, classes = tf.split(
        predictions, [2, 2, 1, num_classes], axis=-1)

    box_centers = tf.nn.sigmoid(box_centers)
    confidence = tf.nn.sigmoid(confidence)

    # x runs over the grid width and y over the grid height. meshgrid then
    # yields [grid_h, grid_w] arrays whose row-major flattening matches the
    # cell order of `predictions`. The previous code built x from the height
    # and y from the width, which broke non-square grids.
    grid_x = tf.range(grid_w, dtype=tf.float32)
    grid_y = tf.range(grid_h, dtype=tf.float32)
    mesh_x, mesh_y = tf.meshgrid(grid_x, grid_y)

    x_offset = tf.reshape(mesh_x, (-1, 1))
    y_offset = tf.reshape(mesh_y, (-1, 1))

    # Repeat each cell offset once per anchor.
    x_y_offset = tf.concat([x_offset, y_offset], axis=-1)
    x_y_offset = tf.reshape(tf.tile(x_y_offset, [1, num_anchors]), [1, -1, 2])

    box_centers = box_centers + x_y_offset
    box_centers = box_centers * stride  # grid units -> input-image units

    anchors = tf.tile(anchors, [dim, 1])
    box_sizes = tf.exp(box_sizes) * anchors
    box_sizes = box_sizes * stride

    detections = tf.concat([box_centers, box_sizes, confidence], axis=-1)

    classes = tf.nn.sigmoid(classes)
    predictions = tf.concat([detections, classes], axis=-1)
    return predictions
-
-
def _upsample(inputs, out_shape, data_format='NHWC'):
    """
    Resizes `inputs` to the spatial size given by `out_shape` using
    nearest-neighbour interpolation.

    Args:
        inputs: feature tensor laid out according to `data_format`.
        out_shape: full 4-element static shape of the target tensor, in the
            same `data_format` layout (callers pass
            `route.get_shape().as_list()`).
        data_format: 'NCHW' or 'NHWC'.

    Returns:
        The resized tensor in the original layout, wrapped in an identity
        op named 'upsampled'.
    """
    is_nchw = data_format == 'NCHW'

    # Height precedes width in both layouts: [N, C, H, W] and [N, H, W, C].
    # The previous code read them swapped, which only worked for square maps.
    if is_nchw:
        new_height, new_width = out_shape[2], out_shape[3]
        # tf.image.resize_nearest_neighbor accepts input in format NHWC
        inputs = tf.transpose(inputs, [0, 2, 3, 1])
    else:
        new_height, new_width = out_shape[1], out_shape[2]

    inputs = tf.image.resize_nearest_neighbor(inputs, (new_height, new_width))

    # back to NCHW if needed
    if is_nchw:
        inputs = tf.transpose(inputs, [0, 3, 1, 2])

    return tf.identity(inputs, name='upsampled')
-
-
def yolo_v3(inputs, num_classes, is_training=False, data_format='NHWC', reuse=False):
    """
    Creates YOLO v3 model on top of the Darknet-53 backbone.

    :param inputs: a 4-D tensor of size [batch_size, height, width, channels].
        Dimension batch_size may be undefined. The channel order is RGB.
    :param num_classes: number of predicted classes.
    :param is_training: whether is training or not.
    :param data_format: data format NCHW or NHWC.
    :param reuse: whether or not the network and its variables should be reused.
    :return: the three detection-head outputs concatenated along axis 1,
        as an identity op named 'detections'.
    """
    # Spatial size of the NHWC input; read before any transpose.
    img_size = inputs.get_shape().as_list()[1:3]
    channel_axis = 1 if data_format == 'NCHW' else 3

    # transpose the inputs to NCHW
    if data_format == 'NCHW':
        inputs = tf.transpose(inputs, [0, 3, 1, 2])

    # normalize values to range [0..1]
    inputs = inputs / 255

    batch_norm_params = dict(
        decay=_BATCH_NORM_DECAY,
        epsilon=_BATCH_NORM_EPSILON,
        scale=True,
        is_training=is_training,
        fused=None,  # Use fused batch norm if possible.
    )

    def leaky_relu(x):
        return tf.nn.leaky_relu(x, alpha=_LEAKY_RELU)

    # Shared layout/reuse settings, then conv defaults (batch norm, no bias,
    # leaky ReLU).
    with slim.arg_scope([slim.conv2d, slim.batch_norm, _fixed_padding],
                        data_format=data_format, reuse=reuse), \
            slim.arg_scope([slim.conv2d],
                           normalizer_fn=slim.batch_norm,
                           normalizer_params=batch_norm_params,
                           biases_initializer=None,
                           activation_fn=leaky_relu):
        with tf.variable_scope('darknet-53'):
            route_1, route_2, inputs = darknet53(inputs)

        with tf.variable_scope('yolo-v3'):
            # Head on the final backbone output.
            route, inputs = _yolo_block(inputs, 512)
            head = _detection_layer(
                inputs, num_classes, _ANCHORS[6:9], img_size, data_format)
            heads = [tf.identity(head, name='detect_1')]

            # The two later heads: shrink channels, upsample to the skip
            # tensor's size, concatenate with it, then detect.
            later_heads = (
                (256, route_2, _ANCHORS[3:6], 'detect_2'),
                (128, route_1, _ANCHORS[0:3], 'detect_3'),
            )
            for width, skip, head_anchors, head_name in later_heads:
                inputs = _conv2d_fixed_padding(route, width, 1)
                upsample_size = skip.get_shape().as_list()
                inputs = _upsample(inputs, upsample_size, data_format)
                inputs = tf.concat([inputs, skip], axis=channel_axis)

                route, inputs = _yolo_block(inputs, width)

                head = _detection_layer(
                    inputs, num_classes, head_anchors, img_size, data_format)
                heads.append(tf.identity(head, name=head_name))

            detections = tf.concat(heads, axis=1)
            detections = tf.identity(detections, name='detections')
            return detections
-
-
def dense_yolo_v3(inputs, num_classes, is_training=False, data_format='NHWC', reuse=False):
    """
    Creates a YOLO v3 style detector on top of the DenseNet backbone.

    Each of the three heads is a two-layer dense block, a 3x3 convolution
    and a detection layer. Between heads the features are reduced by a 3x3
    convolution, upsampled to the skip tensor's size and concatenated
    with it.

    :param inputs: a 4-D tensor of size [batch_size, height, width, channels].
        Dimension batch_size may be undefined. The channel order is RGB.
    :param num_classes: number of predicted classes.
    :param is_training: whether is training or not.
    :param data_format: data format NCHW or NHWC.
    :param reuse: whether or not the network and its variables should be reused.
    :return: the three detection-head outputs concatenated along axis 1,
        as an identity op named 'detections'.
    """
    # Spatial size of the NHWC input; read before any transpose.
    img_size = inputs.get_shape().as_list()[1:3]
    channel_axis = 1 if data_format == 'NCHW' else 3

    # transpose the inputs to NCHW
    if data_format == 'NCHW':
        inputs = tf.transpose(inputs, [0, 3, 1, 2])

    # normalize values to range [0..1]
    inputs = inputs / 255

    # set batch norm params
    batch_norm_params = {
        'decay': _BATCH_NORM_DECAY,
        'epsilon': _BATCH_NORM_EPSILON,
        'scale': True,
        'is_training': is_training,
        'fused': None,  # Use fused batch norm if possible.
    }

    # Set activation_fn and parameters for conv2d, batch_norm.
    with slim.arg_scope([slim.conv2d, slim.batch_norm, _fixed_padding, slim.max_pool2d],
                        data_format=data_format):
        with slim.arg_scope([slim.conv2d, slim.batch_norm, _fixed_padding], reuse=reuse):
            with slim.arg_scope([slim.conv2d], normalizer_fn=slim.batch_norm,
                                normalizer_params=batch_norm_params,
                                biases_initializer=None,
                                activation_fn=lambda x: tf.nn.leaky_relu(x, alpha=_LEAKY_RELU)):
                with tf.variable_scope('densenet'):
                    route_1, route_2, inputs = DensenetForYolo(inputs, data_format)

                with tf.variable_scope('yolo-v3'):
                    inputs = _dense_block(inputs, 16, 2, data_format)
                    route = inputs
                    inputs = _conv2d_fixed_padding(inputs, 128, 3)

                    detect_1 = _detection_layer(
                        inputs, num_classes, _ANCHORS[6:9], img_size, data_format)
                    detect_1 = tf.identity(detect_1, name='detect_1')

                    upsample_size = route_2.get_shape().as_list()
                    # Half of the skip tensor's channel count. Indexing by
                    # channel axis matters under NCHW, where the last
                    # dimension is the width.
                    inputs = _conv2d_fixed_padding(
                        route, upsample_size[channel_axis] // 2, 3)
                    inputs = _upsample(inputs, upsample_size, data_format)
                    inputs = tf.concat([inputs, route_2], axis=channel_axis)

                    inputs = _dense_block(inputs, 16, 2, data_format)
                    route = inputs
                    inputs = _conv2d_fixed_padding(inputs, 64, 3)

                    detect_2 = _detection_layer(
                        inputs, num_classes, _ANCHORS[3:6], img_size, data_format)
                    detect_2 = tf.identity(detect_2, name='detect_2')

                    upsample_size = route_1.get_shape().as_list()
                    inputs = _conv2d_fixed_padding(
                        route, upsample_size[channel_axis] // 2, 3)
                    inputs = _upsample(inputs, upsample_size, data_format)
                    inputs = tf.concat([inputs, route_1], axis=channel_axis)

                    inputs = _dense_block(inputs, 16, 2, data_format)
                    inputs = _conv2d_fixed_padding(inputs, 32, 3)

                    detect_3 = _detection_layer(
                        inputs, num_classes, _ANCHORS[0:3], img_size, data_format)
                    detect_3 = tf.identity(detect_3, name='detect_3')

                    detections = tf.concat([detect_1, detect_2, detect_3], axis=1)
                    detections = tf.identity(detections, name='detections')
                    return detections
|