|
- '''
- Multi-GPU evaluation, built on the multi-GPU training graph.
- Near-linear speed-up across multiple GPUs on a single machine.
- Uses the H5 dataset by default; with --normal, switches to the normal-resampled dataset.
- '''
-
- import argparse
- import math
- from datetime import datetime
- import h5py
- import numpy as np
- import tensorflow as tf
- import socket
- import importlib
- import os
- import sys
# Directory that holds this script; it doubles as the project root.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
ROOT_DIR = BASE_DIR

# Make the script directory and its 'models' / 'utils' sub-directories
# importable (appended in this order, after any existing entries).
sys.path.extend([
    BASE_DIR,
    os.path.join(ROOT_DIR, 'models'),
    os.path.join(ROOT_DIR, 'utils'),
])
- import provider
- import tf_util
- import modelnet_dataset
- import modelnet_h5_dataset
-
# Command-line interface.
# Help texts use argparse's %(default)s placeholder so the documented default
# can never drift from the real one (several hand-written defaults were stale).
parser = argparse.ArgumentParser()
parser.add_argument('--num_gpus', type=int, default=2,
                    help='How many gpus to use [default: %(default)s]')
parser.add_argument('--model', default='dgs_cls',
                    help='Model name [default: %(default)s]')
parser.add_argument('--dump_dir', default='dump',
                    help='dump folder path [default: %(default)s]')
parser.add_argument('--num_point', type=int, default=1024,
                    help='Point Number [default: %(default)s]')
parser.add_argument('--max_epoch', type=int, default=500,
                    help='Epoch to run [default: %(default)s]')
parser.add_argument('--batch_size', type=int, default=32,
                    help='Batch Size during training [default: %(default)s]')
parser.add_argument('--learning_rate', type=float, default=0.001,
                    help='Initial learning rate [default: %(default)s]')
parser.add_argument('--momentum', type=float, default=0.9,
                    help='Momentum for the momentum optimizer [default: %(default)s]')
parser.add_argument('--optimizer', default='adam',
                    help='adam or momentum [default: %(default)s]')
parser.add_argument('--decay_step', type=int, default=200000,
                    help='Decay step for lr decay [default: %(default)s]')
parser.add_argument('--decay_rate', type=float, default=0.7,
                    help='Decay rate for lr decay [default: %(default)s]')
parser.add_argument('--normal', action='store_true',
                    help='Whether to use normal information')
parser.add_argument('--model_path', default='log/model_acc_0.932739.ckpt',
                    help='model checkpoint file path [default: %(default)s]')
# Parses the arguments of the running process (sys.argv).
FLAGS = parser.parse_args()
-
# Number of evaluation passes completed so far; bumped by eval_one_epoch.
EPOCH_CNT = 0

NUM_GPUS = FLAGS.num_gpus
BATCH_SIZE = FLAGS.batch_size
assert BATCH_SIZE % NUM_GPUS == 0, 'batch_size must be divisible by num_gpus'
# Floor division: the per-GPU batch size is used as a tf.slice offset/size and
# must stay an int ("/" would produce a float on Python 3).
DEVICE_BATCH_SIZE = BATCH_SIZE // NUM_GPUS

NUM_POINT = FLAGS.num_point
MAX_EPOCH = FLAGS.max_epoch
BASE_LEARNING_RATE = FLAGS.learning_rate
MOMENTUM = FLAGS.momentum
OPTIMIZER = FLAGS.optimizer
DECAY_STEP = FLAGS.decay_step
DECAY_RATE = FLAGS.decay_rate
MODEL_PATH = FLAGS.model_path

MODEL = importlib.import_module(FLAGS.model)  # import network module
MODEL_FILE = os.path.join(ROOT_DIR, 'models', FLAGS.model+'.py')
DUMP_DIR = FLAGS.dump_dir
# makedirs (rather than mkdir) so a nested --dump_dir such as "out/run1" works.
if not os.path.exists(DUMP_DIR):
    os.makedirs(DUMP_DIR)
# Log file is opened in 'w' mode, so each run overwrites the previous log.
LOG_FOUT = open(os.path.join(DUMP_DIR, 'log_evaluate.txt'), 'w')
LOG_FOUT.write(str(FLAGS)+'\n')

# Batch-norm decay schedule parameters (consumed by get_bn_decay).
BN_INIT_DECAY = 0.5
BN_DECAY_DECAY_RATE = 0.5
BN_DECAY_DECAY_STEP = float(DECAY_STEP)
BN_DECAY_CLIP = 0.99

HOSTNAME = socket.gethostname()

NUM_CLASSES = 40
-
# ModelNet40 train/test split.
# Default: HDF5 point clouds (at most 2048 points per shape).
# With --normal: the resampled dataset with normal channels (at most 10000 points).
if not FLAGS.normal:
    assert(NUM_POINT<=2048)
    TRAIN_DATASET = modelnet_h5_dataset.ModelNetH5Dataset(
        os.path.join(BASE_DIR, 'data/modelnet40_ply_hdf5_2048_back/train_files.txt'),
        batch_size=BATCH_SIZE,
        npoints=NUM_POINT,
        shuffle=True)
    TEST_DATASET = modelnet_h5_dataset.ModelNetH5Dataset(
        os.path.join(BASE_DIR, 'data/modelnet40_ply_hdf5_2048_back/test_files.txt'),
        batch_size=BATCH_SIZE,
        npoints=NUM_POINT,
        shuffle=False)
else:
    assert(NUM_POINT<=10000)
    DATA_PATH = os.path.join(ROOT_DIR, 'data/modelnet40_normal_resampled')
    TRAIN_DATASET = modelnet_dataset.ModelNetDataset(
        root=DATA_PATH,
        npoints=NUM_POINT,
        split='train',
        normal_channel=FLAGS.normal,
        batch_size=BATCH_SIZE)
    TEST_DATASET = modelnet_dataset.ModelNetDataset(
        root=DATA_PATH,
        npoints=NUM_POINT,
        split='test',
        normal_channel=FLAGS.normal,
        batch_size=BATCH_SIZE)
-
def log_string(out_str):
    """Append ``out_str`` as one line to the log file and echo it to stdout.

    The log file is flushed after every call so the log stays current even
    if the process is interrupted.
    """
    line = out_str + '\n'
    LOG_FOUT.write(line)
    LOG_FOUT.flush()
    print(out_str)
-
def average_gradients(tower_grads):
    """Calculate the average gradient for each shared variable across all towers.

    Note that this function provides a synchronization point across all towers.
    From tensorflow tutorial: cifar10/cifar10_multi_gpu_train.py

    Args:
      tower_grads: List of lists of (gradient, variable) tuples. The outer list
        is over towers; each inner list holds that tower's (gradient, variable)
        pairs, in the same variable order for every tower.
    Returns:
      List of pairs of (gradient, variable) where the gradient has been averaged
      across all towers. A variable whose gradient is None on every tower is
      returned as (None, variable) instead of raising.
    """
    average_grads = []
    # zip(*...) regroups the per-tower lists into per-variable tuples:
    #   ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
    for grad_and_vars in zip(*tower_grads):
        # The variables are shared across towers, so the first tower's
        # reference stands in for all of them.
        shared_var = grad_and_vars[0][1]

        # compute_gradients yields None for variables the loss does not
        # depend on; those cannot be stacked, so leave them out.
        per_tower = [g for g, _ in grad_and_vars if g is not None]
        if not per_tower:
            average_grads.append((None, shared_var))
            continue

        # Stack along a new leading 'tower' axis and average over it.
        mean_grad = tf.reduce_mean(tf.stack(per_tower, axis=0), 0)
        average_grads.append((mean_grad, shared_var))
    return average_grads
-
-
def get_learning_rate(batch):
    """Return the learning-rate tensor for global step ``batch``.

    The rate starts at BASE_LEARNING_RATE and is multiplied by DECAY_RATE
    once every DECAY_STEP processed samples (staircase schedule), where the
    sample count is ``batch * BATCH_SIZE``. It never drops below 1e-5.
    """
    samples_seen = batch * BATCH_SIZE  # current index into the dataset
    decayed_rate = tf.train.exponential_decay(
        learning_rate=BASE_LEARNING_RATE,
        global_step=samples_seen,
        decay_steps=DECAY_STEP,
        decay_rate=DECAY_RATE,
        staircase=True)
    # Clip from below so the learning rate cannot decay to ~0.
    return tf.maximum(decayed_rate, 0.00001)
-
def get_bn_decay(batch):
    """Return the batch-norm decay tensor for global step ``batch``.

    A momentum value starts at BN_INIT_DECAY and is multiplied by
    BN_DECAY_DECAY_RATE every BN_DECAY_DECAY_STEP processed samples
    (staircase schedule). The returned decay is ``1 - momentum``, capped at
    BN_DECAY_CLIP.
    """
    samples_seen = batch*BATCH_SIZE
    momentum = tf.train.exponential_decay(
        learning_rate=BN_INIT_DECAY,
        global_step=samples_seen,
        decay_steps=BN_DECAY_DECAY_STEP,
        decay_rate=BN_DECAY_DECAY_RATE,
        staircase=True)
    return tf.minimum(BN_DECAY_CLIP, 1 - momentum)
-
def train():
    """Build the multi-GPU graph, restore MODEL_PATH and run one evaluation pass.

    Despite its name, this function does not train: the training call is
    disabled and only eval_one_epoch is run on the restored checkpoint.
    The optimizer and gradient ops are still built.
    NOTE(review): presumably so the graph's variable set (including optimizer
    slots) matches the training checkpoint -- confirm before removing them.

    Raises:
      ValueError: if OPTIMIZER is neither 'momentum' nor 'adam'.
    """
    # tf.slice needs integer offsets/sizes; coerce in case the module-level
    # per-GPU batch size was computed with true division.
    device_batch_size = int(DEVICE_BATCH_SIZE)

    with tf.Graph().as_default():
        with tf.device('/cpu:0'):
            pointclouds_pl, labels_pl = MODEL.placeholder_inputs(BATCH_SIZE, NUM_POINT)
            is_training_pl = tf.placeholder(tf.bool, shape=())

            # 'batch' is the global step; it is passed as global_step to
            # apply_gradients below, which increments it on every train step.
            batch = tf.get_variable('batch', [],
                initializer=tf.constant_initializer(0), trainable=False)
            bn_decay = get_bn_decay(batch)
            tf.summary.scalar('bn_decay', bn_decay)

            # Set learning rate and optimizer
            learning_rate = get_learning_rate(batch)
            tf.summary.scalar('learning_rate', learning_rate)
            if OPTIMIZER == 'momentum':
                optimizer = tf.train.MomentumOptimizer(learning_rate, momentum=MOMENTUM)
            elif OPTIMIZER == 'adam':
                optimizer = tf.train.AdamOptimizer(learning_rate)
            else:
                # Fail early with a clear message instead of an unbound-name
                # error at the first use of `optimizer`.
                raise ValueError(
                    "Unknown optimizer %r (expected 'adam' or 'momentum')" % (OPTIMIZER,))

            # -------------------------------------------
            # Get model and loss on multiple GPU devices
            # -------------------------------------------
            # Allocating variables on CPU first will greatly accelerate multi-gpu training.
            # Ref: https://github.com/kuza55/keras-extras/issues/21
            MODEL.get_model(pointclouds_pl, is_training_pl, bn_decay=bn_decay)

            tower_grads = []
            pred_gpu = []
            total_loss_gpu = []
            for i in range(NUM_GPUS):
                # reuse=True: every tower shares the variables created above.
                with tf.variable_scope(tf.get_variable_scope(), reuse=True):
                    with tf.device('/gpu:%d'%(i)), tf.name_scope('gpu_%d'%(i)) as scope:
                        # Evenly split input data to each GPU
                        pc_batch = tf.slice(pointclouds_pl,
                            [i*device_batch_size,0,0], [device_batch_size,-1,-1])
                        label_batch = tf.slice(labels_pl,
                            [i*device_batch_size], [device_batch_size])

                        pred, end_points = MODEL.get_model(pc_batch,
                            is_training=is_training_pl, bn_decay=bn_decay)

                        # get_loss's return value is unused; the tower's losses
                        # are read back from the 'losses' collection for this scope.
                        MODEL.get_loss(pred, label_batch, end_points)
                        losses = tf.get_collection('losses', scope)
                        total_loss = tf.add_n(losses, name='total_loss')
                        for l in losses + [total_loss]:
                            tf.summary.scalar(l.op.name, l)

                        grads = optimizer.compute_gradients(total_loss)
                        tower_grads.append(grads)

                        pred_gpu.append(pred)
                        total_loss_gpu.append(total_loss)

            # Merge pred and losses from multiple GPUs
            pred = tf.concat(pred_gpu, 0)
            total_loss = tf.reduce_mean(total_loss_gpu)

            # Get training operator
            grads = average_gradients(tower_grads)
            train_op = optimizer.apply_gradients(grads, global_step=batch)

            correct = tf.equal(tf.argmax(pred, 1), tf.to_int64(labels_pl))
            accuracy = tf.reduce_sum(tf.cast(correct, tf.float32)) / float(BATCH_SIZE)
            tf.summary.scalar('accuracy', accuracy)

        # Add ops to save and restore all the variables.
        saver = tf.train.Saver()

        # Merged summary op and variable initializer.
        merged = tf.summary.merge_all()
        init = tf.global_variables_initializer()

        # Create a session
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.allow_soft_placement = True
        config.log_device_placement = False

        # Context manager closes the session even if restore/evaluation raises.
        with tf.Session(config=config) as sess:
            # Init variables, then overwrite them from the checkpoint.
            sess.run(init)
            saver.restore(sess, MODEL_PATH)
            log_string("Model restored.")

            # 'end_points' holds the value from the last tower built above.
            ops = {'pointclouds_pl': pointclouds_pl,
                   'labels_pl': labels_pl,
                   'is_training_pl': is_training_pl,
                   'pred': pred,
                   'loss': total_loss,
                   'train_op': train_op,
                   'merged': merged,
                   'step': batch,
                   'end_points': end_points}

            sys.stdout.flush()

            eval_one_epoch(sess, ops)
-
def eval_one_epoch(sess, ops):
    """Run one pass over TEST_DATASET and log loss and accuracy.

    Args:
      sess: session used to run the graph.
      ops: dict mapping from string to tf ops; this function reads the keys
        'pointclouds_pl', 'labels_pl', 'is_training_pl', 'loss' and 'pred'.
    Returns:
      Overall accuracy (correct predictions / samples seen) as a float.
    Raises:
      ZeroDivisionError: if TEST_DATASET yields no batches.

    Side effects: increments the global EPOCH_CNT and resets TEST_DATASET.
    """
    global EPOCH_CNT
    is_training = False

    # Fixed-size feed buffers: the placeholders expect BATCH_SIZE samples, so
    # a shorter final batch is written over the front of the previous one.
    cur_batch_data = np.zeros((BATCH_SIZE,NUM_POINT,TEST_DATASET.num_channel()))
    cur_batch_label = np.zeros((BATCH_SIZE), dtype=np.int32)

    total_correct = 0
    total_seen = 0
    loss_sum = 0
    batch_idx = 0
    total_seen_class = [0 for _ in range(NUM_CLASSES)]
    total_correct_class = [0 for _ in range(NUM_CLASSES)]

    log_string(str(datetime.now()))
    log_string('---- EPOCH %03d EVALUATION ----'%(EPOCH_CNT))

    while TEST_DATASET.has_next_batch():
        batch_data, batch_label = TEST_DATASET.next_batch(augment=False)
        bsize = batch_data.shape[0]
        # for the last batch in the epoch, the bsize:end are from last batch
        cur_batch_data[0:bsize,...] = batch_data
        cur_batch_label[0:bsize] = batch_label

        feed_dict = {ops['pointclouds_pl']: cur_batch_data,
                     ops['labels_pl']: cur_batch_label,
                     ops['is_training_pl']: is_training}
        # Only loss and predictions are needed here; the summary and step
        # values were fetched but never used.
        loss_val, pred_val = sess.run([ops['loss'], ops['pred']],
                                      feed_dict=feed_dict)
        pred_val = np.argmax(pred_val, 1)

        # Score only the first bsize entries; the rest is stale padding.
        labels = batch_label[0:bsize]
        hits = pred_val[0:bsize] == labels
        total_correct += np.sum(hits)
        total_seen += bsize
        loss_sum += loss_val
        batch_idx += 1
        for label, hit in zip(labels, hits):
            total_seen_class[label] += 1
            total_correct_class[label] += hit

    # np.float was removed in NumPy 1.24; float64 is what it aliased.
    seen_per_class = np.array(total_seen_class, dtype=np.float64)
    correct_per_class = np.array(total_correct_class, dtype=np.float64)
    # Average only over classes that occurred, so an absent class does not
    # turn the mean into NaN through 0/0.
    present = seen_per_class > 0
    avg_class_acc = np.mean(correct_per_class[present] / seen_per_class[present])

    log_string('eval mean loss: %f' % (loss_sum / float(batch_idx)))
    log_string('eval accuracy: %f'% (total_correct / float(total_seen)))
    log_string('eval avg class acc: %f' % (avg_class_acc))
    EPOCH_CNT += 1

    TEST_DATASET.reset()
    return total_correct/float(total_seen)
-
-
if __name__ == "__main__":
    # try/finally so the log file is closed even when evaluation raises.
    try:
        log_string('pid: %s'%(str(os.getpid())))
        train()
    finally:
        LOG_FOUT.close()
|