from data import *
from utils.augmentations import SSDAugmentation, BaseTransform
from utils.functions import MovingAverage, SavePath
from utils.logger import Log
from utils import timer
from layers.modules import MultiBoxLoss
from yolact import Yolact

import os
import sys
import time
import math, random
from pathlib import Path

import paddle
from paddle import amp
import paddle.nn as nn
import paddle.optimizer as optim

import numpy as np
import argparse
import datetime

# Oof
import eval as eval_script


def str2bool(v):
    return v.lower() in ("yes", "true", "t", "1")


parser = argparse.ArgumentParser(
    description='Yolact Training Script')

parser.add_argument('--trained_model',
                    default='weights/ssd300_mAP_77.43_v2.pth', type=str,
                    help='Trained state_dict file path to open. If "interrupt", this will open the interrupt file.')
parser.add_argument('--batch_size', default=8, type=int,
                    help='Batch size for training')
parser.add_argument('--resume', default=None, type=str,
                    help='Checkpoint state_dict file to resume training from. If this is "interrupt", '
                         'the model will resume training from the interrupt file.')
parser.add_argument('--start_iter', default=-1, type=int,
                    help='Resume training at this iter. If this is -1, the iteration will be '
                         'determined from the file name.')
parser.add_argument('--num_workers', default=4, type=int,
                    help='Number of workers used in dataloading')
parser.add_argument('--cuda', default=True, type=str2bool,
                    help='Use CUDA to train model')
parser.add_argument('--lr', '--learning_rate', default=None, type=float,
                    help='Initial learning rate. Leave as None to read this from the config.')
parser.add_argument('--momentum', default=None, type=float,
                    help='Momentum for SGD. Leave as None to read this from the config.')
parser.add_argument('--decay', '--weight_decay', default=None, type=float,
                    help='Weight decay for SGD. Leave as None to read this from the config.')
parser.add_argument('--gamma', default=None, type=float,
                    help='For each lr step, what to multiply the lr by. Leave as None to read this from the config.')
parser.add_argument('--save_folder', default='weights/',
                    help='Directory for saving checkpoint models.')
parser.add_argument('--log_folder', default='logs/',
                    help='Directory for saving logs.')
parser.add_argument('--config', default=None,
                    help='The config object to use.')
parser.add_argument('--save_interval', default=10000, type=int,
                    help='The number of iterations between saving the model.')
parser.add_argument('--validation_size', default=350000, type=int,
                    help='The number of images to use for validation.')
parser.add_argument('--validation_epoch', default=2, type=int,
                    help='Output validation information every n iterations. If -1, do no validation.')
parser.add_argument('--keep_latest', dest='keep_latest', action='store_true',
                    help='Only keep the latest checkpoint instead of each one.')
parser.add_argument('--keep_latest_interval', default=100000, type=int,
                    help='When --keep_latest is on, don\'t delete the latest file at these intervals. This should be a multiple of save_interval or 0.')
parser.add_argument('--dataset', default=None, type=str,
                    help='If specified, override the dataset specified in the config with this one (example: coco2017_dataset).')
parser.add_argument('--no_log', dest='log', action='store_false',
                    help='Don\'t log per-iteration information into log_folder.')
parser.add_argument('--log_gpu', dest='log_gpu', action='store_true',
                    help='Include GPU information in the logs. nvidia-smi tends to be slow, so set this with caution.')
parser.add_argument('--no_interrupt', dest='interrupt', action='store_false',
                    help='Don\'t save an interrupt when KeyboardInterrupt is caught.')
parser.add_argument('--batch_alloc', default=None, type=str,
                    help='If using multiple GPUs, you can set this to a comma-separated list detailing which GPUs should get what local batch size (it should add up to your total batch size).')
parser.add_argument('--no_autoscale', dest='autoscale', action='store_false',
                    help='YOLACT will automatically scale the lr and the number of iterations depending on the batch size. Set this if you want to disable that.')

parser.set_defaults(keep_latest=False, log=True, log_gpu=False, interrupt=True, autoscale=True)
args = parser.parse_args()

if args.config is not None:
    set_cfg(args.config)

if args.dataset is not None:
    set_dataset(args.dataset)

if args.autoscale and args.batch_size != 8:
    factor = args.batch_size / 8
    if __name__ == '__main__':
        print('Scaling parameters by %.2f to account for a batch size of %d.' % (factor, args.batch_size))

    cfg.lr *= factor
    cfg.max_iter //= factor
    cfg.lr_steps = [x // factor for x in cfg.lr_steps]
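
# A worked example of the scaling above: with --batch_size 16 the factor is
# 16 / 8 = 2.0, so the base lr doubles while cfg.max_iter and every entry of
# cfg.lr_steps are halved, keeping the overall schedule roughly equivalent.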

# Update training parameters from the config if necessary
def replace(name):
    if getattr(args, name) is None:
        setattr(args, name, getattr(cfg, name))

replace('lr')
replace('decay')
replace('gamma')
replace('momentum')

# This is managed by set_lr
cur_lr = args.lr

if 'gpu' not in paddle.get_device():
    print('No GPUs detected. Exiting...')
    exit(-1)

if args.batch_size // paddle.distributed.ParallelEnv().nranks < 6:
    if __name__ == '__main__':
        print('Per-GPU batch size is less than the recommended limit for batch norm. Disabling batch norm.')
    cfg.freeze_bn = True

loss_types = ['B', 'C', 'M', 'P', 'D', 'E', 'S', 'I']

if paddle.is_compiled_with_cuda():
    paddle.set_device("gpu")
else:
    paddle.set_device("cpu")


class NetLoss(nn.Layer):
    """
    A wrapper for running the network and computing the loss.
    This is so we can more efficiently use DataParallel.
    """

    def __init__(self, net: Yolact, criterion: MultiBoxLoss):
        super().__init__()

        self.net = net
        self.criterion = criterion

    def forward(self, inputs):
        images, targets, masks, num_crowds = inputs
        preds = self.net(images)
        losses = self.criterion(self.net, preds, targets, masks, num_crowds)
        return losses
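
# A minimal usage sketch (assuming a constructed Yolact and MultiBoxLoss):
#   wrapped = NetLoss(net, criterion)
#   losses = wrapped((images, targets, masks, num_crowds))  # dict keyed by loss_types
# Under paddle.DataParallel each replica returns its own loss dict, which is
# why train() below takes the mean of every entry before summing.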

# class CustomDataParallel(nn.DataParallel):
#     """
#     This is a custom version of DataParallel that works better with our training data.
#     It should also be faster than the general case.
#     """
#
#     def scatter(self, inputs, kwargs, device_ids):
#         # More like scatter and data prep at the same time. The point is we prep the data in such a way
#         # that no scatter is necessary, and there's no need to shuffle stuff around different GPUs.
#         devices = ['cuda:' + str(x) for x in device_ids]
#         splits = prepare_data(inputs[0], devices, allocation=args.batch_alloc)
#
#         return [[split[device_idx] for split in splits] for device_idx in range(len(devices))], \
#                [kwargs] * len(devices)
#
#     def gather(self, outputs, output_device):
#         out = {}
#
#         for k in outputs[0]:
#             out[k] = torch.stack([output[k].to(output_device) for output in outputs])
#
#         return out


class PiecewiseDecay(object):
    """
    Multi-step learning rate decay.
    Args:
        gamma (float | list): decay factor(s); a scalar is expanded to
            [gamma, gamma / 10, gamma / 100, ...], one entry per milestone
        milestones (list): steps at which to decay the learning rate
    """

    def __init__(self,
                 gamma=[0.1, 0.01],
                 milestones=[280000, 360000, 400000],
                 values=None,
                 use_warmup=True):
        super(PiecewiseDecay, self).__init__()
        if not isinstance(gamma, list):
            self.gamma = [gamma / 10 ** i for i in range(len(milestones))]
        else:
            self.gamma = gamma
        self.milestones = milestones
        self.values = values
        self.use_warmup = use_warmup

    def __call__(self,
                 base_lr=None,
                 boundary=None,
                 value=None,
                 step_per_epoch=None):
        if boundary is not None and self.use_warmup:
            boundary.extend([int(step_per_epoch) * i for i in self.milestones])
        else:
            # no LinearWarmup: the lr stays at base_lr until the first milestone
            boundary = [int(step_per_epoch) * i for i in self.milestones]
            value = [base_lr]

        # self.values can be set directly in the config
        if self.values is not None:
            assert len(self.milestones) + 1 == len(self.values)
            return optim.lr.PiecewiseDecay(boundary, self.values)

        # otherwise the values are computed from self.gamma
        value = value if value is not None else [base_lr]
        for i in self.gamma:
            value.append(base_lr * i)

        return optim.lr.PiecewiseDecay(boundary, value)
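
# For example, PiecewiseDecay(gamma=0.1, milestones=[280000, 360000, 400000])
# called with base_lr=1e-3, step_per_epoch=1 and no warmup boundary yields
# boundaries [280000, 360000, 400000] and values [1e-3, 1e-4, 1e-5, 1e-6].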


class LinearWarmup(object):
    """
    Warm up the learning rate linearly.
    Args:
        steps (int): warm up steps
        start_factor (float): learning rate to start warming up from; despite
            the name it is used as an absolute lr (YOLACT's cfg.lr_warmup_init),
            not as a multiplier of base_lr
    """

    def __init__(self, steps=500, start_factor=1. / 3):
        super(LinearWarmup, self).__init__()
        self.steps = steps
        self.start_factor = start_factor

    def __call__(self, base_lr, step_per_epoch):
        boundary = []
        value = []
        for i in range(self.steps + 1):
            if self.steps > 0:
                alpha = i / self.steps
                lr = (base_lr - self.start_factor) * alpha + self.start_factor
                value.append(lr)
            if i > 0:
                boundary.append(i)
        return boundary, value
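
# For example, LinearWarmup(steps=500, start_factor=1e-4)(base_lr=1e-3, step_per_epoch=1)
# (YOLACT's usual defaults) returns boundary = [1, 2, ..., 500] and value = 501
# lrs interpolated linearly from 1e-4 at step 0 up to 1e-3 at step 500.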


class LearningRate(object):
    """
    Learning rate configuration.
    Args:
        base_lr (float): base learning rate
        schedulers (list): learning rate schedulers
    """
    __category__ = 'optim'

    def __init__(self,
                 schedulers=[PiecewiseDecay(gamma=cfg.gamma, milestones=cfg.lr_steps),
                             LinearWarmup(steps=cfg.lr_warmup_until, start_factor=cfg.lr_warmup_init)]):
        super(LearningRate, self).__init__()
        self.schedulers = schedulers

    def __call__(self, step_per_epoch):
        assert len(self.schedulers) >= 1
        # warmup
        boundary, value = self.schedulers[1](cfg.lr, step_per_epoch)
        # decay
        decay_lr = self.schedulers[0](cfg.lr, boundary, value, step_per_epoch)
        return decay_lr
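
# Composed, the two schedulers yield a single paddle.optimizer.lr.PiecewiseDecay:
# with the example numbers above, the boundaries are [1..500] from the warmup
# plus the three decay milestones, and the values are the 501 warmup lrs followed
# by base_lr * gamma_i per milestone (so len(values) == len(boundaries) + 1).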


class BaseDataLoader(object):
    def __init__(self, dataloader, n):
        self.dataloader = dataloader
        self.n = n
        self.loader = iter(self.dataloader)

    def __len__(self):
        return self.n

    def __iter__(self):
        return self

    def __next__(self):
        try:
            return next(self.loader)
        except StopIteration:
            # Reset the underlying iterator for the next epoch, then re-raise
            # so the caller still sees the epoch boundary.
            self.loader = iter(self.dataloader)
            raise

    def next(self):
        # python2 compatibility
        return self.__next__()
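
# Usage sketch: wrap a finite paddle.io.DataLoader so that iterating past the
# end restarts it for the next epoch instead of leaving a dead iterator:
#   loader = BaseDataLoader(data_loader, n=len(data_loader))
#   for datum in loader: ...  # StopIteration still marks each epoch boundary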


def train():
    if not os.path.exists(args.save_folder):
        os.mkdir(args.save_folder)

    scaler = amp.GradScaler(enable=True, init_loss_scaling=1024)

    dataset = COCODetection(image_path=cfg.dataset.train_images,
                            info_file=cfg.dataset.train_info,
                            transform=SSDAugmentation(MEANS))

    if args.validation_epoch > 0:
        setup_eval()
        val_dataset = COCODetection(image_path=cfg.dataset.valid_images,
                                    info_file=cfg.dataset.valid_info,
                                    transform=BaseTransform(MEANS))

    # Parallel wraps the underlying module, but when saving and loading we don't want that
    yolact_net = Yolact()
    net = yolact_net
    net.train()

    if args.log:
        log = Log(cfg.name, args.log_folder, dict(args._get_kwargs()),
                  overwrite=(args.resume is None), log_gpu_stats=args.log_gpu)

    # I don't use the timer during training (I use a different timing method).
    # Apparently there's a race condition with multiple GPUs, so disable it just to be safe.
    timer.disable_all()

    # Both of these can set args.resume to None, so do them before the check
    if args.resume == 'interrupt':
        args.resume = SavePath.get_interrupt(args.save_folder)
    elif args.resume == 'latest':
        args.resume = SavePath.get_latest(args.save_folder, cfg.name)

    if args.resume is not None:
        print('Resuming training, loading {}...'.format(args.resume))
        yolact_net.load_weights(args.resume)

        if args.start_iter == -1:
            args.start_iter = SavePath.from_str(args.resume).iteration
    else:
        print('Initializing weights...')
        yolact_net.init_weights(backbone_path=args.save_folder + cfg.backbone.path)

    grad_clip = nn.ClipGradByGlobalNorm(clip_norm=35)
    lr_scheduler = LearningRate()(1)  # step_per_epoch=1, so all milestones are in iterations
    optimizer = optim.Momentum(parameters=net.parameters(), learning_rate=lr_scheduler,
                               momentum=args.momentum, grad_clip=grad_clip,
                               weight_decay=args.decay)

    if args.start_iter != -1:
        optim_state_dict = paddle.load(os.path.join(args.save_folder, 'model_%d.pdopt' % args.start_iter))
        optimizer.set_state_dict(optim_state_dict)

    criterion = MultiBoxLoss(num_classes=cfg.num_classes,
                             pos_threshold=cfg.positive_iou_threshold,
                             neg_threshold=cfg.negative_iou_threshold,
                             negpos_ratio=cfg.ohem_negpos_ratio)

    if args.batch_alloc is not None:
        args.batch_alloc = [int(x) for x in args.batch_alloc.split(',')]
        if sum(args.batch_alloc) != args.batch_size:
            print('Error: Batch allocation (%s) does not sum to batch size (%s).' % (args.batch_alloc, args.batch_size))
            exit(-1)

    nranks = paddle.distributed.ParallelEnv().nranks
    local_rank = paddle.distributed.ParallelEnv().local_rank

    if nranks > 1:
        # Initialize the parallel environment if that hasn't been done yet.
        if not paddle.distributed.parallel.parallel_helper._is_parallel_ctx_initialized():
            paddle.distributed.init_parallel_env()
        net = paddle.DataParallel(NetLoss(net, criterion))
    else:
        # Single-card training still needs the loss wrapper so that net(datum)
        # below returns the loss dict.
        net = NetLoss(net, criterion)

    # Initialize everything
    if not cfg.freeze_bn: yolact_net.freeze_bn()  # Freeze bn so we don't kill our means
    # yolact_net(paddle.zeros((1, 3, cfg.max_size, cfg.max_size)))
    if not cfg.freeze_bn: yolact_net.freeze_bn(True)

    # loss counters
    loc_loss = 0
    conf_loss = 0
    iteration = max(args.start_iter, 0)
    last_time = time.time()

    epoch_size = len(dataset) // args.batch_size
    num_epochs = math.ceil(cfg.max_iter / epoch_size)

    # Which learning rate adjustment step are we on? lr' = lr * gamma ^ step_index
    step_index = 0

    batch_sampler = paddle.io.DistributedBatchSampler(
        dataset, batch_size=args.batch_size, shuffle=True, drop_last=True)
    data_loader = paddle.io.DataLoader(
        dataset,
        batch_sampler=batch_sampler,
        num_workers=0,  # NOTE: args.num_workers is not used; workers are pinned to 0 in this port
        collate_fn=detection_collate,
        return_list=True,
    )

    save_path = lambda epoch, iteration: SavePath(cfg.name, epoch, iteration).get_path(root=args.save_folder)
    time_avg = MovingAverage()

    global loss_types  # Forms the print order
    loss_avgs = {k: MovingAverage(100) for k in loss_types}

    print('Begin training!')
    print()
    # try-except so you can use ctrl+c to save early and stop training
    try:
        for epoch in range(num_epochs):
            # Resume from start_iter
            if (epoch + 1) * epoch_size < iteration:
                continue

            for datum in data_loader:
                # When resuming from start_iter, stop once we've reached the epoch boundary
                if iteration == (epoch + 1) * epoch_size:
                    break

                # Stop at the configured number of iterations even if mid-epoch
                if iteration == cfg.max_iter:
                    break

                # Change a config setting if we've reached the specified iteration
                changed = False
                for change in cfg.delayed_settings:
                    if iteration >= change[0]:
                        changed = True
                        cfg.replace(change[1])

                        # Reset the loss averages because things might have changed
                        for avg in loss_avgs.values():
                            avg.reset()

                # If a config setting was changed, remove it from the list so we don't keep checking
                if changed:
                    cfg.delayed_settings = [x for x in cfg.delayed_settings if x[0] > iteration]

                # Linear warmup & PiecewiseDecay are both wrapped inside LearningRate

                # Zero the grad to get ready to compute gradients
                optimizer.clear_grad()
                curr_lr = optimizer.get_lr()

                # Forward pass + compute the loss at the same time (see NetLoss above)
                with amp.auto_cast(enable=True):
                    losses = net(datum)

                losses = {k: v.mean() for k, v in losses.items()}  # Mean here because of DataParallel
                loss = paddle.add_n([losses[k] for k in losses])

                # Backprop through the scaled loss, then let the scaler unscale and step
                scaled_loss = scaler.scale(loss)
                scaled_loss.backward()
                scaler.minimize(optimizer, scaled_loss)
                lr_scheduler.step()

                # Add the loss to the moving average for bookkeeping
                for k in losses:
                    loss_avgs[k].add(losses[k].item())

                cur_time = time.time()
                elapsed = cur_time - last_time
                last_time = cur_time

                # Exclude graph setup from the timing information
                if iteration != args.start_iter:
                    time_avg.add(elapsed)

                if iteration % 10 == 0:
                    eta_str = str(datetime.timedelta(seconds=(cfg.max_iter - iteration) * time_avg.get_avg())).split('.')[0]

                    total = sum([loss_avgs[k].get_avg() for k in losses])
                    loss_labels = sum([[k, loss_avgs[k].get_avg()] for k in loss_types if k in losses], [])

                    print(('[%3d] %7d ||' + ' lr: %.7f |' + (' %s: %.3f |' * len(losses)) + ' T: %.3f || ETA: %s || timer: %.3f')
                          % tuple([epoch, iteration, curr_lr] + loss_labels + [total, eta_str, elapsed]), flush=True)

                if args.log and iteration % 10 == 0:
                    precision = 5
                    loss_info = {k: round(losses[k].item(), precision) for k in losses}
                    loss_info['T'] = round(loss.item(), precision)

                    if args.log_gpu:
                        log.log_gpu_stats = (iteration % 10 == 0)  # nvidia-smi is sloooow

                    log.log('train', loss=loss_info, epoch=epoch, iter=iteration,
                            lr=round(curr_lr, 10), elapsed=elapsed)

                    log.log_gpu_stats = args.log_gpu

                iteration += 1

                if iteration % args.save_interval == 0 and iteration != args.start_iter:
                    if args.keep_latest:
                        latest = SavePath.get_latest(args.save_folder, cfg.name)

                    print('Saving state, iter:', iteration)
                    yolact_net.save_weights(save_path(epoch, iteration))
                    paddle.save(optimizer.state_dict(),
                                os.path.join(args.save_folder, 'model_%d.pdopt' % iteration))

                    if args.validation_epoch > 0:  # validate alongside each checkpoint save
                        compute_validation_map(epoch, iteration, yolact_net, val_dataset, log if args.log else None)

                    if args.keep_latest and latest is not None:
                        if args.keep_latest_interval <= 0 or iteration % args.keep_latest_interval != args.save_interval:
                            print('Deleting old save...')
                            os.remove(latest)

            # This is done per epoch
            # if args.validation_epoch > 0:
            #     if epoch % args.validation_epoch == 0 and epoch > 0:
            #         compute_validation_map(epoch, iteration, yolact_net, val_dataset, log if args.log else None)

        # Compute validation mAP after training is finished
        # compute_validation_map(epoch, iteration, yolact_net, val_dataset, log if args.log else None)
    except KeyboardInterrupt:
        if args.interrupt:
            print('Stopping early. Saving network...')

            # Delete previous copy of the interrupted network so we don't spam the weights folder
            SavePath.remove_interrupt(args.save_folder)
            paddle.save(optimizer.state_dict(),
                        os.path.join(args.save_folder, 'model_%d.pdopt' % iteration))
            yolact_net.save_weights(save_path(epoch, repr(iteration) + '_interrupt'))
        exit()

    yolact_net.save_weights(save_path(epoch, iteration))
    paddle.save(optimizer.state_dict(),
                os.path.join(args.save_folder, 'model_%d.pdopt' % iteration))


def set_lr(optimizer, new_lr):
    # Paddle optimizers expose set_lr() directly; there is no param_groups
    # attribute as in torch.
    optimizer.set_lr(new_lr)

    global cur_lr
    cur_lr = new_lr


def gradinator(x):
    x.stop_gradient = True
    return x


def prepare_data(datum, devices: list = None, allocation: list = None):
    with paddle.no_grad():
        if devices is None:
            devices = ['gpu:0'] if args.cuda else ['cpu']
        if allocation is None:
            allocation = [args.batch_size // len(devices)] * (len(devices) - 1)
            allocation.append(args.batch_size - sum(allocation))  # The rest might need more/less

        images, (targets, masks, num_crowds) = datum

        cur_idx = 0
        for device, alloc in zip(devices, allocation):
            for _ in range(alloc):
                # Tensor.to(device) requires a reasonably recent Paddle (>= 2.5)
                images[cur_idx] = gradinator(images[cur_idx].to(device))
                targets[cur_idx] = gradinator(targets[cur_idx].to(device))
                masks[cur_idx] = gradinator(masks[cur_idx].to(device))
                cur_idx += 1

        if cfg.preserve_aspect_ratio:
            # Choose a random size from the batch
            _, h, w = images[random.randint(0, len(images) - 1)].shape

            for idx, (image, target, mask, num_crowd) in enumerate(zip(images, targets, masks, num_crowds)):
                images[idx], targets[idx], masks[idx], num_crowds[idx] \
                    = enforce_size(image, target, mask, num_crowd, w, h)

        cur_idx = 0
        split_images, split_targets, split_masks, split_numcrowds \
            = [[None for alloc in allocation] for _ in range(4)]

        for device_idx, alloc in enumerate(allocation):
            split_images[device_idx] = paddle.stack(images[cur_idx:cur_idx + alloc], axis=0)
            split_targets[device_idx] = targets[cur_idx:cur_idx + alloc]
            split_masks[device_idx] = masks[cur_idx:cur_idx + alloc]
            split_numcrowds[device_idx] = num_crowds[cur_idx:cur_idx + alloc]

            cur_idx += alloc

        return split_images, split_targets, split_masks, split_numcrowds
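
# Allocation example: with args.batch_size = 8 and two devices the default split
# is [4, 4]; with batch_size = 10 and three devices it is [3, 3, 4], since the
# last device absorbs the remainder.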


def no_inf_mean(x: paddle.Tensor):
    """
    Computes the mean of a vector, throwing out all inf values.
    If there are no non-inf values, this will return inf (i.e., just the normal mean).
    """

    no_inf = [a for a in x if paddle.isfinite(a)]

    if len(no_inf) > 0:
        return sum(no_inf) / len(no_inf)
    else:
        return x.mean()
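
# e.g. no_inf_mean(paddle.to_tensor([1.0, float('inf'), 3.0])) gives 2.0,
# where a plain .mean() would give inf.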


def compute_validation_loss(net, data_loader, criterion):
    global loss_types

    with paddle.no_grad():
        losses = {}

        # Don't switch to eval mode because we want to get losses
        iterations = 0
        for datum in data_loader:
            images, targets, masks, num_crowds = prepare_data(datum)
            out = net(images)

            # Mirror the criterion call in NetLoss (the torch-era ScatterWrapper is gone)
            _losses = criterion(net, out, targets, masks, num_crowds)

            for k, v in _losses.items():
                v = v.mean().item()
                if k in losses:
                    losses[k] += v
                else:
                    losses[k] = v

            iterations += 1
            if args.validation_size <= iterations * args.batch_size:
                break

        for k in losses:
            losses[k] /= iterations

        loss_labels = sum([[k, losses[k]] for k in loss_types if k in losses], [])
        print(('Validation ||' + (' %s: %.3f |' * len(losses))) % tuple(loss_labels), flush=True)


def compute_validation_map(epoch, iteration, yolact_net, dataset, log: Log = None):
    with paddle.no_grad():
        yolact_net.eval()

        start = time.time()
        print()
        print("Computing validation mAP (this may take a while)...", flush=True)
        val_info = eval_script.evaluate(yolact_net, dataset, train_mode=True)
        end = time.time()

        if log is not None:
            log.log('val', val_info, elapsed=(end - start), epoch=epoch, iter=iteration)

        yolact_net.train()


def setup_eval():
    eval_script.parse_args(['--no_bar', '--max_images=' + str(args.validation_size)])


if __name__ == '__main__':
    train()