|
- import os
- import mindspore
- import mindspore.nn as nn
- import mindspore.common.dtype as mstype
- from mindspore import context
- import mindspore.dataset as ds
- import mindspore.dataset.transforms.c_transforms as C2
- import mindspore.dataset.vision.c_transforms as C
- from mindspore import Tensor,ops
- from CosFace import MobileFaceNet, WholeNet
- from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits
- from mindspore.nn.optim import Momentum, SGD
- from mindspore.ops import operations as P
- from mindspore.train.model import Model
- from mindspore.common.initializer import initializer
- from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor
- from lr_schedule import get_multi_step_lr, warmup_cosine_annealing_lr
- import numpy as np
- import argparse
- from collections import Counter
- from mindspore.communication.management import init, get_rank, get_group_size
- import moxing as mox
- from mindspore.context import ParallelMode
- from mindspore.train.callback import Callback
- import time
-
-
# Device / distributed setup. DEVICE_ID is injected by the Ascend job
# launcher; default to "0" so the script also runs when it is not exported
# (the original crashed with TypeError on int(None)).
device_id = int(os.getenv('DEVICE_ID', '0'))
context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
context.set_context(device_id=device_id)  # bind this process to its NPU
init()  # initialize collective communication for data-parallel training


# Default workspace root: 'debug' targets an interactive (notebook) session,
# 'train' targets a submitted training job.
# environment = 'debug'
environment = 'train'
if environment == 'debug':
    workroot = '/home/ma-user/work'       # root used by debug tasks
else:
    workroot = '/home/work/user-job-dir'  # root used by training tasks
print('current work mode:' + environment + ', workroot:' + workroot)
-
class BuildTrainNetwork(nn.Cell):
    """Bundle a backbone network with its loss criterion as a single Cell.

    The forward pass feeds (input_data, label) through the wrapped network
    and evaluates the criterion on the resulting output, returning the loss.
    """

    def __init__(self, network, criterion):
        super(BuildTrainNetwork, self).__init__()
        self.network = network
        self.criterion = criterion

    def construct(self, input_data, label):
        # The backbone also consumes the label (margin-based head).
        logits = self.network(input_data, label)
        return self.criterion(logits, label)
-
def get_parse():
    """Build and parse the command-line arguments for CosFace training.

    Returns:
        argparse.Namespace with all hyper-parameters and data/model paths.

    Note: help texts were corrected to match the actual defaults (the
    originals claimed 0.075 / 32 / 0.5 for values defaulting to 0.1 / 64.0
    / 0.35).
    """
    parser = argparse.ArgumentParser(description='Face Recognition')
    parser.add_argument('--batch_size', type=int, default=512, help='the batch num')
    parser.add_argument("--init_lr", type=float, default=0.1,
                        help="the init Learning rate, default is 0.1.")
    parser.add_argument("--lr_strategy", type=str, default="Multistep",
                        help="the lr strategy: 'default' (constant init lr), 'cosine' or 'Multistep'; default is Multistep")
    parser.add_argument('--data_url', help='path to training/inference dataset folder',
                        default=workroot + '/data/')
    parser.add_argument('--train_url', help='model folder to save/load',
                        default=workroot + '/model/')
    parser.add_argument('--device_target', type=str, default="Ascend", choices=['Ascend', 'CPU'])
    parser.add_argument('--num_s', type=float, default=64.0,
                        help="CosFace scale s, default is 64.0")
    parser.add_argument('--num_m', type=float, default=0.35,
                        help="CosFace margin m, default is 0.35")
    parser.add_argument("--epoch_size", type=int, default=100, help="Epoch size, default is 100.")
    parser.add_argument('--num_save', type=int, default=20, help="the gap of save model")
    parser.add_argument('--dataset_url', type=str, default="https://open-data.obs.cn-south-222.ai.pcl.cn:443/attachment/d/d/dd114064-ca14-4792-91d4-f573b65d1aef/Align-CASIA-WebFace.zip?response-content-disposition=attachment%3B+filename%3D%22Align-CASIA-WebFace.zip%22&AWSAccessKeyId=UJN8OQXLVBV0J9IHDGN9&Expires=1652454091&Signature=IVeI0PGqXAxrUEyRG2JmsZWm82c%3D"
                        )
    args_opt = parser.parse_args()
    return args_opt
-
def create_dataset(data_dir, batch_size):
    """Build the sharded CASIA-WebFace training pipeline.

    Args:
        data_dir: root folder in ImageFolderDataset layout (class-per-dir).
        batch_size: per-device batch size; incomplete batches are dropped.

    Returns:
        A mindspore.dataset pipeline yielding (image, label) batches,
        sharded across the distributed group.
    """
    rank_id = get_rank()
    rank_size = get_group_size()

    # Channel statistics scaled to the 0-255 pixel range.
    mean = [0.4914 * 255, 0.4822 * 255, 0.4465 * 255]
    std = [0.2023 * 255, 0.1994 * 255, 0.2010 * 255]

    image_ops = [
        C.RandomHorizontalFlip(),
        C.Resize((112, 96)),
        C.Normalize(mean=mean, std=std),
        C.HWC2CHW(),
    ]
    label_ops = [C2.TypeCast(mstype.int32)]

    pipeline = ds.ImageFolderDataset(data_dir, decode=True,
                                     num_shards=rank_size, shard_id=rank_id)
    pipeline = pipeline.map(input_columns='image', operations=image_ops)
    pipeline = pipeline.map(input_columns='label', operations=label_ops)
    pipeline = pipeline.project(columns=["image", "label"])
    pipeline = pipeline.shuffle(buffer_size=10)
    pipeline = pipeline.batch(batch_size=batch_size, drop_remainder=True)
    pipeline = pipeline.repeat(1)
    return pipeline
-
-
class My_Callback(Callback):
    """Training callback: epoch-boundary timing logs plus periodic
    checkpoint upload to OBS via moxing.

    Args:
        totoal_epoch: total number of epochs (parameter name kept for
            caller compatibility; 'total_epoch' is intended).
        save_epoch: upload checkpoints every ``save_epoch`` epochs.
        epoch_step: number of steps per epoch.
        save_dir: local checkpoint directory to upload.
        obs_dir: destination OBS URL.
    """

    def __init__(self, totoal_epoch, save_epoch, epoch_step, save_dir, obs_dir):
        super(My_Callback, self).__init__()
        self.save_epoch = save_epoch
        self.totoal_epoch = totoal_epoch
        self.epoch_step = epoch_step
        self.save_dir = save_dir
        self.obs_dir = obs_dir
        self.begin_time = None  # set when training starts

    def begin(self, run_context):
        """Record the wall-clock start of training."""
        self.begin_time = time.time()

    def step_end(self, run_context):
        """Log progress and upload checkpoints at epoch boundaries."""
        cb_params = run_context.original_args()
        epoch_num = cb_params.cur_epoch_num
        step_num = cb_params.cur_step_num

        # In graph (sink) mode the host only observes the last step of each
        # epoch, so all reporting happens on that step.
        if step_num % self.epoch_step == 0:
            loss = cb_params.net_outputs
            elapsed = time.time() - self.begin_time
            step_time = elapsed / step_num
            epoch_time = (elapsed * self.epoch_step) / step_num
            print(" --{7:}-- Epoch: [{0:}/{1:}], Step: [{2:}/{3:}], Loss: {4:}, Each Step: {5:.3f} s , Each Epoch: {6:.3f} h ".format(
                epoch_num, self.totoal_epoch, self.epoch_step, self.epoch_step,
                loss, step_time, epoch_time / 3600, str(get_rank())))

            if epoch_num % self.save_epoch == 0:
                # Bug fix: the original uploaded the module-level globals
                # `train_dir`/`obs_train_url` instead of the directories this
                # instance was configured with.
                try:
                    mox.file.copy_parallel(self.save_dir, self.obs_dir)
                    print("Successfully Upload {} to {}".format(self.save_dir, self.obs_dir))
                except Exception as e:
                    print('moxing upload {} to {} failed: '.format(self.save_dir, self.obs_dir) + str(e))
-
class SoftmaxCrossEntropyExpand(nn.Cell):
    """Softmax cross-entropy expressed with primitive ops.

    Subtracts the row-wise max before exponentiating for numerical
    stability and adds a small epsilon inside the log.

    Args:
        sparse: when True, integer labels are one-hot encoded internally.
    """

    def __init__(self, sparse=False):
        super(SoftmaxCrossEntropyExpand, self).__init__()
        self.sparse = sparse
        # Primitive operators are held as attributes for graph mode.
        self.exp = ops.Exp()
        self.sum = ops.ReduceSum(keep_dims=True)
        self.onehot = ops.OneHot()
        self.on_value = Tensor(1.0, mstype.float32)
        self.off_value = Tensor(0.0, mstype.float32)
        self.div = ops.RealDiv()
        self.log = ops.Log()
        self.sum_cross_entropy = ops.ReduceSum(keep_dims=False)
        self.mul = ops.Mul()
        self.mul2 = ops.Mul()
        self.mean = ops.ReduceMean(keep_dims=False)
        self.max = ops.ReduceMax(keep_dims=True)
        self.sub = ops.Sub()
        self.eps = Tensor(1e-24, mstype.float32)

    def construct(self, logit, label):
        """Return the mean cross-entropy loss over the batch."""
        # Stabilize: shift logits by their per-row maximum.
        shifted = self.sub(logit, self.max(logit, -1))
        exp_val = self.exp(shifted)
        probs = self.div(exp_val, self.sum(exp_val, -1))
        if self.sparse:
            label = self.onehot(label, ops.shape(logit)[1], self.on_value, self.off_value)
        log_probs = self.log(probs + self.eps)
        per_sample = self.sum_cross_entropy(self.mul(log_probs, label), -1)
        per_sample = self.mul2(ops.scalar_to_array(-1.0), per_sample)
        return self.mean(per_sample, -1)
-
-
if __name__ == '__main__':

    args_opt = get_parse()
    print('args:')
    print(args_opt)

    data_dir = workroot + '/data'     # dataset download/extract location
    train_dir = workroot + '/model/'  # local checkpoint location
    obs_train_url = args_opt.train_url
    # Fix: os.mkdir raised if the directory already existed; makedirs with
    # exist_ok is idempotent and also creates missing parents.
    os.makedirs(data_dir, exist_ok=True)
    os.makedirs(train_dir, exist_ok=True)

    # Fix: paths were hard-coded to the 'train' workroot, breaking debug
    # mode; derive them from data_dir instead.
    # NOTE(review): dataset_url is interpolated into a shell command. It is
    # a trusted default here, but never pass untrusted input through it.
    zip_path = os.path.join(data_dir, 'train.zip')
    cmd1 = "wget -O {} '{}'".format(zip_path, args_opt.dataset_url)
    cmd2 = "unzip {} -d {}".format(zip_path, data_dir)
    print("开始下载数据集")
    # Fix: the original iterated the returned string character-by-character.
    download_log = os.popen(cmd1).read()
    print(download_log)
    print("完成下载数据集")

    print("开始解压数据集")
    for line in os.popen(cmd2):
        print(line)
    print("完成解压数据集")

    context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL,
                                      gradients_mean=True, parameter_broadcast=True)
    epoch_size = args_opt.epoch_size
    dataset = create_dataset(os.path.join(data_dir, "Align-CASIA-WebFace/CASIA-WebFace-112X96"),
                             args_opt.batch_size)
    if dataset.get_dataset_size() == 0:
        raise ValueError(
            "Please check dataset size > 0 and batch_size <= dataset size")

    net = WholeNet(num_class=10575, num_s=args_opt.num_s, num_m=args_opt.num_m)
    loss = SoftmaxCrossEntropyExpand(sparse=True)

    # Learning-rate schedule. Schedules are generated at 8x step resolution
    # and subsampled to one value per step (presumably an 8-device global
    # schedule — TODO confirm against lr_schedule helpers).
    if args_opt.lr_strategy == 'default':
        lr = args_opt.init_lr
    elif args_opt.lr_strategy == 'Multistep':
        lr = get_multi_step_lr(dataset.get_dataset_size() * 8,
                               init_lr=args_opt.init_lr, epoch=epoch_size)
        lr = Tensor(lr[::8])
    elif args_opt.lr_strategy == 'cosine':
        lr = warmup_cosine_annealing_lr(dataset.get_dataset_size() * 8,
                                        init_lr=args_opt.init_lr, max_epoch=epoch_size)
        lr = Tensor(lr[::8])
    else:
        # Fix: the original fell through with `lr` undefined for any other
        # value and crashed later with a NameError.
        raise ValueError("unknown lr_strategy: {}".format(args_opt.lr_strategy))

    # Per-group weight decay: heavier decay on the embedding layer and the
    # margin-product weight, light decay on the rest, none on PReLU slopes.
    params = net.trainable_params()
    linear1_params = [p for p in params if 'backbone.linear1' in p.name]
    product_weight_params = [p for p in params if 'product.weight' in p.name]
    prelu_params = [p for p in params if 'prelu' in p.name]
    base_params = [p for p in params if 'backbone.linear1' not in p.name
                   and 'product.weight' not in p.name
                   and 'prelu' not in p.name]
    group_params = [{'params': base_params, 'weight_decay': 4e-5},
                    {'params': linear1_params, 'weight_decay': 4e-4},
                    {'params': product_weight_params, 'weight_decay': 4e-4},
                    # Fix: prelu params were filtered out of every group and
                    # therefore never passed to the optimizer at all; train
                    # them without weight decay.
                    {'params': prelu_params, 'weight_decay': 0.0}]
    opt = Momentum(group_params, lr, 0.9)

    train_net = BuildTrainNetwork(net, loss)
    model = Model(train_net, optimizer=opt)
    print(dataset.num_classes())
    batch_num = dataset.get_dataset_size()
    print(batch_num)

    check_dir = train_dir + "/ckpt_" + str(get_rank()) + "/"
    print("保存地址", check_dir)
    check_str = "CosFace_{0:}_s-{1:}_m-0{2:}".format(args_opt.lr_strategy,
                                                     int(args_opt.num_s),
                                                     int(100 * args_opt.num_m))
    config_ck = CheckpointConfig(save_checkpoint_steps=batch_num * args_opt.num_save,
                                 keep_checkpoint_max=5)
    ckpoint_cb = ModelCheckpoint(prefix=check_str, directory=check_dir, config=config_ck)
    loss_cb = LossMonitor()

    my_call = My_Callback(epoch_size, args_opt.num_save, batch_num, train_dir, obs_train_url)
    model.train(epoch_size, dataset, callbacks=[ckpoint_cb, loss_cb, my_call],
                dataset_sink_mode=True)
|