import os
import argparse
import math

import numpy as np

import mindspore
import mindspore.nn as nn
import mindspore.common.dtype as mstype
import mindspore.dataset.vision.c_transforms as C
import mindspore.dataset.transforms.c_transforms as C2
from mindspore import context
from mindspore import Tensor

from model_deepmar import Deep_Mar_v1
# from DeepMar_AlexNet import Deep_Mar_v1
from dataset import AttDataset
from evaluate import attribute_evaluate

#import sys
import moxing as mox

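# moxing (mox) is Huawei's OBS I/O library, preinstalled on ModelArts/OpenI
# training images; mox.file.copy_parallel mirrors a whole directory between
# OBS and the local filesystem.
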
# Default workspace root directory.
#environment = 'debug'
environment = 'train'
if environment == 'debug':
    workroot = '/home/ma-user/work'       # used by debug tasks
else:
    workroot = '/home/work/user-job-dir'  # used by training tasks
print('current work mode: ' + environment + ', workroot: ' + workroot)


parser = argparse.ArgumentParser(description='pedestrian attribute recognition')

# dataset parameters
parser.add_argument('--dataset', type=str, default='peta',
                    choices=['peta', 'rap', 'pa100k', 'rap2'])
parser.add_argument('--split', type=str, default='trainval',
                    choices=['trainval', 'train'])
parser.add_argument('--test_split', type=str, default='test')
parser.add_argument('--partition_idx', type=int, default=0)
parser.add_argument('--resize', type=eval, default=(224, 224))

parser.add_argument('--batch_size', type=int, default=256)
parser.add_argument('--height', type=int, default=256)
parser.add_argument('--width', type=int, default=256)
# model
parser.add_argument('--num_att', type=int, default=35)
parser.add_argument('--last_conv_stride', type=int, default=2, choices=[1, 2])

# optimizer
parser.add_argument('--sgd_weight_decay', type=float, default=0.0005)
parser.add_argument('--sgd_momentum', type=float, default=0.9)
parser.add_argument('--new_params_lr', type=float, default=0.001)
parser.add_argument('--finetuned_params_lr', type=float, default=0.001)
parser.add_argument('--staircase_decay_at_epochs', type=eval, default=(50, ))
parser.add_argument('--staircase_decay_multiple_factor', type=float, default=0.1)
parser.add_argument('--total_epochs', type=int, default=150)
# parser.add_argument('--weighted_entropy', type=str2bool, default=True)
# utils

parser.add_argument('--data_url',
                    help='path to training/inference dataset folder',
                    default=workroot + '/data/')
parser.add_argument('--train_url',
                    help='model folder to save/load',
                    default=workroot + '/model/')
parser.add_argument('--device_target',
                    type=str,
                    default='Ascend',
                    choices=['Ascend', 'CPU'],
                    help='device the code runs on (default: Ascend); to use an NPU '
                         'on the OpenI platform, add the run argument '
                         'device_target=Ascend in the training UI')
parser.add_argument('--epochs_per_save', type=int, default=10)
parser.add_argument('--run', type=int, default=1)
args = parser.parse_args()

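# Example launch (the script name and paths are assumptions; adjust to your
# environment):
#   python train.py --dataset peta --batch_size 256 --total_epochs 150 --device_target Ascend
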
if __name__ == '__main__':

    data_dir = workroot + '/data'    # dataset directory
    train_dir = workroot + '/model'  # model output directory
    # Create the dataset directory.
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)
    # Create the model output directory.
    obs_train_url = args.train_url
    train_dir = workroot + '/model/'
    if not os.path.exists(train_dir):
        os.makedirs(train_dir)
    ######## Copy the dataset from OBS into the training image (boilerplate) ########
    # Define data_url/train_url in the training environment and copy the data
    # from OBS to a fixed local path; here it is copied to
    # /home/work/user-job-dir/data/ (any other directory would work too).
    if environment == 'train':
        obs_data_url = args.data_url
        # Copy the data into the training environment.
        try:
            mox.file.copy_parallel(obs_data_url, data_dir)
            print('Successfully Download {} to {}'.format(obs_data_url, data_dir))
        except Exception as e:
            print('moxing download {} to {} failed: '.format(obs_data_url, data_dir) + str(e))
    ######## Copy the dataset from OBS into the training image ########

    # Important: this selects the training device (CPU or Ascend NPU) and
    # compiles the network as a static graph (GRAPH_MODE).
    context.set_context(mode=context.GRAPH_MODE,
                        device_target=args.device_target)


    dpath = data_dir
    datasets_path = os.path.join(dpath, 'peta_dataset.pkl')
    partitions_path = os.path.join(dpath, 'peta_partition.pkl')
    # Build the dataset pipelines.
    buffer_size = 1000
    # Data preprocessing ops.
    # decode_op = C.Decode()
    horizontal_flip_op = C.RandomHorizontalFlip(prob=0.5)
    resize_op = C.Resize((args.height, args.width))
    resize_op_test = C.Resize((224, 224))
    normalize_op = C.Normalize(mean=[0.485 * 255, 0.456 * 255, 0.406 * 255],
                               std=[0.229 * 255, 0.224 * 255, 0.225 * 255])
    change_swap_op = C.HWC2CHW()
    random_crop_op = C.RandomCrop(args.resize)
    trans_train = [resize_op, horizontal_flip_op, random_crop_op, normalize_op, change_swap_op]
    trans_valtest = [resize_op_test, normalize_op, change_swap_op]
    type_cast_op = C2.TypeCast(mstype.float32)
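    # Note: the mean/std above are the standard ImageNet statistics scaled by
    # 255, because C.Normalize is applied to raw pixel values in [0, 255]
    # (before HWC2CHW rearranges the channels).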

    # Build the training dataset.
    train_datasetgen = AttDataset(datasets_path, partitions_path, args.split,
                                  os.path.join(data_dir, 'images'))
    train_dataset = mindspore.dataset.GeneratorDataset(source=train_datasetgen,
                                                       column_names=['data', 'label'],
                                                       shuffle=True)
    train_dataset = train_dataset.map(operations=trans_train, input_columns='data', num_parallel_workers=1)
    train_dataset = train_dataset.map(operations=type_cast_op, input_columns='label', num_parallel_workers=1)
    # Apply shuffle and batch operations.
    train_dataset = train_dataset.shuffle(buffer_size=buffer_size)
    train_dataset = train_dataset.batch(args.batch_size, drop_remainder=True)

    # Build the validation dataset (batch size 1; it is not consumed below,
    # since evaluation goes through attribute_evaluate).
    val_datasetgen = AttDataset(datasets_path, partitions_path, 'val',
                                os.path.join(data_dir, 'images'))
    val_dataset = mindspore.dataset.GeneratorDataset(source=val_datasetgen,
                                                     column_names=['data', 'label'],
                                                     shuffle=True)
    val_dataset = val_dataset.map(operations=trans_valtest, input_columns='data', num_parallel_workers=1)
    val_dataset = val_dataset.map(operations=type_cast_op, input_columns='label', num_parallel_workers=1)
    val_dataset = val_dataset.shuffle(buffer_size=buffer_size)
    val_dataset = val_dataset.batch(1, drop_remainder=True)

    # Derive per-attribute positive/negative weights from the dataset for the
    # weighted BCE loss: w_pos = exp(1 - p), w_neg = exp(p), where p is the
    # positive ratio of each attribute label.
    rate = np.array(train_datasetgen.partition['weight_' + args.split][args.partition_idx])  # provided by the dataset itself
    rate = rate[train_datasetgen.dataset['selected_attribute']].tolist()
    if len(rate) != args.num_att:
        raise ValueError('the length of rate should be equal to %d' % args.num_att)
    weight_pos = []
    weight_neg = []
    for idx, v in enumerate(rate):
        weight_pos.append(math.exp(1.0 - v))
        weight_neg.append(math.exp(v))

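    # A vectorized equivalent of the loop above (same math, sketch):
    #   weight_pos = np.exp(1.0 - np.asarray(rate))
    #   weight_neg = np.exp(np.asarray(rate))
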
    # Training proceeds step by step; the weighted loss function is rebuilt
    # for every batch (see the note before the epoch loop below).
    model = Deep_Mar_v1()

    #preckp_path = os.path.join(sys.path[0], 'precheckpoint')
    #mobspath = 'obs://wangl97/pretrained_ckpt/'
    #mox.file.copy_parallel(mobspath, preckp_path)
    # Uncomment the lines above to fetch the pretrained checkpoint from OBS;
    # the block below loads it into the network.
    ############################################################################
    pretrain_checkpoint = os.path.join(data_dir, 'resnet50_frompytorch.ckpt')
    dic = mindspore.train.serialization.load_checkpoint(pretrain_checkpoint)
    # Remap the checkpoint keys onto the backbone ('base.' prefix) before loading.
    dic2 = {}
    for name in dic:
        dic2['base.' + name] = dic[name]
    # dic2.pop('base.fc1.weight')
    param_not_load = mindspore.train.serialization.load_param_into_net(model, parameter_dict=dic2, strict_load=False)
    print(param_not_load)  # parameters that were not found in the checkpoint
    #############################################################################

    classifier_param = []
    resnet_back_bone_param = []
    for param in model.trainable_params():
        if 'classifier' not in param.name and 'add_block' not in param.name:
            resnet_back_bone_param.append(param)
        else:
            # classifier / add_block parameters
            classifier_param.append(param)
    # Per-group learning rates: one lr for the finetuned backbone, a separate
    # lr for the new classifier head; 'order_params' keeps the original
    # parameter order.
    group_param = [{'params': resnet_back_bone_param, 'lr': args.finetuned_params_lr},
                   {'params': classifier_param, 'lr': args.new_params_lr},
                   {'order_params': model.trainable_params()}]
    # Define the optimizer.
    optim = nn.SGD(group_param, learning_rate=args.finetuned_params_lr,
                   weight_decay=args.sgd_weight_decay, momentum=args.sgd_momentum,
                   nesterov=True)
    # create_dict_iterator() defaults to num_epochs=-1, so the iterator can be
    # re-used across epochs (each for-loop over it yields one epoch).
    data_iter = train_dataset.create_dict_iterator()
    step_size = train_dataset.get_dataset_size()
    print('steps per epoch: {}'.format(step_size))

    savepath = os.path.join('./', 'deepmarTestckpt')
    if not os.path.exists(savepath):
        os.mkdir(savepath)
    print(savepath)

    best_epoch = 0
    best_value = 0

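    # Note: BCELoss, WithLossCell and TrainOneStepCell are re-created for every
    # batch below because the loss weights change per batch; in GRAPH_MODE each
    # new cell triggers a fresh graph compilation. A build-once alternative
    # would pass the weights in as a network input, e.g. with a hypothetical
    # loss cell (not part of this repo):
    #
    #   class WeightedBCE(nn.Cell):
    #       def __init__(self):
    #           super().__init__()
    #           self.bce = mindspore.ops.BinaryCrossEntropy(reduction='mean')
    #       def construct(self, prob, label, weight):
    #           return self.bce(prob, label, weight)
    #
    # wrapped so that TrainOneStepCell is constructed once before the loop.
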
    for e in range(args.total_epochs):
        train_loss = []
        step = 0
        for data in data_iter:
            step = step + 1
            images = data['data']
            target = data['label']
            label = target.asnumpy()
            shape = target.shape
            # Per-element weights fed into the loss function.
            weights = np.zeros(shape)
            for i in range(shape[0]):
                for j in range(shape[1]):
                    if label[i, j] == 0:  # is -1 (rather than 0) the negative label?
                        weights[i, j] = weight_neg[j]
                    elif label[i, j] == 1:
                        weights[i, j] = weight_pos[j]
                    else:
                        weights[i, j] = 0
            weights = Tensor(weights, mindspore.float32)
            loss_fn = mindspore.nn.BCELoss(weight=weights, reduction='mean')
            with_loss = nn.WithLossCell(model, loss_fn)
            train_step = nn.TrainOneStepCell(with_loss, optim)
            loss = train_step(images, target).asnumpy()
            train_loss.append(loss.item())
            print('Epoch: {}, Step: {}, Train Loss: {}'.format(e, step, loss))
        avg_loss = np.mean(train_loss)
        # if e % 5 == 0:
        testresult = attribute_evaluate(model, 224, 224, datasets_path, partitions_path,
                                        os.path.join(data_dir, 'images'))
        # Mean label-based accuracy over all attributes.
        label_acc_avg = np.sum(testresult['label_acc']) / len(testresult['label_acc'])

        if label_acc_avg > best_value:
            best_epoch = e
            best_value = label_acc_avg
            ckpt_file_name = 'deepmar_epoch{}_label_acc_avg_is_{}.ckpt'.format(e, label_acc_avg)
            save_file_path = os.path.join(train_dir, ckpt_file_name)
            mindspore.save_checkpoint(model, save_file_path)
            #mox.file.copy_parallel(savepath, 'obs://wangl97/deepmar_ckptsave/lrnochange/')
            print('save {} best now'.format(save_file_path))

        print('Epoch: {}, Avg Train Loss: {}, label_acc_avg: {}'.format(e, avg_loss, label_acc_avg))
        print('best epoch is: {}, its label_acc_avg is: {}'.format(best_epoch, best_value))


    ######## Copy the output model back to OBS (boilerplate) ########
    # Copy the trained model from the local environment back to OBS; the
    # corresponding OpenI training task then offers it for download.
    if environment == 'train':
        try:
            print('Ready for save')
            mox.file.copy_parallel(train_dir, obs_train_url)
            print('Successfully Upload {} to {}'.format(train_dir, obs_train_url))
        except Exception as e:
            print('moxing save failed')
            print('moxing upload {} to {} failed: '.format(train_dir, obs_train_url) + str(e))
    ######## Copy the output model back to OBS ########