import os
import datetime
import glob
from pprint import pprint, pformat

import numpy as np

import mindspore as ms
import mindspore.dataset as ds
import mindspore.log as logger
import mindspore.nn as nn
from mindspore import context, ParameterTuple
from mindspore.common import dtype as mstype
from mindspore.common import Parameter, set_seed
from mindspore.common.tensor import Tensor
from mindspore.communication.management import init, get_rank, get_group_size
from mindspore.context import ParallelMode
from mindspore.nn.optim.adam import Adam
from mindspore.ops import operations as P
from mindspore.ops.composite import GradOperation
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor
from mindspore.train.loss_scale_manager import DynamicLossScaleManager, FixedLossScaleManager
from mindspore.train.model import Model
from mindspore.train.serialization import load_checkpoint, load_param_into_net

import src.dataset as datasets
import src.models as models
import src.model_utils.tools as tools
from src.eval_callback import EvalCallBack
from src.metric import FlowNetEPE
from src.model_utils.config import config
from src.model_utils.device_adapter import get_device_id, get_device_num, get_rank_id
from src.submodels.custom_ops.test_custom import Correlation, Resample2D
# from src.submodels.custom_ops.testBatch import Resample2D


# Set the directory where checkpoints are saved
def set_save_ckpt_dir():
    """Set the checkpoint save directory, with a per-rank subdirectory for distributed runs."""
    ckpt_save_dir = config.save_checkpoint_path
    if config.enable_modelarts and config.run_distribute:
        ckpt_save_dir = ckpt_save_dir + "/ckpt_" + str(get_rank_id()) + "/"
    elif config.run_distribute:
        ckpt_save_dir = ckpt_save_dir + "/ckpt_" + str(get_rank()) + "/"
    return ckpt_save_dir


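# For example (illustrative values, not from the repo config): with
# save_checkpoint_path="./output" and a distributed run on rank 0,
# set_save_ckpt_dir() returns "./output/ckpt_0/"; in a single-device run
# the configured path is returned unchanged.

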
# Run one evaluation pass and return the requested metric
def apply_eval(eval_param):
    eval_model = eval_param["model"]
    eval_ds = eval_param["dataset"]
    metrics_name = eval_param["metrics_name"]
    res = eval_model.eval(eval_ds, dataset_sink_mode=False)
    return res[metrics_name]


# Load a pre-trained checkpoint into the network
def load_pre_trained_checkpoint(net, pre_trained, checkpoint_path):
    """Load the newest checkpoint from a directory, or a single .ckpt file, into net."""
    param_dict = None
    if pre_trained:
        if os.path.isdir(checkpoint_path):
            ckpt_save_dir = os.path.join(checkpoint_path, "ckpt_0")
            ckpt_pattern = os.path.join(ckpt_save_dir, "*.ckpt")
            ckpt_files = glob.glob(ckpt_pattern)
            if not ckpt_files:
                logger.warning(f"There is no ckpt file in {ckpt_save_dir}, "
                               f"pre_trained is unsupported.")
            else:
                ckpt_files.sort(key=os.path.getmtime, reverse=True)
                time_stamp = datetime.datetime.now()
                print(f"time stamp {time_stamp.strftime('%Y.%m.%d-%H:%M:%S')}"
                      f" pre trained ckpt model {ckpt_files[0]} loading",
                      flush=True)
                param_dict = load_checkpoint(ckpt_files[0])
        elif os.path.isfile(checkpoint_path):
            param_dict = load_checkpoint(checkpoint_path)
        else:
            print(f"Invalid pre_trained {checkpoint_path} parameter.")
            return
    # Only load when a checkpoint was actually found;
    # load_param_into_net(net, None) would raise a TypeError otherwise.
    if param_dict is not None:
        load_param_into_net(net, param_dict)
        print(f"loaded param from {checkpoint_path} into net")


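# Minimal usage sketch (not called anywhere; the checkpoint path below is a
# hypothetical placeholder, not a file shipped with this repo):
#
#     net = tools.module_to_dict(models)[config.model](config.rgb_max, config.batchNorm)
#     load_pre_trained_checkpoint(net, pre_trained=True,
#                                 checkpoint_path="./checkpoints/flownet2.ckpt")

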
# Add the checkpoint-saving callback
def add_ckpt_callback(step_size, ckpt_save_dir, cbs):
    if config.save_checkpoint:
        config_ck = CheckpointConfig(save_checkpoint_steps=step_size * config.save_ckpt_interval,
                                     keep_checkpoint_max=config.keep_checkpoint_max)
        ckpoint_cb = ModelCheckpoint(prefix="flownet2_", directory=ckpt_save_dir, config=config_ck)
        cbs += [ckpoint_cb]


# Add the evaluation callback
def add_eval_callback(model, ckpt_save_dir, cbs):
    if config.run_evalCallback:
        if config.eval_data_path is None or (not os.path.isdir(config.eval_data_path)):
            raise ValueError("{} is not an existing path.".format(config.eval_data_path))

        config.eval_dataset_class = tools.module_to_dict(datasets)[config.eval_data]
        flownet_eval_gen = config.eval_dataset_class(config.crop_type, config.crop_size, config.eval_size,
                                                     config.eval_data_path)
        # For distributed evaluation, a sampler could be passed here, e.g.:
        # sampler = datasets.DistributedSampler(flownet_eval_gen, rank, group_size)
        eval_dataset = ds.GeneratorDataset(flownet_eval_gen, ["images", "flow"],
                                           num_parallel_workers=config.num_parallel_workers,
                                           max_rowsize=config.max_rowsize)
        eval_dataset = eval_dataset.batch(config.batch_size)

        eval_param_dict = {"model": model, "dataset": eval_dataset, "metrics_name": "mean error"}
        eval_cb = EvalCallBack(apply_eval, eval_param_dict, interval=config.eval_interval,
                               eval_start_epoch=config.eval_start_epoch, save_best_ckpt=config.save_best_ckpt,
                               ckpt_directory=ckpt_save_dir, besk_ckpt_name="best_acc.ckpt",
                               metrics_name="FlownetAcc")
        cbs += [eval_cb]


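# Sketch of how these helpers compose into a training run. It mirrors the
# commented-out run_train() below; `config.epoch_size` is an assumed config
# field and the function is illustrative, not called anywhere.
def _example_train_loop(model, train_dataset, step_size):
    cbs = [TimeMonitor(data_size=step_size), LossMonitor()]
    ckpt_save_dir = set_save_ckpt_dir()
    add_ckpt_callback(step_size, ckpt_save_dir, cbs)  # appends ModelCheckpoint if enabled
    add_eval_callback(model, ckpt_save_dir, cbs)      # appends EvalCallBack if enabled
    model.train(config.epoch_size, train_dataset, callbacks=cbs, dataset_sink_mode=False)

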
# def run_train():
#     set_seed(config.seed)
#     ms.set_context(mode=ms.context.GRAPH_MODE, enable_graph_kernel=False, device_target=config.device_target, device_id=5)
#     ds.config.set_enable_shared_mem(False)
#     device_num = get_device_num()
#     if config.device_target == "Ascend":
#         config.device_id = get_device_id()
#         # TODO(lpf): check whether the rank and group_size setup is correct
#         # rank = config.device_id
#         # group_size = get_group_size()
#         # ms.set_context(device_id=config.device_id)
#         if device_num > 1:
#             ms.context.reset_auto_parallel_context()
#             ms.context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
#                                                  gradients_mean=True)
#             init()
#     elif config.device_target == "GPU":
#         if config.run_distribute:
#             init()
#             parallel_mode = ParallelMode.DATA_PARALLEL
#             rank = get_rank()
#             print(rank)
#             group_size = get_group_size()
#         else:
#             parallel_mode = ParallelMode.STAND_ALONE
#             rank = 0
#             group_size = 1
#
#         ms.context.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=group_size)
#
#     # load dataset by config param
#     config.training_dataset_class = tools.module_to_dict(datasets)[config.train_data]
#     flownet_train_gen = config.training_dataset_class(config.crop_type, config.crop_size, config.eval_size,
#                                                       config.train_data_path)
#     sampler = datasets.DistributedSampler(flownet_train_gen, rank=0, group_size=1, shuffle=True)
#     print('sampler')
#     train_dataset = ds.GeneratorDataset(flownet_train_gen, ["images", "flow"],
#                                         sampler=sampler, num_parallel_workers=config.num_parallel_workers)
#     train_dataset = train_dataset.batch(config.batch_size)
#     step_size = train_dataset.get_dataset_size()
#
#     # load model by config param
#     config.model_class = tools.module_to_dict(models)[config.model]
#     net = config.model_class(config.rgb_max, config.batchNorm)


# Hardcoded device settings for this standalone gradient test
context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=6)


class Net(nn.Cell):
    """Toy network: scales x by the learnable parameter z, then matrix-multiplies by y."""

    def __init__(self):
        super(Net, self).__init__()
        self.matmul = P.MatMul()
        self.z = Parameter(Tensor(np.array([1.0], np.float32)), name='z')

    def construct(self, x, y):
        x = x * self.z
        out = self.matmul(x, y)
        return out


class GradNetWrtX(nn.Cell):
    """Wraps a network and returns the gradient of its output w.r.t. the first input."""

    def __init__(self, net):
        super(GradNetWrtX, self).__init__()
        self.net = net
        self.grad_op = GradOperation()

    def construct(self, x, y):
        gradient_function = self.grad_op(self.net)
        # The wrapped net takes both inputs; GradOperation() differentiates
        # w.r.t. the first one by default.
        return gradient_function(x, y)


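# Self-contained check of GradNetWrtX using the toy Net above. Shapes are
# chosen to satisfy MatMul (x: 2x3, y: 3x3); the values are arbitrary and the
# function is illustrative, not called anywhere.
def _example_grad_wrt_x():
    x = Tensor(np.ones((2, 3), np.float32))
    y = Tensor(np.ones((3, 3), np.float32))
    # Returns d(out)/dx, with the same shape as x.
    return GradNetWrtX(Net())(x, y)

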
class GradNetWrtXY(nn.Cell):
    """Like GradNetWrtX, but get_all=True returns gradients for every input."""

    def __init__(self, net):
        super(GradNetWrtXY, self).__init__()
        self.net = net
        self.grad_op = GradOperation(get_all=True)

    def construct(self, x, y):
        gradient_function = self.grad_op(self.net)
        return gradient_function(x, y)


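# Companion check for GradNetWrtXY: with get_all=True the gradient function
# returns a tuple with one gradient per network input. Illustrative only,
# not called anywhere.
def _example_grad_wrt_xy():
    x = Tensor(np.ones((2, 3), np.float32))
    y = Tensor(np.ones((3, 3), np.float32))
    grad_x, grad_y = GradNetWrtXY(Net())(x, y)  # (d_out/d_x, d_out/d_y)
    return grad_x, grad_y

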
if __name__ == '__main__':
    print('==' * 30)
    print('==' * 30)
    # x3 = Tensor(np.arange(0, 2 * 3 * 20 * 20 * 20).reshape(2, 3, 20, 20, 20).astype(np.float32))
    # x4 = Tensor(np.arange(0, 2 * 3 * 20 * 20 * 20).reshape(2, 3, 20, 20, 20).astype(np.float32))
    x3 = Tensor(np.arange(0, 2 * 3 * 2 * 384 * 512).reshape(2, 3, 2, 384, 512).astype(np.float32))
    x4 = Tensor(np.arange(0, 2 * 3 * 2 * 384 * 512).reshape(2, 3, 2, 384, 512).astype(np.float32))

    config.model_class = tools.module_to_dict(models)[config.model]
    print('config.model_class = ', config.model_class)
    net = config.model_class(config.rgb_max, config.batchNorm)
    output = GradNetWrtX(net)(x3, x4)
    print('output', output)