|
- # Copyright 2022 PCL. All Rights Reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- # ============================================================================
- """ PanguAlpha train script """
-
- import datetime
- import glob
- import os, sys
- import math
- import time
- from pathlib2 import Path
-
- from mindspore import context
- from mindspore.train.model import Model
- import mindspore.communication.management as D
- from mindspore.context import ParallelMode
- import mindspore.nn as nn
- from mindspore.train.callback import TimeMonitor, Callback
- from mindspore.nn.wrap.loss_scale import DynamicLossScaleUpdateCell
- import mindspore.common.dtype as mstype
- from mindspore.parallel import set_algo_parameters
- from mindspore.parallel._cost_model_context import _set_multi_subgraphs
- from mindspore.nn.wrap.cell_wrapper import PipelineCell, _VirtualDatasetCell
- from mindspore.train.callback import ModelCheckpoint, CheckpointConfig
- from mindspore.train.serialization import load_checkpoint, load_param_into_net
-
- from src.dataset import create_dataset
- from src.pangu_alpha import PanguAlpha, PanguAlphaWithLoss, CrossEntropyLoss
- from src.pangu_alpha_wrapcell import PanguAlphaTrainOneStepWithLossScaleCell, PanguAlphaTrainPipelineWithLossScaleCell
- from src.pangu_alpha_config import PANGUALPHAConfig, set_parse
- from src.utils import LearningRate, get_args, FP32StateAdamWeightDecay
- from src.utils import download_data, ckpt_copy_tar_new, get_ckpt_file_list
- from src.utils import StrategySaveCallback, CheckpointSaveCallback, LossSummaryCallback
-
- import moxing as mox
- from mindspore.common import Parameter
- import mindspore.common.dtype as mstype
- from mindspore.common.tensor import Tensor
-
- import numpy as np
- import AISyncore as asc
- from typing import Dict, List
- from mindspore import save_checkpoint
-
# Directory one level above the folder that holds this script (symlinks
# resolved first); printed once at import time.
_script_dir = os.path.dirname(os.path.realpath(__file__))
project_root = os.path.abspath(_script_dir + os.path.sep + "..")
print('project_root:', project_root)
-
-
class LossCallBack(Callback):
    """
    Callback that prints the training loss at the end of a step.

    A line is printed only when ``dataset_size`` is positive, ``local_rank``
    is a multiple of 8 and the rank returned by ``D.get_rank()`` is a
    multiple of 8.

    Args:
        dataset_size: number of steps treated as one epoch when deriving the
            reported epoch index; values <= 0 disable reporting.
        local_rank: rank of this process, used for filtering and in the log.
        has_trained_epoch: offset added to the reported epoch index.
        has_trained_step: offset added to the reported step number.
        micro_size: divisor applied to the raw loss before printing.
    """

    def __init__(self, dataset_size=-1, local_rank=0, has_trained_epoch=0, has_trained_step=0, micro_size=1):
        super().__init__()
        self._dataset_size = dataset_size
        self.local_rank = local_rank
        self.has_trained_epoch = has_trained_epoch
        self.has_trained_step = has_trained_step
        self.micro_size = micro_size
        print("load has trained epoch :{} and step: {}".format(has_trained_epoch, has_trained_step), flush=True)

    def step_end(self, run_context):
        """
        Print time, rank, epoch, step, loss, overflow flag and loss scale.
        """
        params = run_context.original_args()
        reporting_enabled = self._dataset_size > 0 and self.local_rank % 8 == 0
        if not reporting_enabled:
            return

        # Split steps/dataset_size into fractional and whole parts; a step
        # that lands exactly on a boundary is attributed to the epoch that
        # just finished.
        fraction, whole_epochs = math.modf(params.cur_step_num / self._dataset_size)
        if fraction == 0:
            whole_epochs -= 1

        timestamp = time.asctime(time.localtime(time.time()))
        scaled_loss = params.net_outputs[0].asnumpy() / self.micro_size

        D.init()
        if D.get_rank() % 8 != 0:
            return

        template = "time: {} local_rank: {}, epoch: {}, step: {}, loss is {}, overflow is {}, scale is {}"
        reported_epoch = int(whole_epochs) + int(self.has_trained_epoch)
        reported_step = params.cur_step_num + int(self.has_trained_step)
        print(template.format(timestamp,
                              int(self.local_rank),
                              reported_epoch,
                              reported_step,
                              scaled_loss,
                              params.net_outputs[1].asnumpy(),
                              params.net_outputs[2].asnumpy()))
-
def add_checkpoint_callback_policy(args_param, callback, rank_id):
    r"""
    Append the checkpoint-saving callbacks to ``callback``.

    Nothing is added unless ``args_param.save_checkpoint`` is truthy and
    ``rank_id`` is 0.

    Args:
        args_param: parsed arguments. Read here: ``save_checkpoint``,
            ``save_checkpoint_steps``, ``save_checkpoint_path``,
            ``save_checkpoint_bucket_dir``, ``ckpt_name_prefix``,
            ``has_trained_epoches`` and ``has_trained_steps``.
        callback: list of callbacks, extended in place.
        rank_id: rank of the calling process.

    Returns:
        None.
    """
    if not args_param.save_checkpoint or rank_id != 0:
        return

    # The checkpoint stores epoch_num and step_num alongside the weights.
    ckpt_append_info = [{"epoch_num": args_param.has_trained_epoches,
                         "step_num": args_param.has_trained_steps}]
    ckpt_config = CheckpointConfig(save_checkpoint_steps=args_param.save_checkpoint_steps,
                                   keep_checkpoint_max=1,
                                   integrated_save=True,
                                   append_info=ckpt_append_info)

    # exist_ok=True already tolerates an existing directory, so no separate
    # existence check is needed (and none can race with another process).
    os.makedirs(args_param.save_checkpoint_path, exist_ok=True)

    ckpoint_cb = ModelCheckpoint(prefix=args_param.ckpt_name_prefix,
                                 directory=args_param.save_checkpoint_path,
                                 config=ckpt_config)

    # NOTE(review): presumably mirrors the local checkpoints to the bucket
    # every `syn_obs_steps` steps -- confirm in src.utils.
    ckpt_save_obs_cb = CheckpointSaveCallback(local_ckpt_dir=args_param.save_checkpoint_path,
                                              local_rank=rank_id,
                                              has_trained_epoch=args_param.has_trained_epoches,
                                              has_trained_step=args_param.has_trained_steps,
                                              bucket=args_param.save_checkpoint_bucket_dir,
                                              syn_obs_steps=args_param.save_checkpoint_steps)
    callback.append(ckpoint_cb)
    callback.append(ckpt_save_obs_cb)
-
def _auto_parallel_settings(args_opt, strategy_load_file=None):
    """
    Build the keyword arguments for ``context.set_auto_parallel_context``.

    Args:
        args_opt: parsed arguments; ``full_batch``, ``optimizer_shard`` and
            ``device_num`` are read.
        strategy_load_file: local strategy checkpoint to load, or None to
            leave ``strategy_ckpt_load_file`` unset.

    Returns:
        dict of keyword arguments, in the order they are to be passed.
    """
    settings = {
        'parallel_mode': ParallelMode.SEMI_AUTO_PARALLEL,
        'gradients_mean': False,
        'full_batch': bool(args_opt.full_batch),
    }
    if strategy_load_file is not None:
        settings['strategy_ckpt_load_file'] = strategy_load_file
    settings['enable_parallel_optimizer'] = bool(args_opt.optimizer_shard)
    # More than 64 devices: set the optimizer weight shard size to 64;
    # otherwise request aggregated saving of the sharded optimizer weights.
    if args_opt.device_num > 64:
        settings['optimizer_weight_shard_size'] = 64
    else:
        settings['optimizer_weight_shard_aggregated_save'] = True
    settings['strategy_ckpt_save_file'] = '/cache/strategy.ckpt'
    return settings


def load_train_net(args_opt):
    r"""
    Configure the runtime and build everything needed for training.

    Sets the execution and (when ``args_opt.distribute == "true"``) parallel
    context, fetches the data unless ``args_opt.offline`` is set, builds the
    PanguAlpha network with its loss, optimizer and loss-scale wrapper, and
    creates the dataset and the logging callbacks.

    Args:
        args_opt: parsed command-line arguments.

    Returns:
        Tuple ``(pangu_alpha, args_opt, model, ds, callback)``: the bare
        network, the arguments, the ``Model`` wrapping the training cell,
        the dataset and the list of callbacks.
    """
    # Set hccl connect time
    os.environ['HCCL_CONNECT_TIMEOUT'] = "6000"

    # Set execution mode
    context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target)
    context.set_context(variable_memory_max_size="30GB")
    print(args_opt)

    # Set parallel context
    if args_opt.distribute == "true":
        D.init()
        device_num = D.get_group_size()
        rank = D.get_rank()
        print("rank_id is {}, device_num is {}".format(rank, device_num))

        context.reset_auto_parallel_context()

        local_strategy_ckpt_path = "/cache/ckpt_strategy.ckpt"
        if args_opt.pre_trained:
            # NOTE(review): os.system runs the command in a child shell, so
            # this ulimit does not change the stack limit of this Python
            # process. Kept as-is; confirm whether a limit is really needed.
            os.system('ulimit -s 102400')
            mox.file.copy(src_url=args_opt.strategy_load_ckpt_path, dst_url=local_strategy_ckpt_path)
            parallel_settings = _auto_parallel_settings(args_opt, local_strategy_ckpt_path)
        else:
            parallel_settings = _auto_parallel_settings(args_opt)
        context.set_auto_parallel_context(**parallel_settings)

        set_algo_parameters(elementwise_op_strategy_follow=True)
        _set_multi_subgraphs()
    else:
        rank = 0
        device_num = 1
    context.set_context(save_graphs=False, save_graphs_path="./graphs_of_device_id_" + str(rank))

    # Copy data from the cloud to /cache/Data unless running offline, in
    # which case data_url is used directly as the dataset path.
    cache_url = "/cache/Data/"
    if args_opt.offline:
        cache_url = args_opt.data_url
    else:
        download_data(src_data_url=args_opt.data_url, tgt_data_path=cache_url, rank=rank)

    # Set model property
    model_parallel_num = args_opt.op_level_model_parallel_num
    data_parallel_num = int(device_num / model_parallel_num)
    batch_size = args_opt.per_batch_size * data_parallel_num
    config = PANGUALPHAConfig(
        data_parallel_num=data_parallel_num,
        model_parallel_num=model_parallel_num,
        batch_size=batch_size,
        seq_length=args_opt.seq_length,
        vocab_size=args_opt.vocab_size,
        embedding_size=args_opt.embedding_size,
        num_layers=args_opt.num_layers,
        num_heads=args_opt.num_heads,
        expand_ratio=4, dropout_rate=0.1,
        compute_dtype=mstype.float16,
        stage_num=args_opt.stage_num,
        micro_size=args_opt.micro_size,
        eod_reset=bool(args_opt.eod_reset),
        # No checkpoint path is handed to the config here.
        load_ckpt_path=None,
        param_init_type=mstype.float32 if args_opt.param_init_type == 'fp32' else mstype.float16,
        word_emb_dp=bool(args_opt.word_emb_dp))
    print("===config is: ", config, flush=True)

    # Define network
    pangu_alpha = PanguAlpha(config)
    loss = CrossEntropyLoss(config)
    pangu_alpha_with_loss = PanguAlphaWithLoss(config, pangu_alpha, loss)
    pangu_alpha_with_loss = _VirtualDatasetCell(pangu_alpha_with_loss)

    # Learning-rate schedule built from the start/end rates and the
    # warm-up / decay step counts.
    lr = LearningRate(learning_rate=args_opt.start_lr, end_learning_rate=args_opt.end_lr,
                      warmup_steps=args_opt.warmup_step, decay_steps=args_opt.decay_steps)

    # Set weight decay coefficient, zero for bias and layernorm, 1e-1 for rest
    def _needs_weight_decay(param):
        lowered = param.name.lower()
        return 'layernorm' not in lowered and "bias" not in lowered

    params = pangu_alpha.trainable_params()
    decay_params = [p for p in params if _needs_weight_decay(p)]
    other_params = [p for p in params if not _needs_weight_decay(p)]
    group_params = [{
        'params': decay_params,
        'weight_decay': 1e-1
    }, {
        'params': other_params,
        'weight_decay': 0.0
    }, {
        'order_params': params
    }]
    if args_opt.optimizer == "lamb":
        optimizer = nn.Lamb(group_params, learning_rate=lr)
    else:
        optimizer = FP32StateAdamWeightDecay(group_params, learning_rate=lr, eps=1e-8, beta1=0.9, beta2=0.94)

    # Initial scaling sens
    loss_scale_value = math.pow(2, 12)
    callback_size = args_opt.sink_size

    # The has_trained_* counters are only read when resuming from a
    # pre-trained checkpoint; otherwise logging starts from zero.
    if args_opt.pre_trained:
        trained_epochs = args_opt.has_trained_epoches
        trained_steps = args_opt.has_trained_steps
    else:
        trained_epochs = 0
        trained_steps = 0
    callback = [
        TimeMonitor(callback_size),
        LossCallBack(callback_size, rank, trained_epochs, trained_steps)
    ]

    update_cell = DynamicLossScaleUpdateCell(loss_scale_value=loss_scale_value, scale_factor=2, scale_window=1500)
    pangu_alpha_with_grads = PanguAlphaTrainOneStepWithLossScaleCell(
        pangu_alpha_with_loss,
        optimizer=optimizer,
        scale_update_cell=update_cell,
        enable_global_norm=True,
        config=config)
    model = Model(pangu_alpha_with_grads)

    if not mox.file.exists(args_opt.save_checkpoint_bucket_dir):
        mox.file.make_dirs(args_opt.save_checkpoint_bucket_dir)
    epoch_num = args_opt.epoch_size

    # Dataset loading mindrecord files
    ds = create_dataset(config.batch_size,
                        data_path=cache_url,
                        data_start_index=args_opt.data_start_index,
                        eod_reset=config.eod_reset,
                        full_batch=bool(args_opt.full_batch),
                        eod_id=args_opt.eod_id,
                        device_num=device_num,
                        rank=rank,
                        column_name=args_opt.data_column_name,
                        epoch=epoch_num)

    return pangu_alpha, args_opt, model, ds, callback
-
def run_train(model, args_opt, ds, callback, rank_id, epoch_idx):
    """
    Run one round of training, or only initialise the model on round 0.

    For ``epoch_idx == 0`` the model is initialised with the dataset and no
    training happens. For later rounds, ranks below 8 train directly; every
    other rank first waits for the round's "Done" flag object, downloads the
    checkpoint published under ``rank_id % 8`` and loads it into
    ``model.train_network`` before training.

    Args:
        model: object exposing ``train``, ``_init`` and ``train_network``.
        args_opt: parsed arguments; only ``sink_size`` is read.
        ds: dataset exposing ``get_dataset_size()``.
        callback: list of callbacks passed to ``model.train``.
        rank_id: rank of the calling process.
        epoch_idx: index of the current round.

    Returns:
        None.
    """
    step_per_epoch = ds.get_dataset_size()
    callback_size = args_opt.sink_size
    # Number of sink-sized chunks that make up one pass over the dataset.
    actual_epoch_num = int(step_per_epoch / callback_size)
    print("=====dataset size: ", step_per_epoch, flush=True)
    print("=====actual_epoch_num: ", actual_epoch_num, flush=True)
    print("Dataset size: {}, actual_epoch_num: {}".format(step_per_epoch, actual_epoch_num), flush=True)

    if epoch_idx == 0:
        model._init(train_dataset=ds, sink_size=2)
        return

    if rank_id >= 8:
        # synchronization parameters
        poll_interval_s = 3
        obs_ckpt_dir = f"s3://obs_path_to_save_ckpt/{rank_id%8}_epoch--{str(epoch_idx)}.ckpt"
        obs_restore_flags = f"s3://obs_path_to_save_ckpt/{rank_id%8}_Done--{str(epoch_idx)}.txt"
        local_ckpt_dir = f"/cache/ckpt/{rank_id%8}_model_local.ckpt"

        # Block until the flag object for this round exists.
        while not mox.file.exists(obs_restore_flags):
            print(f"{obs_restore_flags} not found, sleep {poll_interval_s}s ...")
            time.sleep(poll_interval_s)

        time0 = time.time()
        mox.file.copy(obs_ckpt_dir, local_ckpt_dir)
        print(f"copy {obs_ckpt_dir} to {local_ckpt_dir}...")
        print("Download ckpt time use: ", time.time()-time0)
        # Size of the downloaded checkpoint in MiB, followed by its path.
        print(os.path.getsize(local_ckpt_dir) / (1024 * 1024), local_ckpt_dir, '\n\n')
        state_dict = load_checkpoint(local_ckpt_dir)
        load_param_into_net(model.train_network, state_dict)

    model.train(actual_epoch_num, ds, callbacks=callback, sink_size=callback_size, dataset_sink_mode=True)
-
def main(opt):
    """
    Build the training objects and run the training rounds for this rank.

    Ranks below 8 start an ``asc`` NumPy client that exchanges the network
    parameters with a server and trains inside ``fit``. All other ranks call
    ``run_train`` for rounds ``0 .. epoch_num`` themselves.

    Args:
        opt: parsed command-line arguments, forwarded to ``load_train_net``.
    """
    net, args_opt, model, ds, callback = load_train_net(opt)
    D.init()
    rank_id = D.get_rank()
    epoch_num = 10

    def _shared_param_names():
        """Names of the network parameters without 'adam' in them, in network order."""
        return [name for name in net.parameters_dict() if 'adam' not in name]

    class Node_MindSpore_NPU(asc.client.NumPyClient):
        """
        Client that exchanges the network parameters as NumPy arrays.

        ``count`` tracks the round: while it is 0, ``get_parameters`` returns
        a placeholder array and ``set_parameters`` ignores its input.
        """

        def __init__(self):
            self.obs_ckpt_dir = f"s3://obs_path_to_save_ckpt/{rank_id}_epoch--0.ckpt"
            self.local_ckpt_dir = f"/cache/ckpt/{rank_id}_model_local.ckpt"
            self.count = 0
            # Parameter names with the 'backbone.' prefix stripped; the
            # exchange order is the sorted order of these names.
            state_keys = [name.replace('backbone.', '') for name in _shared_param_names()]
            self.keys = sorted(state_keys)
            # Position of each sorted name in network order (first
            # occurrence wins, as with list.index).
            first_position = {}
            for position, name in enumerate(state_keys):
                first_position.setdefault(name, position)
            self.keys_idx = [first_position[name] for name in self.keys]

        def get_parameters(self) -> List[np.ndarray]:
            """Return the local parameters as float32 arrays in exchange order."""
            # init network parameters
            if self.count == 0:
                return [np.ones([2, 128], dtype=np.float16)]
            # align parameters
            params = net.parameters_dict()
            names = [name for name in params if 'adam' not in name]
            return [np.array(params[names[key_idx]].asnumpy(), dtype=np.float32)
                    for key_idx in self.keys_idx]

        def set_parameters(self, parameters):
            """
            Load ``parameters`` into the network and publish a checkpoint.

            After loading, the network is saved locally, copied to the
            bucket, and an empty "Done" flag file for this round is uploaded.
            """
            # init network parameters
            if self.count == 0:
                self.count += 1
                return

            # align parameters: use the same 'adam'-filtered name list that
            # keys_idx was computed against.
            names = _shared_param_names()
            state_dict = {
                names[key_idx]: Parameter(np.array(parameters[idx], dtype=np.float32))
                for idx, key_idx in enumerate(self.keys_idx)
            }
            load_param_into_net(net, state_dict, strict_load=True)

            save_checkpoint(net, self.local_ckpt_dir, integrated_save=False)
            self.obs_ckpt_dir = self.obs_ckpt_dir.split('--')[0]+'--'+str(self.count)+'.ckpt'
            print("Save ckpt and copy to Obs ...", self.obs_ckpt_dir)
            mox.file.copy(self.local_ckpt_dir, self.obs_ckpt_dir)

            # Create an empty local flag file, then upload it.
            local_flag = f"/cache/{rank_id}_Done--{str(self.count)}.txt"
            with open(local_flag, 'w'):
                pass
            ckpt_download_done_flag = f"s3://obs_path_to_save_ckpt/{rank_id}_Done--{str(self.count)}.txt"
            mox.file.copy(local_flag, ckpt_download_done_flag)
            self.count += 1

        def fit(self, parameters, config):
            """Apply the incoming parameters, run one round, return the local ones."""
            self.set_parameters(parameters)
            run_train(model, args_opt, ds, callback, rank_id, epoch_idx=(self.count-1))
            return self.get_parameters(), 5, {}

        def evaluate(self, parameters, config):
            """Apply the incoming parameters and return fixed placeholder metrics."""
            self.set_parameters(parameters)
            # mPanGu don't use evaluate mode
            loss = 1.0
            return float(loss), 5, {"acc: ": float(0.9)}

    if rank_id < 8:
        SERVER_IP = '*.*.*.*'
        # One port per rank 0..7.
        port = [30000, 30010, 30002, 30003, 30004, 30005, 30006, 30009]
        asc.client.run_numpy_client(f"{SERVER_IP}:{port[rank_id]}",
                                    client=Node_MindSpore_NPU(),
                                    grpc_max_message_length=2 * 1024 * 1024 * 1024 - 1)
    else:
        for i in range(epoch_num+1):
            run_train(model, args_opt, ds, callback, rank_id, epoch_idx=i)
-
if __name__ == "__main__":
    # Script entry point: obtain the arguments, pass them through set_parse
    # (its return value is not used), then start training.
    opt = get_args()
    set_parse(opt)
    main(opt)
|