PCL-Platform.Intelligence
/
pangu_alpha_pruning_NPU

 
			
							# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""
PanguAlpha train script
"""
import datetime
import glob
import os
import math
import time
import random
import mindspore
import moxing as mox
from pathlib2 import Path
from mindspore import context
from mindspore.train.model import Model
import mindspore.communication.management as D
from mindspore.context import ParallelMode
import mindspore.nn as nn
from mindspore.train.callback import TimeMonitor, Callback
from mindspore.nn.wrap.loss_scale import DynamicLossScaleUpdateCell
import mindspore.common.dtype as mstype
from mindspore.parallel import set_algo_parameters
from mindspore.parallel._cost_model_context import _set_multi_subgraphs
from mindspore.nn.wrap.cell_wrapper import _VirtualDatasetCell  # PipelineCell,
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig

from dataset_restore_data0 import create_dataset2 as create_dataset
from src.pangu_alpha_tiny_Masked import PanguAlpha, PanguAlphaWithLoss, CrossEntropyLoss, EvalNet_p, generate_samples_cftpd
from src.pangu_alpha_wrapcell import PanguAlphaTrainOneStepWithLossScaleCell, VirtualDatasetOneInputCell
from src.pangu_alpha_config import PANGUALPHAConfig, set_parse
from src.utils import LearningRate, get_args, FP32StateAdamWeightDecay

from mindspore.train.serialization import load_checkpoint, load_param_into_net, build_searched_strategy, merge_sliced_parameter
import numpy as np
from mindspore import Tensor

from utils_fix import LossSummaryCallback, StrategySaveCallback, Ckpt2ObsSummaryCallback
from download_dataset import DatasetDownloader

from mindspore.ops import operations as P
from mindspore.ops import functional as F
from mindspore.ops import composite as C
from tqdm import tqdm
from mindspore import dataset as de

SamplePercentage=0.033
SEED = 5

def set_manual_seed(seed=1):
    random.seed(seed)
    np.random.seed(seed)
    de.config.set_seed(seed)
    mindspore.common.set_seed(seed)
    
# TASK_NAME = 'CFT'
# BUCKET_DIR = 'obs://pcl-verify/yizx/distilPangu/datasets/pd_noBlank_0918/'
# LOCAL_PATH = "/cache/pd_noBlank_0918/"

TASK_NAME = 'Pretrained_DatasetSample1G'
BUCKET_DIR = 'obs://datasets/V1-sample300-bpe-1024/'
LOCAL_PATH = "/cache/V1-sample300-bpe-1024/"

def sample_manual_percentage_dataset_from_OBS_dir(obs_dir, local_dir, percent=0.033):
    SCAN_DIRECTORY = 's3://datasets/V1-sample300-bpe-1024'
    hf_obs_file = lambda *x: '/'.join([SCAN_DIRECTORY, *x])
    z = mox.file.list_directory(obs_dir)
    all_file_nums = len(z)
    need_to_select_oushu_files_num = int(all_file_nums * percent)
    if not need_to_select_oushu_files_num % 2 == 0:
        need_to_select_oushu_files_num += 1
    z_tmp = random.sample(z, need_to_select_oushu_files_num)
    for each_f in z:
#         obs_download_file(obs_client, hf_obs_file(each_f), './' + each_f)
#         print(hf_obs_file(each_f), LOCAL_PATH + each_f)
        mox.file.copy(hf_obs_file(each_f), LOCAL_PATH + each_f)

ops_slice = P.StridedSlice()
ops_notEqual = P.NotEqual()

def process_3inputsto4inputs_to_pangu(config, input_ids, input_position, attention_mask):
    tokens = ops_slice(input_ids, (0, 0), (config.batch_size // 1, -1), (1, 1))
    input_mask = F.cast(ops_notEqual(tokens, 6), mstype.float32)
    return tokens, input_mask, input_position, attention_mask

def ckpt_copy_tar(obs_path, target_path="/cache/ckpt"):
    """
        requires the obs_path to be a complete name
        Copy tar file from the obs to the /cache/
    """
    sub_name_list = ['_0.tar', '_1.tar', '_2.tar', '_3.tar']
    for item in sub_name_list:
        sub_name = obs_path + item
        tmp_name = 'model.tar'
        mox.file.copy(sub_name, os.path.join(target_path, tmp_name))
        os.system('cd {}; tar -xvf {}'.format(target_path, tmp_name))

def get_ckpt_file_list(ckpt_path):
    returned_list = []
    for i in range(0, 16):#512):
        returned_list.append('filerted_{}.ckpt'.format(i))
    returned_list = [os.path.join(ckpt_path, item) for item in returned_list if 'embedding' not in item]
    print("Sorted list", returned_list)
    for item in returned_list:
        fsize = os.path.getsize(item)
        f_gb = fsize / float(1024) / 1024 / 1024
        print(item, " :{:.2f}".format(f_gb))
    return returned_list

class LossCallBack(Callback):
    """
    Monitor the loss in training.
    If the loss in NAN or INF terminating training.
    """

    def __init__(self, dataset_size=-1, local_rank=0, has_trained_epoch=0, has_trained_step=0, micro_size=1):
        super(LossCallBack, self).__init__()
        self._dataset_size = dataset_size
        self.local_rank = local_rank
        self.has_trained_epoch = has_trained_epoch
        self.has_trained_step = has_trained_step
        self.micro_size = micro_size
        print("load has trained epoch :{} and step: {}".format(has_trained_epoch, has_trained_step), flush=True)

    def step_end(self, run_context):
        """
        Print loss after each step
        """
        cb_params = run_context.original_args()
        if self._dataset_size > 0 and self.local_rank % 8 == 0:
            percent, epoch_num = math.modf(cb_params.cur_step_num /
                                           self._dataset_size)
            if percent == 0:
                epoch_num -= 1
            date = time.asctime(time.localtime(time.time()))
            loss_value = cb_params.net_outputs[0].asnumpy() / self.micro_size
            print("time: {} local_rank: {}, epoch: {}, step: {}, output is {}, overflow is {}, scale is {}".
                  format(date, int(self.local_rank), int(epoch_num) + int(self.has_trained_epoch),
                         cb_params.cur_step_num + int(self.has_trained_step), loss_value,
                         cb_params.net_outputs[1].asnumpy(), cb_params.net_outputs[2].asnumpy(), 
#                          cb_params.net_outputs[3].asnumpy()
                        ))


project_root = os.path.abspath(
    os.path.dirname(os.path.realpath(__file__)) + os.path.sep + "..")
print('project_root:', project_root)

def count_params(net):
    """Count number of parameters in the network
    Args:
        net (mindspore.nn.Cell): Mindspore network instance
    Returns:
        total_params (int): Total number of trainable params
    """
    total_params = 0
    for param in net.trainable_params():
        total_params += np.prod(param.shape)
    return total_params

def run_train(args_opt):
    r"""
    The main training process.
    """
    # Set hccl connect time
    os.environ['HCCL_CONNECT_TIMEOUT'] = "6000"
    EXEC_PATH = os.path.join(project_root, 'tiny_pangu')
    device_id = int(os.getenv("DEVICE_ID"))
    rank_id_str = os.getenv('RANK_ID', '0')
    rank_id = int(
        rank_id_str[rank_id_str.rfind('-') +
                    1:])  # 'RANK_ID': 'job24535502-job-facereidtome-hn-0/1'
    print('rank_id:{}'.format(rank_id), "rank_id str:{}".format(rank_id_str))
    device_id = int(os.getenv('DEVICE_ID'))
    local_rank = rank_id
    print('local_rank:{}, device id:{}'.format(local_rank, device_id))
    
    # copy strategy_ckpt
    pretrained_strategy_ckpt_path = "/cache/strategy/ckpt_strategy_{}.ckpt".format(local_rank)
    mox.file.copy(src_url="obs://pcl-verify/yizx/distilPangu/strategy_ckpt/pangu26b_finetuneOnPD_PET_saveCKPT_3e6LR_bs30_bt98cktp_strategy.ckpt", dst_url=pretrained_strategy_ckpt_path)
#     mox.file.copy(src_url="obs://mindspore-file/strategy_ckpt/gpt_1024_13b_exp65cktp_strategy.ckpt", dst_url=pretrained_strategy_ckpt_path)
    
    # donload dataset
    if local_rank % 8 == 0:
        print('MindSpore path:', mindspore)
        print("Modify the time out from 300 to 30000")
        tbe_path = "/usr/local/ma/python3.7/lib/python3.7/site-packages/mindspore" \
                   "/_extends/parallel_compile/tbe_compiler/tbe_process.py"
        os.system(
            "sed -i 's/300/30000/g' " + tbe_path
        )
        os.system(
            "sed -i 's/330/33000/g' " + tbe_path
        )
        print("begin download dataset", flush=True)
        
        cache_url = LOCAL_PATH
        if not os.path.exists(LOCAL_PATH):
            Path(LOCAL_PATH).mkdir(parents=True, exist_ok=True)
        

        files = os.listdir(LOCAL_PATH)
        data = [
            os.path.join(LOCAL_PATH, name) for name in files
            if not name.endswith(".db")
        ]
        if len(data) == 0:
            print("Start to copy the dataset", flush=True)
            Path(cache_url).mkdir(parents=True, exist_ok=True)
            if SamplePercentage is None:
                mox.file.copy_parallel(src_url=BUCKET_DIR, dst_url=LOCAL_PATH)
        if not SamplePercentage is None:
            sample_manual_percentage_dataset_from_OBS_dir(LOCAL_PATH, SamplePercentage)
        print("@@@@@@ Dataset download succeed! @@@@@@@", flush=True)

        os.environ['HCCL_CONNECT_TIMEOUT'] = "6000"
        os.system('ulimit -s 102400')
        print(args_opt.ckpt_path)
#         ckpt_copy_tar(args_opt.ckpt_path, target_path="/cache/ckpt_files")
        mox.file.copy('obs://pcl-verify/yizx/distilPangu/merged_ckpt/Newexp65_GPT3_2-3494_2.ckpt', '/cache/Newexp65_GPT3_2-3494_2.ckpt')
        mox.file.copy(args_opt.word_embedding_path, '/cache/word_embedding.npy')
        mox.file.copy(args_opt.position_embedding_path, '/cache/position_embedding.npy')
        mox.file.copy(args_opt.top_query_embedding_path, '/cache/top_query_embedding.npy')
        print("setting env success.")
        f = open("%s/install.txt" % (EXEC_PATH), 'w')
        f.close()
    # 此处用于阻塞其他进程，直到刷包以及下载数据集完成为止
    while not os.path.exists("%s/install.txt" % (EXEC_PATH)):
        time.sleep(1)
    print('local_rank:{}, device id:{} start to run...'.format(
        local_rank, device_id),
          flush=True)
    
    
    # Set execution mode
    context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target)
    context.set_context(variable_memory_max_size="31GB")
    strategy_file = '/tmp/cktp_strategy.ckpt'
    # Set parallel context
    if args_opt.distribute == "true":
        D.init()
        device_num = D.get_group_size()
        rank = D.get_rank()
        print("rank_id is {}, device_num is {}".format(rank, device_num))
        
        
        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(
            parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL, gradients_mean=False,
            device_num=device_num,
            full_batch=False, 
            strategy_ckpt_load_file=pretrained_strategy_ckpt_path,
            enable_parallel_optimizer=False, 
            strategy_ckpt_save_file=strategy_file)
        set_algo_parameters(elementwise_op_strategy_follow=True)
        _set_multi_subgraphs()
    else:
        rank = 0
        device_num = 1
    context.set_context(save_graphs=False, save_graphs_path="/cache/" + str(rank))
    # copy data from the cloud to the /cache/Data
    cache_url = '/cache/Data/'

    
    # Set model property
    model_parallel_num = 1 #args_opt.op_level_model_parallel_num
    data_parallel_num = int(device_num / model_parallel_num)
    args_opt.per_batch_size = 32
    batch_size = args_opt.per_batch_size * data_parallel_num
    print("@@@@@ batch_size_perDevice is : {} @@@@@".format(batch_size))
    
    teacher_config = PANGUALPHAConfig(
        data_parallel_num=data_parallel_num, model_parallel_num=model_parallel_num, batch_size=batch_size,
        seq_length=args_opt.seq_length, vocab_size=args_opt.vocab_size, embedding_size=args_opt.embedding_size,
        num_layers=args_opt.num_layers, num_heads=args_opt.num_heads, expand_ratio=4, dropout_rate=0.0,
        compute_dtype=mstype.float16, stage_num=args_opt.stage_num, micro_size=args_opt.micro_size,
        eod_reset=bool(args_opt.eod_reset), load_ckpt_path=args_opt.load_ckpt_path,
        param_init_type=mstype.float32 if args_opt.param_init_type == 'fp32' else mstype.float16,
        word_emb_dp=bool(args_opt.word_emb_dp))
    
    print("===teacher_config is: ", teacher_config, flush=True)

    # Define network
    teacher_net = PanguAlpha(teacher_config, is_teacher=True)
    loss = CrossEntropyLoss(teacher_config)
    pangu_alpha_with_loss = PanguAlphaWithLoss(teacher_config, teacher_net, loss)
    pangu_alpha_with_loss = _VirtualDatasetCell(pangu_alpha_with_loss)
    
    ##############################################################################################
    ### load pretrained_pangu_ckpt
    print("##### start to load pangu 2.6B pretrained-ckpt #####", flush=True)
#     from mindspore.train.serialization import load_distributed_checkpoint
#     #tmp_input = Tensor(np.ones(shape=(1, config.seq_length)), mstype.int32)
#     ##strategy = model.infer_train_layout(train_dataset=ds, sink_size=callback_size)
#     ckpt_file_list = get_ckpt_file_list('/cache/ckpt_files')
#     load_distributed_checkpoint(teacher_net, ckpt_file_list)#, predict_layout)
    params_dict = load_checkpoint("/cache/Newexp65_GPT3_2-3494_2.ckpt")
    load_param_into_net(teacher_net, params_dict)
    print('##### PANGU-2.6B partial parameter size is: {} #####'.format(count_params(teacher_net)))
    
    teacher_net.set_train(False)
    params_tmp = teacher_net.trainable_params()

    for param in params_tmp:
        if not 'ctx' in param.name:
            param.requires_grad = False
        else:
            param.requires_grad = True
            
    params = teacher_net.trainable_params()
    print(params)
#     exit()
    
    print("=====args_opt is: ", args_opt, flush=True)

    # Warm-up and cosine decay learning rate
    lr = LearningRate(learning_rate=2e-12, end_learning_rate=1e-12,
                      warmup_steps=args_opt.warmup_step, decay_steps=10000)

    # Set weight decay coefficient, zero for bias and layernorm, 1e-1 for rest
    decay_filter = lambda x: 'layernorm' not in x.name.lower() and "bias" not in x.name.lower()
    #decay_params = list(filter(decay_filter, params))
    other_params = params #list(filter(lambda x: not decay_filter(x), params))
    group_params = [{
        'params': other_params,
        'weight_decay': 0.0
    }, {
        'order_params': params
    }]
    if args_opt.optimizer == "lamb":
        optimizer = nn.Lamb(group_params, learning_rate=lr)
    else:
        optimizer = FP32StateAdamWeightDecay(group_params, learning_rate=lr, eps=1e-8, beta1=0.9, beta2=0.95)
    # Initial scaling sens
    loss_scale_value = math.pow(2, 32)
    epoch_num = 1 #args_opt.epoch_size

    ds = create_dataset(teacher_config.batch_size, data_path=LOCAL_PATH, data_start_index=0, eod_reset=teacher_config.eod_reset, eod_id=args_opt.eod_id, device_num=device_num, rank=rank, hash_check=True)

# ###################### test shape ##################################################################################
#     for item in ds:     # [input_ids, position_id, attention_mask]
#         for d in item:
#             print(d, d.shape)
#         break
#     test_inputs, test_position_id, test_attention_mask = item
#     teacher_seq_output, student_seq_output = pangu_alpha_with_loss(test_inputs
#               , test_position_id, test_attention_mask)
#     print(teacher_seq_output, student_seq_output)
#     print(len(teacher_seq_output), len(student_seq_output))  # 31, 15
#     for i in range(len(teacher_seq_output)):
#         print(teacher_seq_output[i].shape, student_seq_output[i].shape)
#     exit()
# ####################################################################################################

    ckpt_dir = os.path.join("/cache/ckpt/", f"rank_{str(local_rank)}")
    # create dir for ckpt
    if not os.path.exists(ckpt_dir):
        Path(ckpt_dir).mkdir(parents=True, exist_ok=True)

    step_per_epoch = ds.get_dataset_size()
    callback_size = args_opt.sink_size
    actual_epoch_num = int(epoch_num * step_per_epoch / callback_size)
    callback = [
        TimeMonitor(callback_size),
        LossCallBack(callback_size, rank, 0, 0)
    ]
    
#     config_ck = CheckpointConfig(save_checkpoint_steps=args_opt.save_step,
#                                  keep_checkpoint_max=1,
#                                  integrated_save=False)
#     ckpoint_cb = ModelCheckpoint(prefix="tinyPangu-yizx",
#                                  directory=ckpt_dir,
#                                  config=config_ck)
#     ckpt2obs_cb = Ckpt2ObsSummaryCallback(local_ckpt_dir=ckpt_dir, 
#                                             local_rank=0, 
#                                             has_trained_epoch=0,
#                                             has_trained_step=0, 
#                                             bucket=args_opt.bucket_dir + '/' + str(local_rank),
#                                             syn_obs_steps=args_opt.save_step)
#     callback.append(ckpoint_cb)
#     callback.append(ckpt2obs_cb)
    
    if local_rank == 0:
        sub_dir = args_opt.bucket_dir.split('/')[-1]
        callback.append(LossSummaryCallback(summary_dir="summary", 
                                            local_rank=0, 
                                            has_trained_epoch=0,
                                            has_trained_step=0, 
                                            bucket='obs://pcl-verify/yizx/distilPangu/summary/' + sub_dir,
                                            syn_times=40))
        callback.append(StrategySaveCallback(strategy_path=strategy_file, 
                                            local_rank=0, 
                                            has_trained_epoch=0,
                                            has_trained_step=0, 
                                            bucket='obs://pcl-verify/yizx/distilPangu/strategy_ckpt/' + sub_dir))

    update_cell = DynamicLossScaleUpdateCell(loss_scale_value=loss_scale_value, scale_factor=2, scale_window=1000)
    pangu_alpha_with_grads = PanguAlphaTrainOneStepWithLossScaleCell_getGrad(
        pangu_alpha_with_loss, optimizer=optimizer, scale_update_cell=update_cell, enable_global_norm=True,
        config=teacher_config)
    model = Model(pangu_alpha_with_grads)
    print("Dataset size: {}, actual_epoch_num: {}".format(ds.get_dataset_size(), actual_epoch_num), flush=True)
   

    print(model._train_network.network._backbone.network)

    head_importance = np.zeros((teacher_config.num_layers, teacher_config.num_heads))
    tot_tokens = 0
    subset_size = 1.0

    for i in range(epoch_num):
        dataset_helper, _ = model._exec_preprocess(is_train=True,
                                                  dataset=ds,
                                                  dataset_sink_mode=False,
                                                  epoch_num=epoch_num)
        for inputs in tqdm(dataset_helper):
#             outputs = model._train_network(*inputs)
            tokens, input_mask, input_position, attention_mask = process_3inputsto4inputs_to_pangu(teacher_config, *inputs)
            logits, all_layers_ctx = model._train_network.network._backbone.network(tokens, input_mask, input_position, attention_mask)
            #####################################################################
            loss = model._train_network.network(*inputs)  # passed!!
            weights = optimizer.parameters
            scaling_sens = model._train_network.scale_sense
            # alloc status and clear should be right before gradoperation
            status, scaling_sens = model._train_network.start_overflow_check(loss, scaling_sens)
            scaling_sens_filled = C.ones_like(loss) * F.cast(scaling_sens, F.dtype(loss))
            # Backward process using loss scale
            grads = model._train_network.grad(model._train_network.network,
                              weights)(inputs[0],
                                       input_position, attention_mask,
                                       scaling_sens_filled)
            #####################################################################

            loss_value = loss  #outputs[0].asnumpy()
            assert len(grads) == len(all_layers_ctx)

            # calculate_head_importance on this dataset
            for layer_idx in range(len(grads)):
                ctx = all_layers_ctx[layer_idx].asnumpy()
                grad_ctx = grads[layer_idx].asnumpy()

                # take the dot
                dot = np.einsum("bhli, bhli->bhl", grad_ctx, ctx)
                head_importance[layer_idx] += np.abs(dot).sum(-1).sum(0)  # sum seq_length, then sum bs

            tot_tokens += input_mask.asnumpy().sum()

        head_importance[:-1] /= tot_tokens
        head_importance[-1] /= subset_size
        np.save("[{}]-[{}]_head_importance.npy".format(args_opt.mode, TASK_NAME), head_importance)
        mox.file.copy(src_url="[{}]-[{}]_head_importance.npy".format(args_opt.mode, TASK_NAME), dst_url='obs://pcl-verify/yizx/distilPangu/head_importance_npys/[{}]-[{}]_head_importance.npy'.format(args_opt.mode, TASK_NAME))

if __name__ == "__main__":
    set_manual_seed(SEED)
    opt = get_args()
    set_parse(opt)
    if opt.per_batch_size == 0:
        raise ValueError("The per_batch_size has not been configured.")
    run_train(opt)