# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
- """
- PanGu predict run
- """
import json
import os
import requests
import datetime
import glob
import time
import sys

import numpy as np
from tqdm import tqdm

import mindspore.common.dtype as mstype
import mindspore.communication.management as D
import mindspore as ms
from mindspore import context, Tensor
from mindspore import export
from mindspore.context import ParallelMode
from mindspore.parallel import set_algo_parameters
from mindspore.parallel._cost_model_context import _set_multi_subgraphs
from mindspore.train.model import Model
from mindspore.nn.transformer.transformer import TransformerOpParallelConfig, TransformerRecomputeConfig
from mindspore.nn.transformer.loss import CrossEntropyLoss
from src.generate import get_scores
from src.pangu_alpha_hidden import EvalNet, PanguAlphaModel, EvalNet_200B
from src.pangu_alpha_config import set_parse, PanguAlphaConfig
from src.utils import get_args

from mindspore.common import Parameter
from mindspore.train.serialization import load_checkpoint, load_param_into_net
from src.utils import download_ckpt_from_obs

from transformers import LlamaTokenizer

MAIN_DIR = os.path.dirname(os.path.abspath(__file__))
print(MAIN_DIR)
sys.path.insert(0, MAIN_DIR)

def restore_checkpoint(args_param, network, cache_url='/cache/Ckpt/'):
    r"""
    Load the per-rank checkpoint file found under `cache_url` into `network`.
    """
    restore_ranks = D.get_rank()
    print("======start single checkpoint", flush=True)
    ckpt_name = os.path.join(cache_url, f"rank_{restore_ranks}.ckpt")

    if not os.path.exists(ckpt_name):
        print(f"There is no ckpt file at {ckpt_name}, so skip the loading.")
        return

    time_stamp = datetime.datetime.now()
    print(f"time stamp {time_stamp.strftime('%Y.%m.%d-%H:%M:%S')} pre-trained ckpt model {ckpt_name} loading",
          flush=True)
    print(f'Start to load from {ckpt_name}')
    param_dict = load_checkpoint(ckpt_name)
    # for k, v in param_dict.items():
    #     print(f"{k}: ", v.shape)
    load_param_into_net(network, param_dict, strict_load=False)
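
# Usage sketch (mirrors the call in load_model below): one checkpoint file per
# rank is expected, e.g. rank_0.ckpt on rank 0.
#   restore_checkpoint(args_opt, eval_net, cache_url=args_opt.ckpt_load_path)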

def set_auto_parallel_context(args_opt):
    """Set the auto parallel context."""
    rank = 0
    device_num = 1
    context.reset_auto_parallel_context()
    # context.set_auto_parallel_context(
    #     strategy_ckpt_load_file=args_opt.strategy_load_ckpt_path)
    if args_opt.distribute == "true":
        D.init()
        device_num = D.get_group_size()
        rank = D.get_rank()
        print("rank_id is {}, device_num is {}".format(rank, device_num))
        context.set_auto_parallel_context(
            parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL,
            gradients_mean=False,
            full_batch=True,
            loss_repeated_mean=True,
            enable_parallel_optimizer=False,
            strategy_ckpt_save_file=f'/cache/strategy_{rank}.ckpt',
            pipeline_stages=args_opt.stage_num)
        set_algo_parameters(elementwise_op_strategy_follow=True)
        _set_multi_subgraphs()

    return rank, device_num

def load_model(args_opt):
    r"""
    Load and compile the model used for prediction.
    """
    context.set_context(mode=context.GRAPH_MODE)
    # Set parallel context
    rank, device_num = set_auto_parallel_context(args_opt)

    # context.set_context(variable_memory_max_size="31GB")
    context.set_context(max_device_memory="30GB")
    context.set_context(save_graphs=False,
                        save_graphs_path="/cache/graphs_of_device_id_" + str(rank),
                        device_target=args_opt.device_target)

    strategy_local_file = f"/cache/inference_strategy_100b_d8_mp8_dp1-{rank}.ckpt"
    ms.set_auto_parallel_context(strategy_ckpt_save_file=strategy_local_file)

    if args_opt.eval_task:
        use_past = False
    else:
        use_past = True if args_opt.export else (args_opt.use_past == "true")
    print('local_rank:{}, start to run...'.format(rank), flush=True)

    # Set model property: override the model parallel num if it exceeds the device num
    if device_num < args_opt.op_level_model_parallel_num:
        print(f"The op_level_model_parallel_num {args_opt.op_level_model_parallel_num} is larger than the device num,"
              f" so change it to {device_num}", flush=True)
        args_opt.op_level_model_parallel_num = device_num
    model_parallel_num = args_opt.op_level_model_parallel_num
    data_parallel_num = int(device_num / (model_parallel_num * args_opt.stage_num))
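    # e.g. 8 devices with op_level_model_parallel_num=8 and stage_num=1
    # yield data_parallel_num=1 (a single data-parallel replica).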

    parallel_config = TransformerOpParallelConfig(data_parallel=data_parallel_num,
                                                  model_parallel=model_parallel_num,
                                                  pipeline_stage=args_opt.stage_num,
                                                  micro_batch_num=args_opt.micro_size,
                                                  vocab_emb_dp=False,
                                                  recompute=False)
    # add sequence_parallel
    parallel_config.sequence_parallel = args_opt.sequence_parallel
    # add select_recompute
    parallel_config.select_recompute = args_opt.select_recompute

    per_batch_size = args_opt.per_batch_size
    batch_size = per_batch_size * data_parallel_num
    # Only a single batch_size is currently supported for predict
    if args_opt.run_type == "predict":
        batch_size = 1

    # download ckpt to local
    D.init()
    device_num = D.get_group_size()
    rank_id = D.get_rank()

    softmax_compute_type = mstype.float16
    top_query_softmax = mstype.float16
    layernorm_compute_type = mstype.float16
    config = PanguAlphaConfig(
        batch_size=batch_size,
        seq_length=args_opt.seq_length,
        vocab_size=args_opt.vocab_size,
        hidden_size=args_opt.embedding_size,
        num_layers=args_opt.num_layers,
        num_heads=args_opt.num_heads,
        post_layernorm_residual=False,
        dropout_rate=0.0,
        ffn_hidden_size=args_opt.embedding_size * 4,
        use_past=use_past,
        eod_reset=False,
        parallel_config=parallel_config,
        load_ckpt_path=None,
        run_type=args_opt.run_type,
        param_init_type=mstype.float16,
        use_rope=args_opt.use_rope)

    config.softmax_compute_fp32 = softmax_compute_type
    config.top_query_softmax_fp32 = top_query_softmax
    config.layernorm_compute_fp32 = layernorm_compute_type
    print("===config is: ", config, flush=True)
    print("=====args_opt is: ", args_opt, flush=True)

    # Define network
    pangu_alpha = PanguAlphaModel(config)

    # loss = CrossEntropyLoss()
    # eval_net = PanGUAlphaLossWithPrompt(config, pangu_alpha, loss)
    # eval_net = EvalNet_200B(pangu_alpha, pad_token=args_opt.padding_id)
    eval_net = EvalNet_200B(pangu_alpha, pad_token=args_opt.padding_id, seq_length=args_opt.seq_length)
    eval_net.set_train(False)

    # # Load the full (unsliced) checkpoint; this must happen before graph construction.
    # import time
    # time.sleep((rank % 8) * 20)
    # load_checkpoint(local_ckpt_path, net=eval_net)

    model_predict = Model(eval_net)
    # Compile the network and obtain the tensor layout for loading the ckpt
    inputs_np = Tensor(np.ones(shape=(config.batch_size, config.seq_length)), mstype.int32)

    if args_opt.distribute == "false":
        predict_layout = None
    else:
        # Compiling only needs the shape
        current_index = Tensor(np.array([0]), mstype.int32)
        model_predict.infer_predict_layout(inputs_np, current_index)
    #
    # if args_opt.save_strategy_bucket_dir and args_opt.save_strategy_name:
    #     import moxing as mox
    #     mox.file.copy(src_url=strategy_local_file,
    #                   dst_url=args_opt.save_strategy_bucket_dir + args_opt.save_strategy_name.replace('*', str(rank_id)))
    # import moxing as mox
    # obs_graphs_path = "obs://research-my/taoht-100b/graphs_tmp/"
    # mox.file.copy_parallel(src_url="/cache/", dst_url=obs_graphs_path)
    cache_url = args_opt.ckpt_load_path
    # cache_url = '/cache/Ckpt/'
    # download_ckpt_from_obs(args_opt, cache_url, rank=rank_id)
    restore_checkpoint(args_opt, eval_net, cache_url=cache_url)
    print("================load param ok=================", flush=True)

    return model_predict, config

def get_local_tokenizer(args_opt):
    vocab_file = 'tokenizer/llama_vocab/llama_zh_hf/tokenizer_2.model'
    vocab_file = os.path.join(args_opt.execute_path, vocab_file)
    # vocab_file = '/home/ma-user/modelarts/user-job-dir/pangu_alpha-r2.0beta_rotary/tokenizer/llama_vocab/llama_zh_hf/tokenizer.model'
    tokenizer = LlamaTokenizer.from_pretrained(vocab_file)
    print("================load tokenizer ok=================", flush=True)
    return tokenizer
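
# Round-trip sketch (illustrative; the input string is a placeholder):
#   tok = get_local_tokenizer(opt)
#   ids = tok.encode("some text", add_special_tokens=False)
#   text = tok.decode(ids)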

def get_model():
    opt = get_args(True)
    set_parse(opt)
    model_predict, config = load_model(opt)
    return model_predict, config, opt

def get_local_model():
    model, config, opt = get_model()
    D.init()
    rank_id = D.get_rank()
    # Convert text to token ids with the local tokenizer
    tokenizer = get_local_tokenizer(opt)
    return model, tokenizer, config, opt, rank_id

def get_layers_resp_one_item(model, tokenizer, opt, input_str, tokens_to_generate, top_p=1.0, top_k=3, logprobs=False):
    # generate_func = generate_increment if config.use_past else generate
    from src.generate import generate, generate_increment, generate_100b, generate_100b_task, layers_resp_100b_task
    generate_func = layers_resp_100b_task

    # Tokenize the input sentence to ids
    start_sentence = tokenizer.encode(input_str, add_special_tokens=False)
    input_ids = np.array(start_sentence).reshape(1, -1)

    # generate(model, origin_inputs, config, top_p=1.0, top_k_num=3, max_generate_length=100, duRepeate=True)
    hidden_result, current_index = generate_func(model, input_ids, opt,
                                                 top_p=top_p,
                                                 top_k_num=top_k,
                                                 max_generate_length=tokens_to_generate,
                                                 duRepeate=opt.duRepeate)
    return hidden_result, current_index
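
# Example call (mirrors do_eval below): collect per-layer hidden states for one prompt.
#   hidden_result, current_index = get_layers_resp_one_item(
#       model, tokenizer, opt, input_str, tokens_to_generate=100)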

def get_task_processor(task_name, opt):
    base_path = MAIN_DIR + "/dataset"
    model_token_max_len = opt.seq_length
    os.environ['TOKEN_VERSION'] = "v1"
    os.environ['URL_VERSION'] = "v1"
    if task_name == 'webqa':
        from dataset_processor.webqa_processor import WebqaProcessor as MainProcessor
    elif task_name == 'cmnli':
        from dataset_processor.cmnli_processor import CmnliProcessor as MainProcessor
    elif task_name == 'c3_m':
        from dataset_processor.c3_m_processor import C3MProcessor as MainProcessor
    elif task_name == 'cmrc':
        from dataset_processor.cmrc_processor import CMRCProcessor as MainProcessor
    elif task_name == 'siqa':
        from dataset_processor.siqa_processor import SIQAProcessor as MainProcessor
    elif task_name == 'sst2':
        from dataset_processor.sst2_processor import SST2Processor as MainProcessor
    elif task_name == 'winogrande':
        from dataset_processor.winogrande_processor import WinoGrandeProcessor as MainProcessor
    elif task_name == 'iflytek':
        from dataset_processor.iflytek_processor import IflytekProcessor as MainProcessor
    elif task_name == 'dureader':
        from dataset_processor.dureader_processor import DuReaderProcessor as MainProcessor
    elif task_name == 'z_bench':
        from dataset_processor.z_bench_processor import ZBenchProcessor as MainProcessor
    elif task_name == 'c_eval':
        from dataset_processor.c_eval_processor import CEvalProcessor as MainProcessor
    elif task_name == 'gaokao':
        from dataset_processor.gaokao_processor import GaoKaoProcessor as MainProcessor
    elif task_name == 'agi_eval':
        from dataset_processor.agi_eval_processor import AGIEvalProcessor as MainProcessor
    elif task_name == 'mmlu':
        from dataset_processor.mmlu_processor import MMLUProcessor as MainProcessor
    elif task_name == 'c_eval_test':
        from dataset_processor.c_eval_test_processor import CEvalTestProcessor as MainProcessor
    elif task_name == 'samples_fdd':
        from dataset_processor.samples_for_data_distribution import SamplesFDPProcessor as MainProcessor
        os.environ['TEST_JSON_LIST_RANK'] = str(sys.argv[6])
    else:
        raise ValueError(f"Unknown task name: {task_name}")
    tokenizer = get_local_tokenizer(opt)
    task_processor = MainProcessor(task_name, base_path, tokenizer, model_token_max_len)
    os.environ['MODEL_PROBS'] = 'TRUE' if task_processor.logprobs else 'FALSE'

    return task_processor
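
# Example (mirrors __main__ below):
#   task_processor = get_task_processor("webqa", opt)
#   do_eval(model, tokenizer, config, opt, rank_id, "MODEL_200B", task_processor)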

def do_eval(model, tokenizer, config, opt, rank_id, url, task_processor):
    MAIN_RANK_FLAG = False
    MODEL_200B_AND_LOGPROBS_FLAG = (url == 'MODEL_200B' and task_processor.logprobs)

    print("Eval Task {} start!".format(task_processor.task_name), flush=True)

    task_processor.init_example_list()

    index_print = 0
    shot = 'zero_shot'
    eval_data_iter = task_processor.get_eval_data_iter(shot)
    task_processor.init_eval_results()
    for input_items, ground_truth_items in eval_data_iter:
        try:
            index_print = index_print + 1
            if index_print > 30:
                break
            model_time = time.time()
            example_str = task_processor.get_shot_example_str()
            input_str, useful_info = task_processor.get_input_str(input_items, example_str)
            hidden_result, current_index = get_layers_resp_one_item(model, tokenizer, opt, input_str, 100)
            print("wait data save", flush=True)
            stop_once = time.time()
            file_str = f"layers_out_{task_processor.task_name}_{index_print}.npy"
            delay_time = 2
            if rank_id == 0:
                start_time = time.time()
                file_path = 'layers_result/'
                os.makedirs(file_path, exist_ok=True)
                hidden_shape = hidden_result.shape
                print(f"hidden_result shape: {hidden_shape}", flush=True)
                for i in range(hidden_shape[0] - 2):
                    save_dict = {}
                    save_dict["layer_input"] = hidden_result[i, :, :].asnumpy()
                    save_dict["current_index"] = current_index.asnumpy()
                    file_path = f'layers_result/layers_{i}/'
                    os.makedirs(file_path, exist_ok=True)
                    np.save(os.path.join(opt.execute_path, file_path, file_str), save_dict)

                save_dict = {}
                save_dict["hidden_state"] = hidden_result[-2, :, :].asnumpy()
                save_dict["top_query_hidden_states"] = hidden_result[-1].asnumpy()
                save_dict["current_index"] = current_index.asnumpy()
                file_path = 'layers_result/top_query_layer/'
                os.makedirs(file_path, exist_ok=True)
                np.save(os.path.join(opt.execute_path, file_path, file_str), save_dict)
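                # Note: np.save on a dict stores a 0-d object array; load it back
                # with np.load(path, allow_pickle=True).item() (usage hint, assumed).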
                stop_time = time.time()
                print(f"saving time {stop_time - start_time}", flush=True)
                if stop_time - start_time < delay_time:
                    time.sleep(delay_time - (stop_time - start_time))
            else:
                time.sleep(delay_time)

            print(f"Eval Task: {task_processor.task_name}, time: {stop_once - model_time}, file name: {file_str}",
                  flush=True)
        except Exception as e:
            print(f"Eval item failed, skipping: {e}", flush=True)
            index_print = index_print - 1
- if __name__ == "__main__":
- model, tokenizer, config, opt, rank_id = get_local_model()
-
- url = "MODEL_200B"
- task_list = ["webqa", "sst2", "dureader", "cmnli", "iflytek", "c3_m", "winogrande", "siqa", "cmrc", "z_bench",
- "c_eval", "gaokao", "agi_eval", "mmlu"]
- for task_name in task_list:
- task_processor = get_task_processor(task_name, opt)
- do_eval(model, tokenizer, config, opt, rank_id, url, task_processor)