|
- # coding=utf-8
- # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
-
- """Sample Generate GPT2"""
-
- import os
- import sys
- import json
# Make sibling modules importable when this file is run directly as a script.
MAIN_DIR = os.path.dirname(os.path.abspath(__file__))
print(MAIN_DIR)
sys.path.insert(0, MAIN_DIR)
-
- import time
-
def get_model_resp_one_item(url, input_str, tokens_to_generate, top_k=3, logprobs=False):
    """Query one model backend for a single prompt.

    ``url`` selects the backend: the sentinel strings 'MODEL_BAICHUAN' and
    'MODEL_200B' route to locally hosted models; anything else is treated as
    an HTTP endpoint.  Backend modules are imported lazily so only the
    selected backend actually needs to be importable.
    """
    if url == 'MODEL_BAICHUAN':
        from model_baichuan import get_local_model_resp_one_item as local_resp
        return local_resp(input_str, tokens_to_generate, top_k, logprobs)
    if url == 'MODEL_200B':
        from model_200B import get_local_model_resp_one_item as local_resp
        return local_resp(input_str, tokens_to_generate, top_k, logprobs)
    from model_url import get_url_model_resp_one_item
    return get_url_model_resp_one_item(url, input_str, tokens_to_generate, top_k, logprobs)
-
def get_model_resp(url, input_str, tokens_to_generate, top_k=3, logprobs=False):
    """Query the model for one prompt or a batch of prompts.

    Args:
        url: backend selector, forwarded to get_model_resp_one_item().
        input_str: a single prompt (str) or a list of prompt strings.
        tokens_to_generate: generation length forwarded to the backend.
        top_k: sampling parameter forwarded to the backend.
        logprobs: whether to request token log-probabilities.

    Returns:
        The single response for a str input, a list of responses for a
        list input, or None if any item's response is None.
    """
    if isinstance(input_str, str):
        return get_model_resp_one_item(url, input_str, tokens_to_generate, top_k, logprobs)

    assert isinstance(input_str, list)
    return_resp_list = []
    for one_input in input_str:
        resp = get_model_resp_one_item(url, one_input, tokens_to_generate, top_k, logprobs)
        if resp is None:
            # Fail fast: the original collected every response and only then
            # scanned for None, wasting model calls after a known failure.
            # The returned value (None) is unchanged.
            return None
        return_resp_list.append(resp)
    return return_resp_list
-
-
def get_tokenizer(url):
    """Return the tokenizer for the backend selected by ``url``.

    Mirrors the dispatch in get_model_resp_one_item(): the sentinel strings
    pick a local model's tokenizer, any other value uses the URL backend.
    Imports are deferred so only the selected backend must be importable.
    """
    if url == 'MODEL_BAICHUAN':
        from model_baichuan import get_local_tokenizer as load_tokenizer
    elif url == 'MODEL_200B':
        from model_200B import get_local_tokenizer as load_tokenizer
    else:
        from model_url import get_url_tokenizer as load_tokenizer
    return load_tokenizer()
-
-
def extract_data(url, task_processor):
    """Walk one eval task's data and collect per-sub-task example strings.

    Args:
        url: backend selector; only used here to detect the MODEL_200B +
            logprobs combination, which changes the input packing below.
        task_processor: project dataset processor exposing ``task_name``,
            ``logprobs``, ``example_list``, ``init_example_list()``,
            ``get_eval_data_iter()``, ``init_eval_results()``,
            ``get_shot_example_str()`` and ``get_input_str()``.

    Returns:
        A list of ``{'sub_task_name': ..., 'examples': [...]}`` dicts, one
        per finished sub-task.  An item whose last ground-truth field is
        truthy closes the current sub-task; other items contribute their
        trimmed input string to the running example list.

    Raises:
        ValueError: if ``task_processor.task_name`` is not one of the
            supported tasks (previously this fell through to an
            UnboundLocalError on ``sub_task_name``).
    """
    # Was `True if (...) else False` — the boolean expression is already a bool.
    model_200b_and_logprobs = url == 'MODEL_200B' and task_processor.logprobs

    print("Eval Task {} start!".format(task_processor.task_name))

    task_processor.init_example_list()
    print("example_list: {}".format(task_processor.example_list))

    shot = "zero_shot"
    current_task_example_list = []
    return_all_task_list = []
    start_time = time.time()
    eval_data_iter = task_processor.get_eval_data_iter(shot)
    task_processor.init_eval_results()
    for input_items, ground_truth_items in eval_data_iter:
        example_str = task_processor.get_shot_example_str()
        input_str, useful_info = task_processor.get_input_str(input_items, example_str)
        if model_200b_and_logprobs:
            input_length_list, mask_length_list = useful_info
            # NOTE(review): after this zip, input_str[0] is a tuple, so the
            # .split() calls below would raise for MODEL_200B + logprobs —
            # confirm that combination never reaches this extractor.
            input_str = list(zip(input_str, input_length_list, mask_length_list))
        if task_processor.task_name in ['c_eval', 'gaokao']:
            sub_task_name = input_items[2]
            # Keep everything except the first and last lines — presumably
            # a header line and the answer line; confirm against processors.
            valid_str = '\n'.join(input_str[0].split('\n')[1:-1]).strip()
        elif task_processor.task_name == 'agi_eval':
            sub_task_name = input_items[1]
            # agi_eval keeps the first line, dropping only the last.
            valid_str = '\n'.join(input_str[0].split('\n')[:-1]).strip()
        elif task_processor.task_name == 'mmlu':
            sub_task_name = ground_truth_items[1]
            valid_str = '\n'.join(input_str[0].split('\n')[1:-1]).strip()
        else:
            raise ValueError("unsupported task: {}".format(task_processor.task_name))
        task_finish_flag = ground_truth_items[-1]
        if task_finish_flag:
            return_all_task_list.append({'sub_task_name': sub_task_name, 'examples': current_task_example_list})
            current_task_example_list = []
        else:
            current_task_example_list.append(valid_str)
    end_time = time.time()
    print("Eval Task {} {} end! Cost Time: {}".format(task_processor.task_name, shot, end_time - start_time))
    return return_all_task_list
-
-
-
-
if __name__ == "__main__":
    base_path = MAIN_DIR + "/dataset"

    url = 'http://1.14.200.123:5010/api'
    model_token_max_len = 2000

    # Backend configuration read by the model_* helper modules at call time.
    os.environ['TOKEN_VERSION'] = 'v2'
    os.environ['URL_VERSION'] = 'v1'
    json_path = os.path.join(MAIN_DIR, 'extract_data.json')

    # The tokenizer depends only on `url` (and the env vars set above), so
    # build it once instead of rebuilding the identical tokenizer per task.
    tokenizer = get_tokenizer(url)

    json_data = {'all_data': []}
    for task_name in ['c_eval', 'gaokao', 'agi_eval', 'mmlu']:
        # Per-task lazy imports: each processor module may pull heavy deps.
        if task_name == 'c_eval':
            from dataset_processor.c_eval_processor_extract import CEvalProcessor as MainProcessor
        elif task_name == 'gaokao':
            from dataset_processor.gaokao_processor_extract import GaoKaoProcessor as MainProcessor
        elif task_name == 'agi_eval':
            from dataset_processor.agi_eval_processor_extract import AGIEvalProcessor as MainProcessor
        elif task_name == 'mmlu':
            from dataset_processor.mmlu_processor import MMLUProcessor as MainProcessor

        task_processor = MainProcessor(task_name, base_path, tokenizer, model_token_max_len)
        # model_* modules read MODEL_PROBS from the environment.
        os.environ['MODEL_PROBS'] = 'TRUE' if task_processor.logprobs else 'FALSE'
        task_list = extract_data(url, task_processor)
        json_data['all_data'].append({'data_name': task_name, 'sub_task_list': task_list})

    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(json_data, f, indent=4, ensure_ascii=False)
-
-
|