Browse Source

update

master
taoht 1 month ago
parent
commit
1e4b800eb7
11 changed files with 85 additions and 3 deletions
  1. +3
    -0
      .idea/.gitignore
  2. +15
    -0
      .idea/PanGu-Alpha.iml
  3. +20
    -0
      .idea/inspectionProfiles/Project_Default.xml
  4. +6
    -0
      .idea/inspectionProfiles/profiles_settings.xml
  5. +4
    -0
      .idea/misc.xml
  6. +8
    -0
      .idea/modules.xml
  7. +6
    -0
      .idea/vcs.xml
  8. +21
    -1
      README.md
  9. +2
    -2
      pangu_alpha_predict.py
  10. +0
    -0
      tokenizer/vocab.model
  11. +0
    -0
      tokenizer/vocab.vocab

+ 3
- 0
.idea/.gitignore View File

@@ -0,0 +1,3 @@
# Default ignored files
/shelf/
/workspace.xml

+ 15
- 0
.idea/PanGu-Alpha.iml View File

@@ -0,0 +1,15 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="PyDocumentationSettings">
<option name="format" value="GOOGLE" />
<option name="myDocStringFormat" value="Google" />
</component>
<component name="TestRunnerService">
<option name="PROJECT_TEST_RUNNER" value="pytest" />
</component>
</module>

+ 20
- 0
.idea/inspectionProfiles/Project_Default.xml View File

@@ -0,0 +1,20 @@
<component name="InspectionProjectProfileManager">
<profile version="1.0">
<option name="myName" value="Project Default" />
<inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
<option name="ignoredPackages">
<value>
<list size="7">
<item index="0" class="java.lang.String" itemvalue="protobuf" />
<item index="1" class="java.lang.String" itemvalue="jsonlines" />
<item index="2" class="java.lang.String" itemvalue="regex" />
<item index="3" class="java.lang.String" itemvalue="boto3" />
<item index="4" class="java.lang.String" itemvalue="pycocotools" />
<item index="5" class="java.lang.String" itemvalue="tensorflow" />
<item index="6" class="java.lang.String" itemvalue="six" />
</list>
</value>
</option>
</inspection_tool>
</profile>
</component>

+ 6
- 0
.idea/inspectionProfiles/profiles_settings.xml View File

@@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

+ 4
- 0
.idea/misc.xml View File

@@ -0,0 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7" project-jdk-type="Python SDK" />
</project>

+ 8
- 0
.idea/modules.xml View File

@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/PanGu-Alpha.iml" filepath="$PROJECT_DIR$/.idea/PanGu-Alpha.iml" />
</modules>
</component>
</project>

+ 6
- 0
.idea/vcs.xml View File

@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>

+ 21
- 1
README.md View File

@@ -3,7 +3,7 @@
「盘古α」由以鹏城实验室为首的技术团队联合攻关,首次基于“鹏城云脑Ⅱ”和国产MindSpore框架的自动混合并行模式实现在2048卡算力集群上的大规模分布式训练,训练出业界首个2000亿参数以中文为核心的预训练生成语言模型。盘古α预训练模型支持丰富的场景应用,在知识问答、知识检索、知识推理、阅读理解等文本生成领域表现突出,具备很强的小样本学习能力。
[[技术报告](https://git.openi.org.cn/PCL-Platform.Intelligence/PanGu-Alpha/src/branch/master/PANGU-%ce%b1.pdf)]
[[模型下载](#模型下载)]
[[MindSpore大规模分布式自动并行框架](https://mindspore.cn/)]
[[MindSpore官网](https://mindspore.cn/)]
[[评测数据集下载](https://git.openi.org.cn/PCL-Platform.Intelligence/Chinese_WPLC)]
[[serving展示视频下载](#serving展示视频下载)]

@@ -63,6 +63,26 @@ MindSpore是业界首个支持全自动并行的框架,MindSpore多维度自
| 盘古α-13B | 20M |[13B_position_embedding](https://git.openi.org.cn/attachments/76e796fa-0d97-48ea-8e2c-f49bb2f2b21b) |
| 盘古α-200B | ~T | 敬请期待 |

### 训练

运行如下命令开始训练, `MODE` 选择 `2.6B`, `13B` 或 `200B`.
```bash
export MODE=2.6B
bash scripts/run_distribute_train.sh 8 /home/data/ /home/config/rank_table_8p.json $MODE
```
### 推理

首先下载以下三个文件
- tokenizer: vocab.vocab 和 vocab.model 放在 $FILE_PATH/tokenizer/ 目录
- ckpts: 下载模型文件放在 $FILE_PATH/checkpiont_file/目录
- training strategy file: 该文件描述了模型在NPU卡间的切分策略,文件放在$FILE_PATH/strategy_load_ckpt_path/ 目录

执行推理
```bash
export FILE_PATH=/home/your_path
bash scripts/run_distribute_predict.sh 8 /home/config/rank_table_8p.json ${FILE_PATH}/strategy_load_ckpt_path \
${FILE_PATH}/tokenizer/ ${FILE_PATH}/checkpiont_file filitered
```

### 下游任务评估



+ 2
- 2
pangu_alpha_predict.py View File

@@ -204,8 +204,8 @@ def run_predict_no_pipeline(args_opt):

from tokenization_jieba import JIEBATokenizer
from generate import generate
tokenizer = JIEBATokenizer(os.path.join(args_opt.tokenizer_path, 'vocab10.vocab'),
os.path.join(args_opt.tokenizer_path, 'vocab10.model'))
tokenizer = JIEBATokenizer(os.path.join(args_opt.tokenizer_path, 'vocab.vocab'),
os.path.join(args_opt.tokenizer_path, 'vocab.model'))

sample = "今天是一个好天气"
tokenized_token = tokenizer.tokenize(sample)


bpe_4w/vocab.model → tokenizer/vocab.model View File


bpe_4w/vocab.vocab → tokenizer/vocab.vocab View File


Loading…
Cancel
Save