A Chinese Megatron model with 2.6B parameters, trained on 100 GB of high-quality Chinese corpus using 128 V100 GPUs.

# coding=utf-8
# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for OpenAI GPT."""
from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

import sys
import json
import logging
import os
import regex as re
from io import open

import sentencepiece as spm
import jieba

try:
    from functools import lru_cache
except ImportError:
    # Just a dummy decorator to get the checks to run on python2
    # because honestly I don't want to support a byte-level unicode BPE tokenizer on python 2 right now.
    def lru_cache():
        return lambda func: func


class JIEBATokenizer(object):

    def __init__(self, vocab_file, model_file, max_len=None):
        self.max_len = max_len if max_len is not None else int(1e12)
        self.encoder = json.load(open(vocab_file))
        self.decoder = {v: k for k, v in self.encoder.items()}
        self.sp = spm.SentencePieceProcessor(model_file=model_file)
        # Map spaces and newlines to printable placeholders so that
        # SentencePiece does not swallow them during encoding.
        self.translator = str.maketrans(" \n", "\u2582\u2583")
        self.eod_id = self.encoder['<eod>']
        # No extra special tokens beyond <eod>; defined here so __len__ works.
        self.special_tokens = {}

    @property
    def vocab_size(self):
        return len(self.encoder)

    def __len__(self):
        return len(self.encoder) + len(self.special_tokens)

    @property
    def eod(self):
        return self.eod_id

    def tokenize(self, text):
        """Tokenize a string: jieba word segmentation followed by SentencePiece encoding."""
        seg_list = [x.translate(self.translator) for x in jieba.cut(text, cut_all=False)]
        new_seg = " ".join(seg_list)
        return self.sp.encode(new_seg)

    def convert_tokens_to_ids(self, tokens):
        return tokens

    def convert_ids_to_tokens(self, ids):
        return self.decode(ids)

    def encode(self, text):
        res = self.tokenize(text)
        return res

    def decode(self, tokens):
        text = self.sp.decode(tokens)
        # Undo the whitespace placeholder mapping applied in tokenize().
        text = text.replace(' ', '').replace('\u2582', ' ').replace('\u2583', '\n')
        return text
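
For reference, a minimal usage sketch of the tokenizer. The vocab and SentencePiece model filenames below are placeholders, not necessarily the names shipped with this repository; substitute the files released with the checkpoint.

    # Minimal usage sketch; "vocab.json" and "chinese_vocab.model" are placeholder paths.
    if __name__ == "__main__":
        tokenizer = JIEBATokenizer(vocab_file="vocab.json",
                                   model_file="chinese_vocab.model")

        ids = tokenizer.encode("今天天气不错")   # jieba segmentation + SentencePiece ids
        print(ids)
        print(tokenizer.decode(ids))             # round-trips back to the original text
        print(tokenizer.eod, len(tokenizer))     # <eod> id and total vocabulary size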