# coding=utf-8
# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for OpenAI GPT."""
from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

import sys
import json
import logging
import os
import re
from io import open

import sentencepiece as spm
import jieba

try:
    from functools import lru_cache
except ImportError:
    # Just a dummy decorator to get the checks to run on python2
    # because honestly I don't want to support a byte-level unicode BPE
    # tokenizer on python 2 right now.
    def lru_cache():
        return lambda func: func


class JIEBATokenizer(object):
    """Tokenizer that pre-segments text with jieba and then encodes the
    result with a SentencePiece model. Spaces and newlines are mapped to
    the placeholders \u2582 and \u2583 so they survive SentencePiece."""

    def __init__(self, vocab_file, model_file, max_len=None):
        self.max_len = max_len if max_len is not None else int(1e12)

        # The vocabulary file has one "token<TAB>..." entry per line;
        # a token's id is its line index.
        with open(vocab_file, 'r', encoding='utf-8') as f:
            lines = f.readlines()
        self.encoder = {}
        for i, line in enumerate(lines):
            key = line.split('\t')[0]
            self.encoder[key] = i
        self.decoder = {v: k for k, v in self.encoder.items()}

        # No special tokens beyond the vocabulary itself; kept so __len__ works.
        self.special_tokens = {}

        self.sp = spm.SentencePieceProcessor(model_file=model_file)
        # Map " " -> \u2582 and "\n" -> \u2583 before SentencePiece encoding.
        self.translator = str.maketrans(" \n", "\u2582\u2583")

        self.eod_id = self.encoder['<eod>']
        self.eot_id = self.encoder['<eot>']
        self.pad_id = self.encoder['<pad>']

    @property
    def vocab_size(self):
        return len(self.encoder)

    def __len__(self):
        return len(self.encoder) + len(self.special_tokens)

    @property
    def eod(self):
        return self.eod_id

    def tokenize(self, text):
        """Tokenize a string: jieba segmentation, then SentencePiece ids."""
        seg_list = [x.translate(self.translator) for x in jieba.cut(text, cut_all=False)]
        new_seg = " ".join(seg_list)
        return self.sp.encode(new_seg)

    def convert_tokens_to_ids(self, tokens):
        return tokens

    def convert_ids_to_tokens(self, ids):
        return self.decode(ids)

    def encode(self, text):
        return self.tokenize(text)

    def decode(self, tokens):
        text = self.sp.decode(tokens)
        # Drop the joining spaces and restore real spaces and newlines.
        text = text.replace(' ', '').replace('\u2582', ' ').replace('\u2583', '\n')
        return text
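For reference, a minimal usage sketch of the class above. The file paths are placeholders for the BPE vocabulary (one token-per-line, tab-separated) and the SentencePiece model shipped with the released checkpoints; they are assumptions, not names confirmed by this file.

    # Placeholder paths; substitute the vocabulary and SentencePiece model
    # files distributed with the model.
    tokenizer = JIEBATokenizer(vocab_file="bpe/vocab.txt",
                               model_file="bpe/chinese_vocab.model")

    ids = tokenizer.encode("鹏程·盘古α是开源中文预训练语言模型。")
    print(ids)                    # list of SentencePiece token ids
    print(tokenizer.decode(ids))  # round-trips back to the original text
    print(tokenizer.vocab_size, tokenizer.eod)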

Introduction

「鹏程·盘古α」(PengCheng PanGu-α): a 200-billion-parameter open-source Chinese pretrained language model.
