- import unicodedata
- import mindspore as ms
- import mindspore.ops as ops
- from typing import List, Union, Optional
- import itertools
- # import torch
- import sentencepiece as spm
- import json
- import regex as re
- import sys
- import os
- import logging
- import collections
- from src.utils import COMMAND_TUPLE, VOCAB_JSON_FILE, MERGES_FILE, SP_MODEL_FILE, VOCAB_FILE
- from src.utils import get_pairs
- from src.utils import basic_clean, whitespace_clean
- from src.utils import load_vocab, default_vocab, convert_by_vocab
- from src.utils import convert_to_unicode, bytes_to_unicode, _is_punctuation
- from src.utils import _is_control, _is_whitespace
- from src.utils import _get_model_files, _get_vocab_path, _get_model_id
-
- logger = logging.getLogger(__name__)
-
- def whitespace_tokenize(text):
- """Runs basic whitespace cleaning and splitting on a piece of text."""
- text = text.strip()
- if not text:
- return []
- tokens = text.split()
- return tokens
-
- class BPETokenizer(object):
-
- def __init__(self,
- vocab_file,
- merges_file,
- errors='replace',
- max_len=None,
- **kwargs):
- super().__init__(**kwargs)
- self.max_len = max_len if max_len is not None else int(1e12)
-
- self.errors = errors # how to handle errors in decoding
- self.byte_encoder = bytes_to_unicode()
- self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
- with open(merges_file, encoding='utf-8') as file:
- bpe_data = file.read().split('\n')[1:-1]
- bpe_merges = [tuple(merge.split()) for merge in bpe_data]
- # file.close()
- if not vocab_file:
- vocab = list(bytes_to_unicode().values())
- vocab = vocab + [v + '</w>' for v in vocab]
- for merge in bpe_merges:
- vocab.append(''.join(merge))
- self.encoder = dict(zip(vocab, range(len(vocab))))
- else:
- with open(vocab_file, encoding='utf8') as file:
- self.encoder = json.load(file)
- self.decoder = {v: k for k, v in self.encoder.items()}
-
- self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
- self.cache = {}
- # self.cache = {t:t for t in special_tokens}
-
- self.pat = re.compile(
- r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
- )
-
- self.special_tokens = {}
- self.special_tokens_decoder = {}
- # self.set_special_tokens(special_tokens)
-
- @property
- def vocab_size(self):
- return len(self.encoder)
-
- def get_vocab(self):
- return dict(self.encoder)
-
- def __len__(self):
- return len(self.encoder) + len(self.special_tokens)
-
- def bpe(self, token):
- if token in self.cache:
- return self.cache[token]
- word = tuple(token)
- pairs = get_pairs(word)
-
- if not pairs:
- return token
-
- while True:
- bigram = min(
- pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
- if bigram not in self.bpe_ranks:
- break
- first, second = bigram
- new_word = []
- i = 0
- while i < len(word):
- try:
- j = word.index(first, i)
- new_word.extend(word[i:j])
- i = j
- except ValueError:
- new_word.extend(word[i:])
- break
-
- if word[i] == first and i < len(word) - 1 and word[
- i + 1] == second:
- new_word.append(first + second)
- i += 2
- else:
- new_word.append(word[i])
- i += 1
- new_word = tuple(new_word)
- word = new_word
- if len(word) == 1:
- break
- else:
- pairs = get_pairs(word)
- word = ' '.join(word)
- self.cache[token] = word
- return word
-
- def tokenize(self, text):
- """ Tokenize a string. """
- bpe_tokens = []
- for token in re.findall(self.pat, text):
- if sys.version_info[0] == 2:
- token = ''.join(self.byte_encoder[ord(b)] for b in token)
- else:
- token = ''.join(self.byte_encoder[b]
- for b in token.encode('utf-8'))
- bpe_tokens.extend(bpe_token
- for bpe_token in self.bpe(token).split(' '))
- return bpe_tokens
-
- def convert_token_to_id(self, token):
- """ Converts a token into an id using the vocab. """
- return self.encoder.get(token, 0)
-
- def convert_tokens_to_ids(self, tokens):
- """ Converts a sequence of tokens into ids using the vocab. """
- ids = []
- for token in tokens:
- ids.append(self.convert_token_to_id(token))
- if len(ids) > self.max_len:
- logger.warning(
- "Token indices sequence length is longer than the specified maximum "
- " sequence length for this OpenAI GPT model ({} > {}). Running this"
- " sequence through the model will result in indexing errors".
- format(len(ids), self.max_len))
- return ids
-
- def convert_id_to_token(self, id):
- """Converts an id into a BPE token using the vocab."""
- return self.decoder[id]
-
- def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
- """Converts a sequence of ids into BPE tokens using the vocab."""
- tokens = []
- for i in ids:
- tokens.append(self.decoder[i])
- return tokens
-
- def convert_tokens_to_string(self, tokens, all_command_token={}):
- """Converts a sequence of tokens (string) into a single string."""
- text = "".join(tokens)
- text = bytearray([self.byte_decoder[c]
- for c in text]).decode("utf-8", errors=self.errors)
- return text
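- # Illustrative usage sketch (not executed on import; the file names are
- # hypothetical -- any GPT-2 style vocab.json / merges.txt pair would do):
- #
- #     tokenizer = BPETokenizer("vocab.json", "merges.txt")
- #     tokens = tokenizer.tokenize("lower newer")        # byte-level pre-tokens split by BPE merges
- #     ids = tokenizer.convert_tokens_to_ids(tokens)
- #     text = tokenizer.convert_tokens_to_string(tokens) # round-trips back to "lower newer"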
-
- class MMBPETokenizer(BPETokenizer):
-
- def __init__(self,
- vocab_file,
- merges_file,
- errors='replace',
- max_len=None,
- special_tokens=None,
- **kwargs):
- self.byte_encoder = bytes_to_unicode()
- self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
- with open(merges_file, encoding='utf-8') as file:
- merges = file.read().split('\n')
- merges = merges[1:49152 - 256 - 2 + 1]
- merges = [tuple(merge.split()) for merge in merges]
- vocab = list(bytes_to_unicode().values())
- vocab = vocab + [v + '</w>' for v in vocab]
- for merge in merges:
- vocab.append(''.join(merge))
- if not special_tokens:
- special_tokens = ['<start_of_text>', '<end_of_text>']
- else:
- special_tokens = ['<start_of_text>', '<end_of_text>'
- ] + special_tokens
- vocab.extend(special_tokens)
- self.encoder = dict(zip(vocab, range(len(vocab))))
- self.decoder = {v: k for k, v in self.encoder.items()}
- self.bpe_ranks = dict(zip(merges, range(len(merges))))
- self.cache = {t: t for t in special_tokens}
- special = "|".join(special_tokens)
- self.pat = re.compile(
- special +
- r"""|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""",
- re.IGNORECASE)
-
- # self.vocab_size = len(self.encoder)
- # self.all_special_ids = [self.encoder[t] for t in special_tokens]
- def bpe(self, token):
- if token in self.cache:
- return self.cache[token]
- word = tuple(token[:-1]) + (token[-1] + '</w>', )
- pairs = get_pairs(word)
-
- if not pairs:
- return token + '</w>'
-
- while True:
- bigram = min(
- pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
- if bigram not in self.bpe_ranks:
- break
- first, second = bigram
- new_word = []
- i = 0
- while i < len(word):
- try:
- j = word.index(first, i)
- new_word.extend(word[i:j])
- i = j
- except ValueError:
- new_word.extend(word[i:])
- break
-
- if word[i] == first and i < len(word) - 1 and word[
- i + 1] == second:
- new_word.append(first + second)
- i += 2
- else:
- new_word.append(word[i])
- i += 1
- new_word = tuple(new_word)
- word = new_word
- if len(word) == 1:
- break
- else:
- pairs = get_pairs(word)
- word = ' '.join(word)
- self.cache[token] = word
- return word
-
- def encode(self, text):
- bpe_tokens = []
- text = whitespace_clean(basic_clean(text)).lower()
- for token in re.findall(self.pat, text):
- token = ''.join(self.byte_encoder[b]
- for b in token.encode('utf-8'))
- bpe_tokens.extend(self.encoder[bpe_token]
- for bpe_token in self.bpe(token).split(' '))
- return bpe_tokens
-
- def decode(self, tokens):
- text = ''.join([self.decoder[token] for token in tokens])
- text = bytearray([self.byte_decoder[c] for c in text
- ]).decode('utf-8',
- errors="replace").replace('</w>', ' ')
- return text
-
- def tokenize(self,
- texts: Union[str, List[str]],
- sot_token: int,
- eot_token: int,
- context_length: int = 77) -> ms.Tensor:
- """
- Returns the tokenized representation of given input string(s)
-
- Parameters
- ----------
- texts : Union[str, List[str]]
- An input string or a list of input strings to tokenize
- sot_token : int
- Id of the start-of-text command token prepended to every sequence
- eot_token : int
- Id of the end-of-text command token appended to every sequence
- context_length : int
- The context length to use; all CLIP models use 77 as the context length
-
- Returns
- -------
- A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length]
- """
- if isinstance(texts, str):
- texts = [texts]
-
- all_tokens = [[sot_token] + self.encode(text) + [eot_token]
- for text in texts]
- result = ops.Zeros()((len(all_tokens), context_length), ms.int32)
-
- for i, tokens in enumerate(all_tokens):
- if len(tokens) > context_length:
- tokens = tokens[:context_length] # Truncate
- result[i, :len(tokens)] = ms.Tensor(tokens)
- return result
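- # Expected call pattern (a sketch; the ids are looked up from the loaded vocab,
- # not hard-coded): the caller supplies the <start_of_text>/<end_of_text> ids and
- # gets back a fixed-width int32 tensor, zero-padded on the right.
- #
- #     sot = tokenizer.encoder['<start_of_text>']
- #     eot = tokenizer.encoder['<end_of_text>']
- #     batch = tokenizer.tokenize(["a photo of a cat", "a dog"], sot, eot, context_length=77)
- #     # batch.shape == (2, 77)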
-
-
- class SentencePieceTokenizer(object):
-
- def __init__(self, model_path):
- self.sp_model = spm.SentencePieceProcessor()
- self.sp_model.Load(model_path)
- # vocab = self.get_vocab()
- # print(vocab["<|endoftext|>"])
- # print(vocab["<|endofpiece|>"])
-
- @property
- def vocab_size(self):
- return self.sp_model.GetPieceSize()
-
- def get_vocab(self):
- vocab = {
- self.convert_id_to_token(i): i
- for i in range(self.vocab_size)
- }
- # vocab.update(self.added_tokens_encoder)
- return vocab
-
- def tokenize(self, text):
- return self.sp_model.EncodeAsPieces(text)
-
- def convert_tokens_to_ids(self, tokens):
- return [self.sp_model.PieceToId(token) for token in tokens]
-
- def convert_token_to_id(self, token):
- return self.sp_model.PieceToId(token)
-
- def convert_id_to_token(self, idx):
- return self.sp_model.IdToPiece(int(idx))
-
- def convert_ids_to_tokens(self, idxs):
- return [self.sp_model.IdToPiece(int(idx)) for idx in idxs]
-
- def convert_tokens_to_string(self, tokens, all_command_token={}):
- """Converts a sequence of tokens (string) into a single string."""
- current_sub_tokens = []
- out_string = ""
- for token in tokens:
- # make sure that special tokens are not decoded using sentencepiece model
- if token in all_command_token:
- out_string += self.sp_model.decode_pieces(
- current_sub_tokens) + token + " "
- current_sub_tokens = []
- else:
- current_sub_tokens.append(token)
- out_string += self.sp_model.decode_pieces(current_sub_tokens)
- return out_string.strip()
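- # Minimal usage sketch (the model path is hypothetical):
- #
- #     sp_tok = SentencePieceTokenizer("tokenizer.model")
- #     pieces = sp_tok.tokenize("Hello world")           # e.g. ['▁Hello', '▁world']
- #     ids = sp_tok.convert_tokens_to_ids(pieces)
- #     text = sp_tok.convert_tokens_to_string(pieces)    # "Hello world"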
-
- class CommandToken(object):
-
- def __init__(self, name, token, Id, lstrip=False, rstrip=False):
- self.name = name
- self.token = token
- self.Id = Id
- self.lstrip = lstrip
- self.rstrip = rstrip
-
- def __str__(self):
- return str(COMMAND_TUPLE(self.name, self.token, self.Id))
-
- class WordpieceTokenizer(object):
-
- def __init__(self,
- vocab_file=None,
- do_basic_tokenize=True,
- do_lower_case=True,
- max_len=None,
- never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]"),
- unk_token="[UNK]",
- max_input_chars_per_word=100,
- is_ch=False,
- *input,
- **kwargs):
- if not os.path.isfile(vocab_file):
- raise ValueError(
- "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
- "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
- .format(vocab_file))
- self.vocab = load_vocab(vocab_file)
- self.ids_to_tokens = collections.OrderedDict([
- (ids, tok) for tok, ids in self.vocab.items()
- ])
- self.do_basic_tokenize = do_basic_tokenize
- if do_basic_tokenize:
- self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case,
- never_split=never_split)
- self.max_len = max_len if max_len is not None else int(1e12)
- self.unk_token = unk_token
- self.max_input_chars_per_word = max_input_chars_per_word
- self.is_ch = is_ch
-
- @property
- def vocab_size(self):
- return len(self.vocab)
-
- def get_vocab(self):
- return self.vocab
-
- def word_piece(self, text):
- """Tokenizes a piece of text into its word pieces.
- This uses a greedy longest-match-first algorithm to perform tokenization
- using the given vocabulary.
- For example:
- input = "unaffable"
- output = ["un", "##aff", "##able"]
- Args:
- text: A single token or whitespace separated tokens. This should have
- already been passed through `BasicTokenizer`.
- Returns:
- A list of wordpiece tokens.
- """
-
- output_tokens = []
- for token in whitespace_tokenize(text):
- chars = list(token)
- if len(chars) > self.max_input_chars_per_word:
- output_tokens.append(self.unk_token)
- continue
-
- is_bad = False
- start = 0
- sub_tokens = []
- while start < len(chars):
- end = len(chars)
- cur_substr = None
- while start < end:
- substr = "".join(chars[start:end])
- if start > 0:
- substr = "##" + substr
- if substr in self.vocab:
- cur_substr = substr
- break
- end -= 1
- if cur_substr is None:
- is_bad = True
- break
- sub_tokens.append(cur_substr)
- start = end
-
- if is_bad:
- output_tokens.append(self.unk_token)
- else:
- output_tokens.extend(sub_tokens)
- return output_tokens
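- # Trace of the greedy longest-match for the docstring example, assuming "un",
- # "##aff" and "##able" are in the vocab: from position 0 the longest matching
- # prefix is "un"; from position 2 it is "##aff"; from position 5 it is "##able".
- # If no prefix matches at some position, the whole token collapses to unk_token.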
-
- def tokenize(self, text, maxlen=None, add_spatial_tokens=False):
- if self.do_basic_tokenize:
- split_tokens = []
- for token in self.basic_tokenizer.tokenize(text):
- for sub_token in self.word_piece(token):
- split_tokens.append(sub_token)
- else:
- split_tokens = self.word_piece(text)
-
- if add_spatial_tokens:
- split_tokens.insert(0, self._token_cls)
- split_tokens.append(self._token_sep)
-
- if maxlen is not None:
- index = int(self._token_sep is not None) + 1
- self.truncate_sequence(maxlen, split_tokens, pop_index=-index)
- return split_tokens
-
- def truncate_sequence(self,
- max_length,
- first_sequence,
- second_sequence=None,
- pop_index=-1):
-
- if second_sequence is None:
- second_sequence = []
-
- while True:
- total_length = len(first_sequence) + len(second_sequence)
- if total_length <= max_length:
- break
- elif len(first_sequence) > len(second_sequence):
- first_sequence.pop(pop_index)
- else:
- second_sequence.pop(pop_index)
-
- def convert_token_to_id(self, token):
- """ Converts a token into an id using the vocab. """
- return self.vocab[token]
-
- def convert_tokens_to_ids(self, tokens):
- """Converts a sequence of tokens into ids using the vocab."""
- ids = [self.convert_token_to_id(token) for token in tokens]
- if len(ids) > self.max_len:
- logger.warning(
- "Token indices sequence length is longer than the specified maximum "
- " sequence length for this BERT model ({} > {}). Running this"
- " sequence through BERT will result in indexing errors".format(
- len(ids), self.max_len))
- return ids
-
- def convert_id_to_token(self, id):
- """Converts an id into a wordpiece token using the vocab."""
- return self.ids_to_tokens[id]
-
- def convert_ids_to_tokens(self, ids):
- """Converts a sequence of ids into wordpiece tokens using the vocab."""
- return [self.convert_id_to_token(id) for id in ids]
-
- def convert_tokens_to_string(self, tokens, all_command_token={}):
- """Converts a sequence of tokens (string) into a single string."""
- if self.is_ch:
- out_string = "".join(tokens).replace(" ", "").strip()
- else:
- out_string = " ".join(tokens).replace(" ##", "").strip()
- return out_string
-
- class BasicTokenizer(object):
- """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
-
- def __init__(self, do_lower_case=True, never_split=None):
- """Constructs a BasicTokenizer.
-
- Args:
- do_lower_case: Whether to lower case the input.
- never_split: Optional collection of tokens to keep intact (accepted for
- compatibility with `WordpieceTokenizer`; not used during tokenization).
- """
- self.do_lower_case = do_lower_case
- self.never_split = never_split if never_split is not None else ()
-
- def tokenize(self, text):
- """Tokenizes a piece of text."""
- text = convert_to_unicode(text)
- text = self._clean_text(text)
-
- # This was added on November 1st, 2018 for the multilingual and Chinese
- # models. This is also applied to the English models now, but it doesn't
- # matter since the English models were not trained on any Chinese data
- # and generally don't have any Chinese data in them (there are Chinese
- # characters in the vocabulary because Wikipedia does have some Chinese
- # words in the English Wikipedia.).
- text = self._tokenize_chinese_chars(text)
-
- orig_tokens = whitespace_tokenize(text)
- split_tokens = []
- for token in orig_tokens:
- if self.do_lower_case:
- token = token.lower()
- token = self._run_strip_accents(token)
- split_tokens.extend(self._run_split_on_punc(token))
-
- output_tokens = whitespace_tokenize(" ".join(split_tokens))
- return output_tokens
-
- def _run_strip_accents(self, text):
- """Strips accents from a piece of text."""
- text = unicodedata.normalize("NFD", text)
- output = []
- for char in text:
- cat = unicodedata.category(char)
- if cat == "Mn":
- continue
- output.append(char)
- return "".join(output)
-
- def _run_split_on_punc(self, text):
- """Splits punctuation on a piece of text."""
- chars = list(text)
- i = 0
- start_new_word = True
- output = []
- while i < len(chars):
- char = chars[i]
- if _is_punctuation(char):
- output.append([char])
- start_new_word = True
- else:
- if start_new_word:
- output.append([])
- start_new_word = False
- output[-1].append(char)
- i += 1
-
- return ["".join(x) for x in output]
-
- def _tokenize_chinese_chars(self, text):
- """Adds whitespace around any CJK character."""
- output = []
- for char in text:
- cp = ord(char)
- if self._is_chinese_char(cp):
- output.append(" ")
- output.append(char)
- output.append(" ")
- else:
- output.append(char)
- return "".join(output)
-
- def _is_chinese_char(self, cp):
- """Checks whether CP is the codepoint of a CJK character."""
- # This defines a "chinese character" as anything in the CJK Unicode block:
- # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
- #
- # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
- # despite its name. The modern Korean Hangul alphabet is a different block,
- # as is Japanese Hiragana and Katakana. Those alphabets are used to write
- # space-separated words, so they are not treated specially and handled
- # like all of the other languages.
- if ((cp >= 0x4E00 and cp <= 0x9FFF) or #
- (cp >= 0x3400 and cp <= 0x4DBF) or #
- (cp >= 0x20000 and cp <= 0x2A6DF) or #
- (cp >= 0x2A700 and cp <= 0x2B73F) or #
- (cp >= 0x2B740 and cp <= 0x2B81F) or #
- (cp >= 0x2B820 and cp <= 0x2CEAF) or
- (cp >= 0xF900 and cp <= 0xFAFF) or #
- (cp >= 0x2F800 and cp <= 0x2FA1F)): #
- return True
-
- return False
-
- def _clean_text(self, text):
- """Performs invalid character removal and whitespace cleanup on text."""
- output = []
- for char in text:
- cp = ord(char)
- if cp == 0 or cp == 0xfffd or _is_control(char):
- continue
- if _is_whitespace(char):
- output.append(" ")
- else:
- output.append(char)
- return "".join(output)
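- # Behaviour sketch: the tokenizer lower-cases, strips accents, splits punctuation
- # and isolates CJK characters, e.g.
- #
- #     BasicTokenizer().tokenize("Héllo, 世界!")  ->  ['hello', ',', '世', '界', '!']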
-
- class FullTokenizer(object):
- """Runs end-to-end tokenziation."""
-
- def __init__(self, vocab_file=default_vocab(), do_lower_case=True):
- self.vocab = load_vocab(vocab_file)
- self.inv_vocab = {v: k for k, v in self.vocab.items()}
- self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
- self.wordpiece_tokenizer = WordpieceTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)
-
- def _tokenize(self, text):
- split_tokens = []
- for token in self.basic_tokenizer.tokenize(text):
- for sub_token in self.wordpiece_tokenizer.word_piece(token):
- split_tokens.append(sub_token)
-
- return split_tokens
-
- def tokenize(self,
- texts: Union[str, List[str]],
- context_length: int = 64) -> ms.Tensor:
- """
- Returns the tokenized representation of given input string(s)
- Parameters
- ----------
- texts : Union[str, List[str]]
- An input string or a list of input strings to tokenize
- context_length : int
- The context length to use; all baseline models use 24 as the context length
- Returns
- -------
- A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length]
- """
- if isinstance(texts, str):
- texts = [texts]
-
- all_tokens = []
- for text in texts:
- all_tokens.append([self.vocab['[CLS]']] +
- self.convert_tokens_to_ids(self._tokenize(text))
- [:context_length - 2] + [self.vocab['[SEP]']])
-
- result = ops.Zeros()((len(all_tokens), context_length), ms.int32)
-
- for i, tokens in enumerate(all_tokens):
- assert len(tokens) <= context_length
- result[i, :len(tokens)] = ms.Tensor(tokens)
-
- return result
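- # Layout sketch of the returned tensor: each text becomes
- #     [CLS] + ids[:context_length - 2] + [SEP]
- # and rows are zero-padded on the right, giving an int32 tensor of shape
- # (number of input strings, context_length).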
-
- def convert_tokens_to_ids(self, tokens):
- return convert_by_vocab(self.vocab, tokens)
-
- def convert_token_to_id(self, token):
- return self.vocab[token]
-
- def convert_ids_to_tokens(self, ids):
- return convert_by_vocab(self.inv_vocab, ids)
-
- @staticmethod
- def convert_tokens_to_string(tokens, clean_up_tokenization_spaces=True):
- """ Converts a sequence of tokens (string) into a single string. """
-
- def clean_up_tokenization(out_string):
- """ Clean up a list of simple English tokenization artifacts
- like spaces before punctuation and abbreviated forms.
- """
- out_string = (out_string.replace(" .", ".").replace(
- " ?", "?").replace(" !", "!").replace(" ,", ",").replace(
- " ' ",
- "'").replace(" n't", "n't").replace(" 'm", "'m").replace(
- " 's", "'s").replace(" 've",
- "'ve").replace(" 're", "'re"))
- return out_string
-
- text = ' '.join(tokens).replace(' ##', '').strip()
- if clean_up_tokenization_spaces:
- clean_text = clean_up_tokenization(text)
- return clean_text
- else:
- return text
-
- @property
- def vocab_size(self):
- return len(self.vocab)
-
-
- class BaseTokenizer(object):
-
- @classmethod
- def from_pretrained(cls,
- tokenizer_model_name,
- cache_dir=None,
- *inputs,
- **kwargs):
- """
- Instantiate a PreTrainedBertModel from a pre-trained model file.
- Download and cache the pre-trained model file if needed.
-
- Args:
- tokenizer_model_name (`str`):
- Name of the model associated with the tokenizer
- cache_dir (`str`):
- The directory that contains the vocab files, or will receive the downloaded vocab files
- """
- if cache_dir is None:
- # cache_dir = os.path.join(os.path.dirname(__file__), 'vocabs', f"{tokenizer_model_name}")
- cache_dir = './checkpoints/' + tokenizer_model_name
- tokenizer_class = ""
- # search the cache directory for certain files
- if os.path.exists(cache_dir):
- files = os.listdir(cache_dir)
- if SP_MODEL_FILE in files:
- tokenizer_class = "sp"
- elif MERGES_FILE in files:
- tokenizer_class = "bpe"
- elif VOCAB_FILE in files:
- tokenizer_class = "wp"
- if tokenizer_class == "":
- print("downloading model %s from ModelHub" % tokenizer_model_name)
- files = _get_model_files(tokenizer_model_name)
- model_id = _get_model_id(tokenizer_model_name)
- if SP_MODEL_FILE in files:
- tokenizer_class = "sp"
- _get_vocab_path(cache_dir + '/',
- SP_MODEL_FILE,
- model_id,
- rank=0)
- elif MERGES_FILE in files:
- tokenizer_class = "bpe"
- _get_vocab_path(cache_dir + '/', MERGES_FILE, model_id, rank=0)
- if VOCAB_JSON_FILE in files:
- _get_vocab_path(cache_dir + '/',
- VOCAB_JSON_FILE,
- model_id,
- rank=0)
- elif VOCAB_FILE in files:
- tokenizer_class = "wp"
- _get_vocab_path(cache_dir + '/', VOCAB_FILE, model_id, rank=0)
- else:
- raise FileNotFoundError("Error: no tokenizer files")
- resolved_vocab_json_file = os.path.join(
- cache_dir, VOCAB_JSON_FILE) if VOCAB_JSON_FILE in files else None
- resolved_vocab_file = os.path.join(cache_dir, VOCAB_FILE)
- resolved_merges_file = os.path.join(cache_dir, MERGES_FILE)
- resolved_sp_file = os.path.join(cache_dir, SP_MODEL_FILE)
- if tokenizer_class == "wp":
- return cls(vocab_file=resolved_vocab_file,
- tokenizer_class=tokenizer_class,
- tokenizer_model_name=tokenizer_model_name,
- cache_dir=cache_dir,
- *inputs,
- **kwargs)
- elif tokenizer_class == "bpe":
- return cls(vocab_file=resolved_vocab_json_file,
- merges_file=resolved_merges_file,
- tokenizer_class=tokenizer_class,
- tokenizer_model_name=tokenizer_model_name,
- cache_dir=cache_dir,
- *inputs,
- **kwargs)
- elif tokenizer_class == "sp":
- return cls(sp_model_file=resolved_sp_file,
- tokenizer_class=tokenizer_class,
- tokenizer_model_name=tokenizer_model_name,
- cache_dir=cache_dir,
- *inputs,
- **kwargs)
- else:
- raise NotImplementedError(
- "Cannot find a tokenizer class that matches the files settings in the directory or ModelHub"
- )
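- # Cache layout the dispatch above expects (a sketch; the concrete file names come
- # from the constants imported from src.utils):
- #     <cache_dir>/<SP_MODEL_FILE>                 -> SentencePiece tokenizer ("sp")
- #     <cache_dir>/<MERGES_FILE> [+ VOCAB_JSON]    -> BPE tokenizer ("bpe")
- #     <cache_dir>/<VOCAB_FILE>                    -> WordPiece tokenizer ("wp")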
-
- def __init__(self,
- vocab_file=None,
- merges_file=None,
- sp_model_file=None,
- tokenizer_class=None,
- tokenizer_model_name=None,
- cache_dir=None,
- *inputs,
- **kwargs):
-
- self.vocab_file = vocab_file
- self.merges_file = merges_file
- self.sp_model_file = sp_model_file
- self.tokenizer_class = tokenizer_class
- self.tokenizer_model_name = tokenizer_model_name
- self.cache_dir = cache_dir
- self.deprecation_warnings = {}
-
- class Tokenizer(BaseTokenizer):
-
- def __init__(self,
- add_block_symbols=True,
- add_sentinel_token=0,
- add_task_mask=True,
- add_decoder_mask=False,
- fix_command_token=True,
- **kwargs):
- super().__init__(**kwargs)
- if self.tokenizer_class == "wp":
- if self.tokenizer_model_name.lower().endswith("ch"):
- self.text_tokenizer = WordpieceTokenizer(self.vocab_file,
- is_ch=True)
- elif self.tokenizer_model_name.lower().startswith('clip-cn'):
- self.text_tokenizer = FullTokenizer(self.vocab_file)
- else:
- self.text_tokenizer = WordpieceTokenizer(self.vocab_file)
- elif self.tokenizer_class == "bpe":
- if self.tokenizer_model_name.lower().startswith('clip'):
- self.text_tokenizer = MMBPETokenizer(self.vocab_file,
- self.merges_file)
- else:
- self.text_tokenizer = BPETokenizer(self.vocab_file,
- self.merges_file)
- elif self.tokenizer_class == "sp":
- self.text_tokenizer = SentencePieceTokenizer(self.sp_model_file)
- else:
- raise NotImplementedError("cannot assign a tokenizer class")
-
- self.is_glm = self.tokenizer_model_name.lower().startswith('glm')
- # self.is_clip = self.tokenizer_model_name.startswith('clip')
- self.num_tokens = self.text_tokenizer.vocab_size
-
- if self.tokenizer_class == "wp":
- # set command tokens from wordpiece tokenizer values
- self.num_command_tokens = 6
- self.num_text_tokens = self.num_tokens - 5
- self.num_type_tokens = 2
- self.token_start_id = None
- self.token_end_id = None
- self.token_pad_id = None
- try:
- self._command_tokens = [
- CommandToken(
- 'pad', '[PAD]',
- self.text_tokenizer.convert_token_to_id('[PAD]')),
- CommandToken(
- 'cls', '[CLS]',
- self.text_tokenizer.convert_token_to_id('[CLS]')),
- CommandToken(
- 'MASK', '[MASK]',
- self.text_tokenizer.convert_token_to_id('[MASK]')),
- CommandToken(
- 'unk', '[UNK]',
- self.text_tokenizer.convert_token_to_id('[UNK]')),
- CommandToken(
- 'sep', '[SEP]',
- self.text_tokenizer.convert_token_to_id('[SEP]')),
- CommandToken(
- 'eos', '[PAD]',
- self.text_tokenizer.convert_token_to_id('[PAD]')),
- ]
- self.token_start_id = self.text_tokenizer.convert_token_to_id(
- '[CLS]')
- self.token_end_id = self.text_tokenizer.convert_token_to_id(
- '[SEP]')
- self.token_pad_id = self.text_tokenizer.convert_token_to_id(
- '[PAD]')
- self.text_tokenizer._token_cls = "[CLS]"
- self.text_tokenizer._token_sep = "[SEP]"
-
- except KeyError:
- self._command_tokens = [
- CommandToken(
- 'pad', '[PAD]',
- self.text_tokenizer.convert_token_to_id('<pad>')),
- CommandToken(
- 'cls', '[CLS]',
- self.text_tokenizer.convert_token_to_id('<s>')),
- CommandToken(
- 'MASK', '[MASK]',
- self.text_tokenizer.convert_token_to_id('<mask>')),
- CommandToken(
- 'unk', '[UNK]',
- self.text_tokenizer.convert_token_to_id('<unk>')),
- CommandToken(
- 'sep', '[SEP]',
- self.text_tokenizer.convert_token_to_id('<sep>')),
- CommandToken(
- 'eos', '[PAD]',
- self.text_tokenizer.convert_token_to_id('</s>')),
- ]
- self.token_start_id = self.text_tokenizer.convert_token_to_id(
- '<s>')
- self.token_end_id = self.text_tokenizer.convert_token_to_id(
- '</s>')
- self.token_pad_id = self.text_tokenizer.convert_token_to_id(
- '<pad>')
- self.text_tokenizer._token_cls = "<s>"
- self.text_tokenizer._token_sep = "</s>"
- if add_block_symbols:
- self._command_tokens.extend([
- CommandToken('sop', '<|startofpiece|>', self.num_tokens),
- CommandToken('eop', '<|endofpiece|>', self.num_tokens + 1)
- ])
- self.num_tokens += 2
- self.num_command_tokens += 2
- if add_task_mask:
- self._command_tokens.extend([
- CommandToken('gMASK', '[gMASK]', self.num_tokens),
- CommandToken('sMASK', '[sMASK]', self.num_tokens + 1)
- ])
- self.num_tokens += 2
- self.num_command_tokens += 2
- if add_decoder_mask:
- self._command_tokens.extend(
- [CommandToken('dBLOCK', '[dBLOCK]', self.num_tokens)])
- self.num_tokens += 1
- self.num_command_tokens += 1
- if add_sentinel_token > 0:
- for i in range(1, add_sentinel_token):
- self._command_tokens.extend([
- CommandToken(f'MASK{i}', f'[MASK{i}]',
- self.num_tokens),
- CommandToken(f'sop{i}', f'<|startofpiece{i}|>',
- self.num_tokens + 1)
- ])
- self.num_tokens += 2
- self.num_command_tokens += 2
- elif self.tokenizer_class == "bpe":
- if self.tokenizer_model_name.lower().startswith('roberta'):
- self.num_command_tokens = 6
- self.num_text_tokens = self.num_tokens - 3
- self._command_tokens = [
- CommandToken(
- 'pad', '<|endoftext|>',
- self.text_tokenizer.convert_token_to_id('</s>')),
- CommandToken(
- 'eos', '<|endoftext|>',
- self.text_tokenizer.convert_token_to_id('</s>')),
- CommandToken(
- 'sep', '[SEP]',
- self.text_tokenizer.convert_token_to_id('</s>')),
- CommandToken(
- 'cls', '[CLS]',
- self.text_tokenizer.convert_token_to_id('<s>')),
- CommandToken(
- 'MASK',
- '[MASK]',
- self.text_tokenizer.convert_token_to_id('<mask>'),
- lstrip=True),
- CommandToken(
- 'unk', '[UNK]',
- self.text_tokenizer.convert_token_to_id('<unk>'))
- ]
- if add_block_symbols:
- self._command_tokens.extend([
- CommandToken('sop', '<|startofpiece|>',
- self.num_tokens),
- CommandToken('eop', '<|endofpiece|>',
- self.num_tokens + 1)
- ])
- self.num_tokens += 2
- self.num_command_tokens += 2
- self.token_end_id = self.text_tokenizer.convert_token_to_id(
- '</s>')
- elif self.tokenizer_model_name.lower().startswith('clip'):
- self.num_command_tokens = 2
- self._command_tokens = [
- CommandToken(
- 'sot', '<start_of_text>',
- self.text_tokenizer.convert_token_to_id('</s>')),
- CommandToken(
- 'eot', '<end_of_text>',
- self.text_tokenizer.convert_token_to_id('</s>')),
- ]
- self.num_tokens += self.num_command_tokens
- self.token_end_id = self.text_tokenizer.convert_token_to_id(
- '</s>')
- else:
- self.num_command_tokens = 2
- self.num_text_tokens = self.num_tokens - 1
- self._command_tokens = [
- CommandToken(
- 'pad', '<|endoftext|>',
- self.text_tokenizer.convert_token_to_id(
- '<|endoftext|>')),
- CommandToken(
- 'eos', '<|endoftext|>',
- self.text_tokenizer.convert_token_to_id(
- '<|endoftext|>'))
- ]
- self.token_end_id = self.text_tokenizer.convert_token_to_id(
- '<|endoftext|>')
- if add_block_symbols:
- if self.tokenizer_model_name.lower().startswith('glm'):
- unk_token_id = self.num_tokens + 5
- cls_token_id = self.num_tokens + 2
- num_tokens_to_add = 5
- else:
- unk_token_id = self.text_tokenizer.convert_token_to_id(
- '<|endoftext|>')
- cls_token_id = self.text_tokenizer.convert_token_to_id(
- '<|endoftext|>')
- num_tokens_to_add = 4
- self._command_tokens.extend([
- CommandToken('sop', '<|startofpiece|>',
- self.num_tokens),
- CommandToken('eop', '<|endofpiece|>',
- self.num_tokens + 1),
- CommandToken('cls', '[CLS]', cls_token_id),
- CommandToken('MASK',
- '[MASK]',
- self.num_tokens + 3,
- lstrip=True),
- CommandToken('sep', '[SEP]', self.num_tokens + 4),
- CommandToken('unk', '[UNK]', unk_token_id)
- ])
- self.num_tokens += num_tokens_to_add
- self.num_command_tokens += 6
-
- if add_block_symbols:
- if add_task_mask:
- self._command_tokens.extend([
- CommandToken('gMASK',
- '[gMASK]',
- self.num_tokens,
- lstrip=True),
- CommandToken('sMASK',
- '[sMASK]',
- self.num_tokens + 1,
- lstrip=True)
- ])
- self.num_tokens += 2
- self.num_command_tokens += 2
- if add_decoder_mask:
- self._command_tokens.extend(
- [CommandToken('dBLOCK', '[dBLOCK]', self.num_tokens)])
- self.num_tokens += 1
- self.num_command_tokens += 1
-
- elif self.tokenizer_class == "sp":
- self.num_command_tokens = 0
- self.num_text_tokens = self.text_tokenizer.vocab_size
- self.num_tokens = self.num_text_tokens
-
- if self.tokenizer_model_name.lower().startswith('glm'):
- pad_token_id = self.num_tokens
- eos_token_id = self.num_tokens
- unk_token_id = self.num_tokens + 4
- else:
- pad_token_id = self.text_tokenizer.convert_token_to_id('<pad>')
- eos_token_id = self.text_tokenizer.convert_token_to_id('</s>')
- unk_token_id = self.text_tokenizer.convert_token_to_id('<unk>')
- self._command_tokens = [
- CommandToken('pad', '<|endoftext|>', self.num_text_tokens),
- CommandToken('eos', '<|endoftext|>', self.num_text_tokens),
- CommandToken('sep', '[SEP]', self.num_text_tokens + 1),
- CommandToken('cls', '[CLS]', self.num_text_tokens + 2),
- CommandToken('MASK',
- '[MASK]',
- self.num_text_tokens + 3,
- lstrip=True),
- CommandToken('unk', '[UNK]', self.num_text_tokens + 4)
- ]
-
- self.num_tokens += 5
- self.num_command_tokens += 6
- self.token_end_id = self.text_tokenizer.convert_token_to_id(
- '</s>')
- if add_block_symbols:
- self._command_tokens.extend([
- CommandToken('sop', '<|startofpiece|>',
- self.num_tokens + 1),
- CommandToken('eop', '<|endofpiece|>', self.num_tokens + 2)
- ])
- if fix_command_token:
- self.num_tokens += 3
- else:
- self.num_tokens += 2
- self.num_command_tokens += 2
- if add_task_mask:
- if fix_command_token:
- self._command_tokens.extend([
- CommandToken('sMASK',
- '[sMASK]',
- self.num_tokens,
- lstrip=True),
- CommandToken('gMASK',
- '[gMASK]',
- self.num_tokens + 1,
- lstrip=True)
- ])
- else:
- self._command_tokens.extend([
- CommandToken('gMASK',
- '[gMASK]',
- self.num_tokens,
- lstrip=True),
- CommandToken('sMASK',
- '[sMASK]',
- self.num_tokens + 1,
- lstrip=True)
- ])
- self.num_tokens += 2
- self.num_command_tokens += 2
- if add_decoder_mask:
- self._command_tokens.extend(
- [CommandToken('dBLOCK', '[dBLOCK]', self.num_tokens)])
- self.num_tokens += 1
- self.num_command_tokens += 1
- self.command_name_map = {tok.name: tok for tok in self._command_tokens}
- self.command_token_map = {
- tok.token: tok
- for tok in self._command_tokens
- }
- self.command_id_map = {tok.Id: tok for tok in self._command_tokens}
- self._command_token_tokens = list(self.command_token_map.keys())
-
- def get_vocab(self):
- return self.text_tokenizer.get_vocab()
-
- def get_command_id(self, name):
- """get the Id of the command token corresponding to `name`"""
- return self.command_name_map[name].Id
-
- def rematch(self, text, tokens):
- """output the mapping between raw text and tokenized text"""
- text = text.lower()
- normalized_text, char_mapping = '', []
-
- for i, ch in enumerate(text):
- ch = ''.join([
- c for c in ch
- if not (ord(c) == 0 or ord(c) == 0xfffd or self._is_control(c))
- ])
- normalized_text += ch
- char_mapping.extend([i] * len(ch))
-
- text, token_mapping, offset = normalized_text, [], 0
- for token in tokens:
- if self._is_special(token):
- token_mapping.append([])
- else:
- token = self.stem(token)
- start = text[offset:].index(token) + offset
- end = start + len(token)
- token_mapping.append(char_mapping[start:end])
- offset = end
- return token_mapping
-
- @staticmethod
- def _is_control(ch):
- return unicodedata.category(ch) in ('Cc', 'Cf')
-
- @staticmethod
- def stem(token):
- if token[:2] == '##':
- return token[2:]
- else:
- return token
-
- @staticmethod
- def _is_special(ch):
- return bool(ch) and (ch[0] == '[') and (ch[-1] == ']')
-
- def _encode(self, text):
- tokens = self.text_tokenizer.tokenize(text)
- ids = self.text_tokenizer.convert_tokens_to_ids(tokens)
- return ids
-
- def convert_tokens_to_ids(self, tokens):
- res = []
- for token in tokens:
- if token in self.command_token_map:
- res.append(self.command_token_map[token].Id)
- else:
- res.append(self.text_tokenizer.convert_token_to_id(token))
- return res
-
- def convert_ids_to_tokens(self, ids):
- if isinstance(ids, ms.Tensor):
- ids = ids.asnumpy().tolist()
- res = []
- for id in ids:
- if id in self.command_id_map:
- res.append(self.command_id_map[id].token)
- else:
- res.append(self.text_tokenizer.convert_id_to_token(id))
- return res
-
- def EncodeAsTokens(self, text, process_fn=None):
- """tokenize text into tokens, optionally applying `process_fn` first"""
- processed_text = text
- if process_fn is not None:
- processed_text = process_fn(processed_text)
- tokens = self.text_tokenizer.tokenize(processed_text)
- return tokens
-
- def IdToToken(self, id):
- """convert an Id to a token"""
- if isinstance(id, CommandToken):
- return id.token
- if id in self.command_id_map:
- return self.command_id_map[id].token
- return self.text_tokenizer.convert_id_to_token(id)
-
- def TokenToId(self, token):
- """convert a token to its Id"""
- if isinstance(token, CommandToken):
- return token.Id
- token = token.lower()
- try:
- return self.text_tokenizer.convert_token_to_id(token)
- except KeyError:
- return self.text_tokenizer.convert_token_to_id(token.strip())
-
- def DecodeIds(self, ids):
- """converts ids to wordpiece tokens and joins them as a text string"""
- tokens = []
- for id in ids:
- if id in self.command_id_map:
- tokens.append(self.command_id_map[id].token)
- else:
- try:
- tokens.extend(
- self.text_tokenizer.convert_ids_to_tokens([id]))
- except KeyError:
- pass
- return self.text_tokenizer.convert_tokens_to_string(
- tokens, self.command_token_map)
-
- def encode(self, text):
- return self.convert_tokens_to_ids(
- self.text_tokenizer.tokenize(text))
-
- def decode(self, ids):
- return self.DecodeIds(ids)
-
- def DecodeTokens(self, tokens):
- """converts wordpiece tokens to a text string"""
- return self.text_tokenizer.convert_tokens_to_string(
- tokens, self.command_token_map)
-
- def EncodeAsIds(self, text, process_fn=None):
- """
- encode text using text tokenizer and shift Id values for command tokens
- """
- processed_text = text
- if process_fn is not None:
- processed_text = process_fn(processed_text)
-
- def split_on_token(tok_extended: CommandToken, text):
- result = []
- tok = tok_extended.token
- split_text = text.split(tok)
- for i, sub_text in enumerate(split_text):
- # CommandToken can control whitespace stripping around them.
- # We use them for GPT2 and Roberta to have different behavior depending on the special token
- # Cf. https://github.com/huggingface/transformers/pull/2778
- # and https://github.com/huggingface/transformers/issues/3788
- # Strip white spaces on the right
- if tok_extended.rstrip and i > 0:
- # A bit counter-intuitive but we strip the left of the string
- # since tok_extended.rstrip means the special token is eating all white spaces on its right
- sub_text = sub_text.lstrip()
- # Strip white spaces on the left
- if tok_extended.lstrip and i < len(split_text) - 1:
- sub_text = sub_text.rstrip() # Opposite here
-
- if i == 0 and not sub_text:
- result.append(tok)
- elif i == len(split_text) - 1:
- if sub_text:
- result.append(sub_text)
- else:
- pass
- else:
- if sub_text:
- result.append(sub_text)
- result.append(tok)
- return result
-
- def split_on_tokens(tok_list, text):
- if not text.strip():
- return []
- if not tok_list:
- return self.encode(text)
-
- tokenized_text = []
- text_list = [text]
- for tok in tok_list:
- tokenized_text = []
- for sub_text in text_list:
- if sub_text not in self._command_token_tokens:
- tokenized_text.extend(split_on_token(tok, sub_text))
- else:
- tokenized_text.append(sub_text)
- text_list = tokenized_text
-
- return list(
- itertools.chain.from_iterable(
- (self._encode(token)
- if token not in self._command_token_tokens else
- [self.command_token_map[token].Id]
- for token in tokenized_text)))
-
- no_split_tokens = self._command_tokens
- Ids = split_on_tokens(no_split_tokens, processed_text)
- return Ids
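- # Walk-through sketch of split_on_tokens: the text is split around every command
- # token string; command pieces map directly to their fixed Ids via
- # command_token_map, while the remaining sub-strings go through the regular text
- # tokenizer, so command tokens are never broken apart by BPE/WordPiece merges.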
-
- def CommandTokenIds(self, exception=None):
- result = []
- for s in self._command_tokens:
- if not exception or (exception and s.name not in exception):
- result.append(s.Id)
- return result
-
- def encode_plus_non_glm(
- self,
- text,
- second_text=None,
- truncation=True,
- max_length=None,
- ):
-
- def get_input_ids(text):
- tokens = self.text_tokenizer.tokenize(text)
- return self.text_tokenizer.convert_tokens_to_ids(tokens)
-
- first_ids = get_input_ids(text)
- second_ids = get_input_ids(
- second_text) if second_text is not None else None
- return self.prepare_for_model(
- first_ids,
- pair_ids=second_ids,
- truncation=truncation,
- max_length=max_length,
- )
-
- def prepare_for_model(
- self,
- ids: List[int],
- pair_ids: Optional[List[int]] = None,
- add_special_tokens: bool = True,
- truncation: Union[bool, str] = True,
- max_length: Optional[int] = None,
- ):
- pair = bool(pair_ids is not None)
- len_ids = len(ids)
- len_pair_ids = len(pair_ids) if pair else 0
-
- encoded_inputs = {}
- total_len = len_ids + len_pair_ids + 3
-
- # Truncation: Handle max sequence length
- if truncation is True and (max_length is not None
- and total_len > max_length):
- self.truncate_sequence(
- max_length,
- ids,
- pair_ids,
- pop_index=-1,
- )
-
- if add_special_tokens:
- if pair_ids is not None:
- sequence = [self.get_command_id("cls")] + ids + [
- self.token_end_id
- ] + pair_ids + [self.token_end_id]
- token_type_ids = [0] * (len(ids) + 2) + [1] * (len(pair_ids) +
- 1)
- else:
- sequence = [self.get_command_id("cls")
- ] + ids + [self.token_end_id]
- token_type_ids = [0] * (len(ids) + 2)
- else:
- sequence = ids + pair_ids if pair else ids
- token_type_ids = [0] * len(ids) + ([0] *
- len(pair_ids) if pair else [])
-
- encoded_inputs["input_ids"] = sequence
- encoded_inputs["token_type_ids"] = token_type_ids
- return encoded_inputs
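- # Output layout when add_special_tokens is True (ids shown symbolically):
- #     single sequence: [cls] + ids + [end]                    token_type_ids = 0...0
- #     sequence pair:   [cls] + ids + [end] + pair_ids + [end] token_type_ids = 0...0 1...1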
-
- def encode_plus( # for Seq2seq
- self,
- source_text: str,
- second_text=None,
- target_text=None,
- truncation=True,
- max_length=None,
- padding=True,
- ):
- if not self.tokenizer_model_name.lower().startswith("glm") and not self.tokenizer_model_name.lower().startswith("alm"):
- return self.encode_plus_non_glm(source_text, second_text,
- truncation, max_length)
- sop_id = self.get_command_id('sop') # start of piece
- eop_id = self.get_command_id('eop') # end of piece
- sep_id = self.get_command_id('sep')  # separator
-
- source_tokens = self.EncodeAsIds(source_text)
- if truncation and max_length:
- self.truncate_sequence(max_length - 2, source_tokens)
- source_tokens = [sop_id] + source_tokens + [sep_id]
-
- # no padding for consistency
- len_source = len(source_tokens)
- sop_pos = source_tokens.index(sop_id)
- loss_mask = [0] * len_source
- block_position_ids = [0] * len_source
- position_ids = list(range(len_source))
-
- if target_text:
- target_tokens = self.EncodeAsIds(target_text)
- if max_length:
- target_tokens_length = min(max_length - len(source_tokens), len(target_tokens))
- pad_token = self.get_command_id('pad')
- padding_length = max(0, max_length - len(source_tokens) - target_tokens_length)
- target_tokens = target_tokens[:target_tokens_length] + [pad_token] * padding_length + [eop_id]
- else:
- target_tokens += [eop_id]
- loss_mask += [1] * len(target_tokens)
- block_position_ids += [0] * len(target_tokens)
- position_ids += [x + len_source for x in range(len(target_tokens))]
- tokens = source_tokens + target_tokens
- position_ids = [position_ids[:-1], block_position_ids[:-1]]
-
- sample = {
- 'input_ids': tokens[:-1],
- 'target_ids': tokens[1:],
- 'attention_mask': sop_pos,
- 'loss_mask': loss_mask[:-1],
- "position_ids": position_ids
- }
- else:
- position_ids = [position_ids, block_position_ids]
- sample = {
- 'input_ids': source_tokens,
- 'attention_mask': sop_pos,
- "position_ids": position_ids,
- 'loss_mask': loss_mask,
- }
- return sample
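- # Sample layout for the GLM/ALM branch (a sketch): with a target text the token
- # sequence is [sop] + source + [sep] + target + [eop]; input_ids/target_ids are
- # that sequence shifted by one position, loss_mask is 0 over the source span and
- # 1 over the target span, and position_ids holds two parallel lists of absolute
- # and block positions.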
-
- @staticmethod
- def truncate_sequence(max_length,
- first_sequence,
- second_sequence=None,
- pop_index=-1):
-
- if second_sequence is None:
- second_sequence = []
-
- while True:
- total_length = len(first_sequence) + len(second_sequence)
- if total_length <= max_length:
- break
- elif len(first_sequence) > len(second_sequence):
- first_sequence.pop(pop_index)
- else:
- second_sequence.pop(pop_index)
-
- def tokenize_as_tensor(self, texts):
- """
- Returns the tokenized representation of given input string(s)
-
- Parameters
- ----------
- texts : Union[str, List[str]]
- An input string or a list of input strings to tokenize
-
- Returns
- -------
- A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length]
- """
- sot_token = self.get_command_id('sot')
- eot_token = self.get_command_id('eot')
- return self.text_tokenizer.tokenize(texts,
- sot_token=sot_token,
- eot_token=eot_token)
-
- def tokenize(self, text, maxlen=None, add_spatial_tokens=False):
- tokens = self.text_tokenizer.tokenize(text)
-
- if add_spatial_tokens:
- tokens.insert(0, self.command_name_map['cls'].token)
- tokens.append(self.command_name_map['sep'].token)
-
- if maxlen is not None:
- index = int(self.get_command_id('sep') is not None) + 1
- self.truncate_sequence(maxlen, tokens, pop_index=-index)
- return tokens