# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Tokenization utilities."""

import sys
import collections
import unicodedata


def convert_to_printable(text):
    """
    Converts `text` to a printable `str`, decoding `bytes` as UTF-8.
    """
    if sys.version_info[0] == 3:
        if isinstance(text, str):
            return text
        if isinstance(text, bytes):
            return text.decode("utf-8", "ignore")
        raise ValueError("Unsupported text type `%s`: only `str` and `bytes` are supported." % type(text))
    raise ValueError("Only supported when running on Python 3.")


def convert_to_unicode(text):
    """
    Converts `text` to Unicode (`str` on Python 3, `unicode` on Python 2).
    """
    if sys.version_info[0] == 3:
        if isinstance(text, str):
            return text
        if isinstance(text, bytes):
            return text.decode("utf-8", "ignore")
        raise ValueError("Unsupported text type `%s`: only `str` and `bytes` are supported." % type(text))
    if sys.version_info[0] == 2:
        if isinstance(text, str):
            return text.decode("utf-8", "ignore")
        if isinstance(text, unicode):
            return text
        raise ValueError("Unsupported text type `%s`: only `str` and `unicode` are supported." % type(text))
    raise ValueError("Only supported when running on Python 2 or Python 3.")
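# For example, convert_to_unicode(b"caf\xc3\xa9") returns "café"; byte
# sequences that are not valid UTF-8 are dropped by the "ignore" error handler.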


def load_vocab_file(vocab_file):
    """
    Loads a vocabulary file (one token per line) into a {token: id}
    dictionary, with ids assigned in line order.
    """
    vocab_dict = collections.OrderedDict()
    index = 0
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's default encoding.
    with open(vocab_file, "r", encoding="utf-8") as vocab:
        while True:
            token = convert_to_unicode(vocab.readline())
            if not token:
                break
            token = token.strip()
            vocab_dict[token] = index
            index += 1
    return vocab_dict
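# Illustration with a hypothetical vocab file containing the three lines
# "<unk>", "hello", "world" (in that order):
#   load_vocab_file("vocab.txt") -> {"<unk>": 0, "hello": 1, "world": 2}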


def convert_by_vocab_dict(vocab_dict, items):
    """
    Converts a sequence of tokens (or ids) according to `vocab_dict`.
    Items missing from the dictionary fall back to the `<unk>` entry, so
    `vocab_dict` is assumed to contain an `<unk>` key; note that this
    fallback only works in the token->id direction, where keys are strings.
    """
    output = []
    for item in items:
        if item in vocab_dict:
            output.append(vocab_dict[item])
        else:
            output.append(vocab_dict["<unk>"])
    return output
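# For instance, with the vocabulary above:
#   convert_by_vocab_dict(vocab, ["hello", "moon"]) -> [1, 0]
# ("moon" is out of vocabulary and maps to the id of "<unk>").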


class WhiteSpaceTokenizer:
    """
    Whitespace tokenizer.
    """
    def __init__(self, vocab_file):
        self.vocab_dict = load_vocab_file(vocab_file)
        self.inv_vocab_dict = {index: token for token, index in self.vocab_dict.items()}

    def _is_whitespace_char(self, char):
        """
        Checks whether `char` is a whitespace character ("\t", "\n" and "\r"
        are treated as whitespace here).
        """
        if char in (" ", "\t", "\n", "\r"):
            return True
        # Unicode category "Zs" covers space separators.
        if unicodedata.category(char) == "Zs":
            return True
        return False

    def _is_control_char(self, char):
        """
        Checks whether `char` is a control character ("\t", "\n" and "\r"
        are treated as whitespace, not control).
        """
        if char in ("\t", "\n", "\r"):
            return False
        # Unicode categories "Cc" (control) and "Cf" (format).
        if unicodedata.category(char) in ("Cc", "Cf"):
            return True
        return False

    def _clean_text(self, text):
        """
        Removes invalid characters and normalizes all whitespace to spaces.
        """
        output = []
        for char in text:
            cp = ord(char)
            # Drop NUL, the Unicode replacement character U+FFFD, and
            # control characters.
            if cp == 0 or cp == 0xfffd or self._is_control_char(char):
                continue
            if self._is_whitespace_char(char):
                output.append(" ")
            else:
                output.append(char)
        return "".join(output)

    def _whitespace_tokenize(self, text):
        """
        Lowercases `text`, splits a trailing period into its own token, and
        splits the result on whitespace.
        """
        text = text.strip()
        text = text.lower()
        # Detach only the trailing period so it becomes its own token;
        # interior periods are left untouched.
        if text.endswith("."):
            text = text[:-1] + " ."
        # str.split() without arguments already returns [] for empty text.
        return text.split()
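
    # Illustration: _whitespace_tokenize("Hello  World.") returns
    # ["hello", "world", "."].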

    def tokenize(self, text):
        """
        Tokenizes `text`.
        """
        text = convert_to_unicode(text)
        text = self._clean_text(text)
        return self._whitespace_tokenize(text)

    def convert_tokens_to_ids(self, tokens):
        """Converts a list of tokens to vocabulary ids."""
        return convert_by_vocab_dict(self.vocab_dict, tokens)

    def convert_ids_to_tokens(self, ids):
        """Converts a list of vocabulary ids back to tokens."""
        return convert_by_vocab_dict(self.inv_vocab_dict, ids)
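

# A minimal usage sketch. Assumptions: "vocab.txt" is an illustrative path
# to a one-token-per-line vocabulary that includes "<unk>"; it is not
# shipped with this module.
if __name__ == "__main__":
    tokenizer = WhiteSpaceTokenizer("vocab.txt")
    demo_tokens = tokenizer.tokenize("Hello world.")  # e.g. ["hello", "world", "."]
    demo_ids = tokenizer.convert_tokens_to_ids(demo_tokens)
    print(demo_tokens)
    print(demo_ids)
    print(tokenizer.convert_ids_to_tokens(demo_ids))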