|
- from typing import Dict, List
-
- from allennlp.common.checks import ConfigurationError
- # from allennlp.common.params import Params
- from allennlp.common.util import pad_sequence_to_length
- from allennlp.data.tokenizers.token import Token
- from allennlp.data.token_indexers.token_indexer import TokenIndexer
- from allennlp.data.vocabulary import Vocabulary
-
-
- def _make_bos_eos(
- character: int,
- padding_character: int,
- beginning_of_word_character: int,
- end_of_word_character: int,
- max_word_length: int
- ):
- char_ids = [padding_character] * max_word_length
- char_ids[0] = beginning_of_word_character
- char_ids[1] = character
- char_ids[2] = end_of_word_character
- return char_ids
-
class ELMoCharacterMapper:
    """
    Translates a single token into the fixed-length sequence of character ids
    that ELMo consumes.

    The id layout is kept identical to the one used by previously trained
    models, which is why this mapper exists as a special case next to the
    other character indexers.
    """
    # Every token is represented by exactly this many character ids.
    max_word_length = 50

    # Ids 0-255 are the raw utf-8 byte values of the token text;
    # ids 256-300 are set aside for the special symbols below.
    beginning_of_sentence_character = 256  # <begin sentence>
    end_of_sentence_character = 257  # <end sentence>
    beginning_of_word_character = 258  # <begin word>
    end_of_word_character = 259  # <end word>
    padding_character = 260  # <padding>

    # Pre-built rows for the two sentence-boundary tokens: the boundary id is
    # wrapped in begin-word / end-word markers and padded to full width.
    beginning_of_sentence_characters = _make_bos_eos(
            beginning_of_sentence_character,
            padding_character,
            beginning_of_word_character,
            end_of_word_character,
            max_word_length
    )
    end_of_sentence_characters = _make_bos_eos(
            end_of_sentence_character,
            padding_character,
            beginning_of_word_character,
            end_of_word_character,
            max_word_length
    )

    # Token texts that are mapped to the pre-built boundary rows above.
    bos_token = '<S>'
    eos_token = '</S>'

    @staticmethod
    def convert_word_to_char_ids(word: str) -> List[int]:
        """
        Return the ``max_word_length`` character ids for ``word``.

        ``bos_token`` and ``eos_token`` map to their pre-built rows.  Any other
        word is utf-8 encoded, cut to ``max_word_length - 2`` bytes, wrapped in
        begin-word / end-word markers and right-padded with the padding id.
        Every id in the result is shifted up by one.
        """
        mapper = ELMoCharacterMapper
        if word == mapper.bos_token:
            raw_ids = mapper.beginning_of_sentence_characters
        elif word == mapper.eos_token:
            raw_ids = mapper.end_of_sentence_characters
        else:
            # Two slots are reserved for the begin-word and end-word markers.
            payload = word.encode('utf-8', 'ignore')[:mapper.max_word_length - 2]
            filler_count = mapper.max_word_length - len(payload) - 2
            raw_ids = (
                    [mapper.beginning_of_word_character]
                    + list(payload)
                    + [mapper.end_of_word_character]
                    + [mapper.padding_character] * filler_count
            )

        # Shift by one for masking: no real character id is ever 0.
        # Building a new list also leaves the shared boundary rows untouched.
        return [raw_id + 1 for raw_id in raw_ids]
-
-
@TokenIndexer.register("elmo_characters")
class ELMoTokenCharactersIndexer(TokenIndexer[List[int]]):
    """
    Indexes each token as a list of character ids suitable for computing
    ELMo representations.

    Parameters
    ----------
    namespace : ``str``, optional (default=``elmo_characters``)
        Stored on the indexer as ``_namespace``; it is not read anywhere else
        in this class.
    """
    # pylint: disable=no-self-use
    def __init__(self,
                 namespace: str = 'elmo_characters') -> None:
        self._namespace = namespace

    def count_vocab_items(self, token: Token, counter: Dict[str, Dict[str, int]]):
        # Intentionally a no-op: this indexer adds nothing to ``counter``.
        pass

    def token_to_indices(self, token: Token, vocabulary: Vocabulary) -> List[int]:
        """
        Return the ELMo character ids for ``token.text``.

        Raises ``ConfigurationError`` when the token carries no text.
        """
        # pylint: disable=unused-argument
        text = token.text
        if text is not None:
            return ELMoCharacterMapper.convert_word_to_char_ids(text)
        raise ConfigurationError('ELMoTokenCharactersIndexer needs a tokenizer '
                                 'that retains text')

    def get_padding_lengths(self, token: List[int]) -> Dict[str, int]:
        # pylint: disable=unused-argument
        # No per-token padding lengths are reported by this indexer.
        return {}

    def get_padding_token(self) -> List[int]:
        return []

    @staticmethod
    def _default_value_for_padding():
        """Return the all-zero row used to fill in for a padded token."""
        row_width = ELMoCharacterMapper.max_word_length
        return [0] * row_width

    def pad_token_sequence(self,
                           tokens: List[List[int]],
                           desired_num_tokens: int,
                           padding_lengths: Dict[str, int]) -> List[List[int]]:
        """
        Pad ``tokens`` to ``desired_num_tokens`` entries via
        ``pad_sequence_to_length``, using all-zero rows as the fill value.
        """
        # pylint: disable=unused-argument
        # The factory itself is passed (not its result), exactly as the helper is called.
        filler_factory = self._default_value_for_padding
        return pad_sequence_to_length(tokens, desired_num_tokens,
                                      default_value=filler_factory)
-
- # @classmethod
- # def from_params(cls, params: Params) -> 'ELMoTokenCharactersIndexer':
- # """
- # Parameters
- # ----------
- # namespace : ``str``, optional (default=``elmo_characters``)
- # """
- # namespace = params.pop('namespace', 'elmo_characters')
- # params.assert_empty(cls.__name__)
- # return cls(namespace=namespace)
|