- #! /usr/bin/python
- # -*- coding: utf-8 -*-
-
- import collections
- import os
- import random
- import re
- import subprocess
- import tempfile
- import warnings
- from collections import Counter
-
- import numpy as np
- import six as _six
- import tensorflow as tf
- from six.moves import urllib, xrange
- from tensorflow.python.platform import gfile
-
- import tensorlayer as tl
- from tensorlayer.lazy_imports import LazyImport
-
- nltk = LazyImport("nltk")
-
- __all__ = [
- 'generate_skip_gram_batch',
- 'sample',
- 'sample_top',
- 'SimpleVocabulary',
- 'Vocabulary',
- 'process_sentence',
- 'create_vocab',
- 'simple_read_words',
- 'read_words',
- 'read_analogies_file',
- 'build_vocab',
- 'build_reverse_dictionary',
- 'build_words_dataset',
- 'words_to_word_ids',
- 'word_ids_to_words',
- 'save_vocab',
- 'basic_tokenizer',
- 'create_vocabulary',
- 'initialize_vocabulary',
- 'sentence_to_token_ids',
- 'data_to_token_ids',
- 'moses_multi_bleu',
- ]
-
-
- def as_bytes(bytes_or_text, encoding='utf-8'):
- """Converts either bytes or unicode to `bytes`, using utf-8 encoding for text.
- Args:
- bytes_or_text: A `bytes`, `str`, or `unicode` object.
- encoding: A string indicating the charset for encoding unicode.
- Returns:
- A `bytes` object.
- Raises:
- TypeError: If `bytes_or_text` is not a binary or unicode string.
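- Examples:
- A minimal illustrative call:
- >>> as_bytes('hello')
- b'hello'
- >>> as_bytes(b'hello')
- b'hello'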
- """
- if isinstance(bytes_or_text, _six.text_type):
- return bytes_or_text.encode(encoding)
- elif isinstance(bytes_or_text, bytes):
- return bytes_or_text
- else:
- raise TypeError('Expected binary or unicode string, got %r' % (bytes_or_text, ))
-
-
- def as_text(bytes_or_text, encoding='utf-8'):
- """Returns the given argument as a unicode string.
- Args:
- bytes_or_text: A `bytes`, `str`, or `unicode` object.
- encoding: A string indicating the charset for decoding unicode.
- Returns:
- A `unicode` (Python 2) or `str` (Python 3) object.
- Raises:
- TypeError: If `bytes_or_text` is not a binary or unicode string.
- """
- if isinstance(bytes_or_text, _six.text_type):
- return bytes_or_text
- elif isinstance(bytes_or_text, bytes):
- return bytes_or_text.decode(encoding)
- else:
- raise TypeError('Expected binary or unicode string, got %r' % bytes_or_text)
-
-
- def generate_skip_gram_batch(data, batch_size, num_skips, skip_window, data_index=0):
- """Generate a training batch for the Skip-Gram model.
-
- See `Word2Vec example <https://github.com/tensorlayer/tensorlayer/blob/master/example/tutorial_word2vec_basic.py>`__.
-
- Parameters
- ----------
- data : list of data
- To present context, usually a list of integers.
- batch_size : int
- Batch size to return.
- num_skips : int
- How many times to reuse an input to generate a label.
- skip_window : int
- How many words to consider left and right.
- data_index : int
- Index of the context location. This code uses `data_index` to track the position instead of a generator (``yield``) as in ``tl.iterate``.
-
- Returns
- -------
- batch : list of data
- Inputs.
- labels : list of data
- Labels
- data_index : int
- Index of the context location.
-
- Examples
- --------
- Setting num_skips=2, skip_window=1 uses the words on the immediate left and right as labels.
- Likewise, num_skips=4, skip_window=2 uses the 4 nearby words.
-
- >>> data = [1,2,3,4,5,6,7,8,9,10,11]
- >>> batch, labels, data_index = tl.nlp.generate_skip_gram_batch(data=data, batch_size=8, num_skips=2, skip_window=1, data_index=0)
- >>> print(batch)
- [2 2 3 3 4 4 5 5]
- >>> print(labels)
- [[3]
- [1]
- [4]
- [2]
- [5]
- [3]
- [4]
- [6]]
-
- """
- # Alternatively, `data_index` could be a global variable that is modified inside
- # the function without being returned.
- # Note: instead of using `yield`, this code tracks the position with `data_index`.
-
- if batch_size % num_skips != 0:
- raise Exception("batch_size should be able to be divided by num_skips.")
- if num_skips > 2 * skip_window:
- raise Exception("num_skips <= 2 * skip_window")
- batch = np.ndarray(shape=(batch_size), dtype=np.int32)
- labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
- span = 2 * skip_window + 1 # [ skip_window target skip_window ]
- buffer = collections.deque(maxlen=span)
- for _ in range(span):
- buffer.append(data[data_index])
- data_index = (data_index + 1) % len(data)
- for i in range(batch_size // num_skips):
- target = skip_window # target label at the center of the buffer
- targets_to_avoid = [skip_window]
- for j in range(num_skips):
- while target in targets_to_avoid:
- target = random.randint(0, span - 1)
- targets_to_avoid.append(target)
- batch[i * num_skips + j] = buffer[skip_window]
- labels[i * num_skips + j, 0] = buffer[target]
- buffer.append(data[data_index])
- data_index = (data_index + 1) % len(data)
- return batch, labels, data_index
-
-
- def sample(a=None, temperature=1.0):
- """Sample an index from a probability array.
-
- Parameters
- ----------
- a : list of float
- List of probabilities.
- temperature : float or None
- The higher the temperature, the more uniform the distribution. When a = [0.1, 0.2, 0.7],
- - temperature = 0.7, the distribution is sharpened to [0.05048273, 0.13588945, 0.81362782]
- - temperature = 1.0, the distribution stays the same [0.1, 0.2, 0.7]
- - temperature = 1.5, the distribution is flattened to [0.16008435, 0.25411807, 0.58579758]
- - If None, the function returns ``np.argmax(a)``
-
- Notes
- ------
- - Regardless of the temperature and the input list, the probabilities are normalized to sum to one. Even if the input list is [1, 100, 200], the sum of all probabilities will still be one.
- - For a large vocabulary size, choose a higher temperature or use ``tl.nlp.sample_top`` to avoid numerical errors.
-
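- Examples
- --------
- A minimal illustrative call; the index is drawn at random, so the output varies between runs:
-
- >>> a = [0.1, 0.2, 0.7]
- >>> idx = tl.nlp.sample(a, temperature=1.0)
- >>> print(idx in [0, 1, 2])
- True
-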
- """
- if a is None:
- raise Exception("a : list of float")
- b = np.copy(a)
- try:
- if temperature == 1:
- return np.argmax(np.random.multinomial(1, a, 1))
- if temperature is None:
- return np.argmax(a)
- else:
- a = np.log(a) / temperature
- a = np.exp(a) / np.sum(np.exp(a))
- return np.argmax(np.random.multinomial(1, a, 1))
- except Exception:
- # np.set_printoptions(threshold=np.nan)
- # tl.logging.info(a)
- # tl.logging.info(np.sum(a))
- # tl.logging.info(np.max(a))
- # tl.logging.info(np.min(a))
- # exit()
- message = "For large vocabulary_size, choice a higher temperature\
- to avoid log error. Hint : use ``sample_top``. "
-
- warnings.warn(message, Warning)
- # tl.logging.info(a)
- # tl.logging.info(b)
- return np.argmax(np.random.multinomial(1, b, 1))
-
-
- def sample_top(a=None, top_k=10):
- """Sample from ``top_k`` probabilities.
-
- Parameters
- ----------
- a : list of float
- List of probabilities.
- top_k : int
- Number of candidates to be considered.
-
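- Examples
- --------
- A minimal illustrative call; the result is sampled from the ``top_k`` largest probabilities, so it varies between runs:
-
- >>> a = np.array([0.05, 0.4, 0.05, 0.5])
- >>> idx = tl.nlp.sample_top(a, top_k=2)
- >>> print(idx in [1, 3])
- True
-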
- """
- if a is None:
- a = []
-
- a = np.asarray(a)  # ensure fancy indexing below also works when a plain list is passed
- idx = np.argpartition(a, -top_k)[-top_k:]
- probs = a[idx]
- # tl.logging.info("new %f" % probs)
- probs = probs / np.sum(probs)
- choice = np.random.choice(idx, p=probs)
- return choice
- # old implementation
- # a = np.array(a)
- # idx = np.argsort(a)[::-1]
- # idx = idx[:top_k]
- # # a = a[idx]
- # probs = a[idx]
- # tl.logging.info("prev %f" % probs)
- # # probs = probs / np.sum(probs)
- # # choice = np.random.choice(idx, p=probs)
- # # return choice
-
-
- # Vector representations of words (Advanced, undocumented)
- class SimpleVocabulary(object):
- """Simple vocabulary wrapper, see create_vocab().
-
- Parameters
- ------------
- vocab : dictionary
- A dictionary that maps word to ID.
- unk_id : int
- The ID for 'unknown' word.
-
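- Examples
- ---------
- A minimal illustrative usage with a hand-built vocabulary:
-
- >>> vocab = tl.nlp.SimpleVocabulary({'hello': 0, 'world': 1}, unk_id=2)
- >>> vocab.word_to_id('hello')
- 0
- >>> vocab.word_to_id('unseen-word')
- 2
-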
- """
-
- def __init__(self, vocab, unk_id):
- """Initialize the vocabulary."""
- self._vocab = vocab
- self._unk_id = unk_id
-
- def word_to_id(self, word):
- """Returns the integer id of a word string."""
- if word in self._vocab:
- return self._vocab[word]
- else:
- return self._unk_id
-
-
- class Vocabulary(object):
- """Create Vocabulary class from a given vocabulary and its id-word, word-id convert.
- See create_vocab() and ``tutorial_tfrecord3.py``.
-
- Parameters
- -----------
- vocab_file : str
- The file contains the vocabulary (can be created via ``tl.nlp.create_vocab``), where the words are the first whitespace-separated token on each line (other tokens are ignored) and the word ids are the corresponding line numbers.
- start_word : str
- Special word denoting sentence start.
- end_word : str
- Special word denoting sentence end.
- unk_word : str
- Special word denoting unknown words.
-
- Attributes
- ------------
- vocab : dictionary
- A dictionary that maps word to ID.
- reverse_vocab : list of str
- A list that maps ID to word.
- start_id : int
- For start ID.
- end_id : int
- For end ID.
- unk_id : int
- For unknown ID.
- pad_id : int
- For Padding ID.
-
- Examples
- -------------
- The vocab file looks like the following; it includes `start_word`, `end_word`, etc.
-
- >>> a 969108
- >>> <S> 586368
- >>> </S> 586368
- >>> . 440479
- >>> on 213612
- >>> of 202290
- >>> the 196219
- >>> in 182598
- >>> with 152984
- >>> and 139109
- >>> is 97322
-
- """
-
- def __init__(self, vocab_file, start_word="<S>", end_word="</S>", unk_word="<UNK>", pad_word="<PAD>"):
- if not tf.io.gfile.exists(vocab_file):
- tl.logging.fatal("Vocab file %s not found." % vocab_file)
- tl.logging.info("Initializing vocabulary from file: %s" % vocab_file)
-
- with tf.io.gfile.GFile(vocab_file, mode="r") as f:
- reverse_vocab = list(f.readlines())
- reverse_vocab = [line.split()[0] for line in reverse_vocab]
- # assert start_word in reverse_vocab
- # assert end_word in reverse_vocab
- if start_word not in reverse_vocab: # haodong
- reverse_vocab.append(start_word)
- if end_word not in reverse_vocab:
- reverse_vocab.append(end_word)
- if unk_word not in reverse_vocab:
- reverse_vocab.append(unk_word)
- if pad_word not in reverse_vocab:
- reverse_vocab.append(pad_word)
-
- vocab = dict([(x, y) for (y, x) in enumerate(reverse_vocab)])
-
- tl.logging.info("Vocabulary from %s : %s %s %s" % (vocab_file, start_word, end_word, unk_word))
- tl.logging.info(" vocabulary with %d words (includes start_word, end_word, unk_word)" % len(vocab))
- # tl.logging.info(" vocabulary with %d words" % len(vocab))
-
- self.vocab = vocab # vocab[word] = id
- self.reverse_vocab = reverse_vocab # reverse_vocab[id] = word
-
- # Save special word ids.
- self.start_id = vocab[start_word]
- self.end_id = vocab[end_word]
- self.unk_id = vocab[unk_word]
- self.pad_id = vocab[pad_word]
- tl.logging.info(" start_id: %d" % self.start_id)
- tl.logging.info(" end_id : %d" % self.end_id)
- tl.logging.info(" unk_id : %d" % self.unk_id)
- tl.logging.info(" pad_id : %d" % self.pad_id)
-
- def word_to_id(self, word):
- """Returns the integer word id of a word string."""
- if word in self.vocab:
- return self.vocab[word]
- else:
- return self.unk_id
-
- def id_to_word(self, word_id):
- """Returns the word string of an integer word id."""
- if word_id >= len(self.reverse_vocab):
- return self.reverse_vocab[self.unk_id]
- else:
- return self.reverse_vocab[word_id]
-
-
- def process_sentence(sentence, start_word="<S>", end_word="</S>"):
- """Seperate a sentence string into a list of string words, add start_word and end_word,
- see ``create_vocab()`` and ``tutorial_tfrecord3.py``.
-
- Parameters
- ----------
- sentence : str
- A sentence.
- start_word : str or None
- The start word. If None, no start word will be appended.
- end_word : str or None
- The end word. If None, no end word will be appended.
-
- Returns
- ---------
- list of str
- The sentence separated into a list of word strings.
-
- Examples
- -----------
- >>> c = "how are you?"
- >>> c = tl.nlp.process_sentence(c)
- >>> print(c)
- ['<S>', 'how', 'are', 'you', '?', '</S>']
-
- Notes
- -------
- - You have to install the following packages.
- - `Installing NLTK <http://www.nltk.org/install.html>`__
- - `Installing NLTK data <http://www.nltk.org/data.html>`__
-
- """
- if start_word is not None:
- processed_sentence = [start_word]
- else:
- processed_sentence = []
- processed_sentence.extend(nltk.tokenize.word_tokenize(sentence.lower()))
-
- if end_word is not None:
- processed_sentence.append(end_word)
- return processed_sentence
-
-
- def create_vocab(sentences, word_counts_output_file, min_word_count=1):
- """Creates the vocabulary of word to word_id.
-
- See ``tutorial_tfrecord3.py``.
-
- The vocabulary is saved to disk in a text file of word counts. The id of each
- word in the file is its corresponding 0-based line number.
-
- Parameters
- ------------
- sentences : list of list of str
- All sentences for creating the vocabulary.
- word_counts_output_file : str
- The file name.
- min_word_count : int
- Minimum number of occurrences for a word.
-
- Returns
- --------
- :class:`SimpleVocabulary`
- The simple vocabulary object, see :class:`Vocabulary` for more.
-
- Examples
- --------
- Pre-process sentences
-
- >>> captions = ["one two , three", "four five five"]
- >>> processed_capts = []
- >>> for c in captions:
- >>> c = tl.nlp.process_sentence(c, start_word="<S>", end_word="</S>")
- >>> processed_capts.append(c)
- >>> print(processed_capts)
- ...[['<S>', 'one', 'two', ',', 'three', '</S>'], ['<S>', 'four', 'five', 'five', '</S>']]
-
- Create vocabulary
-
- >>> tl.nlp.create_vocab(processed_capts, word_counts_output_file='vocab.txt', min_word_count=1)
- Creating vocabulary.
- Total words: 8
- Words in vocabulary: 8
- Wrote vocabulary file: vocab.txt
-
- Get vocabulary object
-
- >>> vocab = tl.nlp.Vocabulary('vocab.txt', start_word="<S>", end_word="</S>", unk_word="<UNK>")
- INFO:tensorflow:Initializing vocabulary from file: vocab.txt
- [TL] Vocabulary from vocab.txt : <S> </S> <UNK>
- vocabulary with 10 words (includes start_word, end_word, unk_word)
- start_id: 2
- end_id: 3
- unk_id: 9
- pad_id: 0
-
- """
- tl.logging.info("Creating vocabulary.")
-
- counter = Counter()
-
- for c in sentences:
- counter.update(c)
- # tl.logging.info('c',c)
- tl.logging.info(" Total words: %d" % len(counter))
-
- # Filter uncommon words and sort by descending count.
- word_counts = [x for x in counter.items() if x[1] >= min_word_count]
- word_counts.sort(key=lambda x: x[1], reverse=True)
- word_counts = [("<PAD>", 0)] + word_counts # 1st id should be reserved for padding
- # tl.logging.info(word_counts)
- tl.logging.info(" Words in vocabulary: %d" % len(word_counts))
-
- # Write out the word counts file.
- with tf.io.gfile.GFile(word_counts_output_file, "w") as f:
- f.write("\n".join(["%s %d" % (w, c) for w, c in word_counts]))
- tl.logging.info(" Wrote vocabulary file: %s" % word_counts_output_file)
-
- # Create the vocabulary dictionary.
- reverse_vocab = [x[0] for x in word_counts]
- unk_id = len(reverse_vocab)
- vocab_dict = dict([(x, y) for (y, x) in enumerate(reverse_vocab)])
- vocab = SimpleVocabulary(vocab_dict, unk_id)
-
- return vocab
-
-
- # Vector representations of words
- def simple_read_words(filename="nietzsche.txt"):
- """Read context from file without any preprocessing.
-
- Parameters
- ----------
- filename : str
- A file path (like .txt file)
-
- Returns
- --------
- str
- The context in a string.
-
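- Examples
- ---------
- A minimal illustrative call, assuming ``nietzsche.txt`` exists in the working directory:
-
- >>> context = tl.nlp.simple_read_words("nietzsche.txt")
- >>> print(context[:50])
-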
- """
- with open(filename, "r") as f:
- words = f.read()
- return words
-
-
- def read_words(filename="nietzsche.txt", replace=None):
- """Read list format context from a file.
-
- For customized read_words method, see ``tutorial_generate_text.py``.
-
- Parameters
- ----------
- filename : str
- a file path.
- replace : list of str
- replace original string by target string.
-
- Returns
- -------
- list of str
- The context in a list (split using space).
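-
- Examples
- ---------
- A minimal illustrative call (the file name is hypothetical); with the default ``replace``, newlines are replaced by ``<eos>`` before splitting:
-
- >>> context_list = tl.nlp.read_words("ptb.train.txt")
- >>> print(context_list[:5])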
- """
- if replace is None:
- replace = ['\n', '<eos>']
-
- with tf.io.gfile.GFile(filename, "r") as f:
- try: # when the file content is read as str
- context_list = f.read().replace(*replace).split()
- except Exception: # when the file content is read as bytes, the patterns must be encoded first
- f.seek(0)
- replace = [x.encode('utf-8') for x in replace]
- context_list = f.read().replace(*replace).split()
- return context_list
-
-
- def read_analogies_file(eval_file='questions-words.txt', word2id=None):
- """Reads through an analogy question file, return its id format.
-
- Parameters
- ----------
- eval_file : str
- The file name.
- word2id : dictionary
- a dictionary that maps word to ID.
-
- Returns
- --------
- numpy.array
- A ``[n_examples, 4]`` numpy array containing the analogy question's word IDs.
-
- Examples
- ---------
- The file should be in this format
-
- >>> : capital-common-countries
- >>> Athens Greece Baghdad Iraq
- >>> Athens Greece Bangkok Thailand
- >>> Athens Greece Beijing China
- >>> Athens Greece Berlin Germany
- >>> Athens Greece Bern Switzerland
- >>> Athens Greece Cairo Egypt
- >>> Athens Greece Canberra Australia
- >>> Athens Greece Hanoi Vietnam
- >>> Athens Greece Havana Cuba
-
- Get the tokenized analogy question data
-
- >>> words = tl.files.load_matt_mahoney_text8_dataset()
- >>> data, count, dictionary, reverse_dictionary = tl.nlp.build_words_dataset(words, vocabulary_size, True)
- >>> analogy_questions = tl.nlp.read_analogies_file(eval_file='questions-words.txt', word2id=dictionary)
- >>> print(analogy_questions)
- [[ 3068 1248 7161 1581]
- [ 3068 1248 28683 5642]
- [ 3068 1248 3878 486]
- ...,
- [ 1216 4309 19982 25506]
- [ 1216 4309 3194 8650]
- [ 1216 4309 140 312]]
-
- """
- if word2id is None:
- word2id = {}
-
- questions = []
- questions_skipped = 0
-
- with open(eval_file, "rb") as analogy_f:
- for line in analogy_f:
- if line.startswith(b":"): # Skip comments.
- continue
- words = line.strip().lower().split(b" ") # lowercase
- ids = [word2id.get(w.strip().decode()) for w in words]
- if None in ids or len(ids) != 4:
- questions_skipped += 1
- else:
- questions.append(np.array(ids))
- tl.logging.info("Eval analogy file: %s" % eval_file)
- tl.logging.info("Questions: %d", len(questions))
- tl.logging.info("Skipped: %d", questions_skipped)
- analogy_questions = np.array(questions, dtype=np.int32)
- return analogy_questions
-
-
- def build_vocab(data):
- """Build vocabulary.
-
- Given the context in list format.
- Return the vocabulary, which is a dictionary for word to id.
- e.g. {'campbell': 2587, 'atlantic': 2247, 'aoun': 6746 .... }
-
- Parameters
- ----------
- data : list of str
- The context in list format
-
- Returns
- --------
- dictionary
- that maps word to unique ID. e.g. {'campbell': 2587, 'atlantic': 2247, 'aoun': 6746 .... }
-
- References
- ---------------
- - `tensorflow.models.rnn.ptb.reader <https://github.com/tensorflow/tensorflow/tree/master/tensorflow/models/rnn/ptb>`_
-
- Examples
- --------
- >>> data_path = os.getcwd() + '/simple-examples/data'
- >>> train_path = os.path.join(data_path, "ptb.train.txt")
- >>> word_to_id = tl.nlp.build_vocab(tl.nlp.read_words(train_path))
-
- """
- # data = _read_words(filename)
- counter = collections.Counter(data)
- # tl.logging.info('counter %s' % counter) # dictionary for the occurrence number of each word, e.g. 'banknote': 1, 'photography': 1, 'kia': 1
- count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
- # tl.logging.info('count_pairs %s' % count_pairs) # convert dictionary to list of tuple, e.g. ('ssangyong', 1), ('swapo', 1), ('wachter', 1)
- words, _ = list(zip(*count_pairs))
- word_to_id = dict(zip(words, range(len(words))))
- # tl.logging.info(words) # list of words
- # tl.logging.info(word_to_id) # dictionary for word to id, e.g. 'campbell': 2587, 'atlantic': 2247, 'aoun': 6746
- return word_to_id
-
-
- def build_reverse_dictionary(word_to_id):
- """Given a dictionary that maps word to integer id.
- Returns a reverse dictionary that maps a id to word.
-
- Parameters
- ----------
- word_to_id : dictionary
- that maps word to ID.
-
- Returns
- --------
- dictionary
- A dictionary that maps IDs to words.
-
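- Examples
- ---------
- A minimal illustrative call:
-
- >>> word_to_id = {'hello': 0, 'world': 1}
- >>> tl.nlp.build_reverse_dictionary(word_to_id)
- {0: 'hello', 1: 'world'}
-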
- """
- reverse_dictionary = dict(zip(word_to_id.values(), word_to_id.keys()))
- return reverse_dictionary
-
-
- def build_words_dataset(words=None, vocabulary_size=50000, printable=True, unk_key='UNK'):
- """Build the words dictionary and replace rare words with 'UNK' token.
- The most common word has the smallest integer id.
-
- Parameters
- ----------
- words : list of str or byte
- The context in list format. You may need to do preprocessing on the words, such as lower case, remove marks etc.
- vocabulary_size : int
- The maximum vocabulary size; words beyond this limit are replaced with the 'UNK' token.
- printable : boolean
- Whether to print the read vocabulary size of the given words.
- unk_key : str
- Represent the unknown words.
-
- Returns
- --------
- data : list of int
- The context in a list of ID.
- count : list of tuple and list
- Pair words and IDs.
- - count[0] is a list [unk_key, n_rare_words] : the number of rare words
- - count[1:] are (word, count) tuples : the number of occurrences of each vocabulary word
- - e.g. [['UNK', 418391], (b'the', 1061396), (b'of', 593677), (b'and', 416629), (b'one', 411764)]
- dictionary : dictionary
- It is `word_to_id` that maps word to ID.
- reverse_dictionary : a dictionary
- It is `id_to_word` that maps ID to word.
-
- Examples
- --------
- >>> words = tl.files.load_matt_mahoney_text8_dataset()
- >>> vocabulary_size = 50000
- >>> data, count, dictionary, reverse_dictionary = tl.nlp.build_words_dataset(words, vocabulary_size)
-
- References
- -----------------
- - `tensorflow/examples/tutorials/word2vec/word2vec_basic.py <https://github.com/tensorflow/tensorflow/blob/r0.7/tensorflow/examples/tutorials/word2vec/word2vec_basic.py>`__
-
- """
- if words is None:
- raise Exception("words : list of str or byte")
-
- count = [[unk_key, -1]]
- count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
- dictionary = dict()
- for word, _ in count:
- dictionary[word] = len(dictionary)
- data = list()
- unk_count = 0
- for word in words:
- if word in dictionary:
- index = dictionary[word]
- else:
- index = 0 # dictionary['UNK']
- unk_count += 1
- data.append(index)
- count[0][1] = unk_count
- reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
- if printable:
- tl.logging.info('Real vocabulary size %d' % len(collections.Counter(words).keys()))
- tl.logging.info('Limited vocabulary size {}'.format(vocabulary_size))
- if len(collections.Counter(words).keys()) < vocabulary_size:
- raise Exception(
- "len(collections.Counter(words).keys()) >= vocabulary_size , the limited vocabulary_size must be less than or equal to the read vocabulary_size"
- )
- return data, count, dictionary, reverse_dictionary
-
-
- def words_to_word_ids(data=None, word_to_id=None, unk_key='UNK'):
- """Convert a list of string (words) to IDs.
-
- Parameters
- ----------
- data : list of string or byte
- The context in list format
- word_to_id : a dictionary
- that maps word to ID.
- unk_key : str
- Represent the unknown words.
-
- Returns
- --------
- list of int
- A list of IDs to represent the context.
-
- Examples
- --------
- >>> words = tl.files.load_matt_mahoney_text8_dataset()
- >>> vocabulary_size = 50000
- >>> data, count, dictionary, reverse_dictionary = tl.nlp.build_words_dataset(words, vocabulary_size, True)
- >>> context = [b'hello', b'how', b'are', b'you']
- >>> ids = tl.nlp.words_to_word_ids(words, dictionary)
- >>> context = tl.nlp.word_ids_to_words(ids, reverse_dictionary)
- >>> print(ids)
- [6434, 311, 26, 207]
- >>> print(context)
- [b'hello', b'how', b'are', b'you']
-
- References
- ---------------
- - `tensorflow.models.rnn.ptb.reader <https://github.com/tensorflow/tensorflow/tree/master/tensorflow/models/rnn/ptb>`__
-
- """
- if data is None:
- raise Exception("data : list of string or byte")
- if word_to_id is None:
- raise Exception("word_to_id : a dictionary")
- # if isinstance(data[0], six.string_types):
- # tl.logging.info(type(data[0]))
- # # exit()
- # tl.logging.info(data[0])
- # tl.logging.info(word_to_id)
- # return [word_to_id[str(word)] for word in data]
- # else:
-
- word_ids = []
- for word in data:
- if word_to_id.get(word) is not None:
- word_ids.append(word_to_id[word])
- else:
- word_ids.append(word_to_id[unk_key])
- return word_ids
- # return [word_to_id[word] for word in data] # this one
-
- # if isinstance(data[0], str):
- # # tl.logging.info('is a string object')
- # return [word_to_id[word] for word in data]
- # else:#if isinstance(s, bytes):
- # # tl.logging.info('is a unicode object')
- # # tl.logging.info(data[0])
- # return [word_to_id[str(word)] f
-
-
- def word_ids_to_words(data, id_to_word):
- """Convert a list of integer to strings (words).
-
- Parameters
- ----------
- data : list of int
- The context in list format.
- id_to_word : dictionary
- a dictionary that maps ID to word.
-
- Returns
- --------
- list of str
- A list of string or byte to represent the context.
-
- Examples
- ---------
- see ``tl.nlp.words_to_word_ids``
-
- """
- return [id_to_word[i] for i in data]
-
-
- def save_vocab(count=None, name='vocab.txt'):
- """Save the vocabulary to a file so the model can be reloaded.
-
- Parameters
- ----------
- count : a list of tuple and list
- count[0] is a list [unk_key, n_rare_words] : the number of rare words,
- count[1:] are (word, count) tuples : the number of occurrences of each word,
- e.g. [['UNK', 418391], (b'the', 1061396), (b'of', 593677), (b'and', 416629), (b'one', 411764)]
-
- Examples
- ---------
- >>> words = tl.files.load_matt_mahoney_text8_dataset()
- >>> vocabulary_size = 50000
- >>> data, count, dictionary, reverse_dictionary = tl.nlp.build_words_dataset(words, vocabulary_size, True)
- >>> tl.nlp.save_vocab(count, name='vocab_text8.txt')
-
- The saved file ``vocab_text8.txt`` looks like:
-
- UNK 418391
- the 1061396
- of 593677
- and 416629
- one 411764
- in 372201
- a 325873
- to 316376
-
- """
- if count is None:
- count = []
-
- pwd = os.getcwd()
- vocabulary_size = len(count)
- with open(os.path.join(pwd, name), "w") as f:
- for i in xrange(vocabulary_size):
- f.write("%s %d\n" % (as_text(count[i][0]), count[i][1]))
- tl.logging.info("%d vocab saved to %s in %s" % (vocabulary_size, name, pwd))
-
-
- # Functions for translation
-
-
- def basic_tokenizer(sentence, _WORD_SPLIT=re.compile(b"([.,!?\"':;)(])")):
- """Very basic tokenizer: split the sentence into a list of tokens.
-
- Parameters
- -----------
- sentence : str or bytes
- A line of text to tokenize, e.g. a line read from a ``gfile.GFile`` object.
- _WORD_SPLIT : regular expression for word splitting.
-
-
- Examples
- --------
- See ``create_vocabulary`` for how this tokenizer is typically used.
-
- >>> from tensorflow.python.platform import gfile
- >>> train_path = "wmt/giga-fren.release2"
- >>> with gfile.GFile(train_path + ".en", mode="rb") as f:
- >>> for line in f:
- >>> tokens = tl.nlp.basic_tokenizer(line)
- >>> tl.logging.info(tokens)
- >>> exit()
- [b'Changing', b'Lives', b'|', b'Changing', b'Society', b'|', b'How',
- b'It', b'Works', b'|', b'Technology', b'Drives', b'Change', b'Home',
- b'|', b'Concepts', b'|', b'Teachers', b'|', b'Search', b'|', b'Overview',
- b'|', b'Credits', b'|', b'HHCC', b'Web', b'|', b'Reference', b'|',
- b'Feedback', b'Virtual', b'Museum', b'of', b'Canada', b'Home', b'Page']
-
- References
- ----------
- - Code from ``/tensorflow/models/rnn/translation/data_utils.py``
-
- """
- words = []
- sentence = as_bytes(sentence)
- for space_separated_fragment in sentence.strip().split():
- words.extend(re.split(_WORD_SPLIT, space_separated_fragment))
- return [w for w in words if w]
-
-
- def create_vocabulary(
- vocabulary_path, data_path, max_vocabulary_size, tokenizer=None, normalize_digits=True,
- _DIGIT_RE=re.compile(br"\d"), _START_VOCAB=None
- ):
- r"""Create vocabulary file (if it does not exist yet) from data file.
-
- Data file is assumed to contain one sentence per line. Each sentence is
- tokenized and digits are normalized (if normalize_digits is set).
- Vocabulary contains the most-frequent tokens up to max_vocabulary_size.
- We write it to vocabulary_path in a one-token-per-line format, so that the
- token in the first line gets id=0, the second line gets id=1, and so on.
-
- Parameters
- -----------
- vocabulary_path : str
- Path where the vocabulary will be created.
- data_path : str
- Data file that will be used to create vocabulary.
- max_vocabulary_size : int
- Limit on the size of the created vocabulary.
- tokenizer : function
- A function to use to tokenize each data sentence. If None, basic_tokenizer will be used.
- normalize_digits : boolean
- If true, all digits are replaced by `0`.
- _DIGIT_RE : regular expression function
- Default is ``re.compile(br"\d")``.
- _START_VOCAB : list of str
- The pad, go, eos and unk token, default is ``[b"_PAD", b"_GO", b"_EOS", b"_UNK"]``.
-
- References
- ----------
- - Code from ``/tensorflow/models/rnn/translation/data_utils.py``
-
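- Examples
- ---------
- A minimal illustrative call; the vocabulary path is hypothetical, the data file (as in the ``basic_tokenizer`` example) is assumed to contain one sentence per line:
-
- >>> tl.nlp.create_vocabulary("wmt/vocab40000.en", "wmt/giga-fren.release2.en",
- ...                          max_vocabulary_size=40000, normalize_digits=True)
-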
- """
- if _START_VOCAB is None:
- _START_VOCAB = [b"_PAD", b"_GO", b"_EOS", b"_UNK"]
- if not gfile.Exists(vocabulary_path):
- tl.logging.info("Creating vocabulary %s from data %s" % (vocabulary_path, data_path))
- vocab = {}
- with gfile.GFile(data_path, mode="rb") as f:
- counter = 0
- for line in f:
- counter += 1
- if counter % 100000 == 0:
- tl.logging.info(" processing line %d" % counter)
- tokens = tokenizer(line) if tokenizer else basic_tokenizer(line)
- for w in tokens:
- word = re.sub(_DIGIT_RE, b"0", w) if normalize_digits else w
- if word in vocab:
- vocab[word] += 1
- else:
- vocab[word] = 1
- vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True)
- if len(vocab_list) > max_vocabulary_size:
- vocab_list = vocab_list[:max_vocabulary_size]
- with gfile.GFile(vocabulary_path, mode="wb") as vocab_file:
- for w in vocab_list:
- vocab_file.write(w + b"\n")
- else:
- tl.logging.info("Vocabulary %s from data %s exists" % (vocabulary_path, data_path))
-
-
- def initialize_vocabulary(vocabulary_path):
- """Initialize vocabulary from file, return the `word_to_id` (dictionary)
- and `id_to_word` (list).
-
- We assume the vocabulary is stored one item per line, so a file containing ``dog`` on the first line and ``cat`` on the second will result in the vocabulary {"dog": 0, "cat": 1}, and this function will also return the reversed vocabulary ["dog", "cat"].
-
- Parameters
- -----------
- vocabulary_path : str
- Path to the file containing the vocabulary.
-
- Returns
- --------
- vocab : dictionary
- a dictionary that maps word to ID.
- rev_vocab : list of bytes
- a list that maps ID to word.
-
- Examples
- ---------
- Assume the file ``test`` contains:
-
- dog
- cat
- bird
-
- >>> vocab, rev_vocab = tl.nlp.initialize_vocabulary("test")
- >>> print(vocab)
- {b'cat': 1, b'dog': 0, b'bird': 2}
- >>> print(rev_vocab)
- [b'dog', b'cat', b'bird']
-
- Raises
- -------
- ValueError : if the provided vocabulary_path does not exist.
-
- """
- if gfile.Exists(vocabulary_path):
- rev_vocab = []
- with gfile.GFile(vocabulary_path, mode="rb") as f:
- rev_vocab.extend(f.readlines())
- rev_vocab = [as_bytes(line.strip()) for line in rev_vocab]
- vocab = dict([(x, y) for (y, x) in enumerate(rev_vocab)])
- return vocab, rev_vocab
- else:
- raise ValueError("Vocabulary file %s not found.", vocabulary_path)
-
-
- def sentence_to_token_ids(
- sentence, vocabulary, tokenizer=None, normalize_digits=True, UNK_ID=3, _DIGIT_RE=re.compile(br"\d")
- ):
- """Convert a string to list of integers representing token-ids.
-
- For example, a sentence "I have a dog" may become tokenized into
- ["I", "have", "a", "dog"] and with vocabulary {"I": 1, "have": 2,
- "a": 4, "dog": 7"} this function will return [1, 2, 4, 7].
-
- Parameters
- -----------
- sentence : bytes or str
- The sentence in bytes format to convert to token-ids, see ``basic_tokenizer()`` and ``data_to_token_ids()``.
- vocabulary : dictionary
- A dictionary mapping tokens to integers.
- tokenizer : function
- A function to use to tokenize each sentence. If None, ``basic_tokenizer`` will be used.
- normalize_digits : boolean
- If true, all digits are replaced by 0.
-
- Returns
- --------
- list of int
- The token-ids for the sentence.
-
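- Examples
- ---------
- A minimal illustrative call with a hand-built vocabulary (keys are bytes because ``basic_tokenizer`` returns bytes tokens):
-
- >>> vocab = {b'I': 1, b'have': 2, b'a': 4, b'dog': 7}
- >>> tl.nlp.sentence_to_token_ids(b"I have a dog", vocab, UNK_ID=3)
- [1, 2, 4, 7]
-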
- """
- if tokenizer:
- words = tokenizer(sentence)
- else:
- words = basic_tokenizer(sentence)
- if not normalize_digits:
- return [vocabulary.get(w, UNK_ID) for w in words]
- # Normalize digits by 0 before looking words up in the vocabulary.
- return [vocabulary.get(re.sub(_DIGIT_RE, b"0", w), UNK_ID) for w in words]
-
-
- def data_to_token_ids(
- data_path, target_path, vocabulary_path, tokenizer=None, normalize_digits=True, UNK_ID=3,
- _DIGIT_RE=re.compile(br"\d")
- ):
- """Tokenize data file and turn into token-ids using given vocabulary file.
-
- This function loads data line-by-line from data_path, calls the above
- sentence_to_token_ids, and saves the result to target_path. See comment
- for sentence_to_token_ids on the details of token-ids format.
-
- Parameters
- -----------
- data_path : str
- Path to the data file in one-sentence-per-line format.
- target_path : str
- Path where the file with token-ids will be created.
- vocabulary_path : str
- Path to the vocabulary file.
- tokenizer : function
- A function to use to tokenize each sentence. If None, ``basic_tokenizer`` will be used.
- normalize_digits : boolean
- If true, all digits are replaced by 0.
-
- References
- ----------
- - Code from ``/tensorflow/models/rnn/translation/data_utils.py``
-
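- Examples
- ---------
- A minimal illustrative call; the paths are hypothetical and the vocabulary file is assumed to be created with ``create_vocabulary``:
-
- >>> tl.nlp.data_to_token_ids("wmt/giga-fren.release2.en", "wmt/giga-fren.release2.ids40000.en",
- ...                          "wmt/vocab40000.en", normalize_digits=True)
-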
- """
- if not gfile.Exists(target_path):
- tl.logging.info("Tokenizing data in %s" % data_path)
- vocab, _ = initialize_vocabulary(vocabulary_path)
- with gfile.GFile(data_path, mode="rb") as data_file:
- with gfile.GFile(target_path, mode="w") as tokens_file:
- counter = 0
- for line in data_file:
- counter += 1
- if counter % 100000 == 0:
- tl.logging.info(" tokenizing line %d" % counter)
- token_ids = sentence_to_token_ids(
- line, vocab, tokenizer, normalize_digits, UNK_ID=UNK_ID, _DIGIT_RE=_DIGIT_RE
- )
- tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n")
- else:
- tl.logging.info("Target path %s exists" % target_path)
-
-
- def moses_multi_bleu(hypotheses, references, lowercase=False):
- """Calculate the bleu score for hypotheses and references
- using the MOSES ulti-bleu.perl script.
-
- Parameters
- ------------
- hypotheses : numpy.array of str
- A numpy array of strings where each string is a single example.
- references : numpy.array of str
- A numpy array of strings where each string is a single example.
- lowercase : boolean
- If True, pass the "-lc" flag to the multi-bleu script
-
- Examples
- ---------
- >>> hypotheses = ["a bird is flying on the sky"]
- >>> references = ["two birds are flying on the sky", "a bird is on the top of the tree", "an airplane is on the sky",]
- >>> score = tl.nlp.moses_multi_bleu(hypotheses, references)
-
- Returns
- --------
- float
- The BLEU score
-
- References
- ----------
- - `Google/seq2seq/metric/bleu <https://github.com/google/seq2seq>`__
-
- """
- if np.size(hypotheses) == 0:
- return np.float32(0.0)
-
- # Get MOSES multi-bleu script
- try:
- multi_bleu_path, _ = urllib.request.urlretrieve(
- "https://raw.githubusercontent.com/moses-smt/mosesdecoder/"
- "master/scripts/generic/multi-bleu.perl"
- )
- os.chmod(multi_bleu_path, 0o755)
- except Exception: # pylint: disable=W0702
- tl.logging.info("Unable to fetch multi-bleu.perl script, using local.")
- metrics_dir = os.path.dirname(os.path.realpath(__file__))
- bin_dir = os.path.abspath(os.path.join(metrics_dir, "..", "..", "bin"))
- multi_bleu_path = os.path.join(bin_dir, "tools/multi-bleu.perl")
-
- # Dump hypotheses and references to tempfiles
- hypothesis_file = tempfile.NamedTemporaryFile()
- hypothesis_file.write("\n".join(hypotheses).encode("utf-8"))
- hypothesis_file.write(b"\n")
- hypothesis_file.flush()
- reference_file = tempfile.NamedTemporaryFile()
- reference_file.write("\n".join(references).encode("utf-8"))
- reference_file.write(b"\n")
- reference_file.flush()
-
- # Calculate BLEU using multi-bleu script
- with open(hypothesis_file.name, "r") as read_pred:
- bleu_cmd = [multi_bleu_path]
- if lowercase:
- bleu_cmd += ["-lc"]
- bleu_cmd += [reference_file.name]
- try:
- bleu_out = subprocess.check_output(bleu_cmd, stdin=read_pred, stderr=subprocess.STDOUT)
- bleu_out = bleu_out.decode("utf-8")
- bleu_score = re.search(r"BLEU = (.+?),", bleu_out).group(1)
- bleu_score = float(bleu_score)
- except subprocess.CalledProcessError as error:
- if error.output is not None:
- tl.logging.warning("multi-bleu.perl script returned non-zero exit code")
- tl.logging.warning(error.output)
- bleu_score = np.float32(0.0)
-
- # Close temp files
- hypothesis_file.close()
- reference_file.close()
-
- return np.float32(bleu_score)