# -*- coding: utf-8 -*-
"""Utilities for text input preprocessing.

May benefit from a fast Cython rewrite.
"""
from __future__ import absolute_import
from __future__ import division

import string
import sys

import numpy as np
from six.moves import range
from six.moves import zip

if sys.version_info < (3,):
    maketrans = string.maketrans
else:
    maketrans = str.maketrans


def text_to_word_sequence(text,
                          filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                          lower=True, split=" "):
    """Converts a text to a sequence of words (tokens).

    # Arguments
        text: Input text (string).
        filters: Sequence of characters to filter out.
        lower: Whether to convert the input to lowercase.
        split: Word split marker (string).

    # Returns
        A list of words (strings).
    """
    if lower:
        text = text.lower()
    text = text.translate(maketrans(filters, split * len(filters)))
    seq = text.split(split)
    return [i for i in seq if i]


def one_hot(text, n,
            filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
            lower=True,
            split=' '):
    """Converts a text to a sequence of word indices using the hashing trick.

    Each word is mapped to an integer in `[1, n)`; distinct words may
    collide, so `n` should be substantially larger than the expected
    vocabulary size.

    # Arguments
        text: Input text (string).
        n: Dimension of the hashing space.
        filters: Sequence of characters to filter out.
        lower: Whether to convert the input to lowercase.
        split: Word split marker (string).

    # Returns
        A list of integer word indices (unicity non-guaranteed).
    """
    seq = text_to_word_sequence(text,
                                filters=filters,
                                lower=lower,
                                split=split)
    return [(abs(hash(w)) % (n - 1) + 1) for w in seq]
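

# A quick illustrative sketch of the two helpers above (not part of the
# module API; the `one_hot` indices shown are assumed values, since they
# depend on the interpreter's hash function):
#
#     >>> text_to_word_sequence('Hello, world! Hello.')
#     ['hello', 'world', 'hello']
#     >>> one_hot('Hello, world! Hello.', n=50)
#     [23, 7, 23]  # repeated words always map to the same index in [1, n)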


class Tokenizer(object):
    """Text tokenization utility class.

    This class allows vectorizing a text corpus, by turning each
    text into either a sequence of integers (each integer being the index
    of a token in a dictionary) or into a vector where the coefficient
    for each token could be binary, based on word count, based on tf-idf...

    # Arguments
        nb_words: the maximum number of words to keep, based
            on word frequency. Only the most common `nb_words` words will
            be kept.
        filters: a string where each element is a character that will be
            filtered from the texts. The default is all punctuation, plus
            tabs and line breaks, minus the `'` character.
        lower: boolean. Whether to convert the texts to lowercase.
        split: character or string to use for token splitting.
        char_level: if True, every character will be treated as a token.

    By default, all punctuation is removed, turning the texts into
    space-separated sequences of words
    (words may include the `'` character). These sequences are then
    split into lists of tokens. They will then be indexed or vectorized.

    `0` is a reserved index that won't be assigned to any word.
    """

    def __init__(self, nb_words=None,
                 filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                 lower=True,
                 split=' ',
                 char_level=False):
        self.word_counts = {}
        self.word_docs = {}
        self.filters = filters
        self.split = split
        self.lower = lower
        self.nb_words = nb_words
        self.document_count = 0
        self.char_level = char_level
        # Initialized empty so that an unfit Tokenizer raises the documented
        # ValueError (rather than an AttributeError) in `sequences_to_matrix`.
        self.word_index = {}
        self.index_docs = {}

    def fit_on_texts(self, texts):
        """Updates internal vocabulary based on a list of texts.

        Required before using `texts_to_sequences` or `texts_to_matrix`.

        # Arguments
            texts: can be a list of strings,
                or a generator of strings (for memory-efficiency)
        """
        self.document_count = 0
        for text in texts:
            self.document_count += 1
            seq = text if self.char_level else text_to_word_sequence(text,
                                                                     self.filters,
                                                                     self.lower,
                                                                     self.split)
            for w in seq:
                if w in self.word_counts:
                    self.word_counts[w] += 1
                else:
                    self.word_counts[w] = 1
            for w in set(seq):
                if w in self.word_docs:
                    self.word_docs[w] += 1
                else:
                    self.word_docs[w] = 1

        wcounts = list(self.word_counts.items())
        wcounts.sort(key=lambda x: x[1], reverse=True)
        sorted_voc = [wc[0] for wc in wcounts]
        # note that index 0 is reserved, never assigned to an existing word
        self.word_index = dict(list(zip(sorted_voc,
                                        list(range(1, len(sorted_voc) + 1)))))

        self.index_docs = {}
        for w, c in list(self.word_docs.items()):
            self.index_docs[self.word_index[w]] = c

    def fit_on_sequences(self, sequences):
        """Updates internal vocabulary based on a list of sequences.

        Required before using `sequences_to_matrix`
        (if `fit_on_texts` was never called).

        # Arguments
            sequences: A list of sequences.
                A "sequence" is a list of integer word indices.
        """
        self.document_count = len(sequences)
        self.index_docs = {}
        for seq in sequences:
            seq = set(seq)
            for i in seq:
                if i not in self.index_docs:
                    self.index_docs[i] = 1
                else:
                    self.index_docs[i] += 1

    def texts_to_sequences(self, texts):
        """Transforms each text in `texts` to a sequence of integers.

        Only the top `nb_words` most frequent words will be taken into
        account. Only words known by the tokenizer will be taken into account.

        # Arguments
            texts: A list of texts (strings).

        # Returns
            A list of sequences.
        """
        res = []
        for vect in self.texts_to_sequences_generator(texts):
            res.append(vect)
        return res

    def texts_to_sequences_generator(self, texts):
        """Transforms each text in `texts` to a sequence of integers.

        Only the top `nb_words` most frequent words will be taken into
        account. Only words known by the tokenizer will be taken into account.

        # Arguments
            texts: A list of texts (strings).

        # Yields
            Yields individual sequences.
        """
        nb_words = self.nb_words
        for text in texts:
            seq = text if self.char_level else text_to_word_sequence(text,
                                                                     self.filters,
                                                                     self.lower,
                                                                     self.split)
            vect = []
            for w in seq:
                i = self.word_index.get(w)
                if i is not None:
                    if nb_words and i >= nb_words:
                        continue
                    else:
                        vect.append(i)
            yield vect

    def texts_to_matrix(self, texts, mode='binary'):
        """Converts a list of texts to a Numpy matrix.

        # Arguments
            texts: list of strings.
            mode: one of "binary", "count", "tfidf", "freq".

        # Returns
            A Numpy matrix.
        """
        sequences = self.texts_to_sequences(texts)
        return self.sequences_to_matrix(sequences, mode=mode)
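    # Worked sketch of the matrix modes below (numbers are illustrative):
    # for a sequence [1, 1, 2] of length 3 with nb_words=3, row i of the
    # output is
    #   binary: [0, 1, 1]        (token present or not)
    #   count:  [0, 2, 1]        (raw term counts)
    #   freq:   [0, 2/3, 1/3]    (counts normalized by sequence length)
    #   tfidf:  per token, (1 + log(count)) *
    #           log(1 + document_count / (1 + n_docs_containing_token))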
    def sequences_to_matrix(self, sequences, mode='binary'):
        """Converts a list of sequences into a Numpy matrix.

        # Arguments
            sequences: list of sequences
                (a sequence is a list of integer word indices).
            mode: one of "binary", "count", "tfidf", "freq".

        # Returns
            A Numpy matrix.

        # Raises
            ValueError: In case of invalid `mode` argument,
                or if the Tokenizer requires to be fit to sample data.
        """
        if not self.nb_words:
            if self.word_index:
                nb_words = len(self.word_index) + 1
            else:
                raise ValueError('Specify a dimension (nb_words argument), '
                                 'or fit on some text data first.')
        else:
            nb_words = self.nb_words

        if mode == 'tfidf' and not self.document_count:
            raise ValueError('Fit the Tokenizer on some data '
                             'before using tfidf mode.')

        x = np.zeros((len(sequences), nb_words))
        for i, seq in enumerate(sequences):
            if not seq:
                continue
            counts = {}
            for j in seq:
                if j >= nb_words:
                    continue
                if j not in counts:
                    counts[j] = 1.
                else:
                    counts[j] += 1
            for j, c in list(counts.items()):
                if mode == 'count':
                    x[i][j] = c
                elif mode == 'freq':
                    x[i][j] = c / len(seq)
                elif mode == 'binary':
                    x[i][j] = 1
                elif mode == 'tfidf':
                    # Use weighting scheme 2 in
                    # https://en.wikipedia.org/wiki/Tf%E2%80%93idf
                    tf = 1 + np.log(c)
                    idf = np.log(1 + self.document_count /
                                 (1 + self.index_docs.get(j, 0)))
                    x[i][j] = tf * idf
                else:
                    raise ValueError('Unknown vectorization mode: ' + str(mode))
        return x
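

# A minimal end-to-end sketch (illustrative only: the corpus and `nb_words`
# value below are assumptions, not module behavior guarantees):
if __name__ == '__main__':
    corpus = ['The cat sat on the mat.',
              'The dog ate my homework.']
    tokenizer = Tokenizer(nb_words=None)
    tokenizer.fit_on_texts(corpus)
    # Indices start at 1, ordered by descending frequency; 0 is reserved.
    print(tokenizer.word_index)
    # Unknown words ('saw', 'bird') are silently dropped from the sequence.
    print(tokenizer.texts_to_sequences(['the cat saw the bird']))
    # One row per input text, vocabulary size + 1 columns.
    print(tokenizer.texts_to_matrix(corpus, mode='tfidf').shape)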