Source code for bacalhau.text

import re

from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer


class Text(object):
    """Represents a text unit from a `bacalhau.document.Document`."""

    def __init__(self, text_id, content, tokenizer, stopwords):
        """Creates a new `.Text` object.

        :param text_id: id of the `.Text`.
        :type text_id: `str`
        :param content: content of the `.Text`.
        :type content: `str`
        :param tokenizer: tokenizer used to tokenize the files in the
            corpus.
        :type tokenizer: `nltk.tokenize.api.TokenizerI`
        :param stopwords: words to be removed from the texts.
        :type stopwords: `list` of words
        """
        self._text_id = text_id
        self._content = content.lower()
        self._tokenizer = tokenizer
        self._stopwords = stopwords
        self._lemmatizer = WordNetLemmatizer()
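    # A minimal usage sketch (not part of the original module): the
    # tokenizer and stopword list below are illustrative choices, not
    # ones mandated by bacalhau, and assume the NLTK stopwords and
    # wordnet corpora have been downloaded.
    #
    #   from nltk.corpus import stopwords
    #   from nltk.tokenize import WordPunctTokenizer
    #
    #   text = Text('text-1', 'Cat cat dog.', WordPunctTokenizer(),
    #               stopwords.words('english'))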
    def get_term_data(self):
        """Returns term data for this text.

        The term's data are the unnormalised and normalised frequency
        counts of the term in this text. The former uses the "count"
        key, the latter "frequency". The data is structured as a
        nested dictionary (term -> text -> counts) for easy merging of
        the term data from multiple `.Text`\s.

        :rtype: `dict`
        """
        term_data = {}
        tokens = self._tokenizer.tokenize(self._content)
        max_token_count = 0
        # This provides a "term count" that is unnormalised, meaning
        # that the length of the text is not accounted for.
        for token in tokens:
            if self._is_valid_token(token):
                lemma = self._lemmatizer.lemmatize(token)
                token_data = term_data.setdefault(
                    lemma, {self._text_id: {'count': 0}})
                # Track the highest count of any single term; counts
                # only ever grow by one, so incrementing the maximum
                # keeps it in step.
                if (token_data[self._text_id]['count'] + 1) > max_token_count:
                    max_token_count += 1
                term_data[lemma][self._text_id]['count'] += 1
        # Normalise the term counts to provide a "term frequency" for
        # each term.
        for term, text_data in term_data.items():
            count = float(text_data[self._text_id]['count'])
            text_data[self._text_id]['frequency'] = count / max_token_count
        return term_data
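    # Continuing the sketch above: with content 'Cat cat dog.', "cat"
    # occurs twice and "dog" once, so max_token_count ends at 2 and
    # each count is divided by it. The exact values depend on the
    # tokenizer and stopword list used; this output is illustrative:
    #
    #   text.get_term_data()
    #   {'cat': {'text-1': {'count': 2, 'frequency': 1.0}},
    #    'dog': {'text-1': {'count': 1, 'frequency': 0.5}}}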
    def _is_valid_token(self, token):
        """Checks if `token` is suitable for processing.

        A token is suitable if: it is not in the list of stopwords; it
        is composed only of alphabetic characters; and it is considered
        a noun by WordNet.

        :param token: the token to validate.
        :type token: `str`
        :returns: True if `token` is valid.
        :rtype: `bool`
        """
        if token in self._stopwords:
            return False
        if re.search(r'[^A-Za-z]', token):
            return False
        if not wordnet.synsets(token, pos=wordnet.NOUN):
            return False
        return True
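    # By way of illustration, with the English stopword list from the
    # sketch above (and assuming the WordNet corpus is available):
    #
    #   text._is_valid_token('cat')      # True: alphabetic noun
    #   text._is_valid_token('the')      # False: stopword
    #   text._is_valid_token('42nd')     # False: non-alphabetic characters
    #   text._is_valid_token('quickly')  # False: no noun synset in WordNet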