Source code for bacalhau.text
import re
from collections import Counter

from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
class Text(object):
    """Represents a text unit from a `bacalhau.document.Document`.

    The content is lower-cased on construction; terms are obtained by
    tokenizing it, discarding unsuitable tokens (see
    `_is_valid_token`) and lemmatizing the rest.
    """

    # Matches any character that is not an ASCII letter. Compiled once
    # here because it is applied to every distinct token of a text.
    _NON_ALPHA_RE = re.compile(r'[^A-Za-z]')

    def __init__(self, text_id, content, tokenizer, stopwords):
        """Creates a new `.Text` object.

        :param text_id: id of the `.Text`.
        :type text_id: `str`
        :param content: content of the `.Text`; stored lower-cased.
        :type content: `str`
        :param tokenizer: tokenizer used to tokenize the files in the corpus.
        :type tokenizer: `nltk.tokenize.api.TokenizerI`
        :param stopwords: words to be removed from the texts.
        :type stopwords: `list` (or any iterable) of words.
        """
        self._text_id = text_id
        self._content = content.lower()
        self._tokenizer = tokenizer
        # Stored as a set so that the stopword test made for every
        # token is a hash lookup rather than a scan of the whole list.
        self._stopwords = frozenset(stopwords)
        self._lemmatizer = WordNetLemmatizer()

    def get_term_data(self):
        r"""Returns term data for this text.

        The term's data are the unnormalised and normalised frequency
        counts of the term in this text. The former uses the "count"
        key, the latter "frequency". The "frequency" is the term's
        count divided by the count of the most frequent term in this
        text, so the most frequent term always has a frequency of 1.0.

        The data is structured as a nested dictionary (term -> text ->
        counts) for easy merging of the term data from multiple
        `.Text`\s. An empty dictionary is returned when the text
        contains no valid token.

        :rtype: `dict`
        """
        # Count the raw tokens first so that validation and
        # lemmatisation run once per distinct token instead of once
        # per occurrence.
        token_counts = Counter(self._tokenizer.tokenize(self._content))

        # This provides a "term count" that is unnormalised, meaning
        # that the length of the text is not accounted for. Several
        # tokens may share one lemma, hence the accumulation.
        lemma_counts = {}
        for token, count in token_counts.items():
            if self._is_valid_token(token):
                lemma = self._lemmatizer.lemmatize(token)
                lemma_counts[lemma] = lemma_counts.get(lemma, 0) + count

        if not lemma_counts:
            return {}

        # Normalise the term counts to provide a "term frequency" for
        # each term. The float conversion keeps the division a true
        # division on every Python version.
        max_count = float(max(lemma_counts.values()))
        text_id = self._text_id
        term_data = {}
        for lemma, count in lemma_counts.items():
            term_data[lemma] = {
                text_id: {'count': count, 'frequency': count / max_count},
            }
        return term_data

    def _is_valid_token(self, token):
        """Checks if the `token` is suitable for processing.

        A token is suitable if: it is not in the list of stopwords; it
        is composed only of ASCII alphabetical characters; and it is
        considered a noun by WordNet. The checks run in that order so
        that the WordNet lookup is only made for tokens that pass the
        two cheaper tests.

        :param token: the token to validate.
        :type token: `str`
        :returns: True if `token` is valid.
        :rtype: `bool`
        """
        if token in self._stopwords:
            return False
        if self._NON_ALPHA_RE.search(token):
            return False
        if not wordnet.synsets(token, pos=wordnet.NOUN):
            return False
        return True