# Source code for bacalhau.corpus
from collections import defaultdict
from math import log
import os
import nltk
from bacalhau.topic_tree import TopicTree
class Corpus(object):
    """A manager class to generate topic hierarchies from files."""

    def __init__(self, corpus_path, document_class,
                 tokenizer=nltk.tokenize.regexp.WordPunctTokenizer(),
                 stopwords=None,
                 **document_kwargs):
        """Creates a new `.Corpus` for the given path, using the given
        `bacalhau.document.Document` class to process the files.

        :param corpus_path: path to the files.
        :type corpus_path: `str`
        :param document_class: document class used to process the
            corpus files.
        :type document_class: `bacalhau.document.Document`
        :param tokenizer: tokenizer used to tokenize the files in the
            corpus, defaults to
            `nltk.tokenize.regexp.WordPunctTokenizer`.
        :type tokenizer: `nltk.tokenize.api.TokenizerI`
        :param stopwords: words to be removed from the texts, defaults
            to `nltk.corpus.stopwords.words('english')`.
        :type stopwords: `list`
        """
        # Resolve the default lazily: a list literal default would be
        # evaluated once at function-definition time and shared (and
        # mutable) across every Corpus instance.
        if stopwords is None:
            stopwords = nltk.corpus.stopwords.words('english')
        self._corpus_path = os.path.abspath(corpus_path)
        self._document_class = document_class
        self._tokenizer = tokenizer
        self._stopwords = stopwords
        self._document_kwargs = document_kwargs
        self._documents = self._get_documents()
        # Total number of texts (not documents) in the corpus.
        self._text_count = self._get_text_count()
        # Populated by generate_topic_tree(); read by
        # annotate_topic_tree().
        self._hypernyms = None
[docs] def _get_documents(self):
"""Creates a `bacalhau.document.Document` object for each
of the files in the corpus, and returns them in a `list`.
:param corpus_path: path to the corpus files.
:type corpus_path: `str`
:returns: documents in this corpus.
:rtype: `list`
"""
documents = []
for (path, dirs, files) in os.walk(self._corpus_path):
for filename in files:
document = self._document_class(
os.path.join(path, filename), self._tokenizer,
self._stopwords, **self._document_kwargs)
documents.append(document)
return documents
[docs] def _get_text_count(self):
"""Returns the number of `bacalhau.text.Text` objects in this
corpus.
:rtype: `float`
"""
count = 0
for document in self._documents:
count += document.get_text_count()
return float(count)
[docs] def generate_topic_tree(self, n_terms):
"""Generates a `bacalhau.topic_tree.TopicTree` for the corpus,
using a maximum of `n_terms` from each
`bacalhau.text.Text`. First extracts top terms; second gets
hypernyms for each of the terms; third creates the
`bacalhau.topic_tree.TopicTree` using the hypernyms.
:param n_terms: maximum number of terms to be used from each
`Text`.
:type n_terms: `int`
:returns: the generated topic tree.
:rtype: `bacalhau.topic_tree.TopicTree`
"""
top_terms = self.get_top_terms(n_terms)
hypernyms = self.get_hypernyms(top_terms)
tree = self.get_topic_tree(hypernyms)
self._hypernyms = hypernyms
return tree
[docs] def get_top_terms(self, n_terms):
"""Returns a dictionary with the highest `n_terms` for each
`bacalhau.text.Text` from the term data dictionary.
:param n_terms: maximum number of terms to be used from each
text.
:type n_terms: `int`
:returns: `dict`
"""
term_data = self._add_tf_idf(self._get_term_data())
top_terms = defaultdict(list)
top_terms_meta = defaultdict(dict)
for term, data in term_data.iteritems():
for text, v in data.iteritems():
count = len(top_terms[text])
tf_idf = v['tf.idf']
if count < n_terms:
top_terms[text].append(term)
top_terms_meta[text][tf_idf] = term
else:
lower_tf_idf = sorted(top_terms_meta[text])[0]
if tf_idf > lower_tf_idf:
lower_term = top_terms_meta[text][lower_tf_idf]
top_terms[text].remove(lower_term)
top_terms[text].append(term)
top_terms_meta[text].pop(lower_tf_idf)
top_terms_meta[text][tf_idf] = term
return top_terms
[docs] def _get_term_data(self):
"""Returns term data for all of the
`bacalhau.document.Document` objects in this corpus.
:rtype: `dict`
"""
term_data = defaultdict(dict)
for document in self._documents:
document_term_data = document.get_term_data()
for term, new_term_data in document_term_data.items():
term_data[term].update(new_term_data)
return term_data
[docs] def _add_tf_idf(self, term_data):
"""Returns `term_data` with a TF.IDF value added to each
term/text combination.
:param term_data: dict with term/text combination.
:type term_data: `dict`
:rtype: `dict`
"""
for term, text_frequencies in term_data.items():
# Number of texts containing the term.
matches = len(text_frequencies)
idf = log(self._text_count / matches)
for text, text_data in text_frequencies.items():
text_data['tf.idf'] = text_data['frequency'] * idf
return term_data
[docs] def get_hypernyms(self, top_terms):
"""Returns a dictionary with the hypernyms for the given terms.
:param top_terms: dict with term/text information.
:type top_terms: `dict`
:returns: {text: {term: hypernym}}.
:rtype: `dict`
"""
hypernyms = defaultdict(dict)
cache = {}
for text, terms in top_terms.iteritems():
for term in terms:
h = cache.get(term)
if h is None:
h = self._get_hypernym(term)
h.reverse()
cache[term] = h
hypernyms[text][term] = h
return hypernyms
[docs] def _get_hypernym(self, word):
"""Returns a list of the hypernyms for the given word.
:param word: the word to get the hypernym for.
:type word: `str`
:rtype: `list`
"""
hypernym = [word]
synsets = nltk.corpus.wordnet.synsets(word)
while len(synsets) > 0:
s = synsets[0]
hypernym.append(s.name)
synsets = s.hypernyms()
return hypernym
[docs] def get_topic_tree(self, hypernyms):
"""Generates and returns a `bacalhau.topic_tree.TopicTree` for
the given hypernyms.
:param hypernyms: dictionary of hypernyms.
:type hypernyms: `dict`
:rtype: `bacalhau.topic_tree.TopicTree`
"""
tree = TopicTree()
for text, data in hypernyms.iteritems():
for term, hypernym in data.iteritems():
tree.add_nodes_from(hypernym)
tree.node[hypernym[len(hypernym) - 1]]['is_leaf'] = True
tree.node[hypernym[0]]['is_root'] = True
tree.add_path(hypernym)
return tree
[docs] def annotate_topic_tree(self, tree):
"""Annotates the nodes in the `bacalhau.topic_tree.TopicTree`
with information about which `bacalhau.text.Text` and counts
the nodes relate to.
:param tree: topic tree of terms
:type tree: `bacalhau.topic_tree.TopicTree`
:rtype: `bacalhau.topic_tree.TopicTree`
"""
hypernyms = self._hypernyms
for text, data in hypernyms.iteritems():
for hypernym in data.values():
for node in tree.nbunch_iter(hypernym):
texts = tree.node[node].setdefault('texts', [])
texts.append(text)
if 'count' not in tree.node[node]:
tree.node[node]['count'] = 0
tree.node[node]['count'] += 1
return tree