Source code for bacalhau.corpus

from collections import defaultdict
from math import log
import os

import nltk

from bacalhau.topic_tree import TopicTree


class Corpus(object):
    """A manager class to generate topic hierarchies from files."""

    def __init__(self, corpus_path, document_class, tokenizer=None,
                 stopwords=None, **document_kwargs):
        """Creates a new `.Corpus` for the given path, using the given
        `bacalhau.document.Document` class to process the files.

        :param corpus_path: path to the files.
        :type corpus_path: `str`
        :param document_class: document class used to process the
            corpus files.
        :type document_class: `bacalhau.document.Document`
        :param tokenizer: tokenizer used to tokenize the files in the
            corpus; when `None` (the default) a new
            `nltk.tokenize.regexp.WordPunctTokenizer` is created.
        :type tokenizer: `nltk.tokenize.api.TokenizerI`
        :param stopwords: words to be removed from the texts; when
            `None` (the default)
            `nltk.corpus.stopwords.words('english')` is used.
        :type stopwords: `list`
        :param document_kwargs: extra keyword arguments passed through
            unchanged to `document_class` for every file.
        """
        # The defaults are resolved here instead of in the signature:
        # signature defaults are evaluated once, when the class is
        # defined, which made importing this module require the NLTK
        # stopwords data and shared one mutable list between instances.
        if tokenizer is None:
            tokenizer = nltk.tokenize.regexp.WordPunctTokenizer()
        if stopwords is None:
            stopwords = nltk.corpus.stopwords.words('english')

        self._corpus_path = os.path.abspath(corpus_path)
        self._document_class = document_class
        self._tokenizer = tokenizer
        self._stopwords = stopwords
        self._document_kwargs = document_kwargs
        self._documents = self._get_documents()
        # Total number of texts (not documents) in the corpus.
        self._text_count = self._get_text_count()
        # Set by generate_topic_tree; read by annotate_topic_tree.
        self._hypernyms = None

    def _get_documents(self):
        """Creates a `bacalhau.document.Document` object for each of the
        files found under the corpus path (walking sub-directories too),
        and returns them in a `list`.

        :returns: documents in this corpus.
        :rtype: `list`
        """
        documents = []
        for path, _dirs, files in os.walk(self._corpus_path):
            for filename in files:
                document = self._document_class(
                    os.path.join(path, filename), self._tokenizer,
                    self._stopwords, **self._document_kwargs)
                documents.append(document)
        return documents

    def _get_text_count(self):
        """Returns the number of `bacalhau.text.Text` objects in this
        corpus.

        The total is returned as a `float` so that the division in
        `_add_tf_idf` is a true division on Python 2 as well.

        :rtype: `float`
        """
        return float(sum(document.get_text_count()
                         for document in self._documents))

    def generate_topic_tree(self, n_terms):
        """Generates a `bacalhau.topic_tree.TopicTree` for the corpus,
        using a maximum of `n_terms` from each `bacalhau.text.Text`.

        First extracts top terms; second gets hypernyms for each of the
        terms; third creates the `bacalhau.topic_tree.TopicTree` using
        the hypernyms. The hypernyms are also stored on the corpus so
        that `annotate_topic_tree` can use them afterwards.

        :param n_terms: maximum number of terms to be used from each
            `Text`.
        :type n_terms: `int`
        :returns: the generated topic tree.
        :rtype: `bacalhau.topic_tree.TopicTree`
        """
        top_terms = self.get_top_terms(n_terms)
        hypernyms = self.get_hypernyms(top_terms)
        tree = self.get_topic_tree(hypernyms)
        self._hypernyms = hypernyms
        return tree

    def get_top_terms(self, n_terms):
        """Returns a dictionary with the highest `n_terms` for each
        `bacalhau.text.Text` from the term data dictionary.

        Terms are ranked by their TF.IDF value within each text. Terms
        with equal TF.IDF values are all eligible and keep the order in
        which they were encountered.

        :param n_terms: maximum number of terms to be used from each
            text; zero or a negative number selects no terms.
        :type n_terms: `int`
        :returns: {text: [term, ...]}, highest TF.IDF first.
        :rtype: `dict`
        """
        term_data = self._add_tf_idf(self._get_term_data())
        limit = max(n_terms, 0)

        # Regroup the term -> text data as text -> [(tf.idf, term)].
        scored = defaultdict(list)
        for term, data in term_data.items():
            for text, values in data.items():
                scored[text].append((values['tf.idf'], term))

        top_terms = defaultdict(list)
        for text, candidates in scored.items():
            # Sort on the score alone: the sort is stable, so ties are
            # never resolved by comparing the terms themselves and no
            # term is lost when two of them share a score.
            candidates.sort(key=lambda pair: pair[0], reverse=True)
            top_terms[text] = [term for _, term in candidates[:limit]]
        return top_terms

    def _get_term_data(self):
        """Returns term data for all of the `bacalhau.document.Document`
        objects in this corpus, merged into a single dictionary keyed
        by term.

        :rtype: `dict`
        """
        term_data = defaultdict(dict)
        for document in self._documents:
            document_term_data = document.get_term_data()
            for term, new_term_data in document_term_data.items():
                term_data[term].update(new_term_data)
        return term_data

    def _add_tf_idf(self, term_data):
        """Returns `term_data` with a TF.IDF value added to each
        term/text combination.

        The dictionary is modified in place: every per-text entry gains
        a `'tf.idf'` key computed from its `'frequency'` value.

        :param term_data: dict with term/text combination.
        :type term_data: `dict`
        :rtype: `dict`
        """
        for term, text_frequencies in term_data.items():
            # Number of texts containing the term.
            matches = len(text_frequencies)
            if not matches:
                # Nothing to annotate, and dividing by zero would fail.
                continue
            idf = log(self._text_count / matches)
            for text, text_data in text_frequencies.items():
                text_data['tf.idf'] = text_data['frequency'] * idf
        return term_data

    def get_hypernyms(self, top_terms):
        """Returns a dictionary with the hypernyms for the given terms.

        Each hypernym list is ordered from the most general entry down
        to the term itself. The list for a term is computed once and
        the same list object is reused for every text containing it.

        :param top_terms: dict with term/text information.
        :type top_terms: `dict`
        :returns: {text: {term: hypernym}}.
        :rtype: `dict`
        """
        hypernyms = defaultdict(dict)
        cache = {}
        for text, terms in top_terms.items():
            for term in terms:
                h = cache.get(term)
                if h is None:
                    h = self._get_hypernym(term)
                    # _get_hypernym starts at the word and climbs up;
                    # the tree wants the chain from the top down.
                    h.reverse()
                    cache[term] = h
                hypernyms[text][term] = h
        return hypernyms

    def _get_hypernym(self, word):
        """Returns a list of the hypernyms for the given word.

        The list starts with the word itself, followed by the name of
        its first WordNet synset and then the name of the first
        hypernym at each level above it.

        :param word: the word to get the hypernym for.
        :type word: `str`
        :rtype: `list`
        """
        hypernym = [word]
        synsets = nltk.corpus.wordnet.synsets(word)
        while synsets:
            s = synsets[0]
            # `Synset.name` is an attribute in NLTK 2 and a method in
            # NLTK 3; support both so a string is always stored.
            name = s.name
            if callable(name):
                name = name()
            hypernym.append(name)
            synsets = s.hypernyms()
        return hypernym

    def get_topic_tree(self, hypernyms):
        """Generates and returns a `bacalhau.topic_tree.TopicTree` for
        the given hypernyms.

        Every entry of a hypernym list becomes a node; the first entry
        is flagged `is_root`, the last one `is_leaf`, and the entries
        are linked as a path.

        :param hypernyms: dictionary of hypernyms, as returned by
            `get_hypernyms`.
        :type hypernyms: `dict`
        :rtype: `bacalhau.topic_tree.TopicTree`
        """
        # NOTE(review): `tree.node` and `tree.add_path` look like the
        # networkx 1.x graph API - confirm against TopicTree.
        tree = TopicTree()
        for text, data in hypernyms.items():
            for hypernym in data.values():
                tree.add_nodes_from(hypernym)
                tree.node[hypernym[-1]]['is_leaf'] = True
                tree.node[hypernym[0]]['is_root'] = True
                tree.add_path(hypernym)
        return tree

    def annotate_topic_tree(self, tree):
        """Annotates the nodes in the `bacalhau.topic_tree.TopicTree`
        with information about which `bacalhau.text.Text` and counts
        the nodes relate to.

        Uses the hypernyms stored by the last call to
        `generate_topic_tree`, which therefore has to be called first.

        :param tree: topic tree of terms
        :type tree: `bacalhau.topic_tree.TopicTree`
        :raises AttributeError: if `generate_topic_tree` has not been
            called on this corpus yet.
        :rtype: `bacalhau.topic_tree.TopicTree`
        """
        hypernyms = self._hypernyms
        if hypernyms is None:
            # Same exception type this situation has always produced,
            # but with a message that explains what went wrong.
            raise AttributeError(
                'no hypernyms available: call generate_topic_tree() '
                'before annotate_topic_tree()')
        for text, data in hypernyms.items():
            for hypernym in data.values():
                for node in tree.nbunch_iter(hypernym):
                    texts = tree.node[node].setdefault('texts', [])
                    texts.append(text)
                    if 'count' not in tree.node[node]:
                        tree.node[node]['count'] = 0
                    tree.node[node]['count'] += 1
        return tree