Source code for bacalhau.document
import abc
import os
class Document(abc.ABCMeta('_AbstractDocumentBase', (object,), {})):
    """Abstract class to read from/write to files.

    Different implementations should extend this class and override the
    abstract methods `get_texts` and `get_term_data`.

    The base class is built by calling `abc.ABCMeta` directly so that the
    metaclass is applied on both Python 2 and Python 3; with the
    `__metaclass__` attribute alone, Python 3 would not enforce the
    abstract methods.
    """

    # Only Python 2 gives this attribute a meaning; it is kept so that
    # code inspecting `Document.__metaclass__` still finds it.
    __metaclass__ = abc.ABCMeta

    def __init__(self, filepath, tokenizer, stopwords):
        """Creates a new `Document` for the given file path.

        :param filepath: path to the file.
        :type filepath: `str`
        :param tokenizer: tokenizer used to tokenize the files in the corpus.
        :type tokenizer: `nltk.tokenize.api.TokenizerI`
        :param stopwords: words to be removed from the texts.
        :type stopwords: `list`
        """
        self._path = os.path.abspath(filepath)
        # The document id is the file name without directory or extension.
        self._document_id = os.path.splitext(os.path.basename(self._path))[0]
        # Absolute path with the extension removed.
        self._base_filepath = os.path.splitext(self._path)[0]
        self._tokenizer = tokenizer
        self._stopwords = stopwords
        # `get_texts` runs during construction, after the attributes above
        # have been set, so implementations may rely on them.
        self._texts = self.get_texts()

    @abc.abstractmethod
    def get_texts(self):
        """Returns a list of `bacalhau.text.Text` objects within this
        document.

        Must be overridden by subclasses; this base implementation
        returns `None`.

        :returns: list of `bacalhau.text.Text` objects.
        :rtype: `list`
        """
        return

    @abc.abstractmethod
    def get_term_data(self):
        """Returns term data for each `bacalhau.text.Text` within this
        document.

        Must be overridden by subclasses; this base implementation
        returns `None`.

        :returns: `dict`
        """
        return

    def get_text_count(self):
        """Returns the number of `bacalhau.text.Text` objects for this
        `.Document`.

        The count is taken from the list stored by `__init__`.

        :returns: number of `bacalhau.text.Text` objects.
        :rtype: `int`
        """
        return len(self._texts)