Source code for bacalhau.tei_document

from bacalhau.document import Document
from bacalhau.text import Text
from collections import defaultdict
from lxml import etree


[docs]class TEIDocument (Document):
    """Implementation of the abstract
    `bacalhau.document.Document` class to work with TEI files."""

    TEI_NAMESPACE = 'http://www.tei-c.org/ns/1.0'
    TEI = '{%s}' % TEI_NAMESPACE
    XML_NAMESPACE = 'http://www.w3.org/XML/1998/namespace'
    XML = '{%s}' % (XML_NAMESPACE)
    NS_MAP = {'tei': TEI_NAMESPACE, 'xml': XML_NAMESPACE}

    def __init__(self, filepath, tokenizer, stopwords, xpath,
                 ns_map=NS_MAP):
        """Creates a new `.TEIDocument` for the given file path.

        :param filepath: path to the file.
        :type filepath: `str`
        :param tokenizer: tokenizer used to tokenize the files in the
            corpus.
        :type tokenizer: `nltk.tokenize.api.TokenizerI`
        :param stopwords: words to be removed from the texts.
        :type stopwords: `list`
        :param xpath: XPath where to get the `bacalhau.text.Text` from
            the TEI files.
        :type xpath: `str`
        :param ns_map: namespaces used in the `.TEIDocument`.
        :type ns_map: `dict`
        """
        self._xpath = xpath
        self._ns_map = ns_map
        super(TEIDocument, self).__init__(filepath, tokenizer,
                stopwords)

[docs]    def get_texts(self):
        """Returns a list of `bacalhau.text.Text` objects within this
        document.

        :returns: `bacalhau.text.Text` objects within this document.
        :rtype: `list`
        """
        texts = []
        tree = etree.parse(self._path)
        text_elements = tree.xpath(self._xpath,
                namespaces=self._ns_map)
        for text_element in text_elements:
            xml_id = text_element.get(self.XML + 'id')
            text_id = '%s-%s' % (self._document_id, xml_id)
            content = etree.tostring(text_element, encoding='utf-8',
                                     method='text')
            texts.append(Text(text_id, content, self._tokenizer,
                              self._stopwords))
        return texts

[docs]    def get_term_data(self):
        """Returns term data for each `bacalhau.text.Text` within this
        document.

        :rtype: `dict`
        """
        term_data = defaultdict(dict)
        for text in self._texts:
            text_term_data = text.get_term_data()
            for term, new_term_data in text_term_data.items():
                term_data[term].update(new_term_data)
        return term_data