"""
A reader for corpora whose documents are in MTE format.
"""
import os
from functools import reduce
from nltk import compat
from nltk.corpus.reader import concat, TaggedCorpusReader

lxmlAvailable = False
try:
    from lxml import etree
    lxmlAvailable = True
except ImportError:
    #first try c version of ElementTree
    try:
        import xml.etree.cElementTree as etree
    except ImportError:
        import xml.etree.ElementTree as etree
import re

def xpath(root, path, ns):
    if lxmlAvailable:
        return root.xpath(path, namespaces=ns)
    else:
        return root.findall(path, ns)


class MTEFileReader:
    """
    Class for loading the content of the multext-east corpus. It
    parses the xml files and does some tag-filtering depending on the
    given method parameters.
    """
    ns = {'tei': 'http://www.tei-c.org/ns/1.0', 'xml': 'http://www.w3.org/XML/1998/namespace'}
    tag_ns = '{http://www.tei-c.org/ns/1.0}'
    xml_ns = '{http://www.w3.org/XML/1998/namespace}'

    def __init__(self, file_path):
        tree = etree.parse(file_path)
        self.__root = xpath(tree.getroot(), './tei:text/tei:body', self.ns)[0]

    @classmethod
    def _words(self, text_root):
        return [w.text for w in xpath(text_root, './/*', self.ns) if
                w.tag == self.tag_ns + "w" or w.tag == self.tag_ns + "c"]

    @classmethod
    def _sents(self, text_root):
        return [MTEFileReader._words(s) for s in xpath(text_root, './/tei:s', self.ns)]

    @classmethod
    def _paras(self, text_root):
        return [MTEFileReader._sents(p) for p in xpath(text_root, './/tei:p', self.ns)]

    @classmethod
    def _lemma_words(self, text_root):
        return [(w.text, w.attrib['lemma']) for w in xpath(text_root, './/tei:w', self.ns)]

    @classmethod
    def _tagged_words(self, text_root, tags=""):
        if tags is None or tags == "":
            return [(w.text, w.attrib['ana']) for w in xpath(text_root, './/tei:w', self.ns)]

        else:
            tags = re.compile('^' + re.sub("-",".",tags) + '.*$')
            return [(w.text, w.attrib['ana']) for w in xpath(text_root, './/tei:w', self.ns)
                                              if tags.match(w.attrib['ana'])]

    @classmethod
    def _lemma_sents(self, text_root):
        return [MTEFileReader._lemma_words(s) for s in xpath(text_root, './/tei:s', self.ns)]

    @classmethod
    def _tagged_sents(self, text_root, tags=""):
        # double list comprehension to remove empty sentences in case there is a sentence only containing punctuation marks
        return [t for t in [MTEFileReader._tagged_words(s, tags) for s in xpath(text_root, './/tei:s', self.ns)] if len(t) > 0]

    @classmethod
    def _lemma_paras(self, text_root):
        return [MTEFileReader._lemma_sents(p) for p in xpath(text_root, './/tei:p', self.ns)]

    @classmethod
    def _tagged_paras(self, text_root, tags=""):
        return [t for t in [MTEFileReader._tagged_sents(p, tags) for p in xpath(text_root, './/tei:p', self.ns)] if len(t) > 0]


    def words(self):
        return MTEFileReader._words(self.__root)

    def sents(self):
        return MTEFileReader._sents(self.__root)

    def paras(self):
        return MTEFileReader._paras(self.__root)

    def lemma_words(self):
        return MTEFileReader._lemma_words(self.__root)

    def tagged_words(self, tags=""):
        return MTEFileReader._tagged_words(self.__root, tags)

    def lemma_sents(self):
        return MTEFileReader._lemma_sents(self.__root)

    def tagged_sents(self, tags=""):
        return MTEFileReader._tagged_sents(self.__root)

    def lemma_paras(self):
        return MTEFileReader._lemma_paras(self.__root)

    def tagged_paras(self, tags=""):
        return MTEFileReader._tagged_paras(self.__root)

class MTETagConverter:
    """
    Class for converting msd tags to universal tags, more conversion
    options are currently not implemented.
    """

    mapping_msd_universal = {
        'A': 'ADJ', 'S': 'ADP', 'R': 'ADV', 'C': 'CONJ',
        'D': 'DET', 'N': 'NOUN', 'M': 'NUM', 'Q': 'PRT',
        'P': 'PRON', 'V': 'VERB', '.': '.', '-': 'X'}

    @staticmethod
    def msd_to_universal(tag):
        """
        This function converts the annotation from the Multex-East to the universal tagset
        as described in Chapter 5 of the NLTK-Book

        Unknown Tags will be mapped to X. Punctuation marks are not supported in MSD tags, so
        """
        indicator = tag[0] if not tag[0] == "#" else tag[1]

        if not indicator in MTETagConverter.mapping_msd_universal:
            indicator = '-'

        return MTETagConverter.mapping_msd_universal[indicator]

class MTECorpusReader(TaggedCorpusReader):
    """
    Reader for corpora following the TEI-p5 xml scheme, such as MULTEXT-East.
    MULTEXT-East contains part-of-speech-tagged words with a quite precise tagging
    scheme. These tags can be converted to the Universal tagset
    """

    def __init__(self, root=None, fileids=None, encoding='utf8'):
        """
        Construct a new MTECorpusreader for a set of documents
        located at the given root directory.  Example usage:

            >>> root = '/...path to corpus.../'
            >>> reader = MTECorpusReader(root, 'oana-*.xml', 'utf8') # doctest: +SKIP

        :param root: The root directory for this corpus. (default points to location in multext config file)
        :param fileids: A list or regexp specifying the fileids in this corpus. (default is oana-en.xml)
        :param enconding: The encoding of the given files (default is utf8)
        """
        TaggedCorpusReader.__init__(self, root, fileids, encoding)

    def __fileids(self, fileids):
        if fileids is None: fileids = self._fileids
        elif isinstance(fileids, compat.string_types): fileids = [fileids]
        # filter wrong userinput
        fileids = filter(lambda x : x in self._fileids, fileids)
        # filter multext-east sourcefiles that are not compatible to the teip5 specification
        fileids = filter(lambda x : x not in ["oana-bg.xml", "oana-mk.xml"], fileids)
        if not fileids:
            print("No valid multext-east file specified")
        return fileids

    def readme(self):
        """
        Prints some information about this corpus.
        :return: the content of the attached README file
        :rtype: str
        """
        return self.open("00README.txt").read()

    def raw(self, fileids=None):
        """
	    :param fileids: A list specifying the fileids that should be used.
        :return: the given file(s) as a single string.
        :rtype: str
        """
        return concat([self.open(f).read() for f in self.__fileids(fileids)])

    def words(self, fileids=None):
        """
	    :param fileids: A list specifying the fileids that should be used.
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        """
        return  reduce(lambda a, b : a + b ,[MTEFileReader(os.path.join(self._root, f)).words() for f in self.__fileids(fileids)], [])

    def sents(self, fileids=None):
        """
	    :param fileids: A list specifying the fileids that should be used.
        :return: the given file(s) as a list of sentences or utterances,
                 each encoded as a list of word strings
        :rtype: list(list(str))
        """
        return  reduce(lambda a, b : a + b ,[MTEFileReader(os.path.join(self._root, f)).sents() for f in self.__fileids(fileids)], [])

    def paras(self, fileids=None):
        """
	    :param fileids: A list specifying the fileids that should be used.
        :return: the given file(s) as a list of paragraphs, each encoded as a list
                 of sentences, which are in turn encoded as lists of word string
        :rtype: list(list(list(str)))
        """
        return  reduce(lambda a, b : a + b ,[MTEFileReader(os.path.join(self._root, f)).paras() for f in self.__fileids(fileids)], [])

    def lemma_words(self, fileids=None):
        """
	    :param fileids: A list specifying the fileids that should be used.
        :return: the given file(s) as a list of words, the corresponding lemmas
                 and punctuation symbols, encoded as tuples (word, lemma)
        :rtype: list(tuple(str,str))
        """
        return  reduce(lambda a, b : a + b ,[MTEFileReader(os.path.join(self._root, f)).lemma_words() for f in self.__fileids(fileids)], [])

    def tagged_words(self, fileids=None, tagset="msd", tags=None):
        """
	    :param fileids: A list specifying the fileids that should be used.
        :param tagset: The tagset that should be used in the returned object,
                       either "universal" or "msd", "msd" is the default
        :param tags: An MSD Tag that is used to filter all parts of the used corpus
                     that are not more precise or at least equal to the given tag
        :return: the given file(s) as a list of tagged words and punctuation symbols
                 encoded as tuples (word, tag)
        :rtype: list(tuple(str, str))
        """
        words = reduce(lambda a, b : a + b ,[MTEFileReader(os.path.join(self._root, f)).tagged_words(tags=tags) for f in self.__fileids(fileids)], [])
        if tagset == "universal":
            return map(lambda wt : (wt[0], MTETagConverter.msd_to_universal(wt[1])), words)
        elif tagset == "msd":
            return words
        else:
            print("Unknown tagset specified.")

    def lemma_sents(self, fileids=None):
        """
	    :param fileids: A list specifying the fileids that should be used.
        :return: the given file(s) as a list of sentences or utterances, each
                 encoded as a list of tuples of the word and the corresponding
                 lemma (word, lemma)
        :rtype: list(list(tuple(str, str)))
        """
        return  reduce(lambda a, b : a + b ,[MTEFileReader(os.path.join(self._root, f)).lemma_sents() for f in self.__fileids(fileids)], [])


    def tagged_sents(self, fileids=None, tagset="msd", tags=None):
        """
	    :param fileids: A list specifying the fileids that should be used.
        :param tagset: The tagset that should be used in the returned object,
                       either "universal" or "msd", "msd" is the default
        :param tags: An MSD Tag that is used to filter all parts of the used corpus
                     that are not more precise or at least equal to the given tag
        :return: the given file(s) as a list of sentences or utterances, each
                 each encoded as a list of (word,tag) tuples
        :rtype: list(list(tuple(str, str)))
        """
        sents = reduce(lambda a, b : a + b, [MTEFileReader(os.path.join(self._root, f)).tagged_sents(tags=tags) for f in self.__fileids(fileids)], [])
        if tagset == "universal":
            return map(lambda s : map (lambda wt : (wt[0], MTETagConverter.msd_to_universal(wt[1])), s), sents)
        elif tagset == "msd":
            return sents
        else:
            print("Unknown tagset specified.")

    def lemma_paras(self, fileids=None):
        """
	    :param fileids: A list specifying the fileids that should be used.
        :return: the given file(s) as a list of paragraphs, each encoded as a
                 list of sentences, which are in turn encoded as a list of
                 tuples of the word and the corresponding lemma (word, lemma)
        :rtype: list(List(List(tuple(str, str))))
        """
        return reduce(lambda a, b : a + b ,[MTEFileReader(os.path.join(self._root, f)).lemma_paras() for f in self.__fileids(fileids)], [])

    def tagged_paras(self, fileids=None, tagset="msd", tags=None):
        """
	    :param fileids: A list specifying the fileids that should be used.
        :param tagset: The tagset that should be used in the returned object,
                       either "universal" or "msd", "msd" is the default
        :param tags: An MSD Tag that is used to filter all parts of the used corpus
                     that are not more precise or at least equal to the given tag
        :return: the given file(s) as a list of paragraphs, each encoded as a
                 list of sentences, which are in turn encoded as a list
                 of (word,tag) tuples
        :rtype: list(list(list(tuple(str, str))))
        """
        paras = reduce(lambda a, b : a + b, [MTEFileReader(os.path.join(self._root, f)).tagged_paras(tags=tags) for f in self.__fileids(fileids)], [])
        if tagset == "universal":
            return map(lambda p : map(lambda s : map (lambda wt : (wt[0], MTETagConverter.msd_to_universal(wt[0])), s), p), paras)
        elif tagset == "msd":
            return paras
        else:
            print("Unknown tagset specified.")