diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 0000000..8f3fb55 --- /dev/null +++ b/.coveragerc @@ -0,0 +1,9 @@ +[run] +source = dsegmenter +omit = + */python?.?/* + */lib-python/?.?/*.py + */lib_pypy/_*.py + */site-packages/ordereddict.py + */site-packages/nose/* + */unittest2/* diff --git a/.gitignore b/.gitignore index ed68fee..460309c 100644 --- a/.gitignore +++ b/.gitignore @@ -56,3 +56,4 @@ docs/_build/ # PyBuilder target/ MANIFEST +venv/ diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..22521b0 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,47 @@ +language: python + +python: + - 2.7 + +git: + depth: 3 + +branches: + only: + - master + +notifications: + email: false + +# Setup anaconda +before_install: + - wget http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh + - chmod +x miniconda.sh + - ./miniconda.sh -b + - export PATH=/home/travis/miniconda2/bin:$PATH + - conda update --yes conda + - conda create --yes -n condaenv python=$TRAVIS_PYTHON_VERSION + - conda install --yes -n condaenv pip + - source activate condaenv + # The next couple lines fix a crash with multiprocessing on Travis + # and are not specific to using Miniconda + - sudo rm -rf /dev/shm + - sudo ln -s /run/shm /dev/shm + +# Install packages +install: + - conda install --yes python=$TRAVIS_PYTHON_VERSION anaconda-client atlas numpy scipy + - conda install --yes python=$TRAVIS_PYTHON_VERSION scikit-learn + # - conda install --yes python=$TRAVIS_PYTHON_VERSION --file=requirements.txt + # Coverage packages are on my binstar channel + # - conda install --yes -c dan_blanchard python-coveralls nose-cov + - pip install -r requirements.txt + - pip install -r test-requirements.txt + - ./setup.py build install + +# Run test +script: + - ./setup.py test + +after_success: + - bash <(curl -s https://codecov.io/bash) diff --git a/AUTHORS b/AUTHORS index 40a7be3..05503b8 100644 --- a/AUTHORS +++ b/AUTHORS @@ -1,3 +1,3 @@ Andreas Peldszus -Jean VanCoppenolle Wladimir Sidorenko (Uladzimir Sidarenka) +Jean VanCoppenolle diff --git a/MANIFEST.in b/MANIFEST.in index 5f59d76..5b2b510 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -3,7 +3,7 @@ include README.rst include requirements.txt include dsegmenter/bparseg/data/*.model include dsegmenter/bparseg/data/*.npy -include dsegmenter/bparseg/data/mate.model +include dsegmenter/mateseg/data/mate.model include dsegmenter/edseg/data/*.txt include dsegmenter/evaluation/alpha/* recursive-include examples * diff --git a/README.rst b/README.rst index 72fcd65..0d6c640 100644 --- a/README.rst +++ b/README.rst @@ -2,6 +2,11 @@ Discourse Segmenter =================== +.. image:: https://travis-ci.org/discourse-lab/DiscourseSegmenter.svg?branch=master + :alt: Build Status + :align: right + :target: https://travis-ci.org/discourse-lab/DiscourseSegmenter + .. image:: https://img.shields.io/badge/license-MIT-blue.svg :alt: MIT License :align: right @@ -18,8 +23,8 @@ This python module currently comprises three discourse segmenters: **edseg** is a rule-based system that uses shallow discourse-oriented - parsing to determine boundaries of elementary discourse units in - text. The rules are hard-coded in the `submodule's file`_ and are + parsing to determine the boundaries of elementary discourse units. + The rules are hard-coded in the `submodule's file`_ and are only applicable to German input. 
**bparseg** @@ -32,14 +37,11 @@ This python module currently comprises three discourse segmenters: --help`` for further instructions on how to do that). **mateseg** - is an ML-based segmentation module that operates on syntactic - dependency trees (output from Mate_) and decides whether a - sub-structure of the dependency graph initiates a discourse segment - or not using a pre-trained linear SVM model. Again, this model was - trained on the German PCC_ corpus. - - -*Since the current model is a serialized file and, therefore, likely to be incompatible with future releases of `numpy`, we will probably remove the model files from future versions of this package, including source data instead and performing training during the installation.* + is another ML-based segmentation module that operates on dependency + trees (output from MateParser_) and decides whether a sub-structure + of the dependency graph initiates a discourse segment or not using + a pre-trained linear SVM model. Again, this model was trained on + the German PCC_ corpus. Installation @@ -79,9 +81,15 @@ or, alternatively, also use the delivered front-end script discourse_segmenter bparseg segment DiscourseSegmenter/examples/bpar/maz-8727.exb.bpar +or + +.. code-block:: shell + + discourse_segmenter mateseg segment DiscourseSegmenter/examples/conll/maz-8727.parsed.conll + Note that this script requires two mandatory arguments: the type of -the segmenter to use (`bparseg` in the above case) and the operation -to perform (which are specific to each segmenter). +the segmenter to use (`bparseg` or `mateseg` in the above cases) and the +operation to perform (which meight be specific to each segmenter). Evaluation @@ -104,7 +112,7 @@ which requires Java 8. .. _`Bitpar`: http://www.cis.uni-muenchen.de/~schmid/tools/BitPar/ -.. _`Mate`: http://code.google.com/p/mate-tools/ +.. _`MateParser`: http://code.google.com/p/mate-tools/ .. _`PCC`: http://www.lrec-conf.org/proceedings/lrec2014/pdf/579_Paper.pdf .. _`here`: https://github.com/discourse-lab/DiscourseSegmenter/blob/master/scripts/discourse_segmenter .. _`submodule's file`: https://github.com/discourse-lab/DiscourseSegmenter/blob/master/dsegmenter/edseg/clause_segmentation.py diff --git a/doc-requirements.txt b/doc-requirements.txt new file mode 100644 index 0000000..fdb6424 --- /dev/null +++ b/doc-requirements.txt @@ -0,0 +1,4 @@ +sphinx>=1.4.1 +sphinxcontrib-napoleon>=0.4.4 +sphinx-pypi-upload>=0.2.1 +sphinx_rtd_theme>=0.1.9 diff --git a/docs/conf.py b/docs/conf.py index 19010df..59f9ef2 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -19,7 +19,7 @@ # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. -sys.path.insert(0, os.path.abspath(os.path.join('..', 'dsegmenter'))) +sys.path.insert(0, os.path.abspath(os.path.join('dsegmenter'))) # -- General configuration ------------------------------------------------ @@ -77,9 +77,9 @@ # built documents. # # The short X.Y version. -version = u'0.0.1.dev1' +version = u'0.2.1' # The full version, including alpha/beta/rc tags. -release = u'0.0.1.dev1' +release = u'0.2.1' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. 
diff --git a/docs/dsegmenter.bparseg.align.rst b/docs/dsegmenter.bparseg.align.rst new file mode 100644 index 0000000..c55379c --- /dev/null +++ b/docs/dsegmenter.bparseg.align.rst @@ -0,0 +1,6 @@ +dsegmenter.bparseg.align +~~~~~~~~~~~~~~~~~~~~~~~~ + +.. automodule:: dsegmenter.bparseg.align + :members: + :special-members: diff --git a/docs/dsegmenter.bparseg.bparsegmenter.rst b/docs/dsegmenter.bparseg.bparsegmenter.rst new file mode 100644 index 0000000..0c52773 --- /dev/null +++ b/docs/dsegmenter.bparseg.bparsegmenter.rst @@ -0,0 +1,6 @@ +dsegmenter.bparseg.bparsegmenter +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: dsegmenter.bparseg.bparsegmenter.BparSegmenter + :members: + :special-members: diff --git a/docs/dsegmenter.bparseg.constants.rst b/docs/dsegmenter.bparseg.constants.rst new file mode 100644 index 0000000..4e0097b --- /dev/null +++ b/docs/dsegmenter.bparseg.constants.rst @@ -0,0 +1,6 @@ +dsegmenter.bparseg.constants +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. automodule:: dsegmenter.bparseg.constants + :members: + :special-members: diff --git a/docs/dsegmenter.bparseg.constituency_tree.rst b/docs/dsegmenter.bparseg.constituency_tree.rst new file mode 100644 index 0000000..5769797 --- /dev/null +++ b/docs/dsegmenter.bparseg.constituency_tree.rst @@ -0,0 +1,6 @@ +dsegmenter.bparseg.constituency_tree +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. automodule:: dsegmenter.bparseg.constituency_tree + :members: + :special-members: diff --git a/docs/dsegmenter.bparseg.rst b/docs/dsegmenter.bparseg.rst index 15591c4..43d2531 100644 --- a/docs/dsegmenter.bparseg.rst +++ b/docs/dsegmenter.bparseg.rst @@ -2,5 +2,11 @@ dsegmenter.bparseg ~~~~~~~~~~~~~~~~~~ .. automodule:: dsegmenter.bparseg - :members: - :undoc-members: + +.. toctree:: + :maxdepth: 2 + + dsegmenter.bparseg.align.rst + dsegmenter.bparseg.bparsegmenter.rst + dsegmenter.bparseg.constants.rst + dsegmenter.bparseg.constituency_tree.rst diff --git a/docs/dsegmenter.edseg.rst b/docs/dsegmenter.edseg.rst new file mode 100644 index 0000000..c09d5ab --- /dev/null +++ b/docs/dsegmenter.edseg.rst @@ -0,0 +1,6 @@ +dsegmenter.edseg +~~~~~~~~~~~~~~~~ + +.. automodule:: dsegmenter.edseg + :members: + :undoc-members: diff --git a/docs/dsegmenter.mateseg.rst b/docs/dsegmenter.mateseg.rst new file mode 100644 index 0000000..7ab0e0f --- /dev/null +++ b/docs/dsegmenter.mateseg.rst @@ -0,0 +1,6 @@ +dsegmenter.mateseg +~~~~~~~~~~~~~~~~~~ + +.. automodule:: dsegmenter.mateseg + :members: + :undoc-members: diff --git a/docs/dsegmenter.rst b/docs/dsegmenter.rst index 41d61e8..c472dd4 100644 --- a/docs/dsegmenter.rst +++ b/docs/dsegmenter.rst @@ -5,10 +5,10 @@ dsegmenter ---------- .. automodule:: dsegmenter - :members: - :undoc-members: .. toctree:: - :maxdepth: 2 + :maxdepth: 3 dsegmenter.bparseg.rst + dsegmenter.edseg.rst + dsegmenter.mateseg.rst diff --git a/dsegmenter/__init__.py b/dsegmenter/__init__.py index 8f976e3..3fa4df0 100644 --- a/dsegmenter/__init__.py +++ b/dsegmenter/__init__.py @@ -3,19 +3,20 @@ ################################################################## # Documentation - """Main meta-package containing a collection of discourse segmenters. 
Attributes: - edseg (module): + common (module): + routines common to multiple subpackages + edseg (subpackage): rule-based discourse segmenter for Mate dependency trees - treeseg (module): + treeseg (subpackage): auxiliary segmenter routines used by syntax-driven segmenters - bparseg (module): + bparseg (subpackage): machine-learning discourse segmenter for BitPar constituency trees - mateseg (module): + mateseg (subpackage): machine-learning discourse segmenter for Mate dependency graphs - evaluation (module): + evaluation (subpackage): metrics for evaluating discourse segmentation __all__ (List[str]): list of sub-modules exported by this package diff --git a/dsegmenter/bparseg/__init__.py b/dsegmenter/bparseg/__init__.py index 69954a7..bd9cff3 100644 --- a/dsegmenter/bparseg/__init__.py +++ b/dsegmenter/bparseg/__init__.py @@ -12,7 +12,7 @@ trees bparsegmenter (module): class for segmenting syntax trees into discourse units - __all__ (List[str]): list of sub-modules exported by this package + __all__ (list[str]): list of sub-modules exported by this package __author__ (str): package's author __email__ (str): email of package's author __name__ (str): package's name @@ -22,15 +22,19 @@ ################################################################## # Imports -from .constants import ENCODING, NO_PARSE_RE, WORD_SEP -from .bparsegmenter import BparSegmenter, read_trees, read_segments, trees2segs -from .constituency_tree import CTree +from __future__ import absolute_import + +from dsegmenter.bparseg.constants import ENCODING, NO_PARSE_RE, WORD_SEP +from dsegmenter.bparseg.bparsegmenter import BparSegmenter, \ + read_trees, read_tok_trees, trees2segs +from dsegmenter.bparseg.constituency_tree import CTree, OP, OP_RE, CP, CP_RE ################################################################## # Intialization __name__ = "bparseg" -__all__ = ["ENCODING", "NO_PARSE_RE", "WORD_SEP", "BparSegmenter", "CTree", \ - "read_trees", "read_segments", "trees2segs"] +__all__ = ["ENCODING", "NO_PARSE_RE", "WORD_SEP", "BparSegmenter", "CTree", + "OP", "OP_RE", "CP", "CP_RE", "read_trees", "read_tok_trees", + "trees2segs"] __author__ = "Uladzimir Sidarenka" __email__ = "sidarenk at uni dash potsdam dot de" __version__ = "0.0.1" diff --git a/dsegmenter/bparseg/align.py b/dsegmenter/bparseg/align.py index fa1d191..961e242 100644 --- a/dsegmenter/bparseg/align.py +++ b/dsegmenter/bparseg/align.py @@ -3,8 +3,7 @@ ################################################################## # Documentation -""" -Module providing methods for string alignment. +"""Module providing methods for string alignment. All of these methods take two iterables (can be either lists or strings) as arguments. The first iterable (L1) represents string or list to which @@ -19,24 +18,19 @@ elements. Example: + nw_align("AGTACGCA", "TCGC") + => [[], [], [0], [], [1], [2], [3], []] + - nw_align("AGTACGCA", "TCGC") - => [[], [], [0], [], [1], [2], [3], []] +this corresponds to the alignment - this corresponds to the alignment AGTACGCA T CGC Please also note that different algorithms may give different alignments for tie cases. -Methods: -hb_align - Hirschberg alignment algorithm: (O(nm) time; O(min{n, m}) space) -nw_align - Needleman-Wunsch alignment algorithm: (O(nm) time; O(nm) space) - -@author = Wladimir Sidorenko (Uladzimir Sidarenka) -@mail = -@version = 0.0.1 +.. 
moduleauthor: Wladimir Sidorenko (Uladzimir Sidarenka) """ @@ -46,32 +40,38 @@ import sys + ################################################################## # Methods -def hb_align(s1, s2, insert = lambda c: -2, \ - delete = lambda c: -2, \ - substitute = lambda c1, c2: 2 if c1 == c2 else -1, \ - offset = 0, keep_deleted = False): +def hb_align(s1, s2, insert=lambda c: -2, + delete=lambda c: -2, + substitute=lambda c1, c2: 2 if c1 == c2 else -1, + offset=0, keep_deleted=False): """Align two iterables using Hirschberg alignment algorithm. - @param s1 - iterable for alignment - @param s2 - iterable which should be aligned - @param insert - function returning penalty for insertion (default -2) - @param delete - function returning penalty for deletion (default -2) - @param substitute - function returning penalty for substitution (default -1) - @param offset - add `offset` to each aligned index of second iterable + Complexity: :math:`O(nm)` time; :math:`O(min(n, m))` space + + Args: + s1 (iterable): iterable for alignment + s2 (iterable): iterable which should be aligned + insert (lambda): function returning penalty for insertion (default -2) + delete (lambda): function returning penalty for deletion (default -2) + substitute (lambda): function returning penalty for substitution + (default -1) + offset (int): add ``offset`` to each aligned index of second iterable (this option is only needed for recursive alignmnet of substrings or sublists) - @param keep_deleted - return indices of deleted words too + keep_deleted (bool): return indices of deleted words too - @return list whose elements are indices of s2 corresponding to given - positions of s1 + Returns: + list: indices of s2 corresponding to given positions of s1 """ # remember penalty functions penalties = [insert, delete, substitute] # determine length of both lists - L1 = len(s1); L2 = len(s2) + L1 = len(s1) + L2 = len(s2) # return value will always be a list of L1 list elements ret = [] # establish auxiliary variables @@ -84,39 +84,43 @@ def hb_align(s1, s2, insert = lambda c: -2, \ ret = [[i + offset] for i in xrange(L1)] # the trickier case, however, is when actual alignment should be done elif L1 == 1 or L2 == 1: - ret = nw_align(s1, s2, *penalties, offset = offset) + ret = nw_align(s1, s2, *penalties, offset=offset) else: mid1 = L1 / 2 ScoreL = _nw_score_(s1[:mid1], s2) - ScoreR = _nw_score_([c for c in reversed(s1[mid1:])], \ - [c for c in reversed(s2)]) - mid2 = _partition_(ScoreL, [i for i in reversed(ScoreR)]) - - ret += hb_align(s1[:mid1], s2[:mid2], *penalties, \ - offset = offset) - ret += hb_align(s1[mid1:], s2[mid2:], *penalties, \ - offset = offset + mid2) + ScoreR = _nw_score_([c for c in reversed(s1[mid1:])], + [c for c in reversed(s2)]) + mid2 = _partition_(ScoreL, [i for i in reversed(ScoreR)]) + + ret += hb_align(s1[:mid1], s2[:mid2], *penalties, + offset=offset) + ret += hb_align(s1[mid1:], s2[mid2:], *penalties, + offset=offset + mid2) return ret -def nw_align(s1, s2, insert = lambda c: -2, \ - delete = lambda c: -2, \ - substitute = lambda c1, c2: 2 if c1 == c2 else -1, \ - offset = 0, keep_deleted = False): + +def nw_align(s1, s2, insert=lambda c: -2, + delete=lambda c: -2, + substitute=lambda c1, c2: 2 if c1 == c2 else -1, + offset=0, keep_deleted=False): """Align two iterables using Needleman-Wunsch algorithm. 
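Example (a minimal usage sketch of the two public alignment functions; the expected output is taken from the module docstring above, and tie cases may align differently between the two algorithms)::

    from dsegmenter.bparseg.align import hb_align, nw_align

    # indices of s2 aligned to each position of s1 (empty list = no counterpart)
    nw_align("AGTACGCA", "TCGC")   # -> [[], [], [0], [], [1], [2], [3], []]
    # Hirschberg alignment yields the same mapping in O(min(n, m)) space
    hb_align("AGTACGCA", "TCGC")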
- @param s1 - iterable for alignment - @param s2 - iterable which should be aligned - @param insert - penalty for insertion (default -2) - @param delete - penalty for deletion (default -2) - @param substitute - penalty for substitution (default -1) - @param offset - add `offset` to each aligned index of second iterable + Complexity: :math:`O(nm)` time; :math:`O(nm)` space + + Args: + s1 (iterable): iterable for alignment + s2 (iterable): iterable which should be aligned + insert (lambda): penalty for insertion (default -2) + delete (lambda): penalty for deletion (default -2) + substitute (lambda): penalty for substitution (default -1) + offset (int): add ``offset`` to each aligned index of second iterable (this option is only needed for recursive alignmnet of substrings or sublists) - @param keep_deleted - return indices of deleted words too + keep_deleted (bool): return indices of deleted words too - @return list whose elements are indices of s2 corresponding to given - positions in s1 + Returns: + list: indices of s2 corresponding to given positions in s1 """ # create optimal matching matrix @@ -124,6 +128,7 @@ def nw_align(s1, s2, insert = lambda c: -2, \ # decode best alignment using this matrix return _decode_matrix_(mtx, offset, keep_deleted) + ################################################################## # Private Methods def _make_matrix_(s1, s2, insert, delete, substitute): @@ -134,19 +139,21 @@ def _make_matrix_(s1, s2, insert, delete, substitute): where first element is the optimal alignment score and second element is another two-tuple with indices of source element which yielded this score. - @param s1 - first string to align - @param s2 - second string to align - @param insert - function giving penalty for inserting a char - @param delete - function giving penalty for deleting a char - @param substitute - function giving penalty for substituting a char1 with char2 + s1 - first string to align + s2 - second string to align + insert - function giving penalty for inserting a char + delete - function giving penalty for deleting a char + substitute - function giving penalty for substituting a char1 with char2 - @return optimal matching matrix of two strings + Returns: + np.array: optimal matching matrix of two strings """ # get lengths of both lists - L1 = len(s1) + 1; L2 = len(s2) + 1 + L1 = len(s1) + 1 + L2 = len(s2) + 1 # create a matrix for storing scores and backtracking information. - mtx = [[[None, None]] * L2 for c in xrange(L1)] + mtx = [[[None, None]] * L2 for c in xrange(L1)] # auxiliary variables for characters c1 = c2 = '' # auxiliary variables for iterators @@ -182,7 +189,7 @@ def _make_matrix_(s1, s2, insert, delete, substitute): c2 = s2[prev_j] # compute different modification scores delscore = mtx[prev_i][j][0] + delete(c1) - inscore = mtx[i][prev_j][0] + insert(c2) + inscore = mtx[i][prev_j][0] + insert(c2) subscore = mtx[prev_i][prev_j][0] + substitute(c1, c2) # compute the maximum over three scores maxscore = max(delscore, inscore, subscore) @@ -197,20 +204,24 @@ def _make_matrix_(s1, s2, insert, delete, substitute): mtx[i][j] = (maxscore, bck_idx) return mtx -def _decode_matrix_(mtx, a_offset = 0, a_keep = False): + +def _decode_matrix_(mtx, a_offset=0, a_keep=False): """Compute best alignment for s1 and s2 based on error matrix. 
- @param mtx - optimal matching matrix - @param a_offset - offset for indices - @param a_keep - include indices of words the were deleted during edit + Args: + mtx (np.array): optimal matching matrix + a_offset (int): offset for indices + a_keep (bool): include indices of words the were deleted during edit - @return list of indices of second iterable which provide best alignment to - first iterable + Returns: + list: indices of second iterable which provide best alignment to + the first iterable """ # matrix indices `i` and `j` will differ by one from the actual string # indices for strings `s1` and `s2` - i = len(mtx) - 1; j = len(mtx[0]) - 1 + i = len(mtx) - 1 + j = len(mtx[0]) - 1 # return value will be a list of length `len(s1)` whose elements in turn # will be lists of element indices of `s2` ret = [[] for c in xrange(i)] @@ -225,7 +236,8 @@ def _decode_matrix_(mtx, a_offset = 0, a_keep = False): # list was deleted, then do nothing but simply check our sanity assert(prev_i != i) elif prev_i != i: - # if neither `i` nor `j` are the same, this means full correspondence + # if neither `i` nor `j` are the same, this means full + # correspondence ret[i - 1].insert(0, j - 1 + a_offset) # uncomment this, if you want to get a list of deleted elements for a # single position @@ -241,13 +253,17 @@ def _decode_matrix_(mtx, a_offset = 0, a_keep = False): # return calculated alignment list return ret + def _partition_(seq1, seq2): """Find a pair of elements in iterables seq1 and seq2 with maximum sum. - @param seq1 - iterable with real values - @param seq2 - iterable with real values + Args: + seq1 (iterable): with real values + seq2 (iterable): with real values + + Returns: + int: list index such that seq1[pos] + seq2[pos] is maximum - @return pos - such that seq1[pos] + seq2[pos] is maximum """ _sum_ = _max_ = _pos_ = float("-inf") for pos, ij in enumerate(zip(seq1, seq2)): @@ -257,9 +273,10 @@ def _partition_(seq1, seq2): _pos_ = pos return _pos_ -def _nw_score_(s1, s2, insert = lambda c: -2, \ - delete = lambda c: -2, \ - substitute = lambda c1, c2: 2 if c1 == c2 else -1): + +def _nw_score_(s1, s2, insert=lambda c: -2, + delete=lambda c: -2, + substitute=lambda c1, c2: 2 if c1 == c2 else -1): """Compute Needleman Wunsch score for aligning two strings. This algorithm basically performs the same operations as Needleman Wunsch @@ -267,17 +284,22 @@ def _nw_score_(s1, s2, insert = lambda c: -2, \ the optimal alignment matrix. As a consequence, no reconstruction is possible. 
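Example (an illustration of the split-point search that ``hb_align`` delegates to its private helpers, shown only to clarify the divide-and-conquer step; the score columns below are made up)::

    from dsegmenter.bparseg.align import _partition_

    # column sums are 1, 5, 1, so index 1 is returned and becomes `mid2`
    _partition_([0, 2, 1], [1, 3, 0])   # -> 1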
- @param s1 - iterable - @param s2 - another iterable to be aligned - @param insert - function returning penalty for insertion (default -2) - @param delete - function returning penalty for deletion (default -2) - @param substitute - function returning penalty for substitution (default -1) + Args: + s1 (iterable): iterable to which we should align + s2 (iterable): iterable to be aligned + insert (lambda): function returning penalty for insertion (default -2) + delete (lambda): function returning penalty for deletion (default -2) + substitute (lambda): function returning penalty for substitution + (default -1) + + Returns: + : last column of optimal matching matrix - @return last column of optimal matching matrix """ # lengths of two strings are further used for ranges, therefore 1 is added # to every length - m = len(s1) + 1; n = len(s2) + 1 + m = len(s1) + 1 + n = len(s2) + 1 # score will be a two dimensional matrix score = [[0 for i in xrange(n)], [0 for i in xrange(n)]] # character of first and second string, respectively @@ -286,7 +308,9 @@ def _nw_score_(s1, s2, insert = lambda c: -2, \ s2_it = xrange(1, n) # indices of current and previous column in the error matrix (will be # swapped along the way) - crnt = 0; prev = 1; prev_j = 0 + crnt = 0 + prev = 1 + prev_j = 0 # base case when the first string is shorter than second for j in s2_it: prev_j = j - 1 @@ -304,7 +328,7 @@ def _nw_score_(s1, s2, insert = lambda c: -2, \ c2 = s2[prev_j] # current cell will be the maximum over insertions, deletions, and # substitutions applied to adjacent cells - # substitution (covers cases when both chars are equal) + # substitution (covers cases when both chars are equal) score[crnt][j] = max(score[prev][prev_j] + substitute(c1, c2), # deletion score[prev][j] + delete(c1), diff --git a/dsegmenter/bparseg/bparsegmenter.py b/dsegmenter/bparseg/bparsegmenter.py index fe98740..b8b9a27 100644 --- a/dsegmenter/bparseg/bparsegmenter.py +++ b/dsegmenter/bparseg/bparsegmenter.py @@ -6,19 +6,13 @@ """Module providing discourse segmenter for constituency trees. Attributes: - SUBSTITUTEF (method): custom weighting function used for token alignment - _ispunct (method): check if word consists only of punctuation characters - _prune_punc (method): remove tokens representing punctuation from set - _translate_toks (method): replace tokens and return updated set - tree2tok (method): create dictionary mapping constituency trees to numbered tokens + tree2tok (method): create dictionary mapping constituency trees to numbered + tokens read_trees (method): read file and return a list of constituent dictionaries - read_segments (method): read file and return a list of segment dictionaries trees2segs (method): align trees with corresponding segments featgen (method): default feature generation function classify (method): default classification method -Classes: - BparSegmenter: discourse segmenter for constituency trees .. 
moduleauthor:: Wladimir Sidorenko (Uladzimir Sidarenka) @@ -26,77 +20,38 @@ ################################################################## # Libraries -from .align import nw_align -from .constants import ENCODING -from .constituency_tree import Tree, CTree -from ..treeseg import TreeSegmenter, DiscourseSegment, CONSTITUENCY, DEFAULT_SEGMENT +from __future__ import absolute_import, unicode_literals + +from dsegmenter.common import NONE, prune_punc, score_substitute, \ + translate_toks + +from dsegmenter.bparseg.align import nw_align +from dsegmenter.bparseg.constituency_tree import Tree, CTree +from dsegmenter.treeseg import TreeSegmenter, DiscourseSegment, \ + CONSTITUENCY, DEFAULT_SEGMENT -# from sklearn.cross_validation import KFold from sklearn.externals import joblib from sklearn.feature_extraction import DictVectorizer -from sklearn.feature_selection import VarianceThreshold, SelectKBest -from sklearn.metrics import precision_recall_fscore_support, classification_report, confusion_matrix +from sklearn.feature_selection import VarianceThreshold +from sklearn.metrics import precision_recall_fscore_support from sklearn.pipeline import Pipeline -from sklearn.svm import SVC, LinearSVC +from sklearn.svm import LinearSVC import locale import os import re -import sys -import string ################################################################## # Constants -NONE = str(None) +locale.setlocale(locale.LC_ALL, "") N_FOLDS = 10 -SUBSTITUTEF = lambda c1, c2: 2 if c1[-1] == c2[-1] else -3 ESCAPE_QUOTE_RE = re.compile(r"\\+([\"'])") ESCAPE_SLASH_RE = re.compile(r"\\/") + ################################################################## # Methods -locale.setlocale(locale.LC_ALL, "") - -def _ispunct(a_word): - """Check if word consists only of punctuation characters. - - Args: - a_word (str): word to check - - Returns: - (bool) True if word consists only of punctuation characters, False otherwise - - """ - return all(c in string.punctuation for c in a_word) - -def _prune_punc(a_toks): - """Remove tokens representing punctuation from set. - - @param a_toks - tokens to prune - - @return token set with punctuation tokens removed - - """ - return frozenset([tok for tok in a_toks if not _ispunct(tok[-1])]) - -def _translate_toks(a_toks, a_translation): - """Translate tokens and return translated set. - - @param a_toks - tokens to be translated - @param a_translation - translation dictionary for tokens - - @return translated set of tokens - - """ - if a_translation is None: - return a_toks - ret = set() - for tok in a_toks: - for t_tok in a_translation[tok]: - ret.add(t_tok) - return frozenset(ret) - -def tree2tok(a_tree, a_start = 0): +def tree2tok(a_tree, a_start=0): """Create dictionary mapping constituency trees to numbered tokens. Args: @@ -111,7 +66,6 @@ def tree2tok(a_tree, a_start = 0): chset = None tr2tk = {(a_start, a_tree.label()): (a_tree, rset)} i = a_start - max_ch_pos = -1 for child in a_tree: if isinstance(child, Tree): tr2tk.update(tree2tok(child, i)) @@ -123,17 +77,20 @@ def tree2tok(a_tree, a_start = 0): i += 1 return tr2tk -def read_trees(a_lines, a_one_per_line = False): + +def read_tok_trees(a_lines, a_one_per_line=False): """Read file and return a list of constituent dictionaries. 
Args: a_lines (list[str]): decoded lines of the input file + a_one_per_line (bool): boolean flag indicating whether each + tree is stored on a separate line Returns: 2-tuple: list of dictionaries mapping tokens to trees and a list of trees """ - ctrees = CTree.parse_lines(a_lines, a_one_per_line = a_one_per_line) + ctrees = CTree.parse_lines(a_lines, a_one_per_line=a_one_per_line) # generate dictionaries mapping trees' yields to trees t_cnt = 0 t2t = None @@ -152,70 +109,22 @@ def read_trees(a_lines, a_one_per_line = False): toks2trees[toks] = [tree] return toks2trees, ctrees -def read_segments(a_lines): - """Read file and return a list of segment dictionaries. + +def read_trees(a_lines, a_one_per_line=False): + """Read file and return a list of constituent dictionaries. Args: - a_lines (list): decoded lines of the input file + a_lines (list[str]): decoded lines of the input file + a_one_per_line (bool): boolean flag indicating whether each + tree is stored on a separate line - Returns: - dict: mapping from tokens to segments + Yields: + CTree: input tree """ - segs2toks = {} - s_c = t_c = 0 - tokens = [] - atoks = [] - new_seg = None - active_tokens = set() - active_segments = [] - # read segments - for iline in a_lines: - iline = iline.strip() - if not iline: - continue - # do some clean-up - active_tokens.clear() - del atoks[:] - del active_segments[:] - tokens = iline.split() - # establish correspondence between tokens and segments - for tok in tokens: - if tok[0] == '(' and len(tok) > 1: - active_tokens = set(atoks) - del atoks[:] - for a_s in active_segments: - segs2toks[a_s].update(active_tokens) - new_seg = (s_c, tok[1:]) - active_segments.append(new_seg) - segs2toks[new_seg] = set() - s_c += 1 - continue - elif tok == ')': - assert active_segments, "Unbalanced closing parenthesis at line: " + repr(iline) - active_tokens = set(atoks) - del atoks[:] - for a_s in active_segments: - segs2toks[a_s].update(active_tokens) - active_segments.pop() - continue - else: - atoks.append((t_c, tok)) - t_c += 1 - assert not active_segments, "Unbalanced opening parenthesis at line: " + repr(iline) - toks2segs = dict() - segments = segs2toks.keys() - segments.sort(key = lambda el: el[0]) - for seg in segments: - toks = frozenset(segs2toks[seg]) - # it can be same tokenset corresponds to multiple segments, in that - # case we leave the first one that we encounter - if toks in toks2segs: - continue - assert toks not in toks2segs, "Multiple segments correspond to the same tokenset: '" + \ - repr(toks) + "': " + repr(seg) + ", " + repr(toks2segs[toks]) - toks2segs[toks] = seg - return toks2segs + for ctree in CTree.parse_lines(a_lines, a_one_per_line=a_one_per_line): + yield ctree + def trees2segs(a_toks2trees, a_toks2segs): """Align trees with corresponding segments. 
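Example (a sketch of how the reading and alignment helpers are meant to be chained; the file names are placeholders, not files shipped with the package)::

    import codecs

    from dsegmenter.bparseg.bparsegmenter import read_tok_trees, trees2segs
    from dsegmenter.common import read_segments

    with codecs.open("doc.exb.bpar", encoding="utf-8") as bpar_file:
        toks2trees, ctrees = read_tok_trees(bpar_file.readlines())
    with codecs.open("doc.exb.seg", encoding="utf-8") as seg_file:
        toks2segs = read_segments(seg_file.readlines())
    # map each constituency subtree to the segment with the same token yield
    tree2seg = trees2segs(toks2trees, toks2segs)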
@@ -229,54 +138,67 @@ def trees2segs(a_toks2trees, a_toks2segs): """ # prune empty trees and their corresponding segments - tree2seg = {t: None for val in a_toks2trees.values() for t in val} + tree2seg = {t: None + for val in a_toks2trees.itervalues() + for t in val} # add additional keys to `a_toks2trees` by pruning punctuation marks from # existing trees pruned_toks = None tree_tok_keys = a_toks2trees.keys() for tree_toks in tree_tok_keys: - pruned_toks = _prune_punc(tree_toks) + pruned_toks = prune_punc(tree_toks) if pruned_toks not in a_toks2trees: a_toks2trees[pruned_toks] = a_toks2trees[tree_toks] # establish a mapping between tree tokens and segment tokens - tree_toks = list(set([t for t_set in a_toks2trees.keys() for t in t_set])) - tree_toks.sort(key = lambda el: el[0]) - seg_toks = list(set([t for t_set in a_toks2segs.keys() for t in t_set])) - seg_toks.sort(key = lambda el: el[0]) + tree_toks = list(set([t + for t_set in a_toks2trees.iterkeys() + for t in t_set])) + tree_toks.sort(key=lambda el: el[0]) + seg_toks = list(set([t + for t_set in a_toks2segs.iterkeys() + for t in t_set])) + seg_toks.sort(key=lambda el: el[0]) # align tokens if necessary seg_t2tree_t = None if tree_toks != seg_toks: seg_t2tree_t = dict() - alignment = nw_align(seg_toks, tree_toks, substitute = SUBSTITUTEF, keep_deleted = True) + alignment = nw_align(seg_toks, tree_toks, + substitute=score_substitute, + keep_deleted=True) for i, tt in enumerate(alignment): seg_t2tree_t[seg_toks[i]] = [tree_toks[j] for j in tt] # for each segment look if its corresponding token set is matched by # any other subtree translated_toks = None for toks, segs in a_toks2segs.iteritems(): - translated_toks = _translate_toks(toks, seg_t2tree_t) + translated_toks = translate_toks(toks, seg_t2tree_t) key = None if translated_toks in a_toks2trees: key = translated_toks else: - translated_toks = _prune_punc(translated_toks) + translated_toks = prune_punc(translated_toks) if translated_toks in a_toks2trees: key = translated_toks if key: for tree in a_toks2trees[key]: # if tree2seg[tree] is not None: # continue - assert tree2seg[tree] is None, "Multiple segments found for tree" + repr(tree) + ": " +\ + assert tree2seg[tree] is None, \ + "Multiple segments found for tree" + repr(tree) + ": " + \ repr(segs[-1]) + "; " + repr(tree2seg[tree]) tree2seg[tree] = segs[-1] return tree2seg + def featgen(a_tree): """Generate features for the given BitPar tree. - @param a_tree - BitPar tree for which we should generate features + Args: + a_tree (dsegmenter.bparseg.constituency_tree.CTree): BitPar tree + for which we should generate features - @return list of string features + Returns: + list: string features """ assert a_tree.leaves(), "Tree does not contain leaves." @@ -313,20 +235,25 @@ def featgen(a_tree): # add label of the parent tree return ret -def classify(a_classifier, a_featgen, a_el, a_default = None): + +def classify(a_classifier, a_featgen, a_el, a_default=None): """Classify given element. 
- @param a_classifier - model which should make predictions - @param a_featgen - feature generation function - @param a_el - constituency tree to be classified - @param a_default - default element that should be returned if el does + Args: + a_classifier - model which should make predictions + a_featgen - feature generation function + a_el - constituency tree to be classified + a_default - default element that should be returned if el does not yield segment - @return assigned class + Returns: + str: assigned class """ prediction = a_classifier.predict(a_featgen(a_el))[0] - return a_default if prediction is None or prediction == NONE else prediction + return a_default if prediction is None or \ + prediction == NONE else prediction + ################################################################## # Class @@ -335,19 +262,20 @@ class BparSegmenter(object): """ - #: classifier object: default classification method - DEFAULT_CLASSIFIER = LinearSVC(C = 0.3, multi_class = 'crammer_singer') + #:classifier object: default classification method + DEFAULT_CLASSIFIER = LinearSVC(C=0.3, multi_class='crammer_singer') #:str: path to default model to use in classification - DEFAULT_MODEL = os.path.join(os.path.dirname(__file__), "data", "bpar.model") + DEFAULT_MODEL = os.path.join(os.path.dirname(__file__), "data", + "bpar.model") #:pipeline object: default pipeline object used for classification DEFAULT_PIPELINE = Pipeline([('vectorizer', DictVectorizer()), - ('var_filter', VarianceThreshold()), - ('LinearSVC', DEFAULT_CLASSIFIER)]) + ('var_filter', VarianceThreshold()), + ('LinearSVC', DEFAULT_CLASSIFIER)]) - def __init__(self, a_featgen = featgen, a_classify = classify, \ - a_model = DEFAULT_MODEL): + def __init__(self, a_featgen=featgen, a_classify=classify, + a_model=DEFAULT_MODEL): """Class constructor. 
Args: @@ -370,23 +298,24 @@ def segment(self, a_trees): a_trees (list): list of sentence trees to be parsed Returns: - iterator: constructed segment trees + list: constructed segment trees """ seg_idx = 0 segments = [] isegment = None if self.model is None: - return [DiscourseSegment(a_name = DEFAULT_SEGMENT, a_leaves = t.leaves) \ - for t in a_trees] + return [DiscourseSegment(a_name=DEFAULT_SEGMENT, a_leaves=t.leaves) + for t in a_trees] for t in a_trees: self._segmenter.segment(t, segments) # if classifier failed to create one common segment for # the whole tree, create one for it if (len(segments) - seg_idx) > 1 or \ - (len(segments) and not isinstance(segments[-1][-1], DiscourseSegment)): - isegment = DiscourseSegment(a_name = DEFAULT_SEGMENT, \ - a_leaves = segments[seg_idx:]) + (len(segments) and not isinstance(segments[-1][-1], + DiscourseSegment)): + isegment = DiscourseSegment(a_name=DEFAULT_SEGMENT, + a_leaves=segments[seg_idx:]) segments[seg_idx:] = [(isegment.leaves[0][0], isegment)] seg_idx = len(segments) return segments @@ -427,22 +356,29 @@ def test(self, a_trees, a_segments): """ if self.model is None: return (0, 0) - segments = [self.model.predict(self.featgen(itree))[0] for itree in a_trees] + segments = [self.model.predict(self.featgen(itree))[0] + for itree in a_trees] a_segments = [str(s) for s in a_segments] - _, _, macro_f1, _ = precision_recall_fscore_support(a_segments, segments, average='macro', \ - warn_for = ()) - _, _, micro_f1, _ = precision_recall_fscore_support(a_segments, segments, average='micro', \ - warn_for = ()) + _, _, macro_f1, _ = precision_recall_fscore_support(a_segments, + segments, + average='macro', + warn_for=()) + _, _, micro_f1, _ = precision_recall_fscore_support(a_segments, + segments, + average='micro', + warn_for=()) return (macro_f1, micro_f1) def _train(self, a_feats, a_segs, a_model): """Train segmenter model. - @param a_feats - list of BitPar featuress - @param a_segs - list of discourse segments - @param a_model - model object whose parameters should be fit + Args: + a_feats (list): BitPar featuress + a_segs (list): discourse segments + a_model: model object whose parameters should be fit - @return \c void + Returns: + void: """ # train classifier @@ -452,21 +388,26 @@ def _train(self, a_feats, a_segs, a_model): def _update_segmenter(self, a_model): """Update model, decision function, and internal segmenter. 
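Example (a sketch of the segmenter's public API after this change; it assumes the bundled default model is present and readable, otherwise a custom path can be passed via ``a_model``)::

    import codecs

    from dsegmenter.bparseg import BparSegmenter, read_trees

    segmenter = BparSegmenter()    # uses BparSegmenter.DEFAULT_MODEL
    with codecs.open("doc.exb.bpar", encoding="utf-8") as bpar_file:
        trees = list(read_trees(bpar_file.readlines()))
    # returns the constructed segment trees for the whole document
    segments = segmenter.segment(trees)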
- @param a_model - model used by classifier + Args: + a_model: model used by classifier - @return \c void + Returns: + void: """ if a_model is None: self.model = a_model self.decfunc = lambda el: None - self._segmenter = TreeSegmenter(a_decfunc = self.decfunc, a_type = CONSTITUENCY) + self._segmenter = TreeSegmenter(a_decfunc=self.decfunc, + a_type=CONSTITUENCY) return elif isinstance(a_model, str): if not os.path.isfile(a_model) or not os.access(a_model, os.R_OK): - raise RuntimeError("Can't create model from file {:s}".format(a_model)) + raise RuntimeError("Can't create model from" + " file {:s}".format(a_model)) self.model = joblib.load(a_model) else: self.model = a_model self.decfunc = lambda el: self.classify(self.model, self.featgen, el) - self._segmenter = TreeSegmenter(a_decfunc = self.decfunc, a_type = CONSTITUENCY) + self._segmenter = TreeSegmenter(a_decfunc=self.decfunc, + a_type=CONSTITUENCY) diff --git a/dsegmenter/bparseg/constants.py b/dsegmenter/bparseg/constants.py index 8192905..8b47d83 100644 --- a/dsegmenter/bparseg/constants.py +++ b/dsegmenter/bparseg/constants.py @@ -3,22 +3,24 @@ ################################################################## # Documentation -""" -Module defining necessary constants for that package. +"""Module defining necessary constants for that package. + +Attributes: + ENCODING (str): default encoding used for strings + NO_PARSE_RE (re): regexp that matches sentences for which no BitPar + tree was generated + WORD_SEP (re): regexp matching word delimiters -Constants: -ENCODING - default encoding used fro stringd -NO_PARSE_RE - regexp that matches sentences for which no BitPar tree - was generated -WORD_SEP - """ ################################################################## # Imports import re + ################################################################## # Constants ENCODING = "utf-8" -NO_PARSE_RE = re.compile("""\s*No\s+parse\s+for\s*:\s*"([^\n]+)"$""", re.IGNORECASE) +NO_PARSE_RE = re.compile("""\s*No\s+parse\s+for\s*:\s*"([^\n]+)"$""", + re.IGNORECASE) WORD_SEP = re.compile("\s+") diff --git a/dsegmenter/bparseg/constituency_tree.py b/dsegmenter/bparseg/constituency_tree.py index a8000e0..ed06ec9 100644 --- a/dsegmenter/bparseg/constituency_tree.py +++ b/dsegmenter/bparseg/constituency_tree.py @@ -6,11 +6,14 @@ """ Module providing class for handling constituency syntax trees. -Classes: -Tree - meta-subclass of NLTK tree which allows hashing -CTree - interface for handling constituency trees +Attributes: + OP (str): special token used to substitute opening parentheses + OP_RE (re): regexp for matching opening parentheses + CP (str): special token used to substitute closing parentheses + CP_RE (re): regexp for matching closing parentheses + +.. moduleauthor:: Wladimir Sidorenko -@author: Wladimir Sidorenko """ ################################################################## @@ -30,22 +33,24 @@ CP = "-CP-" CP_RE = re.compile(r"\\\)") + ################################################################## # Classes class Tree(nltk.tree.ParentedTree): - """ - Direct subclass of nltk.tree.ParentedTree providing hashing. + """Direct subclass of nltk.tree.ParentedTree providing hashing. 
+ + This class extends its parent by an additional method :meth:`__hash__`, + which uses the standard :meth:`id` method and allows the objects to be + stored in hashes, and also overwrites the method :meth:`prnt_label`, + returning the label of the parent tree - This class extends its parent by two additional methods: - __hash__() - which uses the standard id() method and makes - NLTK trees prnt_label() - return label of the parent tree """ def __init__(self, *args): - """ - Class constructor (simply delegates to super-class). + """Class constructor (simply delegates to super-class). - @param args - list of arguments which should be passed to the parent + Args: + args (list): arguments which should be passed to the parent """ if len(args) == 0: @@ -54,8 +59,8 @@ def __init__(self, *args): super(Tree, self).__init__(*args) def __hash__(self): - """ - Return id of this object. + """Return id of this object. + """ return id(self) @@ -70,6 +75,7 @@ def prnt_label(self): return self._parent.label() return "" + ################################################################## class CTree(Tree): """Class for reading and modifying constituency trees. @@ -82,7 +88,7 @@ class CTree(Tree): """ @classmethod - def parse_lines(cls, a_lines, a_one_per_line = False): + def parse_lines(cls, a_lines, a_one_per_line=False): """Parse input lines and return list of BitPar trees. Args: @@ -121,12 +127,14 @@ def parse_lines(cls, a_lines, a_one_per_line = False): @classmethod def _get_segments(cls, a_line): - """ - Split line into separate segments. + """Split line into separate segments. - @param a_line - line to be split + Args: + a_line (str): line to be split + + Returns: + list: segments - @return list of segments """ seg = "" ob = 0 @@ -136,7 +144,8 @@ def _get_segments(cls, a_line): if c == "(" and i < max_len and not WORD_SEP.match(a_line[i + 1]): ob += 1 elif c == ")": - assert ob > 0, "Unmatched closing bracket in line" + repr(a_line) + assert ob > 0, \ + "Unmatched closing bracket in line" + repr(a_line) ob -= 1 seg += c if ob == 0 and not WORD_SEP.match(seg): @@ -146,8 +155,7 @@ def _get_segments(cls, a_line): return segments def __init__(self): - """ - Class constructor. + """Class constructor. """ pass diff --git a/dsegmenter/bparseg/data/bpar.model b/dsegmenter/bparseg/data/bpar.model index 5ef2026..35b9285 100644 Binary files a/dsegmenter/bparseg/data/bpar.model and b/dsegmenter/bparseg/data/bpar.model differ diff --git a/dsegmenter/common.py b/dsegmenter/common.py new file mode 100644 index 0000000..ccbfe8f --- /dev/null +++ b/dsegmenter/common.py @@ -0,0 +1,160 @@ +#!/usr/bin/env python2.7 +# -*- mode: python; coding: utf-8; -*- + +################################################################## +# Documentation +"""Module defining methods common to many modules. 
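Example (parsing raw BitPar output into the hashable tree objects used above; the bracketing below is illustrative, not an actual BitPar dump)::

    from dsegmenter.bparseg import CTree

    lines = ["(TOP (NP (ART Die) (NN Sonne)) (VVFIN scheint))"]
    for ctree in CTree.parse_lines(lines, a_one_per_line=True):
        print ctree.label(), ctree.leaves()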
+ +Attributes: + _ispunct (method): check if word consists only of punctuation characters + prune_punc (method): remove tokens representing punctuation from set + read_segments (method): default method for reading segment files + score_substitute (method): custom weighting function used for token alignment + translate_toks (method): replace tokens and return updated set + +""" + +################################################################## +# Imports +import string + + +################################################################## +# Constants +DEPS = "deps" +NONE = str(None) +REL = "rel" +TAG = "tag" +WORD = "word" + + +################################################################## +# Methods +def _ispunct(a_word): + """Check if word consists only of punctuation characters. + + Args: + a_word (str): word to check + + Returns: + bool: True if word consists only of punctuation characters, + False otherwise + + """ + return all(c in string.punctuation for c in a_word) + + +def prune_punc(a_toks): + """Remove tokens representing punctuation from set. + + Args: + a_toks (iterable): original tokens + + Returns: + frozenset: tokens without punctuation marks + + """ + return frozenset([tok for tok in a_toks if not _ispunct(tok[-1])]) + + +def read_segments(a_lines): + """Read file and return a list of segment dictionaries. + + Args: + a_lines (list): decoded lines of the input file + + Returns: + dict: mapping from tokens to segments + + """ + segs2toks = {} + s_c = t_c = 0 + tokens = [] + atoks = [] + new_seg = None + active_tokens = set() + active_segments = [] + # read segments + for iline in a_lines: + iline = iline.strip() + if not iline: + continue + # do some clean-up + active_tokens.clear() + del atoks[:] + del active_segments[:] + tokens = iline.split() + # establish correspondence between tokens and segments + for tok in tokens: + if tok[0] == '(' and len(tok) > 1: + active_tokens = set(atoks) + del atoks[:] + for a_s in active_segments: + segs2toks[a_s].update(active_tokens) + new_seg = (s_c, tok[1:]) + active_segments.append(new_seg) + segs2toks[new_seg] = set() + s_c += 1 + continue + elif tok == ')': + assert active_segments, \ + "Unbalanced closing parenthesis at line: " + repr(iline) + active_tokens = set(atoks) + del atoks[:] + for a_s in active_segments: + segs2toks[a_s].update(active_tokens) + active_segments.pop() + continue + else: + atoks.append((t_c, tok)) + t_c += 1 + assert not active_segments, \ + "Unbalanced opening parenthesis at line: " + repr(iline) + toks2segs = dict() + segments = segs2toks.keys() + segments.sort(key=lambda el: el[0]) + for seg in segments: + toks = frozenset(segs2toks[seg]) + # it can be same tokenset corresponds to multiple segments, in that + # case we leave the first one that we encounter + if toks in toks2segs: + continue + assert toks not in toks2segs, \ + "Multiple segments correspond to the same tokenset: '" + \ + repr(toks) + "': " + repr(seg) + ", " + repr(toks2segs[toks]) + toks2segs[toks] = seg + return toks2segs + + +def score_substitute(a_c1, a_c2): + """Score substitution of two characters. + + Args: + a_c1 (str): first word to compare + a_c2 (str): second word to compare + + Returns: + int: 2 if the last characters of both words are equal, -3 otherwise + + """ + return 2 if a_c1[-1] == a_c2[-1] else -3 + + +def translate_toks(a_toks, a_translation): + """Translate tokens and return translated set. 
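Example (the helpers operate on ``(position, word)`` tuples; the values below are chosen for illustration)::

    from dsegmenter.common import prune_punc, score_substitute, translate_toks

    toks = frozenset([(0, "Das"), (1, "stimmt"), (2, ".")])
    prune_punc(toks)                            # drops the punctuation token (2, ".")
    score_substitute((0, "Das"), (5, "Das"))    # -> 2  (same word form)
    score_substitute((0, "Das"), (5, "Haus"))   # -> -3
    translate_toks(frozenset([(0, "Das")]),
                   {(0, "Das"): [(3, "Das")]})  # -> frozenset([(3, "Das")])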
+ + Args: + a_toks (iterable): tokens to be translated + a_translation (dict): - translation dictionary for tokens + + Returns: + frozenset: translated tokens + + """ + if a_translation is None: + return a_toks + ret = set() + for tok in a_toks: + for t_tok in a_translation[tok]: + ret.add(t_tok) + return frozenset(ret) diff --git a/dsegmenter/edseg/__init__.py b/dsegmenter/edseg/__init__.py index 015917f..a6cc262 100644 --- a/dsegmenter/edseg/__init__.py +++ b/dsegmenter/edseg/__init__.py @@ -4,28 +4,27 @@ ################################################################## # Documentation -""" -Package providing rule-based discourse segmenter for CONLL dependency trees. - -Modules: -chunking - routines for internal clause segmentation -clause_segmentation - rule-based clause segmenter -conll - interface for dealing with CONLL data -data - data definitions and data reading routines -edssegmenter - definition of rule-based discourse segmenter -finitestateparsing - parsing routines based on finite-state mechanisms -util - auxiliary match routines needed for rule matching +"""Package providing rule-based discourse segmenter for CONLL trees. -@author = Wladimir Sidorenko (Uladzimir Sidarenka) -@mail = -@version = 0.0.1 +Attributes: + chunking (module): routines for internal clause segmentation + clause_segmentation (module): utilities and classes for rule-based clause + segmenter + conll (module): interface for dealing with CONLL data + data (module): data definitions and data reading routines + edssegmenter (module): definition of rule-based discourse segmenter + finitestateparsing (module): parsing routines based on finite-state + mechanisms + util (module): auxiliary match routines needed for rule matching """ ################################################################## # Imports -from .conll import CONLL -from .edssegmenter import EDSSegmenter +from __future__ import absolute_import + +from dsegmenter.edseg.conll import CONLL +from dsegmenter.edseg.edssegmenter import EDSSegmenter ################################################################## # Variables and Constants diff --git a/dsegmenter/edseg/chunking.py b/dsegmenter/edseg/chunking.py index 2dccc4f..f49db9d 100644 --- a/dsegmenter/edseg/chunking.py +++ b/dsegmenter/edseg/chunking.py @@ -19,31 +19,36 @@ UnificationFailure - exception raise on non-merged feature bits @author = Jean VanCoppenolle, Wladimir Sidorenko (Uladzimir Sidarenka) -@mail = , +@mail = , + """ ################################################################## # Libraries +from __future__ import unicode_literals + from .finitestateparsing import constraint, FiniteStateParser from copy import deepcopy import sys + ################################################################## # Methods def catgetter(token): - return token['pos'] + return token["pos"] + ################################################################## # Exceptions class UnificationFailure(Exception): pass + ################################################################## class FeatureMatrix(object): - """ - Class for converting CONLL features to bit matrices. + """Class for converting CONLL features to bit matrices. 
Class constants: FEATS - nominal names of the features @@ -54,23 +59,25 @@ class FeatureMatrix(object): Public methods: from_string - initiate feature matrix from string representation - from_dict - initiate feature matrix from dictionary of feature names and values + from_dict - initiate feature matrix from dictionary of feature names and + values unify - make an intersection of features in the current matrix with the features from another instance unifies - check if the intersection of the current feature matrix with the matrix from another instance is not empty + """ FEATS = [ - 'nom', - 'acc', - 'dat', - 'gen', - 'sg', - 'pl', - 'masc', - 'fem', - 'neut', + "nom", + "acc", + "dat", + "gen", + "sg", + "pl", + "masc", + "fem", + "neut", ] _FEAT_INDICES = dict((feat, idx) for (idx, feat) in enumerate(FEATS)) @@ -117,14 +124,14 @@ def from_dict(cls, feat_dict): return cls([v for v in feat_dict.itervalues()]) def unify(self, other): - """ - Intersect features in the current matrix with the features from another instance + """Intersect current features with the features from another instance @param other - another FeatureMatrix instance @return this FeatureMatrix instance + """ - if not hasattr(other, '_bits'): + if not hasattr(other, "_bits"): return False bits = self._bits & other._bits if not self._unified(bits): @@ -133,14 +140,14 @@ def unify(self, other): return self def unifies(self, other): - """ - Check if intersection of the current instance with another instance is not empty + """Check if intersection with another instance is not empty @param other - another FeatureMatrix instance @return this FeatureMatrix instance + """ - if not hasattr(other, '_bits'): + if not hasattr(other, "_bits"): return False return self._unified(self._bits & other._bits) @@ -150,6 +157,7 @@ def _unified(self, bits): def __str__(self): return bin(self._bits)[2:] + ################################################################## class Chunker(object): """ @@ -181,54 +189,54 @@ def chunk(self, sent): # make a deep copy of sentence, in order not to use it destructively isent = deepcopy(sent) for token in isent: - if token['pos'] in ('ART', 'NE', 'NN'): - if isinstance(token['feat'], basestring): - token['feat'] = FeatureMatrix.from_string(token['feat']) - elif isinstance(token['feat'], dict): - token['feat'] = FeatureMatrix.from_dict(token['feat']) + if token["pos"] in ("ART", "NE", "NN"): + if isinstance(token["feat"], basestring): + token["feat"] = FeatureMatrix.from_string(token["feat"]) + elif isinstance(token["feat"], dict): + token["feat"] = FeatureMatrix.from_dict(token["feat"]) return self._parser.parse(isent, catgetter=catgetter) def _setup_parser(self): add_rule = self._parser.add_rule - add_rule('NC', - ''' - - ''', - level=1) + add_rule("NC", + """ + + """, + level=1) @constraint def nc_month_spec_constraint(match): - if match[2][0]['lemma'] not in ('Anfang', 'Mitte', 'Ende'): + if match[2][0]["lemma"] not in ("Anfang", "Mitte", "Ende"): return False - return match[3][0]['lemma'] in ('Januar', - 'Februar', - u'März', - 'Maerz', - 'April', - 'Mai', - 'Juni', - 'Juli', - 'August', - 'September', - 'Oktober', - 'November', - 'Dezember') - - add_rule('NC', - ''' - (?: - ^ - | - [^] - ) - ( - () - () - ) - ''', - constraint=nc_month_spec_constraint, - group=1, level=1) + return match[3][0]["lemma"] in ("Januar", + "Februar", + "März", + "Maerz", + "April", + "Mai", + "Juni", + "Juli", + "August", + "September", + "Oktober", + "November", + "Dezember") + + add_rule("NC", + """ + (?: + ^ + | + [^] + ) 
+ ( + () + () + ) + """, + constraint=nc_month_spec_constraint, + group=1, level=1) @constraint def nc_det_noun_agreement(match): @@ -237,131 +245,131 @@ def nc_det_noun_agreement(match): return True noun = match[2][0] try: - if hasattr(noun['feat'], 'unify'): - noun['feat'].unify(det[0]) + if hasattr(noun["feat"], "unify"): + noun["feat"].unify(det[0]) else: return False except UnificationFailure: return False return True - add_rule('NC', - ''' - (?: - ()? - | - [] - )? - (?: - (?: - ? - ) - (?: - <$,> - (?: - ? - ) + add_rule("NC", + """ + (?: + ()? + | + [] + )? + (?: + (?: + ? + ) + (?: + <$,> + (?: + ? + ) )* - )? - ([]) - ''', - constraint=nc_det_noun_agreement, - level=1) - - add_rule('NC', - ''' - (?: - ? - | - [] - ) - - - ''', - level=3) - - add_rule('PC', - ''' - # preposition - ? - - ''', - level=2) + )? + ([]) + """, + constraint=nc_det_noun_agreement, + level=1) + + add_rule("NC", + """ + (?: + ? + | + [] + ) + + + """, + level=3) + + add_rule("PC", + """ + # preposition + ? + + """, + level=2) @constraint def pc_genitive_adjunct_constraint(match): node = match[1][0] - if node.last_child['pos'] != 'NN': + if node.last_child["pos"] != "NN": return False art = node.first_child - if art is None or art['pos'] != 'ART': + if art is None or art["pos"] != "ART": return False - if (not 'feats' in art) or (not hasattr(art['feats'], 'unifies')): + if "feat" not in art or not hasattr(art["feat"], "unifies"): return False - return art['feats'].unifies(FeatureMatrix('gen')) - - add_rule('PC', - ''' - [] - - (?: - - - )* - () - (?: - - | - [] # optional circumposition or pronominal adverb - )? - ''', - constraint=pc_genitive_adjunct_constraint, - level=2) - - add_rule('PC', - ''' - [] # preposition - ? # optional embedded preposition ("bis an das Ende") - (?: - # adverbial chunk ("von damals") - (?: # optional conjunction - - - )? - | - # cardinal ("bis 1986") - (?: # optional conjunction - - - )? - | - # noun chunk - (?: # optional conjunction - - - )? - ) - []? # optional circumposition or pronominal adverb - ''', - level=2) - - add_rule('AC', - ''' - * - ? - + - ''', - level=3) - - add_rule('AC', - ''' - ? - + - ''', - level=3) - - add_rule('AC', - ''' - - ''', - level=3) + return art["feat"].unifies(FeatureMatrix("gen")) + + add_rule("PC", + """ + [] + + (?: + + + )* + () + (?: + + | + [] + )? + """, + constraint=pc_genitive_adjunct_constraint, + level=2) + + add_rule("PC", + """ + [] # preposition + ? # ("bis an das Ende") + (?: + # adverbial chunk ("von damals") + (?: # optional conjunction + + + )? + | + # cardinal ("bis 1986") + (?: # optional conjunction + + + )? + | + # noun chunk + (?: # optional conjunction + + + )? + ) + []? # optional pronominal adverb + """, + level=2) + + add_rule("AC", + """ + * + ? + + + """, + level=3) + + add_rule("AC", + """ + ? + + + """, + level=3) + + add_rule("AC", + """ + + """, + level=3) diff --git a/dsegmenter/edseg/clause_segmentation.py b/dsegmenter/edseg/clause_segmentation.py index 62f4898..36c53e8 100644 --- a/dsegmenter/edseg/clause_segmentation.py +++ b/dsegmenter/edseg/clause_segmentation.py @@ -1,31 +1,24 @@ #!/usr/bin/env python # -*- mode: python; coding: utf-8; -*- -""" -Module providing rule-based clause segmenter - -Constants: - -Methods: -catgetter - method for obtaining category of CONLL node +"""Module providing rule-based clause segmenter -Classes: -ClauseSegmenter - class for doing clause segmentation - -@author = Jean VanCoppenolle, Wladimir Sidorenko (Uladzimir Sidarenka) -@mail = , +.. 
moduelauthor:: Jean VanCoppenolle """ ################################################################## # Libraries -from .chunking import Chunker -from .finitestateparsing import FiniteStateParser, Tree -from .util import match as match_ -from .data import DELIMS, DELIM_NAMES, finite_verbs +from __future__ import absolute_import + +from dsegmenter.edseg.chunking import Chunker +from dsegmenter.edseg.finitestateparsing import FiniteStateParser, Tree +from dsegmenter.edseg.util import match as match_ +from dsegmenter.edseg.data import DELIMS, DELIM_NAMES, finite_verbs import sys + ################################################################## # Methods def catgetter(node): @@ -43,25 +36,23 @@ def catgetter(node): return DELIM_NAMES[form] return node['pos'] + ################################################################## # Class class ClauseSegmenter(object): - """ - Class for perfoming discourse segmentation on CONLL dependency trees. + """Class for perfoming discourse segmentation on CONLL dependency trees. - Instance variables: - _chunker - internal rule-based clause chunker - _parser - internal finite-state parser + Attributes: + _chunker: internal rule-based clause chunker + _parser: internal finite-state parser - Public methods: - segment - perform discourse segmentation of the CONLL sentence """ def __init__(self, **kwargs): - """ - Class constructor. + """Class constructor. @param a_chunker - clause chunker to use + """ chunker = kwargs.get('chunker') if chunker is None: @@ -72,12 +63,12 @@ def __init__(self, **kwargs): self._setup_parser() def segment(self, sent): - """ - Method for segmenting CONLL trees. + """Method for segmenting CONLL trees. @param sent - CONLL tree to process @return sentence-level discourse segment + """ self._prepare_tokens(sent) chunk_tree = self._chunker.chunk(sent) @@ -108,108 +99,108 @@ def _setup_parser(self): ########## define('VFIN', - ''' - - - - ''') + ''' + + + + ''') define('VINF', - ''' - - - - - - - - ''') + ''' + + + + + + + + ''') define('V', - ''' - - - - - - - - - - - ''') + ''' + + + + + + + + + + + ''') define('PUNCT', - ''' - <$,> - <$(> - <$.> - ''') + ''' + <$,> + <$(> + <$.> + ''') define('EOS', - ''' - <$.> - <$,> - ''') + ''' + <$.> + <$,> + ''') define('DASH', - ''' - - - - - ''') + ''' + + + + + ''') define('VG', - ''' - - - ''') + ''' + + + ''') define('CLAUSE', - ''' - - - - - - - ''') + ''' + + + + + + + ''') define('BASIC_CONTENT', - ''' - (?: - [^%PUNCT%%VG%] - (?: - ? - [^%PUNCT%%VG%] - )? - )* - ''') + ''' + (?: + [^%PUNCT%%VG%] + (?: + ? + [^%PUNCT%%VG%] + )? + )* + ''') define('CONTENT', - ''' - (?: - [^%PUNCT%%VG%] - (?: - [<$,>] - [^%PUNCT%%VG%] - | - [%CLAUSE%] - <$,> - )? - )* - ''') + ''' + (?: + [^%PUNCT%%VG%] + (?: + [<$,>] + [^%PUNCT%%VG%] + | + [%CLAUSE%] + <$,> + )? + )* + ''') define('BASIC_TRAILER', - ''' - (?: - [] - [^%PUNCT%%VG%]+ - | - - ) - ''') + ''' + (?: + [] + [^%PUNCT%%VG%]+ + | + + ) + ''') ########################## # Parenthesized segments # @@ -242,104 +233,110 @@ def get_verb(match, group=0): return aux add_rule('FVG', - ''' - ( - ? - [%VINF%]+ - [%VFIN%] - | - # gehen !!! added by W. Sidorenko (remove if it causes errors) - # lassen or simply `gehen' in case of tagging mistakes - ) - (?: - [^] - | - $ - ) - ''', - group=1, - feats=lambda match: {'verb': get_verb(match, group=1)}, - level=5) + ''' + ( + ? + [%VINF%]+ + [%VFIN%] + | + # gehen !!! added by W. 
Sidorenko (remove if it causes errors) + # lassen or simply `gehen' in case of tagging mistakes + + + ) + (?: + [^] + | + $ + ) + ''', + group=1, + feats=lambda match: {'verb': get_verb(match, group=1)}, + level=5) add_rule('FVG', - ''' - (?: - # ausgenommen - # werden - [%VFIN%] # soll - | - [%VFIN%] # soll - # ausgenommen - # werden - | - [%VFIN%] - [%VINF%]* - ) - ''', - feats=lambda match: {'verb': get_verb(match)}, - level=5) + ''' + (?: + # ausgenommen + # werden + [%VFIN%] # soll + | + [%VFIN%] # soll + # ausgenommen + # werden + | + [%VFIN%] + [%VINF%]* + ) + ''', + feats=lambda match: {'verb': get_verb(match)}, + level=5) add_rule('IVG', - ''' - [%VINF%]* - ? - [%VINF%]+ - ''', - feats=lambda match: {'verb': get_verb(match)}, - level=5) + ''' + [%VINF%]* + ? + [%VINF%]+ + ''', + feats=lambda match: {'verb': get_verb(match)}, + level=5) ################################ # Basic clauses (no embedding) # ################################ add_rule('RelCl', - ''' - ? # optional preposition - [] # relative pronoun - %BASIC_CONTENT% # clause content - ( - [%VG%] # verb group (error tolerance: should actually be finite) - ) - %BASIC_TRAILER%? # optional trailer - [%EOS%]? # optional end of sentence punctuation - ''', - feats=lambda match: {'verb': match[1][0].get('verb')}, - level=6) + ''' + ? # optional preposition + [] # relative pronoun + %BASIC_CONTENT% # clause content + ( + # verb group (error tolerance: should actually be finite) + [%VG%] + ) + %BASIC_TRAILER%? # optional trailer + # optional end of sentence punctuation + [%EOS%]? + ''', + feats=lambda match: {'verb': match[1][0].get('verb')}, + level=6) add_rule('RelCl', - ''' - # conjunction - ( - ? # optional preposition - [] # relative pronoun - %BASIC_CONTENT% # clause content - ( - [%VG%] # verb group (error tolerance: should actually be finite) - ) - %BASIC_TRAILER%? # optional trailer - [%EOS%]? # optional end of sentence punctuation - ) - ''', - group=1, - feats=lambda match: {'verb': match[2][0].get('verb')}, - level=7) + ''' + # conjunction + ( + ? # optional preposition + [] # relative pronoun + %BASIC_CONTENT% # clause content + ( + # verb group (error tolerance: should actually be finite) + [%VG%] + ) + %BASIC_TRAILER%? # optional trailer + [%EOS%]? # optional end of sentence punctuation + ) + ''', + group=1, + feats=lambda match: {'verb': match[2][0].get('verb')}, + level=7) add_rule('RelCl', - ''' - # relative clause - # conjunction - ( - %BASIC_CONTENT% # clause content - ( - [%VG%] # verb group (error tolerance: should actually be finite) - ) - %BASIC_TRAILER%? # optional trailer - [%EOS%]? # optional end of sentence punctuation - ) - ''', - group=1, - feats=lambda match: {'verb': match[2][0].get('verb')}, - level=7) + ''' + # relative clause + # conjunction + ( + %BASIC_CONTENT% # clause content + ( + # verb group (error tolerance: should actually be finite) + [%VG%] + ) + %BASIC_TRAILER%? # optional trailer + [%EOS%]? # optional end of sentence punctuation + ) + ''', + group=1, + feats=lambda match: {'verb': match[2][0].get('verb')}, + level=7) def complex_that(match): tokens = list(match[1][0].iter_terminals()) @@ -1160,139 +1157,144 @@ def get_verb_feats(match): level=17) add_rule('MainCl', - ''' - ( - (?: - ^ - - )? - (?: - [^] - (?: - - [^] - )? - )* - ( - - ) - [^%VG%%CLAUSE%%DASH%%PUNCT%]* - (?: - [%CLAUSE%] - <$,> - [^%VG%%CLAUSE%%DASH%%PUNCT%]* - )* - ( - - )? - [%PUNCT%] - ) - - ''', group=1, level=17) + ''' + ( + (?: + ^ + + )? + (?: + [^] + (?: + + [^] + )? 
+ )* + ( + + ) + [^%VG%%CLAUSE%%DASH%%PUNCT%]* + (?: + [%CLAUSE%] + <$,> + [^%VG%%CLAUSE%%DASH%%PUNCT%]* + )* + ( + + )? + [%PUNCT%] + ) + + ''', group=1, level=17) add_rule('MainCl', - ''' - ^ - (?: - + - (?: <$,> | <$(> ) - )? - ( - - ) - [%CLAUSE%]+ - <$.>? - ''', - feats=lambda match: match[1][0].feats, - level=17) + ''' + ^ + (?: + + + (?: <$,> | <$(> ) + )? + ( + + ) + [%CLAUSE%]+ + <$.>? + ''', + feats=lambda match: match[1][0].feats, + level=17) add_rule('MainCl', - ''' - (?: - ^ - - )? - (?: - [^%DASH%%PUNCT%] - (?: - - [^] - )? - | - [^%VG%%CLAUSE%%DASH%%PUNCT%] - )* - ( - - ) - [^%VG%%CLAUSE%%DASH%%PUNCT%]* - (?: - [%CLAUSE%] - [%PUNCT%]? - )* - [^%VG%%CLAUSE%%DASH%%PUNCT%]* - ( - - )? - ( - # either a non-finite verb group - | # or (error tolerance) - # a finite verb group if - [%PUNCT%] # immediately followed by punctuation - )? - (?: - [%DASH%%PUNCT%] - (?: - [^%VG%%DASH%%PUNCT%]+ - (?: - $ - | - [%DASH%%PUNCT%] - ) - )? - | - # [^%VG%%CLAUSE%%DASH%%PUNCT%]* # commented because of errors - (?: - [%CLAUSE%] - <$,>? - )* - )? - (?: <$.> | <$,>)? - ''', + ''' + (?: + ^ + + )? + (?: + [^%DASH%%PUNCT%] + (?: + + [^] + )? + | + [^%VG%%CLAUSE%%DASH%%PUNCT%] + )* + ( + + ) + [^%VG%%CLAUSE%%DASH%%PUNCT%]* + (?: + [%CLAUSE%] + [%PUNCT%]? + )* + [^%VG%%CLAUSE%%DASH%%PUNCT%]* + ( + + )? + ( + # either a non-finite verb group + # or (error tolerance) + + | + # a finite verb group if + + # immediately followed by punctuation + [%PUNCT%] + )? + (?: + [%DASH%%PUNCT%] + (?: + [^%VG%%DASH%%PUNCT%]+ + (?: + $ + | + [%DASH%%PUNCT%] + ) + )? + | + # commented because of errors + # [^%VG%%CLAUSE%%DASH%%PUNCT%]* + (?: + [%CLAUSE%] + <$,>? + )* + )? + (?: <$.> | <$,>)? + ''', feats=get_verb_feats, level=18) add_rule('MainCl', - ''' - ( - - ) - (?: - [<$,>] - - )+ - ''', - feats=lambda match: match[1][0].feats, - level=19) + ''' + ( + + ) + (?: + [<$,>] + + )+ + ''', + feats=lambda match: match[1][0].feats, + level=19) add_rule('MainCl', - ''' - - ( - ( - [^%VG%%CLAUSE%%DASH%%PUNCT%] - | - [^%VG%%CLAUSE%%DASH%%PUNCT%] - <$,> - )+ - [%EOS%] - ) - ''', group=1, + ''' + + ( + ( + [^%VG%%CLAUSE%%DASH%%PUNCT%] + | + [^%VG%%CLAUSE%%DASH%%PUNCT%] + <$,> + )+ + [%EOS%] + ) + ''', group=1, feats=lambda match: {'makeVerbLess': True}, level=20) # Catch-all rule (fallback). add_rule('ANY', - ''' - [^]+ - ''', - level=21) + ''' + [^]+ + ''', + level=21) diff --git a/dsegmenter/edseg/conll.py b/dsegmenter/edseg/conll.py index e9d5562..eeee20f 100644 --- a/dsegmenter/edseg/conll.py +++ b/dsegmenter/edseg/conll.py @@ -3,28 +3,30 @@ ################################################################## # Documentation -""" -This module provides a convenient interface for handling CONLL data. +"""This module provides a convenient interface for handling CONLL data. CONLL data are represented in the form of individual lines with tab-separated fields. This module provides several classes which parse such lines either incrementally, one by one, or all at once, and store their information in their internal data structure. 
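For instance, a minimal usage sketch of this incremental interface (the 14 columns follow CONLLWord.key2field further below; the morphological feature names and the `|`-separated `name=value` feature format are assumptions made only for illustration):

    from dsegmenter.edseg.conll import CONLL

    fields = [u"1", u"Das", u"_", u"die", u"_", u"ART", u"_",
              u"case=nom|number=sg", u"2", u"_", u"NK", u"_", u"_", u"_"]
    forest = CONLL()
    forest.add_line(u"\t".join(fields))  # opens a new sentence and stores the word
    print(forest.get_words())            # list of (form, sentence index, word index) triples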
-Constants: -EOS - end of sentence marker -EOL - end of line marker -EOS_TAG - end tag for sentences -FIELDSEP - separator of fields for description of a single word -EMPTY_FIELD - word denoting an empty fields in the word -FEAT_SEP - separator of individual features -FEAT_VALUE_SEP - separator of feature name and its value -FEAT_VALUE_SEP_RE - regular expression corresponding to FEAT_VALUE_SEP +Attributes: + EOS (str): end of sentence marker + EOL (str): end of line marker + EOS_TAG (str): end tag for sentences + FIELDSEP (str): separator of fields for description of a single word + EMPTY_FIELD (str): word denoting an empty fields in the word + FEAT_SEP (str): separator of individual features + FEAT_VALUE_SEP (str): separator of feature name and its value + FEAT_VALUE_SEP_RE (re): regular expression corresponding to FEAT_VALUE_SEP Classes: -CONLL() - class for handling CONLL forrests -CONLLSentence() - class storing information pertaining to a single CONLL sentence -CONLLWord() - class storing information about a single CONLL word + CONLL: class for handling CONLL forrests + CONLLSentence: class storing information pertaining to a single CONLL + sentence + CONLLWord: class storing information about a single CONLL word + +.. moduleauthor:: Wladimir Sidorenko (Uladzimir Sidarenka) """ @@ -36,15 +38,15 @@ ################################################################## # Interface __name__ = "conll" -__all__ = ["EOS", "EOL", "EOS_TAG", "FIELDSEP", "EMPTY_FIELD", \ - "FEAT_SEP", "FEAT_VALUE_SEP", "FEAT_VALUE_SEP_RE", \ - "CONLL", "CONLLSentence", "CONLLWord"] +__all__ = ["EOS", "EOL", "EOS_TAG", "FIELDSEP", "EMPTY_FIELD", + "FEAT_SEP", "FEAT_VALUE_SEP", "FEAT_VALUE_SEP_RE", + "CONLL", "CONLLSentence", "CONLLWord"] ################################################################## # Constants -EOS = '\n' -EOL = '\n' -EOS_TAG = "" +EOS = '\n' +EOL = '\n' +EOS_TAG = "" FIELDSEP = '\t' EMPTY_FIELD = '_' @@ -52,6 +54,7 @@ FEAT_VALUE_SEP = '=' FEAT_VALUE_SEP_RE = re.compile(FEAT_VALUE_SEP) + ################################################################## # Classes class CONLL(object): @@ -61,40 +64,30 @@ class CONLL(object): An instance of this class comprises information about one or multiple parsed sentences in CONLL format. - This class provides following instance variables: - self.sentences - list of all sentences gathered in tree forest - self.s_id - list index of last parsed sentence - - This class provides following public methods: - __init__() - class constructor (can accept) - self.add_line() - parse specified single line and incrementally add - it to the data of current tree or append a new tree to the - forrest - self.is_empty() - return true if no sentences are stored - self.clear() - drop all stored information - self.get_words() - return list of words with their sentence and word indices - __str__() - return string representation of current forrest - __getitem__() - return sentence from forrest - __setitem__() - set sentence in forrest """ - def __init__(self, istring = ''): + def __init__(self, istring=''): """Initialize instance variables and parse input string if specified. - @param istring - input string(s) with CONLL data (optional) + Args: + istring (basestring): input string(s) with CONLL data (optional) """ + self.s_id = -1 self.sentences = [] - self.s_id = -1 self.__eos_seen__ = True for iline in istring.splitlines(): self.add_line(iline) - def add_line(self, iline = u''): - """Parse line and add it as CONLL word to either current or new - sentence. 
+ def add_line(self, iline=u''): + """Parse line and add it as CONLL word. + + Args: + iline (basestring): input line(s) to parse + + Returns: + void: - @param iline - input line(s) to parse """ iline = iline.strip() if not iline or iline == EOS or iline == EOS_TAG: @@ -113,16 +106,18 @@ def add_line(self, iline = u''): # word is less than the index of the last word, that means that a # new sentence has started. w = CONLLWord(iline) - if self.s_id == -1 or int(w.idx) < int(self.sentences[self.s_id].words[-1].idx): + if self.s_id == -1 or \ + int(w.idx) < int(self.sentences[self.s_id].words[-1].idx): self._add_sentence(w) else: self.sentences[self.s_id].push_word(w) def is_empty(self): - """ - Check whether any sentences are stored. + """Check whether any sentences are stored. + + Returns: + bool: True if there is at least one sentence. - @return True if there is at least one sentence. """ return self.s_id == -1 @@ -131,7 +126,7 @@ def clear(self): Remove all stored information. """ del self.sentences[:] - self.s_id = -1 + self.s_id = -1 self.__eos_seen__ = False def get_words(self): @@ -146,8 +141,8 @@ def get_words(self): """ retlist = [] for s_id in xrange(self.s_id + 1): - retlist += [(w.form, s_id, w_id) for w, w_id in \ - self.sentences[s_id].get_words()] + retlist += [(w.form, s_id, w_id) for w, w_id in + self.sentences[s_id].get_words()] return retlist def __unicode__(self): @@ -160,26 +155,32 @@ def __str__(self): return self.__unicode__().encode("utf-8") def __getitem__(self, i): - """ - Return reference to `i`-th sentence in forrest. + """Return reference to `i`-th sentence in forrest. - @param i - integer index of sentence in forrest + Args: + i (int): integer index of sentence in forrest - @return `i`-th CONLL sentence in forrest. IndexError is raised if `i` - is outside of forrest boundaries. + Returns: + CONLLSentence: `i`-th CONLL sentence in forrest. + + Raises: + IndexError: is raised if `i` is outside of forrest boundaries. """ return self.sentences[i] def __setitem__(self, i, value): - """ - Set `i`-th sentence in forrest to specified value. + """Set `i`-th sentence in forrest to specified value. - @param i - integer index of sentence in forrest - @param value - CONLL sentence to which i-th sentence should be set + Args: + i (int): integer index of sentence in forrest + value (CONLLSentence): to which i-th sentence should be set - @return new value of `i`-th sentence. IndexError is raised if `i` - is outside of forrest boundaries. + Returns: + CONLLSentence:new value of `i`-th sentence + + Raises: + IndexError: raised if `i` is outside of forrest boundaries. """ self.sentences[i] = value @@ -224,9 +225,9 @@ class CONLLSentence(object): """ - def __init__(self, iword = ""): + def __init__(self, iword=""): """Initialize instance variables and parse iline if specified.""" - self.w_id = -1 + self.w_id = -1 self.words = [] self.children = defaultdict(list) if iword: @@ -234,13 +235,13 @@ def __init__(self, iword = ""): def clear(self): """Remove all words and reset counters.""" - self.w_id = -1 + self.w_id = -1 self.children.clear() del self.words[:] def is_empty(self): """Check if any words are present in sentence.""" - return self.w_id == -1 + return self.w_id == -1 def push_word(self, iword): """Parse iline storing its information in instance variables.""" @@ -306,6 +307,7 @@ def __len__(self): """Return the number of words in sentence.""" return len(self.words) + class CONLLWord(object): """Class for storing and manipulating information about a single word. 
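A short sketch of the dictionary-like access this class offers; the `__contains__` method added just below makes membership tests on field names work (the feature names are again illustrative):

    from dsegmenter.edseg.conll import CONLLWord

    word = CONLLWord(u"\t".join([u"2", u"Haus", u"_", u"Haus", u"_", u"NN", u"_",
                                 u"case=nom|number=sg", u"0", u"_", u"ROOT",
                                 u"_", u"_", u"_"]))
    print(word["form"])            # u'Haus'
    print(word.pos)                # u'NN' -- attribute access resolved via key2field
    print("feat" in word)          # True  -- checks the field *name*, not a value
    print(word.get("pred", u"_"))  # u'_'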
@@ -332,17 +334,18 @@ class CONLLWord(object): __getitem__() - this method allows access to CONLLWord field using the standard dictionary like syntax, e.g. iword["token] __setitem__() - this method allows to set values of CONLLWord fields by - using the dictionary like syntax, e.g. iword["token] = "sky" + using the dictionary like syntax, + e.g., iword["token] = "sky" __str__() - return string representation of current forrest """ - key2field = {'idx': 0, 'form': 1, 'pform': 2, 'lemma': 3, 'plemma': 4, 'pos': 5, \ - 'ppos': 6, 'feat': 7, 'head': 8, 'phead': 9, 'deprel': 10, \ - 'pdeprel': 11, 'fillpred': 12, 'pred': 13} + key2field = {'idx': 0, 'form': 1, 'pform': 2, 'lemma': 3, 'plemma': 4, + 'pos': 5, 'ppos': 6, 'feat': 7, 'head': 8, 'phead': 9, + 'deprel': 10, 'pdeprel': 11, 'fillpred': 12, 'pred': 13} REQFIELDS = len(key2field) - def __init__(self, iline = None): + def __init__(self, iline=None): """Initialize instance variables and parse iline if specified.""" self.fields = [] self.features = {} @@ -355,26 +358,44 @@ def parse_line(self, iline): nfields = len(self.fields) # check that proper number of fields is provided if nfields != self.REQFIELDS: - raise Exception( \ - "Incorrect line format ({:d} fields expected instead of {:d}):\n'{:s}'".format( \ + raise Exception( + "Incorrect line format ({:d} fields" + " expected instead of {:d}):\n'{:s}'".format( self.REQFIELDS, nfields, iline)) # convert features and pfeatures to dicts feat_i = CONLLWord.key2field["feat"] - self.features = self.fields[feat_i] = self._str2dict(self.fields[feat_i]) + self.features = self.fields[feat_i] = \ + self._str2dict(self.fields[feat_i]) - def add_features(self, newfeatures = {}): - """Update dictionary of features with new features from `newfeatures'.""" + def add_features(self, newfeatures={}): + """Update dictionary of features with new features.""" self.features.update(newfeatures) - def get(self, ikey, idefault = None): - """Return value of ikey field or idefault if the field is not present.""" + def get(self, ikey, idefault=None): + """Return value of ikey field or idefault if the field is missing.""" try: return self.__getattr__(ikey) except AttributeError: return idefault + def __contains__(self, name): + """Check if field is present in item. + + This method looks for the passed field name in `key2field` dict and + returns true if the name is found and false otherwise. + + Args: + name (str): name of the field to be retrieved + + Returns: + (bool): + true if the given field name is found in item + + """ + return name in self.key2field + def __getattr__(self, name): - """Return self.field's item if this item's name is present in key2field. + """Return field's item if this item's name is present in key2field. This method looks for passed name in `key2field` dict and returns corresponding item of `self.fields` or raises an AttributeException @@ -386,10 +407,10 @@ def __getattr__(self, name): if name in self.key2field: return self.fields[self.key2field[name]] else: - raise AttributeError("cannot find symbol {:s}".format(name)) + raise AttributeError("Cannot find symbol {:s}".format(name)) def __getitem__(self, name): - """Return self.field's item if this item's name is present in key2field. + """Return field's item if this item's name is present in key2field. 
This method uses the self.__getattr__() method but converts the AttributeException to IndexError in case when lookup was not @@ -430,7 +451,7 @@ def __unicode__(self): retStr += FIELDSEP retStr += feat_str if feat_i < self.REQFIELDS: - retStr += FIELDSEP + retStr += FIELDSEP # add the rest of the fields retStr += FIELDSEP.join(self.fields[feat_i + 1:]) return retStr @@ -439,7 +460,7 @@ def __str__(self): """Return string representation of this object encoded in UTF-8.""" return self.__unicode__().encode("utf-8") - def _dict2str(self, idict, new_format = True): + def _dict2str(self, idict, new_format=True): """Convert dictionary of features to a string.""" fList = [] if not idict: diff --git a/dsegmenter/edseg/edssegmenter.py b/dsegmenter/edseg/edssegmenter.py index 404732c..c342fee 100644 --- a/dsegmenter/edseg/edssegmenter.py +++ b/dsegmenter/edseg/edssegmenter.py @@ -3,27 +3,23 @@ ################################################################## # Documentation -""" -Module providing rule-based discourse segmenter `EDSSegmenter`. - -Constants: -WESWEGEN_SET - set of strings representing causal connectives -SDS_LABEL - label for sentence discourse segments -EDS_LABEL - label for elementary discourse segments -MAIN_CLAUSE - label for discourse segments that encompass main clauses -SUB_CLAUSE - label for discourse segments that encompass subordinate clauses -REL_CLAUSE - label for discourse segments that encompass restrictive relative clauses -PAREN - label for parenthetical discourse segments -DISCOURSE_PP - label for discourse segments formed by prepositional phrases - -Classes: -EDSSegmenter - rule-based discourse segmenter - -Exceptions: - -@author = Jean VanCoppenolle, Wladimir Sidorenko (Uladzimir Sidarenka) -@mail = , -@version = 0.0.1 +"""Module providing rule-based discourse segmenter `EDSSegmenter`. + +Attributes: + WESWEGEN_SET (set): set of strings representing causal connectives + SDS_LABEL (str): label for sentence discourse segments + EDS_LABEL (str): label for elementary discourse segments + MAIN_CLAUSE (str): label for discourse segments that encompass main clauses + SUB_CLAUSE (str): label for discourse segments that encompass subordinate + clauses + REL_CLAUSE (str): label for discourse segments that encompass restrictive + relative clauses + PAREN (str): label for parenthetical discourse segments + DISCOURSE_PP (str): label for discourse segments formed by prepositional + phrases + EDSSegmenter (class): rule-based discourse segmenter + +.. moduleauthor:: Jean VanCoppenolle, Wladimir Sidorenko (Uladzimir Sidarenka) """ @@ -47,24 +43,23 @@ PAREN = 'Paren' DISCOURSE_PP = 'DiPP' + ################################################################## # Classes class EDSSegmenter(object): - """ - Class for perfoming discourse segmentation on CONLL dependency trees. + """Class for perfoming discourse segmentation on CONLL dependency trees. 
- Instance variables: - _clause_segmenter - internal worker for doing discourse segmentation - _clause_discarder - internal automaton which decides if sentence shouldn't - be processed - _sent - internal reference to the sentence being processed - _tokens - internal reference to the list of processed tokens + Attributes: + _clause_segmenter: internal worker for doing discourse segmentation + _clause_discarder: internal automaton which decides if sentence + shouldn't be processed + _sent: internal reference to the sentence being processed + _tokens: internal reference to the list of processed tokens + segment: perform discourse segmentation of the CONLL sentence - Public methods: - segment - perform discourse segmentation of the CONLL sentence """ - def __init__(self, a_clause_segmenter = None): + def __init__(self, a_clause_segmenter=None): """ Class constructor. @@ -74,17 +69,20 @@ def __init__(self, a_clause_segmenter = None): self._clause_segmenter = ClauseSegmenter() else: self._clause_segmenter = a_clause_segmenter - self._clause_discarder = StartOfClauseMatcher.from_file(data.data_dir('skip_rules.txt')) + self._clause_discarder = StartOfClauseMatcher.from_file( + data.data_dir('skip_rules.txt')) self._sent = None self._tokens = [] def segment(self, sent): - """ - Method for segmenting CONLL trees. + """Segment CONLL trees. - @param sent - CONLL tree to process + Args: + sent (CONLLSentence)L CONLL tree to process + + Returns: + Segment: sentence-level discourse segment - @return sentence-level discourse segment """ self._sent = sent clauses = self._clause_segmenter.segment(sent) @@ -104,7 +102,7 @@ def _process_clause(self, clause, idx, clauses, sds, eds, depth=0): if not self._is_token(child1) and child1.label == clause.label: for idx, child in enumerate(clause): eds = self._process_clause(child, idx, clause, sds, eds, - depth = depth) + depth=depth) return eds self._tokens, prev_toks = list(clause.iter_terminals()), self._tokens if self._clause_discarder.match(self._tokens, prev_toks): diff --git a/dsegmenter/edseg/finitestateparsing.py b/dsegmenter/edseg/finitestateparsing.py index 1a04db3..732d5e9 100644 --- a/dsegmenter/edseg/finitestateparsing.py +++ b/dsegmenter/edseg/finitestateparsing.py @@ -1,15 +1,14 @@ #!/usr/bin/env python # -*- mode: python; coding: utf-8 -*- -################################################################## -# Documentation -""" -Module providing parsing routines based on finite-state mechanisms. +"""Module providing parsing routines based on finite-state mechanisms. Constants: Methods: -constraint - function decorator making function execution safe against internal failures + +constraint - function decorator making function execution safe against internal +failures Classes: Tree - auxiliary tree class used for constructing segment nodes @@ -21,7 +20,8 @@ Overflow - exception raised by SymTab class when buffer overflows @author = Jean VanCoppenolle, Wladimir Sidorenko (Uladzimir Sidarenka) -@mail = , +@mail = , + """ @@ -36,15 +36,19 @@ import sys import warnings + ################################################################## # Methods def constraint(func): - """ - Function decorator making function execution safe against internal failures. + """Decorator making function execution safe against internal failures. 
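In practice, a decorated check reads like the constraints used by the chunking and clause rules above; anything raised inside it is reported as a failed match rather than an error (the tag test here is purely illustrative):

    from dsegmenter.edseg.finitestateparsing import constraint

    @constraint
    def first_node_is_preposition(match):
        # a KeyError or IndexError raised here is swallowed by the decorator,
        # which then simply answers False (no match)
        return match[1][0]["pos"] == "APPR"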
+ + Args: + func (method): + reference to function which should be decorated - @param func - reference to function which should be decorated + Returns: + self - @return self """ @wraps(func) def decorate(match): @@ -56,6 +60,7 @@ def decorate(match): return False return decorate + ################################################################## # Classes class Tree(object): @@ -305,10 +310,12 @@ def terminals(self, n): @property def is_preterminal(self): - """ - Check if all of node's children are terminals + """Check if all of node's children are terminals + + Returns: + (bool): + true if all of node's children are terminals, false otherwise - @return \c true if all of node's children are terminals, \c false otherwise """ return not any(isinstance(child, Tree) for child in self) @@ -340,9 +347,10 @@ def iter_terminals(self): yield terminal def pretty_print(self, a_stream=sys.stdout, a_depth=0, a_indent=' ', - a_term_print=lambda term: u'{form}/{pos}'.format(form=term['form'], \ - pos=term['pos']), - a_feat_print=lambda feat: u'{0}={1}'.format(feat[0], feat[1]), + a_term_print=lambda term: u'{form}/{pos}'.format( + form=term['form'], pos=term['pos']), + a_feat_print=lambda feat: u'{0}={1}'.format( + feat[0], feat[1]), a_encoding='utf-8'): """ Output nice string representation of the current tree @@ -356,18 +364,22 @@ def pretty_print(self, a_stream=sys.stdout, a_depth=0, a_indent=' ', @return \c void """ - emit = lambda out: a_stream.write('{0}{1}'.format(a_indent * a_depth, out)) + emit = lambda out: a_stream.write('{0}{1}'.format( + a_indent * a_depth, out)) if self.feats: feat_str = ','.join(a_feat_print(item) for item in self.feats.iteritems()) - emit('({0} [{1}]\n'.format(self.label, feat_str.encode(a_encoding))) + emit('({0} [{1}]\n'.format(self.label, + feat_str.encode(a_encoding))) else: emit('({0}\n'.format(self.label)) for child in self: if hasattr(child, 'pretty_print'): - child.pretty_print(a_stream = a_stream, a_depth = a_depth + 1, - a_indent = a_indent, a_term_print = a_term_print, - a_feat_print = a_feat_print, a_encoding = a_encoding) + child.pretty_print(a_stream=a_stream, a_depth=a_depth + 1, + a_indent=a_indent, + a_term_print=a_term_print, + a_feat_print=a_feat_print, + a_encoding=a_encoding) else: emit('{0}{1}\n'.format(a_indent, a_term_print(child).encode(a_encoding))) @@ -387,7 +399,8 @@ def __str__(self): if isinstance(child, Tree): ostring += str(child) + '\n' else: - ostring += u'{form}/{pos}'.format(form=child['form'], pos=child['pos']) + ostring += u'{form}/{pos}'.format( + form=child['form'], pos=child['pos']) ostring += ")\n" return ostring @@ -412,10 +425,12 @@ class SymTab(object): """ class Overflow(Exception): - """ - Custom exception thrown when the number of mapped strings exceeds capacity + """Custom exception. 
+ + Thrown when the number of mapped strings exceeds capacity This class subclasses `Exception` + """ pass @@ -469,11 +484,12 @@ class MatchProxy(object): """ def __init__(self, match, nodes): - """ - Class constructor + """Class constructor @param match - reference to rule match - @param nodes - reference to the list of nodes to which match was applied + @param nodes - reference to the list of nodes to which match was + applied + """ self._match = match self._nodes = nodes @@ -489,9 +505,9 @@ def __getitem__(self, group): start, end = self._match.start(group), self._match.end(group) return self._nodes[start:end] + class FiniteStateParser(object): - """ - Match engine used for matching rules + """Match engine used for matching rules Class methods: from_file - create an automaton instance from file with rules @@ -508,19 +524,22 @@ class FiniteStateParser(object): _rules - internal mapping from rule levels to compiled rules Public methods: - add_rule - convert symbolic rule to a regular expression and add it to common automaton + add_rule - convert symbolic rule to a regular expression and add it to + common automaton define - add macro definition to the current set of rules parse - parse given rules instance and add it to the rule cascade + """ _RE_VAR = re.compile('(%[^%]+%)', re.I) _RE_CAT = re.compile('(?]+)>', re.I) def __init__(self, root_cat='ROOT'): - """ - Class constructor + """Class constructor + + @param root_cat - label to be used for the root node of the constructed + tree - @param root_cat - label to be used for the root node of the constructed tree """ self.root_cat = root_cat self.cats = set() @@ -578,8 +597,7 @@ def define(self, name, pattern): def add_rule(self, lhs, rhs, level=0, constraint=None, group=0, feats=None): - """ - Convert symbolic rule to a regular expression and add it to common automaton + """Convert rule to regexp and add it to common automaton @param lhs - left hand side of the rule (is used for matching) @param rhs - left hand side of the rule (is used as replacement) @@ -589,6 +607,7 @@ def add_rule(self, lhs, rhs, level=0, constraint=None, group=0, @param feats - custom function for extracting features from nodes @return self + """ if level < 0: raise ValueError('level must be a positive number') @@ -600,7 +619,7 @@ def add_rule(self, lhs, rhs, level=0, constraint=None, group=0, 'feats': feats}) return self - def parse(self, tokens, catgetter = lambda tok: tok): + def parse(self, tokens, catgetter=lambda tok: tok): """ Parse given rules instance and add it to the rule cascade @@ -610,7 +629,7 @@ def parse(self, tokens, catgetter = lambda tok: tok): @return newly constructed segment tree """ nodes = tokens - for lvl, rules in sorted(self._rules.iteritems(), key = itemgetter(0)): + for lvl, rules in sorted(self._rules.iteritems(), key=itemgetter(0)): nodes = self._parse_level(rules, nodes, catgetter) return Tree(self.root_cat, nodes) @@ -634,7 +653,8 @@ def _parse_level(self, rules, nodes, catgetter): flag = constraint(proxy) except Exception as exc: flag = False - warnings.warn('Exception in constraint: {0}'.format(lhs, exc)) + warnings.warn('Exception in constraint:' + ' {0}'.format(lhs, exc)) raise if not flag: continue @@ -664,7 +684,7 @@ def _make_tag_string(self, nodes, catgetter): def _replace_vars(self, pattern): return self._RE_VAR.sub(lambda match: self._vars[match.group(1)[1:-1]], - pattern) + pattern) def _compile_rhs(self, rhs): rhs = self._replace_vars(rhs) diff --git a/dsegmenter/edseg/util.py b/dsegmenter/edseg/util.py index 
2794c43..3ace304 100644 --- a/dsegmenter/edseg/util.py +++ b/dsegmenter/edseg/util.py @@ -21,8 +21,8 @@ AlreadyFinalized - exception raised by StartOfClauseMatcher on an attempt to add a rule after matcher has already been finalized -@author = Jean VanCoppenolle, Wladimir Sidorenko (Uladzimir Sidarenka) -@mail = , +.. moduelauthor:: Jean VanCoppenolle, Wladimir Sidorenko (Uladzimir Sidarenka) + """ ################################################################## @@ -32,17 +32,20 @@ import codecs + ################################################################## # Methods def match(tokens, *search, **options): - """ - Custom search on tokens for specified patterns + """Custom search on tokens for specified patterns + + Args: + tokens - input tokens in which pattern should be searched + search - searched pattern + options - search options - @param tokens - input tokens in which pattern should be searched - @param search - searched pattern - @param options - search options + Returns: + bool: True if pattern was found, False otherwise - @return \c True if pattern was found, \c False otherwise """ if options.get('reverse'): tokens = reversed(tokens) @@ -56,44 +59,45 @@ def match(tokens, *search, **options): return False return True + ################################################################## # Classes class Trie(object): - """ - Implementation of the trie data structure + """Implementation of the trie data structure Constants: - _SENTINEL - default object to comapre with to ensure that the matched object is valid + _SENTINEL - default object to comapre with to ensure that the matched + object is valid Instance variables: - start - index of the start state of the trie from which to begin matching - _trans - transition table for the states - _final - dictionary mapping final states to corresponding tree labels - _last_state - last active state used for matching + start - index of the start state of the trie from which to begin matching + _trans - transition table for the states + _final - dictionary mapping final states to corresponding tree labels + _last_state - last active state used for matching Public methods: - add_word - add new word to the total trie - get - perform match operation on the given string - get_state - generate new state for the trie - set_final - remember given state as final and associate a lebel with it - is_final - check if given state is final - get_olabel - return label associated with given final state - set_olabel - set new label for the given final state - add_trans - add new transitions to the given state - get_trans - obtain transitions emitted by the given state - iter_trans - iterate over transitions of the given state - as_dot - output trie in dotty format + add_word - add new word to the total trie + get - perform match operation on the given string + get_state - generate new state for the trie + set_final - remember given state as final and associate a lebel with it + is_final - check if given state is final + get_olabel - return label associated with given final state + set_olabel - set new label for the given final state + add_trans - add new transitions to the given state + get_trans - obtain transitions emitted by the given state + iter_trans - iterate over transitions of the given state + as_dot - output trie in dotty format Exceptions: - NotFinal - exception raised when match does not reach final state + NotFinal - exception raised when match does not reach final state """ class NotFinal(Exception): - """ - Exception thrown when the trie has 
no more transitions and there is no default + """Exception thrown when the trie has no more transitions. This class subclasses `Exception` + """ pass diff --git a/dsegmenter/mateseg/__init__.py b/dsegmenter/mateseg/__init__.py index 447ebe1..4b82b9e 100644 --- a/dsegmenter/mateseg/__init__.py +++ b/dsegmenter/mateseg/__init__.py @@ -6,7 +6,6 @@ """Package providing discourse segmenter for Mate dependency graphs. Attributes: - __all__ (List[str]): list of sub-modules exported by this package __author__ (str): package's author __email__ (str): email of package's author @@ -17,17 +16,20 @@ ################################################################## # Imports -from .dependency_graph import (DependencyGraph, read_deptree_file, - HEAD, DEPS, REL, TAG, ADDRESS, - TOP_TAG_LABEL, TOP_RELATION_LABEL) -from .segmentation_tree import read_segtree_file, generate_subtrees_from_forest +from __future__ import absolute_import, unicode_literals + +from dsegmenter.mateseg.dependency_graph import DependencyGraph, \ + read_trees, read_tok_trees, HEAD, ADDRESS, TOP_TAG_LABEL, \ + TOP_RELATION_LABEL +from dsegmenter.mateseg.matesegmenter import MateSegmenter, \ + trees2segs ################################################################## # Intialization __name__ = "mateseg" -__all__ = ["DependencyGraph", "HEAD", "DEPS", "REL", "TAG", "ADDRESS", - "TOP_TAG_LABEL", "TOP_RELATION_LABEL", "read_deptree_file", - "read_segtree_file", "generate_subtrees_from_forest"] +__all__ = ["DependencyGraph", "HEAD", "ADDRESS", + "TOP_TAG_LABEL", "TOP_RELATION_LABEL", "MateSegmenter", + "read_trees", "read_tok_trees", "trees2segs"] __author__ = "Andreas Peldszus" __email__ = "peldszus at uni dash potsdam dot de" -__version__ = "0.1.0" +__version__ = "0.2.0" diff --git a/dsegmenter/mateseg/data/mate.model b/dsegmenter/mateseg/data/mate.model index 840f63f..5ce24ea 100644 Binary files a/dsegmenter/mateseg/data/mate.model and b/dsegmenter/mateseg/data/mate.model differ diff --git a/dsegmenter/mateseg/dependency_graph.py b/dsegmenter/mateseg/dependency_graph.py index 95c40a6..10ef400 100644 --- a/dsegmenter/mateseg/dependency_graph.py +++ b/dsegmenter/mateseg/dependency_graph.py @@ -9,17 +9,16 @@ ################################################################## # Imports -import codecs -from nltk.parse.dependencygraph import DependencyGraph as NLTKDependencyGraph +from __future__ import absolute_import, print_function, unicode_literals + +from dsegmenter.common import DEPS, REL, TAG, WORD +from nltk.parse.dependencygraph import DependencyGraph as NLTKDependencyGraph +import sys ################################################################## # Constants HEAD = "head" -DEPS = "deps" -WORD = "word" -REL = "rel" -TAG = "tag" ADDRESS = "address" LEMMA = "lemma" CTAG = "ctag" @@ -81,10 +80,10 @@ def is_valid_parse_tree(self): for the moment just check for a unique root''' root = self.get_dependencies_simple(0) if len(root) < 1: - print "Warning: No root address" + print("Warning: No root address", file=sys.stderr) return False if len(root) > 1: - print "Warning: More than one root address" + print("Warning: More than one root address", file=sys.stderr) return False return True @@ -112,54 +111,136 @@ def deannotate(self, field_name): ################################################################## # Methods -def transform_conll_data(data): - '''transforms conll data outputted by the mate parser to valid - conll 2007 format''' - out = [] - for line in data.splitlines(): - out.append(transform_line(line)) - return '\n'.join(out) 
- - def transform_line(line): - '''transforms a conll line outputted by the mate parser to a valid - conll 2007 format line''' - if line.strip() == '': - return '' - else: - f = line.split('\t') - # escape parenthesis - token = f[1] - if token == '(': - token = '-OP-' - elif token == ')': - token = '-CP-' - # The nltk v3 implementation of dependency graphs needs an explicit - # root relation label. Mate's output uses '--' as a label for relations - # to the root, but also for punctuations. We thus translate the - # relation label to 'ROOT'. - if f[9] == '0': - f[11] = TOP_RELATION_LABEL - return '\t'.join([f[0], token, f[3], f[5], f[5], f[7], f[9], f[11], - '_', '_']) - - -def number_tokens_of_dependency_graphs(list_of_dependency_graphs): - '''prefixes all tokens in a list of dependency graphs with a running number - starting from 0''' - deptree_leaf_counter = 0 - for depgraph in list_of_dependency_graphs: - for node in depgraph.subgraphs(exclude_root=True): - node[WORD] = (deptree_leaf_counter, node[WORD]) - deptree_leaf_counter += 1 - return list_of_dependency_graphs - - -def read_deptree_file(fn): - '''reads mate parser output and returns a list of dependency graphs of the - parsed sentences''' - with codecs.open(fn, 'r', 'utf-8') as f: - s = transform_conll_data(f.read()) - l = [DependencyGraph(sentence, top_relation_label=TOP_RELATION_LABEL) - for sentence in s.split('\n\n') if sentence] - return number_tokens_of_dependency_graphs(l) + """Transform a mate line to a valid conll 2007 format. + + Args: + a_line (str): input line to transform + + Returns: + str: transformed line + + """ + f = line.split('\t') + # escape parenthesis + token = f[1] + if token == '(': + token = '-OP-' + elif token == ')': + token = '-CP-' + # The nltk v3 implementation of dependency graphs needs an explicit + # root relation label. Mate's output uses '--' as a label for relations + # to the root, but also for punctuations. We thus translate the + # relation label to 'ROOT'. + if f[9] == '0': + f[11] = TOP_RELATION_LABEL + return '\t'.join([f[0], token, f[3], f[5], f[5], f[7], f[9], f[11], + '_', '_']) + + +def number_tokens(dgraph): + """Prefix all tokens in dependency graphs with their running number. + + Args: + dgraph (nltk.parse.dependencygraph.DependencyGraph): + list of dependency trees + + Returns: + nltk.parse.dependencygraph.DependencyGraph: + dependency trees with numbered tokens + + """ + cnt = 0 + for node in dgraph.subgraphs(exclude_root=True): + node[WORD] = (cnt, node[WORD]) + cnt += 1 + return dgraph + + +def tree2tok(a_tree, a_tree_idx, a_root_idx, a_tk_start=0): + """Create dictionary mapping dependency trees to numbered tokens. + + Args: + a_tree (DependencyGraph): tree to analyze + a_tree_idx (int): tree index in the document + a_root_idx (int): index of the root node + a_tk_start (int): starting position of the first token + + Returns: + (dict) mapping from subtrees to their yields + + """ + # set of terminals corresponding to the given node + iroot = a_tree.nodes[a_root_idx] + tkset = set() + if iroot[WORD] is not None: + tkset.add((a_tk_start + iroot[WORD][0], iroot[WORD][1])) + tr2tk = {(a_tree_idx, a_root_idx): (a_tree, tkset)} + for ch_idcs in iroot[DEPS].itervalues(): + for ch_idx in ch_idcs: + t2t = tree2tok(a_tree, a_tree_idx, ch_idx, a_tk_start) + tr2tk.update(t2t) + tkset.update(t2t[(a_tree_idx, ch_idx)][-1]) + return tr2tk + + +def read_tok_trees(a_lines): + """Read file and return a mapping from tokens to trees and a list of trees. 
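A minimal usage sketch for the reading helpers of this module (`read_trees` is defined further below; the file name is hypothetical, and any iterable of decoded Mate output lines will do):

    import codecs
    from dsegmenter.mateseg import read_trees, read_tok_trees

    with codecs.open("sample.parsed.conll", "r", "utf-8") as fh:  # hypothetical file
        lines = fh.readlines()

    graphs = list(read_trees(lines))           # one numbered DependencyGraph per sentence
    toks2trees, trees = read_tok_trees(lines)  # token-set -> subtree mapping plus the trees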
+ + Args: + a_lines (list[str]): decoded lines of the input file + + Returns: + 2-tuple: list of dictionaries mapping tokens to trees and a list of trees + + """ + toks = [] + tok_c = 0 + t2t = None + trees = [t for t in read_trees(a_lines)] + trees2toks = dict() + + for i, itree in enumerate(trees): + t2t = tree2tok(itree, i, 0, tok_c) + trees2toks.update(t2t) + # increment token counter by the number of tokens in the sentence + tok_c += len(t2t[(i, 0)][-1]) + + toks2trees = dict() + for ((tree_c, tree_pos), (tree, toks)) in trees2toks.iteritems(): + # skip the abstract root node + if tree_pos == 0: + continue + toks = frozenset(toks) + if toks in toks2trees: + toks2trees[toks].append((tree, tree_pos)) + else: + toks2trees[toks] = [(tree, tree_pos)] + return (toks2trees, trees) + + +def read_trees(a_lines): + """Read file and yield DependencyGraphs. + + Args: + a_lines (list[str]): iterable over decoded lines of the input file + + Yields: + nltk.parse.dependencygraph.DependencyGraph: + + """ + toks = [] + for iline in a_lines: + iline = iline.strip() + if not iline: + if toks: + yield number_tokens( + DependencyGraph('\n'.join(toks), + top_relation_label=TOP_RELATION_LABEL)) + del toks[:] + else: + toks.append(transform_line(iline)) + if toks: + yield number_tokens( + DependencyGraph('\n'.join(toks), + top_relation_label=TOP_RELATION_LABEL)) diff --git a/dsegmenter/mateseg/matesegmenter.py b/dsegmenter/mateseg/matesegmenter.py index e14b245..8b2aaac 100644 --- a/dsegmenter/mateseg/matesegmenter.py +++ b/dsegmenter/mateseg/matesegmenter.py @@ -1,32 +1,36 @@ #!/usr/bin/env python2.7 # -*- mode: python; coding: utf-8; -*- -''' -Created on 03.01.2015 +"""Created on 03.01.2015 @author: Andreas Peldszus -''' + +""" ################################################################## # Imports -from .dependency_graph import HEAD, WORD, REL, TAG, ADDRESS -from .segmentation_tree import generate_subtrees_from_forest -from ..treeseg import (TreeSegmenter, DiscourseSegment, DEPENDENCY, - DEFAULT_SEGMENT) -from ..treeseg.treesegmenter import NO_MATCH_STRING -from ..treeseg.constants import GREEDY -from ..bparseg.align import nw_align +from __future__ import absolute_import, print_function, unicode_literals + +from dsegmenter.common import NONE, prune_punc, score_substitute, \ + translate_toks + +from dsegmenter.mateseg.dependency_graph import HEAD, WORD, REL, TAG, ADDRESS +from dsegmenter.mateseg.segmentation_tree import generate_subtrees_from_forest +from dsegmenter.treeseg import (TreeSegmenter, DiscourseSegment, DEPENDENCY, + DEFAULT_SEGMENT) +from dsegmenter.treeseg.treesegmenter import NO_MATCH_STRING +from dsegmenter.treeseg.constants import GREEDY +from dsegmenter.bparseg.align import nw_align from sklearn.pipeline import Pipeline from sklearn.feature_extraction import DictVectorizer from sklearn.feature_selection import VarianceThreshold -from sklearn.cross_validation import KFold from sklearn.svm import LinearSVC from sklearn.metrics import precision_recall_fscore_support from sklearn.externals import joblib -import numpy as np import os +import sys ################################################################## # Variables and Constants @@ -39,8 +43,72 @@ ################################################################## # Methods +def trees2segs(a_toks2trees, a_toks2segs): + """Align trees with corresponding segments. 
+ + Args: + a_toks2trees (dict): mapping from tokens to trees + a_toks2segs (dict): mapping from tokens to segments + + Returns: + dict: mapping from trees to segments + + """ + # prune empty trees and their corresponding segments + tree2seg = {t: None + for val in a_toks2trees.itervalues() + for t in val} + # add additional keys to `a_toks2trees` by pruning punctuation marks from + # existing trees + pruned_toks = None + tree_tok_keys = a_toks2trees.keys() + for tree_toks in tree_tok_keys: + pruned_toks = prune_punc(tree_toks) + if pruned_toks not in a_toks2trees: + a_toks2trees[pruned_toks] = a_toks2trees[tree_toks] + # establish a mapping between tree tokens and segment tokens + tree_toks = list(set([t + for t_set in a_toks2trees.iterkeys() + for t in t_set])) + tree_toks.sort(key=lambda el: el[0]) + seg_toks = list(set([t + for t_set in a_toks2segs.iterkeys() + for t in t_set])) + seg_toks.sort(key=lambda el: el[0]) + # align tokens if necessary + seg_t2tree_t = None + if tree_toks != seg_toks: + seg_t2tree_t = dict() + alignment = nw_align(seg_toks, tree_toks, + substitute=score_substitute, + keep_deleted=True) + for i, tt in enumerate(alignment): + seg_t2tree_t[seg_toks[i]] = [tree_toks[j] for j in tt] + # for each segment look if its corresponding token set is matched by + # any other subtree + translated_toks = None + for toks, segs in a_toks2segs.iteritems(): + translated_toks = translate_toks(toks, seg_t2tree_t) + key = None + if translated_toks in a_toks2trees: + key = translated_toks + else: + translated_toks = prune_punc(translated_toks) + if translated_toks in a_toks2trees: + key = translated_toks + if key: + for tree in a_toks2trees[key]: + # if tree2seg[tree] is not None: + # continue + assert tree2seg[tree] is None, \ + "Multiple segments found for tree" + repr(tree) + ": " + \ + repr(segs[-1]) + "; " + repr(tree2seg[tree]) + tree2seg[tree] = segs[-1] + return tree2seg + + def gen_features_for_segment(dep_graph, trg_adr): - ''' ugly feature extraction code ;) ''' + """ ugly feature extraction code ;) """ nodes = list(dep_graph.subgraphs(exclude_root=False)) nl = {node[ADDRESS]: node for node in nodes} @@ -127,7 +195,7 @@ def word_access(x): def substitution_costs(c1, c2): - '''defines the costs of substitutions for the alignment''' + """defines the costs of substitutions for the alignment""" if c1[-1] == c2[-1]: return 2 else: @@ -135,7 +203,7 @@ def substitution_costs(c1, c2): def chained(iterable): - '''flattens a single embed iterable''' + """flattens a single embed iterable""" return list(elm for sublist in iterable for elm in sublist) @@ -180,7 +248,8 @@ def get_training_observations(seg_trees, dep_trees): for seg_sub_tree in generate_subtrees_from_forest(seg_trees): node = seg_sub_tree.label() if node is None or node == "": - print "Warning: Empty node.", sentence_index + print("Warning: Empty node.", sentence_index, + file=sys.stderr) if unequal_tokenizations: seg_leaves = set([seg_to_dep_tok[leaf] for leaf in seg_sub_tree.leaves()]) @@ -197,10 +266,12 @@ def get_training_observations(seg_trees, dep_trees): def _cnt_stat(a_gold_segs, a_pred_segs): """Estimate the number of true pos, false pos, and false neg. 
- @param a_gold_segs - gold segments - @param a_pred_segs - predicted segments + Args: + a_gold_segs (iterable): gold segments + a_pred_segs (iterable): predicted segments - @return 3-tuple with true positives, false positives, and false negatives + Returns: + tuple: true positives, false positives, and false negatives """ tp = fp = fn = 0 @@ -218,11 +289,12 @@ def _cnt_stat(a_gold_segs, a_pred_segs): def decision_function(node, tree): - '''decision function for the tree segmenter''' + """decision function for the tree segmenter""" assert PREDICTION in node, "No prediction for node {}".format(node) pred = node[PREDICTION] # pred = NO_MATCH_STRING - if pred == NO_MATCH_STRING and HEAD in node and node[HEAD] == 0: + if (pred == NO_MATCH_STRING or pred == NONE) and HEAD in node \ + and node[HEAD] == 0: # The classifier did not recognize sentence top as a segment, so we # enforce a labelling with the default segment type. return DEFAULT_SEGMENT @@ -230,18 +302,22 @@ def decision_function(node, tree): return pred +################################################################## +# Class class MateSegmenter(object): """Class for perfoming discourse segmentation on constituency trees. """ #: classifier object: default classification method - DEFAULT_CLASSIFIER = LinearSVC(multi_class='ovr', class_weight='auto') + DEFAULT_CLASSIFIER = LinearSVC(multi_class="ovr", + class_weight="balanced") - #:str: path to default model to use in classification - DEFAULT_MODEL = os.path.join(os.path.dirname(__file__), "data", "mate.model") + #: path to default model to use in classification + DEFAULT_MODEL = os.path.join(os.path.dirname(__file__), + "data", "mate.model") - #:pipeline object: default pipeline object used for classification + #: default pipeline object used for classification DEFAULT_PIPELINE = Pipeline([ ('vectorizer', DictVectorizer()), ('var_filter', VarianceThreshold()), @@ -249,23 +325,24 @@ class MateSegmenter(object): def __init__(self, featgen=gen_features_for_segment, model=DEFAULT_MODEL): """Class constructor. + """ self.featgen = featgen - self.pipeline = None + self.model = None self._update_model(model) - - def extract_features_from_corpus(self, dep_corpus, seg_corpus=None): - all_features = [] - all_labels = [] - for text in sorted(dep_corpus.keys()): - seg_forest = seg_corpus.get(text, None) - features, labels = self.extract_features_from_text( - dep_corpus[text], seg_forest=seg_forest) - all_features.extend(features) - all_labels.extend(labels) - return all_features, all_labels + self._segmenter = TreeSegmenter(a_type=DEPENDENCY) def extract_features_from_text(self, dep_forest, seg_forest=None): + """Extract features from dependency trees. + + Args: + dep_forrest (list): list of sentence trees to be parsed + dep_forrest (list or None): list of discourse segments + + Returns: + 2-tuple[list, list]: list of features and list of labels + + """ features = [] labels = [] observations = get_observations(seg_forest, dep_forest) @@ -274,15 +351,37 @@ def extract_features_from_text(self, dep_forest, seg_forest=None): labels.append(class_) return features, labels - def segment(self, dep_corpus, out_folder): - for text, trees in dep_corpus.iteritems(): - print text - discourse_tree = self.segment_text(trees) - with open(out_folder + '/' + text + '.tree', 'w') as fout: - fout.write(str(discourse_tree)) + def segment(self, a_trees): + """Create discourse segments based on the Mate trees. 
+ + Args: + a_trees (list): list of sentence trees to be parsed + + Returns: + list: constructed segment trees + + """ + segments = [] + features = predictions = None + for itree in a_trees: + features, _ = self.extract_features_from_text([itree]) + predictions = self._predict(features) + segments.append(self._segment_sentence( + predictions, itree)[0][1]) + return (segments,) def segment_text(self, dep_forest): - features, _ = self.extract_features_from_text(dep_forest) + """Segment all sentences of a text. + + Args: + dep_forrest (list[dsegmenter.mateseg.dependency_graph]): list + of sentence trees to be parsed + + Returns: + list: constructed segment trees + + """ + features = self.extract_features_from_text(dep_forest) predictions = self._predict(features) return self._segment_text(predictions, dep_forest) @@ -297,7 +396,7 @@ def _segment_text(self, predictions, parses): segments = self._segment_sentence(sentence_predictions, dep_graph) segment = segments[0][1] all_segments.append((sentence, segment)) - return DiscourseSegment(a_name='TEXT', a_leaves=all_segments) + return DiscourseSegment(a_name=DEFAULT_SEGMENT, a_leaves=all_segments) def _segment_sentence(self, sentence_predictions, dep_graph): if dep_graph.is_valid_parse_tree(): @@ -306,11 +405,13 @@ def _segment_sentence(self, sentence_predictions, dep_graph): # annotate dep_graph with sentence predictions dep_graph.annotate(sentence_predictions, PREDICTION) # call tree_segmenter - segmenter = TreeSegmenter(a_type=DEPENDENCY) - segments = segmenter.segment( + segments = self._segmenter.segment( dep_graph, a_predict=decision_function, a_word_access=word_access, a_strategy=GREEDY, a_root_idx=dep_graph.root[ADDRESS]) + if len(segments) > 1: + segments = [(0, DiscourseSegment(a_name=DEFAULT_SEGMENT, + a_leaves=segments))] else: # make a simple sentence segment for invalid parse trees leaves = [(i, word) for i, (_, word) in @@ -319,27 +420,48 @@ def _segment_sentence(self, sentence_predictions, dep_graph): segments = [(0, dseg)] return segments - def train(self, seg_corpus, dep_corpus, path=None): - assert seg_corpus.keys() == dep_corpus.keys() - features, labels = self.extract_features_from_corpus( - dep_corpus, seg_corpus=seg_corpus) - self._train(features, labels) + def train(self, trees, segments, path=None): + """Train segmenter model. + + Args: + a_trees (list): BitPar trees + a_segs (list): discourse segments + a_path (str): path to file in which the trained model should be + stored + + Returns: + void: + + """ + features = [self.featgen(t, n) for t, n in trees] + segments = [str(s) for s in segments] + self._train(features, segments) if path is not None: - joblib.dump(self.pipeline, path, compress=1, cache_size=1e9) + joblib.dump(self.model, path, compress=1, cache_size=1e9) def _train(self, features, labels): - self.pipeline = MateSegmenter.DEFAULT_PIPELINE - self.pipeline.fit(features, labels) + self.model = MateSegmenter.DEFAULT_PIPELINE + self.model.fit(features, labels) + + def test(self, trees, segments): + """Estimate performance of segmenter model. 
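Putting the methods above together, a minimal end-to-end sketch with the bundled default model (the input file name is hypothetical):

    import codecs
    from dsegmenter.mateseg import MateSegmenter, read_trees

    with codecs.open("sample.parsed.conll", "r", "utf-8") as fh:  # hypothetical file
        trees = list(read_trees(fh))

    segmenter = MateSegmenter()            # loads data/mate.model via joblib by default
    segments, = segmenter.segment(trees)   # segment() returns a one-element tuple
    for seg in segments:
        print(seg)                         # one bracketed segment tree per sentence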
+ + Args: + a_trees (list): BitPar trees + a_segments (list): corresponding gold segments for trees - def test(self, seg_corpus, dep_corpus): - assert seg_corpus.keys() == dep_corpus.keys() - features, labels = self.extract_features_from_corpus( - dep_corpus, seg_corpus=seg_corpus) - predicted_labels = self._predict(features) - return self._score(labels, predicted_labels) + Returns: + 2-tuple: macro and micro-averaged F-scores + + """ + predictions = [self.model.predict(self.featgen(t, n)) + for t, n in trees] + segments = [str(s) for s in segments] + return self._score(segments, predictions) def _predict(self, features): - return self.pipeline.predict(features) + return [None if p == NONE else p + for p in self.model.predict(features)] def _score(self, labels, predicted_labels): _, _, macro_f1, _ = precision_recall_fscore_support( @@ -348,74 +470,13 @@ def _score(self, labels, predicted_labels): labels, predicted_labels, average='micro', warn_for=()) return macro_f1, micro_f1 - def cross_validate(self, seg_corpus, dep_corpus, out_folder=None): - assert seg_corpus.keys() == dep_corpus.keys() - texts = np.array(sorted(seg_corpus.keys())) - folds = KFold(len(texts), number_of_folds) - - # extract features for all texts - all_features = {} - all_labels = {} - for text in texts: - features, labels = self.extract_features_from_text( - dep_corpus[text], seg_forest=seg_corpus[text]) - all_features[text] = features - all_labels[text] = labels - - # do the cross-validation - macro_F1s = [] - micro_F1s = [] - tp = fp = fn = tp_i = fp_i = fn_i = 0 - for i, (train, test) in enumerate(folds): - print "# FOLD", i - # train - train_texts = texts[train] - train_features = chained([all_features[text] for text in - train_texts]) - train_labels = chained([all_labels[text] for text in train_texts]) - print " training on %d items..." % len(train_labels) - self._train(train_features, train_labels) - print " extracted %d features using the dict vectorizer." % \ - len(self.pipeline.named_steps[ - 'vectorizer'].get_feature_names()) - # test (predicting textwise) - test_labels = [] - pred_labels = [] - for text in texts[test]: - features = all_features[text] - labels = all_labels[text] - predictions = self._predict(features) - test_labels.extend(labels) - pred_labels.extend(predictions) - if out_folder is not None: - discourse_tree = self._segment_text(predictions, - dep_corpus[text]) - with open(out_folder + '/' + text + '.tree', 'w') as fout: - fout.write(str(discourse_tree)) - macro_f1, micro_f1 = self._score(test_labels, pred_labels) - macro_F1s.append(macro_f1) - micro_F1s.append(micro_f1) - tp_i, fp_i, fn_i = _cnt_stat(test_labels, pred_labels) - tp += tp_i - fp += fp_i - fn += fn_i - - print "# Average Macro F1 = %3.1f +- %3.2f" % \ - (100 * np.mean(macro_F1s), 100 * np.std(macro_F1s)) - print "# Average Micro F1 = %3.1f +- %3.2f" % \ - (100 * np.mean(micro_F1s), 100 * np.std(micro_F1s)) - if tp or fp or fn: - print "# F1_{tp,fp} %.2f" % (2. * tp / (2. * tp + fp + fn) * 100) - else: - print "# F1_{tp,fp} 0. 
%" - def _update_model(self, model): if model is None: - self.pipeline = MateSegmenter.DEFAULT_PIPELINE - elif isinstance(model, str): + self.model = MateSegmenter.DEFAULT_PIPELINE + elif isinstance(model, basestring): if not os.path.isfile(model) or not os.access(model, os.R_OK): raise RuntimeError("Can't load model from file {:s}".format( model)) - self.pipeline = joblib.load(model) + self.model = joblib.load(model) else: - self.pipeline = model + self.model = model diff --git a/dsegmenter/mateseg/segmentation_tree.py b/dsegmenter/mateseg/segmentation_tree.py index efdcc52..88e946f 100644 --- a/dsegmenter/mateseg/segmentation_tree.py +++ b/dsegmenter/mateseg/segmentation_tree.py @@ -1,11 +1,11 @@ #!/usr/bin/env python2.7 # -*- mode: python; coding: utf-8; -*- -''' -Created on 03.01.2015 +"""Created on 03.01.2015 @author: Andreas Peldszus -''' + +""" ################################################################## # Imports @@ -22,7 +22,7 @@ ################################################################## # Methods def prefix_number_seg_token(s): - '''adds an number prefix to a token string''' + """adds an number prefix to a token string""" global segtree_leaf_counter r = (segtree_leaf_counter, s) segtree_leaf_counter += 1 @@ -30,17 +30,15 @@ def prefix_number_seg_token(s): def generate_subtrees_from_forest(forest): - '''yields all subtress of a forest of trees''' + """yields all subtress of a forest of trees""" for tree in forest: for subtree in tree.subtrees(): yield subtree def read_segtree_file(fn): - '''reads a string representing a discourse tree (from the seg. - annotation) and returns a list of its child tree objects''' - global segtree_leaf_counter - segtree_leaf_counter = 0 + """reads a string representing a discourse tree (from the seg. + annotation) and returns a list of its child tree objects""" with codecs.open(fn, 'r', 'utf-8') as f: s = f.read() text_tree = Tree.fromstring(s, read_leaf=prefix_number_seg_token) diff --git a/dsegmenter/treeseg/__init__.py b/dsegmenter/treeseg/__init__.py index 5cc3613..7a9c761 100644 --- a/dsegmenter/treeseg/__init__.py +++ b/dsegmenter/treeseg/__init__.py @@ -16,15 +16,17 @@ ################################################################## # Imports -from .constants import DEFAULT_SEGMENT, ENCODING, GREEDY, GENEROUS, \ - DEPENDENCY, CONSTITUENCY -from .discourse_segment import DiscourseSegment -from .treesegmenter import TreeSegmenter +from __future__ import absolute_import, unicode_literals + +from dsegmenter.treeseg.constants import DEFAULT_SEGMENT, ENCODING, \ + GREEDY, GENEROUS, DEPENDENCY, CONSTITUENCY +from dsegmenter.treeseg.discourse_segment import DiscourseSegment +from dsegmenter.treeseg.treesegmenter import TreeSegmenter ################################################################## # Intialization __name__ = "treeseg" -__all__ = ["DEFAULT_SEGMENT", "ENCODING", "GREEDY", "GENEROUS", "DEPENDENCY", \ +__all__ = ["DEFAULT_SEGMENT", "ENCODING", "GREEDY", "GENEROUS", "DEPENDENCY", "CONSTITUENCY", "DiscourseSegment", "TreeSegmenter"] __author__ = "Uladzimir Sidarenka" __email__ = "sidarenk at uni dash potsdam dot de" diff --git a/dsegmenter/treeseg/discourse_segment.py b/dsegmenter/treeseg/discourse_segment.py index e44926e..03d4d10 100644 --- a/dsegmenter/treeseg/discourse_segment.py +++ b/dsegmenter/treeseg/discourse_segment.py @@ -3,52 +3,53 @@ ################################################################## # Documentation -""" -Module providing Discourse Segment class. +"""Module providing Discourse Segment class. 
Class: DiscourseSegment - class representing discourse segment -@author = Wladimir Sdorenko (Uladzimir Sidarenka) -@mail = -@version = 0.0.1 +.. moduleauthor:: Wladimir Sdorenko (Uladzimir Sidarenka) """ ################################################################## # Imports -from .constants import ENCODING +from __future__ import absolute_import +from dsegmenter.treeseg.constants import ENCODING from bisect import bisect_right + ################################################################## # Class class DiscourseSegment(object): - """ - Class representing discourse segment. + """Class representing discourse segment. - Instance Variables: - name - name of this segment - leaves - list of words or other segments inside of this unit + Attributes: + name (str): name of this segment + leaves (list): words or other segments inside of this unit - Methods: - get_end - obtain position of last token in the list of leaves """ - def __init__(self, a_name = "", a_leaves = []): - """ - Class constructor. + def __init__(self, a_name="", a_leaves=[]): + """Class constructor. + + Args: + a_name (str): name of discourse segment + a_leaves (list): segment's child nodes (either words or other + segments) - @param a_name - name of discourse segment - @param a_leaves - list of segment's child nodes (either words or other segments) """ self.name = a_name self.leaves = a_leaves - self.leaves.sort(key = lambda el: el[0] if el else -1) + self.leaves.sort(key=lambda el: el[0] if el else -1) def get_end(self): - """ - Obtain position of last token in the list of leaves. + """Obtain position of the last token in the list of leaves. + + Returns: + int: position of the last token in the list of leaves + """ if not self.leaves: return -1 @@ -60,12 +61,14 @@ def get_end(self): return last_leaf[0] def insort(self, a_leaf): - """ - Insert leaf in the list of leaves according to its position. + """Insert leaf in the list of leaves according to its position. + + Args: + a_leaf (dict): leaf to be inserted - @param a_leaf - leaf to be inserted + Returns: + void: - @return void """ ipos = bisect_right(self.leaves, a_leaf) inserted = False @@ -80,26 +83,29 @@ def insort(self, a_leaf): self.leaves.insert(ipos, a_leaf) def __len__(self): - """ - Return number of elements in given segment. + """Return number of elements in given segment. + + Returns: + int:number of elements in segment - @return number of elements in segment """ return len(self.leaves) def __nonzero__(self): - """ - Return True if the given segment is not empty. + """Return True if the given segment is not empty. + + Returns: + bool: True if segment is not empty - @return True if segment is not empty """ return bool(self.leaves) def __unicode__(self): - """ - Return unicode representation of given segment. + """Return unicode representation of given segment. + + Returns: + unicode: unicode string representing this object - @return unicode string representing this object """ ret = u"(" + unicode(self.name) for t in self.leaves: @@ -108,18 +114,20 @@ def __unicode__(self): return ret def __str__(self): - """ - Return utf-8 string representation of given segment. + """Return utf-8 string representation of given segment. + + Returns: + str: utf-8 string representing this object - @return utf-8 string representing this object """ return self.__unicode__().encode(ENCODING) def __repr__(self): - """ - Return internal representation of given segment. + """Return internal representation of given segment. 
+ + Returns: + str: internal representation of this segment - @return internal representation of this segment """ ret = '<' + self.__class__.__name__ + " at " + str(hex(id(self))) ret += u" name=" + repr(self.name) diff --git a/dsegmenter/treeseg/treesegmenter.py b/dsegmenter/treeseg/treesegmenter.py index 2f59f8e..2048ed6 100644 --- a/dsegmenter/treeseg/treesegmenter.py +++ b/dsegmenter/treeseg/treesegmenter.py @@ -17,9 +17,12 @@ ################################################################## # Imports -from .constants import GREEDY, GENEROUS, DEPENDENCY, CONSTITUENCY -from .discourse_segment import DiscourseSegment -from ..mateseg.dependency_graph import WORD, REL, DEPS, TAG +from __future__ import absolute_import, print_function, unicode_literals + +from dsegmenter.common import WORD, REL, DEPS, TAG +from dsegmenter.treeseg.constants import GREEDY, GENEROUS, DEPENDENCY, \ + CONSTITUENCY +from dsegmenter.treeseg.discourse_segment import DiscourseSegment ################################################################## # Constants @@ -31,13 +34,10 @@ class TreeSegmenter(object): """Class for converting parse trees to discourse segments. - Instance Variables: - decfunc - decision function - segment - public function for doing segmentation - type - type of trees to be processed - - Methods: - segment - extract discourse segments from parse trees + Attributes: + decfunc: decision function + segment: public function for doing segmentation + type: type of trees to be processed """ @@ -66,21 +66,23 @@ def _dg_segment(self, a_tree, a_predict=None, a_word_access=lambda x: x, a_strategy=GREEDY): """Extract discourse segments from dependency parse trees. - @param a_tree - parse tree which should be processed (with interface - compatible with nltk.parse.dependencygraph) - @param a_predict - prediction function - @param a_root_idx - index of the root node in the list of tree nodes - @param a_children - index of the child nodes - @param a_word_access - a function for accessing the token string for + Args: + a_tree (nltk.parse.dependencygraph): parse tree which should be + processed + a_predict (lambda): prediction function + a_root_idx (int): index of the root node in the list of tree nodes + a_children (list[int]): index of the child nodes + a_word_access (lambda): a function for accessing the token string for more complex, structured tokens - @param a_strategy - flag for handling missing and non-projective edges - (GREEDY means that only adjacent descendants of the root node will - be put into a segment, if the root initiates one; GENEROUS means that - all words between the root and its right- and left-most dependencies - will be put into one segment disregarding the actual structure of the - dependency tree) + a_strategy (int): flag for handling missing and non-projective edges + (GREEDY means that only adjacent descendants of the root node will + be put into a segment, if the root initiates one; GENEROUS means + that all words between the root and its right- and left-most + dependencies will be put into one segment disregarding the actual + structure of the dependency tree) - @return list of discourse segments + Returns: + list: discourse segments """ a_ret = [] @@ -129,13 +131,15 @@ def _dg_segment(self, a_tree, a_predict=None, return a_ret def _dg_decfunc(self, a_node, a_tree): - """ - Make a prediction whether given node initiates a segment. + """Make a prediction whether given node initiates a segment. 
+ + Args: + a_node (dict): parse node to be analyzed + a_tree (nltk.parse.dependencygraph): tree of analyzed node - @param a_node - parse node to be analyzed - @param a_tree - tree of analyzed node + Returns: + str or None: name of discourse segment or None - @return name of discourse segment or None """ chnode = None chtag = "" @@ -156,12 +160,15 @@ def _dg_decfunc(self, a_node, a_tree): def _cnst_segment(self, a_tree, a_ret, a_predict=None, a_start=0): """Extract discourse segments from constitutency parse trees. - @param a_tree - parse tree which should be processed - @param a_ret - target list which should be populated with segments - @param a_predict - prediction function - @param a_start - starting index of tokens + Args: + a_tree (nltk.parse.dependencygraph): parse tree which should + be processed + a_ret (list): target list which should be populated with segments + a_predict (lambda): prediction function + a_start (int): starting index of tokens - @return list of discourse segments + Returns: + list: discourse segments """ if a_predict is None: # find appropriate decision function @@ -187,12 +194,14 @@ def _cnst_segment(self, a_tree, a_ret, a_predict=None, a_start=0): return a_start def _cnst_decfunc(self, a_tree): - """ - Make a prediction whether given parse tree initiates a segment. + """Make a prediction whether given parse tree initiates a segment. + + Args: + a_tree (nltk.parse.dependencygraph):- tree of analyzed node - @param a_tree - tree of analyzed node + Returns: + str or None: name of discourse segment or None - @return name of discourse segment or None """ if a_tree.label() == "TOP": return "HS" @@ -202,11 +211,14 @@ def _cnst_decfunc(self, a_tree): def _extract_nonadjacent(self, a_seg, a_root_pos): """Remove from segment nodes which are not adjacent to root. - @param a_seg - list of terminals to be modified - @param a_root_idx - index of the root node + Args: + a_seg (list): terminals to be modified + a_root_idx (int): index of the root node + + Return: + list: non-adjacent words (discourse segment will also be + modified) - @return list of non-adjacent words (discourse segment will also be - modified) """ temp = [] adjacent = [] @@ -237,11 +249,13 @@ def _extract_nonadjacent(self, a_seg, a_root_pos): def _unite_nonadjacent(self, a_word_seg): """Add nodes between the discourse segment and its non-projective edges - to the segment. 
- @param a_word_seg - list of terminals and segments to be modified + Args: + a_word_seg (list): terminals and segments to be modified + + Returns: + list: modified word/segment list - @return modified word/segment list """ word_seg = [] right_leaves = [] diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..0842b4f --- /dev/null +++ b/pytest.ini @@ -0,0 +1,3 @@ +[pytest] +testpaths=tests +addopts = --cov=dsegmenter --cov-report=term-missing -v diff --git a/requirements.txt b/requirements.txt index 3dd9d19..e6a098a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ -numpy >=1.9.2 -nltk >=3.0.2 -scipy >=0.16.0 -scikit-learn >=0.15.2 -segeval >=2.0.11 +pip>=8.1.0 +segeval>=2.0 +nltk>=3.0.2 +scikit-learn>=0.18.1 +numpy>=1.9.2 +scipy>=0.9 diff --git a/scripts/discourse_segmenter b/scripts/discourse_segmenter index 56e9782..38fbbe7 100755 --- a/scripts/discourse_segmenter +++ b/scripts/discourse_segmenter @@ -1,25 +1,30 @@ #!/usr/bin/env python # -*- mode: python; coding: utf-8; -*- -""" -Parse input text into elementary discourse segments and output them +"""Parse input text into elementary discourse segments. USAGE: discourse_segmenter [GLOBAL_OPTIONS] type [TYPE_SPECIFIC_OPTIONS] [FILEs] -@author: Wladimir Sidorenko - """ ################################################################## # Imports -from __future__ import print_function, unicode_literals -from dsegmenter.bparseg import BparSegmenter, CTree, read_trees, read_segments, \ - trees2segs +from __future__ import absolute_import, print_function, unicode_literals + +from dsegmenter.bparseg import BparSegmenter, CTree +from dsegmenter.bparseg import read_tok_trees as bpar_read_tok_trees +from dsegmenter.bparseg import read_trees as bpar_read_trees +from dsegmenter.bparseg import trees2segs as bpar_trees2segs +from dsegmenter.common import read_segments from dsegmenter.edseg import EDSSegmenter, CONLL +from dsegmenter.mateseg import MateSegmenter +from dsegmenter.mateseg import read_tok_trees as mate_read_tok_trees +from dsegmenter.mateseg import read_trees as mate_read_trees +from dsegmenter.mateseg import trees2segs as mate_trees2segs from collections import defaultdict -from sklearn.cross_validation import KFold +from sklearn.model_selection import KFold import argparse import codecs import glob @@ -33,6 +38,7 @@ DEFAULT_ENCODING = "utf-8" ENCODING = DEFAULT_ENCODING EDSEG = "edseg" BPARSEG = "bparseg" +MATESEG = "mateseg" CV = "cv" TEST = "test" TRAIN = "train" @@ -40,34 +46,108 @@ SEGMENT = "segment" Segmenter = None N_FOLDS = 10 + ################################################################## # Methods def _set_train_test_args(a_parser): - """Add CLI options common to train and test mode to ArgumentParser instance. + """Add CLI options to ArgumentParser instance. 
 
-    @param a_parser - ArgumentParser instance to which new arguments should
-        be added
+    Args:
+        a_parser (argparse.ArgumentParser): ArgumentParser instance to which new
+            arguments should be added
 
-    @return \c void
+    Returns:
+        void:
 
     """
-    a_parser.add_argument("--bpar-sfx", help = """suffix of the names of BitPar files""", \
-                              type = str, default = "")
-    a_parser.add_argument("--seg-sfx", help = """suffix of the names of segmentation files""", \
-                              type = str, default = "")
-    a_parser.add_argument("bpar_dir", help="directory containing BitPar files", \
-                              type = str)
-    a_parser.add_argument("seg_dir", help="directory containing segmentation files", \
-                              type = str)
-
-def _read_files(a_files, a_encoding = DEFAULT_ENCODING, a_skip_line = ""):
+    a_parser.add_argument("--tree-sfx",
+                          help="suffix of the file names with input trees",
+                          type=str, default="")
+    a_parser.add_argument("--seg-sfx",
+                          help="suffix of the names of segmentation files",
+                          type=str, default="")
+    a_parser.add_argument("tree_dir",
+                          help="directory containing files with input trees",
+                          type=str)
+    a_parser.add_argument("seg_dir",
+                          help="directory containing segmentation files",
+                          type=str)
+
+
+def _set_bpar_mate_args(a_parser, a_name, a_in_name, a_dflt_path):
+    """Add CLI options common to the BitPar and Mate segmenters.
+
+    Args:
+        a_parser (argparse.ArgumentParser): ArgumentParser instance to which new
+            arguments should be added
+        a_name (str): name of the segmenter for which new options should be added
+        a_in_name (str): name of the input data structure
+        a_dflt_path (str): path to the default model
+
+    Returns:
+        void:
+
+    Note:
+        modifies ``a_parser`` in place
+
+    """
+    parser = a_parser.add_parser(a_name,
+                                 help="machine-learning discourse "
+                                 "segmenter for {:s}".format(a_in_name))
+    subparsers = parser.add_subparsers(help="action to perform",
+                                       dest="mode")
+    parser_train = subparsers.add_parser(TRAIN,
+                                         help="train new model"
+                                         " on {:s} and segment"
+                                         " files".format(a_in_name))
+    parser_train.add_argument("model",
+                              help="path to file in which to store the"
+                              " trained model", type=str)
+    _set_train_test_args(parser_train)
+
+    parser_cv = subparsers.add_parser(CV,
+                                      help="train and evaluate model"
+                                      " using cross-validation")
+    parser_cv.add_argument("-o", "--output-dir",
+                           help="output directory (leave empty for"
+                           " no output)", type=str, default="")
+    parser_cv.add_argument("model",
+                           help="path to the file in which the best"
+                           " trained model should be stored",
+                           type=str)
+    _set_train_test_args(parser_cv)
+
+    parser_test = subparsers.add_parser(TEST,
+                                        help="test model on {:s}"
+                                        " and segment files".format(
+                                            a_in_name))
+    parser_test.add_argument("-m", "--model",
+                             help="path to file containing model",
+                             type=str,
+                             default=a_dflt_path)
+    _set_train_test_args(parser_test)
+
+    parser_segment = subparsers.add_parser(SEGMENT,
+                                           help="split {:s} into"
+                                           " discourse units".format(
+                                               a_in_name))
+    parser_segment.add_argument("-m", "--model",
+                                help="path to file containing model",
+                                type=str,
+                                default=a_dflt_path)
+    parser_segment.add_argument("files", help="input files",
+                                nargs='*', metavar="file")
+
+
+def _read_files(a_files, a_encoding=DEFAULT_ENCODING, a_skip_line=""):
     """Return iterator over lines of the input file.
- @param a_files - files to read from - @param a_encoding - text encoding used for input/output - @param a_skip_line - line which should be skipped during iteration + Args: + a_files (list): files to read from + a_encoding (str): text encoding used for input/output + a_skip_line (str): line which should be skipped during iteration - @return iterator over input lines + Yields: + input lines """ if not a_files: @@ -79,82 +159,106 @@ def _read_files(a_files, a_encoding = DEFAULT_ENCODING, a_skip_line = ""): yield line.rstrip() else: for fname in a_files: - with codecs.open(fname, encoding = a_encoding, errors = "replace") as ifile: + with codecs.open(fname, + encoding=a_encoding, errors="replace") as ifile: for line in ifile: if line == a_skip_line: print(line.encode(a_encoding)) else: yield line.rstrip() -def _align_files(a_bpar_dir, a_seg_dir, a_bpar_sfx, a_seg_sfx): + +def _align_files(a_tree_dir, a_seg_dir, a_tree_sfx, a_seg_sfx): """Align BitPar and segment files in two directories. - @param a_bpar_dir - directory containing files with BitPar trees - @param a_seg_dir - directory containing files with discourse segments - @param a_bpar_sfx - suffix of the names of BitPar files - @param a_seg_sfx - suffix of the names of segmentation files + Args: + a_tree_dir (str): directory containing files with BitPar trees + a_seg_dir (str): directory containing files with discourse segments + a_tree_sfx (str): suffix of the names of BitPar files + a_seg_sfx (str): suffix of the names of segmentation files - @return iterator over list of 2-tuples with BitPar and segment file + Yields: + 2-tuples with BitPar and segment file """ - segf = ""; basefname = ""; - BP_SFX_RE = re.compile(re.escape(a_bpar_sfx)) - bpar_files = glob.iglob(os.path.join(a_bpar_dir, '*' + a_bpar_sfx)) + segf = "" + basefname = "" + BP_SFX_RE = re.compile(re.escape(a_tree_sfx) + '$') + bpar_files = glob.iglob(os.path.join(a_tree_dir, '*' + a_tree_sfx)) for bpf in bpar_files: basefname = BP_SFX_RE.sub("", os.path.basename(bpf)) segf = os.path.join(a_seg_dir, basefname + a_seg_sfx) if os.path.isfile(segf) and os.access(segf, os.R_OK): yield (bpf, segf) else: - print(\ - "WARNING: No counterpart file found for BitPar file '{:s}'.".format(bpf), \ - file = sys.stderr) + print( + "WARNING: No counterpart file found for BitPar" + " file '{:s}'.".format(bpf), file=sys.stderr) -def _read_trees_segments(a_bpar_dir, a_seg_dir, a_bpar_sfx, a_seg_sfx, \ - a_fname2item = False, a_encoding = DEFAULT_ENCODING): + +def _read_trees_segments(a_tree_dir, a_seg_dir, a_tree_sfx, a_seg_sfx, + a_read_tok_trees=bpar_read_tok_trees, + a_trees2segs=bpar_trees2segs, + a_fname2item=False, + a_encoding=DEFAULT_ENCODING): """Read input files containing discourse segments and BitPar trees. 
- @param a_bpar_dir - directory containing files with BitPar trees - @param a_seg_dir - directory containing files with discourse segments - @param a_bpar_sfx - suffix of the names of BitPar files - @param a_seg_sfx - suffix of the names of segmentation files - @param a_fname2item - generate mappings from filenames to trees - @param a_encoding - text encoding used for input/output + Args: + a_tree_dir (str): directory containing files with BitPar trees + a_seg_dir (str): directory containing files with discourse segments + a_tree_sfx (str): suffix of the names of BitPar files + a_seg_sfx (str): suffix of the names of segmentation files + a_read_tok_trees (lambda): custom function for reading syntax trees + a_trees2segs (lambda): custom function for aligning syntax trees with + segments + a_fname2item (bool): generate mappings from filenames to trees + a_encoding (str): text encoding used for input/output - @return 2-tuple with a list of segments and a list of trees + Returns: + tuple: list of segments and a list of trees """ if a_fname2item: - trees = defaultdict(list); segments = defaultdict(list) + trees = defaultdict(list) + segments = defaultdict(list) else: - trees = []; segments = [] + trees = [] + segments = [] ts, segs = trees, segments - tree2seg = {}; toks2trees = {}; toks2segs = {}; - bpar_seg_files = _align_files(a_bpar_dir, a_seg_dir, a_bpar_sfx, a_seg_sfx) + tree2seg = {} + toks2trees = {} + toks2segs = {} + tree_seg_files = _align_files(a_tree_dir, a_seg_dir, a_tree_sfx, a_seg_sfx) # do tree/segment alignment - for bpf, segf in bpar_seg_files: + for tf, segf in tree_seg_files: if a_fname2item: - ts, segs = trees[bpf], segments[segf] - with codecs.open(bpf, 'r', encoding = a_encoding) as ibpf: - toks2trees, _ = read_trees(ibpf) - with codecs.open(segf, 'r', encoding = a_encoding) as isegf: + ts, segs = trees[tf], segments[segf] + with codecs.open(tf, 'r', encoding=a_encoding) as itf: + toks2trees, _ = a_read_tok_trees(itf) + with codecs.open(segf, 'r', encoding=a_encoding) as isegf: toks2segs = read_segments(isegf) - tree2seg = trees2segs(toks2trees, toks2segs) - for t, s in tree2seg.iteritems(): - ts.append(t) - segs.append(s) + if a_trees2segs is None: + pass + else: + tree2seg = a_trees2segs(toks2trees, toks2segs) + for t, s in tree2seg.iteritems(): + ts.append(t) + segs.append(s) return (trees, segments) + def _output_segment_forrest(a_forrest, a_segmenter, a_output, a_encoding): """Split CONLL sentences in elementary discourse units and output them. 
 
-    @param a_forrest - pointer to CONLL forrest
-    @param a_segmnter - pointer to discourse segmenter
-    @param a_output - boolean flag indicating whether dependency trees
-    should be printed
-    @param a_encoding - text encoding used for output
+    Args:
+        a_forrest (dsegmenter.edseg.CONLL): forrest of CoNLL trees
+        a_segmenter (Segmenter): pointer to discourse segmenter
+        a_output (bool): flag indicating whether dependency trees
+            should be printed
+        a_encoding (str): text encoding used for output
 
-    @return \c void
+    Returns:
+        void:
 
     """
     if a_forrest.is_empty():
@@ -164,18 +268,21 @@ def _output_segment_forrest(a_forrest, a_segmenter, a_output, a_encoding):
         print(unicode(a_forrest).encode(a_encoding))
     sds_list = [a_segmenter.segment(sent) for sent in a_forrest]
     for sds in sds_list:
-        sds.pretty_print(a_encoding = a_encoding)
+        sds.pretty_print(a_encoding=a_encoding)
     a_forrest.clear()
 
-def edseg_segment(a_ilines, a_output_trees, a_encoding = DEFAULT_ENCODING):
+
+def edseg_segment(a_ilines, a_output_trees, a_encoding=DEFAULT_ENCODING):
     """Perform rule-based segmentation of CONLL dependency trees.
 
-    @param a_ilines - iterator over input lines
-    @param a_output_trees - boolean flag indicating whether dependency trees
-    should be printed
-    @param a_encoding - text encoding used for input/output
+    Args:
+        a_ilines (iterable): iterator over input lines
+        a_output_trees (bool): flag indicating whether dependency trees
+            should be printed
+        a_encoding (str): text encoding used for input/output
 
-    @return \c void
+    Returns:
+        void:
 
     """
     forrest = CONLL()
@@ -183,62 +290,75 @@ def edseg_segment(a_ilines, a_output_trees, a_encoding = DEFAULT_ENCODING):
     for line in a_ilines:
         if not line:
             # print collected sentences
-            _output_segment_forrest(forrest, segmenter, a_output_trees, a_encoding)
+            _output_segment_forrest(forrest, segmenter, a_output_trees,
+                                    a_encoding)
             # output line
             print(line.encode(a_encoding))
         # otherwise, append the line to the CONLL forrest
         else:
             forrest.add_line(line)
-            istart = True
     # output remained EDUs
     _output_segment_forrest(forrest, segmenter, a_output_trees, a_encoding)
 
-def bparseg_segment(a_segmenter, a_ilines, a_encoding = DEFAULT_ENCODING, \
-                    a_ostream = sys.stdout):
-    """Perform machine-learning driven segmentation of BitPar constituency trees.
 
-    @param a_segmenter - pointer to BitPar segmenter
-    @param a_ilines - iterator over input lines
-    @param a_encoding - text encoding used for input/output
-    @param a_ostream - output stream
+def bpar_mate_segment(a_segmenter, a_lines, a_encoding=DEFAULT_ENCODING,
+                      a_ostream=sys.stdout, a_read_trees=CTree.parse_lines):
+    """Perform segmentation of BitPar or Mate parse trees.
+
+    Args:
+        a_segmenter (BparSegmenter or MateSegmenter): segmenter to use
+        a_lines (iterable): iterator over input lines
+        a_encoding (str): text encoding used for input/output
+        a_ostream (IOstream): output stream
+        a_read_trees (func): custom function for reading trees
 
-    @return \c void
+    Returns:
+        void:
 
     """
-    segments = []
-    for ctree in CTree.parse_lines(a_ilines):
-        segments = a_segmenter.segment([ctree])
-        print(u'\n'.join([unicode(s[-1]) for s in segments]).encode(a_encoding), \
-                  file = a_ostream)
+    segments = None
+    for itree in a_read_trees(a_lines):
+        segments = a_segmenter.segment([itree])
+        print('\n'.join([unicode(s[-1])
+                         for s in segments]).encode(a_encoding),
+              file=a_ostream)
+
 
-def bparseg_test(a_segmenter, a_trees, a_segments):
+def bpar_mate_test(a_segmenter, a_trees, a_segments):
     """Evaluate performance of segment classification.
- @param a_segmenter - pointer to BitPar segmenter - @param a_trees - list of BitPar trees - @param a_segments - list of discourse segments corresponding - to BitPar trees + Args: + a_segmenter (Segmenter): pointer to BitPar segmenter + a_trees (list[dsegmenter.bparseg.CTree]): list of BitPar trees + a_segments (list[dsegmenter.treeseg.DiscourseSegment]): + list of discourse segments corresponding to BitPar trees - @return \c void + Returns: + void: """ macro_f1, micro_f1 = a_segmenter.test(a_trees, a_segments) - print("Macro F1-score: {:.2%}".format(macro_f1), file = sys.stderr) - print("Micro F1-score: {:.2%}".format(micro_f1), file = sys.stderr) + print("Macro F1-score: {:.2%}".format(macro_f1), file=sys.stderr) + print("Micro F1-score: {:.2%}".format(micro_f1), file=sys.stderr) def _cnt_stat(a_gold_segs, a_pred_segs): - """Estimate the number of true positives, false positives, and false negatives + """Estimate true positives, false positives, and false negatives - @param a_gold_segs - gold segments - @param a_pred_segs - predicted segments + Args: + a_gold_segs (list[dsegmenter.treeseg.DiscourseSegment]): + gold segments + a_pred_segs (list[dsegmenter.treeseg.DiscourseSegment]): + predicted segments - @return 3-tuple with true positives, false positives, and false negatives + Returns: + tuple: true positives, false positives, and false negatives """ tp = fp = fn = 0 for gs, ps in zip(a_gold_segs, a_pred_segs): - gs = gs.lower(); ps = ps.lower() + gs = gs.lower() + ps = ps.lower() if gs == "none": if ps != "none": fp += 1 @@ -249,22 +369,29 @@ def _cnt_stat(a_gold_segs, a_pred_segs): return tp, fp, fn -def crossval(a_segmenter, a_path, a_fname2trees, a_fname2segs, \ - a_output = False, a_out_dir = ".", a_out_sfx = ".tree", \ - a_folds = N_FOLDS, a_encoding = ENCODING): +def crossval(a_segmenter, a_path, a_fname2trees, a_fname2segs, + a_tree_sfx, a_seg_sfx, + a_output=False, a_out_dir=".", a_out_sfx=".tree", + a_folds=N_FOLDS, a_encoding=ENCODING, a_read_trees=None): """Train and evaluate model using n-fold cross-validation. 
- @param a_segmenter - pointer to untrained segmenter instance - @param a_path - path in which to store the model - @param a_fname2trees - mapping from file names to trees - @param a_fname2segs - mapping from file names to segments - @param a_output - boolean flag indicating whether output files should be produced - @param a_out_dir - directory for writing output files - @param a_out_sfx - suffix which should be appended to output files - @param a_folds - number of folds - @param a_encoding - default output encoding - - @return 3-tuple containing list of macro F-scores, micro F-scores, and F1_{tp,fp} + Args: + a_segmenter (Segmenter): pointer to untrained segmenter instance + a_path (str): path in which to store the model + a_fname2trees (dict): mapping from file names to trees + a_fname2segs (dict): mapping from file names to segments + a_tree_sfx (str): suffix of the names of parse files + a_seg_sfx (str): suffix of the names of segmentation files + a_output (bool): boolean flag indicating whether output files should + be produced + a_out_dir (str): directory for writing output files + a_out_sfx (str): suffix which should be appended to output files + a_folds (int): number of folds + a_encoding (str): default output encoding + a_read_trees (func): method for reading input trees + + Returns: + tuple: lists of macro F-scores, micro F-scores, and F1_{tp,fp} """ # do necessary imports @@ -277,27 +404,36 @@ def crossval(a_segmenter, a_path, a_fname2trees, a_fname2segs, \ "Unmatching number of files with trees and segments." # make file names in `a_fname2trees` and `a_fname2segs` uniform and convert # segment classes to strings - a_fname2segs = {os.path.splitext(os.path.basename(k))[0] + a_out_sfx: \ - [str(iseg) for iseg in v] for k, v in a_fname2segs.iteritems()} - ofname2ifname = {os.path.splitext(os.path.basename(k))[0] + a_out_sfx: k \ + seg_sfx_re = re.compile(re.escape(a_seg_sfx) + '$') + a_fname2segs = {seg_sfx_re.sub("", os.path.basename(k)) + a_out_sfx: + [str(iseg) for iseg in v] + for k, v in a_fname2segs.iteritems()} + tree_sfx_re = re.compile(re.escape(a_tree_sfx) + '$') + ofname2ifname = {tree_sfx_re.sub("", os.path.basename(k)) + a_out_sfx: k for k in a_fname2trees} - a_fname2trees = {os.path.splitext(os.path.basename(k))[0] + a_out_sfx: v \ + a_fname2trees = {tree_sfx_re.sub("", os.path.basename(k)) + a_out_sfx: v for k, v in a_fname2trees.iteritems()} # estimate the number of and generate folds fnames = a_fname2trees.keys() n_fnames = len(fnames) if n_fnames < 2: - print("Insufficient number of samples for cross-validation: {:d}.".format(\ - n_fnames), file = sys.stderr) + print("Insufficient number of samples for" + " cross-validation: {:d}.".format(n_fnames), + file=sys.stderr) return -1 - folds = KFold(n_fnames, min(len(fnames), a_folds)) + kf = KFold(n_splits=min(n_fnames, a_folds)) # generate features for trees - fname2feats = {fname: [a_segmenter.featgen(t) for t in trees] \ + fname2feats = {fname: [a_segmenter.featgen(*t) + if isinstance(t, tuple) + else a_segmenter.featgen(t) + for t in trees] for fname, trees in a_fname2trees.iteritems()} # initialize auxiliary variables - F1_macro = F1_micro = F1_tpfp = 0. - macro_f1 = 0.; macro_F1s = [] - micro_f1 = 0.; micro_F1s = [] + F1_tpfp = 0. + macro_f1 = 0. + macro_F1s = [] + micro_f1 = 0. 
+ micro_F1s = [] best_macro_f1 = float("-inf") best_i = -1 # index of the best run istart = ilen = 0 @@ -309,11 +445,12 @@ def crossval(a_segmenter, a_path, a_fname2trees, a_fname2segs, \ processed_fnames = {} in_fname = test_fname = out_fname = "" train_feats = train_segs = None - test_feats = []; test_segs = [] + test_feats = [] + test_segs = [] tp = fp = fn = tp_i = fp_i = fn_i = 0 # iterate over folds - for i, (train, test) in enumerate(folds): - print("Fold: {:d}".format(i), file = sys.stderr) + for i, (train, test) in enumerate(kf.split(fnames)): + print("Fold: {:d}".format(i), file=sys.stderr) train_feats = [feat for k in train for feat in fname2feats[fnames[k]]] train_segs = [seg for k in train for seg in a_fname2segs[fnames[k]]] istart = 0 @@ -324,20 +461,26 @@ def crossval(a_segmenter, a_path, a_fname2trees, a_fname2segs, \ test_feats += fname2feats[fnames[k]] test_segs += a_fname2segs[fnames[k]] # train classifier model - a_segmenter.model = BparSegmenter.DEFAULT_PIPELINE a_segmenter.model.fit(train_feats, train_segs) # obtain new predictions pred_segs = a_segmenter.model.predict(test_feats) # update statistics and F1 scores tp_i, fp_i, fn_i = _cnt_stat(test_segs, pred_segs) - tp += tp_i; fp += fp_i; fn += fn_i - _, _, macro_f1, _ = precision_recall_fscore_support(test_segs, pred_segs, average = 'macro', \ - pos_label = None) - _, _, micro_f1, _ = precision_recall_fscore_support(test_segs, pred_segs, average = 'micro', \ - pos_label = None) - macro_F1s.append(macro_f1); micro_F1s.append(micro_f1) - print("Macro F1: {:.2%}".format(macro_f1), file = sys.stderr) - print("Micro F1: {:.2%}".format(micro_f1), file = sys.stderr) + tp += tp_i + fp += fp_i + fn += fn_i + _, _, macro_f1, _ = precision_recall_fscore_support(test_segs, + pred_segs, + average='macro', + pos_label=None) + _, _, micro_f1, _ = precision_recall_fscore_support(test_segs, + pred_segs, + average='micro', + pos_label=None) + macro_F1s.append(macro_f1) + micro_F1s.append(micro_f1) + print("Macro F1: {:.2%}".format(macro_f1), file=sys.stderr) + print("Micro F1: {:.2%}".format(micro_f1), file=sys.stderr) # update maximum macro F-score and store the most successful model if macro_f1 > best_macro_f1: best_i = i @@ -347,88 +490,84 @@ def crossval(a_segmenter, a_path, a_fname2trees, a_fname2segs, \ if a_output: for k in test: test_fname = fnames[k] - if test_fname in processed_fnames and processed_fnames[test_fname] > macro_f1: + if test_fname in processed_fnames and \ + processed_fnames[test_fname] > macro_f1: continue processed_fnames[test_fname] = macro_f1 # fname2gld_pred[test_fname] = [(test_segs[i], pred_segs[i]) \ - # for i in xrange(*fname2range[test_fname])] + # for i in + # xrange( + # *fname2range[test_fname])] in_fname = ofname2ifname[test_fname] out_fname = os.path.join(a_out_dir, test_fname) with open(out_fname, "w") as ofile: - print("(TEXT", file = ofile) - bparseg_segment(a_segmenter, _read_files([in_fname], a_encoding), \ - a_encoding, ofile) - print(")", file = ofile) - test_feats = []; test_segs = [] - del trees[:]; del out_fnames[:]; fname2range.clear() - print("Average macro F1: {:.2%} +/- {:.2%}".format(np.mean(macro_F1s), np.std(macro_F1s)), \ - file = sys.stderr) - print("Average micro F1: {:.2%} +/- {:.2%}".format(np.mean(micro_F1s), np.std(micro_F1s)), \ - file = sys.stderr) + print("(TEXT", file=ofile) + bpar_mate_segment(a_segmenter, + _read_files([in_fname], a_encoding), + a_encoding=a_encoding, + a_ostream=ofile, + a_read_trees=a_read_trees) + print(")", file=ofile) + test_feats = [] + 
test_segs = [] + del trees[:] + del out_fnames[:] + fname2range.clear() + print("Average macro F1: {:.2%} +/- {:.2%}".format( + np.mean(macro_F1s), np.std(macro_F1s)), file=sys.stderr) + print("Average micro F1: {:.2%} +/- {:.2%}".format( + np.mean(micro_F1s), np.std(micro_F1s)), file=sys.stderr) if tp or fp or fn: F1_tpfp = (2. * tp / (2. * tp + fp + fn)) - print("F1_{{tp,fp}} {:.2%}".format(F1_tpfp), file = sys.stderr) + print("F1_{{tp,fp}} {:.2%}".format(F1_tpfp), + file=sys.stderr) return (macro_F1s, micro_F1s, F1_tpfp, best_i) + def main(argv): """Read input text and segment it into elementary discourse units. - @param argv - command line arguments + Args: + argv (list[str]): command line arguments - @return \c 0 on success, non-\c 0 otherwise + Returns: + int: 0 on success, non-0 otherwise """ # process arguments - parser = argparse.ArgumentParser(description = """Script for segmenting text -into elementary discourse units.""") + parser = argparse.ArgumentParser( + description="Script for segmenting text into elementary" + " discourse units.") # define global options - parser.add_argument("-e", "--encoding", help = "input encoding of text", nargs = 1, \ - type = str, default = DEFAULT_ENCODING) - parser.add_argument("-s", "--skip-line", help = """lines which should be ignored during the -processing and output without changes (defaults to empty lines)""", type = str, default = "") + parser.add_argument("-e", "--encoding", help="input encoding of text", + nargs=1, type=str, default=DEFAULT_ENCODING) + parser.add_argument("-s", "--skip-line", + help="lines which should be ignored during the " + "processing and output without changes" + " (defaults to empty lines)", + type=str, default="") # add type-specific subparsers - subparsers = parser.add_subparsers(help="type of discourse segmenter to use", dest = "dtype") + subparsers = parser.add_subparsers(help="type of discourse segmenter" + " to use", dest="dtype") # edgseg argument parser - parser_edseg = subparsers.add_parser(EDSEG, help = "rule-based discourse segmenter for CONLL\ - dependency trees") - parser_edseg.add_argument("-o", "--output-trees", help="output dependency trees along with\ - segments", action = "store_true") - parser_edseg.add_argument("files", help="input files", nargs = '*', metavar="file") - - # bpar argument parser - parser_bpar = subparsers.add_parser(BPARSEG, help = """machine-learning driven discourse\ - segmenter for BitPar constituency trees""") - bpar_subparsers = parser_bpar.add_subparsers(help = "action to perform", dest = "mode") - parser_bpar_train = bpar_subparsers.add_parser(TRAIN, help = """train new model on BitPar - and segment files""") - parser_bpar_train.add_argument("--cross-validate", help = "train model in cross-validation mode") - parser_bpar_train.add_argument("model", help = "path to file in which to store the trained model", \ - type = str) - _set_train_test_args(parser_bpar_train) - - parser_bpar_cv = bpar_subparsers.add_parser(CV, help = """train and evaluate model - using cross-validation""") - parser_bpar_cv.add_argument("-o", "--output-dir", help = """output directory (leave empty for - no output)""", type = str, default = "") - parser_bpar_cv.add_argument("model", help = """path to the file in which the best trained model - should be stored""", type = str) - _set_train_test_args(parser_bpar_cv) - - parser_bpar_test = bpar_subparsers.add_parser(TEST, help = """test model on BitPar - and segment files""") - parser_bpar_test.add_argument("-m", "--model", help = "path to file containing 
model", \ - type = str, default = BparSegmenter.DEFAULT_MODEL) - _set_train_test_args(parser_bpar_test) - - parser_bpar_segment = bpar_subparsers.add_parser(SEGMENT, help = """split BitPar trees\ - into discourse units""") - parser_bpar_segment.add_argument("-m", "--model", help = "path to file containing model", \ - type = str, default = BparSegmenter.DEFAULT_MODEL) - parser_bpar_segment.add_argument("files", help="input files", nargs = '*', metavar="file") - + parser_edseg = subparsers.add_parser(EDSEG, + help="rule-based discourse segmenter" + " for CONLL dependency trees") + parser_edseg.add_argument("-o", "--output-trees", + help="output dependency trees along with" + " segments", action="store_true") + parser_edseg.add_argument("files", help="input files", + nargs='*', metavar="file") + + # add bpar arguments + _set_bpar_mate_args(subparsers, BPARSEG, "BitPar constituency trees", + BparSegmenter.DEFAULT_MODEL) + # add mate arguments + _set_bpar_mate_args(subparsers, MATESEG, "Mate dependency trees", + MateSegmenter.DEFAULT_MODEL) args = parser.parse_args() # process input files @@ -441,6 +580,15 @@ processing and output without changes (defaults to empty lines)""", type = str, edseg_segment(ifiles, args.output_trees) # process input with bparseg else: + if args.dtype == BPARSEG: + read_trees = bpar_read_trees + read_tok_trees = bpar_read_tok_trees + trees2segs = bpar_trees2segs + else: + read_trees = mate_read_trees + read_tok_trees = mate_read_tok_trees + trees2segs = mate_trees2segs + if args.mode == TRAIN or args.mode == CV: # make sure there is a directory for storing the model mdir = os.path.dirname(args.model) @@ -448,26 +596,53 @@ processing and output without changes (defaults to empty lines)""", type = str, pass elif os.path.exists(mdir): if not os.path.isdir(mdir) or not os.access(mdir, os.R_OK): - print("Can't write to directory '{:s}'.".format(mdir), file = sys.stderr) + print("Can't write to directory '{:s}'.".format(mdir), + file=sys.stderr) else: os.makedirs(mdir) - segmenter = BparSegmenter() - trees, segments = _read_trees_segments(args.bpar_dir, args.seg_dir, args.bpar_sfx, \ - args.seg_sfx, args.mode == CV, args.encoding) + + if args.dtype == BPARSEG: + segmenter = BparSegmenter( + a_model=BparSegmenter.DEFAULT_PIPELINE) + else: + segmenter = MateSegmenter(model=MateSegmenter.DEFAULT_PIPELINE) + + trees, segments = _read_trees_segments( + args.tree_dir, args.seg_dir, + args.tree_sfx, args.seg_sfx, + read_tok_trees, trees2segs, + args.mode == CV, args.encoding) + if args.mode == TRAIN: segmenter.train(trees, segments, args.model) else: - crossval(segmenter, args.model, trees, segments, bool(args.output_dir), args.output_dir) + crossval(segmenter, args.model, trees, segments, + args.tree_sfx, args.seg_sfx, + bool(args.output_dir), args.output_dir, + a_read_trees=read_trees) else: - assert os.path.exists(args.model) and os.path.isfile(args.model) and \ - os.access(args.model, os.R_OK), "Can't read model file '{:s}'.".format(args.model) - segmenter = BparSegmenter(a_model = args.model) + assert os.path.exists(args.model) and \ + os.path.isfile(args.model) and \ + os.access(args.model, os.R_OK), \ + "Can't read model file '{:s}'.".format(args.model) + + if args.dtype == BPARSEG: + segmenter = BparSegmenter(a_model=args.model) + else: + segmenter = MateSegmenter(model=args.model) + if args.mode == TEST: - trees, segments = _read_trees_segments(args.bpar_dir, args.seg_dir, args.bpar_sfx, \ - args.seg_sfx, args.encoding) - bparseg_test(segmenter, trees, segments) + trees, 
segments = _read_trees_segments( + args.tree_dir, args.seg_dir, + args.tree_sfx, args.seg_sfx, + a_read_tok_trees=read_tok_trees, + a_trees2segs=trees2segs, + a_encoding=args.encoding) + bpar_mate_test(segmenter, trees, segments) else: - bparseg_segment(segmenter, ifiles, args.encoding) + bpar_mate_segment(segmenter, ifiles, args.encoding, + a_read_trees=read_trees) + ################################################################## # Main diff --git a/scripts/mate_segmenter b/scripts/mate_segmenter deleted file mode 100755 index 37890be..0000000 --- a/scripts/mate_segmenter +++ /dev/null @@ -1,81 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8; mode: python; -*- - -################################################################## -# Imports -from dsegmenter.mateseg.dependency_graph import read_deptree_file -from dsegmenter.mateseg.segmentation_tree import read_segtree_file -from dsegmenter.mateseg.matesegmenter import MateSegmenter - -import argparse -import os - - -################################################################## -# Methods -def load_corpus_folder(folder_path, file_suffix='.suffix', - file_type_desc='corpus', reader_func=None): - print 'Finding {} files...'.format(file_type_desc), - files = sorted([f for f in os.listdir(folder_path) - if f.endswith(file_suffix)]) - print 'found %d.' % len(files) - texts = set([fn[:-len(file_suffix)] for fn in files]) - corpus = { - text: reader_func(os.path.join(folder_path, text + file_suffix)) - for text in texts - } - return corpus - - -def load_dep_folder(folder_path, file_suffix_dep='.parsed.conll'): - return load_corpus_folder( - folder_path, file_suffix=file_suffix_dep, - file_type_desc='dependency parse', reader_func=read_deptree_file) - - -def load_seg_folder(folder_path, file_suffix_seg='.tree'): - return load_corpus_folder( - folder_path, file_suffix=file_suffix_seg, - file_type_desc='discourse segmentation', reader_func=read_segtree_file) - - -def main(): - # initialize argument parser - aparser = argparse.ArgumentParser(description="Discourse segmentation " - "model to be trained and tested on dependency parses.") - aparser.add_argument( - "mode", help="mode", choices=['eval', 'train', 'segment']) - aparser.add_argument( - "in_seg", help="input folder for segmentation files " - "(will be ignored in test mode)") - aparser.add_argument( - "in_dep", help="input folder for mate dependencies") - aparser.add_argument( - "out_folder", help="output folder for predictions or models") - aparser.add_argument( - "--model", help="model to use for prediction", nargs=1) - args = aparser.parse_args() - - if args.mode in ['eval', 'train']: - seg_corpus = load_seg_folder(args.in_seg) - dep_corpus = load_dep_folder(args.in_dep) - ms = MateSegmenter(model=None) - if args.mode == 'eval': - ms.cross_validate(seg_corpus, dep_corpus, args.out_folder) - elif args.mode == 'train': - ms.train(seg_corpus, dep_corpus, args.out_folder) - elif args.mode == 'segment': - dep_corpus = load_dep_folder(args.in_dep) - if (args.model is None or len(args.model) != 1 or args.model[0] is None - or args.model[0] == ''): - print "No model specified, using pretrained model." 
- ms = MateSegmenter() - else: - ms = MateSegmenter(model=args.model[0]) - ms.segment(dep_corpus, args.out_folder) - - -################################################################## -# Main -if __name__ == "__main__": - main() diff --git a/setup.cfg b/setup.cfg index 2a9acf1..8314f6f 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,2 +1,13 @@ +[aliases] +test=pytest + [bdist_wheel] -universal = 1 +universal=1 + +[build_sphinx] +all_files=1 +build-dir=docs/_build +source-dir=docs + +[upload_sphinx] +upload-dir=docs/_build/html diff --git a/setup.py b/setup.py index cb532df..d6ee5f9 100755 --- a/setup.py +++ b/setup.py @@ -1,23 +1,42 @@ #!/usr/bin/env python +# -*- mode: python; coding: utf-8; -*- ################################################################## # Libraries -from distutils.core import setup +from setuptools import setup from os import path import codecs -import glob + ################################################################## # Variables and Constants -pwd = path.abspath(path.dirname(__file__)) -with codecs.open(path.join(pwd, "README.rst"), encoding="utf-8") as ifile: +PWD = path.abspath(path.dirname(__file__)) +ENCODING = "utf-8" + +with codecs.open(path.join(PWD, "README.rst"), encoding="utf-8") as ifile: long_description = ifile.read() +INSTALL_REQUIRES = [] +with codecs.open(path.join(PWD, "requirements.txt"), + encoding=ENCODING) as ifile: + for iline in ifile: + iline = iline.strip() + if iline: + INSTALL_REQUIRES.append(iline) + +TEST_REQUIRES = [] +with codecs.open(path.join(PWD, "test-requirements.txt"), + encoding=ENCODING) as ifile: + for iline in ifile: + iline = iline.strip() + if iline: + TEST_REQUIRES.append(iline) + ################################################################## # setup() setup( name="dsegmenter", - version="0.0.1.dev1", + version="0.2.1", description=("Collection of discourse segmenters " "(with pre-trained models for German)"), long_description=long_description, @@ -29,9 +48,6 @@ packages=["dsegmenter", "dsegmenter.bparseg", "dsegmenter.edseg", "dsegmenter.treeseg", "dsegmenter.mateseg", "dsegmenter.evaluation"], - # package_dir = {"dsegmenter.bparseg": "dsegmenter", - # "dsegmenter.edseg": "dsegmenter", - # "dsegmenter.treeseg": "dsegmenter"}, package_data={ "dsegmenter.edseg": [path.join("data", fname) for fname in ( "dass_verbs.txt", "discourse_preps.txt", "finite_verbs.txt", @@ -39,16 +55,13 @@ "dsegmenter.bparseg": [path.join("data", "*.npy"), path.join("data", "*.model")], "dsegmenter.mateseg": [path.join("data", "mate.model")]}, - requires=["numpy (>=1.9.2)", - "scipy (>=0.16.0)", - "nltk (>=3.0.2)", - "scikit.learn (>=0.15.2)", - "segeval (>=2.0.11)"], - provides=["dsegmenter (0.0.1)"], + install_requires=INSTALL_REQUIRES, + setup_requires=["pytest-runner"], + tests_require=TEST_REQUIRES, + provides=["dsegmenter (0.2.12)"], scripts=[path.join("scripts", "discourse_segmenter"), - path.join("scripts", "evaluation"), - path.join("scripts", "mate_segmenter")], - classifiers=["Development Status :: 2 - Pre-Alpha", + path.join("scripts", "evaluation")], + classifiers=["Development Status :: 4 - Beta", "Environment :: Console", "Intended Audience :: Science/Research", "License :: OSI Approved :: MIT License", diff --git a/test-requirements.txt b/test-requirements.txt new file mode 100644 index 0000000..b483069 --- /dev/null +++ b/test-requirements.txt @@ -0,0 +1,2 @@ +pytest>=3.0.3 +pytest-cov>=2.3.1 diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..f9d58f8 --- /dev/null +++ 
b/tests/conftest.py @@ -0,0 +1,26 @@ +#!/usr/bin/env python +# -*- coding: utf-8; mode: python; -*- + +################################################################## +# Imports +from pytest import fixture + +import codecs +import os + + +################################################################## +# Constants +ENCODING = "utf-8" +DIRNAME = os.path.dirname(__file__) +DATA_DIR = os.path.join(DIRNAME, "data") +MATESEG_INPUT1_FNAME = os.path.join(DATA_DIR, "mateseg.input.test1") +with codecs.open(MATESEG_INPUT1_FNAME, 'r', ENCODING) as ifile: + MATESEG_INPUT1 = [iline for iline in ifile] + + +################################################################## +# Fixtures +@fixture(scope="module") +def mateseg_input1(): + return MATESEG_INPUT1 diff --git a/tests/data/mateseg.input.test1 b/tests/data/mateseg.input.test1 new file mode 100644 index 0000000..7d8c85b --- /dev/null +++ b/tests/data/mateseg.input.test1 @@ -0,0 +1,27 @@ +1 Im _ in _ APPRART _ dat|sg|neut -1 15 _ MO _ _ +2 Gegenteil _ Gegenteil _ NN _ dat|sg|neut -1 1 _ NK _ _ +3 : _ -- _ $. _ _ -1 2 _ -- _ _ +4 Ein _ ein _ ART _ nom|sg|masc -1 5 _ NK _ _ +5 Abbau _ Abbau _ NN _ nom|sg|masc -1 10 _ SB _ _ +6 weniger _ weniger _ PIAT _ gen|pl|masc -1 8 _ NK _ _ +7 produktiver _ produktiv _ ADJA _ gen|pl|masc|pos -1 8 _ NK _ _ +8 Arbeitsplätze _ Arbeitsplatz _ NN _ gen|pl|masc -1 5 _ AG _ _ +9 etwa _ etwa _ ADV _ _ -1 8 _ MO _ _ +10 würde _ werden _ VAFIN _ sg|3|past|subj -1 0 _ -- _ _ +11 zunächst _ zunächst _ ADV _ _ -1 15 _ MO _ _ +12 einmal _ einmal _ ADV _ _ -1 15 _ MO _ _ +13 die _ der _ ART _ acc|sg|fem -1 14 _ NK _ _ +14 Arbeitslosigkeit _ Arbeitslosigkeit _ NN _ acc|sg|fem -1 15 _ OA _ _ +15 erhöhen _ erhöhen _ VVINF _ _ -1 10 _ OC _ _ +16 , _ -- _ $, _ _ -1 15 _ -- _ _ +17 die _ der _ ART _ acc|pl|fem -1 19 _ NK _ _ +18 staatlichen _ staatlich _ ADJA _ acc|pl|fem|pos -1 19 _ NK _ _ +19 Aufwendungen _ Aufwendung _ NN _ acc|pl|fem -1 20 _ OA _ _ +20 steigern _ steigern _ VVINF _ _ -1 15 _ CJ _ _ +21 und _ und _ KON _ _ -1 20 _ CD _ _ +22 die _ der _ ART _ acc|pl|fem -1 24 _ NK _ _ +23 privaten _ privat _ ADJA _ acc|pl|fem|pos -1 24 _ NK _ _ +24 Ausgaben _ Ausgabe _ NN _ acc|pl|fem -1 25 _ OA _ _ +25 verringern _ verringern _ VVINF _ _ -1 21 _ CJ _ _ +26 . _ -- _ $. 
_ _ -1 25 _ -- _ _ + diff --git a/tests/edseg/test_conll.py b/tests/edseg/test_conll.py new file mode 100644 index 0000000..d8ab2e6 --- /dev/null +++ b/tests/edseg/test_conll.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python +# -*- coding: utf-8; mode: python; -*- + +################################################################## +# Imports +from __future__ import absolute_import + +from dsegmenter.edseg.conll import CONLLWord + +from pytest import fixture +from unittest import TestCase + + +################################################################## +# Variables and Constants +CW_STR = "1 Das _ der _ ART _ " \ + "nom|sg|neut -1 3 _ NK _ _" +CW1 = CONLLWord() +CW2 = CONLLWord(CW_STR) + + +################################################################## +# Test Classes +class TestCONLLWord(TestCase): + + @fixture(autouse=True) + def set_feature(self): + self.cw1 = CW1 + self.cw2 = CW2 + + def test_init(self): + assert self.cw1 + assert self.cw2 + + def test_parse_line(self): + self.cw1.parse_line(CW_STR) + + def test_contains(self): + # issue #3 + assert 'feats' not in self.cw2 diff --git a/tests/mateseg/test_mateseg.py b/tests/mateseg/test_mateseg.py new file mode 100644 index 0000000..892f408 --- /dev/null +++ b/tests/mateseg/test_mateseg.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python +# -*- coding: utf-8; mode: python; -*- + +################################################################## +# Imports +from __future__ import absolute_import, unicode_literals, print_function + +from dsegmenter.mateseg import read_trees +from dsegmenter.mateseg.matesegmenter import MateSegmenter + +from pytest import fixture +from unittest import TestCase + + +################################################################## +# Variables and Constants +TEST1_OUT = "(HS (FRE Im Gegenteil : ) Ein Abbau weniger produktiver" \ + " Arbeitsplätze etwa würde zunächst einmal die" \ + " Arbeitslosigkeit erhöhen , die staatlichen Aufwendungen" \ + " steigern (HSF und die privaten Ausgaben verringern . ) )" + + +################################################################## +# Test Classes +class TestMateSegmenter(TestCase): + + @fixture(autouse=True) + def set_vars(self, mateseg_input1): + self.segmenter = MateSegmenter() + self.input1 = [t for t in read_trees(mateseg_input1)] + + def test_segment_sentence_0(self): + segments = self.segmenter.segment(self.input1) + assert len(segments) == 1 + assert unicode(segments[0][-1]).strip() == TEST1_OUT
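
A minimal usage sketch of the mateseg API exercised above, mirroring tests/mateseg/test_mateseg.py and the bpar_mate_segment() helper in scripts/discourse_segmenter; it is not part of the patch and assumes the package is installed with its bundled pre-trained Mate model and is run from the repository root so that the CoNLL sample under tests/data/ can be found:

#!/usr/bin/env python
# -*- coding: utf-8; mode: python; -*-
# Minimal sketch (not shipped with the package): segment the CoNLL sample
# from tests/data/ with the default pre-trained Mate model.
from __future__ import print_function

import codecs

from dsegmenter.mateseg import MateSegmenter, read_trees

ENCODING = "utf-8"
# CoNLL sample added in this patch; adjust the path if run from elsewhere
FNAME = "tests/data/mateseg.input.test1"

with codecs.open(FNAME, 'r', ENCODING) as ifile:
    lines = [iline for iline in ifile]

segmenter = MateSegmenter()        # loads the default pre-trained model
for itree in read_trees(lines):    # one dependency graph per sentence
    segments = segmenter.segment([itree])
    # print the bracketed discourse segment built for this sentence,
    # following the output convention of scripts/discourse_segmenter
    print('\n'.join([unicode(s[-1]) for s in segments]).encode(ENCODING))

This is the same call sequence the front-end script now uses for the mateseg subcommand.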