removed clldutils as dependency; corrected docs.

xrotwang · xrotwang · commit 82bc9bde99fd · 2025-02-20T08:28:27.000+01:00
diff --git a/CHANGES.md b/CHANGES.md
@@ -6,6 +6,7 @@ This project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html
 ## [Unreleased]
 
 - Updated project scaffolding.
+- Removed dependency on clldutils.
 
 
 ## [2.2.1] - 2022-07-08
diff --git a/faq.md b/faq.md
@@ -11,11 +11,12 @@ use the common regular expression markers for start `^` and end `$`.
 ```python
 >>> from segments.tokenizer import Profile
 >>> prf = Profile(
- {'Grapheme': 'c', 'IPA': 'c'},
- {'Grapheme': '^', 'IPA': 'NULL'},
- {'Grapheme': '$', 'IPA': 'NULL'},
- {'Grapheme': 'a', 'IPA': 'b'},
- {'Grapheme': '^a', 'IPA': 'A'})
+    {'Grapheme': 'th', 'IPA': 'tH'},
+    {'Grapheme': 'c', 'IPA': 'c'},
+    {'Grapheme': '^', 'IPA': None},
+    {'Grapheme': '$', 'IPA': None},
+    {'Grapheme': 'a', 'IPA': 'b'},
+    {'Grapheme': '^a', 'IPA': 'A'})
 ```
 
 Note: We treat word-initial `a` differently!
diff --git a/setup.cfg b/setup.cfg
@@ -40,7 +40,6 @@ python_requires = >=3.8
 install_requires =
     regex
     csvw>=1.5.6
-    clldutils>=1.7.3
 
 [options.packages.find]
 where = src
@@ -83,6 +82,7 @@ source =
 
 [coverage:report]
 show_missing = true
+skip_covered = true
 
 [tox:tox]
 envlist = py38, py39, py310, py311, py312, py313
diff --git a/src/segments/__main__.py b/src/segments/__main__.py
@@ -1,55 +1,79 @@
-import logging
 import sys
-from pathlib import Path
-
-from clldutils.clilib import ArgumentParser, command, ParserError
+import logging
+import pathlib
+import argparse
 
 from segments import Tokenizer, Profile
 
 
-def _write(args, line):
-    print('%s' % line)
-
-
-def _read(args):
-    string = args.args[0] if args.args else sys.stdin.read()
-    if not isinstance(string, str):
-        string = string.decode(args.encoding)
-    return string.strip()
+class ParserError(Exception):
+    pass
 
 
-@command()
 def tokenize(args):
     """
     Tokenize a string (passed as argument or read from stdin)
 
     segments [--profile=PATH/TO/PROFILE] tokenize [STRING]
     """
-    if args.profile and not Path(args.profile).exists():  # pragma: no cover
+    if args.profile and not pathlib.Path(args.profile).exists():  # pragma: no cover
         raise ParserError('--profile must be a path for an existing file')
-    _write(args, Tokenizer(profile=args.profile)(_read(args), column=args.mapping))
+    print(Tokenizer(profile=args.profile)(_read(args), column=args.mapping))
 
 
-@command()
 def profile(args):
     """
     Create an orthography profile for a string (passed as argument or read from stdin)
 
     segments profile [STRING]
     """
-    _write(args, Profile.from_text(_read(args)))
+    print(Profile.from_text(_read(args)))
 
 
-def main():  # pragma: no cover
+def _read(args):
+    string = args.args[0] if args.args else sys.stdin.read()
+    if not isinstance(string, str):
+        string = string.decode(args.encoding)
+    return string.strip()
+
+
+def main(parsed_args=None):
+    commands = {'tokenize': tokenize, 'profile': profile}
     logging.basicConfig()
-    parser = ArgumentParser('segments')
+    parser = argparse.ArgumentParser(
+        description="Main command line interface of the segments package.",
+        epilog="Use '%(prog)s help <cmd>' to get help about individual commands.")
+    parser.add_argument("--verbosity", help="increase output verbosity")
+    parser.add_argument('command', help=' | '.join(commands))
+    parser.add_argument('args', nargs=argparse.REMAINDER)
     parser.add_argument("--encoding", help='input encoding', default="utf8")
     parser.add_argument("--profile", help='path to an orthography profile', default=None)
     parser.add_argument(
         "--mapping",
         help='column name in ortho profile to map graphemes',
         default=Profile.GRAPHEME_COL)
-    sys.exit(parser.main())
+
+    args = parsed_args or parser.parse_args()
+    if args.command == 'help' and len(args.args):
+        # As help text for individual commands we simply re-use the docstrings of the
+        # callables registered for the command:
+        print(commands[args.args[0]].__doc__.strip()
+              if args.args[0] in commands else "Invalid command: '{}'".format(args.args[0]))
+    else:
+        if args.command not in commands:
+            print('invalid command')
+            parser.print_help()
+            sys.exit(64)
+        try:
+            commands[args.command](args)
+        except ParserError as e:
+            print(e)
+            print(commands[args.command].__doc__.strip())
+            sys.exit(64)
+        except Exception as e:  # pragma: no cover
+            print(e)
+            sys.exit(1)
+    sys.exit(0)
 
 
 if __name__ == '__main__':  # pragma: no cover
diff --git a/src/segments/profile.py b/src/segments/profile.py
@@ -8,7 +8,6 @@
 import json.decoder
 
 from csvw import TableGroup, Column
-from clldutils.path import readlines
 
 from segments.tree import Tree
 from segments.util import grapheme_pattern
@@ -149,7 +148,9 @@ def from_text(cls, text: str, mapping='mapping') -> 'Profile':
 
     @classmethod
     def from_textfile(cls, fname, mapping='mapping') -> 'Profile':
-        return cls.from_text(' '.join(readlines(fname)), mapping=mapping)
+        with pathlib.Path(fname).open(encoding='utf-8') as fp:
+            lines = fp.readlines()
+            return cls.from_text(' '.join(lines), mapping=mapping)
 
     def __str__(self):
         """
diff --git a/src/segments/tokenizer.py b/src/segments/tokenizer.py
@@ -3,17 +3,25 @@
 (of orthographies) given an orthography profile.
 """
 import typing
+import pathlib
 import unicodedata
 
 import regex
 from csvw.dsv import reader
-from clldutils.path import readlines
 
 from segments.util import nfd, grapheme_pattern
 from segments import errors
 from segments.profile import Profile
 
 
+def iterlines(p: typing.Union[pathlib.Path, str]) -> typing.Generator[str, None, None]:
+    with pathlib.Path(p).open(encoding='utf-8') as fp:
+        for line in fp.readlines():
+            line = line.strip()
+            if line and not line.startswith('#'):
+                yield unicodedata.normalize('NFD', line)
+
+
 class Rules:
     """
     Rules are given in tuple format, comma delimited.
@@ -24,7 +32,7 @@ def __init__(self, *rules: typing.Tuple[str, str]):
 
     @classmethod
     def from_file(cls, fname) -> 'Rules':
-        return cls(*list(reader(readlines(fname, comment='#', normalize='NFD'))))
+        return cls(*list(reader(list(iterlines(fname)))))
 
     def apply(self, s):
         for rule, replacement in self._rules:
diff --git a/src/segments/util.py b/src/segments/util.py
@@ -1,8 +1,8 @@
-from functools import partial
+import functools
 import unicodedata
 
 import regex
 
 REPLACEMENT_MARKER = '�'
-nfd = partial(unicodedata.normalize, 'NFD')
+nfd = functools.partial(unicodedata.normalize, 'NFD')
 grapheme_pattern = regex.compile(r"\X", regex.UNICODE)
diff --git a/tests/test_cli.py b/tests/test_cli.py
@@ -1,6 +1,31 @@
 import io
+import argparse
 
-from segments.__main__ import tokenize, profile
+import pytest
+
+from segments.__main__ import tokenize, profile, main
+
+
+def test_main(capsys):
+    with pytest.raises(SystemExit):
+        main(argparse.Namespace(command=None))
+    out, err = capsys.readouterr()
+    assert 'segments' in out
+
+    with pytest.raises(SystemExit):
+        main(argparse.Namespace(command='help', args=['tokenize']))
+    out, err = capsys.readouterr()
+    assert 'Tokenize' in out
+
+    with pytest.raises(SystemExit):
+        main(argparse.Namespace(command='tokenize', args=['abc'], profile=None, mapping=None))
+    out, err = capsys.readouterr()
+    assert 'a b c' in out
+
+    with pytest.raises(SystemExit):
+        main(argparse.Namespace(command='tokenize', args=['abc'], profile='xyz', mapping=None))
+    out, err = capsys.readouterr()
+    assert 'existing' in out
 
 
 def test_tokenize(capsys, mocker):
diff --git a/tests/test_profile.py b/tests/test_profile.py
@@ -1,7 +1,7 @@
+import json
 from copy import deepcopy
 
 import pytest
-from clldutils import jsonlib
 
 from segments import Profile
 
@@ -20,11 +20,11 @@ def test_missing_grapheme():
         Profile({'Grapheme': ''})
 
 
-def test_profile_with_bad_metadata(tmpdir):
-    mdpath = tmpdir / 'md.json'
+def test_profile_with_bad_metadata(tmp_path):
+    mdpath = tmp_path / 'md.json'
     md = deepcopy(Profile.MD)
     md['tables'].append({'tableSchema': {'columns': []}})
-    jsonlib.dump(md, str(mdpath))
+    mdpath.write_text(json.dumps(md), encoding='utf-8')
 
     with pytest.raises(ValueError):
         Profile.from_file(str(mdpath))
diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py
@@ -6,8 +6,7 @@
 
 
 def _read_data(fname):
-    with fname.open(encoding="utf-8") as fp:
-        return fp.read()
+    return fname.read_text(encoding="utf-8")
 
 
 @pytest.fixture