Skip to content

Commit 82bc9bd

Browse files
committed
removed clldutils as dependency; corrected docs.
1 parent 3620a14 commit 82bc9bd

File tree

10 files changed

+99
-40
lines changed

10 files changed

+99
-40
lines changed

CHANGES.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ This project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html
66
## [Unreleased]
77

88
- Updated project scaffolding.
9+
- Removed dependency on clldutils.
910

1011

1112
## [2.2.1] - 2022-07-08

faq.md

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,12 @@ use the common regular expression markers for start `^` and end `$`.
1111
```python
1212
>>> from segments.tokenizer import Profile
1313
>>> prf = Profile(
14-
{'Grapheme': 'c', 'IPA': 'c'},
15-
{'Grapheme': '^', 'IPA': 'NULL'},
16-
{'Grapheme': '$', 'IPA': 'NULL'},
17-
{'Grapheme': 'a', 'IPA': 'b'},
18-
{'Grapheme': '^a', 'IPA': 'A'})
14+
{'Grapheme': 'th', 'IPA': 'tH'},
15+
{'Grapheme': 'c', 'IPA': 'c'},
16+
{'Grapheme': '^', 'IPA': None},
17+
{'Grapheme': '$', 'IPA': None},
18+
{'Grapheme': 'a', 'IPA': 'b'},
19+
{'Grapheme': '^a', 'IPA': 'A'})
1920
```
2021

2122
Note: We treat word-initial `a` differently!

setup.cfg

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,6 @@ python_requires = >=3.8
4040
install_requires =
4141
regex
4242
csvw>=1.5.6
43-
clldutils>=1.7.3
4443

4544
[options.packages.find]
4645
where = src
@@ -83,6 +82,7 @@ source =
8382

8483
[coverage:report]
8584
show_missing = true
85+
skip_covered = true
8686

8787
[tox:tox]
8888
envlist = py38, py39, py310, py311, py312, py313

src/segments/__main__.py

Lines changed: 45 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,55 +1,79 @@
1-
import logging
21
import sys
3-
from pathlib import Path
4-
5-
from clldutils.clilib import ArgumentParser, command, ParserError
2+
import logging
3+
import pathlib
4+
import argparse
65

76
from segments import Tokenizer, Profile
87

98

10-
def _write(args, line):
11-
print('%s' % line)
12-
13-
14-
def _read(args):
15-
string = args.args[0] if args.args else sys.stdin.read()
16-
if not isinstance(string, str):
17-
string = string.decode(args.encoding)
18-
return string.strip()
9+
class ParserError(Exception):
10+
pass
1911

2012

21-
@command()
2213
def tokenize(args):
2314
"""
2415
Tokenize a string (passed as argument or read from stdin)
2516
2617
segments [--profile=PATH/TO/PROFILE] tokenize [STRING]
2718
"""
28-
if args.profile and not Path(args.profile).exists(): # pragma: no cover
19+
if args.profile and not pathlib.Path(args.profile).exists(): # pragma: no cover
2920
raise ParserError('--profile must be a path for an existing file')
30-
_write(args, Tokenizer(profile=args.profile)(_read(args), column=args.mapping))
21+
print(Tokenizer(profile=args.profile)(_read(args), column=args.mapping))
3122

3223

33-
@command()
3424
def profile(args):
3525
"""
3626
Create an orthography profile for a string (passed as argument or read from stdin)
3727
3828
segments profile [STRING]
3929
"""
40-
_write(args, Profile.from_text(_read(args)))
30+
print(Profile.from_text(_read(args)))
4131

4232

43-
def main(): # pragma: no cover
33+
def _read(args):
34+
string = args.args[0] if args.args else sys.stdin.read()
35+
if not isinstance(string, str):
36+
string = string.decode(args.encoding)
37+
return string.strip()
38+
39+
40+
def main(parsed_args=None):
41+
commands = {'tokenize': tokenize, 'profile': profile}
4442
logging.basicConfig()
45-
parser = ArgumentParser('segments')
43+
parser = argparse.ArgumentParser(
44+
description="Main command line interface of the segments package.",
45+
epilog="Use '%(prog)s help <cmd>' to get help about individual commands.")
46+
parser.add_argument("--verbosity", help="increase output verbosity")
47+
parser.add_argument('command', help=' | '.join(commands))
48+
parser.add_argument('args', nargs=argparse.REMAINDER)
4649
parser.add_argument("--encoding", help='input encoding', default="utf8")
4750
parser.add_argument("--profile", help='path to an orthography profile', default=None)
4851
parser.add_argument(
4952
"--mapping",
5053
help='column name in ortho profile to map graphemes',
5154
default=Profile.GRAPHEME_COL)
52-
sys.exit(parser.main())
55+
56+
args = parsed_args or parser.parse_args()
57+
if args.command == 'help' and len(args.args):
58+
# As help text for individual commands we simply re-use the docstrings of the
59+
# callables registered for the command:
60+
print(commands[args.args[0]].__doc__.strip()
61+
if args.args[0] in commands else "Invalid command: '{}'".format(args.args[0]))
62+
else:
63+
if args.command not in commands:
64+
print('invalid command')
65+
parser.print_help()
66+
sys.exit(64)
67+
try:
68+
commands[args.command](args)
69+
except ParserError as e:
70+
print(e)
71+
print(commands[args.command].__doc__.strip())
72+
sys.exit(64)
73+
except Exception as e: # pragma: no cover
74+
print(e)
75+
sys.exit(1)
76+
sys.exit(0)
5377

5478

5579
if __name__ == '__main__': # pragma: no cover

src/segments/profile.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88
import json.decoder
99

1010
from csvw import TableGroup, Column
11-
from clldutils.path import readlines
1211

1312
from segments.tree import Tree
1413
from segments.util import grapheme_pattern
@@ -149,7 +148,9 @@ def from_text(cls, text: str, mapping='mapping') -> 'Profile':
149148

150149
@classmethod
151150
def from_textfile(cls, fname, mapping='mapping') -> 'Profile':
152-
return cls.from_text(' '.join(readlines(fname)), mapping=mapping)
151+
with pathlib.Path(fname).open(encoding='utf-8') as fp:
152+
lines = fp.readlines()
153+
return cls.from_text(' '.join(lines), mapping=mapping)
153154

154155
def __str__(self):
155156
"""

src/segments/tokenizer.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,17 +3,25 @@
33
(of orthographies) given an orthography profile.
44
"""
55
import typing
6+
import pathlib
67
import unicodedata
78

89
import regex
910
from csvw.dsv import reader
10-
from clldutils.path import readlines
1111

1212
from segments.util import nfd, grapheme_pattern
1313
from segments import errors
1414
from segments.profile import Profile
1515

1616

17+
def iterlines(p: typing.Union[pathlib.Path, str]) -> typing.Generator[str, None, None]:
18+
with pathlib.Path(p).open(encoding='utf-8') as fp:
19+
for line in fp.readlines():
20+
line = line.strip()
21+
if line and not line.startswith('#'):
22+
yield unicodedata.normalize('NFD', line)
23+
24+
1725
class Rules:
1826
"""
1927
Rules are given in tuple format, comma delimited.
@@ -24,7 +32,7 @@ def __init__(self, *rules: typing.Tuple[str, str]):
2432

2533
@classmethod
2634
def from_file(cls, fname) -> 'Rules':
27-
return cls(*list(reader(readlines(fname, comment='#', normalize='NFD'))))
35+
return cls(*list(reader(list(iterlines(fname)))))
2836

2937
def apply(self, s):
3038
for rule, replacement in self._rules:

src/segments/util.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
1-
from functools import partial
1+
import functools
22
import unicodedata
33

44
import regex
55

66
REPLACEMENT_MARKER = '�'
7-
nfd = partial(unicodedata.normalize, 'NFD')
7+
nfd = functools.partial(unicodedata.normalize, 'NFD')
88
grapheme_pattern = regex.compile(r"\X", regex.UNICODE)

tests/test_cli.py

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,31 @@
11
import io
2+
import argparse
23

3-
from segments.__main__ import tokenize, profile
4+
import pytest
5+
6+
from segments.__main__ import tokenize, profile, main
7+
8+
9+
def test_main(capsys):
10+
with pytest.raises(SystemExit):
11+
main(argparse.Namespace(command=None))
12+
out, err = capsys.readouterr()
13+
assert 'segments' in out
14+
15+
with pytest.raises(SystemExit):
16+
main(argparse.Namespace(command='help', args=['tokenize']))
17+
out, err = capsys.readouterr()
18+
assert 'Tokenize' in out
19+
20+
with pytest.raises(SystemExit):
21+
main(argparse.Namespace(command='tokenize', args=['abc'], profile=None, mapping=None))
22+
out, err = capsys.readouterr()
23+
assert 'a b c' in out
24+
25+
with pytest.raises(SystemExit):
26+
main(argparse.Namespace(command='tokenize', args=['abc'], profile='xyz', mapping=None))
27+
out, err = capsys.readouterr()
28+
assert 'existing' in out
429

530

631
def test_tokenize(capsys, mocker):

tests/test_profile.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
1+
import json
12
from copy import deepcopy
23

34
import pytest
4-
from clldutils import jsonlib
55

66
from segments import Profile
77

@@ -20,11 +20,11 @@ def test_missing_grapheme():
2020
Profile({'Grapheme': ''})
2121

2222

23-
def test_profile_with_bad_metadata(tmpdir):
24-
mdpath = tmpdir / 'md.json'
23+
def test_profile_with_bad_metadata(tmp_path):
24+
mdpath = tmp_path / 'md.json'
2525
md = deepcopy(Profile.MD)
2626
md['tables'].append({'tableSchema': {'columns': []}})
27-
jsonlib.dump(md, str(mdpath))
27+
mdpath.write_text(json.dumps(md), encoding='utf-8')
2828

2929
with pytest.raises(ValueError):
3030
Profile.from_file(str(mdpath))

tests/test_tokenizer.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,7 @@
66

77

88
def _read_data(fname):
9-
with fname.open(encoding="utf-8") as fp:
10-
return fp.read()
9+
return fname.read_text(encoding="utf-8")
1110

1211

1312
@pytest.fixture

0 commit comments

Comments
 (0)