Skip to content

Commit

Permalink
fix : adding new separator in eds.table and new input check
Browse files (browse the repository at this point in the history)
  • Loading branch information
svittoz committed Apr 24, 2024
1 parent 54a3ff8 commit 3451608
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 6 deletions.
2 changes: 1 addition & 1 deletion edsnlp/pipes/misc/tables/patterns.py
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
sep = r"¦"
sep = r"'¦|\|'"
regex = dict(
tables=rf"(\b.*{sep}.*\n)+",
)
15 changes: 10 additions & 5 deletions edsnlp/pipes/misc/tables/tables.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -102,6 +102,9 @@ class TablesMatcher(GenericMatcher):
tables_pattern : Optional[Dict[str, str]]
The regex pattern to identify tables.
The key of dictionary should be `tables`
sep_pattern : Optional[str]
The regex pattern to identify the separator pattern.
Used when calling `to_pd_table`.
attr : str
spaCy's attribute to use:
a string with the value "TEXT" or "NORM", or a dict with
Expand All @@ -124,14 +127,16 @@ def __init__(
attr: Union[Dict[str, str], str] = "TEXT",
ignore_excluded: bool = True,
):
if tables_pattern is None:
if tables_pattern is None and sep_pattern is None:
self.tables_pattern = patterns.regex
else:
self.tables_pattern = tables_pattern

if sep_pattern is None:
self.sep = patterns.sep
elif tables_pattern is None or sep_pattern is None:
raise ValueError(
"Both tables_pattern and sep_pattern must be provided "
"for custom eds.table pipeline."
)
else:
self.tables_pattern = tables_pattern
self.sep = sep_pattern

super().__init__(
Expand Down

0 comments on commit 3451608

Please sign in to comment.