Skip to content

Commit d8d2c40

Browse files
committed
update pattern
1 parent 6122772 commit d8d2c40

File tree

1 file changed

+21
-5
lines changed

1 file changed

+21
-5
lines changed

edsnlp/pipes/ner/tnm/patterns_new.py

Lines changed: 21 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,15 @@
11
tumour_pattern = (
22
r"(?P<tumour_prefix>[cpyramP]{1,2}\s?)?" # Optional tumour prefix
33
r"T\s?" # 'T' followed by optional space
4-
r"(?P<tumour>([0-4]|is|[Xx]))" # Tumour size (required if 'T' is present)
4+
r"(?P<tumour>([0-4]|is|[Xx]|[Oo]))" # Tumour size (required if 'T' is present)
55
r"(?:\s?(?P<tumour_specification>[abcdx]|mi))?" # Optional tumour specification
66
r"(?:\s?\((?P<tumour_suffix>[^()]{1,10})\))?" # Optional tumour suffix
77
)
88

99
node_pattern = (
1010
r"(?P<node_prefix>[cpyraP]{1,2}\s?)?" # Optional node prefix
1111
r"N\s?" # 'N' followed by optional space
12-
r"(?P<node>[Xx01234\+])" # Node size/status (required if 'N' is present)
12+
r"(?P<node>[Xx01234\+]|[Oo])" # Node size/status (required if 'N' is present)
1313
r"(?:\s?(?P<node_specification>"
1414
r"[abcdx]|mi|sn|i[-,+]|mol[-,+]|\(mi\)|\(sn\)|"
1515
r"\(i[-,+]\)|\(mol[-,+]\)|\(\d+\s*/\s*\d+\)))?" # Optional specification
@@ -19,7 +19,7 @@
1919
metastasis_pattern = (
2020
r"(?P<metastasis_prefix>[cpyraP]{1,2}\s?)?" # Optional metastasis prefix
2121
r"M\s?" # 'M' followed by optional space
22-
r"(?P<metastasis>[Xx0123\+])" # Metastasis status (required if 'M' is present)
22+
r"(?P<metastasis>[Xx0123\+]|[Oo])" # Metastasis status (required if 'M' is present)
2323
r"(?:\s?(?P<metastasis_specification>"
2424
r"[abcd]|i\+|mol\+|cy\+|\(i\+\)|\(mol\+\)|"
2525
r"\(cy\+\)|PUL|OSS|HEP|BRA|LYM|OTH|MAR|PLE|PER|ADR|SKI))?" # Optional specification
@@ -31,7 +31,7 @@
3131

3232
resection_pattern = (
3333
r"R\s?"
34-
r"(?P<resection>[Xx012])?" # Optional resection completeness
34+
r"(?P<resection>[Xx012]|[Oo])?" # Optional resection completeness
3535
r"(?:\s?(?P<resection_specification>is|cy\+|\(is\)|\(cy\+\)))?" # Optional specification
3636
r"(?:\s?(?P<resection_loc>(\((?P<r_loc>[a-z]+)\)[,;\s]*)*))?" # Optional localization with space
3737
)
@@ -46,6 +46,7 @@
4646

4747
# We need to exclude patterns like 'T1', 'T2' if they are not followed by node or
4848
# metastasis sections.
49+
4950
exclude_pattern = (
5051
r"(?!T\s*[0-4]\s*[.,\/](?!\s*"
5152
+ node_pattern
@@ -57,6 +58,20 @@
5758
+ "))"
5859
)
5960

61+
exclude_pattern = (
62+
r"(?!"
63+
r"(?:[cpyramP]{0,2}\s*)?" # Optional prefix like p, yp, PT
64+
r"T\s*"
65+
r"(?:[0-4]|is|[xXoO])" # T stage (includes is, x, o)
66+
r"(?:[abcdx]|mi)?" # Optional specification
67+
r"(?:\s*\([^()]{1,10}\))?" # Optional suffix
68+
r"(?:\s*[\s,\/\.\(\)]|$)" # <-- KEY ADDITION: allow end-of-string ($)
69+
r"(?!\s*"
70+
+ node_pattern + "?" + TNM_space + "?" + metastasis_pattern + "?"
71+
+ ")"
72+
+ ")"
73+
)
74+
6075
tnm_pattern_new = (
6176
r"(?:\b|^)"
6277
+ exclude_pattern
@@ -90,5 +105,6 @@
90105
+ version_pattern
91106
+ ")?"
92107
+ r")"
93-
+ r"(?:\b|$|\n)"
108+
+ r"(?=[\s\(\)\.,;:/]|$)"
109+
# Previous trailing boundary, superseded by the lookahead above: + r"(?:\b|$|\n)"
94110
)

0 commit comments

Comments
 (0)