aphp · percevalw · Mar 7, 2023 · Mar 1, 2023 · Mar 1, 2023
diff --git a/changelog.md b/changelog.md
@@ -2,10 +2,14 @@
 
 ## Unreleased
 
+### Added
+- Tokenization exceptions (`M.`, `Dr.`, `Mme.`) and non end-of-sentence periods are now kept attached to the preceding letters in the `eds` tokenizer
+
 ### Changed
 
 - Disable `EDSMatcher` preprocessing auto progress tracking by default
 - Moved dependencies to a single pyproject.toml: support for `pip install -e '.[dev,docs,setup]'`
+- ADICAP matcher now allows dot separators (e.g. `B.H.HP.A7A0`)
 
 ### Fixed
 

diff --git a/edsnlp/language.py b/edsnlp/language.py
@@ -42,6 +42,9 @@ class EDSLanguage(French):
     default_config = Defaults
 
 
+TOKENIZER_EXCEPTIONS = [r"Dr\.", r"Pr\.", r"M\.", r"Mme\.", r"Mlle\.", r"(?i:ep\.)"]
+
+
 class EDSTokenizer(DummyTokenizer):
     def __init__(self, vocab: Vocab) -> None:
         """
@@ -50,7 +53,6 @@ def __init__(self, vocab: Vocab) -> None:
         - numbers: "ACR5" -> ["ACR", "5"] instead of ["ACR5"]
         - newlines: "\n \n \n" -> ["\n", "\n", "\n"] instead of ["\n \n \n"]
         and should be around 5-6 times faster than its standard French counterpart.
-
         Parameters
         ----------
         vocab: Vocab
@@ -59,9 +61,23 @@ def __init__(self, vocab: Vocab) -> None:
         self.vocab = vocab
         punct = "[:punct:]" + "\"'ˊ＂〃ײ᳓″״‶˶ʺ“”˝"
         num_like = r"\d+(?:[.,]\d+)?"
-        default = rf"[^\d{punct}'\n[[:space:]]+(?:['ˊ](?=[[:alpha:]]|$))?"
+        sep = rf"\d{punct}'\n[:space:]"
+        default = rf"[^{sep}]+(?:['ˊ](?=[[:alpha:]]|$))?"
+        exceptions = "|".join(TOKENIZER_EXCEPTIONS)
+        acronym = r"[A-Z][A-Z0-9]*[.](?=[A-Z0-9])"
         self.word_regex = regex.compile(
-            rf"({num_like}|[{punct}]|[\n\r\t]|[^\S\r\n\t]+|{default})([^\S\r\n\t])?"
+            rf"""(?x)
+        (
+            {exceptions}    # tokenizer exceptions like M., Dr., etc
+            |{acronym}      # acronyms
+            |{num_like}     # numbers
+            |[{punct}]        # punctuations
+            |[\n\r\t]       # new lines or tabs
+            |[^\S\r\n\t]+   # multi-spaces
+            |{default}      # anything else: most often alpha-numerical words
+        )                   # followed by
+        ([^\S\r\n\t])?      # an optional space
+        """
         )
 
     def __call__(self, text: str) -> Doc:

diff --git a/edsnlp/pipelines/ner/adicap/adicap.py b/edsnlp/pipelines/ner/adicap/adicap.py
@@ -1,5 +1,5 @@
 """`eds.adicap` pipeline"""
-
+import re
 
 from spacy.tokens import Doc, Span
 
@@ -61,6 +61,7 @@ def set_extensions(cls) -> None:
             Span.set_extension("value", default=None)
 
     def decode(self, code):
+        code = re.sub("[^A-Za-z0-9 ]+", "", code)
         exploded = list(code)
         adicap = AdicapCode(
             code=code,

diff --git a/edsnlp/pipelines/ner/adicap/patterns.py b/edsnlp/pipelines/ner/adicap/patterns.py
@@ -4,7 +4,8 @@
 """
 
 
-d1_4 = r"[A-Z]{4}"
+# Characters 1-4: four capital letters, optionally dot-separated (e.g. `B.H.HP.`)
+d1_4 = r"[A-Z]\.?[A-Z]\.?[A-Z]{2}\.?"
 d5_8_v1 = r"\d{4}"
 d5_8_v2 = r"\d{4}|[A-Z][0-9A-Z][A-Z][0-9]"
 d5_8_v3 = r"[0-9A-Z][0-9][09A-Z][0-9]"

diff --git a/tests/pipelines/ner/test_adicap.py b/tests/pipelines/ner/test_adicap.py
@@ -9,17 +9,16 @@
 d2v5 = "A9AZ"
 d2v6 = "0A12"
 
-
 examples = [
-    f"""1. Codification ADICAP : <ent text={d1v1+d2v1}>{d1v1+d2v1}</ent>.
+    f"""1. Codification ADICAP : <ent text={d1v1 + d2v1}>{d1v1 + d2v1}</ent>.
     Une autre chose""",
-    rf"""2. Codification ADICAP : <ent text={d1v1+d2v2}>{d1v1+d2v2}</ent>,\s
-    <ent text={d1v1+d2v3}>{d1v1+d2v3}</ent>. Une autre chose""",
-    f"""3. adicap : <ent text={d1v2+d2v3}>{d1v2+d2v3}</ent>,
-    <ent text={d1v1+d2v4}>{d1v1+d2v4}</ent>. Une autre chose""",
-    f"""4. Codification  : <ent text={d1v1+d2v6}>{d1v1+d2v6}</ent>.
-    J'aime edsnlp. : {d1v2+d2v3}.  Une autre chose""",
-    f"""5. J'aime edsnlp. : {d1v2+d2v5}.  Une autre chose""",
+    rf"""2. Codification ADICAP : <ent text={d1v1 + d2v2}>{d1v1 + d2v2}</ent>,\s
+    <ent text={d1v1 + d2v3}>{d1v1 + d2v3}</ent>. Une autre chose""",
+    f"""3. adicap : <ent text={d1v2 + d2v3}>{d1v2 + d2v3}</ent>,
+    <ent text={d1v1 + d2v4}>{d1v1 + d2v4}</ent>. Une autre chose""",
+    f"""4. Codification  : <ent text={d1v1 + d2v6}>{d1v1 + d2v6}</ent>.
+    J'aime edsnlp. : {d1v2 + d2v3}.  Une autre chose""",
+    f"""5. J'aime edsnlp. : {d1v2 + d2v5}.  Une autre chose""",
 ]
 
 
@@ -37,3 +36,17 @@ def test_scores(blank_nlp):
                 assert text[expected.start_char : expected.end_char] == ent.text
                 assert expected.modifiers[0].value == ent._.adicap.code
                 assert len(ent._.adicap.dict()) > 0
+
+
+def test_local_spelling(blank_nlp):
+    blank_nlp.add_pipe("eds.adicap")
+    txt = "Codification ADICAP : B.H.HP.A7A0 . Autre chose"  # dot-separated code
+    assert blank_nlp(txt).ents[0]._.adicap.dict() == {
+        "behaviour_type": "CANCER INVASIF",
+        "code": "BHHPA7A0",  # expected: dots removed from the decoded code
+        "organ": "PROSTATE",
+        "pathology": "PATHOLOGIE TUMORALE",
+        "pathology_type": "ADENOCARCINOME INVASIF (SAI)",
+        "sampling_mode": "BIOPSIE CHIRURGICALE",
+        "technic": "HISTOLOGIE ET CYTOLOGIE PAR INCLUSION",
+    }
diff --git a/tests/test_language.py b/tests/test_language.py
@@ -67,3 +67,27 @@ def test_eds_tokenizer_numbers():
         ("5.4", ""),
         ("mm", ""),
     ]
+
+
+def test_eds_tokenizer_exceptions():
+    nlp = spacy.blank("eds")
+    txt = "M. Gentil a un rhume, code ADICAP: B.H.HP.A7A0"
+    tokenized = [(w.text, w.whitespace_) for w in nlp(txt)]  # (text, trailing space)
+    assert tokenized == [
+        ("M.", " "),  # tokenizer exception: the period stays in the token
+        ("Gentil", " "),
+        ("a", " "),
+        ("un", " "),
+        ("rhume", ""),
+        (",", " "),
+        ("code", " "),
+        ("ADICAP", ""),
+        (":", " "),
+        ("B.", ""),  # period directly followed by a capital letter or digit
+        ("H.", ""),
+        ("HP.", ""),
+        ("A", ""),  # letters and digits are split into separate tokens
+        ("7", ""),
+        ("A", ""),
+        ("0", ""),
+    ]