Add the ability to select specific CIM10 and ATC codes in eds.cim10 and eds.drugs

svittoz · svittoz · commit c8049213b95b · 2024-08-28T15:16:38.000Z
diff --git a/edsnlp/pipes/ner/cim10/factory.py b/edsnlp/pipes/ner/cim10/factory.py
@@ -1,4 +1,4 @@
-from typing import Any, Dict
+from typing import Any, Dict, List
 
 from typing_extensions import Literal
 
@@ -28,6 +28,7 @@ def create_component(
     name: str = "cim10",
     *,
     attr: str = "NORM",
+    cim10: List[str] = None,
     ignore_excluded: bool = False,
     ignore_space_tokens: bool = False,
     term_matcher: Literal["exact", "simstring"] = "exact",
@@ -75,6 +76,9 @@ def create_component(
         The pipeline object
     name : str
         The name of the component
+    cim10 : Optional[List[str]]
+        List of CIM10 codes to retrieve. If None, all CIM10 codes are searched,
+        resulting in higher computation time.
     attr : str
         The default attribute to use for matching.
     ignore_excluded : bool
@@ -104,7 +108,7 @@ def create_component(
         nlp=nlp,
         name=name,
         regex=dict(),
-        terms=get_patterns(),
+        terms=get_patterns(cim10),
         attr=attr,
         ignore_excluded=ignore_excluded,
         ignore_space_tokens=ignore_space_tokens,
diff --git a/edsnlp/pipes/ner/cim10/patterns.py b/edsnlp/pipes/ner/cim10/patterns.py
@@ -5,7 +5,7 @@
 from edsnlp import BASE_DIR
 
 
-def get_patterns() -> Dict[str, List[str]]:
+def get_patterns(cim10: List[str] = None) -> Dict[str, List[str]]:
     df = pd.read_csv(BASE_DIR / "resources" / "cim10.csv.gz")
 
     df["code_pattern"] = df["code"]
@@ -30,4 +30,6 @@ def get_patterns() -> Dict[str, List[str]]:
 
     patterns = df.groupby("code")["patterns"].agg(list).to_dict()
 
+    patterns = {k: v for k, v in patterns.items() if k in cim10} if cim10 else patterns
+
     return patterns
diff --git a/edsnlp/pipes/ner/drugs/factory.py b/edsnlp/pipes/ner/drugs/factory.py
@@ -1,4 +1,4 @@
-from typing import Any, Dict
+from typing import Any, Dict, List
 
 from typing_extensions import Literal
 
@@ -28,6 +28,7 @@ def create_component(
     name: str = "drugs",
     *,
     attr: str = "NORM",
+    atc: List[str] = None,
     ignore_excluded: bool = False,
     ignore_space_tokens: bool = False,
     term_matcher: Literal["exact", "simstring"] = "exact",
@@ -83,6 +84,9 @@ def create_component(
         The name of the component
     attr : str
         The default attribute to use for matching.
+    atc : Optional[List[str]]
+        List of ATC codes to retrieve. If None, all ATC codes are searched,
+        resulting in higher computation time.
     ignore_excluded : bool
         Whether to skip excluded tokens (requires an upstream
         pipeline to mark excluded tokens).
@@ -111,7 +115,7 @@ def create_component(
         nlp=nlp,
         name=name,
         regex=dict(),
-        terms=get_patterns(),
+        terms=get_patterns(atc),
         attr=attr,
         ignore_excluded=ignore_excluded,
         ignore_space_tokens=ignore_space_tokens,
diff --git a/edsnlp/pipes/ner/drugs/patterns.py b/edsnlp/pipes/ner/drugs/patterns.py
@@ -6,6 +6,15 @@
 drugs_file = BASE_DIR / "resources" / "drugs.json"
 
 
-def get_patterns() -> Dict[str, List[str]]:
+def filter_dict_by_keys(D: Dict[str, List[str]], L: List[str]):
+    filtered_dict = {
+        k: v for k, v in D.items() if any(k.startswith(prefix) for prefix in L)
+    }
+    return filtered_dict
+
+
+def get_patterns(atc: List[str] = None) -> Dict[str, List[str]]:
     with open(drugs_file, "r") as f:
-        return json.load(f)
+        patterns = json.load(f)
+        patterns = {k: v for k, v in patterns.items() if k in atc} if atc else patterns
+        return patterns