From fff384a7328cb13a6c384edf0bd924a33d64bf0c Mon Sep 17 00:00:00 2001 From: VITTOZ Simon Date: Tue, 20 Aug 2024 16:46:41 +0200 Subject: [PATCH 1/4] rename eds.measurements pipe to eds.quantities --- changelog.md | 4 + demo/app.py | 4 +- docs/pipes/misc/index.md | 2 +- docs/pipes/misc/measurements.md | 8 - docs/pipes/misc/quantities.md | 8 + docs/pipes/ner/index.md | 2 +- edsnlp/pipes/__init__.py | 2 +- edsnlp/pipes/misc/measurements/__init__.py | 4 - edsnlp/pipes/misc/quantities/__init__.py | 4 + .../{measurements => quantities}/factory.py | 10 +- .../{measurements => quantities}/patterns.py | 2 +- .../quantities.py} | 298 +++++++++--------- mkdocs.yml | 2 +- pyproject.toml | 5 +- tests/helpers.py | 2 +- ...est_measurements.py => test_quantities.py} | 98 +++--- 16 files changed, 228 insertions(+), 227 deletions(-) delete mode 100644 docs/pipes/misc/measurements.md create mode 100644 docs/pipes/misc/quantities.md delete mode 100644 edsnlp/pipes/misc/measurements/__init__.py create mode 100644 edsnlp/pipes/misc/quantities/__init__.py rename edsnlp/pipes/misc/{measurements => quantities}/factory.py (75%) rename edsnlp/pipes/misc/{measurements => quantities}/patterns.py (99%) rename edsnlp/pipes/misc/{measurements/measurements.py => quantities/quantities.py} (82%) rename tests/pipelines/misc/{test_measurements.py => test_quantities.py} (70%) diff --git a/changelog.md b/changelog.md index e993ed824..72a9f72c8 100644 --- a/changelog.md +++ b/changelog.md @@ -6,6 +6,10 @@ - Numbers are now only detected without trying to remove the pollution in between digits, ie `55 @ 77777` could be detected as a full number before, but not anymore. 
+### Changed + +- Rename `eds.measurements` to `eds.quantities` + ## v0.13.0 ### Added diff --git a/demo/app.py b/demo/app.py index 9462ae314..eb2db4edb 100644 --- a/demo/app.py +++ b/demo/app.py @@ -68,7 +68,7 @@ "Drugs": "drugs", "CIM10": "cim10", "Dates": "dates", - "Measurements": "measurements", + "Quantities": "quantities", "Charlson": "charlson", "SOFA": "sofa", "Elston & Ellis": "elston_ellis", @@ -220,7 +220,7 @@ def load_model(custom_regex: str, **enabled): doc = nlp(text) doc.ents = filter_spans( - (*doc.ents, *doc.spans.get("dates", []), *doc.spans.get("measurements", [])) + (*doc.ents, *doc.spans.get("dates", []), *doc.spans.get("quantities", [])) ) st.header("Visualisation") diff --git a/docs/pipes/misc/index.md b/docs/pipes/misc/index.md index d9df52c6e..5a38acd99 100644 --- a/docs/pipes/misc/index.md +++ b/docs/pipes/misc/index.md @@ -12,7 +12,7 @@ For instance, the date detection and normalisation pipeline falls in this catego |--------------------------|---------------------------------------------| | `eds.dates` | Date extraction and normalisation | | `eds.consultation_dates` | Identify consultation dates | -| `eds.measurements` | Measure extraction and normalisation | +| `eds.quantities` | Quantity extraction and normalisation | | `eds.sections` | Section detection | | `eds.reason` | Rule-based hospitalisation reason detection | | `eds.tables` | Tables detection | diff --git a/docs/pipes/misc/measurements.md b/docs/pipes/misc/measurements.md deleted file mode 100644 index 135cc7e25..000000000 --- a/docs/pipes/misc/measurements.md +++ /dev/null @@ -1,8 +0,0 @@ -# Measurements {: #edsnlp.pipes.misc.measurements.factory.create_component } - -::: edsnlp.pipes.misc.measurements.factory.create_component - options: - heading_level: 2 - show_bases: false - show_source: false - only_class_level: true diff --git a/docs/pipes/misc/quantities.md b/docs/pipes/misc/quantities.md new file mode 100644 index 000000000..50dbe50ad --- /dev/null +++ 
b/docs/pipes/misc/quantities.md @@ -0,0 +1,8 @@ +# Quantities {: #edsnlp.pipes.misc.quantities.factory.create_component } + +::: edsnlp.pipes.misc.quantities.factory.create_component + options: + heading_level: 2 + show_bases: false + show_source: false + only_class_level: true diff --git a/docs/pipes/ner/index.md b/docs/pipes/ner/index.md index b242840d0..839200982 100644 --- a/docs/pipes/ner/index.md +++ b/docs/pipes/ner/index.md @@ -2,7 +2,7 @@ We provide several Named Entity Recognition (NER) components. Named Entity Recognition is the task of identifying short relevant spans of text, named entities, and classifying them into pre-defined categories. -In the case of clinical documents, these entities can be scores, disorders, behaviors, codes, dates, measurements, etc. +In the case of clinical documents, these entities can be scores, disorders, behaviors, codes, dates, quantities, etc. ## Span setters: where are stored extracted entities ? {: #edsnlp.pipes.base.SpanSetterArg } diff --git a/edsnlp/pipes/__init__.py b/edsnlp/pipes/__init__.py index b3c7f9648..c197e400c 100644 --- a/edsnlp/pipes/__init__.py +++ b/edsnlp/pipes/__init__.py @@ -20,7 +20,7 @@ from .core.terminology.factory import create_component as terminology from .misc.consultation_dates.factory import create_component as consultation_dates from .misc.dates.factory import create_component as dates - from .misc.measurements.factory import create_component as measurements + from .misc.quantities.factory import create_component as quantities from .misc.reason.factory import create_component as reason from .misc.sections.factory import create_component as sections from .misc.tables.factory import create_component as tables diff --git a/edsnlp/pipes/misc/measurements/__init__.py b/edsnlp/pipes/misc/measurements/__init__.py deleted file mode 100644 index 4a01c38ad..000000000 --- a/edsnlp/pipes/misc/measurements/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from edsnlp.pipes.misc.measurements.measurements 
import MeasurementsMatcher -from edsnlp.pipes.misc.measurements.patterns import * - -from . import factory diff --git a/edsnlp/pipes/misc/quantities/__init__.py b/edsnlp/pipes/misc/quantities/__init__.py new file mode 100644 index 000000000..f5a99a6dc --- /dev/null +++ b/edsnlp/pipes/misc/quantities/__init__.py @@ -0,0 +1,4 @@ +from edsnlp.pipes.misc.quantities.quantities import QuantitiesMatcher +from edsnlp.pipes.misc.quantities.patterns import * + +from . import factory diff --git a/edsnlp/pipes/misc/measurements/factory.py b/edsnlp/pipes/misc/quantities/factory.py similarity index 75% rename from edsnlp/pipes/misc/measurements/factory.py rename to edsnlp/pipes/misc/quantities/factory.py index 9b9bc01ea..7b1f453c9 100644 --- a/edsnlp/pipes/misc/measurements/factory.py +++ b/edsnlp/pipes/misc/quantities/factory.py @@ -1,10 +1,10 @@ from edsnlp.core import registry from . import patterns -from .measurements import MeasurementsMatcher +from .quantities import QuantitiesMatcher DEFAULT_CONFIG = dict( - measurements=list(patterns.common_measurements.keys()), # noqa: E501 + quantities=list(patterns.common_quantities.keys()), # noqa: E501 units_config=patterns.units_config, number_terms=patterns.number_terms, number_regex=patterns.number_regex, @@ -24,7 +24,7 @@ ) create_component = registry.factory.register( - "eds.measurements", + "eds.quantities", assigns=["doc.spans", "doc.ents"], - deprecated=["eds.measures"], -)(MeasurementsMatcher) + deprecated=["eds.measures", "eds.measurements"], +)(QuantitiesMatcher) diff --git a/edsnlp/pipes/misc/measurements/patterns.py b/edsnlp/pipes/misc/quantities/patterns.py similarity index 99% rename from edsnlp/pipes/misc/measurements/patterns.py rename to edsnlp/pipes/misc/quantities/patterns.py index 3ec25c90c..36f38100b 100644 --- a/edsnlp/pipes/misc/measurements/patterns.py +++ b/edsnlp/pipes/misc/quantities/patterns.py @@ -57,7 +57,7 @@ )?""" -common_measurements = { +common_quantities = { "weight": { "unit": "kg", 
"unitless_patterns": [ diff --git a/edsnlp/pipes/misc/measurements/measurements.py b/edsnlp/pipes/misc/quantities/quantities.py similarity index 82% rename from edsnlp/pipes/misc/measurements/measurements.py rename to edsnlp/pipes/misc/quantities/quantities.py index b7a42c0c4..c7d71c493 100644 --- a/edsnlp/pipes/misc/measurements/measurements.py +++ b/edsnlp/pipes/misc/quantities/quantities.py @@ -22,10 +22,10 @@ get_spans, validate_span_getter, ) -from edsnlp.pipes.misc.measurements import patterns +from edsnlp.pipes.misc.quantities import patterns from edsnlp.utils.filter import align_spans, filter_spans, get_span_group -__all__ = ["MeasurementsMatcher"] +__all__ = ["QuantitiesMatcher"] AFTER_SNIPPET_LIMIT = 6 BEFORE_SNIPPET_LIMIT = 10 @@ -62,31 +62,31 @@ class MsrConfig(TypedDict): name: NotRequired[str] -class Measurement(abc.ABC): +class Quantity(abc.ABC): @abc.abstractmethod - def __len__(self) -> Iterable["SimpleMeasurement"]: + def __len__(self) -> Iterable["SimpleQuantity"]: """ - Number of items in the measure (only one for SimpleMeasurement) + Number of items in the measure (only one for SimpleQuantity) Returns ------- - Iterable["SimpleMeasurement"] + Iterable["SimpleQuantity"] """ @abc.abstractmethod - def __iter__(self) -> Iterable["SimpleMeasurement"]: + def __iter__(self) -> Iterable["SimpleQuantity"]: """ - Iter over items of the measure (only one for SimpleMeasurement) + Iter over items of the measure (only one for SimpleQuantity) Returns ------- - Iterable["SimpleMeasurement"] + Iterable["SimpleQuantity"] """ @abc.abstractmethod - def __getitem__(self, item) -> "SimpleMeasurement": + def __getitem__(self, item) -> "SimpleQuantity": """ - Access items of the measure (only one for SimpleMeasurement) + Access items of the measure (only one for SimpleQuantity) Parameters ---------- @@ -94,7 +94,7 @@ def __getitem__(self, item) -> "SimpleMeasurement": Returns ------- - SimpleMeasurement + SimpleQuantity """ @@ -125,10 +125,10 @@ def parse_unit(self, 
unit: str) -> Tuple[str, float]: return str(dict(sorted(degrees.items()))), scale -class SimpleMeasurement(Measurement): +class SimpleQuantity(Quantity): def __init__(self, value, unit, registry): """ - The SimpleMeasurement class contains the value and unit + The SimpleQuantity class contains the value and unit for a single non-composite measure Parameters @@ -155,24 +155,24 @@ def __len__(self): return 1 def __repr__(self): - return f"Measurement({self.value}, {repr(self.unit)})" + return f"Quantity({self.value}, {repr(self.unit)})" def __eq__(self, other: Any): - if isinstance(other, SimpleMeasurement): + if isinstance(other, SimpleQuantity): return self.convert_to(other.unit) == other.value return False - def __add__(self, other: "SimpleMeasurement"): + def __add__(self, other: "SimpleQuantity"): if other.unit == self.unit: return self.__class__(self.value + other.value, self.unit, self.registry) return self.__class__( self.value + other.convert_to(self.unit), self.unit, self.registry ) - def __lt__(self, other: Union["SimpleMeasurement", "RangeMeasurement"]): + def __lt__(self, other: Union["SimpleQuantity", "RangeQuantity"]): return self.convert_to(other.unit) < min((part.value for part in other)) - def __le__(self, other: Union["SimpleMeasurement", "RangeMeasurement"]): + def __le__(self, other: Union["SimpleQuantity", "RangeQuantity"]): return self.convert_to(other.unit) <= min((part.value for part in other)) def convert_to(self, other_unit): @@ -199,7 +199,7 @@ def verify(cls, ent): return True -class RangeMeasurement(Measurement): +class RangeQuantity(Quantity): def __init__(self, from_value, to_value, unit, registry): super().__init__() self.value = (from_value, to_value) @@ -207,10 +207,10 @@ def __init__(self, from_value, to_value, unit, registry): self.registry = registry @classmethod - def from_measurements(cls, a, b): + def from_quantities(cls, a, b): a_value = a.value b_value = b.convert_to(a.unit) - return RangeMeasurement(a_value, b_value, 
a.unit, a.registry) + return RangeQuantity(a_value, b_value, a.unit, a.registry) def convert_to(self, other_unit): self_degrees, self_scale = self.registry.parse_unit(self.unit) @@ -234,17 +234,17 @@ def __iter__(self): def __len__(self): return 2 - def __lt__(self, other: Union[SimpleMeasurement, "RangeMeasurement"]): + def __lt__(self, other: Union[SimpleQuantity, "RangeQuantity"]): return max(self.convert_to(other.unit)) < min((part.value for part in other)) - def __le__(self, other: Union[SimpleMeasurement, "RangeMeasurement"]): + def __le__(self, other: Union[SimpleQuantity, "RangeQuantity"]): return max(self.convert_to(other.unit)) <= max((part.value for part in other)) def __getattr__(self, other_unit): return self.convert_to(other_unit) def __eq__(self, other: Any): - if isinstance(other, RangeMeasurement): + if isinstance(other, RangeQuantity): return self.convert_to(other.unit) == other.value return False @@ -252,26 +252,26 @@ def __str__(self): return f"{self.value[0]}-{self.value[1]} {self.unit}" def __repr__(self): - return f"RangeMeasurement({self.value}, {repr(self.unit)})" + return f"RangeQuantity({self.value}, {repr(self.unit)})" def __getitem__(self, item: int): assert isinstance(item, int) - return SimpleMeasurement(self.value[item], self.unit, self.registry) + return SimpleQuantity(self.value[item], self.unit, self.registry) @classmethod def verify(cls, ent): return True -class MeasurementsMatcher(BaseNERComponent): +class QuantitiesMatcher(BaseNERComponent): r''' - The `eds.measurements` matcher detects and normalizes numerical measurements + The `eds.quantities` matcher detects and normalizes numerical quantities within a medical document. !!! warning - The ``measurements`` pipeline is still in active development and has not - been rigorously validated. If you come across a measurement expression that + The ``quantities`` pipeline is still in active development and has not + been rigorously validated. 
If you come across a quantity expression that goes undetected, please file an issue ! Pipe definition @@ -281,7 +281,7 @@ class MeasurementsMatcher(BaseNERComponent): On mesure ... à 3mmol/l ; pression : 100mPa-110mPa. Acte réalisé par ... à 12h13""" ``` - === "All measurements" + === "All quantities" ```python import edsnlp @@ -289,19 +289,19 @@ class MeasurementsMatcher(BaseNERComponent): nlp.add_pipe("eds.sentences") nlp.add_pipe("eds.tables") nlp.add_pipe( - "eds.measurements", + "eds.quantities", config=dict( - measurements="all", extract_ranges=True, use_tables=True # (3) # (1) + quantities="all", extract_ranges=True, use_tables=True # (3) # (1) ), # (2) ) - nlp(text).spans["measurements"] + nlp(text).spans["quantities"] # Out: [65, 1.75, 3mmol/l, 100mPa-110mPa, 12h13] ``` 1. 100-110mg, 2 à 4 jours ... 2. If True `eds.tables` must be called 3. All units from [Availability](#availability) will be detected - === "Custom measurements" + === "Custom quantities" ```python import edsnlp @@ -309,9 +309,9 @@ class MeasurementsMatcher(BaseNERComponent): nlp.add_pipe("eds.sentences") nlp.add_pipe("eds.tables") nlp.add_pipe( - "eds.measurements", + "eds.quantities", config=dict( - measurements={ + quantities={ "concentration": {"unit": "mol_per_l"}, "pressure": {"unit": "Pa"}, }, # (3) @@ -319,7 +319,7 @@ class MeasurementsMatcher(BaseNERComponent): use_tables=True, ), # (2) ) - nlp(text).spans["measurements"] + nlp(text).spans["quantities"] # Out: [3mmol/l, 100mPa-110mPa] ``` @@ -327,7 +327,7 @@ class MeasurementsMatcher(BaseNERComponent): 2. If True `eds.tables` must be called 3. Which units are available ? See [Availability](#availability). More on customization ? 
See [Customization](#customization) - === "Predefined measurements" + === "Predefined quantities" ```python import edsnlp @@ -335,31 +335,31 @@ class MeasurementsMatcher(BaseNERComponent): nlp.add_pipe("eds.sentences") nlp.add_pipe("eds.tables") nlp.add_pipe( - "eds.measurements", + "eds.quantities", config=dict( - measurements=["weight", "size"], # (3) + quantities=["weight", "size"], # (3) extract_ranges=True, # (1) use_tables=True, ), # (2) ) - nlp(text).spans["measurements"] + nlp(text).spans["quantities"] # Out: [65, 1.75] ``` 1. 100-110mg, 2 à 4 jours ... 2. If True `eds.tables` must be called - 3. Which measurements are available ? See [Availability](#availability) + 3. Which quantities are available ? See [Availability](#availability) Scope ----- - The `eds.measurements` matcher can extract simple (e.g. `3cm`) measurements. - It can also detect elliptic enumerations (eg `32, 33 et 34kg`) of measurements - of the same type and split the measurements accordingly. + The `eds.quantities` matcher can extract simple (e.g. `3cm`) quantities. + It can also detect elliptic enumerations (eg `32, 33 et 34kg`) of quantities + of the same type and split the quantities accordingly. The normalized value can then be accessed via the `span._.{measure_name}` attribute, for instance `span._.size` or `span._.weight` and be converted on the fly to a desired unit. Like for other components, the `span._.value` extension can also be - used to access the normalized value for any measurement span. + used to access the normalized value for any quantity span. 
See Availability section for details on which units are handled @@ -370,8 +370,8 @@ class MeasurementsMatcher(BaseNERComponent): nlp = edsnlp.blank("eds") nlp.add_pipe( - eds.measurements( - measurements=["size", "weight", "bmi"], + eds.quantities( + quantities=["size", "weight", "bmi"], extract_ranges=True, ), ) @@ -386,63 +386,63 @@ class MeasurementsMatcher(BaseNERComponent): doc = nlp(text) - measurements = doc.spans["measurements"] + quantities = doc.spans["quantities"] - measurements + quantities # Out: [1m78, 76kg, 1,2, 2.4mm, 24, entre 1 et 1.5 cm] - measurements[0] + quantities[0] # Out: 1m78 - str(measurements[0]._.size), str(measurements[0]._.value) + str(quantities[0]._.size), str(quantities[0]._.value) # Out: ('1.78 m', '1.78 m') - measurements[0]._.value.cm + quantities[0]._.value.cm # Out: 178.0 - measurements[2] + quantities[2] # Out: 1,2 - str(measurements[2]._.value) + str(quantities[2]._.value) # Out: '1.2 mm' - str(measurements[2]._.value.mm) + str(quantities[2]._.value.mm) # Out: 1.2 - measurements[4] + quantities[4] # Out: 24 - str(measurements[4]._.value) + str(quantities[4]._.value) # Out: '24 kg_per_m2' - str(measurements[4]._.value.kg_per_m2) + str(quantities[4]._.value.kg_per_m2) # Out: 24 - str(measurements[5]._.value) + str(quantities[5]._.value) # Out: 1-1.5 cm ``` - To extract all sizes in centimeters, and average range measurements, you can + To extract all sizes in centimeters, and average range quantities, you can use the following snippet: ```python sizes = [ sum(item.cm for item in m._.value) / len(m._.value) - for m in doc.spans["measurements"] + for m in doc.spans["quantities"] if m.label_ == "size" ] sizes # Out: [178.0, 0.12, 0.24, 1.25] ``` - To extract the measurements from many texts, you can use the following snippet: + To extract the quantities from many texts, you can use the following snippet: ```python import edsnlp, edsnlp.pipes as eds nlp = edsnlp.blank("eds") nlp.add_pipe( - eds.measurements(measurements="weight", 
extract_ranges=True, as_ents=True), + eds.quantities(quantities="weight", extract_ranges=True, as_ents=True), ) texts = ["Le patient mesure 40000,0 g (aussi noté 40 kg)"] docs = edsnlp.data.from_iterable(texts) @@ -456,10 +456,10 @@ class MeasurementsMatcher(BaseNERComponent): # 1 None 40 45 weight 40 kg ents kg 40.0 ``` - Available units and measurements + Available units and quantities -------------------------------- - Feel free to propose any missing raw unit or predefined measurement. + Feel free to propose any missing raw unit or predefined quantity. Raw units and their derivations (g, mg, mgr ...) and their compositions (g/ml, cac/j ...) can be detected. @@ -469,31 +469,31 @@ class MeasurementsMatcher(BaseNERComponent): `g, m, m2, m3, mol, ui, Pa, %, log, mmHg, s/min/h/d/w/m/y, arc-second, °, °C, cac, goutte, l, x10*4, x10*5` - __Available predefined measurements :__ + __Available predefined quantities :__ - | measurement_name | Example | + | quantity_name | Example | |------------------|------------------------| | `size` | `1m50`, `1.50m`... | | `weight` | `1kg`, `Poids : 65`... | | `bmi` | `BMI: 24`, `24 kg.m-2` | | `volume` | `2 cac`, `8ml`... | - See the [patterns](https://github.com/aphp/edsnlp/blob/master/edsnlp/pipes/misc/measurements/patterns.py) + See the [patterns](https://github.com/aphp/edsnlp/blob/master/edsnlp/pipes/misc/quantities/patterns.py) for exhaustive definition. 
Customization ------------- - You can declare custom measurements by altering the patterns: + You can declare custom quantities by altering the patterns: ```python import edsnlp, edsnlp.pipes as eds nlp = edsnlp.blank("eds") nlp.add_pipe( - eds.measurements( - measurements={ - "my_custom_surface_measurement": { - # This measurement unit is homogenous to square meters + eds.quantities( + quantities={ + "my_custom_surface_quantity": { + # This quantity unit is homogenous to square meters "unit": "m2", # Handle cases like "surface: 1.8" (implied m2), # vs "surface: 50" (implied cm2) @@ -514,8 +514,8 @@ class MeasurementsMatcher(BaseNERComponent): Extensions ---------- - The `eds.measurements` pipeline declares its extensions dynamically, depending - on the `measurements` parameter: each measurement gets its own extension, and + The `eds.quantities` pipeline declares its extensions dynamically, depending + on the `quantities` parameter: each quantity gets its own extension, and is assigned to a different span group. Parameters @@ -524,7 +524,7 @@ class MeasurementsMatcher(BaseNERComponent): The pipeline object name : str The name of the component. - measurements : Union[str, List[Union[str, MsrConfig]], Dict[str, MsrConfig]] + quantities : Union[str, List[Union[str, MsrConfig]], Dict[str, MsrConfig]] A mapping from measure names to MsrConfig Each measure's configuration has the following shape: ```{ .python .no-check } @@ -544,7 +544,7 @@ class MeasurementsMatcher(BaseNERComponent): } } ``` - Set `measurements="all"` to extract all raw measurements from units_config file. + Set `quantities="all"` to extract all raw quantities from units_config file. 
number_terms: Dict[str, List[str] A mapping of numbers to their lexical variants stopwords: List[str] @@ -562,17 +562,17 @@ class MeasurementsMatcher(BaseNERComponent): extract_ranges: bool Whether to extract ranges (like "entre 1 et 2 cm") range_patterns: List[Tuple[str, str]] - A list of "{FROM} xx {TO} yy" patterns to match range measurements + A list of "{FROM} xx {TO} yy" patterns to match range quantities after_snippet_limit: int - Maximum word distance after to link a part of a measurement after its number + Maximum word distance after to link a part of a quantity after its number before_snippet_limit: int - Maximum word distance after to link a part of a measurement before its number + Maximum word distance before to link a part of a quantity before its number span_setter: Optional[SpanSetterArg] - How to set the spans in the document. By default, each measurement will + How to set the spans in the document. By default, each quantity will be assigned to its own span group (using either the "name" field of the - config, or the key if you passed a dict), and to the "measurements" group. + config, or the key if you passed a dict), and to the "quantities" group. span_getter : SpanGetterArg - Where to look for measurements in the doc. By default, look in the whole doc. + Where to look for quantities in the doc. By default, look in the whole doc. You can combine this with the `merge_mode` argument for interesting results. merge_mode : Literal["intersect", "align"] How to merge matches with the spans from `span_getter`, if given: @@ -585,16 +585,16 @@ class MeasurementsMatcher(BaseNERComponent): Authors and citation -------------------- - The `eds.measurements` pipeline was developed by AP-HP's Data Science team. + The `eds.quantities` pipeline was developed by AP-HP's Data Science team. 
''' # noqa: E501 # fmt: off def __init__( self, nlp: PipelineProtocol, - name: str = "measurements", + name: str = "quantities", *, - measurements: Union[str, List[Union[str, MsrConfig]], Dict[str, MsrConfig]] = list(patterns.common_measurements.keys()), # noqa: E501 + quantities: Union[str, List[Union[str, MsrConfig]], Dict[str, MsrConfig]] = list(patterns.common_quantities.keys()), # noqa: E501 units_config: Dict[str, UnitConfig] = patterns.units_config, number_terms: Dict[str, List[str]] = patterns.number_terms, number_regex: str = patterns.number_regex, @@ -624,22 +624,22 @@ def __init__( "Skipping that step." ) - self.all_measurements = (measurements == "all") - if self.all_measurements: - measurements = [] + self.all_quantities = (quantities == "all") + if self.all_quantities: + quantities = [] # fmt: on - if isinstance(measurements, str): - measurements = [measurements] - if isinstance(measurements, (list, tuple)): - measurements = [ + if isinstance(quantities, str): + quantities = [quantities] + if isinstance(quantities, (list, tuple)): + quantities = [ m if isinstance(m, dict) - else {**patterns.common_measurements[m], "name": m} - for m in measurements + else {**patterns.common_quantities[m], "name": m} + for m in quantities ] - elif isinstance(measurements, dict): - measurements = [{"name": k, **m} for k, m in measurements.items()] + elif isinstance(quantities, dict): + quantities = [{"name": k, **m} for k, m in quantities.items()] self.unit_registry = UnitRegistry(units_config) self.unitless_patterns: Dict[str, UnitlessPatternConfigWithName] = {} @@ -660,13 +660,13 @@ def __init__( self.after_snippet_limit = after_snippet_limit # MEASURES - for m in measurements: + for m in quantities: self.measure_names[self.unit_registry.parse_unit(m["unit"])[0]] = m["name"] if span_setter is None: span_setter = { "ents": as_ents, - "measurements": True, + "quantities": True, **{ name: [name] for name in self.measure_names.values() @@ -691,13 +691,13 @@ def __init__( 
ignore_space_tokens=True, ) - if self.all_measurements: - measurements = [ - {"name": name, **common_measurement} - for name, common_measurement in patterns.common_measurements.items() + if self.all_quantities: + quantities = [ + {"name": name, **common_quantity} + for name, common_quantity in patterns.common_quantities.items() ] - for measure_config in measurements: + for measure_config in quantities: name = measure_config["name"] unit = measure_config["unit"] self.measure_names[self.unit_registry.parse_unit(unit)[0]] = name @@ -744,7 +744,7 @@ def __init__( def set_extensions(self) -> None: """ - Set extensions for the measurements pipeline. + Set extensions for the quantities pipeline. """ super().set_extensions() @@ -863,7 +863,7 @@ def make_pseudo_sentence( def get_matches(self, doc): """ Extract and filter regex and phrase matches in the document - to prepare the measurement extraction. + to prepare the quantity extraction. Returns the matches and a list of hashes to quickly find unit matches Parameters @@ -892,14 +892,14 @@ def get_matches(self, doc): if term.label not in self.unit_part_label_hashes ] - # Filter out measurement-related spans that overlap already matched + # Filter out quantity-related spans that overlap already matched # entities (in doc.ents or doc.spans["dates"]) # Note: we also include sentence ends tokens as 1-token spans in those matches - # Prevent from matching over ents that are not measurement related + # Prevent from matching over ents that are not quantity related ents = (e for e in doc.ents if e.label_ not in self.measure_names.values()) spans__keep__is_sent_end = filter_spans( [ - # Tuples (span, keep = is measurement related, is sentence end) + # Tuples (span, keep = is quantity related, is sentence end) *zip(get_span_group(doc, "dates"), repeat(False), repeat(False)), *zip(regex_matches, repeat(True), repeat(False)), *zip(non_unit_terms, repeat(True), repeat(False)), @@ -909,7 +909,7 @@ def get_matches(self, doc): ] ) - # 
Remove non-measurement related spans (keep = False) and sort the matches + # Remove non-quantity related spans (keep = False) and sort the matches matches_and_is_sentence_end: List[(Span, bool)] = sorted( [ (span, is_sent_end) @@ -921,7 +921,7 @@ def get_matches(self, doc): return matches_and_is_sentence_end, unit_label_hashes - def extract_measurements(self, doclike: Doc): + def extract_quantities(self, doclike: Doc): """ Extracts measure entities from the document @@ -971,7 +971,7 @@ def get_matches_before(i): }, ) - measurements = [] + quantities = [] matched_unit_indices = set() matched_number_indices = set() @@ -1117,10 +1117,10 @@ def is_within_row(x): # If the measure was not requested, dismiss it # Otherwise, relabel the entity and create the value attribute - if (dims not in self.measure_names) and not self.all_measurements: + if (dims not in self.measure_names) and not self.all_quantities: continue - if self.all_measurements: + if self.all_quantities: if not Span.has_extension(unit_norm): Span.set_extension(unit_norm, default=None) ent.label_ = unit_norm @@ -1129,10 +1129,10 @@ def is_within_row(x): ent.label_ = self.measure_names[dims] ent._.set( ent.label_, - SimpleMeasurement(value, unit_norm, self.unit_registry) + SimpleQuantity(value, unit_norm, self.unit_registry) ) - measurements.append(ent) + quantities.append(ent) if unit_idx is not None: matched_unit_indices.add(unit_idx) @@ -1149,26 +1149,26 @@ def is_within_row(x): ): unmatched.append(match) - return measurements, unmatched + return quantities, unmatched @classmethod - def merge_adjacent_measurements(cls, measurements: List[Span]) -> List[Span]: + def merge_adjacent_quantities(cls, quantities: List[Span]) -> List[Span]: """ - Aggregates extracted measurements together when they are adjacent to handle + Aggregates extracted quantities together when they are adjacent to handle cases like - 1 meter 50 cm - 30° 4' 54" Parameters ---------- - measurements: List[Span] + quantities: List[Span] 
Returns ------- List[Span] """ - merged = measurements[:1] - for ent in measurements[1:]: + merged = quantities[:1] + for ent in quantities[1:]: last = merged[-1] if last.end == ent.start and last._.value.unit != ent._.value.unit: @@ -1184,26 +1184,26 @@ def merge_adjacent_measurements(cls, measurements: List[Span]) -> List[Span]: return merged - def merge_measurements_in_ranges(self, measurements: List[Span]) -> List[Span]: + def merge_quantities_in_ranges(self, quantities: List[Span]) -> List[Span]: """ - Aggregates extracted measurements together when they are adjacent to handle + Aggregates extracted quantities together when they are adjacent to handle cases like - 1 meter 50 cm - 30° 4' 54" Parameters ---------- - measurements: List[Span] + quantities: List[Span] Returns ------- List[Span] """ if not self.extract_ranges or not self.range_patterns: - return measurements + return quantities - merged = measurements[:1] - for ent in measurements[1:]: + merged = quantities[:1] + for ent in quantities[1:]: last = merged[-1] from_text = last.doc[last.start - 1].norm_ if last.start > 0 else None @@ -1215,7 +1215,7 @@ def merge_measurements_in_ranges(self, measurements: List[Span]) -> List[Span]: ] if len(matching_patterns): try: - new_value = RangeMeasurement.from_measurements( + new_value = RangeQuantity.from_quantities( last._.value, ent._.value ) merged[-1] = last = last.doc[ @@ -1238,41 +1238,41 @@ def merge_with_existing( existing: List[Span], ) -> List[Span]: """ - Merges the extracted measurements with the existing measurements in the + Merges the extracted quantities with the existing quantities in the document. 
Parameters ---------- extracted: List[Span] - The extracted measurements + The extracted quantities existing: List[Span] - The existing measurements in the document + The existing quantities in the document Returns ------- List[Span] """ if self.merge_mode == "align": - spans_measurements = align_spans(extracted, existing, sort_by_overlap=True) + spans_quantities = align_spans(extracted, existing, sort_by_overlap=True) extracted = [] - for span, span_measurements in zip(existing, spans_measurements): - if len(span_measurements): - span._.set(span.label_, span_measurements[0]._.get(span.label_)) + for span, span_quantities in zip(existing, spans_quantities): + if len(span_quantities): + span._.set(span.label_, span_quantities[0]._.get(span.label_)) extracted.append(span) elif self.merge_mode == "intersect": - spans_measurements = align_spans(extracted, existing) + spans_quantities = align_spans(extracted, existing) extracted = [] - for span, span_measurements in zip(existing, spans_measurements): - extracted.extend(span_measurements) + for span, span_quantities in zip(existing, spans_quantities): + extracted.extend(span_quantities) extracted = list(dict.fromkeys(extracted)) return extracted def __call__(self, doc): """ - Adds measurements to document's "measurements" SpanGroup. + Adds quantities to document's "quantities" SpanGroup. Parameters ---------- @@ -1282,7 +1282,7 @@ def __call__(self, doc): Returns ------- doc: - spaCy Doc object, annotated for extracted measurements. + spaCy Doc object, annotated for extracted quantities. 
""" existing = ( list(get_spans(doc, self.span_getter)) @@ -1293,12 +1293,12 @@ def __call__(self, doc): if self.span_getter is not None else [doc] ) - measurements = [m for s in snippets for m in self.extract_measurements(s)[0]] - measurements = self.merge_adjacent_measurements(measurements) - measurements = self.merge_measurements_in_ranges(measurements) + quantities = [m for s in snippets for m in self.extract_quantities(s)[0]] + quantities = self.merge_adjacent_quantities(quantities) + quantities = self.merge_quantities_in_ranges(quantities) if self.span_getter is not None: - measurements = self.merge_with_existing(measurements, existing) + quantities = self.merge_with_existing(quantities, existing) - self.set_spans(doc, measurements) + self.set_spans(doc, quantities) return doc diff --git a/mkdocs.yml b/mkdocs.yml index 9edd09038..1b736c324 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -73,7 +73,7 @@ nav: - Miscellaneous: - pipes/misc/index.md - pipes/misc/dates.md - - pipes/misc/measurements.md + - pipes/misc/quantities.md - pipes/misc/consultation-dates.md - pipes/misc/sections.md - pipes/misc/reason.md diff --git a/pyproject.toml b/pyproject.toml index c6db7d300..950cce87e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -169,7 +169,7 @@ where = ["."] # Misc "eds.consultation_dates" = "edsnlp.pipes.misc.consultation_dates.factory:create_component" "eds.dates" = "edsnlp.pipes.misc.dates.factory:create_component" -"eds.measurements" = "edsnlp.pipes.misc.measurements.factory:create_component" +"eds.quantities" = "edsnlp.pipes.misc.quantities.factory:create_component" "eds.reason" = "edsnlp.pipes.misc.reason.factory:create_component" "eds.sections" = "edsnlp.pipes.misc.sections.factory:create_component" "eds.tables" = "edsnlp.pipes.misc.tables.factory:create_component" @@ -192,7 +192,8 @@ where = ["."] "eds.emergency.ccmu" = "edsnlp.pipes.ner.scores.emergency.ccmu.factory:create_component" "eds.emergency.gemsa" = 
"edsnlp.pipes.ner.scores.emergency.gemsa.factory:create_component" "eds.emergency.priority" = "edsnlp.pipes.ner.scores.emergency.priority.factory:create_component" -"eds.measures" = "edsnlp.pipes.misc.measurements.factory:create_component" +"eds.measures" = "edsnlp.pipes.misc.quantities.factory:create_component" +"eds.measurements" = "edsnlp.pipes.misc.quantities.factory:create_component" "eds.remove-lowercase" = "edsnlp.pipes.core.normalizer.remove_lowercase.factory:create_component" "emergency.ccmu" = "edsnlp.pipes.ner.scores.emergency.ccmu.factory:create_component" "emergency.gemsa" = "edsnlp.pipes.ner.scores.emergency.gemsa.factory:create_component" diff --git a/tests/helpers.py b/tests/helpers.py index 826091b5b..378faa7aa 100644 --- a/tests/helpers.py +++ b/tests/helpers.py @@ -37,6 +37,6 @@ def make_nlp(lang): model.add_pipe("eds.reported_speech") model.add_pipe("eds.dates") - model.add_pipe("eds.measurements") + model.add_pipe("eds.quantities") return model diff --git a/tests/pipelines/misc/test_measurements.py b/tests/pipelines/misc/test_quantities.py similarity index 70% rename from tests/pipelines/misc/test_measurements.py rename to tests/pipelines/misc/test_quantities.py index b7461be3e..98353c286 100644 --- a/tests/pipelines/misc/test_measurements.py +++ b/tests/pipelines/misc/test_quantities.py @@ -5,7 +5,7 @@ from spacy.tokens.span import Span from edsnlp.core import PipelineProtocol -from edsnlp.pipelines.misc.measurements import MeasurementsMatcher +from edsnlp.pipelines.misc.quantities import QuantitiesMatcher text = ( "Le patient fait 1 m 50 kg. La tumeur fait 2.0cm x 3cm. 
\n" @@ -30,34 +30,32 @@ def blank_nlp(): @fixture def matcher(blank_nlp: PipelineProtocol): - return MeasurementsMatcher(blank_nlp, extract_ranges=True, use_tables=True) + return QuantitiesMatcher(blank_nlp, extract_ranges=True, use_tables=True) def test_default_factory(blank_nlp: PipelineProtocol): blank_nlp.add_pipe("matcher", config=dict(terms={"patient": "patient"})) blank_nlp.add_pipe( - "eds.measurements", - config=dict(measurements=["size", "weight", "bmi"], use_tables=True), + "eds.quantities", + config=dict(quantities=["size", "weight", "bmi"], use_tables=True), ) doc = blank_nlp(text) assert len(doc.ents) == 1 - assert len(doc.spans["measurements"]) == 15 + assert len(doc.spans["quantities"]) == 15 -def test_measurements_component( - blank_nlp: PipelineProtocol, matcher: MeasurementsMatcher -): +def test_quantities_component(blank_nlp: PipelineProtocol, matcher: QuantitiesMatcher): doc = blank_nlp(text) with raises(KeyError): - doc.spans["measurements"] + doc.spans["quantities"] doc = matcher(doc) - m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13 = doc.spans["measurements"] + m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13 = doc.spans["quantities"] assert str(m1._.value) == "1 m" assert str(m2._.value) == "50 kg" @@ -74,17 +72,17 @@ def test_measurements_component( assert str(m13._.value) == "13-14 g" -def test_measurements_component_scaling( - blank_nlp: PipelineProtocol, matcher: MeasurementsMatcher +def test_quantities_component_scaling( + blank_nlp: PipelineProtocol, matcher: QuantitiesMatcher ): doc = blank_nlp(text) with raises(KeyError): - doc.spans["measurements"] + doc.spans["quantities"] doc = matcher(doc) - m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13 = doc.spans["measurements"] + m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13 = doc.spans["quantities"] assert abs(m1._.value.cm - 100) < 1e-6 assert abs(m2._.value.mg - 50000000.0) < 1e-6 @@ -103,11 +101,11 @@ def test_measurements_component_scaling( assert 
abs(m13._.value.g[1] - 14.0) < 1e-6 -def test_measure_label(blank_nlp: PipelineProtocol, matcher: MeasurementsMatcher): +def test_measure_label(blank_nlp: PipelineProtocol, matcher: QuantitiesMatcher): doc = blank_nlp(text) doc = matcher(doc) - m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13 = doc.spans["measurements"] + m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13 = doc.spans["quantities"] assert m1.label_ == "size" assert m2.label_ == "weight" @@ -124,23 +122,21 @@ def test_measure_label(blank_nlp: PipelineProtocol, matcher: MeasurementsMatcher assert m13.label_ == "weight" -def test_measurements_all_input( - blank_nlp: PipelineProtocol, matcher: MeasurementsMatcher -): +def test_quantities_all_input(blank_nlp: PipelineProtocol, matcher: QuantitiesMatcher): all_text = "On mesure 13 mol/ml de ..." "On compte 16x10*9 ..." blank_nlp.add_pipe( - "eds.measurements", - config=dict(measurements="all", extract_ranges=True), + "eds.quantities", + config=dict(quantities="all", extract_ranges=True), ) doc = blank_nlp(all_text) - m1, m2 = doc.spans["measurements"] + m1, m2 = doc.spans["quantities"] assert str(m1._.value) == "13 mol_per_ml" assert str(m2._.value) == "16 x10*9" -def test_measure_str(blank_nlp: PipelineProtocol, matcher: MeasurementsMatcher): +def test_measure_str(blank_nlp: PipelineProtocol, matcher: QuantitiesMatcher): for text, res in [ ("1m50", "1.5 m"), ("1,50cm", "1.5 cm"), @@ -148,39 +144,39 @@ def test_measure_str(blank_nlp: PipelineProtocol, matcher: MeasurementsMatcher): doc = blank_nlp(text) doc = matcher(doc) - assert str(doc.spans["measurements"][0]._.value) == res + assert str(doc.spans["quantities"][0]._.value) == res -def test_measure_repr(blank_nlp: PipelineProtocol, matcher: MeasurementsMatcher): +def test_measure_repr(blank_nlp: PipelineProtocol, matcher: QuantitiesMatcher): for text, res in [ ( "1m50", - "Measurement(1.5, 'm')", + "Quantity(1.5, 'm')", ), ( "1,50cm", - "Measurement(1.5, 'cm')", + "Quantity(1.5, 'cm')", ), ]: 
doc = blank_nlp(text) doc = matcher(doc) - print(doc.spans["measurements"]) + print(doc.spans["quantities"]) - assert repr(doc.spans["measurements"][0]._.value) == res + assert repr(doc.spans["quantities"][0]._.value) == res -def test_compare(blank_nlp: PipelineProtocol, matcher: MeasurementsMatcher): +def test_compare(blank_nlp: PipelineProtocol, matcher: QuantitiesMatcher): m1, m2 = "1m0", "120cm" - m1 = matcher(blank_nlp(m1)).spans["measurements"][0] - m2 = matcher(blank_nlp(m2)).spans["measurements"][0] + m1 = matcher(blank_nlp(m1)).spans["quantities"][0] + m2 = matcher(blank_nlp(m2)).spans["quantities"][0] assert m1._.value <= m2._.value assert m2._.value > m1._.value m3 = "Entre deux et trois metres" m4 = "De 2 à 3 metres" - m3 = matcher(blank_nlp(m3)).spans["measurements"][0] - m4 = matcher(blank_nlp(m4)).spans["measurements"][0] + m3 = matcher(blank_nlp(m3)).spans["quantities"][0] + m4 = matcher(blank_nlp(m4)).spans["quantities"][0] print(blank_nlp("Entre deux et trois metres")) assert str(m3._.value) == "2-3 m" assert str(m4._.value) == "2-3 m" @@ -193,7 +189,7 @@ def test_compare(blank_nlp: PipelineProtocol, matcher: MeasurementsMatcher): assert max(list(chain(m1._.value, m2._.value, m3._.value, m4._.value))).cm == 300 -def test_unitless(blank_nlp: PipelineProtocol, matcher: MeasurementsMatcher): +def test_unitless(blank_nlp: PipelineProtocol, matcher: QuantitiesMatcher): for text, res in [ ("BMI: 24 .", "24 kg_per_m2"), ("Le patient mesure 1.5 ", "1.5 m"), @@ -203,10 +199,10 @@ def test_unitless(blank_nlp: PipelineProtocol, matcher: MeasurementsMatcher): doc = blank_nlp(text) doc = matcher(doc) - assert str(doc.spans["measurements"][0]._.value) == res + assert str(doc.spans["quantities"][0]._.value) == res -def test_non_matches(blank_nlp: PipelineProtocol, matcher: MeasurementsMatcher): +def test_non_matches(blank_nlp: PipelineProtocol, matcher: QuantitiesMatcher): for text in [ "On délivre à 10 g / h.", "Le patient grandit de 10 cm par jour ", @@ 
-217,10 +213,10 @@ def test_non_matches(blank_nlp: PipelineProtocol, matcher: MeasurementsMatcher): print(list(doc)) doc = matcher(doc) - assert len(doc.spans["measurements"]) == 0 + assert len(doc.spans["quantities"]) == 0 -def test_numbers(blank_nlp: PipelineProtocol, matcher: MeasurementsMatcher): +def test_numbers(blank_nlp: PipelineProtocol, matcher: QuantitiesMatcher): for text, res in [ ("deux m", "2 m"), ("2 m", "2 m"), @@ -231,10 +227,10 @@ def test_numbers(blank_nlp: PipelineProtocol, matcher: MeasurementsMatcher): doc = blank_nlp(text) doc = matcher(doc) - assert str(doc.spans["measurements"][0]._.value) == res + assert str(doc.spans["quantities"][0]._.value) == res -def test_ranges(blank_nlp: PipelineProtocol, matcher: MeasurementsMatcher): +def test_ranges(blank_nlp: PipelineProtocol, matcher: QuantitiesMatcher): for text, res, snippet in [ ("Le patient fait entre 1 et 2m", "1-2 m", "entre 1 et 2m"), ("On mesure de 2 à 2.5 dl d'eau", "2-2.5 dl", "de 2 à 2.5 dl"), @@ -242,10 +238,10 @@ def test_ranges(blank_nlp: PipelineProtocol, matcher: MeasurementsMatcher): doc = blank_nlp(text) doc = matcher(doc) - measurement = doc.spans["measurements"][0] - print(doc.spans["measurements"]) - assert str(measurement._.value) == res - assert measurement.text == snippet + quantity = doc.spans["quantities"][0] + print(doc.spans["quantities"]) + assert str(quantity._.value) == res + assert quantity.text == snippet def test_merge_align(blank_nlp, matcher): @@ -261,7 +257,7 @@ def test_merge_align(blank_nlp, matcher): assert str(ent._.value) == "2.0 cm" -def test_merge_intersect(blank_nlp, matcher: MeasurementsMatcher): +def test_merge_intersect(blank_nlp, matcher: QuantitiesMatcher): matcher.merge_mode = "intersect" matcher.span_setter = {**matcher.span_setter, "ents": True} matcher.span_getter = {"lookup_zones": True} @@ -271,12 +267,12 @@ def test_merge_intersect(blank_nlp, matcher: MeasurementsMatcher): doc = matcher(doc) assert len(doc.ents) == 2 - assert 
len(doc.spans["measurements"]) == 2 + assert len(doc.spans["quantities"]) == 2 assert [doc.ents[0].text, doc.ents[1].text] == ["2.0cm", "3cm"] assert [doc.ents[0]._.value.cm, doc.ents[1]._.value.cm] == [2.0, 3] -def test_measurement_snippets(blank_nlp, matcher: MeasurementsMatcher): +def test_quantity_snippets(blank_nlp, matcher: QuantitiesMatcher): for text, result in [ ("0.50g", ["0.5 g"]), ("0.050g", ["0.05 g"]), @@ -291,10 +287,10 @@ def test_measurement_snippets(blank_nlp, matcher: MeasurementsMatcher): doc = blank_nlp(text) doc = matcher(doc) - assert [str(span._.value) for span in doc.spans["measurements"]] == result + assert [str(span._.value) for span in doc.spans["quantities"]] == result -def test_error_management(blank_nlp, matcher: MeasurementsMatcher): +def test_error_management(blank_nlp, matcher: QuantitiesMatcher): text = """ Leucocytes ¦ ¦ ¦4.2 ¦ ¦4.0-10.0 Hémoglobine ¦ ¦9.0 - ¦ ¦13-14 @@ -302,4 +298,4 @@ def test_error_management(blank_nlp, matcher: MeasurementsMatcher): doc = blank_nlp(text) doc = matcher(doc) - assert len(doc.spans["measurements"]) == 0 + assert len(doc.spans["quantities"]) == 0 From c8049213b95b6750bf389f1661fe7b651a1f3a07 Mon Sep 17 00:00:00 2001 From: svittoz Date: Wed, 21 Aug 2024 15:53:37 +0000 Subject: [PATCH 2/4] adding possibility to select cim10 and atc in eds.cim10 and eds.drugs --- edsnlp/pipes/ner/cim10/factory.py | 8 ++++++-- edsnlp/pipes/ner/cim10/patterns.py | 4 +++- edsnlp/pipes/ner/drugs/factory.py | 8 ++++++-- edsnlp/pipes/ner/drugs/patterns.py | 13 +++++++++++-- 4 files changed, 26 insertions(+), 7 deletions(-) diff --git a/edsnlp/pipes/ner/cim10/factory.py b/edsnlp/pipes/ner/cim10/factory.py index b61967306..a128fca19 100644 --- a/edsnlp/pipes/ner/cim10/factory.py +++ b/edsnlp/pipes/ner/cim10/factory.py @@ -1,4 +1,4 @@ -from typing import Any, Dict +from typing import Any, Dict, List from typing_extensions import Literal @@ -28,6 +28,7 @@ def create_component( name: str = "cim10", *, attr: str = "NORM", + 
cim10: List[str] = None, ignore_excluded: bool = False, ignore_space_tokens: bool = False, term_matcher: Literal["exact", "simstring"] = "exact", @@ -75,6 +76,9 @@ def create_component( The pipeline object name : str The name of the component + cim10 : str + List of cim10 to retrieve. If None, all cim10 will be searched, + resulting in higher computation time. attr : str The default attribute to use for matching. ignore_excluded : bool @@ -104,7 +108,7 @@ def create_component( nlp=nlp, name=name, regex=dict(), - terms=get_patterns(), + terms=get_patterns(cim10), attr=attr, ignore_excluded=ignore_excluded, ignore_space_tokens=ignore_space_tokens, diff --git a/edsnlp/pipes/ner/cim10/patterns.py b/edsnlp/pipes/ner/cim10/patterns.py index 43e19064d..b6ac84d0a 100644 --- a/edsnlp/pipes/ner/cim10/patterns.py +++ b/edsnlp/pipes/ner/cim10/patterns.py @@ -5,7 +5,7 @@ from edsnlp import BASE_DIR -def get_patterns() -> Dict[str, List[str]]: +def get_patterns(cim10: List[str] = None) -> Dict[str, List[str]]: df = pd.read_csv(BASE_DIR / "resources" / "cim10.csv.gz") df["code_pattern"] = df["code"] @@ -30,4 +30,6 @@ def get_patterns() -> Dict[str, List[str]]: patterns = df.groupby("code")["patterns"].agg(list).to_dict() + patterns = {k: v for k, v in patterns.items() if k in cim10} if cim10 else patterns + return patterns diff --git a/edsnlp/pipes/ner/drugs/factory.py b/edsnlp/pipes/ner/drugs/factory.py index f346a16b1..bf840bfd4 100644 --- a/edsnlp/pipes/ner/drugs/factory.py +++ b/edsnlp/pipes/ner/drugs/factory.py @@ -1,4 +1,4 @@ -from typing import Any, Dict +from typing import Any, Dict, List from typing_extensions import Literal @@ -28,6 +28,7 @@ def create_component( name: str = "drugs", *, attr: str = "NORM", + atc: List[str] = None, ignore_excluded: bool = False, ignore_space_tokens: bool = False, term_matcher: Literal["exact", "simstring"] = "exact", @@ -83,6 +84,9 @@ def create_component( The name of the component attr : str The default attribute to use for matching. 
+ atc : str + List of atc to retrieve. If None, all atc will be searched, + resulting in higher computation time. ignore_excluded : bool Whether to skip excluded tokens (requires an upstream pipeline to mark excluded tokens). @@ -111,7 +115,7 @@ def create_component( nlp=nlp, name=name, regex=dict(), - terms=get_patterns(), + terms=get_patterns(atc), attr=attr, ignore_excluded=ignore_excluded, ignore_space_tokens=ignore_space_tokens, diff --git a/edsnlp/pipes/ner/drugs/patterns.py b/edsnlp/pipes/ner/drugs/patterns.py index 1ea480f81..3ecdde4fe 100644 --- a/edsnlp/pipes/ner/drugs/patterns.py +++ b/edsnlp/pipes/ner/drugs/patterns.py @@ -6,6 +6,15 @@ drugs_file = BASE_DIR / "resources" / "drugs.json" -def get_patterns() -> Dict[str, List[str]]: +def filter_dict_by_keys(D: Dict[str, List[str]], L: List[str]): + filtered_dict = { + k: v for k, v in D.items() if any(k.startswith(prefix) for prefix in L) + } + return filtered_dict + + +def get_patterns(atc: List[str] = None) -> Dict[str, List[str]]: with open(drugs_file, "r") as f: - return json.load(f) + patterns = json.load(f) + patterns = {k: v for k, v in patterns.items() if k in atc} if atc else patterns + return patterns From e2fc8051c1a8f2c8adce9a291827615830de21ff Mon Sep 17 00:00:00 2001 From: svittoz <137794505+svittoz@users.noreply.github.com> Date: Wed, 28 Aug 2024 18:24:56 +0200 Subject: [PATCH 3/4] Update patterns.py --- edsnlp/pipes/ner/drugs/patterns.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/edsnlp/pipes/ner/drugs/patterns.py b/edsnlp/pipes/ner/drugs/patterns.py index 3ecdde4fe..7a42f0d7d 100644 --- a/edsnlp/pipes/ner/drugs/patterns.py +++ b/edsnlp/pipes/ner/drugs/patterns.py @@ -5,14 +5,6 @@ drugs_file = BASE_DIR / "resources" / "drugs.json" - -def filter_dict_by_keys(D: Dict[str, List[str]], L: List[str]): - filtered_dict = { - k: v for k, v in D.items() if any(k.startswith(prefix) for prefix in L) - } - return filtered_dict - - def get_patterns(atc: List[str] = None) -> Dict[str, 
List[str]]: with open(drugs_file, "r") as f: patterns = json.load(f) From b4d8dfb8874a9d628cc67e199ca084d947415221 Mon Sep 17 00:00:00 2001 From: svittoz <137794505+svittoz@users.noreply.github.com> Date: Wed, 28 Aug 2024 18:27:04 +0200 Subject: [PATCH 4/4] Update patterns.py --- edsnlp/pipes/ner/drugs/patterns.py | 1 + 1 file changed, 1 insertion(+) diff --git a/edsnlp/pipes/ner/drugs/patterns.py b/edsnlp/pipes/ner/drugs/patterns.py index 7a42f0d7d..fe762998c 100644 --- a/edsnlp/pipes/ner/drugs/patterns.py +++ b/edsnlp/pipes/ner/drugs/patterns.py @@ -5,6 +5,7 @@ drugs_file = BASE_DIR / "resources" / "drugs.json" + def get_patterns(atc: List[str] = None) -> Dict[str, List[str]]: with open(drugs_file, "r") as f: patterns = json.load(f)