From 07871ed4feca7dbbbd6f4dd578bdb460d5aa7d3e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Christel=20G=C3=A9rardin?=
Date: Wed, 24 May 2023 14:50:40 +0000
Subject: [PATCH] Bug fixes, table pipe is more robust, Updated doc

---
 docs/pipelines/misc/measurements.md      |   9 +-
 docs/pipelines/misc/tables.md            | 105 +++++++++++++++++++++++
 edsnlp/pipelines/misc/tables/factory.py  |   6 +-
 edsnlp/pipelines/misc/tables/patterns.py |   4 +-
 edsnlp/pipelines/misc/tables/tables.py   |  66 +++++++++++---
 5 files changed, 172 insertions(+), 18 deletions(-)
 create mode 100644 docs/pipelines/misc/tables.md

diff --git a/docs/pipelines/misc/measurements.md b/docs/pipelines/misc/measurements.md
index a30041025..5aeb5a850 100644
--- a/docs/pipelines/misc/measurements.md
+++ b/docs/pipelines/misc/measurements.md
@@ -8,7 +8,7 @@ We use simple regular expressions to extract and normalize measurements, and use

 By default, the `eds.measurements` pipeline lets you match all measurements, i.e. measurements in most units as well as unitless measurements. If a unit is not in our register, you can add it manually. Otherwise, the measurement will be matched without its unit.

-If you prefer to match specific measurements only, you can create your own measurement config. Nevertheless, some default measurements configs are already provided out of the box:
+If you prefer matching specific measurements only, you can create your own measurement config and set the `all_measurements` parameter to `False`. Nevertheless, some default measurement configs are already provided out of the box:

 | Measurement name | Example |
 | ---------------- | ---------------------- |
@@ -16,6 +16,7 @@ If you prefer to match specific measurements only, you can create your own measu
 | `eds.weight` | `12kg`, `1kg300` |
 | `eds.bmi` | `BMI: 24`, `24 kg.m-2` |
 | `eds.volume` | `2 cac`, `8ml` |
+| `eds.bool` | `positive`, `negatif` |

 The normalized value can then be accessed via the `span._.value` attribute and converted on the fly to a desired unit (eg `span._.value.g_per_cl` or `span._.value.kg_per_m3` for a density).

@@ -25,7 +26,7 @@ The measurements that can be extracted can have one or many of the following cha
 - Measurements with range indication (especially < or >)
 - Measurements with power

-The measurement can be written in many coplex forms. Among them, this pipe can detect:
+The measurement can be written in many complex forms. Among them, this pipe can detect:

 - Measurements with range indication, numerical value, power and units in many different orders, separated by customizable stop words
 - Composed units (eg `1m50`)
 - Measurements with "unitless patterns", i.e. some textual information next to a numerical value which allows us to retrieve a unit even if it is not written (eg in the text `Height: 80`, this pipe will detect the numerical value `80` and match it to the unit `kg`)
@@ -33,7 +34,9 @@ The measurement can be written in many coplex forms. Among them, this pipe can d

 ## Usage

-The matched measurements are labelised with `eds.measurement` by default. However, if you are only creating your own measurement or using a predefined one, your measurements will be labeled with the name of this measurement (eg `eds.weight`).
+This pipe works better when the `eds.dates` and `eds.tables` pipes are also added to the pipeline. They let `eds.measurements` skip dates instead of matching them as measurements, and run a dedicated matching pass on each detected table, benefiting from the structured data, as sketched below.
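+
+For instance, a minimal sketch of such a pipeline (illustrative only: every component is added with its default configuration, and the `measurements` span key used below is assumed to be the default storage key):
+
+```python
+import spacy
+
+nlp = spacy.blank("fr")
+nlp.add_pipe("eds.normalizer")
+nlp.add_pipe("eds.dates")  # detected dates will not be matched as measurements
+nlp.add_pipe("eds.tables")  # detected tables get a dedicated matching
+nlp.add_pipe("eds.measurements")
+
+doc = nlp("Poids : 72 kg, taille 1m81, le 12/05/2021.")
+doc.spans["measurements"]
+# -> the weight and the size are matched, the date is skipped
+```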
+
+The matched measurements are labeled with the name of a default measurement config when one applies (eg `eds.size`). Otherwise, if `all_measurements` is set to `True`, measurements that are only linked to the dimension of their unit are labeled `eds.measurement`.

 As said before, each matched measurement can be accessed via the `span._.value`. This gives you a `SimpleMeasurement` object with the following attributes :
 - `value_range` ("<", "=" or ">")
diff --git a/docs/pipelines/misc/tables.md b/docs/pipelines/misc/tables.md
new file mode 100644
index 000000000..0acc54f9d
--- /dev/null
+++ b/docs/pipelines/misc/tables.md
@@ -0,0 +1,105 @@
+# Tables
+
+The `eds.tables` pipeline's role is to detect tables present in a medical document.
+We use simple regular expressions to extract tables as text spans.
+
+## Usage
+
+This pipe lets you match different forms of tables. Tables may or may not have a frame, and rows can be spread over multiple consecutive lines (after a bad parsing, for example). You can also indicate the presence of headers with the `col_names` and `row_names` boolean parameters.
+
+Each matched table is returned as a `Span` object. You can then access an equivalent dictionary-formatted table with the `table` extension, or use `to_pd_table()` to get the equivalent pandas DataFrame. The keys of the dictionary are determined as follows:
+
+- If `col_names` is `True`, the dictionary keys are the column names (str).
+- Otherwise, if `row_names` is `True`, the dictionary keys are the row names (str).
+- Otherwise, the dictionary keys are the column indexes (int).
+
+`to_pd_table()` can be customised with the `as_spans` parameter: if set to `True`, the pandas DataFrame contains the cells as `Span` objects, else it contains the cells as raw strings.
+
+```python
+import spacy
+
+nlp = spacy.blank("fr")
+nlp.add_pipe("eds.normalizer")
+nlp.add_pipe("eds.tables")
+
+text = """
+SERVICE
+MEDECINE INTENSIVE –
+REANIMATION
+Réanimation / Surveillance Continue
+Médicale
+
+COMPTE RENDU D'HOSPITALISATION du 05/06/2020 au 10/06/2020
+Madame DUPONT Marie, née le 16/05/1900, âgée de 20 ans, a été hospitalisée en réanimation du
+05/06/1920 au 10/06/1920 pour intoxication médicamenteuse volontaire.
+
+
+Examens complémentaires
+Hématologie
+Numération
+Leucocytes ¦x10*9/L ¦4.97 ¦4.09-11
+Hématies ¦x10*12/L¦4.68 ¦4.53-5.79
+Hémoglobine ¦g/dL ¦14.8 ¦13.4-16.7
+Hématocrite ¦% ¦44.2 ¦39.2-48.6
+VGM ¦fL ¦94.4 + ¦79.6-94
+TCMH ¦pg ¦31.6 ¦27.3-32.8
+CCMH ¦g/dL ¦33.5 ¦32.4-36.3
+Plaquettes ¦x10*9/L ¦191 ¦172-398
+VMP ¦fL ¦11.5 + ¦7.4-10.8
+
+Sur le plan neurologique : Devant la persistance d'une confusion à distance de l'intoxication au
+...
+
+2/2Pat : |F | | |Intitulé RCP
+
+"""
+
+doc = nlp(text)
+
+# A table span
+table = doc.spans["tables"][0]
+# Leucocytes ¦x10*9/L ¦4.97 ¦4.09-11
+# Hématies ¦x10*12/L¦4.68 ¦4.53-5.79
+# Hémoglobine ¦g/dL ¦14.8 ¦13.4-16.7
+# Hématocrite ¦% ¦44.2 ¦39.2-48.6
+# VGM ¦fL ¦94.4 + ¦79.6-94
+# TCMH ¦pg ¦31.6 ¦27.3-32.8
+# CCMH ¦g/dL ¦33.5 ¦32.4-36.3
+# Plaquettes ¦x10*9/L ¦191 ¦172-398
+# VMP ¦fL ¦11.5 + ¦7.4-10.8
+
+# Convert the span to a pandas DataFrame
+df = table._.to_pd_table(as_spans=False)
+type(df)
+# >> pandas.core.frame.DataFrame
+```
+
+The resulting pandas DataFrame:
+
+|      | 0           | 1        | 2      | 3         |
+| ---: | :---------- | :------- | :----- | :-------- |
+|    0 | Leucocytes  | x10*9/L  | 4.97   | 4.09-11   |
+|    1 | Hématies    | x10*12/L | 4.68   | 4.53-5.79 |
+|    2 | Hémoglobine | g/dL     | 14.8   | 13.4-16.7 |
+|    3 | Hématocrite | %        | 44.2   | 39.2-48.6 |
+|    4 | VGM         | fL       | 94.4 + | 79.6-94   |
+|    5 | TCMH        | pg       | 31.6   | 27.3-32.8 |
+|    6 | CCMH        | g/dL     | 33.5   | 32.4-36.3 |
+|    7 | Plaquettes  | x10*9/L  | 191    | 172-398   |
+|    8 | VMP         | fL       | 11.5 + | 7.4-10.8  |
+
+## Declared extensions
+
+The `eds.tables` pipeline declares two [spaCy extensions](https://spacy.io/usage/processing-pipelines#custom-components-attributes) on the `Span` object. The first one is the `to_pd_table()` method, which returns the table parsed as a pandas DataFrame. The second one is the `table` attribute, which stores the table as a dictionary whose cells are `Span` objects.
+
+## Configuration
+
+The pipeline can be configured using the following parameters:
+
+| Parameter         | Explanation                                      | Default                              |
+| ----------------- | ------------------------------------------------ | ------------------------------------ |
+| `tables_pattern`  | Pattern(s) to identify table spans               | `None` (defaults from `patterns.py`) |
+| `sep_pattern`     | Pattern(s) to identify the column separator      | `None` (defaults from `patterns.py`) |
+| `col_names`       | Whether the first row contains column names      | `False`                              |
+| `row_names`       | Whether the first column contains row names      | `False`                              |
+| `ignore_excluded` | Ignore excluded tokens                           | `True`                               |
+| `attr`            | spaCy attribute to match on, eg `NORM` or `TEXT` | `"TEXT"`                             |
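+
+For instance, a hedged sketch of a non-default configuration restricted to `|`-separated tables with column headers (the two patterns below are the ones listed in `patterns.py`; having to pass both lists together is an assumption):
+
+```python
+import spacy
+
+nlp = spacy.blank("fr")
+nlp.add_pipe("eds.normalizer")
+nlp.add_pipe(
+    "eds.tables",
+    config=dict(
+        tables_pattern=[r"(?:\|?(?:[^\|\n]*\|)+[^\|\n]*\|?\n)+"],
+        sep_pattern=[r"|"],
+        col_names=True,
+    ),
+)
+```
+
+With `col_names=True`, the first row of each matched table is used as the keys of the `table` dictionary, as described in the Usage section above.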
+
+## Authors and citation
+
+The `eds.tables` pipeline was developed by AP-HP's Data Science team.
diff --git a/edsnlp/pipelines/misc/tables/factory.py b/edsnlp/pipelines/misc/tables/factory.py
index 801dea164..d796a56c5 100644
--- a/edsnlp/pipelines/misc/tables/factory.py
+++ b/edsnlp/pipelines/misc/tables/factory.py
@@ -1,4 +1,4 @@
-from typing import Dict, List, Optional, Union
+from typing import List, Optional

 from spacy.language import Language

@@ -20,8 +20,8 @@ def create_component(
     nlp: Language,
     name: str,
-    tables_pattern: Optional[Dict[str, Union[List[str], str]]],
-    sep_pattern: Optional[str],
+    tables_pattern: Optional[List[str]],
+    sep_pattern: Optional[List[str]],
     attr: str,
     ignore_excluded: bool,
     col_names: Optional[bool] = False,
diff --git a/edsnlp/pipelines/misc/tables/patterns.py b/edsnlp/pipelines/misc/tables/patterns.py
index 7f29c7b4c..233bc47b6 100644
--- a/edsnlp/pipelines/misc/tables/patterns.py
+++ b/edsnlp/pipelines/misc/tables/patterns.py
@@ -1,2 +1,2 @@
-sep = r"¦"
-regex = rf"(?:{sep}?(?:[^{sep}\n]*{sep})+[^{sep}\n]*{sep}?\n)+"
+sep = [r"¦", r"|"]
+regex = [r"(?:¦?(?:[^¦\n]*¦)+[^¦\n]*¦?\n)+", r"(?:\|?(?:[^\|\n]*\|)+[^\|\n]*\|?\n)+"]
diff --git a/edsnlp/pipelines/misc/tables/tables.py b/edsnlp/pipelines/misc/tables/tables.py
index d6e2ee1f7..8262b0bc0 100644
--- a/edsnlp/pipelines/misc/tables/tables.py
+++ b/edsnlp/pipelines/misc/tables/tables.py
@@ -1,4 +1,4 @@
-from typing import Dict, Optional, Union
+from typing import List, Optional

 import pandas as pd
 from spacy.language import Language

@@ -19,10 +19,10 @@ class TablesMatcher:
     ----------
     nlp : Language
         spaCy nlp pipeline to use for matching.
-    tables_pattern : Optional[str]
-        The regex pattern to identify tables.
-    sep_pattern : Optional[str]
-        The regex pattern to identify separators
+    tables_pattern : Optional[List[str]]
+        The regex patterns to identify tables.
+    sep_pattern : Optional[List[str]]
+        The regex patterns to identify separators
         in the detected tables
     col_names : Optional[bool]
         Whether the tables_pattern matches column names
@@ -39,9 +39,9 @@ class TablesMatcher:
     def __init__(
         self,
         nlp: Language,
-        tables_pattern: Optional[str],
-        sep_pattern: Optional[str],
-        attr: Union[Dict[str, str], str],
+        tables_pattern: Optional[List[str]],
+        sep_pattern: Optional[List[str]],
+        attr: str,
         ignore_excluded: bool,
         col_names: Optional[bool] = False,
         row_names: Optional[bool] = False,
@@ -54,7 +54,7 @@ def __init__(
             sep_pattern = patterns.sep

         self.regex_matcher = RegexMatcher(attr=attr, ignore_excluded=True)
-        self.regex_matcher.add("table", [tables_pattern])
+        self.regex_matcher.add("table", tables_pattern)

         self.term_matcher = EDSPhraseMatcher(nlp.vocab, attr=attr, ignore_excluded=True)
         self.term_matcher.build_patterns(
@@ -138,7 +138,53 @@
             if all(row[-1].start == row[-1].end for row in processed_table):
                 processed_table = [row[:-1] for row in processed_table]

-            tables_list.append(processed_table)
+            # Check if all rows have the same dimension.
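+            # (a row may have been split over several consecutive lines by a
+            # bad parsing, in which case its cells are spread over several
+            # shorter rows)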
+            # If not, try to merge neighbour rows
+            # to find a new table
+            row_len = len(processed_table[0])
+            if not all(len(row) == row_len for row in processed_table):
+
+                # Method to find all possible lengths of the rows
+                def divisors(n):
+                    result = set()
+                    for i in range(1, int(n**0.5) + 1):
+                        if n % i == 0:
+                            result.add(i)
+                            result.add(n // i)
+                    return sorted(list(result))
+
+                if self.col_names:
+                    n_rows = len(processed_table) - 1
+                else:
+                    n_rows = len(processed_table)
+
+                for n_rows_to_merge in divisors(n_rows):
+                    row_len = sum(len(row) for row in processed_table[:n_rows_to_merge])
+                    if all(
+                        sum(
+                            len(row)
+                            for row in processed_table[
+                                i * n_rows_to_merge : (i + 1) * n_rows_to_merge
+                            ]
+                        )
+                        == row_len
+                        for i in range(n_rows // n_rows_to_merge)
+                    ):
+                        processed_table = [
+                            [
+                                cell
+                                for subrow in processed_table[
+                                    i * n_rows_to_merge : (i + 1) * n_rows_to_merge
+                                ]
+                                for cell in subrow
+                            ]
+                            for i in range(n_rows // n_rows_to_merge)
+                        ]
+                        tables_list.append(processed_table)
+                        break
+                continue
+            else:
+                tables_list.append(processed_table)

         # Convert to dictionnaries according to self.col_names
         # and self.row_names