From 1072b1ebe35b85aa65d439db7de88613861415f7 Mon Sep 17 00:00:00 2001
From: Thomas PETIT-JEAN
Date: Mon, 19 Dec 2022 15:59:10 +0100
Subject: [PATCH 1/5] feat: add option to sentenciser pipeline regarding newlines

---
 docs/pipelines/core/sentences.md              | 30 ++++++++++---
 edsnlp/pipelines/core/sentences/factory.py    |  3 ++
 edsnlp/pipelines/core/sentences/sentences.pxd |  5 ++-
 edsnlp/pipelines/core/sentences/sentences.pyx | 33 +++++++++++---
 edsnlp/pipelines/core/sentences/terms.py      | 43 +++++++++++++++++++
 tests/pipelines/core/test_sentences.py        | 35 +++++++++++++++
 6 files changed, 138 insertions(+), 11 deletions(-)

diff --git a/docs/pipelines/core/sentences.md b/docs/pipelines/core/sentences.md
index 9bd34a166..b56d9b3a5 100644
--- a/docs/pipelines/core/sentences.md
+++ b/docs/pipelines/core/sentences.md
@@ -2,7 +2,7 @@
 
 The `eds.sentences` pipeline provides an alternative to spaCy's default `sentencizer`, aiming to overcome some of its limitations.
 
-Indeed, the `sentencizer` merely looks at period characters to detect the end of a sentence, a strategy that often fails in a clinical note settings. Our `sentences` component also classifies end-of-lines as sentence boundaries if the subsequent token begins with an uppercase character, leading to slightly better performances.
+Indeed, the `sentencizer` merely looks at period characters to detect the end of a sentence, a strategy that often fails in a clinical note setting. Our `sentences` component also classifies end-of-lines as sentence boundaries if the subsequent token begins with an uppercase character, leading to slightly better performance. This exact behaviour can be adjusted using the `split_on_newlines` parameter (see below).
 
 Moreover, the `eds.sentences` pipeline can use the output of the `eds.normalizer` pipeline, and more specifically the end-of-line classification. This is activated by default.
 
@@ -61,10 +61,30 @@ Notice how EDS-NLP's implementation is more robust to ill-defined sentence endings.
 
 The pipeline can be configured using the following parameters :
 
-| Parameter      | Explanation                                                              | Default                           |
-| -------------- | ------------------------------------------------------------------------ | --------------------------------- |
-| `punct_chars`  | Punctuation patterns                                                      | `None` (use pre-defined patterns) |
-| `use_endlines` | Whether to use endlines prediction (see [documentation](./endlines.md))   | `True`                            |
+| Parameter           | Explanation                                                              | Default                           |
+| ------------------- | ------------------------------------------------------------------------ | --------------------------------- |
+| `punct_chars`       | Punctuation patterns                                                      | `None` (use pre-defined patterns) |
+| `use_endlines`      | Whether to use endlines prediction (see [documentation](./endlines.md))   | `True`                            |
+| `split_on_newlines` | Rule used to decide whether a newline (`\n`) acts as a sentence split     | `with_capitalized`                |
+
+### The `split_on_newlines` parameter
+
+=== "`with_capitalized` (Default)"
+
+    The rule applied here is to consider a newline as a sentence split if the following token is capitalized,
+    i.e. its first letter is uppercase and its other letters are lowercase.
+    This rule should cover most cases, but might be problematic with long lists of fully uppercased strings (e.g. lists of commercial drug names).
+
+=== "`with_uppercase`"
+
+    The rule applied here is to consider a newline as a sentence split if the following token starts with an uppercase letter,
+    regardless of the other letters.
+    This rule corrects the problem of long fully uppercased texts, but might wrongly split sentences, e.g. around acronyms.
+
+=== "`False`"
+
+    Newlines alone never trigger a sentence split.
 
 ## Authors and citation
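Not part of the diff: a minimal usage sketch of the option added above, assuming EDS-NLP is installed so that the `eds.sentences` factory is available through spaCy's entry points.

```python
import spacy

nlp = spacy.blank("fr")
nlp.add_pipe(
    "eds.sentences",
    # or "with_capitalized" (the default), or False
    config=dict(split_on_newlines="with_uppercase"),
)

doc = nlp("Liste de médicaments\nDOLIPRANE mille milligrammes")
# "DOLIPRANE" is fully uppercased: "with_uppercase" should split on the
# newline here, whereas the default "with_capitalized" rule should not.
print(len(list(doc.sents)))
```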
diff --git a/edsnlp/pipelines/core/sentences/factory.py b/edsnlp/pipelines/core/sentences/factory.py
index 5f7731a5f..f2c3f402a 100644
--- a/edsnlp/pipelines/core/sentences/factory.py
+++ b/edsnlp/pipelines/core/sentences/factory.py
@@ -10,6 +10,7 @@
     punct_chars=None,
     ignore_excluded=True,
     use_endlines=None,
+    split_on_newlines="with_capitalized",
 )
 
 
@@ -30,10 +31,12 @@ def create_component(
     punct_chars: Optional[List[str]],
     use_endlines: Optional[bool],
     ignore_excluded: bool,
+    split_on_newlines: Optional[str],
 ):
     return SentenceSegmenter(
         nlp.vocab,
         punct_chars=punct_chars,
         use_endlines=use_endlines,
         ignore_excluded=ignore_excluded,
+        split_on_newlines=split_on_newlines,
     )
diff --git a/edsnlp/pipelines/core/sentences/sentences.pxd b/edsnlp/pipelines/core/sentences/sentences.pxd
index 531c55830..6630fa19b 100644
--- a/edsnlp/pipelines/core/sentences/sentences.pxd
+++ b/edsnlp/pipelines/core/sentences/sentences.pxd
@@ -4,6 +4,7 @@ from libcpp.vector cimport vector
 from spacy.tokens.doc cimport Doc
 from spacy.typedefs cimport attr_t
 
+cdef enum split_options: WITH_CAPITALIZED, WITH_UPPERCASE, NONE
 
 cdef class SentenceSegmenter(object):
     cdef bool ignore_excluded
@@ -12,5 +13,7 @@ cdef class SentenceSegmenter(object):
     cdef attr_t endline_hash
     cdef set[attr_t] punct_chars_hash
     cdef set[attr_t] capitalized_shapes_hash
+    cdef set[attr_t] capitalized_chars_hash
+    cdef split_options split_on_newlines
 
-    cdef void process(self, Doc doc) nogil
+    cdef void process(self, Doc doc) nogil
\ No newline at end of file
diff --git a/edsnlp/pipelines/core/sentences/sentences.pyx b/edsnlp/pipelines/core/sentences/sentences.pyx
index 7123e72eb..2c66e9451 100644
--- a/edsnlp/pipelines/core/sentences/sentences.pyx
+++ b/edsnlp/pipelines/core/sentences/sentences.pyx
@@ -3,13 +3,13 @@ from typing import Iterable, List, Optional
 
 from libcpp cimport bool
 
 # from spacy.typedefs cimport attr_t
-from spacy.attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
+from spacy.attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE, IS_UPPER
 from spacy.lexeme cimport Lexeme
 from spacy.tokens.doc cimport Doc
 from spacy.tokens.token cimport TokenC
 from spacy.vocab cimport Vocab
 
-from .terms import punctuation
+from .terms import punctuation, uppercase
 
 
 cdef class SentenceSegmenter(object):
@@ -37,6 +37,7 @@ cdef class SentenceSegmenter(object):
         punct_chars: Optional[List[str]],
         use_endlines: bool,
         ignore_excluded: bool = True,
+        split_on_newlines: Optional[str] = "with_capitalized",
     ):
 
         if punct_chars is None:
@@ -48,6 +49,23 @@ cdef class SentenceSegmenter(object):
         self.endline_hash = vocab.strings["ENDLINE"]
         self.punct_chars_hash = {vocab.strings[c] for c in punct_chars}
         self.capitalized_shapes_hash = {vocab.strings[shape] for shape in ("Xx", "Xxx", "Xxxx", "Xxxxx")}
+        self.capitalized_chars_hash = {vocab.strings[letter] for letter in uppercase}
+
+        options = {
+            "with_capitalized": 0, 
+            "with_uppercase": 1, 
+            False: 2
+        }
+        chosen = options.get(split_on_newlines, None)
+        if chosen is None:
+            raise ValueError(
+                (
+                    "Incorrect value for 'split_on_newlines'. "
+                    f"Provided: {split_on_newlines}\n"
+                    f"Available: {options}."
+                )
+            )
+        self.split_on_newlines = chosen
 
         if use_endlines:
             print("The use_endlines is deprecated and has been replaced by the ignore_excluded parameter")
@@ -90,16 +108,21 @@ cdef class SentenceSegmenter(object):
             is_newline = Lexeme.c_check_flag(token.lex, IS_SPACE) and token.lex.orth == self.newline_hash
 
             if seen_period or seen_newline:
-                if seen_period and Lexeme.c_check_flag(token.lex, IS_DIGIT):
-                    continue
                 if is_in_punct_chars or is_newline or Lexeme.c_check_flag(token.lex, IS_PUNCT):
                     continue
+                if seen_period and Lexeme.c_check_flag(token.lex, IS_DIGIT):
+                    seen_newline = False
+                    seen_period = False
+                    continue
                 if seen_period:
                     doc.c[i].sent_start = 1
                     seen_newline = False
                     seen_period = False
                 else:
-                    doc.c[i].sent_start = 1 if self.capitalized_shapes_hash.const_find(token.lex.shape) != self.capitalized_shapes_hash.const_end() else -1
+                    if self.split_on_newlines == WITH_UPPERCASE:
+                        doc.c[i].sent_start = 1 if self.capitalized_chars_hash.const_find(token.lex.prefix) != self.capitalized_chars_hash.const_end() else -1
+                    elif self.split_on_newlines == WITH_CAPITALIZED:
+                        doc.c[i].sent_start = 1 if self.capitalized_shapes_hash.const_find(token.lex.shape) != self.capitalized_shapes_hash.const_end() else -1
                     seen_newline = False
                     seen_period = False
             elif is_in_punct_chars:
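Not part of the diff: a rough Python paraphrase of the branch above, to make the two newline rules concrete. `with_capitalized` tests the lexeme's shape, while `with_uppercase` only tests its first character against the `uppercase` list added to `terms.py` just below (the real code compares hashed lexeme attributes in Cython).

```python
from typing import Union

CAPITALIZED_SHAPES = {"Xx", "Xxx", "Xxxx", "Xxxxx"}

def splits_after_newline(text: str, shape: str, rule: Union[str, bool]) -> bool:
    """Paraphrase of the sent_start decision for the token following a newline."""
    if rule == "with_capitalized":
        # "Peut" has shape "Xxxx" -> split; "ET" has shape "XX" -> no split
        return shape in CAPITALIZED_SHAPES
    if rule == "with_uppercase":
        # Only the first character matters: "Peut" and "ET" both split.
        # str.isupper() approximates the explicit list of French uppercase letters.
        return text[:1].isupper()
    return False  # split_on_newlines=False: a newline alone never splits
```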
+ ) + ) + self.split_on_newlines = chosen if use_endlines: print("The use_endlines is deprecated and has been replaced by the ignore_excluded parameter") @@ -90,16 +108,21 @@ cdef class SentenceSegmenter(object): is_newline = Lexeme.c_check_flag(token.lex, IS_SPACE) and token.lex.orth == self.newline_hash if seen_period or seen_newline: - if seen_period and Lexeme.c_check_flag(token.lex, IS_DIGIT): - continue if is_in_punct_chars or is_newline or Lexeme.c_check_flag(token.lex, IS_PUNCT): continue + if seen_period and Lexeme.c_check_flag(token.lex, IS_DIGIT): + continue + seen_newline = False + seen_period = False if seen_period: doc.c[i].sent_start = 1 seen_newline = False seen_period = False else: - doc.c[i].sent_start = 1 if self.capitalized_shapes_hash.const_find(token.lex.shape) != self.capitalized_shapes_hash.const_end() else -1 + if self.split_on_newlines == WITH_UPPERCASE: + doc.c[i].sent_start = 1 if self.capitalized_chars_hash.const_find(token.lex.prefix) != self.capitalized_chars_hash.const_end() else -1 + elif self.split_on_newlines == WITH_CAPITALIZED: + doc.c[i].sent_start = 1 if self.capitalized_shapes_hash.const_find(token.lex.shape) != self.capitalized_shapes_hash.const_end() else -1 seen_newline = False seen_period = False elif is_in_punct_chars: diff --git a/edsnlp/pipelines/core/sentences/terms.py b/edsnlp/pipelines/core/sentences/terms.py index 2e0a422c3..537daeb92 100644 --- a/edsnlp/pipelines/core/sentences/terms.py +++ b/edsnlp/pipelines/core/sentences/terms.py @@ -129,3 +129,46 @@ "。", "。", ] + +uppercase = [ + "A", + "À", + "Â", + "B", + "C", + "D", + "E", + "É", + "È", + "Ê", + "Ë", + "F", + "G", + "H", + "I", + "Î", + "Ï", + "J", + "K", + "L", + "M", + "N", + "O", + "Ô", + "Œ", + "P", + "Q", + "R", + "S", + "T", + "U", + "Ù", + "Û", + "Ü", + "V", + "W", + "X", + "Y", + "Ÿ", + "Z", +] \ No newline at end of file diff --git a/tests/pipelines/core/test_sentences.py b/tests/pipelines/core/test_sentences.py index b4e078393..10411c0d1 100644 --- a/tests/pipelines/core/test_sentences.py +++ b/tests/pipelines/core/test_sentences.py @@ -50,3 +50,38 @@ def test_false_positives(blank_nlp): for fp in false_positives: doc = blank_nlp(fp) assert len(list(doc.sents)) == 1 + + +@mark.parametrize( + "split_options", + [ + dict( + split_on_newlines=False, + n_sents=2, + ), + dict( + split_on_newlines="with_capitalized", + n_sents=3, + ), + dict( + split_on_newlines="with_uppercase", + n_sents=4, + ), + ], +) +def test_newline_split_options(blank_nlp, split_options): + + text = "Une première phrase. " + text += "Une deuxième\n" + text += "Peut-être un autre\n" + text += "ET encore une." + + segmenter = SentenceSegmenter( + blank_nlp.vocab, + punct_chars=terms.punctuation, + use_endlines=False, + split_on_newlines=split_options["split_on_newlines"], + ) + + doc = segmenter(blank_nlp(text)) + assert len(list(doc.sents)) == split_options["n_sents"] From 4b1454d1e015ccc15bb9630bf6d73e5fc107d97a Mon Sep 17 00:00:00 2001 From: Thomas PETIT-JEAN Date: Mon, 19 Dec 2022 16:03:06 +0100 Subject: [PATCH 2/5] chore: changelog --- changelog.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/changelog.md b/changelog.md index 59c5a3405..1914f0e42 100644 --- a/changelog.md +++ b/changelog.md @@ -1,5 +1,14 @@ # Changelog +## Pending + +### Added +- `split_on_newlines` parameter to the `sentences` pipeline. + +### Fixed +- `sentences` issue with punctiation followed by a digit. 
From 4b1454d1e015ccc15bb9630bf6d73e5fc107d97a Mon Sep 17 00:00:00 2001
From: Thomas PETIT-JEAN
Date: Mon, 19 Dec 2022 16:03:06 +0100
Subject: [PATCH 2/5] chore: changelog

---
 changelog.md | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/changelog.md b/changelog.md
index 59c5a3405..1914f0e42 100644
--- a/changelog.md
+++ b/changelog.md
@@ -1,5 +1,14 @@
 # Changelog
 
+## Pending
+
+### Added
+- `split_on_newlines` parameter to the `sentences` pipeline.
+
+### Fixed
+- `sentences` issue with punctiation followed by a digit.
+
+
 ## v0.7.4 (2022-12-12)
 
 ### Added

From dcaed0f71a392b074869c139c863c517b00460f4 Mon Sep 17 00:00:00 2001
From: Thomas Petit-Jean <30775613+Thomzoy@users.noreply.github.com>
Date: Mon, 19 Dec 2022 16:13:45 +0100
Subject: [PATCH 3/5] Update changelog.md

---
 changelog.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/changelog.md b/changelog.md
index 1914f0e42..ec217840d 100644
--- a/changelog.md
+++ b/changelog.md
@@ -6,7 +6,7 @@
 - `split_on_newlines` parameter to the `sentences` pipeline.
 
 ### Fixed
-- `sentences` issue with punctiation followed by a digit.
+- `eds.sentences` issue with punctuation followed by a digit.
 
 
 ## v0.7.4 (2022-12-12)
From de773934e3fcb318e104c93e4060a4e82f7f6a57 Mon Sep 17 00:00:00 2001
From: Thomas PETIT-JEAN
Date: Mon, 19 Dec 2022 16:23:06 +0100
Subject: [PATCH 4/5] Linting

---
 docs/pipelines/core/sentences.md              |  8 ++++----
 edsnlp/pipelines/core/sentences/sentences.pxd |  3 ++-
 edsnlp/pipelines/core/sentences/sentences.pyx | 16 ++++++++++++----
 edsnlp/pipelines/core/sentences/terms.py      |  2 +-
 4 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/docs/pipelines/core/sentences.md b/docs/pipelines/core/sentences.md
index b56d9b3a5..b78b0f4df 100644
--- a/docs/pipelines/core/sentences.md
+++ b/docs/pipelines/core/sentences.md
@@ -71,15 +71,15 @@
 === "`with_capitalized` (Default)"
 
-    The rule applied here is to consider a newline as a sentence split if the following token is capitalized, 
-    i.e. its first letter is uppercase and its other letters are lowercase. 
+    The rule applied here is to consider a newline as a sentence split if the following token is capitalized,
+    i.e. its first letter is uppercase and its other letters are lowercase.
     This rule should cover most cases, but might be problematic with long lists of fully uppercased strings (e.g. lists of commercial drug names).
 
 === "`with_uppercase`"
 
-    The rule applied here is to consider a newline as a sentence split if the following token starts with an uppercase letter, 
+    The rule applied here is to consider a newline as a sentence split if the following token starts with an uppercase letter,
     regardless of the other letters.
-    This rule corrects the problem of long fully uppercased texts, but might wrongly split sentences, e.g. around acronyms. 
+    This rule corrects the problem of long fully uppercased texts, but might wrongly split sentences, e.g. around acronyms.
 
 === "`False`"
 
     Newlines alone never trigger a sentence split.
 
 ## Authors and citation
diff --git a/edsnlp/pipelines/core/sentences/sentences.pxd b/edsnlp/pipelines/core/sentences/sentences.pxd
index 6630fa19b..3d1341886 100644
--- a/edsnlp/pipelines/core/sentences/sentences.pxd
+++ b/edsnlp/pipelines/core/sentences/sentences.pxd
@@ -4,6 +4,7 @@ from libcpp.vector cimport vector
 from spacy.tokens.doc cimport Doc
 from spacy.typedefs cimport attr_t
 
+
 cdef enum split_options: WITH_CAPITALIZED, WITH_UPPERCASE, NONE
 
 cdef class SentenceSegmenter(object):
@@ -16,4 +17,4 @@ cdef class SentenceSegmenter(object):
     cdef set[attr_t] capitalized_chars_hash
     cdef split_options split_on_newlines
 
-    cdef void process(self, Doc doc) nogil
\ No newline at end of file
+    cdef void process(self, Doc doc) nogil
diff --git a/edsnlp/pipelines/core/sentences/sentences.pyx b/edsnlp/pipelines/core/sentences/sentences.pyx
index 2c66e9451..1b01a8271 100644
--- a/edsnlp/pipelines/core/sentences/sentences.pyx
+++ b/edsnlp/pipelines/core/sentences/sentences.pyx
@@ -3,7 +3,15 @@ from typing import Iterable, List, Optional
 
 from libcpp cimport bool
 
 # from spacy.typedefs cimport attr_t
-from spacy.attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE, IS_UPPER
+from spacy.attrs cimport (
+    IS_ALPHA,
+    IS_ASCII,
+    IS_DIGIT,
+    IS_LOWER,
+    IS_PUNCT,
+    IS_SPACE,
+    IS_UPPER,
+)
 from spacy.lexeme cimport Lexeme
 from spacy.tokens.doc cimport Doc
 from spacy.tokens.token cimport TokenC
 from spacy.vocab cimport Vocab
@@ -52,8 +60,8 @@ cdef class SentenceSegmenter(object):
         self.capitalized_chars_hash = {vocab.strings[letter] for letter in uppercase}
 
         options = {
-            "with_capitalized": 0, 
-            "with_uppercase": 1, 
+            "with_capitalized": 0,
+            "with_uppercase": 1,
             False: 2
         }
         chosen = options.get(split_on_newlines, None)
@@ -119,7 +127,7 @@ cdef class SentenceSegmenter(object):
                     seen_newline = False
                     seen_period = False
                 else:
-                    if self.split_on_newlines == WITH_UPPERCASE: 
+                    if self.split_on_newlines == WITH_UPPERCASE:
                         doc.c[i].sent_start = 1 if self.capitalized_chars_hash.const_find(token.lex.prefix) != self.capitalized_chars_hash.const_end() else -1
                     elif self.split_on_newlines == WITH_CAPITALIZED:
                         doc.c[i].sent_start = 1 if self.capitalized_shapes_hash.const_find(token.lex.shape) != self.capitalized_shapes_hash.const_end() else -1
diff --git a/edsnlp/pipelines/core/sentences/terms.py b/edsnlp/pipelines/core/sentences/terms.py
index 537daeb92..d5d89e700 100644
--- a/edsnlp/pipelines/core/sentences/terms.py
+++ b/edsnlp/pipelines/core/sentences/terms.py
@@ -171,4 +171,4 @@
     "Y",
     "Ÿ",
     "Z",
-]
\ No newline at end of file
+]
From 5bfff514d4a2a10c14a7dbdee5e24e9e23b76c4f Mon Sep 17 00:00:00 2001
From: Thomas PETIT-JEAN
Date: Fri, 13 Jan 2023 17:18:48 +0100
Subject: [PATCH 5/5] updates

---
 edsnlp/pipelines/core/sentences/factory.py    |  3 +++
 edsnlp/pipelines/core/sentences/sentences.pxd |  2 ++
 edsnlp/pipelines/core/sentences/sentences.pyx |  9 +++++++--
 edsnlp/pipelines/core/sentences/terms.py      | 20 ++++++++++++++++++++
 tests/pipelines/core/test_sentences.py        |  9 ++++++++-
 5 files changed, 40 insertions(+), 3 deletions(-)

diff --git a/edsnlp/pipelines/core/sentences/factory.py b/edsnlp/pipelines/core/sentences/factory.py
index f2c3f402a..355887cab 100644
--- a/edsnlp/pipelines/core/sentences/factory.py
+++ b/edsnlp/pipelines/core/sentences/factory.py
@@ -11,6 +11,7 @@
     ignore_excluded=True,
     use_endlines=None,
     split_on_newlines="with_capitalized",
+    split_on_bullets=False,
 )
@@ -32,11 +33,13 @@ def create_component(
     use_endlines: Optional[bool],
     ignore_excluded: bool,
     split_on_newlines: Optional[str],
+    split_on_bullets: Optional[bool],
 ):
     return SentenceSegmenter(
         nlp.vocab,
         punct_chars=punct_chars,
         use_endlines=use_endlines,
         ignore_excluded=ignore_excluded,
         split_on_newlines=split_on_newlines,
+        split_on_bullets=split_on_bullets,
     )
diff --git a/edsnlp/pipelines/core/sentences/sentences.pxd b/edsnlp/pipelines/core/sentences/sentences.pxd
index 3d1341886..f215e9614 100644
--- a/edsnlp/pipelines/core/sentences/sentences.pxd
+++ b/edsnlp/pipelines/core/sentences/sentences.pxd
@@ -9,12 +9,14 @@ cdef enum split_options: WITH_CAPITALIZED, WITH_UPPERCASE, NONE
 
 cdef class SentenceSegmenter(object):
     cdef bool ignore_excluded
+    cdef bool split_on_bullets
     cdef attr_t newline_hash
     cdef attr_t excluded_hash
     cdef attr_t endline_hash
     cdef set[attr_t] punct_chars_hash
     cdef set[attr_t] capitalized_shapes_hash
     cdef set[attr_t] capitalized_chars_hash
+    cdef set[attr_t] bullets_chars_hash
     cdef split_options split_on_newlines
 
     cdef void process(self, Doc doc) nogil
diff --git a/edsnlp/pipelines/core/sentences/sentences.pyx b/edsnlp/pipelines/core/sentences/sentences.pyx
index 1b01a8271..fc4f63fbe 100644
--- a/edsnlp/pipelines/core/sentences/sentences.pyx
+++ b/edsnlp/pipelines/core/sentences/sentences.pyx
@@ -17,7 +17,7 @@ from spacy.tokens.doc cimport Doc
 from spacy.tokens.token cimport TokenC
 from spacy.vocab cimport Vocab
 
-from .terms import punctuation, uppercase
+from .terms import punctuation, uppercase, bullets
 
 
 cdef class SentenceSegmenter(object):
@@ -46,18 +46,21 @@ cdef class SentenceSegmenter(object):
         use_endlines: bool,
         ignore_excluded: bool = True,
         split_on_newlines: Optional[str] = "with_capitalized",
+        split_on_bullets: bool = False,
     ):
 
         if punct_chars is None:
             punct_chars = punctuation
 
         self.ignore_excluded = ignore_excluded or use_endlines
+        self.split_on_bullets = split_on_bullets
         self.newline_hash = vocab.strings["\n"]
         self.excluded_hash = vocab.strings["EXCLUDED"]
         self.endline_hash = vocab.strings["ENDLINE"]
         self.punct_chars_hash = {vocab.strings[c] for c in punct_chars}
         self.capitalized_shapes_hash = {vocab.strings[shape] for shape in ("Xx", "Xxx", "Xxxx", "Xxxxx")}
         self.capitalized_chars_hash = {vocab.strings[letter] for letter in uppercase}
+        self.bullets_chars_hash = {vocab.strings[bullet] for bullet in bullets}
 
         options = {
             "with_capitalized": 0,
@@ -129,8 +132,10 @@ cdef class SentenceSegmenter(object):
                 else:
                     if self.split_on_newlines == WITH_UPPERCASE:
                         doc.c[i].sent_start = 1 if self.capitalized_chars_hash.const_find(token.lex.prefix) != self.capitalized_chars_hash.const_end() else -1
-                    elif self.split_on_newlines == WITH_CAPITALIZED:
+                    if self.split_on_newlines == WITH_CAPITALIZED:
                         doc.c[i].sent_start = 1 if self.capitalized_shapes_hash.const_find(token.lex.shape) != self.capitalized_shapes_hash.const_end() else -1
+                    if self.split_on_bullets and doc.c[i].sent_start != 1:
+                        doc.c[i].sent_start = 1 if self.bullets_chars_hash.const_find(token.lex.prefix) != self.bullets_chars_hash.const_end() else -1
                     seen_newline = False
                     seen_period = False
diff --git a/edsnlp/pipelines/core/sentences/terms.py b/edsnlp/pipelines/core/sentences/terms.py
index d5d89e700..8350a7e32 100644
--- a/edsnlp/pipelines/core/sentences/terms.py
+++ b/edsnlp/pipelines/core/sentences/terms.py
@@ -172,3 +172,23 @@
     "Y",
     "Ÿ",
     "Z",
 ]
+
+bullets = [
+    "-",
+    "*",
+    "•",
+    "‣",
+    "⁃",
+    "⁌",
+    "⁍",
+    "∙",
+    "○",
+    "●",
+    "◘",
+    "◦",
+    "☙",
+    "❥",
+    "❧",
+    "⦾",
+    "⦿",
+]
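Not part of the diff: a sketch of the intended effect of `split_on_bullets` on a hypothetical clinical list, using the factory configuration added above.

```python
import spacy

nlp = spacy.blank("fr")  # assumes EDS-NLP is installed
nlp.add_pipe(
    "eds.sentences",
    config=dict(split_on_newlines="with_uppercase", split_on_bullets=True),
)

doc = nlp("Antécédents :\n- diabète de type 2\n- hypertension artérielle")
# Each "- ..." item should start its own sentence, even though
# "diabète" and "hypertension" are lowercased.
for sent in doc.sents:
    print(repr(sent.text))
```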
diff --git a/tests/pipelines/core/test_sentences.py b/tests/pipelines/core/test_sentences.py
index 10411c0d1..a401114c9 100644
--- a/tests/pipelines/core/test_sentences.py
+++ b/tests/pipelines/core/test_sentences.py
@@ -67,6 +67,11 @@ def test_false_positives(blank_nlp):
             split_on_newlines="with_uppercase",
             n_sents=4,
         ),
+        dict(
+            split_on_newlines="with_uppercase",
+            split_on_bullets=True,
+            n_sents=5,
+        ),
     ],
 )
 def test_newline_split_options(blank_nlp, split_options):
@@ -74,13 +79,15 @@ def test_newline_split_options(blank_nlp, split_options):
     text = "Une première phrase. "
     text += "Une deuxième\n"
     text += "Peut-être un autre\n"
-    text += "ET encore une."
+    text += "ET encore une\n"
+    text += "- Enfin une dernière avec une liste."
 
     segmenter = SentenceSegmenter(
         blank_nlp.vocab,
         punct_chars=terms.punctuation,
         use_endlines=False,
         split_on_newlines=split_options["split_on_newlines"],
+        split_on_bullets=split_options.get("split_on_bullets", False),
     )
 
     doc = segmenter(blank_nlp(text))
     assert len(list(doc.sents)) == split_options["n_sents"]
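Taken together, the last parametrization should segment the final test text as follows, a sketch of the expected boundaries under `with_uppercase` combined with `split_on_bullets=True`:

```python
expected = [
    "Une première phrase. ",                # closed by the period
    "Une deuxième\n",                        # newline followed by capitalized "Peut-être"
    "Peut-être un autre\n",                  # newline followed by fully uppercased "ET"
    "ET encore une\n",                       # newline followed by the bullet "-"
    "- Enfin une dernière avec une liste.",  # 5 sentences, matching n_sents=5
]
```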