From 1072b1ebe35b85aa65d439db7de88613861415f7 Mon Sep 17 00:00:00 2001
From: Thomas PETIT-JEAN
Date: Mon, 19 Dec 2022 15:59:10 +0100
Subject: [PATCH 1/5] feat: add option to sentenciser pipeline regarding newlines

---
 docs/pipelines/core/sentences.md              | 30 ++++++++++---
 edsnlp/pipelines/core/sentences/factory.py    |  3 ++
 edsnlp/pipelines/core/sentences/sentences.pxd |  5 ++-
 edsnlp/pipelines/core/sentences/sentences.pyx | 33 +++++++++++---
 edsnlp/pipelines/core/sentences/terms.py      | 43 +++++++++++++++++++
 tests/pipelines/core/test_sentences.py        | 35 +++++++++++++++
 6 files changed, 138 insertions(+), 11 deletions(-)

diff --git a/docs/pipelines/core/sentences.md b/docs/pipelines/core/sentences.md
index 9bd34a166..b56d9b3a5 100644
--- a/docs/pipelines/core/sentences.md
+++ b/docs/pipelines/core/sentences.md
@@ -2,7 +2,7 @@
 
 The `eds.sentences` pipeline provides an alternative to spaCy's default `sentencizer`, aiming to overcome some of its limitations.
 
-Indeed, the `sentencizer` merely looks at period characters to detect the end of a sentence, a strategy that often fails in a clinical note settings. Our `sentences` component also classifies end-of-lines as sentence boundaries if the subsequent token begins with an uppercase character, leading to slightly better performances.
+Indeed, the `sentencizer` merely looks at period characters to detect the end of a sentence, a strategy that often fails in a clinical note setting. Our `sentences` component also classifies end-of-lines as sentence boundaries if the subsequent token begins with an uppercase character, leading to slightly better performance. This exact behaviour can be adjusted using the `split_on_newlines` parameter (see below).
 
 Moreover, the `eds.sentences` pipeline can use the output of the `eds.normalizer` pipeline, and more specifically the end-of-line classification. This is activated by default.
 
@@ -61,10 +61,30 @@ Notice how EDS-NLP's implementation is more robust to ill-defined sentence endings.
 
 The pipeline can be configured using the following parameters :
 
-| Parameter      | Explanation                                                              | Default                           |
-| -------------- | ------------------------------------------------------------------------ | --------------------------------- |
-| `punct_chars`  | Punctuation patterns                                                      | `None` (use pre-defined patterns) |
-| `use_endlines` | Whether to use endlines prediction (see [documentation](./endlines.md))   | `True`                            |
+| Parameter           | Explanation                                                              | Default                           |
+| ------------------- | ------------------------------------------------------------------------ | --------------------------------- |
+| `punct_chars`       | Punctuation patterns                                                      | `None` (use pre-defined patterns) |
+| `use_endlines`      | Whether to use endlines prediction (see [documentation](./endlines.md))   | `True`                            |
+| `split_on_newlines` | Rule used to decide whether a newline (`\n`) acts as a sentence split     | `with_capitalized`                |
+
+### The `split_on_newlines` parameter
+
+=== "`with_capitalized` (Default)"
+
+    The rule applied here is to consider a newline as a sentence split if the following token is capitalized,
+    i.e. its first letter is uppercase and its other letters are lowercase.
+    This rule should cover most cases, but might be problematic with long lists of fully uppercased strings (e.g. lists of commercial drug names).
+
+=== "`with_uppercase`"
+
+    The rule applied here is to consider a newline as a sentence split if the following token starts with an uppercase letter,
+    regardless of the other letters.
+    This rule corrects the problem of long fully uppercased texts, but might wrongly split sentences, e.g. around acronyms.
+
+=== "`False`"
+
+    Newlines alone never trigger a sentence split.
 
 ## Authors and citation
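Not part of the diff: a minimal usage sketch of the option added above, assuming EDS-NLP is installed so that the `eds.sentences` factory is available through spaCy's entry points.

```python
import spacy

nlp = spacy.blank("fr")
nlp.add_pipe(
    "eds.sentences",
    # or "with_capitalized" (the default), or False
    config=dict(split_on_newlines="with_uppercase"),
)

doc = nlp("Liste de médicaments\nDOLIPRANE mille milligrammes")
# "DOLIPRANE" is fully uppercased: "with_uppercase" should split on the
# newline here, whereas the default "with_capitalized" rule should not.
print(len(list(doc.sents)))
```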
diff --git a/edsnlp/pipelines/core/sentences/factory.py b/edsnlp/pipelines/core/sentences/factory.py
index 5f7731a5f..f2c3f402a 100644
--- a/edsnlp/pipelines/core/sentences/factory.py
+++ b/edsnlp/pipelines/core/sentences/factory.py
@@ -10,6 +10,7 @@
     punct_chars=None,
     ignore_excluded=True,
     use_endlines=None,
+    split_on_newlines="with_capitalized",
 )
 
 
@@ -30,10 +31,12 @@ def create_component(
     punct_chars: Optional[List[str]],
     use_endlines: Optional[bool],
     ignore_excluded: bool,
+    split_on_newlines: Optional[str],
 ):
     return SentenceSegmenter(
         nlp.vocab,
         punct_chars=punct_chars,
         use_endlines=use_endlines,
         ignore_excluded=ignore_excluded,
+        split_on_newlines=split_on_newlines,
     )
diff --git a/edsnlp/pipelines/core/sentences/sentences.pxd b/edsnlp/pipelines/core/sentences/sentences.pxd
index 531c55830..6630fa19b 100644
--- a/edsnlp/pipelines/core/sentences/sentences.pxd
+++ b/edsnlp/pipelines/core/sentences/sentences.pxd
@@ -4,6 +4,7 @@ from libcpp.vector cimport vector
 from spacy.tokens.doc cimport Doc
 from spacy.typedefs cimport attr_t
 
+cdef enum split_options: WITH_CAPITALIZED, WITH_UPPERCASE, NONE
 
 cdef class SentenceSegmenter(object):
     cdef bool ignore_excluded
@@ -12,5 +13,7 @@ cdef class SentenceSegmenter(object):
     cdef attr_t endline_hash
     cdef set[attr_t] punct_chars_hash
     cdef set[attr_t] capitalized_shapes_hash
+    cdef set[attr_t] capitalized_chars_hash
+    cdef split_options split_on_newlines
 
-    cdef void process(self, Doc doc) nogil
+    cdef void process(self, Doc doc) nogil
\ No newline at end of file
diff --git a/edsnlp/pipelines/core/sentences/sentences.pyx b/edsnlp/pipelines/core/sentences/sentences.pyx
index 7123e72eb..2c66e9451 100644
--- a/edsnlp/pipelines/core/sentences/sentences.pyx
+++ b/edsnlp/pipelines/core/sentences/sentences.pyx
@@ -3,13 +3,13 @@ from typing import Iterable, List, Optional
 
 from libcpp cimport bool
 
 # from spacy.typedefs cimport attr_t
-from spacy.attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
+from spacy.attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE, IS_UPPER
 from spacy.lexeme cimport Lexeme
 from spacy.tokens.doc cimport Doc
 from spacy.tokens.token cimport TokenC
 from spacy.vocab cimport Vocab
 
-from .terms import punctuation
+from .terms import punctuation, uppercase
 
 
 cdef class SentenceSegmenter(object):
@@ -37,6 +37,7 @@ cdef class SentenceSegmenter(object):
         punct_chars: Optional[List[str]],
         use_endlines: bool,
         ignore_excluded: bool = True,
+        split_on_newlines: Optional[str] = "with_capitalized",
     ):
 
         if punct_chars is None:
@@ -48,6 +49,23 @@ cdef class SentenceSegmenter(object):
         self.endline_hash = vocab.strings["ENDLINE"]
         self.punct_chars_hash = {vocab.strings[c] for c in punct_chars}
         self.capitalized_shapes_hash = {vocab.strings[shape] for shape in ("Xx", "Xxx", "Xxxx", "Xxxxx")}
+        self.capitalized_chars_hash = {vocab.strings[letter] for letter in uppercase}
+
+        options = {
+            "with_capitalized": 0, 
+            "with_uppercase": 1, 
+            False: 2
+        }
+        chosen = options.get(split_on_newlines, None)
+        if chosen is None:
+            raise ValueError(
+                (
+                    "Incorrect value for 'split_on_newlines'. "
+                    f"Provided: {split_on_newlines}\n"
+                    f"Available: {options}."
+                )
+            )
+        self.split_on_newlines = chosen
 
         if use_endlines:
             print("The use_endlines is deprecated and has been replaced by the ignore_excluded parameter")
@@ -90,16 +108,21 @@ cdef class SentenceSegmenter(object):
             is_newline = Lexeme.c_check_flag(token.lex, IS_SPACE) and token.lex.orth == self.newline_hash
 
             if seen_period or seen_newline:
-                if seen_period and Lexeme.c_check_flag(token.lex, IS_DIGIT):
-                    continue
                 if is_in_punct_chars or is_newline or Lexeme.c_check_flag(token.lex, IS_PUNCT):
                     continue
+                if seen_period and Lexeme.c_check_flag(token.lex, IS_DIGIT):
+                    seen_newline = False
+                    seen_period = False
+                    continue
                 if seen_period:
                     doc.c[i].sent_start = 1
                     seen_newline = False
                     seen_period = False
                 else:
-                    doc.c[i].sent_start = 1 if self.capitalized_shapes_hash.const_find(token.lex.shape) != self.capitalized_shapes_hash.const_end() else -1
+                    if self.split_on_newlines == WITH_UPPERCASE:
+                        doc.c[i].sent_start = 1 if self.capitalized_chars_hash.const_find(token.lex.prefix) != self.capitalized_chars_hash.const_end() else -1
+                    elif self.split_on_newlines == WITH_CAPITALIZED:
+                        doc.c[i].sent_start = 1 if self.capitalized_shapes_hash.const_find(token.lex.shape) != self.capitalized_shapes_hash.const_end() else -1
                     seen_newline = False
                     seen_period = False
             elif is_in_punct_chars:
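Not part of the diff: a rough Python paraphrase of the branch above, to make the two newline rules concrete. `with_capitalized` tests the lexeme's shape, while `with_uppercase` only tests its first character against the `uppercase` list added to `terms.py` just below (the real code compares hashed lexeme attributes in Cython).

```python
from typing import Union

CAPITALIZED_SHAPES = {"Xx", "Xxx", "Xxxx", "Xxxxx"}

def splits_after_newline(text: str, shape: str, rule: Union[str, bool]) -> bool:
    """Paraphrase of the sent_start decision for the token following a newline."""
    if rule == "with_capitalized":
        # "Peut" has shape "Xxxx" -> split; "ET" has shape "XX" -> no split
        return shape in CAPITALIZED_SHAPES
    if rule == "with_uppercase":
        # Only the first character matters: "Peut" and "ET" both split.
        # str.isupper() approximates the explicit list of French uppercase letters.
        return text[:1].isupper()
    return False  # split_on_newlines=False: a newline alone never splits
```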
+ ) + ) + self.split_on_newlines = chosen if use_endlines: print("The use_endlines is deprecated and has been replaced by the ignore_excluded parameter") @@ -90,16 +108,21 @@ cdef class SentenceSegmenter(object): is_newline = Lexeme.c_check_flag(token.lex, IS_SPACE) and token.lex.orth == self.newline_hash if seen_period or seen_newline: - if seen_period and Lexeme.c_check_flag(token.lex, IS_DIGIT): - continue if is_in_punct_chars or is_newline or Lexeme.c_check_flag(token.lex, IS_PUNCT): continue + if seen_period and Lexeme.c_check_flag(token.lex, IS_DIGIT): + continue + seen_newline = False + seen_period = False if seen_period: doc.c[i].sent_start = 1 seen_newline = False seen_period = False else: - doc.c[i].sent_start = 1 if self.capitalized_shapes_hash.const_find(token.lex.shape) != self.capitalized_shapes_hash.const_end() else -1 + if self.split_on_newlines == WITH_UPPERCASE: + doc.c[i].sent_start = 1 if self.capitalized_chars_hash.const_find(token.lex.prefix) != self.capitalized_chars_hash.const_end() else -1 + elif self.split_on_newlines == WITH_CAPITALIZED: + doc.c[i].sent_start = 1 if self.capitalized_shapes_hash.const_find(token.lex.shape) != self.capitalized_shapes_hash.const_end() else -1 seen_newline = False seen_period = False elif is_in_punct_chars: diff --git a/edsnlp/pipelines/core/sentences/terms.py b/edsnlp/pipelines/core/sentences/terms.py index 2e0a422c3..537daeb92 100644 --- a/edsnlp/pipelines/core/sentences/terms.py +++ b/edsnlp/pipelines/core/sentences/terms.py @@ -129,3 +129,46 @@ "。", "。", ] + +uppercase = [ + "A", + "À", + "Â", + "B", + "C", + "D", + "E", + "É", + "È", + "Ê", + "Ë", + "F", + "G", + "H", + "I", + "Î", + "Ï", + "J", + "K", + "L", + "M", + "N", + "O", + "Ô", + "Œ", + "P", + "Q", + "R", + "S", + "T", + "U", + "Ù", + "Û", + "Ü", + "V", + "W", + "X", + "Y", + "Ÿ", + "Z", +] \ No newline at end of file diff --git a/tests/pipelines/core/test_sentences.py b/tests/pipelines/core/test_sentences.py index b4e078393..10411c0d1 100644 --- a/tests/pipelines/core/test_sentences.py +++ b/tests/pipelines/core/test_sentences.py @@ -50,3 +50,38 @@ def test_false_positives(blank_nlp): for fp in false_positives: doc = blank_nlp(fp) assert len(list(doc.sents)) == 1 + + +@mark.parametrize( + "split_options", + [ + dict( + split_on_newlines=False, + n_sents=2, + ), + dict( + split_on_newlines="with_capitalized", + n_sents=3, + ), + dict( + split_on_newlines="with_uppercase", + n_sents=4, + ), + ], +) +def test_newline_split_options(blank_nlp, split_options): + + text = "Une première phrase. " + text += "Une deuxième\n" + text += "Peut-être un autre\n" + text += "ET encore une." + + segmenter = SentenceSegmenter( + blank_nlp.vocab, + punct_chars=terms.punctuation, + use_endlines=False, + split_on_newlines=split_options["split_on_newlines"], + ) + + doc = segmenter(blank_nlp(text)) + assert len(list(doc.sents)) == split_options["n_sents"] From 4b1454d1e015ccc15bb9630bf6d73e5fc107d97a Mon Sep 17 00:00:00 2001 From: Thomas PETIT-JEAN Date: Mon, 19 Dec 2022 16:03:06 +0100 Subject: [PATCH 2/5] chore: changelog --- changelog.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/changelog.md b/changelog.md index 59c5a3405..1914f0e42 100644 --- a/changelog.md +++ b/changelog.md @@ -1,5 +1,14 @@ # Changelog +## Pending + +### Added +- `split_on_newlines` parameter to the `sentences` pipeline. + +### Fixed +- `sentences` issue with punctiation followed by a digit. 
From 4b1454d1e015ccc15bb9630bf6d73e5fc107d97a Mon Sep 17 00:00:00 2001
From: Thomas PETIT-JEAN
Date: Mon, 19 Dec 2022 16:03:06 +0100
Subject: [PATCH 2/5] chore: changelog

---
 changelog.md | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/changelog.md b/changelog.md
index 59c5a3405..1914f0e42 100644
--- a/changelog.md
+++ b/changelog.md
@@ -1,5 +1,14 @@
 # Changelog
 
+## Pending
+
+### Added
+- `split_on_newlines` parameter to the `sentences` pipeline.
+
+### Fixed
+- `sentences` issue with punctiation followed by a digit.
+
+
 ## v0.7.4 (2022-12-12)
 
 ### Added

From dcaed0f71a392b074869c139c863c517b00460f4 Mon Sep 17 00:00:00 2001
From: Thomas Petit-Jean <30775613+Thomzoy@users.noreply.github.com>
Date: Mon, 19 Dec 2022 16:13:45 +0100
Subject: [PATCH 3/5] Update changelog.md

---
 changelog.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/changelog.md b/changelog.md
index 1914f0e42..ec217840d 100644
--- a/changelog.md
+++ b/changelog.md
@@ -6,7 +6,7 @@
 - `split_on_newlines` parameter to the `sentences` pipeline.
 
 ### Fixed
-- `sentences` issue with punctiation followed by a digit.
+- `eds.sentences` issue with punctuation followed by a digit.
 
 
 ## v0.7.4 (2022-12-12)
From de773934e3fcb318e104c93e4060a4e82f7f6a57 Mon Sep 17 00:00:00 2001
From: Thomas PETIT-JEAN
Date: Mon, 19 Dec 2022 16:23:06 +0100
Subject: [PATCH 4/5] Linting

---
 docs/pipelines/core/sentences.md              |  8 ++++----
 edsnlp/pipelines/core/sentences/sentences.pxd |  3 ++-
 edsnlp/pipelines/core/sentences/sentences.pyx | 16 ++++++++++++----
 edsnlp/pipelines/core/sentences/terms.py      |  2 +-
 4 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/docs/pipelines/core/sentences.md b/docs/pipelines/core/sentences.md
index b56d9b3a5..b78b0f4df 100644
--- a/docs/pipelines/core/sentences.md
+++ b/docs/pipelines/core/sentences.md
@@ -71,15 +71,15 @@
 === "`with_capitalized` (Default)"
 
-    The rule applied here is to consider a newline as a sentence split if the following token is capitalized, 
-    i.e. its first letter is uppercase and its other letters are lowercase. 
+    The rule applied here is to consider a newline as a sentence split if the following token is capitalized,
+    i.e. its first letter is uppercase and its other letters are lowercase.
     This rule should cover most cases, but might be problematic with long lists of fully uppercased strings (e.g. lists of commercial drug names).
 
 === "`with_uppercase`"
 
-    The rule applied here is to consider a newline as a sentence split if the following token starts with an uppercase letter, 
+    The rule applied here is to consider a newline as a sentence split if the following token starts with an uppercase letter,
     regardless of the other letters.
-    This rule corrects the problem of long fully uppercased texts, but might wrongly split sentences, e.g. around acronyms. 
+    This rule corrects the problem of long fully uppercased texts, but might wrongly split sentences, e.g. around acronyms.
 
 === "`False`"
 
     Newlines alone never trigger a sentence split.
 
 ## Authors and citation
diff --git a/edsnlp/pipelines/core/sentences/sentences.pxd b/edsnlp/pipelines/core/sentences/sentences.pxd
index 6630fa19b..3d1341886 100644
--- a/edsnlp/pipelines/core/sentences/sentences.pxd
+++ b/edsnlp/pipelines/core/sentences/sentences.pxd
@@ -4,6 +4,7 @@ from libcpp.vector cimport vector
 from spacy.tokens.doc cimport Doc
 from spacy.typedefs cimport attr_t
 
+
 cdef enum split_options: WITH_CAPITALIZED, WITH_UPPERCASE, NONE
 
 cdef class SentenceSegmenter(object):
@@ -16,4 +17,4 @@ cdef class SentenceSegmenter(object):
     cdef set[attr_t] capitalized_chars_hash
     cdef split_options split_on_newlines
 
-    cdef void process(self, Doc doc) nogil
\ No newline at end of file
+    cdef void process(self, Doc doc) nogil
diff --git a/edsnlp/pipelines/core/sentences/sentences.pyx b/edsnlp/pipelines/core/sentences/sentences.pyx
index 2c66e9451..1b01a8271 100644
--- a/edsnlp/pipelines/core/sentences/sentences.pyx
+++ b/edsnlp/pipelines/core/sentences/sentences.pyx
@@ -3,7 +3,15 @@ from typing import Iterable, List, Optional
 
 from libcpp cimport bool
 
 # from spacy.typedefs cimport attr_t
-from spacy.attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE, IS_UPPER
+from spacy.attrs cimport (
+    IS_ALPHA,
+    IS_ASCII,
+    IS_DIGIT,
+    IS_LOWER,
+    IS_PUNCT,
+    IS_SPACE,
+    IS_UPPER,
+)
 from spacy.lexeme cimport Lexeme
 from spacy.tokens.doc cimport Doc
 from spacy.tokens.token cimport TokenC
 from spacy.vocab cimport Vocab
@@ -52,8 +60,8 @@ cdef class SentenceSegmenter(object):
         self.capitalized_chars_hash = {vocab.strings[letter] for letter in uppercase}
 
         options = {
-            "with_capitalized": 0, 
-            "with_uppercase": 1, 
+            "with_capitalized": 0,
+            "with_uppercase": 1,
             False: 2
         }
         chosen = options.get(split_on_newlines, None)
@@ -119,7 +127,7 @@ cdef class SentenceSegmenter(object):
                     seen_newline = False
                     seen_period = False
                 else:
-                    if self.split_on_newlines == WITH_UPPERCASE: 
+                    if self.split_on_newlines == WITH_UPPERCASE:
                         doc.c[i].sent_start = 1 if self.capitalized_chars_hash.const_find(token.lex.prefix) != self.capitalized_chars_hash.const_end() else -1
                     elif self.split_on_newlines == WITH_CAPITALIZED:
                         doc.c[i].sent_start = 1 if self.capitalized_shapes_hash.const_find(token.lex.shape) != self.capitalized_shapes_hash.const_end() else -1
diff --git a/edsnlp/pipelines/core/sentences/terms.py b/edsnlp/pipelines/core/sentences/terms.py
index 537daeb92..d5d89e700 100644
--- a/edsnlp/pipelines/core/sentences/terms.py
+++ b/edsnlp/pipelines/core/sentences/terms.py
@@ -171,4 +171,4 @@
     "Y",
     "Ÿ",
     "Z",
-]
\ No newline at end of file
+]
From 5bfff514d4a2a10c14a7dbdee5e24e9e23b76c4f Mon Sep 17 00:00:00 2001
From: Thomas PETIT-JEAN
Date: Fri, 13 Jan 2023 17:18:48 +0100
Subject: [PATCH 5/5] updates

---
 edsnlp/pipelines/core/sentences/factory.py    |  3 +++
 edsnlp/pipelines/core/sentences/sentences.pxd |  2 ++
 edsnlp/pipelines/core/sentences/sentences.pyx |  9 +++++++--
 edsnlp/pipelines/core/sentences/terms.py      | 20 ++++++++++++++++++++
 tests/pipelines/core/test_sentences.py        |  9 ++++++++-
 5 files changed, 40 insertions(+), 3 deletions(-)

diff --git a/edsnlp/pipelines/core/sentences/factory.py b/edsnlp/pipelines/core/sentences/factory.py
index f2c3f402a..355887cab 100644
--- a/edsnlp/pipelines/core/sentences/factory.py
+++ b/edsnlp/pipelines/core/sentences/factory.py
@@ -11,6 +11,7 @@
     ignore_excluded=True,
     use_endlines=None,
     split_on_newlines="with_capitalized",
+    split_on_bullets=False,
 )
@@ -32,11 +33,13 @@ def create_component(
     use_endlines: Optional[bool],
     ignore_excluded: bool,
     split_on_newlines: Optional[str],
+    split_on_bullets: Optional[bool],
 ):
     return SentenceSegmenter(
         nlp.vocab,
         punct_chars=punct_chars,
         use_endlines=use_endlines,
         ignore_excluded=ignore_excluded,
         split_on_newlines=split_on_newlines,
+        split_on_bullets=split_on_bullets,
     )
diff --git a/edsnlp/pipelines/core/sentences/sentences.pxd b/edsnlp/pipelines/core/sentences/sentences.pxd
index 3d1341886..f215e9614 100644
--- a/edsnlp/pipelines/core/sentences/sentences.pxd
+++ b/edsnlp/pipelines/core/sentences/sentences.pxd
@@ -9,12 +9,14 @@ cdef enum split_options: WITH_CAPITALIZED, WITH_UPPERCASE, NONE
 
 cdef class SentenceSegmenter(object):
     cdef bool ignore_excluded
+    cdef bool split_on_bullets
     cdef attr_t newline_hash
     cdef attr_t excluded_hash
     cdef attr_t endline_hash
     cdef set[attr_t] punct_chars_hash
     cdef set[attr_t] capitalized_shapes_hash
     cdef set[attr_t] capitalized_chars_hash
+    cdef set[attr_t] bullets_chars_hash
     cdef split_options split_on_newlines
 
     cdef void process(self, Doc doc) nogil
diff --git a/edsnlp/pipelines/core/sentences/sentences.pyx b/edsnlp/pipelines/core/sentences/sentences.pyx
index 1b01a8271..fc4f63fbe 100644
--- a/edsnlp/pipelines/core/sentences/sentences.pyx
+++ b/edsnlp/pipelines/core/sentences/sentences.pyx
@@ -17,7 +17,7 @@ from spacy.tokens.doc cimport Doc
 from spacy.tokens.token cimport TokenC
 from spacy.vocab cimport Vocab
 
-from .terms import punctuation, uppercase
+from .terms import punctuation, uppercase, bullets
 
 
 cdef class SentenceSegmenter(object):
@@ -46,18 +46,21 @@ cdef class SentenceSegmenter(object):
         use_endlines: bool,
         ignore_excluded: bool = True,
         split_on_newlines: Optional[str] = "with_capitalized",
+        split_on_bullets: bool = False,
     ):
 
         if punct_chars is None:
             punct_chars = punctuation
 
         self.ignore_excluded = ignore_excluded or use_endlines
+        self.split_on_bullets = split_on_bullets
         self.newline_hash = vocab.strings["\n"]
         self.excluded_hash = vocab.strings["EXCLUDED"]
         self.endline_hash = vocab.strings["ENDLINE"]
         self.punct_chars_hash = {vocab.strings[c] for c in punct_chars}
         self.capitalized_shapes_hash = {vocab.strings[shape] for shape in ("Xx", "Xxx", "Xxxx", "Xxxxx")}
         self.capitalized_chars_hash = {vocab.strings[letter] for letter in uppercase}
+        self.bullets_chars_hash = {vocab.strings[bullet] for bullet in bullets}
 
         options = {
             "with_capitalized": 0,
@@ -129,8 +132,10 @@ cdef class SentenceSegmenter(object):
                 else:
                     if self.split_on_newlines == WITH_UPPERCASE:
                         doc.c[i].sent_start = 1 if self.capitalized_chars_hash.const_find(token.lex.prefix) != self.capitalized_chars_hash.const_end() else -1
-                    elif self.split_on_newlines == WITH_CAPITALIZED:
+                    if self.split_on_newlines == WITH_CAPITALIZED:
                         doc.c[i].sent_start = 1 if self.capitalized_shapes_hash.const_find(token.lex.shape) != self.capitalized_shapes_hash.const_end() else -1
+                    if self.split_on_bullets and doc.c[i].sent_start != 1:
+                        doc.c[i].sent_start = 1 if self.bullets_chars_hash.const_find(token.lex.prefix) != self.bullets_chars_hash.const_end() else -1
                     seen_newline = False
                     seen_period = False
diff --git a/edsnlp/pipelines/core/sentences/terms.py b/edsnlp/pipelines/core/sentences/terms.py
index d5d89e700..8350a7e32 100644
--- a/edsnlp/pipelines/core/sentences/terms.py
+++ b/edsnlp/pipelines/core/sentences/terms.py
@@ -172,3 +172,23 @@
     "Y",
     "Ÿ",
     "Z",
 ]
+
+bullets = [
+    "-",
+    "*",
+    "•",
+    "‣",
+    "⁃",
+    "⁌",
+    "⁍",
+    "∙",
+    "○",
+    "●",
+    "◘",
+    "◦",
+    "☙",
+    "❥",
+    "❧",
+    "⦾",
+    "⦿",
+]
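Not part of the diff: a sketch of the intended effect of `split_on_bullets` on a hypothetical clinical list, using the factory configuration added above.

```python
import spacy

nlp = spacy.blank("fr")  # assumes EDS-NLP is installed
nlp.add_pipe(
    "eds.sentences",
    config=dict(split_on_newlines="with_uppercase", split_on_bullets=True),
)

doc = nlp("Antécédents :\n- diabète de type 2\n- hypertension artérielle")
# Each "- ..." item should start its own sentence, even though
# "diabète" and "hypertension" are lowercased.
for sent in doc.sents:
    print(repr(sent.text))
```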
diff --git a/tests/pipelines/core/test_sentences.py b/tests/pipelines/core/test_sentences.py
index 10411c0d1..a401114c9 100644
--- a/tests/pipelines/core/test_sentences.py
+++ b/tests/pipelines/core/test_sentences.py
@@ -67,6 +67,11 @@ def test_false_positives(blank_nlp):
             split_on_newlines="with_uppercase",
             n_sents=4,
         ),
+        dict(
+            split_on_newlines="with_uppercase",
+            split_on_bullets=True,
+            n_sents=5,
+        ),
     ],
 )
 def test_newline_split_options(blank_nlp, split_options):
@@ -74,13 +79,15 @@ def test_newline_split_options(blank_nlp, split_options):
     text = "Une première phrase. "
     text += "Une deuxième\n"
     text += "Peut-être un autre\n"
-    text += "ET encore une."
+    text += "ET encore une\n"
+    text += "- Enfin une dernière avec une liste."
 
     segmenter = SentenceSegmenter(
         blank_nlp.vocab,
         punct_chars=terms.punctuation,
         use_endlines=False,
         split_on_newlines=split_options["split_on_newlines"],
+        split_on_bullets=split_options.get("split_on_bullets", False),
     )
 
     doc = segmenter(blank_nlp(text))
     assert len(list(doc.sents)) == split_options["n_sents"]
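Taken together, the last parametrization should segment the final test text as follows, a sketch of the expected boundaries under `with_uppercase` combined with `split_on_bullets=True`:

```python
expected = [
    "Une première phrase. ",                # closed by the period
    "Une deuxième\n",                        # newline followed by capitalized "Peut-être"
    "Peut-être un autre\n",                  # newline followed by fully uppercased "ET"
    "ET encore une\n",                       # newline followed by the bullet "-"
    "- Enfin une dernière avec une liste.",  # 5 sentences, matching n_sents=5
]
```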