josk0 · honcharov-danylo · May 26, 2025 · Jun 9, 2025 · Jun 9, 2025
diff --git a/pyproject.toml b/pyproject.toml
@@ -8,7 +8,7 @@ version = "0.2.0"
 description = "A tool for cleaning and formatting markdown documents"
 readme = "README.md"
 authors = [
-    {name = "Johannes Himmelreich", email = "[email protected]"}
+    {name = "Johannes Himmelreich", email = "[email protected]"},
 ]
 license = {text = "MIT"}
 classifiers = [
@@ -28,6 +28,13 @@ keywords = ["markdown", "cleaning", "formatting", "text processing"]
 dependencies = [
     "pyyaml>=6.0",
     "ftfy>=6.0.3",
+    "markdown-it-py>=2.2.0",
+    "symspellpy>=6.9.0",
+    "mdformat>=0.7.22",
+    "deepmultilingualpunctuation>=1.0.1",
+    "spacy>=3.8.2",
+    "en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl",
+
 ]
 requires-python = ">=3.10"
 
@@ -43,7 +50,7 @@ include-package-data = true
 where = ["src"]
 
 [tool.setuptools.package-data]
-markdowncleaner = ["config/*.yaml"]
+markdowncleaner = ["config/*.yaml", "data/*"]
 
 [project.scripts]
 markdowncleaner = "markdowncleaner.cli:main"
diff --git a/src/markdowncleaner/constants.py b/src/markdowncleaner/constants.py
@@ -0,0 +1,35 @@
+import re
+
+SKIP_TYPES = {"code_inline", "code_block", "fence"}  # never touch code (presumably markdown-it token type names — confirm against the caller)
+SENT_DELIM = "\uFFF9"  # sentinel character U+FFF9; the duplicate-word patterns below accept it wherever they accept whitespace
+
+# Short word (1-3 letters) repeated after whitespace and/or SENT_DELIM, e.g. "to to"; compiled once, case-insensitive.
+# The separator is one character class on purpose: a nested-quantifier form such as
+# (?:\s*X\s*|\s+)+ accepts the same strings but backtracks exponentially on long whitespace runs.
+SHORT_DUP_RE = re.compile(rf"\b([A-Za-z]{{1,3}})[\s{SENT_DELIM}]+\1\b", flags=re.I)
+
+# Short word repeated as its own token, e.g. "to to" or "The the"; case-insensitive.
+# The separator must be non-empty and the repeat must end on a word boundary, so
+# in-word repeats ("mama") and prefixes ("the thesis") are not reported as duplicates.
+DUP_RE = re.compile(
+    rf"""
+    \b([A-Za-z]{{1,3}})            # short word (capture = \1)
+    [\s{SENT_DELIM}]+              # one or more whitespace and/or sentinel characters
+    \1                             # the SAME short word again …
+    \b""", re.IGNORECASE | re.VERBOSE)  # … only as a stand-alone token (end of text included)
+
+BRACKET_RE = re.compile(  # one bracketed or quoted span, captured whole as group 1
+    r"""(
+        \[[^\[\]]*]               |   # [ … ]
+        \([^)]+\)                 |   # ( … )
+        \{[^}]+\}                 |   # { … }
+        "[^"\n]*"                 |   # " … "
+        '[^'\n]*'                 |   # ' … '
+        “[^”\n]*”                 |   # “ … ”
+        ‘[^’\n]*’                     # ‘ … ’
+    )""",
+    re.VERBOSE,  # lets the pattern above carry layout whitespace and "#" comments
+)
+
+IGNORE_RE = re.compile(rf"(?:[^\w\s{SENT_DELIM}]+|\d+)")   # for word_segmentation: matches a run of characters that are not word, whitespace or sentinel characters, or a run of digits
+