Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 9 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ version = "0.2.0"
description = "A tool for cleaning and formatting markdown documents"
readme = "README.md"
authors = [
{name = "Johannes Himmelreich", email = "[email protected]"}
{name = "Johannes Himmelreich", email = "[email protected]"},
]
license = {text = "MIT"}
classifiers = [
Expand All @@ -28,6 +28,13 @@ keywords = ["markdown", "cleaning", "formatting", "text processing"]
dependencies = [
"pyyaml>=6.0",
"ftfy>=6.0.3",
"markdown-it-py>=2.2.0",
"symspellpy>=6.9.0",
"mdformat>=0.7.22",
"deepmultilingualpunctuation>=1.0.1",
"spacy>=3.8.2",
"en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl",

]
requires-python = ">=3.10"

Expand All @@ -43,7 +50,7 @@ include-package-data = true
where = ["src"]

[tool.setuptools.package-data]
markdowncleaner = ["config/*.yaml"]
markdowncleaner = ["config/*.yaml", "data/*"]

[project.scripts]
markdowncleaner = "markdowncleaner.cli:main"
35 changes: 35 additions & 0 deletions src/markdowncleaner/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import re

# Token types whose content must never be rewritten (inline code, indented
# code blocks, fenced code blocks).
# NOTE(review): these look like markdown-it token type names — confirm at the call site.
SKIP_TYPES = {"code_inline", "code_block", "fence"}  # never touch code

# Sentinel character (U+FFF9). The duplicate-word patterns below treat it
# exactly like whitespace when it sits between two words.
SENT_DELIM = "\uFFF9"

# compile once
#
# A short word (1-3 ASCII letters) immediately repeated, case-insensitively,
# with only whitespace and/or sentinel characters in between: "to to",
# "The <sentinel> the".  The separator is a single character class rather
# than the nested form "(?:\s*X\s*|\s+)+": both accept the same separators
# (any non-empty mix of whitespace and sentinels), but the nested form
# backtracks exponentially on a long whitespace run that is not followed by
# the repeated word.
SHORT_DUP_RE = re.compile(
    rf"\b([A-Za-z]{{1,3}})[\s{SENT_DELIM}]+\1\b",  # ← sentinel counts as space
    flags=re.I)

# Verbose spelling of the same rule.  At least one separator is required so
# that a word is never split against itself ("mama" is not "ma" + "ma"), and
# the trailing assertion is mandatory so that the second occurrence must be a
# whole token ("is island" is not a duplicate).  "(?!\w)" also succeeds at the
# end of the text and before a sentinel, which is a non-word character.
DUP_RE = re.compile(
    rf"""
    \b([A-Za-z]{{1,3}})     # short word (capture = \1)
    [\s{SENT_DELIM}]+        # one or more spaces and/or sentinel(s)
    \1                      # the SAME short word again …
    (?!\w)                  # … only if it is a stand-alone token
    """,
    re.IGNORECASE | re.VERBOSE,
)

# One bracketed or quoted span, captured as group 1.  Alternatives, in order:
# square brackets (may be empty, no nested square brackets inside), round
# brackets and curly braces (at least one character inside), then straight
# and curly quotes in double and single form (may be empty, never spanning a
# line break).
BRACKET_RE = re.compile(
    r"""(
\[[^\[\]]*] | # [ … ]
\([^)]+\) | # ( … )
\{[^}]+\} | # { … }
"[^"\n]*" | # " … "
'[^'\n]*' | # ' … '
“[^”\n]*” | # “ … ”
‘[^’\n]*’ # ‘ … ’
)""",
    flags=re.X,
)

# Matches either a run of characters that are neither word characters,
# whitespace nor the sentinel (i.e. punctuation/symbols), or a run of digits.
# Intended for word_segmentation.
IGNORE_RE = re.compile(
    rf"(?:[^\w\s{SENT_DELIM}]+|\d+)"
)

Loading
Loading