diff --git a/.gitignore b/.gitignore index 454059a..60e56f5 100644 --- a/.gitignore +++ b/.gitignore @@ -48,6 +48,7 @@ coverage.xml # Jupyter Notebook .ipynb_checkpoints +test.ipynb # pyenv .python-version diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..8fcbf31 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,51 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [Unreleased] + +### Added +- **Heuristic reference removal**: Automatically detect and remove bibliographic reference lines based on pattern scoring +- **Batch processing script**: Process multiple markdown files in parallel with `scripts/clean_mds_in_folder.py` +- **Footnote pattern removal**: Remove footnote references in text (e.g., `.1`, `.23`) +- **Enhanced linebreak crimping**: Improved algorithm for fixing line break errors from PDF conversion + - Connective-based crimping for lines ending with `-`, `–`, `—`, or `...` + - Justified text crimping for adjacent lines of similar length +- **CLI options**: + - `--keep-references`: Disable heuristic reference detection + - `--no-crimping`: Disable linebreak crimping + - Additional fine-grained control options for cleaning operations + +### Changed +- **Default patterns**: Updated `default_cleaning_patterns.yaml` with: + - Improved section removal patterns (e.g., "Authors' Note", "Note on sources") + - Additional inline patterns (LaTeX footnotes, trailing ellipsis) + - Refined keyword and conflict of interest patterns +- **API**: `MarkdownCleaner` constructor now accepts optional `patterns` parameter (defaults to None for default patterns) +- **Linebreak crimping**: Now enabled by default (`crimp_linebreaks: True`) +- **README**: Comprehensive documentation updates with new features and examples + +### Fixed +- Linebreak crimping logic now properly handles various PDF conversion artifacts +- Test suite updated to match new implementation + +## [0.2.0] - 2025-03-XX + +Initial PyPI release with core markdown cleaning functionality. + +### Features +- Remove references, bibliographies, and citations +- Remove copyright notices and legal disclaimers +- Remove acknowledgements and funding information +- Pattern-based text cleaning with customizable YAML configuration +- Command-line interface +- Python API for programmatic use +- Text replacement and pattern removal +- Duplicate headline removal +- Short line removal +- Empty line contraction +- Encoding fix support (mojibake) +- Quotation normalization diff --git a/README.md b/README.md index bbbf607..6afc85a 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ A simple Python tool for cleaning and formatting markdown documents. Default con ## Description `markdowncleaner` helps you clean up markdown files by removing unwanted content such as: -- References, bibliographies, and citations +- References, bibliographies, and citations (including heuristic detection of bibliographic lines) - Footnotes and endnote references in text - Copyright notices and legal disclaimers - Acknowledgements and funding information @@ -13,6 +13,7 @@ A simple Python tool for cleaning and formatting markdown documents. 
Default con - Specific patterns like DOIs, URLs, and email addresses - Short lines and excessive whitespace - Duplicate headlines (for example, because paper title and author names were reprinted on every page of a PDF) +- Erroneous line breaks from PDF conversion This tool is particularly useful for processing academic papers, books, or any markdown document that needs formatting cleanup. @@ -24,7 +25,9 @@ pip install markdowncleaner ## Usage -### Basic Usage +### Python API + +#### Basic Usage ```python from markdowncleaner import MarkdownCleaner @@ -42,7 +45,7 @@ cleaned_text = cleaner.clean_markdown_string(text) print(cleaned_text) ``` -### Customizing Cleaning Options +#### Customizing Cleaning Options ```python from markdowncleaner import MarkdownCleaner, CleanerOptions @@ -51,9 +54,11 @@ from markdowncleaner import MarkdownCleaner, CleanerOptions options = CleanerOptions() options.remove_short_lines = True options.min_line_length = 50 # custom minimum line length -options.remove_duplicate_headlines = False +options.remove_duplicate_headlines = False options.remove_footnotes_in_text = True options.contract_empty_lines = True +options.fix_encoding_mojibake = True +options.normalize_quotation_symbols = True # Initialize cleaner with custom options cleaner = MarkdownCleaner(options=options) @@ -61,12 +66,13 @@ cleaner = MarkdownCleaner(options=options) # Use the cleaner as before ``` -### Custom Cleaning Patterns +#### Custom Cleaning Patterns You can also provide custom cleaning patterns: ```python -from markdowncleaner import MarkdownCleaner, CleaningPatterns +from markdowncleaner import MarkdownCleaner +from markdowncleaner.config.loader import CleaningPatterns from pathlib import Path # Load custom patterns from a YAML file @@ -76,6 +82,102 @@ custom_patterns = CleaningPatterns.from_yaml(Path("my_patterns.yaml")) cleaner = MarkdownCleaner(patterns=custom_patterns) ``` +### Command Line Interface + +Clean a single markdown file using the CLI: + +```bash +# Basic usage - creates a new file with "_cleaned" suffix +markdowncleaner input.md + +# Specify output file +markdowncleaner input.md -o output.md + +# Specify output directory +markdowncleaner input.md --output-dir cleaned_files/ + +# Use custom configuration +markdowncleaner input.md --config my_patterns.yaml + +# Enable encoding fixes and quotation normalization +markdowncleaner input.md --fix-encoding --normalize-quotation + +# Customize line length threshold +markdowncleaner input.md --min-line-length 50 + +# Disable specific cleaning operations +markdowncleaner input.md --keep-short-lines --keep-sections --keep-footnotes + +# Disable replacements and inline pattern removal +markdowncleaner input.md --no-replacements --keep-inline-patterns + +# Disable formatting operations +markdowncleaner input.md --no-crimping --keep-empty-lines + +# Keep references (disable heuristic reference detection) +markdowncleaner input.md --keep-references +``` + +**Available CLI Options:** + +- `-o`, `--output`: Path to save the cleaned markdown file +- `--output-dir`: Directory to save the cleaned file +- `--config`: Path to custom YAML configuration file +- `--fix-encoding`: Fix encoding mojibake issues +- `--normalize-quotation`: Normalize quotation symbols to standard ASCII +- `--keep-short-lines`: Don't remove lines shorter than minimum length +- `--min-line-length`: Minimum line length to keep (default: 70) +- `--keep-bad-lines`: Don't remove lines matching bad line patterns +- `--keep-sections`: Don't remove sections like References, 
Acknowledgements +- `--keep-duplicate-headlines`: Don't remove duplicate headlines +- `--keep-footnotes`: Don't remove footnote references in text +- `--no-replacements`: Don't perform text replacements +- `--keep-inline-patterns`: Don't remove inline patterns like citations +- `--keep-empty-lines`: Don't contract consecutive empty lines +- `--no-crimping`: Don't crimp linebreaks (fix line break errors from PDF conversion) +- `--keep-references`: Don't heuristically detect and remove bibliographic reference lines + +### Batch Processing Script + +For processing multiple markdown files in a folder and its subfolders, use the included batch processing script: + +```bash +# Basic usage - will prompt for confirmation +python scripts/clean_mds_in_folder.py documents/ + +# Skip confirmation prompt +python scripts/clean_mds_in_folder.py documents/ --yes + +# Use 8 parallel workers (default is your CPU count) +python scripts/clean_mds_in_folder.py documents/ --workers 8 + +# Use custom cleaning patterns +python scripts/clean_mds_in_folder.py documents/ --config my_patterns.yaml + +# Combine options +python scripts/clean_mds_in_folder.py documents/ --yes --workers 4 +``` + +**Features:** +- Recursively finds all `.md` files in the specified folder and subfolders +- Processes files in parallel using multiple CPU cores for faster processing +- Shows real-time progress bar with `tqdm` +- Cleans files in-place (modifies original files) +- Asks for confirmation before processing (unless `--yes` is used) +- Continues processing even if some files fail +- Reports all successful and failed files at the end + +**Script Options:** +- `folder`: Path to folder containing markdown files (required) +- `-y`, `--yes`: Skip confirmation prompt and proceed immediately +- `-w`, `--workers`: Number of parallel workers (default: CPU count) +- `--config`: Path to custom YAML configuration file + +**Note:** Requires `tqdm` for the progress bar: +```bash +pip install tqdm +``` + ## Configuration The default cleaning patterns are defined in `default_cleaning_patterns.yaml` and include: @@ -88,16 +190,22 @@ The default cleaning patterns are defined in `default_cleaning_patterns.yaml` an ## Options -- `remove_short_lines`: Remove lines shorter than `min_line_length` (default: 70 characters) -- `remove_whole_lines`: Remove lines matching specific patterns -- `remove_sections`: Remove entire sections based on section headings -- `remove_duplicate_headlines`: Remove duplicate headlines based on threshold -- `remove_duplicate_headlines_threshold`: Threshold for duplicate headline removal -- `remove_footnotes_in_text`: Remove footnote references -- `replace_within_lines`: Replace specific patterns within lines -- `remove_within_lines`: Remove specific patterns within lines -- `contract_empty_lines`: Normalize whitespace -- `crimp_linebreaks`: Improve line break formatting +All available `CleanerOptions`: + +- `fix_encoding_mojibake`: Fix encoding issues and mojibake using ftfy (default: False) +- `normalize_quotation_symbols`: Normalize various quotation marks to standard ASCII quotes (default: False) +- `remove_short_lines`: Remove lines shorter than `min_line_length` (default: True) +- `min_line_length`: Minimum line length to keep when `remove_short_lines` is enabled (default: 70) +- `remove_whole_lines`: Remove lines matching specific patterns (default: True) +- `remove_sections`: Remove entire sections based on section headings (default: True) +- `remove_duplicate_headlines`: Remove duplicate headlines based on threshold 
(default: True) +- `remove_duplicate_headlines_threshold`: Number of occurrences needed to consider a headline duplicate (default: 2) +- `remove_footnotes_in_text`: Remove footnote references like ".1" or ".23" (default: True) +- `replace_within_lines`: Replace specific patterns within lines (default: True) +- `remove_within_lines`: Remove specific patterns within lines (default: True) +- `contract_empty_lines`: Reduce multiple consecutive empty lines to one (default: True) +- `crimp_linebreaks`: Fix line break errors from PDF conversion (default: True) +- `remove_references_heuristically`: Heuristically detect and remove bibliographic reference lines by scoring lines based on bibliographic patterns (default: True) ## License diff --git a/scripts/clean_mds_in_folder.py b/scripts/clean_mds_in_folder.py new file mode 100644 index 0000000..2433936 --- /dev/null +++ b/scripts/clean_mds_in_folder.py @@ -0,0 +1,170 @@ +#!/usr/bin/env python3 +""" +Script to clean all markdown files in a folder and its subfolders in parallel. +Processes files in-place using ProcessPoolExecutor for efficient parallel processing. +""" + +import argparse +import sys +from pathlib import Path +from concurrent.futures import ProcessPoolExecutor, as_completed +from typing import Tuple, Optional +import multiprocessing + +try: + from tqdm import tqdm +except ImportError: + print("Error: tqdm is required for progress tracking.", file=sys.stderr) + print("Install it with: pip install tqdm", file=sys.stderr) + sys.exit(1) + +from markdowncleaner import MarkdownCleaner +from markdowncleaner.config.loader import get_default_patterns, CleaningPatterns + + +def clean_single_file(file_path: Path, config_path: Optional[Path] = None) -> Tuple[Path, bool, Optional[str]]: + """ + Clean a single markdown file in-place. 
+ + Args: + file_path: Path to the markdown file to clean + config_path: Optional path to custom YAML configuration + + Returns: + Tuple of (file_path, success, error_message) + """ + try: + # Load patterns + if config_path: + patterns = CleaningPatterns.from_yaml(config_path) + else: + patterns = get_default_patterns() + + # Initialize cleaner + cleaner = MarkdownCleaner(patterns=patterns) + + # Clean the file in-place + cleaner.clean_markdown_file(input_file=file_path, output_file=file_path) + + return (file_path, True, None) + except Exception as e: + return (file_path, False, str(e)) + + +def parse_args() -> argparse.Namespace: + """Parse command line arguments.""" + parser = argparse.ArgumentParser( + description="Clean all markdown files in a folder and its subfolders in parallel.", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + %(prog)s documents/ + %(prog)s documents/ --yes + %(prog)s documents/ --workers 8 + %(prog)s documents/ --config custom_patterns.yaml + """ + ) + + parser.add_argument( + "folder", + type=Path, + help="Folder containing markdown files to clean" + ) + + parser.add_argument( + "-y", "--yes", + action="store_true", + help="Skip confirmation prompt and proceed immediately" + ) + + parser.add_argument( + "-w", "--workers", + type=int, + default=None, + help=f"Number of parallel workers (default: {multiprocessing.cpu_count()})" + ) + + parser.add_argument( + "--config", + type=Path, + help="Path to custom YAML configuration file with cleaning patterns" + ) + + return parser.parse_args() + + +def main() -> int: + """Main entry point for the script.""" + args = parse_args() + + # Validate folder exists + if not args.folder.exists(): + print(f"Error: Folder '{args.folder}' does not exist.", file=sys.stderr) + return 1 + + if not args.folder.is_dir(): + print(f"Error: '{args.folder}' is not a directory.", file=sys.stderr) + return 1 + + # Find all markdown files recursively + print(f"Searching for markdown files in '{args.folder}'...") + md_files = list(args.folder.rglob("*.md")) + + if not md_files: + print("No markdown files found.") + return 0 + + # Show count and ask for confirmation + print(f"Found {len(md_files)} markdown file(s).") + + if not args.yes: + response = input("Proceed with cleaning? 
[y/N]: ") + if response.lower() not in ['y', 'yes']: + print("Aborted.") + return 0 + + # Determine number of workers + max_workers = args.workers or multiprocessing.cpu_count() + print(f"Processing files using {max_workers} parallel worker(s)...") + + # Process files in parallel with progress bar + successful = [] + failed = [] + + with ProcessPoolExecutor(max_workers=max_workers) as executor: + # Submit all tasks + future_to_file = { + executor.submit(clean_single_file, file_path, args.config): file_path + for file_path in md_files + } + + # Process completed tasks with progress bar + with tqdm(total=len(md_files), unit="file") as pbar: + for future in as_completed(future_to_file): + file_path, success, error = future.result() + + if success: + successful.append(file_path) + else: + failed.append((file_path, error)) + + pbar.update(1) + + # Report results + print(f"\n{'='*60}") + print(f"Successfully cleaned: {len(successful)} file(s)") + + if failed: + print(f"Failed: {len(failed)} file(s)") + print("\nFailed files:") + for file_path, error in failed: + print(f" - {file_path}") + print(f" Error: {error}") + return 1 + else: + print("All files processed successfully!") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/src/markdowncleaner/cli.py b/src/markdowncleaner/cli.py index bdde4a0..1d12f26 100644 --- a/src/markdowncleaner/cli.py +++ b/src/markdowncleaner/cli.py @@ -43,6 +43,18 @@ def parse_args(args: Optional[List[str]] = None) -> argparse.Namespace: ) # Options for customizing the cleaning process + parser.add_argument( + "--fix-encoding", + action="store_true", + help="Fix encoding mojibake" + ) + + parser.add_argument( + "--normalize-quotation", + action="store_true", + help="Normalize quotation symbols" + ) + parser.add_argument( "--keep-short-lines", action="store_true", @@ -99,11 +111,17 @@ def parse_args(args: Optional[List[str]] = None) -> argparse.Namespace: ) parser.add_argument( - "--no-crimp-linebreaks", + "--no-crimping", action="store_true", help="Don't crimp linebreaks" ) + parser.add_argument( + "--keep-references", + action="store_true", + help="Don't heuristically detect and remove references" + ) + return parser.parse_args(args) @@ -113,6 +131,8 @@ def main(args: Optional[List[str]] = None) -> int: # Configure cleaner options based on arguments options = CleanerOptions() + options.fix_encoding_mojibake = parsed_args.fix_encoding + options.normalize_quotation_symbols = parsed_args.normalize_quotation options.remove_short_lines = not parsed_args.keep_short_lines options.min_line_length = parsed_args.min_line_length options.remove_whole_lines = not parsed_args.keep_bad_lines @@ -122,7 +142,8 @@ def main(args: Optional[List[str]] = None) -> int: options.replace_within_lines = not parsed_args.no_replacements options.remove_within_lines = not parsed_args.keep_inline_patterns options.contract_empty_lines = not parsed_args.keep_empty_lines - options.crimp_linebreaks = not parsed_args.no_crimp_linebreaks + options.crimp_linebreaks = not parsed_args.no_crimping + options.remove_references_heuristically = not parsed_args.keep_references # Load patterns from custom config or use defaults if parsed_args.config: diff --git a/src/markdowncleaner/config/default_cleaning_patterns.yaml b/src/markdowncleaner/config/default_cleaning_patterns.yaml index 5bf8b4f..3f3469e 100644 --- a/src/markdowncleaner/config/default_cleaning_patterns.yaml +++ b/src/markdowncleaner/config/default_cleaning_patterns.yaml @@ -44,7 +44,8 @@ sections_to_remove: - List of 
Illustrations - |- ^\s?Keywords\s?\.?$ - - Authors' Note + - ^\bauthors?[\''']?\s+note\b + - ^\bAUTHORS?[\''']?\s+NOTE\b - |- ^[a-zA-Z0-9]+-[a-zA-Z0-9]+-?[a-zA-Z0-9]+[^\s]*$ - |- @@ -62,6 +63,7 @@ sections_to_remove: - |- Published by\: - Preface + - Note on sources # Patterns for inline text that should be removed bad_inline_patterns: @@ -81,6 +83,9 @@ bad_inline_patterns: \* 1 , - |- \(p\.?\d{1,3}\) + - \\footnote\{[^}]*\} +# line ending with ... + - \.\.\.$ # Patterns for entire lines that should be removed bad_lines_patterns: @@ -110,10 +115,8 @@ bad_lines_patterns: - |- 10\.\d{4,9}/[-._;()/:\w]+ - Digital Object Identifier - - |- - ^[Kk]eywords - - |- - ^Conflict of Interest + - ^[Kk]ey\s?words + - ^Conflict of Interest - ^Library of Congress - ^Open Access - ^OpenAccess @@ -128,10 +131,8 @@ bad_lines_patterns: - ^Table of Contents - |- ^Source: - - |- - ^Published by - - |- - ^CCS Concepts + - ^Published by + - ^CCS Concepts - ISSN - ISBN - |- @@ -160,10 +161,19 @@ bad_lines_patterns: - ™ - |- c \d{4} + # Unicode footnote numbers + - ^([⁰¹²³⁴⁵⁶⁷⁸⁹]+) + # Latex OCRed footnotes, text and math environment + - ^\\textsuperscript\{(\d+)\} + - ^\\\(\^\{?(\d+)\}?\\\) + # bibliography + - ^\\bibitem\{[^\s}]+\} + # footnotes: line starting with number and "ibid" + - ^\d+.*\b[Ii]bid\b - |- - ^\d{1,3}\s?.\s?[Ss]ee + ^\d{1,3}\s?.\s?([Ss]ee|[Nn]ote) - |- - ^\-\s?\d{1,3}\s?.\s?[Ss]ee + ^\-\s?\d{1,3}\s?.\s?([Ss]ee|[Nn]ote) - |- ^\d{1,2}[A-Za-z]+ - |- @@ -215,6 +225,8 @@ bad_lines_patterns: - All rights reserved - No portion of this publication may be reproduced - authors have asserted their rights to be identified as the authors + - authors contributed equally + - ^\*{0,2} ?[Aa]uthor contributions - material in this work has been asserted in accordance - imprint of Macmillan Publishers - A catalogue record for this book is available @@ -223,21 +235,13 @@ bad_lines_patterns: - This article is published with open access - Links to third party websites - Registered in England and Wales - - |- - \*I thank - - |- - \*We thank - - |- - \*For comments - - |- - \*I would like to thank - - |- - \*We would like to thank - - |- - \*Thanks to + # Thanks + - \b(I|[Ww]e)\b.*\b(thank|grateful|indebted)\b.*\b(feedback|participants|individuals|referee|publisher|reviewer|NSF|NEH|NIH|discussions|comments)\b.* + - \*For comments + - \*Thanks to + - Many thanks to - |- \*\s?This article is - - I am grateful for - Oxford University Press is a department of - a registered trademark of - Published in the United States @@ -248,14 +252,24 @@ bad_lines_patterns: - catalogue recordfor this publication - An earlier version of this paper was - Springer International Publishing AG + - Springer Nature remains neutral - part of Springer Nature - This work is subject to copyright. - permission to reuse the copyright - distributed under the terms of a Creative Commons + - author(s) received no financial support + - author(s) declared no + - ^The views expressed in this + - ^Competing interests - |- , [A-Z]{2} \d{5}(?:-\d{4})?, +# some bibliography patterns + - ^[A-Z][a-z]+(?:,\s[A-Z][a-z]+)*(?:,\s(?:and\s)?[A-Z][a-z]+)*\.\s\d{4}\. + - ^([^(]+)\.\s+\((\[?\d{4}\]?(?:\s+\d{4})?)\)\s+(.+)$ -# Footnotes +# In-line footnote numbers, will be replaced with '.' if matched and if remove_footnotes_in_text +footnote_patterns: + - \\footnote\{[^}]*\} # Pattern matches: # 1. A period # 2. Optional space @@ -263,9 +277,13 @@ bad_lines_patterns: # 4. Optionally followed by comma and space and more 1-3 digit numbers # 5. 
This pattern repeats for any number of comma-separated numbers # 6. Followed by either end of string, whitespace, or newline -footnote_patterns: - |- \.\s?(\d{1,3}(?:,\s*\d{1,3})*?)(?=\s|$) + - \.\s?(\d{1,3}(?:,\s*\d{1,3})*?)(?=\s|$) +# Latex superscripted numerals in text and math environment like \textsuperscript{12} or .\(^{12}\) + - \.\\textsuperscript\{(\d+)\}(?=\s|$) + - \.\\\(\^\{?(\d+)\}?\\\)(?=\s|$) +# Unicode superscript numerals + - \.([⁰¹²³⁴⁵⁶⁷⁸⁹]+)(?=\s|$) + replacements: 'GLYPH<28>': 'fi' @@ -293,4 +311,6 @@ replacements: 'GLYPH<2>': 'Š' '/ornm58.ornm': '' '/ornm69.ornm': '' - '&': '&' \ No newline at end of file + '&': '&' + # note: need to escape the \ below (these are read as regex) + '\\item': '-' \ No newline at end of file diff --git a/src/markdowncleaner/markdowncleaner.py b/src/markdowncleaner/markdowncleaner.py index 05ffac1..f58851a 100644 --- a/src/markdowncleaner/markdowncleaner.py +++ b/src/markdowncleaner/markdowncleaner.py @@ -11,6 +11,8 @@ @dataclass class CleanerOptions: """Container for Cleaner options""" + fix_encoding_mojibake: bool = False + normalize_quotation_symbols: bool = False remove_short_lines: bool = True min_line_length: int = 70 remove_whole_lines: bool = True @@ -22,17 +24,19 @@ class CleanerOptions: remove_within_lines: bool = True contract_empty_lines: bool = True crimp_linebreaks: bool = True + remove_references_heuristically: bool = True class MarkdownCleaner: """Class to handle markdown document cleaning operations.""" - def __init__(self, patterns: CleaningPatterns = None, options: Optional[CleanerOptions] = None): + def __init__(self, patterns: Optional[CleaningPatterns] = None, options: Optional[CleanerOptions] = None): """ Initialize the cleaner with patterns. Args: patterns: CleaningPatterns instance, or None to use defaults + options: CleanerOptions instance, or None to use defaults """ self.patterns = patterns or get_default_patterns() self.options = options or CleanerOptions() @@ -86,14 +90,20 @@ def clean_markdown_string(self, content: str) -> str: """Apply all cleaning operations to the content.""" # Apply all default ftfy fixes if mojibake is detected - if ftfy.is_bad(content): - content = ftfy.fix_text(content) + if self.options.fix_encoding_mojibake: + if ftfy.is_bad(content): + content = ftfy.fix_text(content) + # Heuristically detect and remove blocks of lines with bibliographic information + if self.options.remove_references_heuristically: + content = self._remove_bibliographic_lines(content) + # Reduce two or more subsequent spaces to a single space content = re.sub(r' {2,}', ' ', content) # Normalize quotes - content = self._normalize_quotation_symbols(content) + if self.options.normalize_quotation_symbols: + content = self._normalize_quotation_symbols(content) # Remove lines shorter than min_line_length (default: 70 characters) if self.options.remove_short_lines: @@ -116,9 +126,9 @@ def clean_markdown_string(self, content: str) -> str: for k, v in self.patterns.replacements.items(): content = self._replace_within_lines(content, k, v) - # Replace footnote pattern (numbers at end of sentence) with '.' + # Replace footnote pattern (numbers at end of sentence) with '. ' if self.options.remove_footnotes_in_text: - content = self._replace_within_lines(content, self.patterns.footnote_patterns, '.') + content = self._replace_within_lines(content, self.patterns.footnote_patterns, '. 
') # Remove remaining unwanted inline patterns (some may have been replaced by replacements) if self.options.remove_within_lines: @@ -132,6 +142,134 @@ def clean_markdown_string(self, content: str) -> str: return content + def _remove_bibliographic_lines(self, text: str, score_threshold: int = 3) -> str: + """ + Remove bibliographic reference lines from text. + + Detects and removes individual lines that appear to be bibliography entries + by scoring each line based on bibliographic patterns. + + Args: + text: Input text to clean + score_threshold: Minimum score for a line to be removed (default: 3) + + Returns: + Text with bibliographic lines removed + """ + lines = text.splitlines() + result_lines = [] + + for line in lines: + score = self._score_bibliography_line(line) + if score < score_threshold: + result_lines.append(line) + + return '\n'.join(result_lines) + + def _score_bibliography_line(self, line: str) -> int: + """ + Score a line based on bibliographic patterns. + + Args: + line: Line of text to score + + Returns: + Integer score based on number of bibliographic patterns matched + """ + score = 0 + stripped = line.strip() + + # Don't score very short lines + if len(stripped) < 20: + return 0 + + # 1 point patterns + + # Year in parentheses: (1984), (2020), or [1960] + if re.search(r'\([12][089]\d{2}\)|\[[12][089]\d{2}\]', line): + score += 1 + + # Page ranges: 35-57, pp. 332-487, 283-310 + if re.search(r'\bpp?\.\s*\d+-\d+|\b\d{2,3}-\d{2,3}\b', line): + score += 1 + + # Publisher/location: "Cambridge, MA: Harvard", "Oxford: Clarendon Press" + if re.search(r'[A-Z][a-z]+(?:,\s*[A-Z]{2})?:\s*[A-Z][a-z]+', line): + score += 1 + + # Numbered list format: starts with "1. ", "14. ", etc. + if re.match(r'^\s*\d{1,3}\.\s+', line): + score += 1 + + # Bullet format: starts with "- " or "• " + if re.match(r'^\s*[-•]\s+', line): + score += 1 + + # Italic markers: *Title Text* + if re.search(r'\*[^*]+\*', line): + score += 1 + + # "In:" followed by capital letter + if re.search(r'\bIn:\s+[A-Z]', line): + score += 1 + + # Ampersand: " & " in author context + if ' & ' in line: + score += 1 + + # Multiple initials: "J. B. Wiesner", "M.A.", "H. F." + if re.search(r'\b[A-Z]\.\s*[A-Z]\.|\b[A-Z]\.[A-Z]\.', line): + score += 1 + + # "et al." + if 'et al.' in line: + score += 1 + + # Author name patterns: "LastName, FirstInitial." or "LastName, FirstName" at line start + if re.match(r'^\s*[A-Z][a-z]+,\s+[A-Z]', line): + score += 1 + + # Punctuation density: >8% of characters are . , : ; ( ) + if len(line) > 0: + punct_chars = sum(1 for c in line if c in '.,;:()') + if punct_chars / len(line) > 0.08: + score += 1 + + # 2 point patterns + + # Common journal names + if re.search(r'\b(journal|proceedings|review|quarterly|annals|transactions|bulletin|University Press)\b', line, re.IGNORECASE): + score += 2 + + # Author initial and date without brackets separated with dots + if re.search(r' [A-Za-z]\. \d{4}\. ', line): + score += 2 + if re.search(r' [A-Za-z]\.\, \d{4}\, ', line): + score += 2 + # Author lastname, first abbreviated, date in brackets, then title, e.g., Axelrod, R. (1984) T + if re.search(r'\w+,\s+([A-Z]\.)+\s+\(\d{4}\)\s+[A-Z]', line): + score += 2 + + # volume and page ranges + if re.search(r' \d{2,3}: \d{1,4}[-–—]\d{2,4}', line): + score += 2 + + # 3 point patterns + + # Volume/issue: 121(3), 14(2), Vol. 
I, vol(issue) + if re.search(r'\b\d+\(\d+\)\b|Vol\.\s*[IVX]+|vol\.\s*\d+', line, re.IGNORECASE): + score += 3 + + # DOI: doi.org/, DOI: + if re.search(r'doi\.org/|DOI:', line, re.IGNORECASE): + score += 3 + + # Editor markers: "Ed." or "Eds." (as standalone word or in parentheses) + if re.search(r'\bEds?\.\b|\(Eds?\.\)', line): + score += 3 + + return score + def _normalize_quotation_symbols(self, text: str) -> str: """ Normalizes quotation symbols in the input text. @@ -209,10 +347,10 @@ def _remove_short_lines(self, multiline_string: str, length: int = 70) -> str: # Split the content into lines lines = multiline_string.splitlines() - # Filter out lines that are shorter than length but that are neither empty nor start with '#' + # Filter out lines that are shorter than length but that are neither empty nor start with '#' nor with a pattern indicating a markdown list like '1. ' filtered_lines = [] for line in lines: - if not line.strip() == '' and not line.startswith('#') and len(line) < length: + if not line.strip() == '' and not line.startswith('#') and not re.match(r'^\d{1,2}\.\s', line) and len(line) < length: continue filtered_lines.append(line) @@ -366,51 +504,92 @@ def _crimp_linebreaks(self, markdown_text: str) -> str: markdown_text (str): Input markdown text with potential line break errors The function handles two cases: - 1. Hyphenated words split across lines (even with one empty line in between) - 2. Paragraphs incorrectly split by empty lines when a line ends with a letter + 1. Connective-based crimping: Lines ending with -, –, —, or ... + 2. Justified text crimping: Adjacent lines of similar length Returns: - str: Text with all patterns removed + str: Text with crimped linebreaks """ - lines = markdown_text.splitlines() result_lines = [] i = 0 + def _is_list_item(line: str) -> bool: + if not line: + return False + + # Check condition 1: starts with list marker + if line[0] in '-–—*∙•・◦●○': + return True + + # Check condition 2: contains list punctuation in first 5 chars + first_five = line[:5] + if any(char in first_five for char in '.)*]'): + return True + + # Check condition 3: starts with numeral + if line[0].isdigit(): + return True + + return False + while i < len(lines): current_line = lines[i].strip() # Try to join as many consecutive lines as possible while True: - joined = False - # Case 1: Handle hyphenated words - if current_line.endswith('-'): + # Case 1: Connective-based crimping + if current_line and current_line.endswith(('-', '–', '—', '...')): + # Find next non-empty line within 3 lines (max 2 empty lines between) j = i + 1 - while j < len(lines) and not lines[j].strip(): - j += 1 + empty_count = 0 + while j < len(lines) and empty_count <= 2: + if not lines[j].strip(): + empty_count += 1 + j += 1 + else: + break - if j < len(lines) and lines[j].strip() and lines[j].strip()[0].islower(): - current_line = current_line[:-1] + lines[j].strip() - i = j # Update i to the last joined line - joined = True - continue # Skip to next join check + # Check if we found a valid next line + if j < len(lines) and lines[j].strip(): + next_line = lines[j].strip() + + # Check all conditions + if (next_line[0].isalpha() and # Starts with letter + not _is_list_item(next_line) and # Not a list item + '.' in next_line[6:]): # Contains '.' at position 6 or later + + # Remove hyphen if present, otherwise add space + if current_line.endswith(('-', '–', '—')): + current_line = current_line[:-1] + next_line + else: # ends with '...' 
+ current_line = current_line + ' ' + next_line + + i = j # Update i to the last joined line + continue - # Case 2: Handle paragraph merging - if not current_line.startswith('#') and current_line and (current_line[-1].isalpha() or current_line[-1] in ',;\'\"'): - _logger.debug(f'Crimping line:... {current_line[-50:]}') - j = i + 1 - while j < len(lines) and not lines[j].strip(): - j += 1 + # Case 2: Justified text crimping + if (current_line and + current_line[-1].isalpha() and # Ends with letter + not current_line.startswith('#') and # Not a heading + not _is_list_item(current_line)): # Not a list item - if j < len(lines) and lines[j].strip() and \ - not lines[j].strip().startswith('#') and \ - not lines[j].strip().startswith('*') and \ - not lines[j].strip().startswith('-'): - current_line = current_line + ' ' + lines[j].strip() - i = j # Update i to the last joined line - joined = True # noqa: F841 - continue # Skip to next join check + # Check immediately next line (L+1) + j = i + 1 + if j < len(lines) and lines[j].strip(): + next_line = lines[j].strip() + + # Check all conditions + if (next_line[0].isalpha() and # Starts with letter + not next_line.startswith('#') and # Not a heading + not _is_list_item(next_line) and # Not a list item + len(next_line) >= 78 and # Length >= 78 + abs(len(next_line) - len(current_line)) <= 10): # Within ±10 + + current_line = current_line + ' ' + next_line + i = j # Update i to the last joined line + continue # If no joins were made, break the loop break diff --git a/tests/test_cleaner.py b/tests/test_cleaner.py index 10806a3..1c7d6de 100644 --- a/tests/test_cleaner.py +++ b/tests/test_cleaner.py @@ -179,9 +179,10 @@ def test_remove_duplicate_headlines(self): self.assertEqual(self.cleaner._remove_duplicate_headlines(text_case), text_case) def test_crimp_linebreaks(self): - text = "This line ends \nwith an awkward break." + # Test connective-based crimping (line ending with hyphen) + text = "This line ends with a hy-\nphenated word that continues." result = self.cleaner._crimp_linebreaks(text) - self.assertEqual(result, "This line ends with an awkward break.") + self.assertEqual(result, "This line ends with a hyphenated word that continues.") def test_clean_markdown_file(self): # Create a test markdown file diff --git a/tests/test_cli.py b/tests/test_cli.py index ceef79a..db80dba 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -40,7 +40,7 @@ def test_parse_args_all_options(self): '--no-replacements', '--keep-inline-patterns', '--keep-empty-lines', - '--no-crimp-linebreaks', + '--no-crimping', ]) self.assertEqual(args.input_file, Path('somefile.md')) @@ -56,7 +56,7 @@ def test_parse_args_all_options(self): self.assertTrue(args.no_replacements) self.assertTrue(args.keep_inline_patterns) self.assertTrue(args.keep_empty_lines) - self.assertTrue(args.no_crimp_linebreaks) + self.assertTrue(args.no_crimping) def test_main_success(self): # Create a test markdown file
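As a usage note for the heuristic reference removal introduced in this change: the README documents the `remove_references_heuristically` option and the matching `--keep-references` CLI flag, and the sketch below shows both sides of that toggle through the Python API. The sample strings are made up for illustration; the expectation comments describe likely behaviour of the scoring heuristic (default threshold of 3 in `_remove_bibliographic_lines`), not guaranteed output.

```python
from markdowncleaner import MarkdownCleaner, CleanerOptions

# Hypothetical sample: one ordinary prose line plus one reference-like line.
text = (
    "Cooperation can emerge among self-interested agents when interactions are repeated over time.\n"
    "\n"
    "Axelrod, R. (1984). The Evolution of Cooperation. New York: Basic Books, pp. 3-24.\n"
)

# Default options: remove_references_heuristically is True, so the citation-like
# line should score at or above the threshold and be dropped, while the long
# prose line is expected to be kept.
print(MarkdownCleaner().clean_markdown_string(text))

# Equivalent of the --keep-references CLI flag: disable the heuristic entirely.
options = CleanerOptions()
options.remove_references_heuristically = False
print(MarkdownCleaner(options=options).clean_markdown_string(text))
```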
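The connective-based crimping described in the CHANGELOG is easiest to see on the internal helper that the updated test exercises. This is a minimal sketch using the private `_crimp_linebreaks` method; going through `clean_markdown_string` instead would also apply the other cleaning steps (for example short-line removal), so the raw helper is clearer for demonstration.

```python
from markdowncleaner import MarkdownCleaner

cleaner = MarkdownCleaner()

# A hyphen-split word across a line break, as in the updated test case:
# the trailing '-' is treated as a connective and the two lines are joined.
broken = "This line ends with a hy-\nphenated word that continues."
print(cleaner._crimp_linebreaks(broken))
# Expected (per tests/test_cleaner.py):
# "This line ends with a hyphenated word that continues."
```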
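For the renamed and newly added CLI flags, a quick sanity check of the flag-to-attribute mapping can be written against `parse_args` in `markdowncleaner.cli`, mirroring the existing CLI tests. A minimal sketch, assuming the package layout shown in the diff:

```python
from markdowncleaner.cli import parse_args

# parse_args accepts an argv-style list; these are the flags touched by this change.
args = parse_args([
    "input.md",
    "--fix-encoding",
    "--normalize-quotation",
    "--no-crimping",        # renamed from --no-crimp-linebreaks
    "--keep-references",    # disables heuristic reference removal
])

assert args.fix_encoding and args.normalize_quotation
assert args.no_crimping and args.keep_references
```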