Skip to content

Commit

Permalink
Added logic to strip non-printable whitespace characters
Browse files Browse the repository at this point in the history
  • Loading branch information
beveradb committed Sep 17, 2024
1 parent bae8c08 commit 61840ce
Show file tree
Hide file tree
Showing 4 changed files with 432 additions and 364 deletions.
3 changes: 2 additions & 1 deletion karaoke_lyrics_processor/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,8 @@ def main():
if args.output:
output_filename = args.output
else:
output_filename = f"{filename_parts[0]} (Lyrics Processed).{filename_parts[1]}"
base_name = filename_parts[0].replace(" (Lyrics)", "")
output_filename = f"{base_name} (Lyrics Processed).{filename_parts[1]}"

processor = KaraokeLyricsProcessor(
log_level=log_level,
Expand Down
24 changes: 23 additions & 1 deletion karaoke_lyrics_processor/karaoke_lyrics_processor.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import re
import logging
import pyperclip
import unicodedata


class KaraokeLyricsProcessor:
Expand Down Expand Up @@ -87,11 +88,29 @@ def find_best_split_point(self, line):
self.logger.debug(f"Line is still too long, forcibly splitting at position {forced_split_point}")
return forced_split_point

def replace_non_printable_spaces(self, text):
    """
    Replace space-like characters with regular spaces and tidy the result.

    Every whitespace character other than line feed and carriage return
    (tabs, non-breaking spaces, en/em spaces, ideographic spaces, ...) is
    replaced with a single ASCII space. Runs of spaces are then collapsed
    into one space and the whole string is stripped.

    Note that the final strip removes *all* leading/trailing whitespace,
    including newlines at the very start or end of ``text``; only interior
    newlines are preserved. Spaces directly next to an interior newline are
    left as they are.

    Args:
        text: The string to clean. May contain multiple lines.

    Returns:
        The cleaned string.
    """
    self.logger.debug(f"Replacing non-printable spaces in: {text}")
    # The first alternative matches any whitespace except newline characters.
    # The explicit code points must live inside a character class: written
    # as a bare alternative, "\u2000-\u200A" would match the literal
    # three-character sequence U+2000, "-", U+200A rather than the range.
    space_pattern = r"[^\S\n\r]|[\u00A0\u1680\u2000-\u200A\u202F\u205F\u3000]"
    # Replace matched characters with a regular space
    cleaned_text = re.sub(space_pattern, " ", text)
    # Collapse runs of spaces, then strip leading/trailing whitespace
    # (this also drops newlines at the very ends of the text).
    cleaned_text = re.sub(r" +", " ", cleaned_text).strip()
    self.logger.debug(f"Text after replacing non-printable spaces: {cleaned_text}")
    return cleaned_text

def process_line(self, line):
"""
Process a single line to ensure it's within the maximum length,
and handle parentheses.
handle parentheses, and replace non-printable spaces.
"""
# Replace non-printable spaces at the beginning
line = self.replace_non_printable_spaces(line)

processed_lines = []
iteration_count = 0
max_iterations = 100 # Failsafe limit
Expand Down Expand Up @@ -153,6 +172,9 @@ def process(self):

processed_lyrics_text = "\n".join(lyrics_lines)

# Final pass to replace any remaining non-printable spaces
processed_lyrics_text = self.replace_non_printable_spaces(processed_lyrics_text)

self.processed_lyrics_text = processed_lyrics_text
pyperclip.copy(processed_lyrics_text)

Expand Down
Loading

0 comments on commit 61840ce

Please sign in to comment.