# consolidate_epub.py
import os
import re
from pathlib import Path
class EpubConsolidator:
    """Merge the ordered HTML files of one book folder into plain-text segments.

    The folder must contain a ``files_order.txt`` file that lists, one name
    per line, the files (relative to the folder) to concatenate in order.
    """

    # Substrings that mark a file as a copyright page. They are compared
    # against lower-cased text, so every entry must itself be lower-case.
    _COPYRIGHT_KEYWORDS = (
        "copyright",
        "all rights reserved",
        "isbn",
        "library of congress",
    )

    # A file whose markup makes up more than this share of its characters is
    # considered to carry no real text.
    _MAX_MARKUP_RATIO = 0.9
    # Files with fewer lines, or a shorter average line, than these values are
    # treated as an index or footnote page.
    _MIN_LINES = 5
    _MIN_AVERAGE_LINE_LENGTH = 40

    # Substitutions applied in this exact order by
    # remove_html_tags_and_empty_lines().
    _CLEANUP_STEPS = (
        # Collapse every whitespace run (including the original newlines) so
        # that the only line breaks left are the ones inserted below.
        (re.compile(r"\s+"), " "),
        # Opening tags vanish; each closing tag becomes a line break.
        (re.compile(r"<div[^>]*>"), ""),
        (re.compile(r"</div>"), "\n"),
        (re.compile(r"<p[^>]*>"), ""),
        (re.compile(r"</p>"), "\n"),
        (re.compile(r"<h1[^>]*>"), ""),
        (re.compile(r"</h1>"), "\n"),
        (re.compile(r"<a[^>]*>"), ""),
        (re.compile(r"</a>"), "\n"),
        (re.compile(r"<span[^>]*>"), ""),
        (re.compile(r"<link[^>]*/>"), ""),
        (re.compile(r"</span>"), "\n"),
        # DOCTYPE declarations.
        (re.compile(r"<!DOCTYPE[^>]*>"), ""),
        # Non-breaking-space entities become ordinary spaces.
        (re.compile(r"&nbsp;|&#160;|&#[xX][aA]0;"), " "),
        # CSS blocks and HTML comments, which may span inserted line breaks.
        (re.compile(r"<style.*?>.*?</style>", re.DOTALL), ""),
        (re.compile(r"<!--.*?-->", re.DOTALL), ""),
        # Whatever markup is still left.
        (re.compile(r"<.*?>"), ""),
    )

    def __init__(self, base_path, character_limit=350000):
        """Load the file order of the book stored in *base_path*.

        Args:
            base_path: Folder holding the book files and ``files_order.txt``.
            character_limit: Approximate maximum number of characters
                (line breaks not counted) written to one output segment.

        Raises:
            FileNotFoundError: If ``files_order.txt`` does not exist.
        """
        self.base_path = Path(base_path)
        self.order_file = self.base_path / "files_order.txt"
        self.order = self.read_order_file()
        self.character_limit = character_limit

    def read_order_file(self):
        """Return the file names listed in the order file, in file order.

        Surrounding whitespace is stripped and blank lines are ignored, so an
        empty line is never mistaken for a file name. ``utf-8-sig`` is used so
        that a leading byte-order mark does not end up in the first name.
        """
        with open(self.order_file, "r", encoding="utf-8-sig") as file:
            stripped = (line.strip() for line in file)
            return [name for name in stripped if name]

    def remove_html_tags_and_empty_lines(self, text):
        """Strip markup from *text* and return its non-empty lines.

        Whitespace is collapsed first; closing ``div``, ``p``, ``h1``, ``a``
        and ``span`` tags are turned into line breaks; style blocks, comments
        and all remaining tags are removed. Lines that are empty or contain
        only whitespace are dropped from the result.
        """
        for pattern, replacement in self._CLEANUP_STEPS:
            text = pattern.sub(replacement, text)
        return "\n".join(line for line in text.split("\n") if line.strip())

    def consolidate_files(self):
        """Return the cleaned text of all usable files as one string.

        Files are visited in the order given by ``self.order``. A file is
        skipped (with a message on stdout) when it is missing, empty, almost
        entirely markup, contains a copyright keyword, or looks like an index
        or footnote page (few lines or very short lines). Every kept file is
        preceded by a ``Chapter: <file name>`` marker line.
        """
        chapters = []
        for file_name in self.order:
            full_file_path = self.base_path / file_name
            if not full_file_path.is_file():
                print(f"File {file_name} not found, skipping.")
                continue

            with open(full_file_path, "r", encoding="utf-8", errors="replace") as file:
                file_content = file.read()

            # Guard the ratio below against division by zero.
            if not file_content:
                print(f"File {file_name} is empty, skipping.")
                continue

            markup_length = sum(len(tag) for tag in re.findall(r"<.*?>", file_content))
            if markup_length / len(file_content) > self._MAX_MARKUP_RATIO:
                print(f"File {file_name} is mainly HTML, skipping.")
                continue

            cleaned_content = self.remove_html_tags_and_empty_lines(file_content)

            # Keyword check for copyright pages (case-insensitive).
            lowered = cleaned_content.lower()
            if any(keyword in lowered for keyword in self._COPYRIGHT_KEYWORDS):
                print(f"File {file_name} detected as a copyright page, skipping.")
                continue

            # Few or very short lines suggest an index or a footnote page.
            # The length test short-circuits, so the average is never taken
            # over zero lines.
            lines = [line for line in cleaned_content.split("\n") if line.strip()]
            if (
                len(lines) < self._MIN_LINES
                or sum(len(line) for line in lines) / len(lines)
                < self._MIN_AVERAGE_LINE_LENGTH
            ):
                print(f"File {file_name} seems to be an index or footnote, skipping.")
                continue

            # The file name doubles as the marker for the start of a chapter.
            chapters.append(f"\n\nChapter: {file_name}\n\n{cleaned_content}\n")

        return "".join(chapters)

    def save_consolidated_files(self, combined_files):
        """Write *combined_files* to ``book_segment_<n>.txt`` files.

        Existing ``book_segment_*.txt`` files in the book folder are deleted
        first. Lines are never split: a new segment is started when adding
        the next line would reach ``self.character_limit`` (line breaks are
        not counted). A segment that has no characters yet always accepts the
        next line, so an over-long line cannot cause a segment number to be
        skipped.
        """
        stem = "book_segment"

        # Materialise the listing before deleting anything from the folder.
        for old_file in list(self.base_path.glob(f"{stem}_*.txt")):
            old_file.unlink()

        segments = [[]]
        char_count = 0
        for line in combined_files.split("\n"):
            line_length = len(line)
            if char_count > 0 and char_count + line_length >= self.character_limit:
                segments.append([])
                char_count = 0
            segments[-1].append(line)
            char_count += line_length

        # One open() per segment instead of one per line.
        for file_number, segment_lines in enumerate(segments, start=1):
            output_file = self.base_path / f"{stem}_{file_number}.txt"
            with open(output_file, "w", encoding="utf-8") as f:
                f.writelines(f"{line}\n" for line in segment_lines)

        print(f"Files consolidated and saved to {stem}_*.txt files successfully.")
def consolidate(books_folder, character_limit=350000):
    """Consolidate every book folder found directly inside *books_folder*.

    Each sub-directory is handed to ``EpubConsolidator``; its files are
    combined and written back into that sub-directory as text segments.
    Entries that are not directories are ignored.

    Args:
        books_folder: Folder whose sub-directories each hold one book.
        character_limit: Passed through to ``EpubConsolidator``; defaults to
            the same value as the class itself.
    """
    books_folder = Path(books_folder)
    # Sorted so that the processing order (and the log) is reproducible.
    for book_folder in sorted(books_folder.iterdir()):
        if not book_folder.is_dir():
            continue
        print(f"----Consolidating files in {book_folder}----")
        try:
            consolidator = EpubConsolidator(book_folder, character_limit)
        except FileNotFoundError as err:
            # A folder without its order file must not abort the whole batch.
            print(f"Order file {err.filename} not found, skipping.")
            continue
        combined_files = consolidator.consolidate_files()
        consolidator.save_consolidated_files(combined_files)