-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsplit_proposal_pdf.py
More file actions
121 lines (103 loc) · 4.37 KB
/
split_proposal_pdf.py
File metadata and controls
121 lines (103 loc) · 4.37 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import fitz # pymupdf
import os
import re
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# Source document, looked up relative to the current working directory.
INPUT_PDF = "proposal.pdf"

# Root of everything the script writes, plus one sub-folder per output kind.
OUTPUT_DIR = "output"
CHAPTER_DIR = os.path.join(OUTPUT_DIR, "chapters")
GOAL_DIR = os.path.join(OUTPUT_DIR, "detailed_goals")

# Chapter titles searched for in the page text. They were taken from the
# Google Doc tab names and are expected to show up as headers in the PDF.
CHAPTER_TITLES = [
    "제안서",
    "프롬프트 예시",
    "To-dos",
    "기획회의",
    "일정",
    "이메일",
    "제안서 예시_",
    "rfp",
    "1. 연구필요성",
]
# Patterns for the 17 specific goals
# The user described the document as having 17 specific goals and studies in
# total ("세부 목표와 총 연구가 17개").
# Headings are assumed to read "세부목표 <number>" or "연구 <number>"; numbers
# above 17 are ignored. Other numbering schemes such as "1-1.", "1-2." are not
# detected, so adjust the pattern if the actual document uses them.
# Each distinct "<type> <number>" heading found is treated as its own item.
def ensure_dirs():
    """Create the chapter and goal output directories if they do not exist."""
    for directory in (CHAPTER_DIR, GOAL_DIR):
        # exist_ok=True makes repeated runs harmless.
        os.makedirs(directory, exist_ok=True)
def split_pdf():
    """Split INPUT_PDF into one PDF per chapter and one PDF per detailed goal.

    Every page's text is scanned once. The first page on which a title from
    CHAPTER_TITLES occurs is taken as that chapter's start, and the first page
    on which "세부목표 N" or "연구 N" (1 <= N <= 17) occurs is taken as that
    goal's start. A section runs up to the page before the next section of the
    same kind, or to the end of the document for the last one.

    Chapter files are written to CHAPTER_DIR as "NN_<title>.pdf" (NN is the
    chapter's position in page order); goal files are written to GOAL_DIR as
    "<type>_<N>.pdf". Progress is reported with print(). If INPUT_PDF does not
    exist, an error message is printed and nothing is written.

    Returns:
        None.
    """
    if not os.path.exists(INPUT_PDF):
        print(f"Error: {INPUT_PDF} not found. Please place the file in {os.getcwd()}")
        return

    # Compiled once, outside the per-page loop.
    goal_pattern = re.compile(r"(세부목표|연구)\s*(\d+)")

    # The context manager closes the source document even if a save fails.
    with fitz.open(INPUT_PDF) as doc:
        total_pages = len(doc)
        print(f"Opened {INPUT_PDF} with {total_pages} pages.")

        # 1. Find the first page of every chapter and goal.
        # Both dicts map a section title to its 0-based start page index.
        chapter_starts = {}
        goal_starts = {}
        print("Analyzing document structure...")
        for page_num in range(total_pages):
            text = doc.load_page(page_num).get_text()

            # Chapters: plain, case-sensitive substring match; only the first
            # occurrence of each title counts.
            for title in CHAPTER_TITLES:
                if title not in chapter_starts and title in text:
                    chapter_starts[title] = page_num
                    print(f"Found Chapter '{title}' at page {page_num+1}")

            # Goals: "세부목표 N" / "연구 N". Numbers outside 1..17 (for
            # example a year following "연구") are not goal headings.
            for type_, num_str in goal_pattern.findall(text):
                num = int(num_str)
                key = f"{type_} {num}"
                if 1 <= num <= 17 and key not in goal_starts:
                    goal_starts[key] = page_num
                    print(f"Found '{key}' at page {page_num+1}")

        # Do not rely on the caller having prepared the output folders.
        os.makedirs(CHAPTER_DIR, exist_ok=True)
        os.makedirs(GOAL_DIR, exist_ok=True)

        # 2. Extract chapters, numbered in page order.
        chapter_ranges = _page_ranges(chapter_starts, total_pages)
        for idx, (title, start_page, end_page) in enumerate(chapter_ranges, start=1):
            out_name = f"{idx:02d}_{title.replace(' ', '_').replace('/', '-')}.pdf"
            out_path = os.path.join(CHAPTER_DIR, out_name)
            _save_page_range(doc, start_page, end_page, out_path)
            print(f"Saved Chapter: {out_path} (Pages {start_page+1}-{end_page+1})")

        # 3. Extract goals.
        # Goals may be nested inside chapters; the simple rule used here is
        # that a goal lasts until the next goal starts (or the document ends).
        for title, start_page, end_page in _page_ranges(goal_starts, total_pages):
            out_name = f"{title.replace(' ', '_')}.pdf"
            out_path = os.path.join(GOAL_DIR, out_name)
            _save_page_range(doc, start_page, end_page, out_path)
            print(f"Saved Goal: {out_path} (Pages {start_page+1}-{end_page+1})")


def _page_ranges(starts, total_pages):
    """Convert {title: start_page} into [(title, start_page, end_page)] in page order."""
    ordered = sorted(starts.items(), key=lambda item: item[1])
    ranges = []
    for idx, (title, start_page) in enumerate(ordered):
        if idx + 1 < len(ordered):
            end_page = ordered[idx + 1][1] - 1
        else:
            end_page = total_pages - 1
        # Two sections starting on the same page would give end < start.
        # insert_pdf copies pages in reverse order in that case, so clamp the
        # range to the single start page instead.
        ranges.append((title, start_page, max(start_page, end_page)))
    return ranges


def _save_page_range(doc, start_page, end_page, out_path):
    """Write pages start_page..end_page (0-based, inclusive) of doc to out_path."""
    with fitz.open() as part:
        part.insert_pdf(doc, from_page=start_page, to_page=end_page)
        part.save(out_path)
if __name__ == "__main__":
    # Script entry point: prepare the output folders, then split the PDF.
    ensure_dirs()
    split_pdf()