-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathmain.py
45 lines (37 loc) · 2 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
import os
import shutil
import tempfile
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed

import psutil
import pytesseract
from pdf2image import convert_from_path
from PIL import ImageOps
from PyPDF2 import PdfReader
from tqdm import tqdm
def get_num_pages(pdf_path):
    """Return the number of pages in the PDF located at ``pdf_path``."""
    return len(PdfReader(pdf_path).pages)
# Serializes appends to the output file: this function is run on several
# worker threads that may all target the same path.
_OUTPUT_WRITE_LOCK = threading.Lock()


def process_and_write_page(pdf_path, start_page, end_page, output_file):
    """OCR a contiguous page range of a PDF and append its text to a file.

    Pages ``start_page`` through ``end_page`` (1-based, inclusive) are
    rendered at 300 dpi, converted to grayscale and passed to Tesseract with
    the ``chi_sim+eng`` language setting. The recognized text of each page is
    echoed to stdout and then appended, one page after another, to
    ``output_file``.

    Args:
        pdf_path: Path of the PDF to read.
        start_page: First page of the range (1-based).
        end_page: Last page of the range (inclusive).
        output_file: Path of the UTF-8 text file to append to.
    """
    images = convert_from_path(pdf_path, dpi=300, first_page=start_page, last_page=end_page)

    # Run the slow OCR step before touching the output file so the file is
    # not held open (with partially flushed buffers) while other threads
    # append to it.
    page_texts = []
    for i, image in enumerate(images, start=start_page):
        gray_image = ImageOps.grayscale(image)
        text = pytesseract.image_to_string(gray_image, lang='chi_sim+eng')
        print(f"\nPage {i} Text:\n{text}")  # Print recognized text
        page_texts.append(text + "\n")

    # Append the whole batch in one locked write-and-close so that text from
    # concurrent batches can never interleave inside a batch or split a
    # multi-byte character.
    with _OUTPUT_WRITE_LOCK:
        with open(output_file, 'a', encoding='utf-8') as file:
            file.write("".join(page_texts))
def extract_text_from_pdf(pdf_path, num_pages, output_file):
    """OCR pages 1..num_pages of a PDF in parallel and save them in page order.

    The page range is split into contiguous batches. Each batch is handed to
    ``process_and_write_page`` on a worker thread and written to its own
    scratch file; once every batch has finished, the scratch files are
    concatenated into ``output_file`` in ascending page order.

    Args:
        pdf_path: Path of the PDF to read.
        num_pages: Number of pages to process, counted from page 1.
        output_file: Path of the text file to create or overwrite.

    Raises:
        Exception: Any exception raised by a worker is re-raised here rather
            than being silently dropped.
    """
    max_workers = os.cpu_count() or 4
    memory = psutil.virtual_memory()
    # Adjust batch_size based on available memory: one page per 500 MB free.
    memory_limit = max(1, int(memory.available / (500 * 1024 * 1024)))
    # Also cap the batch at ceil(num_pages / max_workers); otherwise plenty
    # of free memory yields a single huge batch and only one busy worker.
    pages_per_worker = max(1, -(-num_pages // max_workers))
    batch_size = min(memory_limit, pages_per_worker)

    # Clear the file (or ensure it can be created) before the long OCR run.
    with open(output_file, 'w', encoding='utf-8'):
        pass

    with tempfile.TemporaryDirectory() as scratch_dir:
        part_files = []
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = []
            for start_page in range(1, num_pages + 1, batch_size):
                end_page = min(start_page + batch_size - 1, num_pages)
                part_file = os.path.join(scratch_dir, f"pages_{start_page:08d}.txt")
                part_files.append(part_file)
                futures.append(
                    executor.submit(process_and_write_page, pdf_path, start_page, end_page, part_file)
                )
            try:
                for future in tqdm(as_completed(futures), total=len(futures), desc="Processing Pages"):
                    # result() re-raises whatever the worker raised.
                    future.result()
            except BaseException:
                # Do not start batches that are still queued once one failed.
                for pending in futures:
                    pending.cancel()
                raise

        # Batches finish in arbitrary order; stitch them together by page
        # number. Binary mode copies the workers' bytes verbatim.
        with open(output_file, 'wb') as merged:
            for part_file in part_files:
                with open(part_file, 'rb') as part:
                    shutil.copyfileobj(part, merged)
pdf_path = './vvv.pdf'  # Replace with your PDF file path
output_file = 'extracted_text.txt'  # The file where the extracted text will be saved

# Only run the (slow) OCR pipeline when executed as a script, so that
# importing this module for its functions has no side effects.
if __name__ == "__main__":
    num_pages = get_num_pages(pdf_path)
    extract_text_from_pdf(pdf_path, num_pages, output_file)
    print("Text extraction and saving complete.")