Skip to content

Commit

Permalink
feat(get-words-pdf): concurrency
Browse files Browse the repository at this point in the history
  • Loading branch information
jofaval committed Apr 3, 2023
1 parent 33b705d commit 8e53456
Showing 1 changed file with 36 additions and 11 deletions.
47 changes: 36 additions & 11 deletions python/get_words_from_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,12 +34,15 @@
"""

import argparse
import logging
import os
import re
from concurrent.futures import ThreadPoolExecutor
from multiprocessing import Pool, cpu_count
from sys import maxsize
from typing import List
from typing import List, Tuple

from pypdf import PdfReader
from pypdf import PageObject, PdfReader

# Source: https://irisreading.com/what-is-the-average-reading-speed/
AVG_LEARNING_RATE_WORDS_PER_MINUTE = 150
Expand All @@ -49,6 +52,9 @@
PARAGRAPH_SEPARATOR = "\n\n"
INFINITY = maxsize

TOTAL_PROCESSORS = cpu_count()
PAGES_TUPLE_SPACE: List[str] = []


def get_total_words(text: str) -> List[str]:
"""Given a text, returns the total amount of words"""
Expand All @@ -66,6 +72,18 @@ def get_total_read_time(total_words: int, words_per_minute: int = AVERAGE_WORDS_
return round(total_words / words_per_minute)


def get_pdf_page_text_job(data: Tuple[PageObject, int, int]) -> str:
    """Extract and parse the text of a single PDF page (per-page worker job).

    Args:
        data: A ``(page, paragraph_start, paragraph_end)`` tuple. The page's
            extracted text is passed to ``parse_pdf_page_text`` together with
            the two paragraph bounds.

    Returns:
        The parsed text of the page. Returning it lets a caller gather
        results in input order (for example from ``Executor.map``) instead
        of depending on the order in which jobs happen to finish.

    Side effects:
        The parsed text is also appended to the module-level
        ``PAGES_TUPLE_SPACE`` list, as before, so existing callers that read
        that list keep working. When jobs run concurrently, the order of
        that list follows job completion, not page order.
    """
    page, paragraph_start, paragraph_end = data
    parsed = parse_pdf_page_text(
        page.extract_text(),
        paragraph_start,
        paragraph_end,
    )

    # Kept for backward compatibility with callers that read the shared list.
    PAGES_TUPLE_SPACE.append(parsed)

    return parsed


def parse_pdf_page_text(text: str, paragraph_start: int, paragraph_end: int) -> str:
"""Parses a PDF page to get the desired amount of paragraphs"""
assert paragraph_start <= paragraph_end
Expand All @@ -88,18 +106,25 @@ def get_pdf_text(
assert pdf_path.endswith(".pdf")
assert page_start <= page_end

reader = PdfReader(pdf_path)
text = [
parse_pdf_page_text(
page.extract_text(),
paragraph_start,
paragraph_end
)
logger = logging.getLogger("pypdf")
logger.setLevel(logging.ERROR)

reader = PdfReader(pdf_path, strict=False)
PAGES_TUPLE_SPACE.clear()
job_elements = [
(page, paragraph_start, paragraph_end)
for (index, page) in enumerate(reader.pages)
if page_start <= index <= page_end
]
print(len(text), "page(s) extracted.")
text = "\n".join(text)

with ThreadPoolExecutor() as executor:
executor.map(
get_pdf_page_text_job,
job_elements,
)

print(len(PAGES_TUPLE_SPACE), "page(s) extracted.")
text = "\n".join(PAGES_TUPLE_SPACE)

return text

Expand Down

0 comments on commit 8e53456

Please sign in to comment.