TLDR-633 add PDF performance script #419

Merged: 5 commits, Apr 3, 2024
1 change: 1 addition & 0 deletions .flake8
@@ -24,3 +24,4 @@ ignore =
     ANN101
 per-file-ignores =
     scripts/*:T201
+    scripts/benchmark_pdf_performance*:JS101,T201
7 changes: 3 additions & 4 deletions dedoc/utils/pdf_utils.py
@@ -1,15 +1,14 @@
 from typing import Optional

 from PIL.Image import Image
-from PyPDF2 import PdfFileReader
 from pdf2image import convert_from_path
+from pypdf import PdfReader


 def get_pdf_page_count(path: str) -> Optional[int]:
     try:
-        with open(path, "rb") as fl:
-            reader = PdfFileReader(fl)
-            return reader.getNumPages()
+        reader = PdfReader(path)
+        return len(reader.pages)
     except Exception:
         return None

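For context on the reader swap above: pypdf's `PdfReader` accepts a path directly, so the explicit file handle is no longer needed. A minimal usage sketch of the updated helper (the file path is a placeholder):

```python
# Minimal usage sketch; "example.pdf" is a placeholder path, not a file from this PR.
from dedoc.utils.pdf_utils import get_pdf_page_count

page_count = get_pdf_page_count("example.pdf")
print(page_count)  # number of pages, or None if the file could not be parsed
```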
1 change: 1 addition & 0 deletions requirements.txt
@@ -16,6 +16,7 @@ pdf2image==1.10.0 #1.14.0 - there are converting artifacts '№' != '№\n\x0c'
 pdfminer.six==20211012
 piexif==1.1.3
 pylzma==0.5.0
+pypdf==4.1.0
 PyPDF2==1.27.0
 pytesseract==0.3.10
 python-docx==0.8.11
94 changes: 94 additions & 0 deletions scripts/benchmark_pdf_performance.py
@@ -0,0 +1,94 @@
import argparse
import json
import os.path
from typing import List

from scripts.benchmark_utils.pdf_performance_task import PDFPerformanceTask


def get_tasks(configs: List[dict], input_path: str, dedoc_host: str, pdf_options: List[str]) -> List[List[PDFPerformanceTask]]:
    tasks = []

    for config in configs:
        config_tasks = []

        for task_name in sorted(os.listdir(input_path)):
            files_path = os.path.join(input_path, task_name)
            if os.path.isdir(files_path) and not task_name.startswith("_"):
                config_tasks.append(PDFPerformanceTask(dedoc_host, task_name, files_path, pdf_options, config))

        tasks.append(config_tasks)

    return tasks


def make_report(tasks: List[List[PDFPerformanceTask]], output_path: str, configs: List[dict]) -> None:
    with open(output_path, "w", encoding="utf-8") as f:
        f.write("""<html>
        <head>
            <title>PDF performance benchmark</title>
            <style>
                p { margin-bottom: 5px; }
                pre { background: #f0f0f0; padding: 5px; margin: 0; }
                summary { font-weight: bold; font-size: 1.2em; margin-bottom: 5px; margin-top: 20px; }
                table { border-collapse: collapse; }
                td, th { padding: 5px 10px; border: 1px solid #000; text-align: center; }
                td:first-child { text-align: left; max-width: 600px; word-break: break-word; }
                td:last-child, tr:last-child td:not(:first-child) { background: #f0f0f0; }
                tr:last-child td:first-child { font-weight: bold; text-align: right; cursor: pointer; }
                .hidden-files tr:nth-child(n+3) { display: none; }
                .hidden-files tr:last-child { display: table-row; }
            </style>

            <script>
                function HideFiles(cell) {
                    cell.parentNode.parentNode.classList.toggle("hidden-files")
                }
            </script>
        </head>
        <body>""")

        for config, config_tasks in zip(configs, tasks):
            f.write("<p>Running parameters:</p>")
            f.write(f"<pre>{json.dumps(config, ensure_ascii=False, indent=2)}</pre>\n\n")

            for task in config_tasks:
                f.write(task.to_html())

        f.write("</body>\n")
        f.write("</html>\n")


def main() -> None:
    pdf_options = ["true", "false", "auto", "auto_tabby", "tabby"]
    parser = argparse.ArgumentParser(description="Script for evaluating the performance of different PDF readers.", formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument("-i", "--input", help="path to the directory with pdfs (default: %(default)s)", type=str, default="pdf_data")
    parser.add_argument("-o", "--output", help="path to the output report file (default: %(default)s)", type=str, default="pdf_performance.html")
    parser.add_argument("-n", "--loops", help="number of repetitions of testing one file (default: %(default)d)", type=int, default=1)
    parser.add_argument("--dedoc-host", help="url to the dedoc instance for sending files (default: %(default)s)", type=str, default="http://localhost:1231")
    parser.add_argument("--pdf-options", help="values of pdf_with_text_layer argument", choices=pdf_options, nargs="+", required=True)
    parser.add_argument("--parameters", help="path to json file with alternative parameters dictionaries")
    args = parser.parse_args()

    assert os.path.exists(args.input), f'Directory "{args.input}" does not exist'
    assert os.path.isdir(args.input), f'Path "{args.input}" is not a directory'
    assert args.loops > 0, "The number of repetitions of testing one file must be positive"

    print(f'Run pdf performance benchmark with the following pdf options: {", ".join(args.pdf_options)}')
    configs = [{}]

    if args.parameters:
        with open(args.parameters, "r", encoding="utf-8") as f:
            configs = json.load(f)

    tasks = get_tasks(configs, args.input, args.dedoc_host, args.pdf_options)

    for _ in range(args.loops):
        for config_tasks in tasks:
            for task in config_tasks:
                task.run()
                make_report(tasks, args.output, configs)


if __name__ == "__main__":
    main()
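The `--parameters` option expects a JSON file containing a list of dictionaries; each dictionary is merged into the request parameters sent to dedoc, and the whole set of tasks is benchmarked once per dictionary. A sketch of such a file and of a possible invocation (the parameter key below is illustrative and not part of this PR):

```python
# Sketch of a --parameters file: a JSON list of request-parameter dictionaries.
# The non-empty key shown is an illustrative dedoc API option, not verified here.
import json

configs = [
    {},                                    # run with default parameters
    {"need_pdf_table_analysis": "false"},  # hypothetical alternative configuration
]

with open("parameters.json", "w", encoding="utf-8") as f:
    json.dump(configs, f, indent=2)

# Assuming a dedoc instance is listening on http://localhost:1231, the benchmark
# could then be invoked from the repository root roughly as:
#   python -m scripts.benchmark_pdf_performance --pdf-options tabby true \
#       -i pdf_data -o pdf_performance.html -n 3 --parameters parameters.json
```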
80 changes: 80 additions & 0 deletions scripts/benchmark_utils/pdf_performance_task.py
@@ -0,0 +1,80 @@
import os
import time
from typing import List

from pdfminer.pdfpage import PDFPage

from dedoc.utils.pdf_utils import get_pdf_page_count
from dedoc.utils.utils import send_file
from scripts.benchmark_utils.performance_time import PerformanceResult


class PDFPerformanceTask:
    def __init__(self, dedoc_host: str, title: str, input_dir: str, pdf_reader_options: List[str], config: dict) -> None:
        self.dedoc_host = dedoc_host
        self.title = title
        self.config = config
        self.pdf_reader_options = pdf_reader_options

        filenames = [os.path.join(input_dir, filename) for filename in os.listdir(input_dir) if filename.endswith(".pdf")]
        self.times = {pdf_option: {filename: PerformanceResult() for filename in filenames} for pdf_option in self.pdf_reader_options}
        self.pages = {filename: get_pdf_page_count(filename) for filename in filenames}
        self.filenames = sorted(filenames, key=lambda filename: self.pages[filename])

    def run(self) -> None:
        print(f'Run task "{self.title}"')

        for pdf_option in self.pdf_reader_options:
            print(f'  Handle files with pdf option "{pdf_option}":')
            self.__run_files(pdf_option)

    def to_html(self) -> str:
        if not self.filenames:
            return ""

        pdf_header = "".join(f"<th>{pdf_option}</th>" for pdf_option in self.pdf_reader_options)

        html = [
            "<details open>",
            f"<summary>{self.title} ({len(self.filenames)} files)</summary>", "<table>",
            f'<tr><th rowspan="2">Filename</th><th rowspan="2">Pages</th><th colspan="{len(self.pdf_reader_options) + 1}">pdf_with_text_layer</th></tr>',
            f"<tr>{pdf_header}<th>average</th></tr>"
        ]

        for filename in self.filenames:
            times = [self.times[pdf_option][filename] for pdf_option in self.pdf_reader_options]
            pages = self.pages[filename]
            html.append(f"<tr><td>{os.path.basename(filename)}</td><td>{pages}</td>{self.__get_performance_cells(times, pages)}</tr>")

        times = []
        for pdf_option in self.pdf_reader_options:
            times.append(PerformanceResult([self.times[pdf_option][filename] / self.pages[filename] for filename in self.filenames]))

        html.append(f'<tr><td colspan="2" onclick="HideFiles(this)">average (per page)</td>{self.__get_performance_cells(times)}</tr>')
        html.append("</table>")
        html.append("</details>\n")

        return "\n".join(html)

    def __run_file(self, pdf_option: str, filename: str) -> float:
        start_time = time.time()
        send_file(self.dedoc_host, os.path.basename(filename), filename, {"pdf_with_text_layer": pdf_option, **self.config})
        return time.time() - start_time

    def __run_files(self, pdf_option: str) -> None:
        for i, filename in enumerate(self.filenames):
            elapsed_time = self.__run_file(pdf_option, filename)
            self.times[pdf_option][filename].add(elapsed_time)
            print(f'    - handle file {i + 1} / {len(self.filenames)} "{os.path.basename(filename)}" (pages: {self.pages[filename]}): {elapsed_time} seconds')

        print("")

    def __get_performance_cells(self, pdf_times: List[PerformanceResult], pages: int = 0) -> str:
        total_times = pdf_times + [PerformanceResult(pdf_times)]
        return "".join(f"<td>{times} ({times / pages} / page)</td>" if pages > 0 else f"<td>{times}</td>" for times in total_times)

    def __get_page_count(self, path: str) -> int:
        with open(path, "rb") as fp:
            pages = len(list(PDFPage.get_pages(fp)))

        return max(pages, 1)
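For illustration, a single task can also be driven directly. The sketch below assumes a running dedoc instance and a directory of PDFs; the host, title, and paths are placeholders:

```python
# Hypothetical standalone use of PDFPerformanceTask; host and paths are placeholders.
from scripts.benchmark_utils.pdf_performance_task import PDFPerformanceTask

task = PDFPerformanceTask(
    dedoc_host="http://localhost:1231",
    title="example_task",
    input_dir="pdf_data/example_task",
    pdf_reader_options=["tabby", "true"],
    config={},
)
task.run()                       # posts every PDF once per pdf_with_text_layer value
html_fragment = task.to_html()   # <details> block with the per-file timing table
```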
43 changes: 43 additions & 0 deletions scripts/benchmark_utils/performance_time.py
@@ -0,0 +1,43 @@
from typing import Iterable, Optional, Union

import numpy as np


class PerformanceResult:
    def __init__(self, results: Optional[Iterable["PerformanceResult"]] = None) -> None:
        self.values = []

        if results is not None:
            for result in results:
                self.add(result)

    def add(self, value: Union[float, "PerformanceResult"]) -> None:
        if isinstance(value, PerformanceResult):
            self.values.extend(value.values)
        else:
            self.values.append(value)

    @property
    def mean(self) -> float:
        return np.mean(self.values) if self.values else 0

    @property
    def std(self) -> float:
        return np.std(self.values) if self.values else 0

    def __str__(self) -> str:
        if not self.values:
            return "-"

        if len(self.values) == 1:
            return f"{self.mean:.2f}"

        return f"{self.mean:.2f}±{self.std:.2f}"

    def __truediv__(self, scale: float) -> "PerformanceResult":
        result = PerformanceResult()

        for t in self.values:
            result.add(t / scale)

        return result
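A small usage sketch of `PerformanceResult` with made-up timings, showing how the report cells are produced:

```python
# Made-up timings to illustrate aggregation and formatting.
from scripts.benchmark_utils.performance_time import PerformanceResult

result = PerformanceResult()
result.add(1.2)
result.add(1.4)
print(result)          # "1.30±0.10" -- mean ± std over the recorded runs

per_page = result / 4  # scale each measurement, e.g. to seconds per page
merged = PerformanceResult([result, per_page])  # pool values from several results
print(f"{merged.mean:.2f}")  # 0.81
```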