add PDF performance script
dronperminov committed Mar 28, 2024
1 parent 56b44dd commit 23b5bcd
Showing 4 changed files with 217 additions and 0 deletions.
1 change: 1 addition & 0 deletions .flake8
@@ -24,3 +24,4 @@ ignore =
ANN101
per-file-ignores =
scripts/*:T201
scripts/benchmark_pdf_performance*:JS101,T201
94 changes: 94 additions & 0 deletions scripts/benchmark_pdf_performance.py
@@ -0,0 +1,94 @@
import argparse
import json
import os.path
from typing import List

from scripts.benchmark_utils.pdf_performance_task import PDFPerformanceTask


def get_tasks(configs: List[dict], input_path: str, dedoc_host: str, pdf_options: List[str]) -> List[List[PDFPerformanceTask]]:
tasks = []

for config in configs:
config_tasks = []

for task_name in sorted(os.listdir(input_path)):
files_path = os.path.join(input_path, task_name)
if os.path.isdir(files_path) and not task_name.startswith("_"):
config_tasks.append(PDFPerformanceTask(dedoc_host, task_name, files_path, pdf_options, config))

tasks.append(config_tasks)

return tasks


def make_report(tasks: List[List[PDFPerformanceTask]], output_path: str, configs: List[dict]) -> None:
with open(output_path, "w", encoding="utf-8") as f:
f.write("""<html>
<head>
<title>PDF performance benchmark</title>
<style>
p { margin-bottom: 5px; }
pre { background: #f0f0f0; padding: 5px; margin: 0; }
summary { font-weight: bold; font-size: 1.2em; margin-bottom: 5px; margin-top: 20px; }
table { border-collapse: collapse; }
td, th { padding: 5px 10px; border: 1px solid #000; text-align: center; }
td:first-child { text-align: left; max-width: 600px; word-break: break-word; }
td:last-child, tr:last-child td:not(:first-child) { background: #f0f0f0; }
tr:last-child td:first-child { font-weight: bold; text-align: right; cursor: pointer; }
.hidden-files tr:nth-child(n+3) { display: none; }
.hidden-files tr:last-child { display: table-row; }
</style>
<script>
function HideFiles(cell) {
cell.parentNode.parentNode.classList.toggle("hidden-files")
}
</script>
</head>
<body>""")

for config, config_tasks in zip(configs, tasks):
f.write("<p>Running parameters:</p>")
f.write(f"<pre>{json.dumps(config, ensure_ascii=False, indent=2)}</pre>\n\n")

for task in config_tasks:
f.write(task.to_html())

f.write("</body>\n")
f.write("</html>\n")


def main() -> None:
pdf_options = ["true", "false", "auto", "auto_tabby", "tabby"]
    parser = argparse.ArgumentParser(description="Script for evaluating the performance of different PDF readers.", formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument("-i", "--input", help="path to the directory with PDF files (default: %(default)s)", type=str, default="pdf_data")
    parser.add_argument("-o", "--output", help="path to the output report file (default: %(default)s)", type=str, default="pdf_performance.html")
    parser.add_argument("-n", "--loops", help="number of repetitions of testing one file (default: %(default)d)", type=int, default=1)
    parser.add_argument("--dedoc-host", help="url to the dedoc instance for sending files (default: %(default)s)", type=str, default="http://localhost:1231")
parser.add_argument("--pdf-options", help="values of pdf_with_text_layer argument", choices=pdf_options, nargs="+", required=True)
parser.add_argument("--parameters", help="path to json file with alternative parameters dictionaries")
args = parser.parse_args()

    assert os.path.exists(args.input), f'Directory "{args.input}" does not exist'
assert os.path.isdir(args.input), f'Path "{args.input}" is not a directory'
assert args.loops > 0, "The number of repetitions of testing one file must be positive"

    print(f'Running PDF performance benchmark with the following pdf options: {", ".join(args.pdf_options)}')
configs = [{}]

if args.parameters:
with open(args.parameters, "r", encoding="utf-8") as f:
configs = json.load(f)

tasks = get_tasks(configs, args.input, args.dedoc_host, args.pdf_options)

for _ in range(args.loops):
for config_tasks in tasks:
for task in config_tasks:
task.run()
make_report(tasks, args.output, configs)


if __name__ == "__main__":
main()
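
For reference: the --parameters file, when provided, is read with json.load and must contain a list of dedoc request-parameter dictionaries; the benchmark is run and reported separately for each dictionary. A minimal sketch of producing such a file from Python (the parameter names are illustrative assumptions, not part of this commit):

import json

# Each dictionary is merged into the request parameters of every benchmarked file;
# an empty dictionary means "dedoc defaults". The parameter names below are
# illustrative assumptions about the dedoc API, not taken from this commit.
configs = [
    {},
    {"need_pdf_table_analysis": "false"},
]

with open("parameters.json", "w", encoding="utf-8") as f:
    json.dump(configs, f, ensure_ascii=False, indent=2)

The benchmark itself would then be launched along the lines of python3 -m scripts.benchmark_pdf_performance --pdf-options auto_tabby tabby --parameters parameters.json, assuming it is run from the repository root against a dedoc instance reachable at the default --dedoc-host.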
79 changes: 79 additions & 0 deletions scripts/benchmark_utils/pdf_performance_task.py
@@ -0,0 +1,79 @@
import os
import time
from typing import List

from pdfminer.pdfpage import PDFPage

from dedoc.utils.utils import send_file
from scripts.benchmark_utils.performance_time import PerformanceResult


class PDFPerformanceTask:
def __init__(self, dedoc_host: str, title: str, input_dir: str, pdf_reader_options: List[str], config: dict) -> None:
self.dedoc_host = dedoc_host
self.title = title
self.config = config
self.pdf_reader_options = pdf_reader_options

filenames = [os.path.join(input_dir, filename) for filename in os.listdir(input_dir) if filename.endswith(".pdf")]
self.times = {pdf_option: {filename: PerformanceResult() for filename in filenames} for pdf_option in self.pdf_reader_options}
self.pages = {filename: self.__get_page_count(filename) for filename in filenames}
self.filenames = sorted(filenames, key=lambda filename: self.pages[filename])

def run(self) -> None:
        print(f'Running task "{self.title}"')

for pdf_option in self.pdf_reader_options:
            print(f'  Handling files with pdf option "{pdf_option}":')
self.__run_files(pdf_option)

def to_html(self) -> str:
if not self.filenames:
return ""

pdf_header = "".join(f"<th>{pdf_option}</th>" for pdf_option in self.pdf_reader_options)

html = [
"<details open>",
f"<summary>{self.title} ({len(self.filenames)} files)</summary>", "<table>",
f'<tr><th rowspan="2">Filename</th><th rowspan="2">Pages</th><th colspan="{len(self.pdf_reader_options) + 1}">pdf_with_text_layer</th></tr>',
f"<tr>{pdf_header}<th>average</th></tr>"
]

for filename in self.filenames:
times = [self.times[pdf_option][filename] for pdf_option in self.pdf_reader_options]
pages = self.pages[filename]
html.append(f"<tr><td>{os.path.basename(filename)}</td><td>{pages}</td>{self.__get_performance_cells(times, pages)}</tr>")

times = []
for pdf_option in self.pdf_reader_options:
times.append(PerformanceResult([self.times[pdf_option][filename] / self.pages[filename] for filename in self.filenames]))

html.append(f'<tr><td colspan="2" onclick="HideFiles(this)">average (per page)</td>{self.__get_performance_cells(times)}</tr>')
html.append("</table>")
html.append("</details>\n")

return "\n".join(html)

def __run_file(self, pdf_option: str, filename: str) -> float:
start_time = time.time()
send_file(self.dedoc_host, os.path.basename(filename), filename, {"pdf_with_text_layer": pdf_option, **self.config})
return time.time() - start_time

def __run_files(self, pdf_option: str) -> None:
for i, filename in enumerate(self.filenames):
elapsed_time = self.__run_file(pdf_option, filename)
self.times[pdf_option][filename].add(elapsed_time)
            print(f'    - handled file {i + 1} / {len(self.filenames)} "{os.path.basename(filename)}" (pages: {self.pages[filename]}): {elapsed_time:.2f} seconds')

print("")

def __get_performance_cells(self, pdf_times: List[PerformanceResult], pages: int = 0) -> str:
total_times = pdf_times + [PerformanceResult(pdf_times)]
return "".join(f"<td>{times} ({times / pages} / page)</td>" if pages > 0 else f"<td>{times}</td>" for times in total_times)

def __get_page_count(self, path: str) -> int:
with open(path, "rb") as fp:
pages = len(list(PDFPage.get_pages(fp)))

return max(pages, 1)
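
The task class can also be driven on its own; a minimal sketch, assuming a dedoc instance at http://localhost:1231 and a local pdf_data/scanned directory with PDF files (both are assumptions for illustration):

from scripts.benchmark_utils.pdf_performance_task import PDFPerformanceTask

# Time every PDF in the directory once per pdf_with_text_layer option.
task = PDFPerformanceTask(
    dedoc_host="http://localhost:1231",    # assumed local dedoc instance
    title="scanned",                       # section title in the HTML report
    input_dir="pdf_data/scanned",          # assumed directory with .pdf files
    pdf_reader_options=["tabby", "true"],
    config={},                             # extra dedoc request parameters
)
task.run()  # one timing pass over all files in the directory

with open("report.html", "w", encoding="utf-8") as f:
    f.write(task.to_html())  # per-file timings plus the per-page average row

Repeated run() calls add new measurements to the same PerformanceResult objects, which is what the --loops option of the benchmark script relies on.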
43 changes: 43 additions & 0 deletions scripts/benchmark_utils/performance_time.py
@@ -0,0 +1,43 @@
from typing import Iterable, Optional, Union

import numpy as np


class PerformanceResult:
def __init__(self, results: Optional[Iterable["PerformanceResult"]] = None) -> None:
self.values = []

if results is not None:
for result in results:
self.add(result)

def add(self, value: Union[float, "PerformanceResult"]) -> None:
if isinstance(value, PerformanceResult):
self.values.extend(value.values)
else:
self.values.append(value)

@property
def mean(self) -> float:
return np.mean(self.values) if self.values else 0

@property
def std(self) -> float:
return np.std(self.values) if self.values else 0

def __str__(self) -> str:
if not self.values:
return "-"

if len(self.values) == 1:
return f"{self.mean:.2f}"

return f"{self.mean:.2f}±{self.std:.2f}"

def __truediv__(self, scale: float) -> "PerformanceResult":
result = PerformanceResult()

for t in self.values:
result.add(t / scale)

return result
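
A minimal sketch of how this aggregation behaves (the numbers are illustrative):

from scripts.benchmark_utils.performance_time import PerformanceResult

r = PerformanceResult()
r.add(1.8)   # seconds measured in the first loop
r.add(2.2)   # seconds measured in the second loop
print(r)     # "2.00±0.20": mean and standard deviation over the runs

print(r / 4)                         # the same sample scaled, e.g. per page of a 4-page file
merged = PerformanceResult([r, r])   # merging keeps the raw values, not just the means
print(merged.mean, merged.std)

A result with a single value is printed without the ± part, and an empty result renders as "-".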
