-
Notifications
You must be signed in to change notification settings - Fork 24
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
56b44dd
commit cffbc2d
Showing
4 changed files
with
218 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -24,3 +24,4 @@ ignore = | |
ANN101 | ||
per-file-ignores = | ||
scripts/*:T201 | ||
scripts/benchmark_pdf_performance*:JS101,T201 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,95 @@ | ||
import argparse | ||
import json | ||
import os.path | ||
from typing import List | ||
|
||
from scripts.benchmark_utils.pdf_performance_task import PDFPerformanceTask | ||
|
||
|
||
def get_tasks(configs: List[dict], input_path: str, dedoc_host: str, pdf_options: List[str]) -> List[List[PDFPerformanceTask]]:
    """
    Create the benchmark tasks: one list of tasks for every configuration.

    Each sub-directory of ``input_path`` whose name does not start with "_" becomes one task,
    tasks are ordered by the sorted directory names.

    :param configs: parameter dictionaries, each one is passed to the tasks created for it
    :param input_path: directory whose sub-directories hold the files of the tasks
    :param dedoc_host: host value forwarded to every task
    :param pdf_options: pdf options forwarded to every task
    :return: a list parallel to ``configs``, each item is the list of tasks built for that configuration
    """

    def build_for(config: dict) -> List[PDFPerformanceTask]:
        # the directory is listed again for every configuration, entries are visited in sorted order
        named_paths = ((name, os.path.join(input_path, name)) for name in sorted(os.listdir(input_path)))
        return [
            PDFPerformanceTask(dedoc_host, name, path, pdf_options, config)
            for name, path in named_paths
            if os.path.isdir(path) and not name.startswith("_")
        ]

    return [build_for(config) for config in configs]
|
||
|
||
# Static beginning of the HTML report: page title, table styles and the script
# that collapses/expands the per-file rows of a table.
# The charset is declared because the report is written in UTF-8 and may contain non-ASCII characters.
_REPORT_HEADER = """<html>
<head>
<meta charset="utf-8">
<title>PDF performance benchmark</title>
<style>
p { margin-bottom: 5px; }
pre { background: #f0f0f0; padding: 5px; margin: 0; }
summary { font-weight: bold; font-size: 1.2em; margin-bottom: 5px; margin-top: 20px; }
table { border-collapse: collapse; }
td, th { padding: 5px 10px; border: 1px solid #000; text-align: center; }
td:first-child { text-align: left; max-width: 600px; word-break: break-word; }
td:last-child, tr:last-child td:not(:first-child) { background: #f0f0f0; }
tr:last-child td:first-child { font-weight: bold; text-align: right; cursor: pointer; }
.hidden-files tr:nth-child(n+3) { display: none; }
.hidden-files tr:last-child { display: table-row; }
</style>
<script>
function HideFiles(cell) {
cell.parentNode.parentNode.classList.toggle("hidden-files")
}
</script>
</head>
<body>"""


def make_report(tasks: List[List["PDFPerformanceTask"]], output_path: str, configs: List[dict]) -> None:
    """
    Write the HTML report with the current results of all tasks.

    For every configuration the report contains its parameters (as pretty-printed JSON)
    followed by the HTML of each task built for this configuration.
    The output file is overwritten on every call.

    :param tasks: list parallel to ``configs``, each item is the list of tasks of one configuration
    :param output_path: path of the HTML file to write
    :param configs: parameter dictionaries, printed before the tasks that used them
    """
    from html import escape  # imported locally to keep the change inside this function

    with open(output_path, "w", encoding="utf-8") as f:
        f.write(_REPORT_HEADER)

        for config, config_tasks in zip(configs, tasks):
            f.write("<p>Running parameters:</p>")
            config_json = json.dumps(config, ensure_ascii=False, indent=2)
            # parameter values may contain "<" or "&", escape them so they cannot break the markup
            f.write(f"<pre>{escape(config_json, quote=False)}</pre>\n\n")

            for task in config_tasks:
                f.write(task.to_html())

        f.write("</body>\n")
        f.write("</html>\n")
|
||
|
||
def main() -> None:
    """
    Command line entry point of the PDF performance benchmark.

    Parses the arguments, builds the tasks for every configuration, then runs all tasks
    ``--loops`` times. The report is written once before the first run and rewritten
    after each task finishes, so partial results are available while the benchmark is running.

    Invalid arguments are reported through ``parser.error`` (usage message, exit status 2)
    instead of ``assert``, because assertions are removed when Python runs with ``-O``.
    """
    pdf_options = ["true", "false", "auto", "auto_tabby", "tabby"]
    parser = argparse.ArgumentParser(description="Script for evaluate different PDF readers performance.", formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument("-i", "--input", help="path to the directory with pdfs (default: %(default)s)", type=str, default="pdf_data")
    parser.add_argument("-o", "--output", help="path to the report filename (default: %(default)s)", type=str, default="pdf_performance.html")
    parser.add_argument("-n", "--loops", help="number of repetitions of testing one file (default: %(default)d)", type=int, default=1)
    parser.add_argument("--dedoc-host", help="url to DEDOC instance for sending files (default: %(default)s)", type=str, default="http://localhost:1231")
    parser.add_argument("--pdf-options", help="values of pdf_with_text_layer argument", choices=pdf_options, nargs="+", required=True)
    parser.add_argument("--parameters", help="path to json file with alternative parameters dictionaries")
    args = parser.parse_args()

    if not os.path.exists(args.input):
        parser.error(f'Directory "{args.input}" does not exists')
    if not os.path.isdir(args.input):
        parser.error(f'Path "{args.input}" is not a directory')
    if args.loops <= 0:
        parser.error("The number of repetitions of testing one file must be positive")

    print(f'Run pdf performance benchmark with next pdf options: {", ".join(args.pdf_options)}')
    # without --parameters there is a single configuration with no extra parameters
    configs = [{}]

    if args.parameters:
        with open(args.parameters, "r", encoding="utf-8") as f:
            configs = json.load(f)

        # a single dictionary is accepted as one configuration
        if isinstance(configs, dict):
            configs = [configs]
        # fail before the benchmark starts rather than in the middle of it
        if not isinstance(configs, list) or not all(isinstance(config, dict) for config in configs):
            parser.error(f'File "{args.parameters}" must contain a list of parameters dictionaries')

    tasks = get_tasks(configs, args.input, args.dedoc_host, args.pdf_options)
    make_report(tasks, args.output, configs)

    for _ in range(args.loops):
        for config_tasks in tasks:
            for task in config_tasks:
                task.run()
                # refresh the report after every task
                make_report(tasks, args.output, configs)


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
import os | ||
import time | ||
from typing import List | ||
|
||
from pdfminer.pdfpage import PDFPage | ||
|
||
from dedoc.utils.utils import send_file | ||
from scripts.benchmark_utils.performance_time import PerformanceResult | ||
|
||
|
||
class PDFPerformanceTask:
    """
    One benchmark task: a directory of files that are sent to the dedoc host
    once per pdf option, with the elapsed time of every request collected.

    Attributes:
      * ``times`` - ``{pdf_option: {filename: PerformanceResult}}`` with the measured times;
      * ``pages`` - ``{filename: page count}``;
      * ``filenames`` - paths of the task files ordered by page count.
    """

    def __init__(self, dedoc_host: str, title: str, input_dir: str, pdf_reader_options: List[str], config: dict) -> None:
        """
        :param dedoc_host: host value passed to ``send_file`` for every request
        :param title: name of the task, used in the console output and in the report
        :param input_dir: directory with the files of the task, a missing directory gives an empty task
        :param pdf_reader_options: values of the ``pdf_with_text_layer`` parameter to measure
        :param config: additional parameters merged into the parameters of every request
        """
        self.dedoc_host = dedoc_host
        self.title = title
        self.config = config
        self.pdf_reader_options = pdf_reader_options

        filenames = []
        if os.path.isdir(input_dir):
            candidates = (os.path.join(input_dir, filename) for filename in os.listdir(input_dir))
            # nested directories cannot be opened as files, so only regular files are taken
            filenames = [path for path in candidates if os.path.isfile(path)]

        self.times = {pdf_option: {filename: PerformanceResult() for filename in filenames} for pdf_option in self.pdf_reader_options}
        self.pages = {filename: self.__get_page_count(filename) for filename in filenames}
        # os.listdir returns entries in arbitrary order, the path breaks ties between equal page counts
        self.filenames = sorted(filenames, key=lambda filename: (self.pages[filename], filename))

    def run(self) -> None:
        """
        Send every file of the task once for each pdf option and store the elapsed times.
        """
        print(f'Run task "{self.title}"')

        for pdf_option in self.pdf_reader_options:
            print(f' Handle files with pdf option "{pdf_option}":')
            self.__run_files(pdf_option)

    def to_html(self) -> str:
        """
        Render the collected results as an HTML table inside a ``<details>`` element.

        There is one row per file (time for every pdf option and their combination) and
        a final row with the per-page times over all files.

        :return: HTML fragment, an empty string if the task has no files
        """
        from html import escape  # imported locally to keep the change inside this class

        if not self.filenames:
            return ""

        pdf_header = "".join(f"<th>{escape(pdf_option)}</th>" for pdf_option in self.pdf_reader_options)

        rows = [
            "<details open>",
            f"<summary>{escape(self.title)} ({len(self.filenames)} files)</summary>", "<table>",
            f'<tr><th rowspan="2">Filename</th><th rowspan="2">Pages</th><th colspan="{len(self.pdf_reader_options) + 1}">pdf_with_text_layer</th></tr>',
            f"<tr>{pdf_header}<th>average</th></tr>"
        ]

        for filename in self.filenames:
            times = [self.times[pdf_option][filename] for pdf_option in self.pdf_reader_options]
            pages = self.pages[filename]
            # directory and file names are arbitrary text, escape them before inserting into the markup
            name_cell = escape(os.path.basename(filename))
            rows.append(f"<tr><td>{name_cell}</td><td>{pages}</td>{self.__get_performance_cells(times, pages)}</tr>")

        # summary row: for every pdf option all measurements are divided by the page count of their file
        per_page_times = [
            PerformanceResult([self.times[pdf_option][filename] / self.pages[filename] for filename in self.filenames])
            for pdf_option in self.pdf_reader_options
        ]

        rows.append(f'<tr><td colspan="2" onclick="HideFiles(this)">average (per page)</td>{self.__get_performance_cells(per_page_times)}</tr>')
        rows.append("</table>")
        rows.append("</details>\n")

        return "\n".join(rows)

    def __run_file(self, pdf_option: str, filename: str) -> float:
        """
        Send one file with the given pdf option and measure how long the call takes.

        :return: elapsed time of the ``send_file`` call in seconds
        """
        # perf_counter is monotonic, unlike time.time it is not affected by system clock adjustments
        start_time = time.perf_counter()
        # keys of self.config come last, so a "pdf_with_text_layer" key in the config overrides pdf_option
        send_file(self.dedoc_host, os.path.basename(filename), filename, {"pdf_with_text_layer": pdf_option, **self.config})
        return time.perf_counter() - start_time

    def __run_files(self, pdf_option: str) -> None:
        """
        Process all files of the task with one pdf option, storing and printing every elapsed time.
        """
        for i, filename in enumerate(self.filenames):
            elapsed_time = self.__run_file(pdf_option, filename)
            self.times[pdf_option][filename].add(elapsed_time)
            print(f' - handle file {i + 1} / {len(self.filenames)} "{os.path.basename(filename)}" (pages: {self.pages[filename]}): {elapsed_time} seconds')

        print("")

    def __get_performance_cells(self, pdf_times: List[PerformanceResult], pages: int = 0) -> str:
        """
        Render the table cells for the given results plus one more cell with all of them combined.

        :param pdf_times: results to show, one cell each
        :param pages: if positive, every cell additionally shows the result divided by this number
        :return: concatenated ``<td>`` elements
        """
        total_times = pdf_times + [PerformanceResult(pdf_times)]

        if pages > 0:
            return "".join(f"<td>{times} ({times / pages} / page)</td>" for times in total_times)

        return "".join(f"<td>{times}</td>" for times in total_times)

    def __get_page_count(self, path: str) -> int:
        """
        Count the pages of the file with pdfminer.

        :return: number of pages, at least 1 so that the value is safe to divide by
        """
        with open(path, "rb") as fp:
            # count lazily instead of keeping all page objects in memory
            pages = sum(1 for _ in PDFPage.get_pages(fp))

        return max(pages, 1)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
from typing import Iterable, Optional, Union | ||
|
||
import numpy as np | ||
|
||
|
||
class PerformanceResult:
    """
    Collection of measured values with their mean and standard deviation.

    The text form is "-" without values, the mean for a single value
    and "mean±std" otherwise (two decimal places).
    """

    def __init__(self, results: Optional[Iterable["PerformanceResult"]] = None) -> None:
        """
        :param results: items to add right away (see ``add``), nothing is added when omitted
        """
        self.values = []

        initial_items = () if results is None else results
        for item in initial_items:
            self.add(item)

    def add(self, value: Union[float, "PerformanceResult"]) -> None:
        """
        Store a single value, or all values of another result.

        :param value: value to append, a ``PerformanceResult`` contributes each of its values
        """
        if not isinstance(value, PerformanceResult):
            self.values.append(value)
            return

        self.values.extend(value.values)

    @property
    def mean(self) -> float:
        """Mean of the stored values, 0 when there are none."""
        if not self.values:
            return 0

        return np.mean(self.values)

    @property
    def std(self) -> float:
        """Standard deviation of the stored values (``np.std``), 0 when there are none."""
        if not self.values:
            return 0

        return np.std(self.values)

    def __str__(self) -> str:
        count = len(self.values)

        if count == 0:
            return "-"

        text = f"{self.mean:.2f}"
        # the deviation is shown only when there is more than one value
        if count > 1:
            text = f"{text}±{self.std:.2f}"

        return text

    def __truediv__(self, scale: float) -> "PerformanceResult":
        """
        Return a new result with every stored value divided by ``scale``.
        """
        scaled = PerformanceResult()

        for item in self.values:
            quotient = item / scale
            scaled.add(quotient)

        return scaled