TLDR-633 add PDF performance script #419
Merged
Commits (5):
- 23b5bcd: add PDF performance script (dronperminov)
- 9c093db: fix bug with PDF page count getting (dronperminov)
- 917d0bf: review fixes (dronperminov)
- f57aa3b: add default data downloading (dronperminov)
- 730e567: add pdf performance benchmark results (dronperminov)
@@ -24,3 +24,4 @@ ignore =
     ANN101
 per-file-ignores =
     scripts/*:T201
+    scripts/benchmark_pdf_performance*:JS101,T201
Large diffs are not rendered by default.
@@ -0,0 +1,155 @@
import argparse
import json
import os.path
import zipfile
from typing import List

import wget

from scripts.benchmark_utils.pdf_performance_task import PDFPerformanceTask


def download_data(data_path: str) -> None:
    data_archive_path = f"{data_path}.zip"

    wget.download("https://at.ispras.ru/owncloud/index.php/s/lp4wEVyZTd9lA0u/download", data_archive_path)
    with zipfile.ZipFile(data_archive_path, "r") as archive:
        archive.extractall(data_path)

    os.remove(data_archive_path)


def get_tasks(configs: List[dict], input_path: str, dedoc_host: str, pdf_options: List[str]) -> List[List[PDFPerformanceTask]]:
    if input_path == "":
        input_path = "pdf_performance_benchmark_data"
        download_data(input_path)

    tasks = []

    for config in configs:
        config_tasks = []

        for task_name in sorted(os.listdir(input_path)):
            files_path = os.path.join(input_path, task_name)
            if os.path.isdir(files_path) and not task_name.startswith("_"):
                config_tasks.append(PDFPerformanceTask(dedoc_host, task_name, files_path, pdf_options, config))

        tasks.append(config_tasks)

    return tasks


def make_report(tasks: List[List[PDFPerformanceTask]], output_path: str, configs: List[dict]) -> None:
    with open(output_path, "w", encoding="utf-8") as f:
        f.write("""<html>
<head>
    <title>PDF performance benchmark</title>
    <style>
        p { margin-bottom: 5px; }
        pre { background: #f0f0f0; padding: 5px; margin: 0; }
        summary { font-weight: bold; font-size: 1.2em; margin-bottom: 5px; margin-top: 20px; }
        table { border-collapse: collapse; }
        td, th { padding: 5px 10px; border: 1px solid #000; text-align: center; }
        td:first-child { text-align: left; max-width: 600px; word-break: break-word; }
        td:last-child, tr:last-child td:not(:first-child) { background: #f0f0f0; }
        tr:last-child td:first-child { font-weight: bold; text-align: right; cursor: pointer; }
        .hidden-files tr:nth-child(n+3) { display: none; }
        .hidden-files tr:last-child { display: table-row; }
    </style>

    <script>
        function HideFiles(cell) {
            cell.parentNode.parentNode.classList.toggle("hidden-files")
        }
    </script>
</head>
<body>""")

        for config, config_tasks in zip(configs, tasks):
            f.write("<p>Running parameters:</p>")
            f.write(f"<pre>{json.dumps(config, ensure_ascii=False, indent=2)}</pre>\n\n")

            for task in config_tasks:
                f.write(task.to_html())

        f.write("</body>\n")
        f.write("</html>\n")


def main() -> None:
    default_output_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "resources", "benchmarks", "benchmark_pdf_performance.html"))
    pdf_options = ["true", "false", "auto", "auto_tabby", "tabby"]

    parser = argparse.ArgumentParser(description="Script for evaluating the performance of different PDF readers.", formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument("-i", "--input", help="path to the directory with pdfs (default: %(default)s)", type=str, default="")
    parser.add_argument("-o", "--output", help="path to the report filename (default: %(default)s)", type=str, default=default_output_path)
    parser.add_argument("-n", "--loops", help="number of repetitions of testing one file (default: %(default)d)", type=int, default=1)
    parser.add_argument("--dedoc-host", help="url of the dedoc instance for sending files (default: %(default)s)", type=str, default="http://localhost:1231")
    parser.add_argument("--pdf-options", help="values of the pdf_with_text_layer argument", choices=pdf_options, nargs="+", default=pdf_options)
    parser.add_argument("--parameters", help="path to a json file with alternative parameter dictionaries")
    args = parser.parse_args()

    if args.input != "":
        assert os.path.exists(args.input), f'Directory "{args.input}" does not exist'
        assert os.path.isdir(args.input), f'Path "{args.input}" is not a directory'

    assert args.loops > 0, "The number of repetitions of testing one file must be positive"

    print(f'Running the pdf performance benchmark with the following pdf options: {", ".join(args.pdf_options)}')
    configs = [{}]

    if args.parameters:
        with open(args.parameters, "r", encoding="utf-8") as f:
            configs = json.load(f)

    tasks = get_tasks(configs, args.input, args.dedoc_host, args.pdf_options)

    for _ in range(args.loops):
        for config_tasks in tasks:
            for task in config_tasks:
                task.run()
        make_report(tasks, args.output, configs)


"""
How to run on the default benchmark data?
Simply run the following command:
python3 benchmark_pdf_performance.py

Running on custom data:
1. Prepare a folder with tasks. A task is a directory with pdf files. Directories starting with an underscore (_) will be ignored.
   Example of a folder "pdf_data" with 3 tasks:
   pdf_data
   +--+ task1
   |  +--- file1.pdf
   |  +--- file2.pdf
   |
   +--+ Some second task name
   |  +--- f.pdf
   |
   +--+ And last task name
   |  +--- file_.pdf
   |  +--- file2.pdf
   |  +--- not_pdf_file.docx
   |
   +--+ _ignored folder
      +--- some_image.png
      +--- some_pdf.pdf

2. Run the script with the following command:
python3 benchmark_pdf_performance.py --pdf-options tabby true auto auto_tabby -i pdf_data

2*. To evaluate with different parameters, you can prepare a json file with a list of dictionaries and specify the "parameters" option:
parameters.json:
[
  { "need_pdf_table_analysis": "false" },
  { "need_pdf_table_analysis": "true", "return_format": "plain_text" }
]

Run with the following command:
python3 benchmark_pdf_performance.py --pdf-options tabby true auto auto_tabby -i pdf_data --parameters parameters.json

3. View your results in the pdf_performance.html file
"""
if __name__ == "__main__":
    main()
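
Usage note: besides the CLI entry point, the functions above can be driven programmatically. A minimal sketch under the assumptions that a dedoc instance is listening on http://localhost:1231 and that "pdf_data" is a task folder laid out as described in the docstring (both names are placeholders, not part of this PR):

# Programmatic run equivalent to one CLI invocation with --loops 1.
from scripts.benchmark_pdf_performance import get_tasks, make_report

configs = [{}, {"need_pdf_table_analysis": "false"}]  # one report section per config
tasks = get_tasks(configs, "pdf_data", "http://localhost:1231", ["tabby", "auto_tabby"])

for config_tasks in tasks:  # a single measurement pass over all tasks
    for task in config_tasks:
        task.run()

make_report(tasks, "benchmark_pdf_performance.html", configs)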
@@ -0,0 +1,94 @@
import os
import time
from typing import List

from pdfminer.pdfpage import PDFPage

from dedoc.utils.pdf_utils import get_pdf_page_count
from dedoc.utils.utils import send_file
from scripts.benchmark_utils.performance_result import PerformanceResult


class PDFPerformanceTask:
    """
    This class is used to estimate the elapsed time of different PDF pipelines
    on different PDF files and save the information into an HTML table.
    """

    def __init__(self, dedoc_host: str, title: str, input_dir: str, pdf_reader_options: List[str], config: dict) -> None:
        """
        Initialization of the task

        :param dedoc_host: URL of the dedoc API instance, for example http://localhost:1231
        :param title: title of the task to display in the HTML report
        :param input_dir: path to the directory containing the PDF files
        :param pdf_reader_options: list of options available for the "pdf_with_text_layer" API parameter
        :param config: additional file processing parameters
        """
        self.dedoc_host = dedoc_host
        self.title = title
        self.config = config
        self.pdf_reader_options = pdf_reader_options

        filenames = [os.path.join(input_dir, filename) for filename in os.listdir(input_dir) if filename.endswith(".pdf")]
        self.times = {pdf_option: {filename: PerformanceResult() for filename in filenames} for pdf_option in self.pdf_reader_options}
        self.pages = {filename: get_pdf_page_count(filename) for filename in filenames}
        self.filenames = sorted(filenames, key=lambda filename: self.pages[filename])

    def run(self) -> None:
        print(f'Run task "{self.title}"')

        for pdf_option in self.pdf_reader_options:
            print(f'  Handle files with pdf option "{pdf_option}":')
            self.__run_files(pdf_option)

    def to_html(self) -> str:
        if not self.filenames:
            return ""

        pdf_header = "".join(f"<th>{pdf_option}</th>" for pdf_option in self.pdf_reader_options)

        html = [
            "<details open>",
            f"<summary>{self.title} ({len(self.filenames)} files)</summary>", "<table>",
            f'<tr><th rowspan="2">Filename</th><th rowspan="2">Pages</th><th colspan="{len(self.pdf_reader_options) + 1}">pdf_with_text_layer</th></tr>',
            f"<tr>{pdf_header}<th>average</th></tr>"
        ]

        for filename in self.filenames:
            times = [self.times[pdf_option][filename] for pdf_option in self.pdf_reader_options]
            pages = self.pages[filename]
            html.append(f"<tr><td>{os.path.basename(filename)}</td><td>{pages}</td>{self.__get_performance_cells(times, pages)}</tr>")

        times = []
        for pdf_option in self.pdf_reader_options:
            times.append(PerformanceResult([self.times[pdf_option][filename] / self.pages[filename] for filename in self.filenames]))

        html.append(f'<tr><td colspan="2" onclick="HideFiles(this)">average (per page)</td>{self.__get_performance_cells(times)}</tr>')
        html.append("</table>")
        html.append("</details>\n")

        return "\n".join(html)

    def __run_file(self, pdf_option: str, filename: str) -> float:
        start_time = time.time()
        send_file(self.dedoc_host, os.path.basename(filename), filename, {"pdf_with_text_layer": pdf_option, **self.config})
        return time.time() - start_time

    def __run_files(self, pdf_option: str) -> None:
        for i, filename in enumerate(self.filenames):
            elapsed_time = self.__run_file(pdf_option, filename)
            self.times[pdf_option][filename].add(elapsed_time)
            print(f'  - handle file {i + 1} / {len(self.filenames)} "{os.path.basename(filename)}" (pages: {self.pages[filename]}): {elapsed_time} seconds')

        print("")

    def __get_performance_cells(self, pdf_times: List[PerformanceResult], pages: int = 0) -> str:
        total_times = pdf_times + [PerformanceResult(pdf_times)]
        return "".join(f"<td>{times} ({times / pages} / page)</td>" if pages > 0 else f"<td>{times}</td>" for times in total_times)

    def __get_page_count(self, path: str) -> int:
        # Fallback page counter based on pdfminer; the code above relies on
        # dedoc.utils.pdf_utils.get_pdf_page_count instead and never calls this method.
        with open(path, "rb") as fp:
            pages = len(list(PDFPage.get_pages(fp)))

        return max(pages, 1)
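
A note on the timed call: send_file comes from dedoc.utils.utils and is not part of this diff, so its body is not shown here. As a rough mental model only, it presumably uploads one file to the dedoc API with the given parameters. A hypothetical stand-in (the /upload endpoint and payload layout are assumptions, not taken from this PR):

import requests

def send_file_sketch(host: str, file_name: str, file_path: str, parameters: dict) -> dict:
    # Hypothetical sketch of what dedoc.utils.utils.send_file might do:
    # POST one file plus its processing parameters to a dedoc instance.
    with open(file_path, "rb") as file:
        response = requests.post(f"{host}/upload", files={"file": (file_name, file)}, data=parameters)
    response.raise_for_status()
    return response.json()

Because __run_file measures wall-clock time around this call, the reported numbers include network and serialization overhead, not just parsing time inside dedoc.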
@@ -0,0 +1,59 @@
from typing import Iterable, Optional, Union

import numpy as np


class PerformanceResult:
    """
    This class is used for storing multiple results of measuring some metric (for example, elapsed time),
    with support for calculating mean and std statistics and pretty printing of the stored values.

    >>> result = PerformanceResult()
    >>> f"result: {result}"  # result: -
    >>> result.add(5.0)
    >>> f"result: {result}"  # result: 5.00
    >>> result.add(8.0)
    >>> f"result: {result}"  # result: 6.50±1.50
    >>> result.mean  # 6.5
    >>> result.std  # 1.5
    >>> partial_result = result / 4
    >>> f"partial_result: {partial_result}"  # partial_result: 1.62±0.38
    """

    def __init__(self, results: Optional[Iterable["PerformanceResult"]] = None) -> None:
        self.values = []

        if results is not None:
            for result in results:
                self.add(result)

    def add(self, value: Union[float, "PerformanceResult"]) -> None:
        if isinstance(value, PerformanceResult):
            self.values.extend(value.values)
        else:
            self.values.append(value)

    @property
    def mean(self) -> float:
        return np.mean(self.values) if self.values else 0

    @property
    def std(self) -> float:
        return np.std(self.values) if self.values else 0

    def __str__(self) -> str:
        if not self.values:
            return "-"

        if len(self.values) == 1:
            return f"{self.mean:.2f}"

        return f"{self.mean:.2f}±{self.std:.2f}"

    def __truediv__(self, scale: float) -> "PerformanceResult":
        result = PerformanceResult()

        for t in self.values:
            result.add(t / scale)

        return result
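
To see how these pieces compose (mirroring how PDFPerformanceTask.to_html aggregates timings), a minimal self-contained sketch; the timing values are made up for illustration:

from scripts.benchmark_utils.performance_result import PerformanceResult

# Three repeated measurements of one file, in seconds.
file_time = PerformanceResult()
for elapsed in (2.0, 2.4, 2.2):
    file_time.add(elapsed)
print(file_time)  # 2.20±0.16 (mean±std over the three runs)

# Per-page time for a 4-page file, as used in the "average (per page)" row.
per_page = file_time / 4
print(per_page)  # 0.55±0.04

# Pooling several results into one, as PerformanceResult([...]) does in to_html:
# the pooled object holds all six underlying values.
pooled = PerformanceResult([file_time, per_page])
print(pooled.mean, pooled.std)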