TLDR-633 add PDF performance script #419

Merged 5 commits on Apr 3, 2024
1 change: 1 addition & 0 deletions .flake8
@@ -24,3 +24,4 @@ ignore =
     ANN101
 per-file-ignores =
     scripts/*:T201
+    scripts/benchmark_pdf_performance*:JS101,T201
7 changes: 3 additions & 4 deletions dedoc/utils/pdf_utils.py
@@ -1,15 +1,14 @@
 from typing import Optional

 from PIL.Image import Image
-from PyPDF2 import PdfFileReader
 from pdf2image import convert_from_path
+from pypdf import PdfReader


 def get_pdf_page_count(path: str) -> Optional[int]:
     try:
-        with open(path, "rb") as fl:
-            reader = PdfFileReader(fl)
-            return reader.getNumPages()
+        reader = PdfReader(path)
+        return len(reader.pages)
     except Exception:
         return None

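For context, a minimal sketch of the new pypdf-based page counting introduced above (the file path is a placeholder, not taken from this PR):

from pypdf import PdfReader

# pypdf exposes pages as a sequence, replacing PyPDF2's getNumPages()
reader = PdfReader("example.pdf")  # placeholder path
print(len(reader.pages))           # page count, e.g. 12
# get_pdf_page_count wraps this in a try/except and returns None for unreadable files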
1 change: 1 addition & 0 deletions requirements.txt
@@ -16,6 +16,7 @@ pdf2image==1.10.0 #1.14.0 - there are converting artifacts '№' != '№\n\x0c'
 pdfminer.six==20211012
 piexif==1.1.3
 pylzma==0.5.0
+pypdf==4.1.0
 PyPDF2==1.27.0
 pytesseract==0.3.10
 python-docx==0.8.11
236 changes: 236 additions & 0 deletions resources/benchmarks/benchmark_pdf_performance.html

Large diffs are not rendered by default.

155 changes: 155 additions & 0 deletions scripts/benchmark_pdf_performance.py
@@ -0,0 +1,155 @@
import argparse
import json
import os.path
import zipfile
from typing import List

import wget

from scripts.benchmark_utils.pdf_performance_task import PDFPerformanceTask


def download_data(data_path: str) -> None:
    data_archive_path = f"{data_path}.zip"

    wget.download("https://at.ispras.ru/owncloud/index.php/s/lp4wEVyZTd9lA0u/download", data_archive_path)
    with zipfile.ZipFile(data_archive_path, "r") as archive:
        archive.extractall(data_path)

    os.remove(data_archive_path)


def get_tasks(configs: List[dict], input_path: str, dedoc_host: str, pdf_options: List[str]) -> List[List[PDFPerformanceTask]]:
    if input_path == "":
        input_path = "pdf_performance_benchmark_data"
        download_data(input_path)

    tasks = []

    for config in configs:
        config_tasks = []

        for task_name in sorted(os.listdir(input_path)):
            files_path = os.path.join(input_path, task_name)
            if os.path.isdir(files_path) and not task_name.startswith("_"):
                config_tasks.append(PDFPerformanceTask(dedoc_host, task_name, files_path, pdf_options, config))

        tasks.append(config_tasks)

    return tasks


def make_report(tasks: List[List[PDFPerformanceTask]], output_path: str, configs: List[dict]) -> None:
    with open(output_path, "w", encoding="utf-8") as f:
        f.write("""<html>
        <head>
            <title>PDF performance benchmark</title>
            <style>
                p { margin-bottom: 5px; }
                pre { background: #f0f0f0; padding: 5px; margin: 0; }
                summary { font-weight: bold; font-size: 1.2em; margin-bottom: 5px; margin-top: 20px; }
                table { border-collapse: collapse; }
                td, th { padding: 5px 10px; border: 1px solid #000; text-align: center; }
                td:first-child { text-align: left; max-width: 600px; word-break: break-word; }
                td:last-child, tr:last-child td:not(:first-child) { background: #f0f0f0; }
                tr:last-child td:first-child { font-weight: bold; text-align: right; cursor: pointer; }
                .hidden-files tr:nth-child(n+3) { display: none; }
                .hidden-files tr:last-child { display: table-row; }
            </style>

            <script>
                function HideFiles(cell) {
                    cell.parentNode.parentNode.classList.toggle("hidden-files")
                }
            </script>
        </head>
        <body>""")

        for config, config_tasks in zip(configs, tasks):
            f.write("<p>Running parameters:</p>")
            f.write(f"<pre>{json.dumps(config, ensure_ascii=False, indent=2)}</pre>\n\n")

            for task in config_tasks:
                f.write(task.to_html())

        f.write("</body>\n")
        f.write("</html>\n")


def main() -> None:
    default_output_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "resources", "benchmarks", "benchmark_pdf_performance.html"))
    pdf_options = ["true", "false", "auto", "auto_tabby", "tabby"]

    parser = argparse.ArgumentParser(description="Script for evaluating the performance of different PDF readers.", formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument("-i", "--input", help="path to the directory with pdfs (default: %(default)s)", type=str, default="")
    parser.add_argument("-o", "--output", help="path to the report filename (default: %(default)s)", type=str, default=default_output_path)
    parser.add_argument("-n", "--loops", help="number of repetitions of testing one file (default: %(default)d)", type=int, default=1)
    parser.add_argument("--dedoc-host", help="URL of the dedoc instance for sending files (default: %(default)s)", type=str, default="http://localhost:1231")
    parser.add_argument("--pdf-options", help="values of the pdf_with_text_layer argument", choices=pdf_options, nargs="+", default=pdf_options)
    parser.add_argument("--parameters", help="path to a json file with a list of alternative parameter dictionaries")
    args = parser.parse_args()

    if args.input != "":
        assert os.path.exists(args.input), f'Directory "{args.input}" does not exist'
        assert os.path.isdir(args.input), f'Path "{args.input}" is not a directory'

    assert args.loops > 0, "The number of repetitions of testing one file must be positive"

    print(f'Running the PDF performance benchmark with the following pdf options: {", ".join(args.pdf_options)}')
    configs = [{}]

    if args.parameters:
        with open(args.parameters, "r", encoding="utf-8") as f:
            configs = json.load(f)

    tasks = get_tasks(configs, args.input, args.dedoc_host, args.pdf_options)

    for _ in range(args.loops):
        for config_tasks in tasks:
            for task in config_tasks:
                task.run()
        make_report(tasks, args.output, configs)


"""
How to run on default benchmark data?
Simple run next command:
python3 benchmark_pdf_performance.py

Running on custom data:
1. Prepare folder with tasks. The task is a directory with pdf files. Directories starting with an underscore (_) will be ignored.
Example of a folder "pdf_data" with 3 tasks:
pdf_data
+--+--+ task1
| +--- file1.pdf
| +--- file2.pdf
|
+--+ Some second task name
| +--- f.pdf
|
+--+ And last task name
| +--- file_.pdf
| +--- file2.pdf
| +--- not_pdf_file.docx
|
+--+ _ignored folder
+--- some_image.png
+--- some_pdf.pdf

2. Run script with next command:
python3 benchmark_pdf_performance.py --pdf-options tabby true auto auto_tabby -i pdf_data

2*. To evaluate with different parameters, you can prepare a json file with a list of dictionaries and specify the “parameters” option:
parameters.json:
[
{ "need_pdf_table_analysis": "false" },
{ "need_pdf_table_analysis": "true", "return_format": "plain_text" }
]

Run with next command:
python3 benchmark_pdf_performance.py --pdf-options tabby true auto auto_tabby -i pdf_data --parameters parameters.json

3. Look your results in the pdf_performance.html file
"""
if __name__ == "__main__":
    main()
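The benchmark measures the wall-clock time of requests sent to a running dedoc instance (see the send_file call in pdf_performance_task.py below). A rough standalone sketch of one such timed request, assuming dedoc's /upload endpoint and the requests library (neither appears in this diff):

import time

import requests


def timed_upload(dedoc_host: str, pdf_path: str, pdf_option: str) -> float:
    # send one PDF to dedoc and return the elapsed wall-clock time in seconds
    start = time.time()
    with open(pdf_path, "rb") as pdf_file:
        response = requests.post(
            f"{dedoc_host}/upload",                    # assumed dedoc API endpoint
            files={"file": pdf_file},
            data={"pdf_with_text_layer": pdf_option},  # the parameter the benchmark varies
        )
    response.raise_for_status()
    return time.time() - start


print(timed_upload("http://localhost:1231", "example.pdf", "auto_tabby"))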
94 changes: 94 additions & 0 deletions scripts/benchmark_utils/pdf_performance_task.py
@@ -0,0 +1,94 @@
import os
import time
from typing import List

from pdfminer.pdfpage import PDFPage

from dedoc.utils.pdf_utils import get_pdf_page_count
from dedoc.utils.utils import send_file
from scripts.benchmark_utils.performance_result import PerformanceResult


class PDFPerformanceTask:
    """
    This class is used to estimate the elapsed time of different PDF pipelines
    on different PDF files and to save the information into an HTML table.
    """

    def __init__(self, dedoc_host: str, title: str, input_dir: str, pdf_reader_options: List[str], config: dict) -> None:
        """
        Initialization of the task

        :param dedoc_host: URL of the running dedoc API instance, for example http://localhost:1231
        :param title: title of the task to display in the HTML report
        :param input_dir: path to the directory containing the PDF files
        :param pdf_reader_options: list of options available for the "pdf_with_text_layer" API parameter
        :param config: additional file processing parameters
        """
        self.dedoc_host = dedoc_host
        self.title = title
        self.config = config
        self.pdf_reader_options = pdf_reader_options

        filenames = [os.path.join(input_dir, filename) for filename in os.listdir(input_dir) if filename.endswith(".pdf")]
        self.times = {pdf_option: {filename: PerformanceResult() for filename in filenames} for pdf_option in self.pdf_reader_options}
        self.pages = {filename: get_pdf_page_count(filename) for filename in filenames}
        self.filenames = sorted(filenames, key=lambda filename: self.pages[filename])

    def run(self) -> None:
        print(f'Run task "{self.title}"')

        for pdf_option in self.pdf_reader_options:
            print(f' Handle files with pdf option "{pdf_option}":')
            self.__run_files(pdf_option)

    def to_html(self) -> str:
        if not self.filenames:
            return ""

        pdf_header = "".join(f"<th>{pdf_option}</th>" for pdf_option in self.pdf_reader_options)

        html = [
            "<details open>",
            f"<summary>{self.title} ({len(self.filenames)} files)</summary>", "<table>",
            f'<tr><th rowspan="2">Filename</th><th rowspan="2">Pages</th><th colspan="{len(self.pdf_reader_options) + 1}">pdf_with_text_layer</th></tr>',
            f"<tr>{pdf_header}<th>average</th></tr>"
        ]

        for filename in self.filenames:
            times = [self.times[pdf_option][filename] for pdf_option in self.pdf_reader_options]
            pages = self.pages[filename]
            html.append(f"<tr><td>{os.path.basename(filename)}</td><td>{pages}</td>{self.__get_performance_cells(times, pages)}</tr>")

        times = []
        for pdf_option in self.pdf_reader_options:
            times.append(PerformanceResult([self.times[pdf_option][filename] / self.pages[filename] for filename in self.filenames]))

        html.append(f'<tr><td colspan="2" onclick="HideFiles(this)">average (per page)</td>{self.__get_performance_cells(times)}</tr>')
        html.append("</table>")
        html.append("</details>\n")

        return "\n".join(html)

    def __run_file(self, pdf_option: str, filename: str) -> float:
        start_time = time.time()
        send_file(self.dedoc_host, os.path.basename(filename), filename, {"pdf_with_text_layer": pdf_option, **self.config})
        return time.time() - start_time

    def __run_files(self, pdf_option: str) -> None:
        for i, filename in enumerate(self.filenames):
            elapsed_time = self.__run_file(pdf_option, filename)
            self.times[pdf_option][filename].add(elapsed_time)
            print(f' - handle file {i + 1} / {len(self.filenames)} "{os.path.basename(filename)}" (pages: {self.pages[filename]}): {elapsed_time} seconds')

        print("")

    def __get_performance_cells(self, pdf_times: List[PerformanceResult], pages: int = 0) -> str:
        total_times = pdf_times + [PerformanceResult(pdf_times)]
        return "".join(f"<td>{times} ({times / pages} / page)</td>" if pages > 0 else f"<td>{times}</td>" for times in total_times)

    def __get_page_count(self, path: str) -> int:
        with open(path, "rb") as fp:
            pages = len(list(PDFPage.get_pages(fp)))

        return max(pages, 1)
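A minimal usage sketch for this class outside the benchmark script; the host, directory and option values below are placeholders:

from scripts.benchmark_utils.pdf_performance_task import PDFPerformanceTask

# one task corresponds to one directory of PDF files
task = PDFPerformanceTask(
    dedoc_host="http://localhost:1231",
    title="smoke test",
    input_dir="pdf_data/task1",                  # placeholder directory with *.pdf files
    pdf_reader_options=["tabby", "auto_tabby"],
    config={},
)
task.run()                      # sends every file once per pdf option and records elapsed times
html_fragment = task.to_html()  # <details> block with the per-file timing table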
59 changes: 59 additions & 0 deletions scripts/benchmark_utils/performance_result.py
@@ -0,0 +1,59 @@
from typing import Iterable, Optional, Union

import numpy as np


class PerformanceResult:
    """
    This class is used for storing multiple results of measuring some metric (for example, elapsed time)
    with support for calculating mean and std statistics and pretty printing of stored values

    >>> result = PerformanceResult()
    >>> f"result: {result}" # result: -
    >>> result.add(5.0)
    >>> f"result: {result}" # result: 5.00
    >>> result.add(8.0)
    >>> f"result: {result}" # result: 6.50±1.50
    >>> result.mean # 6.5
    >>> result.std # 1.5
    >>> partial_result = result / 4
    >>> f"partial_result: {partial_result}" # partial_result: 1.62±0.38
    """

    def __init__(self, results: Optional[Iterable["PerformanceResult"]] = None) -> None:
        self.values = []

        if results is not None:
            for result in results:
                self.add(result)

    def add(self, value: Union[float, "PerformanceResult"]) -> None:
        if isinstance(value, PerformanceResult):
            self.values.extend(value.values)
        else:
            self.values.append(value)

    @property
    def mean(self) -> float:
        return np.mean(self.values) if self.values else 0

    @property
    def std(self) -> float:
        return np.std(self.values) if self.values else 0

    def __str__(self) -> str:
        if not self.values:
            return "-"

        if len(self.values) == 1:
            return f"{self.mean:.2f}"

        return f"{self.mean:.2f}±{self.std:.2f}"

    def __truediv__(self, scale: float) -> "PerformanceResult":
        result = PerformanceResult()

        for t in self.values:
            result.add(t / scale)

        return result
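A short sketch of the aggregation behaviour the report code relies on (the numbers are illustrative):

from scripts.benchmark_utils.performance_result import PerformanceResult

first, second = PerformanceResult(), PerformanceResult()
first.add(2.0)
first.add(4.0)
second.add(10.0)

combined = PerformanceResult([first, second])  # pools all stored values: [2.0, 4.0, 10.0]
print(combined)                                # 5.33±3.40
per_page = combined / 4                        # scales every value, as in the "average (per page)" row
print(per_page)                                # 1.33±0.85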