TLDR-633 add PDF performance script #419

Merged: 5 commits, Apr 3, 2024
1 change: 1 addition & 0 deletions .flake8
@@ -24,3 +24,4 @@ ignore =
     ANN101
 per-file-ignores =
     scripts/*:T201
+    scripts/benchmark_pdf_performance*:JS101,T201
7 changes: 3 additions & 4 deletions dedoc/utils/pdf_utils.py
@@ -1,15 +1,14 @@
 from typing import Optional

 from PIL.Image import Image
-from PyPDF2 import PdfFileReader
 from pdf2image import convert_from_path
+from pypdf import PdfReader


 def get_pdf_page_count(path: str) -> Optional[int]:
     try:
-        with open(path, "rb") as fl:
-            reader = PdfFileReader(fl)
-            return reader.getNumPages()
+        reader = PdfReader(path)
+        return len(reader.pages)
     except Exception:
         return None

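For context on the reader swap above: pypdf's `PdfReader` accepts a path directly, so the explicit file handle is no longer needed. A minimal usage sketch of the updated helper (the file path is a placeholder):

```python
# Minimal usage sketch; "example.pdf" is a placeholder path, not a file from this PR.
from dedoc.utils.pdf_utils import get_pdf_page_count

page_count = get_pdf_page_count("example.pdf")
print(page_count)  # number of pages, or None if the file could not be parsed
```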
1 change: 1 addition & 0 deletions requirements.txt
@@ -16,6 +16,7 @@ pdf2image==1.10.0 #1.14.0 - there are converting artifacts '№' != '№\n\x0c'
 pdfminer.six==20211012
 piexif==1.1.3
 pylzma==0.5.0
+pypdf==4.1.0
 PyPDF2==1.27.0
 pytesseract==0.3.10
 python-docx==0.8.11
94 changes: 94 additions & 0 deletions scripts/benchmark_pdf_performance.py
@@ -0,0 +1,94 @@
import argparse
import json
import os.path
from typing import List

from scripts.benchmark_utils.pdf_performance_task import PDFPerformanceTask


def get_tasks(configs: List[dict], input_path: str, dedoc_host: str, pdf_options: List[str]) -> List[List[PDFPerformanceTask]]:
    tasks = []

    for config in configs:
        config_tasks = []

        for task_name in sorted(os.listdir(input_path)):
            files_path = os.path.join(input_path, task_name)
            if os.path.isdir(files_path) and not task_name.startswith("_"):
                config_tasks.append(PDFPerformanceTask(dedoc_host, task_name, files_path, pdf_options, config))

        tasks.append(config_tasks)

    return tasks


def make_report(tasks: List[List[PDFPerformanceTask]], output_path: str, configs: List[dict]) -> None:
    with open(output_path, "w", encoding="utf-8") as f:
        f.write("""<html>
        <head>
            <title>PDF performance benchmark</title>
            <style>
                p { margin-bottom: 5px; }
                pre { background: #f0f0f0; padding: 5px; margin: 0; }
                summary { font-weight: bold; font-size: 1.2em; margin-bottom: 5px; margin-top: 20px; }
                table { border-collapse: collapse; }
                td, th { padding: 5px 10px; border: 1px solid #000; text-align: center; }
                td:first-child { text-align: left; max-width: 600px; word-break: break-word; }
                td:last-child, tr:last-child td:not(:first-child) { background: #f0f0f0; }
                tr:last-child td:first-child { font-weight: bold; text-align: right; cursor: pointer; }
                .hidden-files tr:nth-child(n+3) { display: none; }
                .hidden-files tr:last-child { display: table-row; }
            </style>

            <script>
                function HideFiles(cell) {
                    cell.parentNode.parentNode.classList.toggle("hidden-files")
                }
            </script>
        </head>
        <body>""")

        for config, config_tasks in zip(configs, tasks):
            f.write("<p>Running parameters:</p>")
            f.write(f"<pre>{json.dumps(config, ensure_ascii=False, indent=2)}</pre>\n\n")

            for task in config_tasks:
                f.write(task.to_html())

        f.write("</body>\n")
        f.write("</html>\n")


def main() -> None:
    pdf_options = ["true", "false", "auto", "auto_tabby", "tabby"]
    parser = argparse.ArgumentParser(description="Script for evaluating the performance of different PDF readers.", formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument("-i", "--input", help="path to the directory with pdfs (default: %(default)s)", type=str, default="pdf_data")
    parser.add_argument("-o", "--output", help="path to the output report file (default: %(default)s)", type=str, default="pdf_performance.html")
    parser.add_argument("-n", "--loops", help="number of repetitions of testing one file (default: %(default)d)", type=int, default=1)
    parser.add_argument("--dedoc-host", help="url to the dedoc instance for sending files (default: %(default)s)", type=str, default="http://localhost:1231")
    parser.add_argument("--pdf-options", help="values of pdf_with_text_layer argument", choices=pdf_options, nargs="+", required=True)
    parser.add_argument("--parameters", help="path to json file with alternative parameters dictionaries")
    args = parser.parse_args()

    assert os.path.exists(args.input), f'Directory "{args.input}" does not exist'
    assert os.path.isdir(args.input), f'Path "{args.input}" is not a directory'
    assert args.loops > 0, "The number of repetitions of testing one file must be positive"

    print(f'Run pdf performance benchmark with the following pdf options: {", ".join(args.pdf_options)}')
    configs = [{}]

    if args.parameters:
        with open(args.parameters, "r", encoding="utf-8") as f:
            configs = json.load(f)

    tasks = get_tasks(configs, args.input, args.dedoc_host, args.pdf_options)

    for _ in range(args.loops):
        for config_tasks in tasks:
            for task in config_tasks:
                task.run()
                make_report(tasks, args.output, configs)


if __name__ == "__main__":
    main()
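The `--parameters` option expects a JSON file containing a list of dictionaries; each dictionary is merged into the request parameters sent to dedoc, and the whole set of tasks is benchmarked once per dictionary. A sketch of such a file and of a possible invocation (the parameter key below is illustrative and not part of this PR):

```python
# Sketch of a --parameters file: a JSON list of request-parameter dictionaries.
# The non-empty key shown is an illustrative dedoc API option, not verified here.
import json

configs = [
    {},                                    # run with default parameters
    {"need_pdf_table_analysis": "false"},  # hypothetical alternative configuration
]

with open("parameters.json", "w", encoding="utf-8") as f:
    json.dump(configs, f, indent=2)

# Assuming a dedoc instance is listening on http://localhost:1231, the benchmark
# could then be invoked from the repository root roughly as:
#   python -m scripts.benchmark_pdf_performance --pdf-options tabby true \
#       -i pdf_data -o pdf_performance.html -n 3 --parameters parameters.json
```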
80 changes: 80 additions & 0 deletions scripts/benchmark_utils/pdf_performance_task.py
@@ -0,0 +1,80 @@
import os
import time
from typing import List

from pdfminer.pdfpage import PDFPage

from dedoc.utils.pdf_utils import get_pdf_page_count
from dedoc.utils.utils import send_file
from scripts.benchmark_utils.performance_time import PerformanceResult


class PDFPerformanceTask:
    def __init__(self, dedoc_host: str, title: str, input_dir: str, pdf_reader_options: List[str], config: dict) -> None:
        self.dedoc_host = dedoc_host
        self.title = title
        self.config = config
        self.pdf_reader_options = pdf_reader_options

        filenames = [os.path.join(input_dir, filename) for filename in os.listdir(input_dir) if filename.endswith(".pdf")]
        self.times = {pdf_option: {filename: PerformanceResult() for filename in filenames} for pdf_option in self.pdf_reader_options}
        self.pages = {filename: get_pdf_page_count(filename) for filename in filenames}
        self.filenames = sorted(filenames, key=lambda filename: self.pages[filename])

    def run(self) -> None:
        print(f'Run task "{self.title}"')

        for pdf_option in self.pdf_reader_options:
            print(f'  Handle files with pdf option "{pdf_option}":')
            self.__run_files(pdf_option)

    def to_html(self) -> str:
        if not self.filenames:
            return ""

        pdf_header = "".join(f"<th>{pdf_option}</th>" for pdf_option in self.pdf_reader_options)

        html = [
            "<details open>",
            f"<summary>{self.title} ({len(self.filenames)} files)</summary>", "<table>",
            f'<tr><th rowspan="2">Filename</th><th rowspan="2">Pages</th><th colspan="{len(self.pdf_reader_options) + 1}">pdf_with_text_layer</th></tr>',
            f"<tr>{pdf_header}<th>average</th></tr>"
        ]

        for filename in self.filenames:
            times = [self.times[pdf_option][filename] for pdf_option in self.pdf_reader_options]
            pages = self.pages[filename]
            html.append(f"<tr><td>{os.path.basename(filename)}</td><td>{pages}</td>{self.__get_performance_cells(times, pages)}</tr>")

        times = []
        for pdf_option in self.pdf_reader_options:
            times.append(PerformanceResult([self.times[pdf_option][filename] / self.pages[filename] for filename in self.filenames]))

        html.append(f'<tr><td colspan="2" onclick="HideFiles(this)">average (per page)</td>{self.__get_performance_cells(times)}</tr>')
        html.append("</table>")
        html.append("</details>\n")

        return "\n".join(html)

    def __run_file(self, pdf_option: str, filename: str) -> float:
        start_time = time.time()
        send_file(self.dedoc_host, os.path.basename(filename), filename, {"pdf_with_text_layer": pdf_option, **self.config})
        return time.time() - start_time

    def __run_files(self, pdf_option: str) -> None:
        for i, filename in enumerate(self.filenames):
            elapsed_time = self.__run_file(pdf_option, filename)
            self.times[pdf_option][filename].add(elapsed_time)
            print(f'    - handle file {i + 1} / {len(self.filenames)} "{os.path.basename(filename)}" (pages: {self.pages[filename]}): {elapsed_time} seconds')

        print("")

    def __get_performance_cells(self, pdf_times: List[PerformanceResult], pages: int = 0) -> str:
        total_times = pdf_times + [PerformanceResult(pdf_times)]
        return "".join(f"<td>{times} ({times / pages} / page)</td>" if pages > 0 else f"<td>{times}</td>" for times in total_times)

    def __get_page_count(self, path: str) -> int:
        with open(path, "rb") as fp:
            pages = len(list(PDFPage.get_pages(fp)))

        return max(pages, 1)
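For illustration, a single task can also be driven directly. The sketch below assumes a running dedoc instance and a directory of PDFs; the host, title, and paths are placeholders:

```python
# Hypothetical standalone use of PDFPerformanceTask; host and paths are placeholders.
from scripts.benchmark_utils.pdf_performance_task import PDFPerformanceTask

task = PDFPerformanceTask(
    dedoc_host="http://localhost:1231",
    title="example_task",
    input_dir="pdf_data/example_task",
    pdf_reader_options=["tabby", "true"],
    config={},
)
task.run()                       # posts every PDF once per pdf_with_text_layer value
html_fragment = task.to_html()   # <details> block with the per-file timing table
```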
43 changes: 43 additions & 0 deletions scripts/benchmark_utils/performance_time.py
@@ -0,0 +1,43 @@
from typing import Iterable, Optional, Union

import numpy as np


class PerformanceResult:
    def __init__(self, results: Optional[Iterable["PerformanceResult"]] = None) -> None:
        self.values = []

        if results is not None:
            for result in results:
                self.add(result)

    def add(self, value: Union[float, "PerformanceResult"]) -> None:
        if isinstance(value, PerformanceResult):
            self.values.extend(value.values)
        else:
            self.values.append(value)

    @property
    def mean(self) -> float:
        return np.mean(self.values) if self.values else 0

    @property
    def std(self) -> float:
        return np.std(self.values) if self.values else 0

    def __str__(self) -> str:
        if not self.values:
            return "-"

        if len(self.values) == 1:
            return f"{self.mean:.2f}"

        return f"{self.mean:.2f}±{self.std:.2f}"

    def __truediv__(self, scale: float) -> "PerformanceResult":
        result = PerformanceResult()

        for t in self.values:
            result.add(t / scale)

        return result
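A small usage sketch of `PerformanceResult` with made-up timings, showing how the report cells are produced:

```python
# Made-up timings to illustrate aggregation and formatting.
from scripts.benchmark_utils.performance_time import PerformanceResult

result = PerformanceResult()
result.add(1.2)
result.add(1.4)
print(result)          # "1.30±0.10" -- mean ± std over the recorded runs

per_page = result / 4  # scale each measurement, e.g. to seconds per page
merged = PerformanceResult([result, per_page])  # pool values from several results
print(f"{merged.mean:.2f}")  # 0.81
```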