TLDR-633 add PDF performance script #419

Merged 5 commits on Apr 3, 2024
1 change: 1 addition & 0 deletions .flake8
@@ -24,3 +24,4 @@ ignore =
     ANN101
 per-file-ignores =
     scripts/*:T201
+    scripts/benchmark_pdf_performance*:JS101,T201
7 changes: 3 additions & 4 deletions dedoc/utils/pdf_utils.py
@@ -1,15 +1,14 @@
 from typing import Optional

 from PIL.Image import Image
-from PyPDF2 import PdfFileReader
 from pdf2image import convert_from_path
+from pypdf import PdfReader


 def get_pdf_page_count(path: str) -> Optional[int]:
     try:
-        with open(path, "rb") as fl:
-            reader = PdfFileReader(fl)
-            return reader.getNumPages()
+        reader = PdfReader(path)
+        return len(reader.pages)
     except Exception:
         return None

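For context, a minimal sketch of the new pypdf-based page counting introduced above (the file path is a placeholder, not taken from this PR):

from pypdf import PdfReader

# pypdf exposes pages as a sequence, replacing PyPDF2's getNumPages()
reader = PdfReader("example.pdf")  # placeholder path
print(len(reader.pages))           # page count, e.g. 12
# get_pdf_page_count wraps this in a try/except and returns None for unreadable files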
1 change: 1 addition & 0 deletions requirements.txt
@@ -16,6 +16,7 @@ pdf2image==1.10.0 #1.14.0 - there are converting artifacts '№' != '№\n\x0c'
 pdfminer.six==20211012
 piexif==1.1.3
 pylzma==0.5.0
+pypdf==4.1.0
 PyPDF2==1.27.0
 pytesseract==0.3.10
 python-docx==0.8.11
236 changes: 236 additions & 0 deletions resources/benchmarks/benchmark_pdf_performance.html

Large diffs are not rendered by default.

155 changes: 155 additions & 0 deletions scripts/benchmark_pdf_performance.py
@@ -0,0 +1,155 @@
import argparse
import json
import os.path
import zipfile
from typing import List

import wget

from scripts.benchmark_utils.pdf_performance_task import PDFPerformanceTask


def download_data(data_path: str) -> None:
    data_archive_path = f"{data_path}.zip"

    wget.download("https://at.ispras.ru/owncloud/index.php/s/lp4wEVyZTd9lA0u/download", data_archive_path)
    with zipfile.ZipFile(data_archive_path, "r") as archive:
        archive.extractall(data_path)

    os.remove(data_archive_path)


def get_tasks(configs: List[dict], input_path: str, dedoc_host: str, pdf_options: List[str]) -> List[List[PDFPerformanceTask]]:
    if input_path == "":
        input_path = "pdf_performance_benchmark_data"
        download_data(input_path)

    tasks = []

    for config in configs:
        config_tasks = []

        for task_name in sorted(os.listdir(input_path)):
            files_path = os.path.join(input_path, task_name)
            if os.path.isdir(files_path) and not task_name.startswith("_"):
                config_tasks.append(PDFPerformanceTask(dedoc_host, task_name, files_path, pdf_options, config))

        tasks.append(config_tasks)

    return tasks


def make_report(tasks: List[List[PDFPerformanceTask]], output_path: str, configs: List[dict]) -> None:
    with open(output_path, "w", encoding="utf-8") as f:
        f.write("""<html>
        <head>
            <title>PDF performance benchmark</title>
            <style>
                p { margin-bottom: 5px; }
                pre { background: #f0f0f0; padding: 5px; margin: 0; }
                summary { font-weight: bold; font-size: 1.2em; margin-bottom: 5px; margin-top: 20px; }
                table { border-collapse: collapse; }
                td, th { padding: 5px 10px; border: 1px solid #000; text-align: center; }
                td:first-child { text-align: left; max-width: 600px; word-break: break-word; }
                td:last-child, tr:last-child td:not(:first-child) { background: #f0f0f0; }
                tr:last-child td:first-child { font-weight: bold; text-align: right; cursor: pointer; }
                .hidden-files tr:nth-child(n+3) { display: none; }
                .hidden-files tr:last-child { display: table-row; }
            </style>

            <script>
                function HideFiles(cell) {
                    cell.parentNode.parentNode.classList.toggle("hidden-files")
                }
            </script>
        </head>
        <body>""")

        for config, config_tasks in zip(configs, tasks):
            f.write("<p>Running parameters:</p>")
            f.write(f"<pre>{json.dumps(config, ensure_ascii=False, indent=2)}</pre>\n\n")

            for task in config_tasks:
                f.write(task.to_html())

        f.write("</body>\n")
        f.write("</html>\n")


def main() -> None:
    default_output_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "resources", "benchmarks", "benchmark_pdf_performance.html"))
    pdf_options = ["true", "false", "auto", "auto_tabby", "tabby"]

    parser = argparse.ArgumentParser(description="Script for evaluating the performance of different PDF readers.", formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument("-i", "--input", help="path to the directory with pdfs (default: %(default)s)", type=str, default="")
    parser.add_argument("-o", "--output", help="path to the report filename (default: %(default)s)", type=str, default=default_output_path)
    parser.add_argument("-n", "--loops", help="number of repetitions of testing one file (default: %(default)d)", type=int, default=1)
    parser.add_argument("--dedoc-host", help="URL of the dedoc instance for sending files (default: %(default)s)", type=str, default="http://localhost:1231")
    parser.add_argument("--pdf-options", help="values of the pdf_with_text_layer argument", choices=pdf_options, nargs="+", default=pdf_options)
    parser.add_argument("--parameters", help="path to a json file with a list of alternative parameter dictionaries")
    args = parser.parse_args()

    if args.input != "":
        assert os.path.exists(args.input), f'Directory "{args.input}" does not exist'
        assert os.path.isdir(args.input), f'Path "{args.input}" is not a directory'

    assert args.loops > 0, "The number of repetitions of testing one file must be positive"

    print(f'Running the PDF performance benchmark with the following pdf options: {", ".join(args.pdf_options)}')
    configs = [{}]

    if args.parameters:
        with open(args.parameters, "r", encoding="utf-8") as f:
            configs = json.load(f)

    tasks = get_tasks(configs, args.input, args.dedoc_host, args.pdf_options)

    for _ in range(args.loops):
        for config_tasks in tasks:
            for task in config_tasks:
                task.run()
        make_report(tasks, args.output, configs)


"""
How to run on default benchmark data?
Simple run next command:
python3 benchmark_pdf_performance.py

Running on custom data:
1. Prepare folder with tasks. The task is a directory with pdf files. Directories starting with an underscore (_) will be ignored.
Example of a folder "pdf_data" with 3 tasks:
pdf_data
+--+--+ task1
| +--- file1.pdf
| +--- file2.pdf
|
+--+ Some second task name
| +--- f.pdf
|
+--+ And last task name
| +--- file_.pdf
| +--- file2.pdf
| +--- not_pdf_file.docx
|
+--+ _ignored folder
+--- some_image.png
+--- some_pdf.pdf

2. Run script with next command:
python3 benchmark_pdf_performance.py --pdf-options tabby true auto auto_tabby -i pdf_data

2*. To evaluate with different parameters, you can prepare a json file with a list of dictionaries and specify the “parameters” option:
parameters.json:
[
{ "need_pdf_table_analysis": "false" },
{ "need_pdf_table_analysis": "true", "return_format": "plain_text" }
]

Run with next command:
python3 benchmark_pdf_performance.py --pdf-options tabby true auto auto_tabby -i pdf_data --parameters parameters.json

3. Look your results in the pdf_performance.html file
"""
if __name__ == "__main__":
    main()
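The benchmark measures the wall-clock time of requests sent to a running dedoc instance (see the send_file call in pdf_performance_task.py below). A rough standalone sketch of one such timed request, assuming dedoc's /upload endpoint and the requests library (neither appears in this diff):

import time

import requests


def timed_upload(dedoc_host: str, pdf_path: str, pdf_option: str) -> float:
    # send one PDF to dedoc and return the elapsed wall-clock time in seconds
    start = time.time()
    with open(pdf_path, "rb") as pdf_file:
        response = requests.post(
            f"{dedoc_host}/upload",                    # assumed dedoc API endpoint
            files={"file": pdf_file},
            data={"pdf_with_text_layer": pdf_option},  # the parameter the benchmark varies
        )
    response.raise_for_status()
    return time.time() - start


print(timed_upload("http://localhost:1231", "example.pdf", "auto_tabby"))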
94 changes: 94 additions & 0 deletions scripts/benchmark_utils/pdf_performance_task.py
@@ -0,0 +1,94 @@
import os
import time
from typing import List

from pdfminer.pdfpage import PDFPage

from dedoc.utils.pdf_utils import get_pdf_page_count
from dedoc.utils.utils import send_file
from scripts.benchmark_utils.performance_result import PerformanceResult


class PDFPerformanceTask:
    """
    This class is used to estimate the elapsed time of different PDF pipelines
    on different PDF files and to save the information into an HTML table.
    """

    def __init__(self, dedoc_host: str, title: str, input_dir: str, pdf_reader_options: List[str], config: dict) -> None:
        """
        Initialization of the task

        :param dedoc_host: URL of the running dedoc API instance, for example http://localhost:1231
        :param title: title of the task to display in the HTML report
        :param input_dir: path to the directory containing the PDF files
        :param pdf_reader_options: list of options available for the "pdf_with_text_layer" API parameter
        :param config: additional file processing parameters
        """
        self.dedoc_host = dedoc_host
        self.title = title
        self.config = config
        self.pdf_reader_options = pdf_reader_options

        filenames = [os.path.join(input_dir, filename) for filename in os.listdir(input_dir) if filename.endswith(".pdf")]
        self.times = {pdf_option: {filename: PerformanceResult() for filename in filenames} for pdf_option in self.pdf_reader_options}
        self.pages = {filename: get_pdf_page_count(filename) for filename in filenames}
        self.filenames = sorted(filenames, key=lambda filename: self.pages[filename])

    def run(self) -> None:
        print(f'Run task "{self.title}"')

        for pdf_option in self.pdf_reader_options:
            print(f' Handle files with pdf option "{pdf_option}":')
            self.__run_files(pdf_option)

    def to_html(self) -> str:
        if not self.filenames:
            return ""

        pdf_header = "".join(f"<th>{pdf_option}</th>" for pdf_option in self.pdf_reader_options)

        html = [
            "<details open>",
            f"<summary>{self.title} ({len(self.filenames)} files)</summary>", "<table>",
            f'<tr><th rowspan="2">Filename</th><th rowspan="2">Pages</th><th colspan="{len(self.pdf_reader_options) + 1}">pdf_with_text_layer</th></tr>',
            f"<tr>{pdf_header}<th>average</th></tr>"
        ]

        for filename in self.filenames:
            times = [self.times[pdf_option][filename] for pdf_option in self.pdf_reader_options]
            pages = self.pages[filename]
            html.append(f"<tr><td>{os.path.basename(filename)}</td><td>{pages}</td>{self.__get_performance_cells(times, pages)}</tr>")

        times = []
        for pdf_option in self.pdf_reader_options:
            times.append(PerformanceResult([self.times[pdf_option][filename] / self.pages[filename] for filename in self.filenames]))

        html.append(f'<tr><td colspan="2" onclick="HideFiles(this)">average (per page)</td>{self.__get_performance_cells(times)}</tr>')
        html.append("</table>")
        html.append("</details>\n")

        return "\n".join(html)

    def __run_file(self, pdf_option: str, filename: str) -> float:
        start_time = time.time()
        send_file(self.dedoc_host, os.path.basename(filename), filename, {"pdf_with_text_layer": pdf_option, **self.config})
        return time.time() - start_time

    def __run_files(self, pdf_option: str) -> None:
        for i, filename in enumerate(self.filenames):
            elapsed_time = self.__run_file(pdf_option, filename)
            self.times[pdf_option][filename].add(elapsed_time)
            print(f' - handle file {i + 1} / {len(self.filenames)} "{os.path.basename(filename)}" (pages: {self.pages[filename]}): {elapsed_time} seconds')

        print("")

    def __get_performance_cells(self, pdf_times: List[PerformanceResult], pages: int = 0) -> str:
        total_times = pdf_times + [PerformanceResult(pdf_times)]
        return "".join(f"<td>{times} ({times / pages} / page)</td>" if pages > 0 else f"<td>{times}</td>" for times in total_times)

    def __get_page_count(self, path: str) -> int:
        with open(path, "rb") as fp:
            pages = len(list(PDFPage.get_pages(fp)))

        return max(pages, 1)
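A minimal usage sketch for this class outside the benchmark script; the host, directory and option values below are placeholders:

from scripts.benchmark_utils.pdf_performance_task import PDFPerformanceTask

# one task corresponds to one directory of PDF files
task = PDFPerformanceTask(
    dedoc_host="http://localhost:1231",
    title="smoke test",
    input_dir="pdf_data/task1",                  # placeholder directory with *.pdf files
    pdf_reader_options=["tabby", "auto_tabby"],
    config={},
)
task.run()                      # sends every file once per pdf option and records elapsed times
html_fragment = task.to_html()  # <details> block with the per-file timing table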
59 changes: 59 additions & 0 deletions scripts/benchmark_utils/performance_result.py
@@ -0,0 +1,59 @@
from typing import Iterable, Optional, Union

import numpy as np


class PerformanceResult:
    """
    This class is used for storing multiple results of measuring some metric (for example, elapsed time)
    with support for calculating mean and std statistics and pretty printing of stored values

    >>> result = PerformanceResult()
    >>> f"result: {result}" # result: -
    >>> result.add(5.0)
    >>> f"result: {result}" # result: 5.00
    >>> result.add(8.0)
    >>> f"result: {result}" # result: 6.50±1.50
    >>> result.mean # 6.5
    >>> result.std # 1.5
    >>> partial_result = result / 4
    >>> f"partial_result: {partial_result}" # partial_result: 1.62±0.38
    """

    def __init__(self, results: Optional[Iterable["PerformanceResult"]] = None) -> None:
        self.values = []

        if results is not None:
            for result in results:
                self.add(result)

    def add(self, value: Union[float, "PerformanceResult"]) -> None:
        if isinstance(value, PerformanceResult):
            self.values.extend(value.values)
        else:
            self.values.append(value)

    @property
    def mean(self) -> float:
        return np.mean(self.values) if self.values else 0

    @property
    def std(self) -> float:
        return np.std(self.values) if self.values else 0

    def __str__(self) -> str:
        if not self.values:
            return "-"

        if len(self.values) == 1:
            return f"{self.mean:.2f}"

        return f"{self.mean:.2f}±{self.std:.2f}"

    def __truediv__(self, scale: float) -> "PerformanceResult":
        result = PerformanceResult()

        for t in self.values:
            result.add(t / scale)

        return result
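A short sketch of the aggregation behaviour the report code relies on (the numbers are illustrative):

from scripts.benchmark_utils.performance_result import PerformanceResult

first, second = PerformanceResult(), PerformanceResult()
first.add(2.0)
first.add(4.0)
second.add(10.0)

combined = PerformanceResult([first, second])  # pools all stored values: [2.0, 4.0, 10.0]
print(combined)                                # 5.33±3.40
per_page = combined / 4                        # scales every value, as in the "average (per page)" row
print(per_page)                                # 1.33±0.85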