diff --git a/.flake8 b/.flake8 index 96a1bcff..d7afb7d1 100644 --- a/.flake8 +++ b/.flake8 @@ -24,3 +24,4 @@ ignore = ANN101 per-file-ignores = scripts/*:T201 + scripts/benchmark_pdf_performance*:JS101,T201 diff --git a/dedoc/utils/pdf_utils.py b/dedoc/utils/pdf_utils.py index 68bfa9a6..ba574dfd 100644 --- a/dedoc/utils/pdf_utils.py +++ b/dedoc/utils/pdf_utils.py @@ -1,15 +1,14 @@ from typing import Optional from PIL.Image import Image -from PyPDF2 import PdfFileReader from pdf2image import convert_from_path +from pypdf import PdfReader def get_pdf_page_count(path: str) -> Optional[int]: try: - with open(path, "rb") as fl: - reader = PdfFileReader(fl) - return reader.getNumPages() + reader = PdfReader(path) + return len(reader.pages) except Exception: return None diff --git a/requirements.txt b/requirements.txt index 3b967a2c..30469034 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,6 +16,7 @@ pdf2image==1.10.0 #1.14.0 - there are converting artifacts '№' != '№\n\x0c' pdfminer.six==20211012 piexif==1.1.3 pylzma==0.5.0 +pypdf==4.1.0 PyPDF2==1.27.0 pytesseract==0.3.10 python-docx==0.8.11 diff --git a/resources/benchmarks/benchmark_pdf_performance.html b/resources/benchmarks/benchmark_pdf_performance.html new file mode 100644 index 00000000..e2010e6d --- /dev/null +++ b/resources/benchmarks/benchmark_pdf_performance.html @@ -0,0 +1,236 @@ + + + PDF performance benchmark + + + + +

Running parameters:

{}
+ +
+1 Common (1-19 pages) (37 files) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
FilenamePagespdf_with_text_layer
tabbytrueautoauto_tabbyaverage
big_table_with_merged_cells.pdf10.69±0.03 (0.69±0.03 / page)0.54±0.01 (0.54±0.01 / page)2.70±0.05 (2.70±0.05 / page)2.63±0.02 (2.63±0.02 / page)1.64±1.03 (1.64±1.03 / page)
VVP_global_table.pdf10.78±0.06 (0.78±0.06 / page)1.00±0.03 (1.00±0.03 / page)3.23±0.06 (3.23±0.06 / page)3.20±0.05 (3.20±0.05 / page)2.05±1.17 (2.05±1.17 / page)
пример.pdf10.79±0.01 (0.79±0.01 / page)0.92±0.00 (0.92±0.00 / page)1.75±0.02 (1.75±0.02 / page)1.59±0.02 (1.59±0.02 / page)1.26±0.41 (1.26±0.41 / page)
not_table.pdf11.33±0.02 (1.33±0.02 / page)1.63±0.01 (1.63±0.01 / page)5.60±0.03 (5.60±0.03 / page)5.62±0.06 (5.62±0.06 / page)3.55±2.07 (3.55±2.07 / page)
english_doc.pdf10.61±0.02 (0.61±0.02 / page)0.57±0.01 (0.57±0.01 / page)1.18±0.03 (1.18±0.03 / page)1.21±0.03 (1.21±0.03 / page)0.89±0.31 (0.89±0.31 / page)
liters_state.pdf11.03±0.04 (1.03±0.04 / page)0.63±0.01 (0.63±0.01 / page)1.65±0.06 (1.65±0.06 / page)2.03±0.04 (2.03±0.04 / page)1.33±0.54 (1.33±0.54 / page)
doc_with_long_list.pdf10.59±0.01 (0.59±0.01 / page)0.27±0.01 (0.27±0.01 / page)0.85±0.02 (0.85±0.02 / page)1.18±0.03 (1.18±0.03 / page)0.72±0.34 (0.72±0.34 / page)
example.pdf10.88±0.06 (0.88±0.06 / page)1.06±0.03 (1.06±0.03 / page)1.92±0.04 (1.92±0.04 / page)1.74±0.03 (1.74±0.03 / page)1.40±0.44 (1.40±0.44 / page)
2-column-state.pdf11.01±0.04 (1.01±0.04 / page)0.59±0.01 (0.59±0.01 / page)1.57±0.03 (1.57±0.03 / page)1.99±0.03 (1.99±0.03 / page)1.29±0.53 (1.29±0.53 / page)
14_dev_direct.pdf10.67±0.02 (0.67±0.02 / page)0.37±0.01 (0.37±0.01 / page)1.04±0.02 (1.04±0.02 / page)1.36±0.03 (1.36±0.03 / page)0.86±0.37 (0.86±0.37 / page)
example_table_with_90_orient_cells.pdf11.05±0.05 (1.05±0.05 / page)2.35±0.02 (2.35±0.02 / page)4.61±0.04 (4.61±0.04 / page)4.67±0.04 (4.67±0.04 / page)3.17±1.54 (3.17±1.54 / page)
two_column_document.pdf20.98±0.06 (0.49±0.03 / page)1.07±0.02 (0.54±0.01 / page)2.00±0.04 (1.00±0.02 / page)1.98±0.07 (0.99±0.03 / page)1.51±0.48 (0.75±0.24 / page)
example_mp_table_wo_repeate_header.pdf20.92±0.07 (0.46±0.04 / page)1.52±0.02 (0.76±0.01 / page)2.51±0.06 (1.25±0.03 / page)1.87±0.07 (0.94±0.04 / page)1.70±0.58 (0.85±0.29 / page)
mixed_pdf.pdf20.82±0.02 (0.41±0.01 / page)0.95±0.01 (0.48±0.01 / page)4.05±0.03 (2.03±0.01 / page)3.94±0.05 (1.97±0.02 / page)2.44±1.56 (1.22±0.78 / page)
example_mp_table_with_repeate_header_2.pdf20.95±0.05 (0.48±0.03 / page)1.61±0.03 (0.81±0.01 / page)2.54±0.08 (1.27±0.04 / page)1.95±0.08 (0.97±0.04 / page)1.76±0.58 (0.88±0.29 / page)
0004057v1.pdf21.00±0.03 (0.50±0.02 / page)0.87±0.01 (0.44±0.00 / page)1.92±0.03 (0.96±0.02 / page)2.03±0.07 (1.01±0.03 / page)1.45±0.52 (0.73±0.26 / page)
Document635.pdf21.67±0.04 (0.83±0.02 / page)3.13±0.08 (1.57±0.04 / page)4.82±0.07 (2.41±0.04 / page)3.37±0.08 (1.68±0.04 / page)3.25±1.12 (1.62±0.56 / page)
example_table_with_270_orient_cells.pdf21.14±0.02 (0.57±0.01 / page)2.67±0.07 (1.33±0.03 / page)6.54±0.07 (3.27±0.03 / page)6.50±0.03 (3.25±0.02 / page)4.21±2.37 (2.11±1.19 / page)
VVP_6_tables.pdf31.20±0.01 (0.40±0.00 / page)2.75±0.04 (0.92±0.01 / page)4.01±0.08 (1.34±0.03 / page)2.42±0.04 (0.81±0.01 / page)2.60±1.00 (0.87±0.33 / page)
example_with_table9.pdf31.26±0.04 (0.42±0.01 / page)5.32±0.05 (1.77±0.02 / page)12.47±0.06 (4.16±0.02 / page)12.38±0.01 (4.13±0.00 / page)7.86±4.79 (2.62±1.60 / page)
multipage_table.pdf31.10±0.06 (0.37±0.02 / page)3.56±0.03 (1.19±0.01 / page)9.10±0.10 (3.03±0.03 / page)9.05±0.07 (3.02±0.02 / page)5.70±3.48 (1.90±1.16 / page)
liao2020_merged_organized.pdf41.79±0.05 (0.45±0.01 / page)2.25±0.04 (0.56±0.01 / page)3.96±0.05 (0.99±0.01 / page)3.61±0.05 (0.90±0.01 / page)2.90±0.91 (0.73±0.23 / page)
with_header_footer_2.pdf51.46±0.01 (0.29±0.00 / page)1.98±0.01 (0.40±0.00 / page)3.44±0.04 (0.69±0.01 / page)2.87±0.03 (0.57±0.01 / page)2.44±0.77 (0.49±0.15 / page)
short_lines.pdf50.91±0.01 (0.18±0.00 / page)1.09±0.01 (0.22±0.00 / page)2.04±0.02 (0.41±0.00 / page)1.81±0.01 (0.36±0.00 / page)1.46±0.48 (0.29±0.10 / page)
prospectus.pdf69.00±0.14 (1.50±0.02 / page)12.49±0.13 (2.08±0.02 / page)21.28±0.06 (3.55±0.01 / page)17.96±0.09 (2.99±0.02 / page)15.18±4.76 (2.53±0.79 / page)
dogovor-oferty.pdf71.62±0.03 (0.23±0.00 / page)3.34±0.02 (0.48±0.00 / page)4.94±0.09 (0.71±0.01 / page)3.25±0.11 (0.46±0.02 / page)3.29±1.18 (0.47±0.17 / page)
Алан Тьюринг.pdf82.18±0.04 (0.27±0.01 / page)3.89±0.07 (0.49±0.01 / page)5.99±0.03 (0.75±0.00 / page)4.38±0.06 (0.55±0.01 / page)4.11±1.36 (0.51±0.17 / page)
multipage.pdf90.95±0.02 (0.11±0.00 / page)1.73±0.01 (0.19±0.00 / page)2.61±0.04 (0.29±0.00 / page)1.86±0.03 (0.21±0.00 / page)1.79±0.59 (0.20±0.07 / page)
with_changed_header_footer.pdf103.00±0.13 (0.30±0.01 / page)8.88±0.09 (0.89±0.01 / page)11.12±0.17 (1.11±0.02 / page)5.09±0.14 (0.51±0.01 / page)7.02±3.17 (0.70±0.32 / page)
2212.14834.pdf126.57±0.11 (0.55±0.01 / page)17.99±0.19 (1.50±0.02 / page)22.99±0.20 (1.92±0.02 / page)11.70±0.12 (0.98±0.01 / page)14.82±6.22 (1.23±0.52 / page)
s00371-018-1491-0.pdf135.53±0.07 (0.43±0.01 / page)12.59±0.13 (0.97±0.01 / page)16.19±0.14 (1.25±0.01 / page)9.11±0.09 (0.70±0.01 / page)10.86±3.97 (0.84±0.31 / page)
Successful_SAT_Encoding_Techniques.pdf132.74±0.05 (0.21±0.00 / page)7.72±0.14 (0.59±0.01 / page)9.68±0.13 (0.74±0.01 / page)4.87±0.08 (0.37±0.01 / page)6.25±2.66 (0.48±0.20 / page)
WAIT23_paper_1.pdf143.74±0.14 (0.27±0.01 / page)9.96±0.12 (0.71±0.01 / page)12.64±0.12 (0.90±0.01 / page)6.30±0.06 (0.45±0.00 / page)8.16±3.40 (0.58±0.24 / page)
S0965542513120129.pdf153.36±0.04 (0.22±0.00 / page)17.96±0.26 (1.20±0.02 / page)20.22±0.22 (1.35±0.01 / page)5.81±0.09 (0.39±0.01 / page)11.84±7.35 (0.79±0.49 / page)
INFORSID_2017_paper_34.pdf163.42±0.11 (0.21±0.01 / page)9.83±0.07 (0.61±0.00 / page)11.98±0.05 (0.75±0.00 / page)5.47±0.01 (0.34±0.00 / page)7.68±3.40 (0.48±0.21 / page)
1901.10861.pdf194.24±0.12 (0.22±0.01 / page)15.54±0.18 (0.82±0.01 / page)17.95±0.20 (0.94±0.01 / page)6.74±0.12 (0.35±0.01 / page)11.11±5.76 (0.58±0.30 / page)
applsci-12-04943.pdf196.10±0.13 (0.32±0.01 / page)29.97±0.20 (1.58±0.01 / page)33.14±0.35 (1.74±0.02 / page)9.48±0.07 (0.50±0.00 / page)19.67±12.00 (1.04±0.63 / page)
average (per page)0.54±0.330.89±0.521.75±1.221.53±1.311.18±1.06
+
+
+2 Common (20-99 pages) (19 files) + + + + + + + + + + + + + + + + + + + + + + + +
FilenamePagespdf_with_text_layer
tabbytrueautoauto_tabbyaverage
IVMEM2022_paper_2.pdf205.17±0.13 (0.26±0.01 / page)30.96±0.27 (1.55±0.01 / page)34.55±0.25 (1.73±0.01 / page)8.59±0.12 (0.43±0.01 / page)19.82±13.06 (0.99±0.65 / page)
4d9f_7b15_A-Worldwide-Survey-of-Encryption-Products.pdf224.72±0.08 (0.21±0.00 / page)14.66±0.26 (0.67±0.01 / page)16.53±0.23 (0.75±0.01 / page)6.56±0.08 (0.30±0.00 / page)10.62±5.07 (0.48±0.23 / page)
4-МГУ-Тулин-Д-И.pdf243.91±0.04 (0.16±0.00 / page)14.74±0.15 (0.61±0.01 / page)16.76±0.17 (0.70±0.01 / page)6.02±0.03 (0.25±0.00 / page)10.36±5.49 (0.43±0.23 / page)
4-МГУ-Попов-М-С.pdf243.73±0.10 (0.16±0.00 / page)16.46±0.16 (0.69±0.01 / page)18.62±0.13 (0.78±0.01 / page)5.85±0.05 (0.24±0.00 / page)11.17±6.47 (0.47±0.27 / page)
EtatArtRecSys15fev2021.pdf245.38±0.04 (0.22±0.00 / page)22.15±0.20 (0.92±0.01 / page)24.75±0.21 (1.03±0.01 / page)8.02±0.06 (0.33±0.00 / page)15.08±8.48 (0.63±0.35 / page)
US_DHS AS_2021.pdf254.42±0.08 (0.18±0.00 / page)21.16±0.21 (0.85±0.01 / page)23.72±0.15 (0.95±0.01 / page)7.19±0.08 (0.29±0.00 / page)14.12±8.43 (0.56±0.34 / page)
pbedrin_diploma.pdf285.15±0.08 (0.18±0.00 / page)17.15±0.16 (0.61±0.01 / page)19.00±0.21 (0.68±0.01 / page)7.00±0.04 (0.25±0.00 / page)12.08±6.07 (0.43±0.22 / page)
DK_FSP_2018.pdf285.40±0.07 (0.19±0.00 / page)27.79±0.28 (0.99±0.01 / page)30.32±0.12 (1.08±0.00 / page)7.91±0.06 (0.28±0.00 / page)17.86±11.27 (0.64±0.40 / page)
2111.15664.pdf2912.72±0.15 (0.44±0.01 / page)47.80±0.34 (1.65±0.01 / page)52.20±0.47 (1.80±0.02 / page)17.21±0.10 (0.59±0.00 / page)32.48±17.66 (1.12±0.61 / page)
4-физтех-Шишацкий-М-Н-230301.pdf306.03±0.10 (0.20±0.00 / page)42.41±0.26 (1.41±0.01 / page)45.24±0.13 (1.51±0.00 / page)8.81±0.16 (0.29±0.01 / page)25.62±18.26 (0.85±0.61 / page)
romanov_diplom.pdf335.57±0.06 (0.17±0.00 / page)24.20±0.24 (0.73±0.01 / page)26.96±0.17 (0.82±0.01 / page)8.30±0.09 (0.25±0.00 / page)16.26±9.42 (0.49±0.29 / page)
J93-2003.pdf5010.75±0.13 (0.21±0.00 / page)37.15±0.49 (0.74±0.01 / page)40.22±0.43 (0.80±0.01 / page)13.81±0.15 (0.28±0.00 / page)25.48±13.30 (0.51±0.27 / page)
diploma.pdf556.48±0.06 (0.12±0.00 / page)32.26±0.40 (0.59±0.01 / page)34.27±0.37 (0.62±0.01 / page)8.50±0.16 (0.15±0.00 / page)20.38±12.93 (0.37±0.24 / page)
s11263-020-01359-2.pdf5715.50±0.16 (0.27±0.00 / page)69.17±0.59 (1.21±0.01 / page)73.52±0.59 (1.29±0.01 / page)20.12±0.59 (0.35±0.01 / page)44.58±26.87 (0.78±0.47 / page)
FI_DR_2021.pdf6510.28±0.13 (0.16±0.00 / page)30.52±0.33 (0.47±0.01 / page)34.09±0.33 (0.52±0.01 / page)13.73±0.06 (0.21±0.00 / page)22.15±10.31 (0.34±0.16 / page)
FULLTEXT01.pdf6914.38±0.29 (0.21±0.00 / page)108.70±0.78 (1.58±0.01 / page)111.27±0.71 (1.61±0.01 / page)17.10±0.20 (0.25±0.00 / page)62.86±47.15 (0.91±0.68 / page)
FI_AS_2021.pdf7313.34±0.22 (0.18±0.00 / page)159.51±0.74 (2.19±0.01 / page)162.66±0.59 (2.23±0.01 / page)16.45±0.10 (0.23±0.00 / page)87.99±73.11 (1.21±1.00 / page)
АксеноваЕЛ_628.pdf747.81±0.15 (0.11±0.00 / page)53.75±0.35 (0.73±0.00 / page)55.60±0.43 (0.75±0.01 / page)9.68±0.06 (0.13±0.00 / page)31.71±22.99 (0.43±0.31 / page)
Научно-технический_отчет_(заключительный)_по_договору.pdf8317.85±3.65 (0.22±0.04 / page)414.40±3.30 (4.99±0.04 / page)416.25±3.49 (5.02±0.04 / page)19.78±3.37 (0.24±0.04 / page)217.07±198.29 (2.62±2.39 / page)
average (per page)0.20±0.071.22±1.001.30±0.990.28±0.100.75±0.87
+
+
+3 Common (100-500 pages) (6 files) + + + + + + + + + + +
FilenamePagespdf_with_text_layer
tabbytrueautoauto_tabbyaverage
FI_SDP_2012.pdf12111.36±0.16 (0.09±0.00 / page)91.62±0.51 (0.76±0.00 / page)93.79±0.33 (0.78±0.00 / page)13.73±0.23 (0.11±0.00 / page)52.62±40.10 (0.43±0.33 / page)
FI_SDP_2009.pdf14213.41±0.20 (0.09±0.00 / page)68.36±0.56 (0.48±0.00 / page)71.24±0.35 (0.50±0.00 / page)15.82±0.07 (0.11±0.00 / page)42.21±27.63 (0.30±0.19 / page)
1_Гарри_Поттер_и_философский_камень.pdf24415.80±0.36 (0.06±0.00 / page)257.90±1.86 (1.06±0.01 / page)269.97±1.02 (1.11±0.00 / page)28.16±0.10 (0.12±0.00 / page)142.96±121.13 (0.59±0.50 / page)
2.Гарри_Поттер_и_Тайная_комната.pdf33129.36±0.42 (0.09±0.00 / page)333.94±0.86 (1.01±0.00 / page)348.22±0.65 (1.05±0.00 / page)43.24±0.26 (0.13±0.00 / page)188.69±152.55 (0.57±0.46 / page)
3.Гарри_Поттер_и_узник_Азкабана.pdf40234.64±0.48 (0.09±0.00 / page)440.42±1.95 (1.10±0.00 / page)453.02±0.98 (1.13±0.00 / page)48.17±0.12 (0.12±0.00 / page)244.06±202.77 (0.61±0.50 / page)
dbguide.pdf52139.39±0.56 (0.08±0.00 / page)2188.78±7.38 (4.20±0.01 / page)2198.74±4.19 (4.22±0.01 / page)47.34±0.60 (0.09±0.00 / page)1118.56±1075.22 (2.15±2.06 / page)
average (per page)0.08±0.011.43±1.261.46±1.250.11±0.010.77±1.11
+
+
+4 Common (750+ pages) (4 files) + + + + + + + + +
FilenamePagespdf_with_text_layer
tabbytrueautoauto_tabbyaverage
7.Гарри_Поттер_и_Дары_Смерти.pdf76864.41±0.68 (0.08±0.00 / page)1227.33±3.57 (1.60±0.00 / page)1243.04±1.88 (1.62±0.00 / page)81.81±4.67 (0.11±0.01 / page)654.15±581.10 (0.85±0.76 / page)
Python-k-vershinam-masterstva_RuLit_Me_639739.pdf76961.92±1.44 (0.08±0.00 / page)1934.12±1.49 (2.52±0.00 / page)1948.38±3.14 (2.53±0.00 / page)69.99±0.84 (0.09±0.00 / page)1003.60±937.67 (1.31±1.22 / page)
5.Гарри_Поттер_и_Орден_Феникса.pdf96781.26±1.67 (0.08±0.00 / page)1815.83±1.87 (1.88±0.00 / page)1838.23±2.16 (1.90±0.00 / page)95.31±0.91 (0.10±0.00 / page)957.66±869.43 (0.99±0.90 / page)
NNDesign.pdf1012108.70±1.69 (0.11±0.00 / page)22116.06±15.78 (21.85±0.02 / page)22224.07±83.69 (21.96±0.08 / page)132.00±9.06 (0.13±0.01 / page)11145.21±11025.01 (11.01±10.89 / page)
average (per page)0.09±0.016.96±8.607.00±8.640.11±0.023.54±7.00
+
+
+5 FinTOC 2022 (en, 1-19 pages) (6 files) + + + + + + + + + + +
FilenamePagespdf_with_text_layer
tabbytrueautoauto_tabbyaverage
EdR_Private_Equity_Select_Access_Fund_S.A._SICAV-SIF-Amethis_II__Sub-Fund_2018_K_X_X_X.pdf32.07±0.25 (0.69±0.08 / page)3.41±0.17 (1.14±0.06 / page)5.35±0.26 (1.78±0.09 / page)3.94±0.11 (1.31±0.04 / page)3.69±1.19 (1.23±0.40 / page)
LU0992626050-FR0011269182-FR0011269190-FR0010312660-FR0010148981-LU0992625912-LU0992625839-LU0992626134_English_2012_Carmignac.pdf124.44±0.54 (0.37±0.05 / page)12.27±0.62 (1.02±0.05 / page)15.12±0.72 (1.26±0.06 / page)7.01±0.16 (0.58±0.01 / page)9.71±4.25 (0.81±0.35 / page)
CH0002775168_English_2016_BSIMultihelvetia.pdf166.06±0.41 (0.38±0.03 / page)17.97±0.87 (1.12±0.05 / page)21.24±0.54 (1.33±0.03 / page)9.58±0.28 (0.60±0.02 / page)13.72±6.16 (0.86±0.38 / page)
LU0035346187-LU0035345882_English_2012_UBS-Lux-BondFundGBPP.pdf1710.76±0.80 (0.63±0.05 / page)25.54±0.55 (1.50±0.03 / page)31.19±0.70 (1.83±0.04 / page)16.16±0.49 (0.95±0.03 / page)20.91±7.98 (1.23±0.47 / page)
LU0415166403-LU0033050237-LU0033049577_English_2012_UBS-Lux-BondFundEUR.pdf1711.03±0.61 (0.65±0.04 / page)25.70±0.92 (1.51±0.05 / page)31.17±1.06 (1.83±0.06 / page)16.36±0.17 (0.96±0.01 / page)21.06±7.89 (1.24±0.46 / page)
LU0214904665-LU0214905043_English_2012_UBS-Lux-BSEmergingEuropeEURP.pdf188.58±0.49 (0.48±0.03 / page)18.57±0.75 (1.03±0.04 / page)22.47±0.77 (1.25±0.04 / page)12.44±0.33 (0.69±0.02 / page)15.51±5.41 (0.86±0.30 / page)
average (per page)0.53±0.141.22±0.211.55±0.280.85±0.261.04±0.44
+
+
+6 FinTOC 2022 (en, 20-99 pages) (61 files) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
FilenamePagespdf_with_text_layer
tabbytrueautoauto_tabbyaverage
Credit_Suisse_Portfolio_Fund_(Lux)_2011_X_P_X_X.pdf236.69±0.10 (0.29±0.00 / page)17.85±0.08 (0.78±0.00 / page)20.77±0.08 (0.90±0.00 / page)9.91±0.06 (0.43±0.00 / page)13.81±5.72 (0.60±0.25 / page)
Bantleon_AnleihenFonds_LU0524467833_2012_X_P_R_A.pdf244.68±0.07 (0.19±0.00 / page)16.83±0.04 (0.70±0.00 / page)18.77±0.11 (0.78±0.00 / page)6.76±0.01 (0.28±0.00 / page)11.76±6.12 (0.49±0.26 / page)
LU0129337548-LU0129337381_English_2009_CS-Lux-EuropeanPropertyEquityFd-.pdf246.98±0.04 (0.29±0.00 / page)25.75±0.22 (1.07±0.01 / page)29.10±0.82 (1.21±0.03 / page)10.13±0.06 (0.42±0.00 / page)17.99±9.58 (0.75±0.40 / page)
LU0118405827_English_2013_SEBConceptBiotechnology.pdf294.65±0.03 (0.16±0.00 / page)13.79±0.03 (0.48±0.00 / page)15.55±0.56 (0.54±0.02 / page)6.42±0.04 (0.22±0.00 / page)10.10±4.66 (0.35±0.16 / page)
Columbus_EM_Debt_Strategy_FR0013204161_2018_K_P_R_X.pdf294.96±0.04 (0.17±0.00 / page)18.65±0.10 (0.64±0.00 / page)20.77±0.32 (0.72±0.01 / page)7.28±0.03 (0.25±0.00 / page)12.92±6.89 (0.45±0.24 / page)
LU0337413677-LU0634998545-LU0337414568-LU0764661145-LU0764660501-LU0337413834-LU0764661574-LU0764660840-LU0371477885-LU0634998206-LU0337414139-LU0337414642-LU0634998461-LU0337414485-LU0834220419_English_2012_Bantleon.pdf325.95±0.07 (0.19±0.00 / page)23.17±0.20 (0.72±0.01 / page)25.05±0.16 (0.78±0.01 / page)8.25±0.05 (0.26±0.00 / page)15.61±8.57 (0.49±0.27 / page)
LU0116762864-LU0041228874_English_2011_Deka-Renten-Euro3-7CF.pdf345.26±0.01 (0.15±0.00 / page)21.61±0.21 (0.64±0.01 / page)23.31±0.10 (0.69±0.00 / page)7.52±0.11 (0.22±0.00 / page)14.43±8.10 (0.42±0.24 / page)
LU0097655574-LU0097654924_English_2012_Deka-EuroStocks.pdf355.36±0.04 (0.15±0.00 / page)23.66±0.24 (0.68±0.01 / page)25.35±0.12 (0.72±0.00 / page)7.57±0.08 (0.22±0.00 / page)15.49±9.08 (0.44±0.26 / page)
Credit_Suisse_Fund_I_(Lux)_2012_X_P_X_X.pdf369.90±0.21 (0.28±0.01 / page)31.38±0.32 (0.87±0.01 / page)34.62±0.78 (0.96±0.02 / page)13.41±0.11 (0.37±0.00 / page)22.33±10.81 (0.62±0.30 / page)
LU0456547701-LU0120526693_English_2013_SEBHighYield.pdf375.59±0.04 (0.15±0.00 / page)19.79±0.16 (0.53±0.00 / page)21.37±0.13 (0.58±0.00 / page)7.55±0.01 (0.20±0.00 / page)13.58±7.06 (0.37±0.19 / page)
BI_SICAV_2011_X_P_X_X.pdf389.57±0.14 (0.25±0.00 / page)53.12±0.38 (1.40±0.01 / page)55.43±0.05 (1.46±0.00 / page)12.80±0.06 (0.34±0.00 / page)32.73±21.59 (0.86±0.57 / page)
LU1009600286-LU0098995292-LU1009600955-LU0404639410_English_2012_UBS-Lux-Eq-USAMultiStrat--.pdf4025.72±0.93 (0.64±0.02 / page)69.50±0.76 (1.74±0.02 / page)75.60±0.28 (1.89±0.01 / page)31.59±0.97 (0.79±0.02 / page)50.60±22.17 (1.27±0.55 / page)
OYSTER_2005_X_P_X_X.pdf406.19±0.13 (0.15±0.00 / page)29.57±0.22 (0.74±0.01 / page)31.03±0.13 (0.78±0.00 / page)8.41±0.07 (0.21±0.00 / page)18.80±11.54 (0.47±0.29 / page)
CANDRIAM_GF_LU1220230442_2015_K_P_R_A.pdf427.11±0.04 (0.17±0.00 / page)31.56±0.23 (0.75±0.01 / page)33.32±0.10 (0.79±0.00 / page)9.30±0.05 (0.22±0.00 / page)20.32±12.16 (0.48±0.29 / page)
LU0091821107_English_2013_CarnegieGlobalHealthcareFund.pdf426.39±0.01 (0.15±0.00 / page)22.07±0.19 (0.53±0.00 / page)23.78±0.25 (0.57±0.01 / page)8.36±0.06 (0.20±0.00 / page)15.15±7.83 (0.36±0.19 / page)
LU0611173930-LU0626901861-LU0611173427_English_2011_UBS-Lux-Eq-SICAVGl-HDUSDP.pdf4241.30±1.51 (0.98±0.04 / page)832.00±4.68 (19.81±0.11 / page)847.74±8.20 (20.18±0.20 / page)52.15±0.10 (1.24±0.00 / page)443.30±396.66 (10.55±9.44 / page)
GB00B94CTF25-GB00B39R2V77-GB00B39R2T55-GB00BK6MCK32-GB00B46J9127-GB00B39R2S49-GB00B39R2W84-GB00BK6MC.pdf439.66±0.12 (0.22±0.00 / page)67.85±0.31 (1.58±0.01 / page)71.24±0.59 (1.66±0.01 / page)12.84±0.06 (0.30±0.00 / page)40.40±29.20 (0.94±0.68 / page)
PRO_NFOF_eng_LU.pdf4310.14±0.10 (0.24±0.00 / page)45.38±0.20 (1.06±0.00 / page)48.32±0.15 (1.12±0.00 / page)13.28±0.14 (0.31±0.00 / page)29.28±17.64 (0.68±0.41 / page)
LU1150262910-LU1150255971-LU1150259296_English_2015_BNPParibasIslamicFundHilalIncome.pdf448.60±0.21 (0.20±0.00 / page)48.44±0.66 (1.10±0.01 / page)50.73±0.04 (1.15±0.00 / page)11.46±0.16 (0.26±0.00 / page)29.81±19.82 (0.68±0.45 / page)
LU0073418229-LU0280479329_English_2010_BaringRussiaFundAUSD.pdf467.36±0.23 (0.16±0.01 / page)22.81±0.40 (0.50±0.01 / page)24.59±0.05 (0.53±0.00 / page)9.51±0.18 (0.21±0.00 / page)16.07±7.70 (0.35±0.17 / page)
ALGER_SICAV_LU0070176184_2012_X_P_X_X.pdf467.56±0.15 (0.16±0.00 / page)28.77±0.08 (0.63±0.00 / page)30.85±0.05 (0.67±0.00 / page)9.94±0.04 (0.22±0.00 / page)19.28±10.59 (0.42±0.23 / page)
LU0562934934-LU0289591256_English_2015_EastCapital-Lux-ChinaEnviron-.pdf479.30±0.10 (0.20±0.00 / page)29.55±0.50 (0.63±0.01 / page)31.71±0.14 (0.67±0.00 / page)12.00±0.04 (0.26±0.00 / page)20.64±10.07 (0.44±0.21 / page)
LU0346062424_English_2015_QuestCleantechBAcc.pdf486.73±0.04 (0.14±0.00 / page)23.13±0.44 (0.48±0.01 / page)24.64±0.08 (0.51±0.00 / page)8.55±0.03 (0.18±0.00 / page)15.76±8.17 (0.33±0.17 / page)
LU0145217120-LU0323243989_English_2015_ShareGold.pdf507.32±0.05 (0.15±0.00 / page)24.83±0.46 (0.50±0.01 / page)26.25±0.11 (0.53±0.00 / page)9.11±0.03 (0.18±0.00 / page)16.88±8.70 (0.34±0.17 / page)
LU0230834854-LU0486541344-LU0254675159-LU0230242686-LU0230242504_English_2015_RobecoFlex-o-.pdf508.27±0.12 (0.17±0.00 / page)34.55±0.80 (0.69±0.02 / page)35.97±0.08 (0.72±0.00 / page)10.66±0.11 (0.21±0.00 / page)22.36±12.94 (0.45±0.26 / page)
LU0144591038_English_2015_TurkisfundBondsI.pdf537.34±0.01 (0.14±0.00 / page)25.25±0.69 (0.48±0.01 / page)26.66±0.04 (0.50±0.00 / page)9.62±0.08 (0.18±0.00 / page)17.22±8.80 (0.32±0.17 / page)
LU0313364811-LU0313363508_English_2016_MultiM-AccessII-.pdf548.65±0.02 (0.16±0.00 / page)52.08±0.38 (0.96±0.01 / page)54.52±0.20 (1.01±0.00 / page)11.37±0.02 (0.21±0.00 / page)31.65±21.69 (0.59±0.40 / page)
LU0273373414_English_2010_GS-PFondsSchwellenlanderG.pdf557.92±0.08 (0.14±0.00 / page)27.93±0.10 (0.51±0.00 / page)29.55±0.06 (0.54±0.00 / page)9.76±0.05 (0.18±0.00 / page)18.79±9.99 (0.34±0.18 / page)
LU0244071956-LU0301247077-LU0301246772_English_2009_LTIF-Classic.pdf557.65±0.24 (0.14±0.00 / page)25.13±0.11 (0.46±0.00 / page)26.88±0.13 (0.49±0.00 / page)9.62±0.10 (0.17±0.00 / page)17.32±8.73 (0.31±0.16 / page)
LU0415391431-LU0415391514-LU0433847679-LU0415392249-LU0437409112-LU0513479948-LU0513479864-LU0433846606-LU0453818972-LU0631859575-LU0415391605-LU0415392751-LU0494761835-LU0453818899-LU0631859229_English_2011_BB.pdf5611.43±0.19 (0.20±0.00 / page)76.86±0.07 (1.37±0.00 / page)80.39±0.23 (1.44±0.00 / page)14.92±0.08 (0.27±0.00 / page)45.90±32.77 (0.82±0.59 / page)
BSI-Multinvest_2015_X_P_X_A.pdf5710.28±0.24 (0.18±0.00 / page)64.32±0.07 (1.13±0.00 / page)67.12±0.08 (1.18±0.00 / page)13.16±0.07 (0.23±0.00 / page)38.72±27.04 (0.68±0.47 / page)
Universe_The_CMI_Global_Network_Fund_2007_X_P_X_X.pdf588.01±0.17 (0.14±0.00 / page)34.83±0.06 (0.60±0.00 / page)36.58±0.07 (0.63±0.00 / page)9.86±0.02 (0.17±0.00 / page)22.32±13.42 (0.38±0.23 / page)
LU0066480616-LU0208183011_English_2012_ValartisRussianMarketFund.pdf587.36±0.10 (0.13±0.00 / page)25.37±0.09 (0.44±0.00 / page)27.10±0.03 (0.47±0.00 / page)9.30±0.07 (0.16±0.00 / page)17.28±9.00 (0.30±0.16 / page)
Henderson_Euroland_Fund_2012_X_P_X_A.pdf6316.30±0.46 (0.26±0.01 / page)42.19±0.20 (0.67±0.00 / page)47.49±0.10 (0.75±0.00 / page)21.59±0.21 (0.34±0.00 / page)31.90±13.22 (0.51±0.21 / page)
DNB_FUND_LU0029375739_2014_X_P_X_X.pdf659.67±0.13 (0.15±0.00 / page)36.89±0.09 (0.57±0.00 / page)38.82±0.11 (0.60±0.00 / page)11.67±0.11 (0.18±0.00 / page)24.26±13.63 (0.37±0.21 / page)
Fidelity_Active_STrategy_LU1048814831_2016_X_P_X_A.pdf6511.67±0.30 (0.18±0.00 / page)72.04±0.16 (1.11±0.00 / page)74.20±0.11 (1.14±0.00 / page)14.06±0.02 (0.22±0.00 / page)42.99±30.15 (0.66±0.46 / page)
LU0447610410-LU0931136328_English_2015_HSBCPortfoliosWorldSelect-1.pdf669.96±0.09 (0.15±0.00 / page)53.60±0.09 (0.81±0.00 / page)55.72±0.08 (0.84±0.00 / page)12.37±0.10 (0.19±0.00 / page)32.91±21.78 (0.50±0.33 / page)
SEB_SICAV_2_LU0086813762_2013_X_P_X_X.pdf678.51±0.03 (0.13±0.00 / page)69.62±0.12 (1.04±0.00 / page)71.84±0.21 (1.07±0.00 / page)10.98±0.05 (0.16±0.00 / page)40.24±30.51 (0.60±0.46 / page)
MainFirst_LU0152754726_2011_X_P_X_X.pdf678.37±0.06 (0.12±0.00 / page)36.39±0.10 (0.54±0.00 / page)38.04±0.06 (0.57±0.00 / page)10.29±0.09 (0.15±0.00 / page)23.27±13.97 (0.35±0.21 / page)
JULIUS_BAER_MULTIBOND_LU0189697427_2011_X_P_X_X.pdf6910.97±0.12 (0.16±0.00 / page)56.18±0.07 (0.81±0.00 / page)58.29±0.03 (0.84±0.00 / page)13.39±0.06 (0.19±0.00 / page)34.71±22.56 (0.50±0.33 / page)
LU0012197231-LU0372412295-LU0012197314-LU0261938939-LU0261938004-LU0372412022-LU0100838696-LU0261938426-LU0372411990_English_2011_JBGlobalConvertBd-.pdf6911.03±0.17 (0.16±0.00 / page)56.17±0.13 (0.81±0.00 / page)58.29±0.07 (0.84±0.00 / page)13.29±0.19 (0.19±0.00 / page)34.70±22.56 (0.50±0.33 / page)
LU0030165871_English_2014_SEBNordicFund.pdf738.51±0.05 (0.12±0.00 / page)41.51±0.09 (0.57±0.00 / page)43.29±0.06 (0.59±0.00 / page)10.43±0.04 (0.14±0.00 / page)25.93±16.49 (0.36±0.23 / page)
LU0212018807_English_2013_SFPCEuropeanPropertySecurities.pdf7410.64±0.10 (0.14±0.00 / page)48.30±0.08 (0.65±0.00 / page)50.26±0.08 (0.68±0.00 / page)13.10±0.06 (0.18±0.00 / page)30.57±18.74 (0.41±0.25 / page)
Prospectus AXA IM Cash en_FCP.pdf759.59±0.12 (0.13±0.00 / page)36.29±0.13 (0.48±0.00 / page)38.12±0.07 (0.51±0.00 / page)11.63±0.02 (0.16±0.00 / page)23.90±13.33 (0.32±0.18 / page)
BE0946843266-BE0947250453-BE0133741752-BE0175280016-BE0175279976-BE0946844272-BE0175717504-BE0175479.pdf769.39±0.03 (0.12±0.00 / page)91.41±0.17 (1.20±0.00 / page)93.36±0.14 (1.23±0.00 / page)11.83±0.01 (0.16±0.00 / page)51.50±40.90 (0.68±0.54 / page)
LU0026740844-LU0026740760-LU0099389313_English_2011_JBEuro-FocusEUR.pdf7712.95±0.04 (0.17±0.00 / page)61.40±0.11 (0.80±0.00 / page)64.12±0.08 (0.83±0.00 / page)15.64±0.13 (0.20±0.00 / page)38.53±24.27 (0.50±0.32 / page)
LU0529497777-LU0529498072-LU0529497934-LU0529497850-LU0529497694-LU0529498239-LU0529498155-LU0529498742-LU0529497421-LU0529498825-LU0529498668_English_2011_JBAbso-ReturnEuro-Eq-.pdf7813.60±0.14 (0.17±0.00 / page)62.68±0.15 (0.80±0.00 / page)65.44±0.08 (0.84±0.00 / page)16.33±0.04 (0.21±0.00 / page)39.51±24.59 (0.51±0.32 / page)
BDLCM-Funds-Prospectus-VISA-28042020.pdf7912.40±0.11 (0.16±0.00 / page)118.46±0.33 (1.50±0.00 / page)121.09±0.07 (1.53±0.00 / page)15.45±0.06 (0.20±0.00 / page)66.85±52.95 (0.85±0.67 / page)
1.9.900555.pdf8214.74±0.17 (0.18±0.00 / page)57.43±0.48 (0.70±0.01 / page)60.15±0.08 (0.73±0.00 / page)17.85±0.11 (0.22±0.00 / page)37.54±21.30 (0.46±0.26 / page)
LU0302081707-LU0267827326-LU0363285338-LU0494360927-LU0419264733-LU0232040708-LU0363285411-LU0363286658-LU0363286732-LU0363285767-LU0302080998-LU0232043801-LU0232043124-LU0267829611-LU0494361065_English_2013_WIOF.pdf8410.84±0.03 (0.13±0.00 / page)117.39±0.63 (1.40±0.01 / page)118.98±0.18 (1.42±0.00 / page)13.49±0.08 (0.16±0.00 / page)65.17±53.02 (0.78±0.63 / page)
MARCH_INTERNATIONAL_2015_X_P_X_X.pdf8511.28±0.02 (0.13±0.00 / page)49.33±0.66 (0.58±0.01 / page)51.06±0.06 (0.60±0.00 / page)13.58±0.03 (0.16±0.00 / page)31.31±18.91 (0.37±0.22 / page)
GB00BR4R5445-GB00BR4R5551-GB0033772624_English_2016_DimensionalEm.pdf8511.83±0.18 (0.14±0.00 / page)123.81±1.81 (1.46±0.02 / page)125.51±0.36 (1.48±0.00 / page)14.69±0.07 (0.17±0.00 / page)68.96±55.72 (0.81±0.66 / page)
LU0028051117_English_2014_CMIUKEquityFundDC2.pdf8710.95±0.14 (0.13±0.00 / page)45.25±0.36 (0.52±0.00 / page)46.93±0.11 (0.54±0.00 / page)12.74±0.06 (0.15±0.00 / page)28.97±17.14 (0.33±0.20 / page)
LU0146081418-LU0028047438-LU0129306311_English_2014_CMIUSEnhancedEquityDC.pdf8710.90±0.19 (0.13±0.00 / page)45.67±0.95 (0.52±0.01 / page)46.99±0.23 (0.54±0.00 / page)12.81±0.10 (0.15±0.00 / page)29.09±17.26 (0.33±0.20 / page)
Arabesque_SICAV_LU1023698662_2016_X_P_X_A.pdf8711.61±0.26 (0.13±0.00 / page)96.20±0.57 (1.11±0.01 / page)98.38±0.17 (1.13±0.00 / page)14.17±0.07 (0.16±0.00 / page)55.09±42.22 (0.63±0.49 / page)
LU1151059737-LU1151057954_English_2015_RICHELIEUB--Richelieu2020.pdf8913.42±0.24 (0.15±0.00 / page)124.81±0.28 (1.40±0.00 / page)127.68±0.09 (1.43±0.00 / page)16.38±0.04 (0.18±0.00 / page)70.57±55.69 (0.79±0.63 / page)
LU0477234263-LU0466397824_English_2015_R.pdf8913.40±0.21 (0.15±0.00 / page)124.77±0.32 (1.40±0.00 / page)127.57±0.28 (1.43±0.00 / page)16.54±0.05 (0.19±0.00 / page)70.57±55.62 (0.79±0.62 / page)
LU1139920265-LU1125674611-LU1125674538-LU1125674967-LU1125674454-LU1125674702_English_2015_EastCapital-Lux-FrontierM.pdf9214.09±0.13 (0.15±0.00 / page)118.30±0.58 (1.29±0.01 / page)120.39±0.18 (1.31±0.00 / page)17.11±0.11 (0.19±0.00 / page)67.48±51.89 (0.73±0.56 / page)
MAGALLANES_VALUE_INVESTORS_UCITS_LU1330191542_2016_X_P_X_X.pdf9310.52±0.16 (0.11±0.00 / page)51.99±0.24 (0.56±0.00 / page)53.63±0.11 (0.58±0.00 / page)12.49±0.11 (0.13±0.00 / page)32.16±20.67 (0.35±0.22 / page)
DEMOGRAPHIC_CHANGE_2018_X_P_X_X.pdf9511.88±0.22 (0.13±0.00 / page)48.95±0.21 (0.52±0.00 / page)50.97±0.16 (0.54±0.00 / page)13.97±0.08 (0.15±0.00 / page)31.44±18.55 (0.33±0.20 / page)
UNK_English_UNK_LFIS-Vision-UCITS.pdf9711.34±0.05 (0.12±0.00 / page)57.55±0.35 (0.59±0.00 / page)59.22±0.10 (0.61±0.00 / page)13.37±0.04 (0.14±0.00 / page)35.37±23.03 (0.36±0.24 / page)
average (per page)0.19±0.131.14±2.431.19±2.480.24±0.160.69±1.80
+
+
+7 FinTOC 2022 (en, 100-500 pages) (21 files) + + + + + + + + + + + + + + + + + + + + + + + + + +
FilenamePagespdf_with_text_layer
tabbytrueautoauto_tabbyaverage
Dexia_Equities_L_2011_X_P_X_X.pdf10010.98±0.09 (0.11±0.00 / page)69.44±0.06 (0.69±0.00 / page)71.54±0.15 (0.72±0.00 / page)13.23±0.09 (0.13±0.00 / page)41.30±29.21 (0.41±0.29 / page)
Lombard_Odier_Funds_2014_X_P_X_X.pdf10415.49±0.04 (0.15±0.00 / page)101.54±0.07 (0.98±0.00 / page)104.63±0.25 (1.01±0.00 / page)18.61±0.14 (0.18±0.00 / page)60.07±43.04 (0.58±0.41 / page)
LU0881817786-LU0881818081-LU0881817430-LU0881817190_English_2014_OddoBondsHighYieldEurope.pdf10713.42±0.04 (0.13±0.00 / page)254.76±0.21 (2.38±0.00 / page)258.69±0.44 (2.42±0.00 / page)16.88±0.19 (0.16±0.00 / page)135.94±120.80 (1.27±1.13 / page)
LU0800341645-LU0800341132-LU0800341991-LU0800341215-LU0800341058-LU0800341488-LU0800341306_English_2012_FranklinBrazilOpportunities.pdf11017.99±0.08 (0.16±0.00 / page)97.31±0.10 (0.88±0.00 / page)101.53±0.17 (0.92±0.00 / page)21.57±0.16 (0.20±0.00 / page)59.60±39.87 (0.54±0.36 / page)
Prospectus-2016-02-01.pdf11013.12±0.13 (0.12±0.00 / page)78.04±0.17 (0.71±0.00 / page)80.75±0.08 (0.73±0.00 / page)15.37±0.07 (0.14±0.00 / page)46.82±32.60 (0.43±0.30 / page)
LU1252823262-LU0482498846-LU0482498762-LU0955861710-LU0955867758-LU0955867915-LU0432616810-LU0607521506-LU0955867832-LU0482498176-LU0955861983-LU0955861801-LU0432616901_English_2013_InvescoBalanced-RiskAlloc-.pdf11318.53±0.06 (0.16±0.00 / page)88.26±0.28 (0.78±0.00 / page)92.13±0.34 (0.82±0.00 / page)21.98±0.15 (0.19±0.00 / page)55.23±35.02 (0.49±0.31 / page)
LU0949250459-LU0645132902-LU0229041164-LU0390138864-LU0188151251-LU0543370943-LU0195951883-LU0152904719-LU0188151095-LU0543370513-LU0889566138-LU0229948244-LU0229948087-LU0122613572-LU0229949648_English_2012_Franklin.pdf11622.53±0.59 (0.19±0.01 / page)306.84±0.41 (2.65±0.00 / page)314.77±1.15 (2.71±0.01 / page)28.40±0.08 (0.24±0.00 / page)168.13±142.71 (1.45±1.23 / page)
LU1057354992_English_2016_EchiquierEuropeanBondsAEUR.pdf11814.77±0.09 (0.13±0.00 / page)96.58±0.11 (0.82±0.00 / page)99.78±0.10 (0.85±0.00 / page)17.49±0.24 (0.15±0.00 / page)57.15±41.05 (0.48±0.35 / page)
LU0641972152-LU0641972079_English_2015_DBPWMIGlobalAllocationTracker-.pdf12114.30±0.03 (0.12±0.00 / page)485.43±0.30 (4.01±0.00 / page)488.98±0.44 (4.04±0.00 / page)17.63±0.10 (0.15±0.00 / page)251.58±235.63 (2.08±1.95 / page)
FU_BF097_EN_2019-05-13_a5585d06-b4df-4b1e-bf30-3c1b5615cf74.pdf12215.64±0.04 (0.13±0.00 / page)125.95±0.16 (1.03±0.00 / page)129.33±0.20 (1.06±0.00 / page)18.40±0.02 (0.15±0.00 / page)72.33±55.33 (0.59±0.45 / page)
LU0705072691-LU0705072345-LU0705072188_English_2015_RAM-LUX-Long-Sh-EmergingMarktesEq-.pdf12519.43±0.11 (0.16±0.00 / page)321.80±0.30 (2.57±0.00 / page)328.51±0.47 (2.63±0.00 / page)24.03±0.35 (0.19±0.00 / page)173.44±151.74 (1.39±1.21 / page)
LU0309082104-LU0309082799-LU0309082369_English_2015_DNCAInvest-Infrastructures.pdf13414.62±0.06 (0.11±0.00 / page)219.02±0.44 (1.63±0.00 / page)223.57±0.30 (1.67±0.00 / page)17.44±0.10 (0.13±0.00 / page)118.66±102.65 (0.89±0.77 / page)
LU0462973008-LU0512124362_English_2015_DNCAInvestMiura.pdf13414.52±0.03 (0.11±0.00 / page)218.86±0.45 (1.63±0.00 / page)223.51±0.23 (1.67±0.00 / page)17.49±0.09 (0.13±0.00 / page)118.60±102.61 (0.89±0.77 / page)
LU0375979613-LU0375979290_English_2015_GISDyn-ControlPFCo.pdf13618.45±0.02 (0.14±0.00 / page)222.75±0.19 (1.64±0.00 / page)227.47±0.23 (1.67±0.00 / page)22.13±0.09 (0.16±0.00 / page)122.70±102.43 (0.90±0.75 / page)
LU0289452210_English_2015_DBPWMIIGISUSEquityPortfolioB.pdf13618.27±0.35 (0.13±0.00 / page)83.28±0.08 (0.61±0.00 / page)87.35±0.07 (0.64±0.00 / page)21.63±0.12 (0.16±0.00 / page)52.63±32.73 (0.39±0.24 / page)
LU0424369923-LU0424369766-LU0114314536-LU0063949068-LU0686792812-LU0061927850-LU0686794354_English_2013_ManConvertibles.pdf14721.73±0.03 (0.15±0.00 / page)496.84±0.96 (3.38±0.01 / page)504.14±0.97 (3.43±0.01 / page)26.09±0.17 (0.18±0.00 / page)262.20±238.31 (1.78±1.62 / page)
LU0575375588-LU0493852429-LU0261074230-LU0640453774-LU0860716223-LU0575374698-LU0493865678-LU0860715415-LU0493851454-LU0160485420-LU0493867534-LU0860716140-LU0953070868-LU0956110364-LU0688432862_English_2016_AshmoreS.pdf18121.72±0.16 (0.12±0.00 / page)137.00±0.14 (0.76±0.00 / page)140.50±0.07 (0.78±0.00 / page)24.88±0.06 (0.14±0.00 / page)81.02±57.75 (0.45±0.32 / page)
HSBC_Global_Investment_Funds_2017_X_P_X_A.pdf25032.34±0.19 (0.13±0.00 / page)246.00±0.49 (0.98±0.00 / page)250.83±0.46 (1.00±0.00 / page)36.65±0.21 (0.15±0.00 / page)141.46±106.98 (0.57±0.43 / page)
LU0178440839-LU0178439401-LU0178439310-LU0178439666_English_2012_AllianzBestSty-Eu-Eq-.pdf28534.42±0.19 (0.12±0.00 / page)256.92±0.54 (0.90±0.00 / page)261.52±0.22 (0.92±0.00 / page)36.69±0.19 (0.13±0.00 / page)147.39±111.85 (0.52±0.39 / page)
LU0589944569-LU0348788117-LU1156968403-LU1254141333-LU0348791418_English_2011_AllianzEm-AsiaEq-.pdf28533.59±0.59 (0.12±0.00 / page)256.60±0.34 (0.90±0.00 / page)261.08±0.47 (0.92±0.00 / page)37.69±0.27 (0.13±0.00 / page)147.24±111.62 (0.52±0.39 / page)
LU0734574329-LU0734574162-LU0333227550-LU0333226230-LU0571576585-LU1039626509-LU0333227394-LU0333226826-LU0734574246-LU0333227048_English_2015_ML.pdf40549.15±0.53 (0.12±0.00 / page)2917.33±9.87 (7.20±0.02 / page)2930.59±2.55 (7.24±0.01 / page)56.62±0.52 (0.14±0.00 / page)1488.42±1435.56 (3.68±3.54 / page)
average (per page)0.13±0.021.77±1.541.80±1.540.16±0.030.97±1.36
+
+ + diff --git a/scripts/benchmark_pdf_performance.py b/scripts/benchmark_pdf_performance.py new file mode 100644 index 00000000..c3fa48af --- /dev/null +++ b/scripts/benchmark_pdf_performance.py @@ -0,0 +1,155 @@ +import argparse +import json +import os.path +import zipfile +from typing import List + +import wget + +from scripts.benchmark_utils.pdf_performance_task import PDFPerformanceTask + + +def download_data(data_path: str) -> None: + data_archive_path = f"{data_path}.zip" + + wget.download("https://at.ispras.ru/owncloud/index.php/s/lp4wEVyZTd9lA0u/download", data_archive_path) + with zipfile.ZipFile(data_archive_path, "r") as archive: + archive.extractall(data_path) + + os.remove(data_archive_path) + + +def get_tasks(configs: List[dict], input_path: str, dedoc_host: str, pdf_options: List[str]) -> List[List[PDFPerformanceTask]]: + if input_path == "": + input_path = "pdf_performance_benchmark_data" + download_data(input_path) + + tasks = [] + + for config in configs: + config_tasks = [] + + for task_name in sorted(os.listdir(input_path)): + files_path = os.path.join(input_path, task_name) + if os.path.isdir(files_path) and not task_name.startswith("_"): + config_tasks.append(PDFPerformanceTask(dedoc_host, task_name, files_path, pdf_options, config)) + + tasks.append(config_tasks) + + return tasks + + +def make_report(tasks: List[List[PDFPerformanceTask]], output_path: str, configs: List[dict]) -> None: + with open(output_path, "w", encoding="utf-8") as f: + f.write(""" + + PDF performance benchmark + + + + + """) + + for config, config_tasks in zip(configs, tasks): + f.write("

Running parameters:

") + f.write(f"
{json.dumps(config, ensure_ascii=False, indent=2)}
\n\n") + + for task in config_tasks: + f.write(task.to_html()) + + f.write("\n") + f.write("\n") + + +def main() -> None: + default_output_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "resources", "benchmarks", "benchmark_pdf_performance.html")) + pdf_options = ["true", "false", "auto", "auto_tabby", "tabby"] + + parser = argparse.ArgumentParser(description="Script for evaluate different PDF readers performance.", formatter_class=argparse.RawTextHelpFormatter) + parser.add_argument("-i", "--input", help="path to the directory with pdfs (default: %(default)s)", type=str, default="") + parser.add_argument("-o", "--output", help="path to the report filename (default: %(default)s)", type=str, default=default_output_path) + parser.add_argument("-n", "--loops", help="number of repetitions of testing one file (default: %(default)d)", type=int, default=1) + parser.add_argument("--dedoc-host", help="url to DEDOC instance for sending files (default: %(default)s", type=str, default="http://localhost:1231") + parser.add_argument("--pdf-options", help="values of pdf_with_text_layer argument", choices=pdf_options, nargs="+", default=pdf_options) + parser.add_argument("--parameters", help="path to json file with alternative parameters dictionaries") + args = parser.parse_args() + + if args.input != "": + assert os.path.exists(args.input), f'Directory "{args.input}" does not exists' + assert os.path.isdir(args.input), f'Path "{args.input}" is not a directory' + + assert args.loops > 0, "The number of repetitions of testing one file must be positive" + + print(f'Run pdf performance benchmark with next pdf options: {", ".join(args.pdf_options)}') + configs = [{}] + + if args.parameters: + with open(args.parameters, "r", encoding="utf-8") as f: + configs = json.load(f) + + tasks = get_tasks(configs, args.input, args.dedoc_host, args.pdf_options) + + for _ in range(args.loops): + for config_tasks in tasks: + for task in config_tasks: + task.run() + 
make_report(tasks, args.output, configs) + + +""" +How to run on default benchmark data? +Simply run the following command: + python3 benchmark_pdf_performance.py + +Running on custom data: +1. Prepare a folder with tasks. Each task is a directory with pdf files. Directories starting with an underscore (_) will be ignored. +Example of a folder "pdf_data" with 3 tasks: + pdf_data + +--+--+ task1 + | +--- file1.pdf + | +--- file2.pdf + | + +--+ Some second task name + | +--- f.pdf + | + +--+ And last task name + | +--- file_.pdf + | +--- file2.pdf + | +--- not_pdf_file.docx + | + +--+ _ignored folder + +--- some_image.png + +--- some_pdf.pdf + +2. Run the script with the following command: + python3 benchmark_pdf_performance.py --pdf-options tabby true auto auto_tabby -i pdf_data + +2*. To evaluate with different parameters, you can prepare a json file with a list of dictionaries and pass it with the --parameters option: + parameters.json: + [ + { "need_pdf_table_analysis": "false" }, + { "need_pdf_table_analysis": "true", "return_format": "plain_text" } + ] + +Run with the following command: + python3 benchmark_pdf_performance.py --pdf-options tabby true auto auto_tabby -i pdf_data --parameters parameters.json + +3. Look at your results in the report file (by default resources/benchmarks/benchmark_pdf_performance.html, or the path passed with -o/--output) +""" +if __name__ == "__main__": + main() diff --git a/scripts/benchmark_utils/pdf_performance_task.py b/scripts/benchmark_utils/pdf_performance_task.py new file mode 100644 index 00000000..b82f9f7f --- /dev/null +++ b/scripts/benchmark_utils/pdf_performance_task.py @@ -0,0 +1,94 @@ +import os +import time +from typing import List + +from pdfminer.pdfpage import PDFPage + +from dedoc.utils.pdf_utils import get_pdf_page_count +from dedoc.utils.utils import send_file +from scripts.benchmark_utils.performance_result import PerformanceResult + + +class PDFPerformanceTask: + """ + This class is used to estimate the elapsed time of different PDF pipelines + in different PDF files and save the information into an html table. 
+ """ + + def __init__(self, dedoc_host: str, title: str, input_dir: str, pdf_reader_options: List[str], config: dict) -> None: + """ + Initialization of task + + :param dedoc_host: URL to launch the dedoc API instance, for example http://localhost:1231 + :param title: title of the task to display in the html report + :param input_dir: path to the directory containing the PDF files. + :param pdf_reader_options: list of options available for the "pdf_with_text_layer" API parameter + :param config: additional file processing parameters + """ + self.dedoc_host = dedoc_host + self.title = title + self.config = config + self.pdf_reader_options = pdf_reader_options + + filenames = [os.path.join(input_dir, filename) for filename in os.listdir(input_dir) if filename.endswith(".pdf")] + self.times = {pdf_option: {filename: PerformanceResult() for filename in filenames} for pdf_option in self.pdf_reader_options} + self.pages = {filename: get_pdf_page_count(filename) for filename in filenames} + self.filenames = sorted(filenames, key=lambda filename: self.pages[filename]) + + def run(self) -> None: + print(f'Run task "{self.title}"') + + for pdf_option in self.pdf_reader_options: + print(f' Handle files with pdf option "{pdf_option}":') + self.__run_files(pdf_option) + + def to_html(self) -> str: + if not self.filenames: + return "" + + pdf_header = "".join(f"{pdf_option}" for pdf_option in self.pdf_reader_options) + + html = [ + "
", + f"{self.title} ({len(self.filenames)} files)", "", + f'', + f"{pdf_header}" + ] + + for filename in self.filenames: + times = [self.times[pdf_option][filename] for pdf_option in self.pdf_reader_options] + pages = self.pages[filename] + html.append(f"{self.__get_performance_cells(times, pages)}") + + times = [] + for pdf_option in self.pdf_reader_options: + times.append(PerformanceResult([self.times[pdf_option][filename] / self.pages[filename] for filename in self.filenames])) + + html.append(f'{self.__get_performance_cells(times)}') + html.append("
FilenamePagespdf_with_text_layer
average
{os.path.basename(filename)}{pages}
average (per page)
") + html.append("
\n") + + return "\n".join(html) + + def __run_file(self, pdf_option: str, filename: str) -> float: + start_time = time.time() + send_file(self.dedoc_host, os.path.basename(filename), filename, {"pdf_with_text_layer": pdf_option, **self.config}) + return time.time() - start_time + + def __run_files(self, pdf_option: str) -> None: + for i, filename in enumerate(self.filenames): + elapsed_time = self.__run_file(pdf_option, filename) + self.times[pdf_option][filename].add(elapsed_time) + print(f' - handle file {i + 1} / {len(self.filenames)} "{os.path.basename(filename)}" (pages: {self.pages[filename]}): {elapsed_time} seconds') + + print("") + + def __get_performance_cells(self, pdf_times: List[PerformanceResult], pages: int = 0) -> str: + total_times = pdf_times + [PerformanceResult(pdf_times)] + return "".join(f"{times} ({times / pages} / page)" if pages > 0 else f"{times}" for times in total_times) + + def __get_page_count(self, path: str) -> int: + with open(path, "rb") as fp: + pages = len(list(PDFPage.get_pages(fp))) + + return max(pages, 1) diff --git a/scripts/benchmark_utils/performance_result.py b/scripts/benchmark_utils/performance_result.py new file mode 100644 index 00000000..93b7db8c --- /dev/null +++ b/scripts/benchmark_utils/performance_result.py @@ -0,0 +1,59 @@ +from typing import Iterable, Optional, Union + +import numpy as np + + +class PerformanceResult: + """ + This class is used for storing multiple results of measuring some metric (for example, elapsed time) + with support for calculating mean and std statistics and pretty printing of stored values + + >>> result = PerformanceResult() + >>> f"result: {result}" # result: - + >>> result.add(5.0) + >>> f"result: {result}" # result: 5.00 + >>> result.add(8.0) + >>> f"result: {result}" # result: 6.50±1.50 + >>> result.mean # 6.5 + >>> result.std # 1.5 + >>> partial_result = result / 4 + >>> f"partial_result: {partial_result}" # partial_result: 1.62±0.38 + """ + + def __init__(self, results: 
Optional[Iterable["PerformanceResult"]] = None) -> None: + self.values = [] + + if results is not None: + for result in results: + self.add(result) + + def add(self, value: Union[float, "PerformanceResult"]) -> None: + if isinstance(value, PerformanceResult): + self.values.extend(value.values) + else: + self.values.append(value) + + @property + def mean(self) -> float: + return np.mean(self.values) if self.values else 0 + + @property + def std(self) -> float: + return np.std(self.values) if self.values else 0 + + def __str__(self) -> str: + if not self.values: + return "-" + + if len(self.values) == 1: + return f"{self.mean:.2f}" + + return f"{self.mean:.2f}±{self.std:.2f}" + + def __truediv__(self, scale: float) -> "PerformanceResult": + result = PerformanceResult() + + for t in self.values: + result.add(t / scale) + + return result