Skip to content

Commit a799bde

Browse files
Add support for concurrent processing of pages.
setup.py: Add new _apply_pages.py to wheels/installs. src/__init__.py: New top-level apply_pages(), plus convenience function get_text() which uses apply_pages(). Unlike Python's `multiprocessing` module, we also support passing keyword args to functions in worker processes. src/_apply_pages.py: New; contains the implementation of apply_pages(). tests/test_pylint.py: Avoid pylint failure by disabling `R0801: Similar lines in 2 files`. tests/test_textextract.py: Test get_text() and show timings. src/fitz___init__.py, tests/conftest.py: Use functions to manipulate _g_log_items so that things work even when using the `fitz` alias. Timings (speedup relative to method='single') for 8-core macOS-arm64 and the PDF spec: method='mp': 3.3x. method='fork': 3.6x.
1 parent 838d8f1 commit a799bde

File tree

7 files changed

+525
-9
lines changed

7 files changed

+525
-9
lines changed

setup.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -606,21 +606,26 @@ def add( ret, from_, to_):
606606

607607
if path_so_leaf_b:
608608
# Add rebased implementation files.
609-
add( ret_p, f'{g_root}/src/fitz___init__.py', 'fitz/__init__.py') # For `fitz` module alias.
610-
add( ret_p, f'{g_root}/src/fitz_table.py', 'fitz/table.py') # For `fitz` module alias.
611-
add( ret_p, f'{g_root}/src/fitz_utils.py', 'fitz/utils.py') # For `fitz` module alias.
612609
to_dir = 'pymupdf/'
613610
add( ret_p, f'{g_root}/src/__init__.py', to_dir)
614611
add( ret_p, f'{g_root}/src/__main__.py', to_dir)
615612
add( ret_p, f'{g_root}/src/pymupdf.py', to_dir)
616613
add( ret_p, f'{g_root}/src/table.py', to_dir)
617614
add( ret_p, f'{g_root}/src/utils.py', to_dir)
615+
add( ret_p, f'{g_root}/src/_apply_pages.py', to_dir)
618616
add( ret_p, f'{g_root}/src/build/extra.py', to_dir)
619617
add( ret_p, f'{g_root}/src/build/{path_so_leaf_b}', to_dir)
620618

619+
# Add support for `fitz` backwards compatibility.
620+
add( ret_p, f'{g_root}/src/fitz___init__.py', 'fitz/__init__.py')
621+
add( ret_p, f'{g_root}/src/fitz_table.py', 'fitz/table.py')
622+
add( ret_p, f'{g_root}/src/fitz_utils.py', 'fitz/utils.py')
623+
621624
if mupdf_local:
625+
# Add MuPDF Python API.
622626
add( ret_p, f'{mupdf_build_dir}/mupdf.py', to_dir)
623627

628+
# Add MuPDF shared libraries.
624629
if windows:
625630
wp = pipcl.wdev.WindowsPython()
626631
add( ret_p, f'{mupdf_build_dir}/_mupdf.pyd', to_dir)

src/__init__.py

Lines changed: 206 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
import string
2121
import sys
2222
import tarfile
23+
import time
2324
import typing
2425
import warnings
2526
import weakref
@@ -61,8 +62,19 @@ def _set_stream(name, default):
6162
_g_out_log = _set_stream('PYMUPDF_LOG', sys.stdout)
6263
_g_out_message = _set_stream('PYMUPDF_MESSAGE', sys.stdout)
6364

64-
# Set to list() if we are in test suite.
65-
_g_log_items = None
65+
_g_log_items = list()
66+
_g_log_items_active = False
67+
68+
def _log_items():
    '''
    Returns the module-level list to which `log()` appends messages while
    log-item collection is active. The list object itself is returned, not a
    copy.
    '''
    return _g_log_items
70+
71+
def _log_items_active(active):
    '''
    Sets the module-level flag `_g_log_items_active` to `active`; `log()`
    only appends to the log-items list while this flag is true.
    '''
    # Rebind the module global, not a function-local name.
    global _g_log_items_active
    _g_log_items_active = active
74+
75+
def _log_items_clear():
    '''
    Empties the module-level log-items list in place, so that existing
    references to the list see the change.
    '''
    _g_log_items.clear()
77+
6678

6779
def log( text='', caller=1):
6880
'''
@@ -73,7 +85,7 @@ def log( text='', caller=1):
7385
line = frame_record.lineno
7486
function = frame_record.function
7587
text = f'{filename}:{line}:{function}: {text}'
76-
if _g_log_items is not None:
88+
if _g_log_items_active:
7789
_g_log_items.append(text)
7890
print(text, file=_g_out_log)
7991
_g_out_log.flush()
@@ -20942,6 +20954,197 @@ def vdist(dir, a, b):
2094220954
return mupdf.fz_abs(dx * dir.y + dy * dir.x)
2094320955

2094420956

20957+
def apply_pages(
        path,
        pagefn,
        *,
        pagefn_args=(),
        pagefn_kwargs=dict(),
        initfn=None,
        initfn_args=(),
        initfn_kwargs=dict(),
        pages=None,
        method='single',
        concurrency=None,
        _stats=False,
        ):
    '''
    Returns list of results from `pagefn()`, optionally using concurrency for
    speed.

    Args:
        path:
            Path of document.
        pagefn:
            Function to call for each page; is passed (page, *pagefn_args,
            **pagefn_kwargs). Return value is added to list that we return. If
            `method` is not 'single', must be a top-level function - nested
            functions don't work with concurrency.
        pagefn_args
        pagefn_kwargs:
            Additional args to pass to `pagefn`. Must be picklable.
        initfn:
            If true, called once in each worker process (or once in the
            current process if `method` is 'single'); is passed
            (*initfn_args, **initfn_kwargs).
        initfn_args
        initfn_kwargs:
            Args to pass to initfn. Must be picklable.
        pages:
            List of page numbers to process, or None to include all pages.
            Honoured by all methods, including 'single'.
        method:
            'single'
                Do not use concurrency.
            'mp'
                Operate concurrently using Python's `multiprocessing` module.
            'fork'
                Operate concurrently using custom implementation with
                `os.fork()`. Does not work on Windows.
        concurrency:
            Number of worker processes to use when operating concurrently. If
            None, we use the number of available CPUs.
        _stats:
            Internal, may change or be removed. If true, we output simple
            timing diagnostics.

    Raises:
        AssertionError:
            If `method` is not one of 'single', 'mp' or 'fork'.

    Note: We require a file path rather than a Document, because Document
    instances do not work properly after a fork - internal file descriptor
    offsets are shared between the parent and child processes.
    '''
    # The `dict()` defaults are kept for interface compatibility; they are
    # only ever read or forwarded, never mutated here.
    if _stats:
        t0 = time.time()

    if method == 'single':
        if initfn:
            initfn(*initfn_args, **initfn_kwargs)
        document = Document(path)
        if pages is None:
            page_iter = document
        else:
            # Respect the caller's page selection and ordering.
            page_iter = (document[pno] for pno in pages)
        # Page functions receive the page args/kwargs, not the init ones.
        ret = [
                pagefn(page, *pagefn_args, **pagefn_kwargs)
                for page in page_iter
                ]

    else:
        # Use concurrency.
        #
        from . import _apply_pages

        if pages is None:
            if _stats:
                t = time.time()
            with Document(path) as document:
                num_pages = len(document)
                pages = list(range(num_pages))
            if _stats:
                t = time.time() - t
                log(f'{t:.2f}s: count pages.')

        if _stats:
            t = time.time()

        # Both backends take the same arguments, so only the entry point
        # differs. It is looked up lazily so that only the requested backend
        # is touched.
        if method == 'mp':
            worker = _apply_pages._multiprocessing
        elif method == 'fork':
            worker = _apply_pages._fork
        else:
            # Raised explicitly (rather than `assert 0`) so that the check
            # still happens when Python runs with -O.
            raise AssertionError(f'Unrecognised {method=}.')

        ret = worker(
                path,
                pages,
                pagefn,
                pagefn_args,
                pagefn_kwargs,
                initfn,
                initfn_args,
                initfn_kwargs,
                concurrency,
                _stats,
                )

        if _stats:
            t = time.time() - t
            log(f'{t:.2f}s: work.')

    if _stats:
        t = time.time() - t0
        log(f'{t:.2f}s: total.')
    return ret
21082+
21083+
21084+
def get_text(
        path,
        *,
        pages=None,
        method='single',
        concurrency=None,

        option='text',
        clip=None,
        flags=None,
        textpage=None,
        sort=False,
        delimiters=None,

        _stats=False,
        ):
    '''
    Convenience wrapper for `apply_pages()` that uses `Page.get_text` as the
    per-page function; returns the list that `apply_pages()` returns.

    Args:
        path:
            Path of document.
        pages:
            List of page numbers to process, or None to include all pages.
        method:
            'single'
                Do not use concurrency.
            'mp'
                Operate concurrently using Python's `multiprocessing` module.
            'fork'
                Operate concurrently using custom implementation with
                `os.fork`. Does not work on Windows.
        concurrency:
            Number of worker processes to use when operating concurrently. If
            None, we use the number of available CPUs.
        option
        clip
        flags
        textpage
        sort
        delimiters:
            Forwarded to `apply_pages()` as `pagefn_kwargs`, i.e. as keyword
            args for `Page.get_text()`.
        _stats:
            Forwarded unchanged to `apply_pages()`.
    '''
    # Keyword args for each Page.get_text() call.
    page_kwargs = {
            'option': option,
            'clip': clip,
            'flags': flags,
            'textpage': textpage,
            'sort': sort,
            'delimiters': delimiters,
            }
    results = apply_pages(
            path,
            Page.get_text,
            pagefn_kwargs=page_kwargs,
            pages=pages,
            method=method,
            concurrency=concurrency,
            _stats=_stats,
            )
    return results
21146+
21147+
2094521148
class TOOLS:
2094621149
'''
2094721150
We use @staticmethod to avoid the need to create an instance of this class.

0 commit comments

Comments
 (0)