20
20
import string
21
21
import sys
22
22
import tarfile
23
+ import time
23
24
import typing
24
25
import warnings
25
26
import weakref
@@ -61,8 +62,19 @@ def _set_stream(name, default):
61
62
_g_out_log = _set_stream('PYMUPDF_LOG', sys.stdout)
62
63
_g_out_message = _set_stream('PYMUPDF_MESSAGE', sys.stdout)
63
64
64
- # Set to list() if we are in test suite.
65
- _g_log_items = None
65
# Text of every message that log() emitted while capture was switched on.
_g_log_items = []
# When true, log() appends each message it prints to _g_log_items.
_g_log_items_active = False

def _log_items():
    '''
    Returns the list of captured log() messages.

    This is the live module-level list, not a copy.
    '''
    return _g_log_items

def _log_items_active(active):
    '''
    Switches capture of log() messages on (true) or off (false).
    '''
    global _g_log_items_active
    _g_log_items_active = active

def _log_items_clear():
    '''
    Empties the list of captured log() messages in place, so references
    previously returned by _log_items() see the change.
    '''
    _g_log_items.clear()
77
+
66
78
67
79
def log( text='', caller=1):
68
80
'''
@@ -73,7 +85,7 @@ def log( text='', caller=1):
73
85
line = frame_record.lineno
74
86
function = frame_record.function
75
87
text = f'{filename}:{line}:{function}: {text}'
76
- if _g_log_items is not None :
88
+ if _g_log_items_active :
77
89
_g_log_items.append(text)
78
90
print(text, file=_g_out_log)
79
91
_g_out_log.flush()
@@ -20942,6 +20954,197 @@ def vdist(dir, a, b):
20942
20954
return mupdf.fz_abs(dx * dir.y + dy * dir.x)
20943
20955
20944
20956
20957
def apply_pages(
        path,
        pagefn,
        *,
        pagefn_args=(),
        pagefn_kwargs=None,
        initfn=None,
        initfn_args=(),
        initfn_kwargs=None,
        pages=None,
        method='single',
        concurrency=None,
        _stats=False,
        ):
    '''
    Returns list of results from `pagefn()`, optionally using concurrency for
    speed.

    Args:
        path:
            Path of document.
        pagefn:
            Function to call for each page; is passed (page, *pagefn_args,
            **pagefn_kwargs). Return value is added to list that we return. If
            `method` is not 'single', must be a top-level function - nested
            functions don't work with concurrency.
        pagefn_args
        pagefn_kwargs:
            Additional args to pass to `pagefn`. Must be picklable. A
            `pagefn_kwargs` of None means no keyword args.
        initfn:
            If true, called once in each worker process (or once in the
            current process if `method` is 'single'); is passed
            (*initfn_args, **initfn_kwargs).
        initfn_args
        initfn_kwargs:
            Args to pass to initfn. Must be picklable. An `initfn_kwargs` of
            None means no keyword args.
        pages:
            List of page numbers to process, or None to include all pages.
        method:
            'single'
                Do not use concurrency.
            'mp'
                Operate concurrently using Python's `multiprocessing` module.
            'fork'
                Operate concurrently using custom implementation with
                `os.fork()`. Does not work on Windows.
        concurrency:
            Number of worker processes to use when operating concurrently. If
            None, we use the number of available CPUs.
        _stats:
            Internal, may change or be removed. If true, we output simple
            timing diagnostics.

    Returns:
        List containing the return value of `pagefn()` for each processed
        page.

    Raises:
        AssertionError:
            If `method` is not one of 'single', 'mp' or 'fork'.

    Note: We require a file path rather than a Document, because Document
    instances do not work properly after a fork - internal file descriptor
    offsets are shared between the parent and child processes.
    '''
    if method not in ('single', 'mp', 'fork'):
        # Fail before opening the document or importing the worker module.
        # Raised explicitly rather than via an `assert` statement so that the
        # check is not stripped by `python -O`.
        raise AssertionError(f'Unrecognised {method=}.')

    # Use a fresh dict per call instead of a shared mutable default.
    if pagefn_kwargs is None:
        pagefn_kwargs = {}
    if initfn_kwargs is None:
        initfn_kwargs = {}

    if _stats:
        t0 = time.time()

    if method == 'single':
        if initfn:
            initfn(*initfn_args, **initfn_kwargs)
        # Close the document once all pages have been processed.
        with Document(path) as document:
            if pages is None:
                selected = document
            else:
                # Honour the requested page numbers, in the order given.
                selected = (document[pno] for pno in pages)
            ret = [
                    pagefn(page, *pagefn_args, **pagefn_kwargs)
                    for page in selected
                    ]

    else:
        # Use concurrency.
        #
        from . import _apply_pages

        if pages is None:
            if _stats:
                t = time.time()
            with Document(path) as document:
                num_pages = len(document)
                pages = list(range(num_pages))
            if _stats:
                t = time.time() - t
                log(f'{t:.2f}s: count pages.')

        if _stats:
            t = time.time()

        # `method` was validated above, so it is either 'mp' or 'fork' here.
        if method == 'mp':
            worker = _apply_pages._multiprocessing
        else:
            worker = _apply_pages._fork
        ret = worker(
                path,
                pages,
                pagefn,
                pagefn_args,
                pagefn_kwargs,
                initfn,
                initfn_args,
                initfn_kwargs,
                concurrency,
                _stats,
                )

        if _stats:
            t = time.time() - t
            log(f'{t:.2f}s: work.')

    if _stats:
        t = time.time() - t0
        log(f'{t:.2f}s: total.')
    return ret
21082
+
21083
+
21084
def get_text(
        path,
        *,
        pages=None,
        method='single',
        concurrency=None,

        option='text',
        clip=None,
        flags=None,
        textpage=None,
        sort=False,
        delimiters=None,

        _stats=False,
        ):
    '''
    Runs `Page.get_text()` over the pages of a document via `apply_pages()`
    and returns whatever `apply_pages()` returns, optionally using
    concurrency for speed.

    Args:
        path:
            Path of document.
        pages:
            List of page numbers to process, or None to include all pages.
        method:
            'single'
                Do not use concurrency.
            'mp'
                Operate concurrently using Python's `multiprocessing` module.
            'fork'
                Operate concurrently using custom implementation with
                `os.fork`. Does not work on Windows.
        concurrency:
            Number of worker processes to use when operating concurrently. If
            None, we use the number of available CPUs.
        option
        clip
        flags
        textpage
        sort
        delimiters:
            Forwarded to `apply_pages()` as `pagefn_kwargs`, i.e. intended as
            keyword args for the internal calls to `Page.get_text()`.
        _stats:
            Forwarded unchanged to `apply_pages()`.
    '''
    # Keyword args destined for each Page.get_text() call.
    page_get_text_kwargs = {
            'option': option,
            'clip': clip,
            'flags': flags,
            'textpage': textpage,
            'sort': sort,
            'delimiters': delimiters,
            }

    results = apply_pages(
            path,
            Page.get_text,
            pagefn_kwargs=page_get_text_kwargs,
            pages=pages,
            method=method,
            concurrency=concurrency,
            _stats=_stats,
            )
    return results
21146
+
21147
+
20945
21148
class TOOLS:
20946
21149
'''
20947
21150
We use @staticmethod to avoid the need to create an instance of this class.
0 commit comments