18
18
from .utils import get_text_objects
19
19
from .utils import is_url
20
20
21
+ import warnings
21
22
22
23
class PDFHandler :
23
24
"""Handles all operations like temp directory creation, splitting
@@ -36,7 +37,7 @@ class PDFHandler:
36
37
37
38
"""
38
39
39
- def __init__ (self , filepath : Union [StrByteType , Path ], pages = "1" , password = None , multi = [] ):
40
+ def __init__ (self , filepath : Union [StrByteType , Path ], pages = "1" , password = None , multi = {} ):
40
41
if is_url (filepath ):
41
42
filepath = download_url (filepath )
42
43
self .filepath : Union [StrByteType , Path ] = filepath
@@ -188,35 +189,39 @@ def parse(
188
189
if parallel and len (self .pages ) > 1 and cpu_count > 1 :
189
190
with mp .get_context ("spawn" ).Pool (processes = cpu_count ) as pool :
190
191
jobs = []
191
- for p in self .pages :
192
-
192
+ for i , p in enumerate ( self .pages , 1 ) :
193
+ p_no = str ( i ) # [start] # [-5]
193
194
page_kwargs = kwargs
194
195
page_parser = parser
195
-
196
- if p in self .multi :
196
+ # assert p == 0
197
+ # print("test")
198
+ # warnings.warn(UserWarning("{}".format(p)))
199
+ if p_no in self .multi :
200
+ print (p + " is found in " + self .multi )
197
201
page_kwargs .update (self .multi [p_no ])
198
202
page_parser = Lattice (** page_kwargs ) if flavor == 'lattice' else Stream (** page_kwargs )
199
203
200
204
j = pool .apply_async (
201
- self ._parse_page ,(p , tempdir , parser , suppress_stdout , layout_kwargs )
205
+ self ._parse_page ,(p , tempdir , page_parser , suppress_stdout , layout_kwargs )
202
206
)
203
207
jobs .append (j )
204
208
205
209
for j in jobs :
206
210
t = j .get ()
207
211
tables .extend (t )
208
212
else :
209
- for p in self .pages :
210
- # p_no = p
213
+ for i , p in enumerate ( self .pages , 1 ) :
214
+ p_no = str ( i ) # [start] # [-5]
211
215
212
216
page_kwargs = kwargs
213
217
page_parser = parser
214
218
215
- if p in self .multi :
219
+ if p_no in self .multi :
220
+ print (i ,p ) # debug
216
221
page_kwargs .update (self .multi [p_no ])
217
222
page_parser = Lattice (** page_kwargs ) if flavor == 'lattice' else Stream (** page_kwargs )
218
223
219
- t = self ._parse_page (p , tempdir , parser , suppress_stdout , layout_kwargs )
224
+ t = self ._parse_page (p , tempdir , page_parser , suppress_stdout , layout_kwargs )
220
225
tables .extend (t )
221
226
222
227
return TableList (sorted (tables ))
0 commit comments