Skip to content

Commit 23b0427

Browse files
committed
Add pypdfium2 rendering backend (experimental patch)
1 parent 44b4e68 commit 23b0427

File tree

5 files changed

+62
-6
lines changed

5 files changed

+62
-6
lines changed

camelot/backends/image_conversion.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,13 @@
22

33
from .poppler_backend import PopplerBackend
44
from .ghostscript_backend import GhostscriptBackend
5+
from .pdfium_backend import PdfiumBackend
56

6-
BACKENDS = {"poppler": PopplerBackend, "ghostscript": GhostscriptBackend}
7+
BACKENDS = {"poppler": PopplerBackend, "ghostscript": GhostscriptBackend, "pdfium": PdfiumBackend}
78

89

910
class ImageConversionBackend(object):
10-
def __init__(self, backend="poppler", use_fallback=True):
11+
def __init__(self, backend="pdfium", use_fallback=True):
1112
if backend not in BACKENDS.keys():
1213
raise ValueError(f"Image conversion backend '{backend}' not supported")
1314

camelot/backends/pdfium_backend.py

+15
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# -*- coding: utf-8 -*-
2+
3+
try:
    import pypdfium2 as pdfium
except Exception:
    # Deliberately broad: any failure while importing pypdfium2 leaves this
    # backend unavailable instead of breaking the import of this module.
    pdfium = None


class PdfiumBackend(object):
    """Image conversion backend that renders a PDF page with pypdfium2."""

    def convert(self, pdf_path, png_path, resolution=300):
        """Render the only page of a PDF and save it as an image file.

        Parameters
        ----------
        pdf_path : str
            Path of the PDF to open; it must contain exactly one page.
        png_path : str
            Path the rendered image is saved to.
        resolution : int, optional (default: 300)
            Target resolution; the render scale is ``resolution / 72``.

        Raises
        ------
        OSError
            If pypdfium2 could not be imported.
        ValueError
            If the document does not contain exactly one page.

        """
        if pdfium is None:
            raise OSError("pypdfium2 is not installed.")

        doc = pdfium.PdfDocument(pdf_path)

        # Explicit check rather than ``assert``: assertions are stripped when
        # Python runs with -O, which would silently render only page 0 of a
        # multi-page document.
        page_count = len(doc)
        if page_count != 1:
            raise ValueError(
                f"Expected a single-page PDF, got {page_count} pages: {pdf_path}"
            )

        # pypdfium2 scales relative to 72 canvas units per inch, so
        # resolution / 72 yields the requested pixels per inch.
        image = doc[0].render(scale=resolution / 72).to_pil()
        image.save(png_path)

setup.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,13 @@
2424
"tabulate>=0.8.9",
2525
]
2626

27-
base_requires = ["ghostscript>=0.7", "opencv-python>=3.4.2.17", "pdftopng>=0.2.3"]
27+
base_requires = [
28+
"opencv-python>=3.4.2.17",
29+
"pypdfium2>=4,<5",
30+
"pillow",
31+
"ghostscript>=0.7", # deprecate?
32+
"pdftopng>=0.2.3", # deprecate?
33+
]
2834

2935
plot_requires = [
3036
"matplotlib>=2.2.3",

tests/test_common.py

+34
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,14 @@ def test_password():
5959
assert_frame_equal(df, tables[0].df)
6060

6161

62+
def test_repr_pdfium():
    """Check table list, table and cell reprs for a local PDF read with the pdfium backend."""
    pdf_file = os.path.join(testdir, "foo.pdf")
    extracted = camelot.read_pdf(pdf_file, backend="pdfium")

    assert repr(extracted) == "<TableList n=1>"

    first_table = extracted[0]
    assert repr(first_table) == "<Table shape=(7, 7)>"

    top_left_cell = first_table.cells[0][0]
    assert repr(top_left_cell) == "<Cell x1=121 y1=218 x2=165 y2=234>"
68+
69+
6270
def test_repr_poppler():
6371
filename = os.path.join(testdir, "foo.pdf")
6472
tables = camelot.read_pdf(filename, backend="poppler")
@@ -76,6 +84,14 @@ def test_repr_ghostscript():
7684
assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=218 x2=165 y2=234>"
7785

7886

87+
def test_url_pdfium():
    """Check table list, table and cell reprs for a PDF fetched by URL with the pdfium backend."""
    pdf_url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"
    extracted = camelot.read_pdf(pdf_url, backend="pdfium")

    assert repr(extracted) == "<TableList n=1>"

    first_table = extracted[0]
    assert repr(first_table) == "<Table shape=(7, 7)>"

    top_left_cell = first_table.cells[0][0]
    assert repr(top_left_cell) == "<Cell x1=121 y1=218 x2=165 y2=234>"
93+
94+
7995
def test_url_poppler():
8096
url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"
8197
tables = camelot.read_pdf(url, backend="poppler")
@@ -93,6 +109,24 @@ def test_url_ghostscript():
93109
assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=218 x2=165 y2=234>"
94110

95111

112+
def test_pages_pdfium():
    """Check that the pdfium backend gives the same reprs for each page selection.

    The PDF is read three times: without a ``pages`` argument, with
    ``pages="1-end"`` and with ``pages="all"``.
    """
    pdf_url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"

    # An empty dict reproduces the call that passes no ``pages`` argument.
    page_selections = ({}, {"pages": "1-end"}, {"pages": "all"})

    for selection in page_selections:
        extracted = camelot.read_pdf(pdf_url, backend="pdfium", **selection)

        assert repr(extracted) == "<TableList n=1>"

        first_table = extracted[0]
        assert repr(first_table) == "<Table shape=(7, 7)>"

        top_left_cell = first_table.cells[0][0]
        assert repr(top_left_cell) == "<Cell x1=121 y1=218 x2=165 y2=234>"
128+
129+
96130
def test_pages_poppler():
97131
url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"
98132
tables = camelot.read_pdf(url, backend="poppler")

tests/test_image_conversion_backend.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ def test_poppler_backend_error_when_no_use_fallback(monkeypatch):
2929
monkeypatch.setattr(
3030
"camelot.backends.image_conversion.BACKENDS", BACKENDS, raising=True
3131
)
32-
backend = ImageConversionBackend(use_fallback=False)
32+
backend = ImageConversionBackend(backend="poppler", use_fallback=False)
3333

3434
message = "Image conversion failed with image conversion backend 'poppler'"
3535
with pytest.raises(ValueError, match=message):
@@ -44,7 +44,7 @@ def test_ghostscript_backend_when_use_fallback(monkeypatch):
4444
monkeypatch.setattr(
4545
"camelot.backends.image_conversion.BACKENDS", BACKENDS, raising=True
4646
)
47-
backend = ImageConversionBackend()
47+
backend = ImageConversionBackend(backend="poppler")
4848
backend.convert("foo", "bar")
4949

5050

@@ -53,7 +53,7 @@ def test_ghostscript_backend_error_when_use_fallback(monkeypatch):
5353
monkeypatch.setattr(
5454
"camelot.backends.image_conversion.BACKENDS", BACKENDS, raising=True
5555
)
56-
backend = ImageConversionBackend()
56+
backend = ImageConversionBackend(backend="poppler")
5757

5858
message = "Image conversion failed with image conversion backend 'ghostscript'"
5959
with pytest.raises(ValueError, match=message):

0 commit comments

Comments
 (0)