Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add pypdfium2 rendering backend #384

Closed
wants to merge 10 commits into from
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 9 additions & 3 deletions camelot/backends/image_conversion.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,25 @@
# -*- coding: utf-8 -*-

from .pdfium_backend import PdfiumBackend
from .poppler_backend import PopplerBackend
from .ghostscript_backend import GhostscriptBackend

BACKENDS = {"poppler": PopplerBackend, "ghostscript": GhostscriptBackend}
BACKENDS = {
"pdfium": PdfiumBackend,
"poppler": PopplerBackend,
"ghostscript": GhostscriptBackend,
}


class ImageConversionBackend(object):
def __init__(self, backend="poppler", use_fallback=True):
def __init__(self, backend="pdfium", use_fallback=True):
if backend not in BACKENDS.keys():
raise ValueError(f"Image conversion backend '{backend}' not supported")

self.backend = backend
self.use_fallback = use_fallback
self.fallbacks = list(filter(lambda x: x != backend, BACKENDS.keys()))
self.fallbacks = list(BACKENDS.keys())
self.fallbacks.remove(self.backend)

def convert(self, pdf_path, png_path):
try:
Expand Down
15 changes: 15 additions & 0 deletions camelot/backends/pdfium_backend.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# -*- coding: utf-8 -*-

try:
    import pypdfium2 as pdfium
except Exception:
    # Deliberately broad: any failure while loading pypdfium2 is treated as
    # "backend unavailable" and is reported lazily when convert() is called.
    pdfium = None


class PdfiumBackend(object):
    """Image conversion backend that renders a PDF page using pypdfium2."""

    def convert(self, pdf_path, png_path, resolution=300):
        """Render a single-page PDF and save the result as an image file.

        Parameters
        ----------
        pdf_path : str
            Path of the PDF to render. It must contain exactly one page.
        png_path : str
            Path the rendered image is saved to.
        resolution : int or float, optional (default: 300)
            Target resolution; it is divided by 72 to obtain the scale
            factor handed to pypdfium2's page renderer.

        Raises
        ------
        OSError
            If pypdfium2 could not be imported.
        ValueError
            If the document does not contain exactly one page.

        """
        if pdfium is None:
            raise OSError("pypdfium2 is not installed.")

        doc = pdfium.PdfDocument(pdf_path)
        try:
            # Explicit check instead of `assert`, which is stripped when
            # Python runs with -O and would let multi-page input through.
            page_count = len(doc)
            if page_count != 1:
                raise ValueError(
                    f"Expected a single-page PDF, got {page_count} pages"
                )
            image = doc[0].render(scale=resolution / 72).to_pil()
            # Save before the document is closed so the rendered data is
            # written out while everything it came from is still alive.
            image.save(png_path)
        finally:
            # Release the underlying PDFium document handle even on failure.
            doc.close()
8 changes: 7 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,13 @@
"tabulate>=0.8.9",
]

base_requires = ["ghostscript>=0.7", "opencv-python>=3.4.2.17", "pdftopng>=0.2.3"]
base_requires = [
"opencv-python>=3.4.2.17",
"pypdfium2>=4,<5",
"pillow",
"ghostscript>=0.7", # deprecate?
"pdftopng>=0.2.3", # deprecate?
Copy link
Contributor Author

@mara004 mara004 Jun 23, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FYI, I was not able to get pdftopng installed locally with Python 3.11

]

plot_requires = [
"matplotlib>=2.2.3",
Expand Down
34 changes: 34 additions & 0 deletions tests/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,14 @@ def test_password():
assert_frame_equal(df, tables[0].df)


def test_repr_pdfium():
    """Reading the local foo.pdf with the pdfium backend yields the expected reprs."""
    pdf_path = os.path.join(testdir, "foo.pdf")
    table_list = camelot.read_pdf(pdf_path, backend="pdfium")

    # One table is expected in the document.
    assert repr(table_list) == "<TableList n=1>"

    first_table = table_list[0]
    assert repr(first_table) == "<Table shape=(7, 7)>"

    # Top-left cell coordinates as rendered by the pdfium backend.
    top_left = table_list[0].cells[0][0]
    assert repr(top_left) == "<Cell x1=121 y1=218 x2=165 y2=234>"


def test_repr_poppler():
filename = os.path.join(testdir, "foo.pdf")
tables = camelot.read_pdf(filename, backend="poppler")
Expand All @@ -76,6 +84,14 @@ def test_repr_ghostscript():
assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=218 x2=165 y2=234>"


def test_url_pdfium():
    """Reading foo.pdf from its URL with the pdfium backend yields the expected reprs."""
    source = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"
    table_list = camelot.read_pdf(source, backend="pdfium")

    # One table is expected in the document.
    assert repr(table_list) == "<TableList n=1>"

    first_table = table_list[0]
    assert repr(first_table) == "<Table shape=(7, 7)>"

    # Top-left cell coordinates as rendered by the pdfium backend.
    top_left = table_list[0].cells[0][0]
    assert repr(top_left) == "<Cell x1=121 y1=218 x2=165 y2=234>"


def test_url_poppler():
url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"
tables = camelot.read_pdf(url, backend="poppler")
Expand All @@ -93,6 +109,24 @@ def test_url_ghostscript():
assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=218 x2=165 y2=234>"


def test_pages_pdfium():
    """The pdfium backend gives the same result for each page selection.

    The document is read three times: without a ``pages`` argument, with
    ``pages="1-end"`` and with ``pages="all"``; each read must produce the
    same table list, table shape and first-cell coordinates.
    """
    url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"

    # Same order as the three explicit reads: no pages argument, then the
    # two page specifications.
    page_selections = ({}, {"pages": "1-end"}, {"pages": "all"})
    for selection in page_selections:
        tables = camelot.read_pdf(url, backend="pdfium", **selection)
        assert repr(tables) == "<TableList n=1>"
        assert repr(tables[0]) == "<Table shape=(7, 7)>"
        assert repr(tables[0].cells[0][0]) == "<Cell x1=121 y1=218 x2=165 y2=234>"


def test_pages_poppler():
url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"
tables = camelot.read_pdf(url, backend="poppler")
Expand Down
6 changes: 3 additions & 3 deletions tests/test_image_conversion_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def test_poppler_backend_error_when_no_use_fallback(monkeypatch):
monkeypatch.setattr(
"camelot.backends.image_conversion.BACKENDS", BACKENDS, raising=True
)
backend = ImageConversionBackend(use_fallback=False)
backend = ImageConversionBackend(backend="poppler", use_fallback=False)

message = "Image conversion failed with image conversion backend 'poppler'"
with pytest.raises(ValueError, match=message):
Expand All @@ -44,7 +44,7 @@ def test_ghostscript_backend_when_use_fallback(monkeypatch):
monkeypatch.setattr(
"camelot.backends.image_conversion.BACKENDS", BACKENDS, raising=True
)
backend = ImageConversionBackend()
backend = ImageConversionBackend(backend="poppler")
backend.convert("foo", "bar")


Expand All @@ -53,7 +53,7 @@ def test_ghostscript_backend_error_when_use_fallback(monkeypatch):
monkeypatch.setattr(
"camelot.backends.image_conversion.BACKENDS", BACKENDS, raising=True
)
backend = ImageConversionBackend()
backend = ImageConversionBackend(backend="poppler")

message = "Image conversion failed with image conversion backend 'ghostscript'"
with pytest.raises(ValueError, match=message):
Expand Down