From f889a58f2f822c0a9e9f30a3296576b8666bf695 Mon Sep 17 00:00:00 2001 From: j-t-1 <120829237+j-t-1@users.noreply.github.com> Date: Wed, 13 Aug 2025 12:26:56 +0000 Subject: [PATCH 01/16] STY: Move _xobj_to_image to _xobj_image_helpers.py --- pypdf/_xobj_image_helpers.py | 170 ++++++++++++++++++++++++++++++++- pypdf/filters.py | 177 ----------------------------------- 2 files changed, 169 insertions(+), 178 deletions(-) diff --git a/pypdf/_xobj_image_helpers.py b/pypdf/_xobj_image_helpers.py index a9531fab0..b85984d24 100644 --- a/pypdf/_xobj_image_helpers.py +++ b/pypdf/_xobj_image_helpers.py @@ -2,12 +2,13 @@ import sys from io import BytesIO -from typing import Any, Literal, Union, cast +from typing import Any, Literal, Optional, Union, cast from ._utils import check_if_whitespace_only, logger_warning from .constants import ColorSpaces from .constants import FilterTypes as FT from .constants import ImageAttributes as IA +from .constants import StreamAttributes from .errors import EmptyImageDataError, PdfReadError from .generic import ( ArrayObject, @@ -392,3 +393,170 @@ def _get_mode_and_invert_color( "", ) return mode, invert_color + + +def _xobj_to_image(x_object: dict[str, Any]) -> tuple[Optional[str], bytes, Any]: + """ + Users need to have the pillow package installed. + + It's unclear if pypdf will keep this function here, hence it's private. + It might get removed at any point. 
+ + Args: + x_object: + + Returns: + Tuple[file extension, bytes, PIL.Image.Image] + + """ + def _apply_alpha( + img: Image.Image, + x_object: dict[str, Any], + obj_as_text: str, + image_format: str, + extension: str, + ) -> tuple[Image.Image, str, str]: + alpha = None + if IA.S_MASK in x_object: # add alpha channel + alpha = _xobj_to_image(x_object[IA.S_MASK])[2] + if img.size != alpha.size: + logger_warning( + f"image and mask size not matching: {obj_as_text}", __name__ + ) + else: + # TODO: implement mask + if alpha.mode != "L": + alpha = alpha.convert("L") + if img.mode == "P": + img = img.convert("RGB") + elif img.mode == "1": + img = img.convert("L") + img.putalpha(alpha) + if "JPEG" in image_format: + image_format = "JPEG2000" + extension = ".jp2" + else: + image_format = "PNG" + extension = ".png" + return img, extension, image_format + + # For error reporting + obj_as_text = ( + x_object.indirect_reference.__repr__() + if x_object is None # pragma: no cover + else x_object.__repr__() + ) + + # Get size and data + size = (cast(int, x_object[IA.WIDTH]), cast(int, x_object[IA.HEIGHT])) + data = x_object.get_data() # type: ignore + if isinstance(data, str): # pragma: no cover + data = data.encode() + if len(data) % (size[0] * size[1]) == 1 and data[-1] == 0x0A: # ie. 
'\n' + data = data[:-1] + + # Get color properties + colors = x_object.get("/Colors", 1) + color_space: Any = x_object.get("/ColorSpace", NullObject()).get_object() + if isinstance(color_space, list) and len(color_space) == 1: + color_space = color_space[0].get_object() + + mode, invert_color = _get_mode_and_invert_color(x_object, colors, color_space) + + # Get filters + filters = x_object.get(StreamAttributes.FILTER, NullObject()).get_object() + lfilters = filters[-1] if isinstance(filters, list) else filters + decode_parms = x_object.get(StreamAttributes.DECODE_PARMS, None) + if decode_parms and isinstance(decode_parms, (tuple, list)): + decode_parms = decode_parms[0] + else: + decode_parms = {} + if not isinstance(decode_parms, dict): + decode_parms = {} + + extension = None + if lfilters in (FT.FLATE_DECODE, FT.RUN_LENGTH_DECODE): + img, image_format, extension, _ = _handle_flate( + size, + data, + mode, + color_space, + colors, + obj_as_text, + ) + elif lfilters in (FT.LZW_DECODE, FT.ASCII_85_DECODE): + # I'm not sure if the following logic is correct. 
+ # There might not be any relationship between the filters and the + # extension + if lfilters == FT.LZW_DECODE: + image_format = "TIFF" + extension = ".tiff" # mime_type = "image/tiff" + else: + image_format = "PNG" + extension = ".png" # mime_type = "image/png" + try: + img = Image.open(BytesIO(data), formats=("TIFF", "PNG")) + except UnidentifiedImageError: + img = _extended_image_frombytes(mode, size, data) + elif lfilters == FT.DCT_DECODE: + img, image_format, extension = Image.open(BytesIO(data)), "JPEG", ".jpg" + # invert_color kept unchanged + elif lfilters == FT.JPX_DECODE: + img, image_format, extension, invert_color = _handle_jpx( + size, data, mode, color_space, colors + ) + elif lfilters == FT.CCITT_FAX_DECODE: + img, image_format, extension, invert_color = ( + Image.open(BytesIO(data), formats=("TIFF",)), + "TIFF", + ".tiff", + False, + ) + elif lfilters == FT.JBIG2_DECODE: + img, image_format, extension, invert_color = ( + Image.open(BytesIO(data), formats=("PNG",)), + "PNG", + ".png", + False, + ) + elif mode == "CMYK": + img, image_format, extension, invert_color = ( + _extended_image_frombytes(mode, size, data), + "TIFF", + ".tif", + False, + ) + elif mode == "": + raise PdfReadError(f"ColorSpace field not found in {x_object}") + else: + img, image_format, extension, invert_color = ( + _extended_image_frombytes(mode, size, data), + "PNG", + ".png", + False, + ) + + img = _apply_decode(img, x_object, lfilters, color_space, invert_color) + img, extension, image_format = _apply_alpha( + img, x_object, obj_as_text, image_format, extension + ) + + # Save image to bytes + img_byte_arr = BytesIO() + try: + img.save(img_byte_arr, format=image_format) + except OSError: # pragma: no cover # covered with pillow 10.3 + # in case of we convert to RGBA and then to PNG + img1 = img.convert("RGBA") + image_format = "PNG" + extension = ".png" + img_byte_arr = BytesIO() + img1.save(img_byte_arr, format=image_format) + data = img_byte_arr.getvalue() + + try: # 
temporary try/except until other fixes of images + img = Image.open(BytesIO(data)) + except Exception as exception: + logger_warning(f"Failed loading image: {exception}", __name__) + img = None # type: ignore + return extension, data, img diff --git a/pypdf/filters.py b/pypdf/filters.py index fc0486555..795abfc77 100644 --- a/pypdf/filters.py +++ b/pypdf/filters.py @@ -788,180 +788,3 @@ def decode_stream_data(stream: Any) -> bytes: else: raise NotImplementedError(f"Unsupported filter {filter_name}") return data - - -def _xobj_to_image(x_object: dict[str, Any]) -> tuple[Optional[str], bytes, Any]: - """ - Users need to have the pillow package installed. - - It's unclear if pypdf will keep this function here, hence it's private. - It might get removed at any point. - - Args: - x_object: - - Returns: - Tuple[file extension, bytes, PIL.Image.Image] - - """ - from ._xobj_image_helpers import ( # noqa: PLC0415 - Image, - UnidentifiedImageError, - _apply_decode, - _extended_image_frombytes, - _get_mode_and_invert_color, - _handle_flate, - _handle_jpx, - ) - - def _apply_alpha( - img: Image.Image, - x_object: dict[str, Any], - obj_as_text: str, - image_format: str, - extension: str, - ) -> tuple[Image.Image, str, str]: - alpha = None - if IA.S_MASK in x_object: # add alpha channel - alpha = _xobj_to_image(x_object[IA.S_MASK])[2] - if img.size != alpha.size: - logger_warning( - f"image and mask size not matching: {obj_as_text}", __name__ - ) - else: - # TODO: implement mask - if alpha.mode != "L": - alpha = alpha.convert("L") - if img.mode == "P": - img = img.convert("RGB") - elif img.mode == "1": - img = img.convert("L") - img.putalpha(alpha) - if "JPEG" in image_format: - image_format = "JPEG2000" - extension = ".jp2" - else: - image_format = "PNG" - extension = ".png" - return img, extension, image_format - - # For error reporting - obj_as_text = ( - x_object.indirect_reference.__repr__() - if x_object is None # pragma: no cover - else x_object.__repr__() - ) - - # Get 
size and data - size = (cast(int, x_object[IA.WIDTH]), cast(int, x_object[IA.HEIGHT])) - data = x_object.get_data() # type: ignore - if isinstance(data, str): # pragma: no cover - data = data.encode() - if len(data) % (size[0] * size[1]) == 1 and data[-1] == 0x0A: # ie. '\n' - data = data[:-1] - - # Get color properties - colors = x_object.get("/Colors", 1) - color_space: Any = x_object.get("/ColorSpace", NullObject()).get_object() - if isinstance(color_space, list) and len(color_space) == 1: - color_space = color_space[0].get_object() - - mode, invert_color = _get_mode_and_invert_color(x_object, colors, color_space) - - # Get filters - filters = x_object.get(SA.FILTER, NullObject()).get_object() - lfilters = filters[-1] if isinstance(filters, list) else filters - decode_parms = x_object.get(SA.DECODE_PARMS, None) - if decode_parms and isinstance(decode_parms, (tuple, list)): - decode_parms = decode_parms[0] - else: - decode_parms = {} - if not isinstance(decode_parms, dict): - decode_parms = {} - - extension = None - if lfilters in (FT.FLATE_DECODE, FT.RUN_LENGTH_DECODE): - img, image_format, extension, _ = _handle_flate( - size, - data, - mode, - color_space, - colors, - obj_as_text, - ) - elif lfilters in (FT.LZW_DECODE, FT.ASCII_85_DECODE): - # I'm not sure if the following logic is correct. 
- # There might not be any relationship between the filters and the - # extension - if lfilters == FT.LZW_DECODE: - image_format = "TIFF" - extension = ".tiff" # mime_type = "image/tiff" - else: - image_format = "PNG" - extension = ".png" # mime_type = "image/png" - try: - img = Image.open(BytesIO(data), formats=("TIFF", "PNG")) - except UnidentifiedImageError: - img = _extended_image_frombytes(mode, size, data) - elif lfilters == FT.DCT_DECODE: - img, image_format, extension = Image.open(BytesIO(data)), "JPEG", ".jpg" - # invert_color kept unchanged - elif lfilters == FT.JPX_DECODE: - img, image_format, extension, invert_color = _handle_jpx( - size, data, mode, color_space, colors - ) - elif lfilters == FT.CCITT_FAX_DECODE: - img, image_format, extension, invert_color = ( - Image.open(BytesIO(data), formats=("TIFF",)), - "TIFF", - ".tiff", - False, - ) - elif lfilters == FT.JBIG2_DECODE: - img, image_format, extension, invert_color = ( - Image.open(BytesIO(data), formats=("PNG",)), - "PNG", - ".png", - False, - ) - elif mode == "CMYK": - img, image_format, extension, invert_color = ( - _extended_image_frombytes(mode, size, data), - "TIFF", - ".tif", - False, - ) - elif mode == "": - raise PdfReadError(f"ColorSpace field not found in {x_object}") - else: - img, image_format, extension, invert_color = ( - _extended_image_frombytes(mode, size, data), - "PNG", - ".png", - False, - ) - - img = _apply_decode(img, x_object, lfilters, color_space, invert_color) - img, extension, image_format = _apply_alpha( - img, x_object, obj_as_text, image_format, extension - ) - - # Save image to bytes - img_byte_arr = BytesIO() - try: - img.save(img_byte_arr, format=image_format) - except OSError: # pragma: no cover # covered with pillow 10.3 - # in case of we convert to RGBA and then to PNG - img1 = img.convert("RGBA") - image_format = "PNG" - extension = ".png" - img_byte_arr = BytesIO() - img1.save(img_byte_arr, format=image_format) - data = img_byte_arr.getvalue() - - try: # 
temporary try/except until other fixes of images - img = Image.open(BytesIO(data)) - except Exception as exception: - logger_warning(f"Failed loading image: {exception}", __name__) - img = None # type: ignore - return extension, data, img From 60b632f04ab7077a81c60d5b15ba1a9331630005 Mon Sep 17 00:00:00 2001 From: j-t-1 <120829237+j-t-1@users.noreply.github.com> Date: Wed, 13 Aug 2025 12:35:29 +0000 Subject: [PATCH 02/16] Fix errors --- pypdf/_xobj_image_helpers.py | 2 +- pypdf/filters.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/pypdf/_xobj_image_helpers.py b/pypdf/_xobj_image_helpers.py index b85984d24..238b9b9ce 100644 --- a/pypdf/_xobj_image_helpers.py +++ b/pypdf/_xobj_image_helpers.py @@ -26,7 +26,7 @@ try: - from PIL import Image, UnidentifiedImageError # noqa: F401 + from PIL import Image, UnidentifiedImageError except ImportError: raise ImportError( "pillow is required to do image extraction. " diff --git a/pypdf/filters.py b/pypdf/filters.py index 795abfc77..b4cc89152 100644 --- a/pypdf/filters.py +++ b/pypdf/filters.py @@ -43,7 +43,6 @@ import zlib from base64 import a85decode from dataclasses import dataclass -from io import BytesIO from pathlib import Path from tempfile import TemporaryDirectory from typing import Any, Optional, Union, cast From b86372eb004219bf23a740df5688393b57d86e45 Mon Sep 17 00:00:00 2001 From: j-t-1 <120829237+j-t-1@users.noreply.github.com> Date: Wed, 13 Aug 2025 12:38:57 +0000 Subject: [PATCH 03/16] Fix error --- pypdf/_xobj_image_helpers.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/pypdf/_xobj_image_helpers.py b/pypdf/_xobj_image_helpers.py index 238b9b9ce..5695ab840 100644 --- a/pypdf/_xobj_image_helpers.py +++ b/pypdf/_xobj_image_helpers.py @@ -5,10 +5,12 @@ from typing import Any, Literal, Optional, Union, cast from ._utils import check_if_whitespace_only, logger_warning -from .constants import ColorSpaces -from .constants import FilterTypes as FT -from .constants 
import ImageAttributes as IA -from .constants import StreamAttributes +from .constants import ( + ColorSpaces, + FilterTypes as FT, + ImageAttributes as IA, + StreamAttributes, +) from .errors import EmptyImageDataError, PdfReadError from .generic import ( ArrayObject, From 6bba029aa0a20e35f6feeee7a1ea9523baf904c2 Mon Sep 17 00:00:00 2001 From: j-t-1 <120829237+j-t-1@users.noreply.github.com> Date: Wed, 13 Aug 2025 12:43:47 +0000 Subject: [PATCH 04/16] Fix import block is un-sorted or unformatted --- pypdf/_xobj_image_helpers.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pypdf/_xobj_image_helpers.py b/pypdf/_xobj_image_helpers.py index 5695ab840..4075485aa 100644 --- a/pypdf/_xobj_image_helpers.py +++ b/pypdf/_xobj_image_helpers.py @@ -7,9 +7,13 @@ from ._utils import check_if_whitespace_only, logger_warning from .constants import ( ColorSpaces, + StreamAttributes, +) +from .constants import ( FilterTypes as FT, +) +from .constants import ( ImageAttributes as IA, - StreamAttributes, ) from .errors import EmptyImageDataError, PdfReadError from .generic import ( From 2ab5b96f7f2df3d02630f0ab5a45b373f425f657 Mon Sep 17 00:00:00 2001 From: j-t-1 <120829237+j-t-1@users.noreply.github.com> Date: Wed, 13 Aug 2025 12:52:21 +0000 Subject: [PATCH 05/16] Fix errors --- pypdf/_page.py | 4 ++-- pypdf/generic/_data_structures.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index e7b47882c..d438eaf78 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -64,7 +64,7 @@ from .constants import PageAttributes as PG from .constants import Resources as RES from .errors import PageSizeNotDefinedError, PdfReadError -from .filters import _xobj_to_image +from ._xobj_image_helpers import _xobj_to_image from .generic import ( ArrayObject, ContentStream, @@ -374,7 +374,7 @@ def replace(self, new_image: Image, **kwargs: Any) -> None: from ._reader import PdfReader # noqa: PLC0415 # to prevent circular import - 
from .filters import _xobj_to_image # noqa: PLC0415 + from ._xobj_image_helpers import _xobj_to_image # noqa: PLC0415 from .generic import DictionaryObject, PdfObject # noqa: PLC0415 if self.indirect_reference is None: diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py index 198d15443..8fffd7228 100644 --- a/pypdf/generic/_data_structures.py +++ b/pypdf/generic/_data_structures.py @@ -1058,7 +1058,7 @@ def decode_as_image(self) -> Any: stops in your program. """ - from ..filters import _xobj_to_image # noqa: PLC0415 + from .._xobj_image_helpers import _xobj_to_image # noqa: PLC0415 if self.get("/Subtype", "") != "/Image": try: From eb1a757d5824acec75d616f1c2f55e97f47d4c90 Mon Sep 17 00:00:00 2001 From: j-t-1 <120829237+j-t-1@users.noreply.github.com> Date: Wed, 13 Aug 2025 12:55:27 +0000 Subject: [PATCH 06/16] Fix error --- pypdf/_page.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index d438eaf78..08e7cc804 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -58,13 +58,13 @@ logger_warning, matrix_multiply, ) +from ._xobj_image_helpers import _xobj_to_image from .constants import _INLINE_IMAGE_KEY_MAPPING, _INLINE_IMAGE_VALUE_MAPPING from .constants import AnnotationDictionaryAttributes as ADA from .constants import ImageAttributes as IA from .constants import PageAttributes as PG from .constants import Resources as RES from .errors import PageSizeNotDefinedError, PdfReadError -from ._xobj_image_helpers import _xobj_to_image from .generic import ( ArrayObject, ContentStream, From 1ca1f7916dbbd4666cb1ff26c42c32c7932859a1 Mon Sep 17 00:00:00 2001 From: j-t-1 <120829237+j-t-1@users.noreply.github.com> Date: Wed, 13 Aug 2025 17:21:21 +0100 Subject: [PATCH 07/16] Import locally --- pypdf/_xobj_image_helpers.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pypdf/_xobj_image_helpers.py b/pypdf/_xobj_image_helpers.py index 4075485aa..6e77aedc0 100644 --- 
a/pypdf/_xobj_image_helpers.py +++ b/pypdf/_xobj_image_helpers.py @@ -415,6 +415,11 @@ def _xobj_to_image(x_object: dict[str, Any]) -> tuple[Optional[str], bytes, Any] Tuple[file extension, bytes, PIL.Image.Image] """ + from ._xobj_image_helpers import ( # noqa: PLC0415 + Image, + UnidentifiedImageError, + ) + def _apply_alpha( img: Image.Image, x_object: dict[str, Any], From cc12bdaf546140332d417771f2fb09e42e59bb25 Mon Sep 17 00:00:00 2001 From: j-t-1 <120829237+j-t-1@users.noreply.github.com> Date: Wed, 13 Aug 2025 17:23:30 +0100 Subject: [PATCH 08/16] Fix error --- pypdf/_xobj_image_helpers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypdf/_xobj_image_helpers.py b/pypdf/_xobj_image_helpers.py index 6e77aedc0..4bc2a58b4 100644 --- a/pypdf/_xobj_image_helpers.py +++ b/pypdf/_xobj_image_helpers.py @@ -415,7 +415,7 @@ def _xobj_to_image(x_object: dict[str, Any]) -> tuple[Optional[str], bytes, Any] Tuple[file extension, bytes, PIL.Image.Image] """ - from ._xobj_image_helpers import ( # noqa: PLC0415 + from ._xobj_image_helpers import ( # noqa: PLC0415 Image, UnidentifiedImageError, ) From d5e21fbe1ab51ae096e0527c2a47c23f4e2ade66 Mon Sep 17 00:00:00 2001 From: j-t-1 <120829237+j-t-1@users.noreply.github.com> Date: Sun, 17 Aug 2025 11:44:02 +0000 Subject: [PATCH 09/16] Remove local import and change docstring --- pypdf/_xobj_image_helpers.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/pypdf/_xobj_image_helpers.py b/pypdf/_xobj_image_helpers.py index 4bc2a58b4..7fb45990d 100644 --- a/pypdf/_xobj_image_helpers.py +++ b/pypdf/_xobj_image_helpers.py @@ -1,4 +1,4 @@ -"""Code in here is only used by pypdf.filters._xobj_to_image""" +"""Functions to convert an image XObject to an image""" import sys from io import BytesIO @@ -405,9 +405,6 @@ def _xobj_to_image(x_object: dict[str, Any]) -> tuple[Optional[str], bytes, Any] """ Users need to have the pillow package installed. 
- It's unclear if pypdf will keep this function here, hence it's private. - It might get removed at any point. - Args: x_object: @@ -415,11 +412,6 @@ def _xobj_to_image(x_object: dict[str, Any]) -> tuple[Optional[str], bytes, Any] Tuple[file extension, bytes, PIL.Image.Image] """ - from ._xobj_image_helpers import ( # noqa: PLC0415 - Image, - UnidentifiedImageError, - ) - def _apply_alpha( img: Image.Image, x_object: dict[str, Any], From 332107451dae59537cb36cd9a7da38541e8a1405 Mon Sep 17 00:00:00 2001 From: j-t-1 <120829237+j-t-1@users.noreply.github.com> Date: Sun, 17 Aug 2025 15:51:28 +0100 Subject: [PATCH 10/16] Fix error --- pypdf/_xobj_image_helpers.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pypdf/_xobj_image_helpers.py b/pypdf/_xobj_image_helpers.py index 7fb45990d..1fab6f7bc 100644 --- a/pypdf/_xobj_image_helpers.py +++ b/pypdf/_xobj_image_helpers.py @@ -412,6 +412,16 @@ def _xobj_to_image(x_object: dict[str, Any]) -> tuple[Optional[str], bytes, Any] Tuple[file extension, bytes, PIL.Image.Image] """ + from ._xobj_image_helpers import ( # noqa: PLC0415 + Image, + UnidentifiedImageError, + _apply_decode, + _extended_image_frombytes, + _get_mode_and_invert_color, + _handle_flate, + _handle_jpx, + ) + def _apply_alpha( img: Image.Image, x_object: dict[str, Any], From 14baaf26a8662a5d2ea57dc22beccf7a7208b73f Mon Sep 17 00:00:00 2001 From: j-t-1 <120829237+j-t-1@users.noreply.github.com> Date: Sun, 17 Aug 2025 15:54:25 +0100 Subject: [PATCH 11/16] Fix error --- pypdf/_xobj_image_helpers.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/pypdf/_xobj_image_helpers.py b/pypdf/_xobj_image_helpers.py index 1fab6f7bc..12dc4eafd 100644 --- a/pypdf/_xobj_image_helpers.py +++ b/pypdf/_xobj_image_helpers.py @@ -30,15 +30,6 @@ else: from typing_extensions import TypeAlias - -try: - from PIL import Image, UnidentifiedImageError -except ImportError: - raise ImportError( - "pillow is required to do image extraction. 
" - "It can be installed via 'pip install pypdf[image]'" - ) - mode_str_type: TypeAlias = Literal[ "", "1", "RGB", "2bits", "4bits", "P", "L", "RGBA", "CMYK" ] From db27af7291be238782030cd41818bd8a102d83b7 Mon Sep 17 00:00:00 2001 From: j-t-1 <120829237+j-t-1@users.noreply.github.com> Date: Tue, 26 Aug 2025 12:56:53 +0100 Subject: [PATCH 12/16] Fix error --- pypdf/_xobj_image_helpers.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pypdf/_xobj_image_helpers.py b/pypdf/_xobj_image_helpers.py index 12dc4eafd..5f8faa52d 100644 --- a/pypdf/_xobj_image_helpers.py +++ b/pypdf/_xobj_image_helpers.py @@ -30,6 +30,14 @@ else: from typing_extensions import TypeAlias +try: + from PIL import Image, UnidentifiedImageError # noqa: F401 +except ImportError: + raise ImportError( + "pillow is required to do image extraction. " + "It can be installed via 'pip install pypdf[image]'" + ) + mode_str_type: TypeAlias = Literal[ "", "1", "RGB", "2bits", "4bits", "P", "L", "RGBA", "CMYK" ] From 7fad762fcb14de2ded273babc53ab5e37ea48a26 Mon Sep 17 00:00:00 2001 From: j-t-1 <120829237+j-t-1@users.noreply.github.com> Date: Tue, 23 Sep 2025 07:35:28 +0000 Subject: [PATCH 13/16] Remove import --- pypdf/_page.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index 08e7cc804..e4d16c159 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -58,7 +58,6 @@ logger_warning, matrix_multiply, ) -from ._xobj_image_helpers import _xobj_to_image from .constants import _INLINE_IMAGE_KEY_MAPPING, _INLINE_IMAGE_VALUE_MAPPING from .constants import AnnotationDictionaryAttributes as ADA from .constants import ImageAttributes as IA From 002c37853e90ecee43149a7177aea7aa45b3da81 Mon Sep 17 00:00:00 2001 From: j-t-1 <120829237+j-t-1@users.noreply.github.com> Date: Tue, 23 Sep 2025 07:43:12 +0000 Subject: [PATCH 14/16] Import locally --- pypdf/_page.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pypdf/_page.py b/pypdf/_page.py index e4d16c159..4a022ab8f 100644 --- 
a/pypdf/_page.py +++ b/pypdf/_page.py @@ -645,6 +645,7 @@ def _get_image( raise KeyError("No inline image can be found") return self.inline_images[id] + from ._xobj_image_helpers import _xobj_to_image # noqa: PLC0415 imgd = _xobj_to_image(cast(DictionaryObject, xobjs[id])) extension, byte_stream = imgd[:2] return ImageFile( @@ -748,6 +749,7 @@ def _get_inline_images(self) -> dict[str, ImageFile]: if k not in init: init[k] = v ii["object"] = EncodedStreamObject.initialize_from_dictionary(init) + from ._xobj_image_helpers import _xobj_to_image # noqa: PLC0415 extension, byte_stream, img = _xobj_to_image(ii["object"]) files[f"~{num}~"] = ImageFile( name=f"~{num}~{extension}", From a46017ca7edf17eefe6c8b1ba60c5ab80f43acfc Mon Sep 17 00:00:00 2001 From: j-t-1 <120829237+j-t-1@users.noreply.github.com> Date: Tue, 23 Sep 2025 09:31:36 +0000 Subject: [PATCH 15/16] Small changes --- pypdf/_xobj_image_helpers.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pypdf/_xobj_image_helpers.py b/pypdf/_xobj_image_helpers.py index 5f8faa52d..23c77b953 100644 --- a/pypdf/_xobj_image_helpers.py +++ b/pypdf/_xobj_image_helpers.py @@ -193,7 +193,7 @@ def _handle_flate( Process image encoded in flateEncode Returns img, image_format, extension, color inversion """ - extension = ".png" # mime_type = "image/png" + extension = ".png" # mime_type: "image/png" image_format = "PNG" lookup: Any base: Any @@ -303,7 +303,7 @@ def _handle_jpx( Process image encoded in flateEncode Returns img, image_format, extension, inversion """ - extension = ".jp2" # mime_type = "image/x-jp2" + extension = ".jp2" # mime_type: "image/x-jp2" img1 = Image.open(BytesIO(data), formats=("JPEG2000",)) mode, invert_color = _get_imagemode(color_space, colors, mode) if mode == "": @@ -321,7 +321,7 @@ def _handle_jpx( img = Image.frombytes(mode, img1.size, img1.tobytes()) else: # pragma: no cover img = img1.convert(mode) - # for CMYK conversion : + # CMYK conversion: # 
https://stackoverflow.com/questions/38855022/conversion-from-cmyk-to-rgb-with-pillow-is-different-from-that-of-photoshop # not implemented for the moment as I need to get properly the ICC if img.mode == "CMYK": @@ -502,10 +502,10 @@ def _apply_alpha( # extension if lfilters == FT.LZW_DECODE: image_format = "TIFF" - extension = ".tiff" # mime_type = "image/tiff" + extension = ".tiff" # mime_type: "image/tiff" else: image_format = "PNG" - extension = ".png" # mime_type = "image/png" + extension = ".png" # mime_type: "image/png" try: img = Image.open(BytesIO(data), formats=("TIFF", "PNG")) except UnidentifiedImageError: From a13eac66a4f5be18611e342df9e729c58d64c19e Mon Sep 17 00:00:00 2001 From: j-t-1 <120829237+j-t-1@users.noreply.github.com> Date: Tue, 23 Sep 2025 14:39:51 +0000 Subject: [PATCH 16/16] Remove local import --- pypdf/_xobj_image_helpers.py | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/pypdf/_xobj_image_helpers.py b/pypdf/_xobj_image_helpers.py index 23c77b953..cba428109 100644 --- a/pypdf/_xobj_image_helpers.py +++ b/pypdf/_xobj_image_helpers.py @@ -31,7 +31,7 @@ from typing_extensions import TypeAlias try: - from PIL import Image, UnidentifiedImageError # noqa: F401 + from PIL import Image, UnidentifiedImageError except ImportError: raise ImportError( "pillow is required to do image extraction. " @@ -411,16 +411,6 @@ def _xobj_to_image(x_object: dict[str, Any]) -> tuple[Optional[str], bytes, Any] Tuple[file extension, bytes, PIL.Image.Image] """ - from ._xobj_image_helpers import ( # noqa: PLC0415 - Image, - UnidentifiedImageError, - _apply_decode, - _extended_image_frombytes, - _get_mode_and_invert_color, - _handle_flate, - _handle_jpx, - ) - def _apply_alpha( img: Image.Image, x_object: dict[str, Any],