Skip to content
5 changes: 3 additions & 2 deletions pypdf/_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,6 @@
from .constants import PageAttributes as PG
from .constants import Resources as RES
from .errors import PageSizeNotDefinedError, PdfReadError
from .filters import _xobj_to_image
from .generic import (
ArrayObject,
ContentStream,
Expand Down Expand Up @@ -374,7 +373,7 @@ def replace(self, new_image: Image, **kwargs: Any) -> None:
from ._reader import PdfReader # noqa: PLC0415

# to prevent circular import
from .filters import _xobj_to_image # noqa: PLC0415
from ._xobj_image_helpers import _xobj_to_image # noqa: PLC0415
from .generic import DictionaryObject, PdfObject # noqa: PLC0415

if self.indirect_reference is None:
Expand Down Expand Up @@ -646,6 +645,7 @@ def _get_image(
raise KeyError("No inline image can be found")
return self.inline_images[id]

from ._xobj_image_helpers import _xobj_to_image # noqa: PLC0415
imgd = _xobj_to_image(cast(DictionaryObject, xobjs[id]))
extension, byte_stream = imgd[:2]
return ImageFile(
Expand Down Expand Up @@ -749,6 +749,7 @@ def _get_inline_images(self) -> dict[str, ImageFile]:
if k not in init:
init[k] = v
ii["object"] = EncodedStreamObject.initialize_from_dictionary(init)
from ._xobj_image_helpers import _xobj_to_image # noqa: PLC0415
extension, byte_stream, img = _xobj_to_image(ii["object"])
files[f"~{num}~"] = ImageFile(
name=f"~{num}~{extension}",
Expand Down
190 changes: 180 additions & 10 deletions pypdf/_xobj_image_helpers.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,20 @@
"""Code in here is only used by pypdf.filters._xobj_to_image"""
"""Functions to convert an image XObject to an image"""

import sys
from io import BytesIO
from typing import Any, Literal, Union, cast
from typing import Any, Literal, Optional, Union, cast

from ._utils import check_if_whitespace_only, logger_warning
from .constants import ColorSpaces
from .constants import FilterTypes as FT
from .constants import ImageAttributes as IA
from .constants import (
ColorSpaces,
StreamAttributes,
)
from .constants import (
FilterTypes as FT,
)
from .constants import (
ImageAttributes as IA,
)
from .errors import EmptyImageDataError, PdfReadError
from .generic import (
ArrayObject,
Expand All @@ -23,9 +30,8 @@
else:
from typing_extensions import TypeAlias


try:
from PIL import Image, UnidentifiedImageError # noqa: F401
from PIL import Image, UnidentifiedImageError
except ImportError:
raise ImportError(
"pillow is required to do image extraction. "
Expand Down Expand Up @@ -187,7 +193,7 @@ def _handle_flate(
Process image encoded in flateEncode
Returns img, image_format, extension, color inversion
"""
extension = ".png" # mime_type = "image/png"
extension = ".png" # mime_type: "image/png"
image_format = "PNG"
lookup: Any
base: Any
Expand Down Expand Up @@ -297,7 +303,7 @@ def _handle_jpx(
Process image encoded in JPXDecode
Returns img, image_format, extension, inversion
"""
extension = ".jp2" # mime_type = "image/x-jp2"
extension = ".jp2" # mime_type: "image/x-jp2"
img1 = Image.open(BytesIO(data), formats=("JPEG2000",))
mode, invert_color = _get_imagemode(color_space, colors, mode)
if mode == "":
Expand All @@ -315,7 +321,7 @@ def _handle_jpx(
img = Image.frombytes(mode, img1.size, img1.tobytes())
else: # pragma: no cover
img = img1.convert(mode)
# for CMYK conversion :
# CMYK conversion:
# https://stackoverflow.com/questions/38855022/conversion-from-cmyk-to-rgb-with-pillow-is-different-from-that-of-photoshop
# not implemented for now, as the ICC profile needs to be retrieved properly first
if img.mode == "CMYK":
Expand Down Expand Up @@ -392,3 +398,167 @@ def _get_mode_and_invert_color(
"",
)
return mode, invert_color


def _xobj_to_image(x_object: dict[str, Any]) -> tuple[Optional[str], bytes, Any]:
    """
    Convert an image XObject into encoded image bytes and a PIL image.

    Users need to have the pillow package installed.

    Args:
        x_object: The image XObject. It must provide the width and height
            entries and a ``get_data()`` method returning the stream content.

    Returns:
        Tuple[file extension, bytes, PIL.Image.Image]. The last element is
        ``None`` if the encoded bytes could not be loaded again.

    Raises:
        PdfReadError: If no filter-specific handler applies and no image
            mode could be determined.

    """
    def _apply_alpha(
        img: Image.Image,
        x_object: dict[str, Any],
        obj_as_text: str,
        image_format: str,
        extension: str,
    ) -> tuple[Image.Image, str, str]:
        """Apply the soft mask, if any; return (image, extension, format)."""
        if IA.S_MASK not in x_object:
            return img, extension, image_format

        # add alpha channel
        alpha = _xobj_to_image(x_object[IA.S_MASK])[2]
        if alpha is None:
            # The recursive call returns None as image when the mask could
            # not be loaded; skip the mask instead of failing on alpha.size.
            logger_warning(
                f"mask could not be loaded, ignoring it: {obj_as_text}", __name__
            )
            return img, extension, image_format

        if img.size != alpha.size:
            logger_warning(
                f"image and mask size not matching: {obj_as_text}", __name__
            )
        else:
            # TODO: implement mask
            if alpha.mode != "L":
                alpha = alpha.convert("L")
            if img.mode == "P":
                img = img.convert("RGB")
            elif img.mode == "1":
                img = img.convert("L")
            img.putalpha(alpha)
        # Switch to an output format selected for images carrying a mask
        if "JPEG" in image_format:
            image_format = "JPEG2000"
            extension = ".jp2"
        else:
            image_format = "PNG"
            extension = ".png"
        return img, extension, image_format

    # For error reporting
    obj_as_text = repr(x_object)

    # Get size and data
    size = (cast(int, x_object[IA.WIDTH]), cast(int, x_object[IA.HEIGHT]))
    data = x_object.get_data()  # type: ignore
    if isinstance(data, str):  # pragma: no cover
        data = data.encode()
    # Drop a single trailing '\n' following the pixel data. The pixel count is
    # checked first so that a zero width/height cannot cause a division by zero.
    pixel_count = size[0] * size[1]
    if pixel_count and len(data) % pixel_count == 1 and data[-1] == 0x0A:
        data = data[:-1]

    # Get color properties
    colors = x_object.get("/Colors", 1)
    color_space: Any = x_object.get("/ColorSpace", NullObject()).get_object()
    if isinstance(color_space, list) and len(color_space) == 1:
        color_space = color_space[0].get_object()

    mode, invert_color = _get_mode_and_invert_color(x_object, colors, color_space)

    # Get filters; only the last filter of a filter list selects the handler
    filters = x_object.get(StreamAttributes.FILTER, NullObject()).get_object()
    lfilters = filters[-1] if isinstance(filters, list) else filters

    extension = None
    if lfilters in (FT.FLATE_DECODE, FT.RUN_LENGTH_DECODE):
        # The inversion flag returned by the handler is intentionally ignored
        img, image_format, extension, _ = _handle_flate(
            size,
            data,
            mode,
            color_space,
            colors,
            obj_as_text,
        )
    elif lfilters in (FT.LZW_DECODE, FT.ASCII_85_DECODE):
        # I'm not sure if the following logic is correct.
        # There might not be any relationship between the filters and the
        # extension
        if lfilters == FT.LZW_DECODE:
            image_format = "TIFF"
            extension = ".tiff"  # mime_type: "image/tiff"
        else:
            image_format = "PNG"
            extension = ".png"  # mime_type: "image/png"
        try:
            img = Image.open(BytesIO(data), formats=("TIFF", "PNG"))
        except UnidentifiedImageError:
            img = _extended_image_frombytes(mode, size, data)
    elif lfilters == FT.DCT_DECODE:
        img, image_format, extension = Image.open(BytesIO(data)), "JPEG", ".jpg"
        # invert_color kept unchanged
    elif lfilters == FT.JPX_DECODE:
        img, image_format, extension, invert_color = _handle_jpx(
            size, data, mode, color_space, colors
        )
    elif lfilters == FT.CCITT_FAX_DECODE:
        img, image_format, extension, invert_color = (
            Image.open(BytesIO(data), formats=("TIFF",)),
            "TIFF",
            ".tiff",
            False,
        )
    elif lfilters == FT.JBIG2_DECODE:
        img, image_format, extension, invert_color = (
            Image.open(BytesIO(data), formats=("PNG",)),
            "PNG",
            ".png",
            False,
        )
    elif mode == "CMYK":
        img, image_format, extension, invert_color = (
            _extended_image_frombytes(mode, size, data),
            "TIFF",
            ".tif",
            False,
        )
    elif mode == "":
        raise PdfReadError(f"ColorSpace field not found in {x_object}")
    else:
        img, image_format, extension, invert_color = (
            _extended_image_frombytes(mode, size, data),
            "PNG",
            ".png",
            False,
        )

    img = _apply_decode(img, x_object, lfilters, color_space, invert_color)
    img, extension, image_format = _apply_alpha(
        img, x_object, obj_as_text, image_format, extension
    )

    # Save image to bytes
    img_byte_arr = BytesIO()
    try:
        img.save(img_byte_arr, format=image_format)
    except OSError:  # pragma: no cover  # covered with pillow 10.3
        # Saving in the selected format failed: convert to RGBA and save as PNG
        img1 = img.convert("RGBA")
        image_format = "PNG"
        extension = ".png"
        img_byte_arr = BytesIO()
        img1.save(img_byte_arr, format=image_format)
    data = img_byte_arr.getvalue()

    try:  # temporary try/except until other fixes of images
        img = Image.open(BytesIO(data))
    except Exception as exception:
        # Best effort: the encoded bytes are still returned, without an image
        logger_warning(f"Failed loading image: {exception}", __name__)
        img = None  # type: ignore
    return extension, data, img
Loading
Loading