Skip to content

Commit 499cd9d

Browse files
authored
BUG: Add font stack to q/Q operations in layout mode (#3225)
Closes #3212.
1 parent 9ee8a5a commit 499cd9d

File tree

3 files changed

+20
-1
lines changed

3 files changed

+20
-1
lines changed

CONTRIBUTORS.md

+1
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ history and [GitHub's 'Contributors' feature](https://github.com/py-pdf/pypdf/gr
2525
* [Gutteridge, David H.](https://github.com/dhgutteridge)
2626
* [Hale, Joseph](https://github.com/thehale)
2727
* [harshhes](https://github.com/harshhes)
28+
* [Jackowitz, Noah](https://github.com/hackowitz-af) | [LinkedIn](https://www.linkedin.com/in/noah-jackowitz/)
2829
* [JianzhengLuo](https://github.com/JianzhengLuo)
2930
* [Karvonen, Harry](https://github.com/Hatell/)
3031
* [King, Hunter](https://github.com/neversphere)

pypdf/_text_extraction/_layout_mode/_text_state_manager.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
"""manage the PDF transform stack during "layout" mode text extraction"""
22

33
from collections import ChainMap, Counter
4-
from typing import Any, Dict, List, MutableMapping, Union
4+
from typing import Any, Dict, List, MutableMapping, Tuple, Union
55
from typing import ChainMap as ChainMapType
66
from typing import Counter as CounterType
77

@@ -43,6 +43,7 @@ def __init__(self) -> None:
4343
self.Tz: float = 100.0
4444
self.TL: float = 0.0
4545
self.Ts: float = 0.0
46+
self.font_stack: List[Tuple[Union[Font, None], Union[int, float]]] = []
4647
self.font: Union[Font, None] = None
4748
self.font_size: Union[int, float] = 0
4849

@@ -167,6 +168,7 @@ def reset_trm(self) -> TextStateManagerChainMapType:
167168

168169
def remove_q(self) -> TextStateManagerChainMapType:
169170
"""Rewind to stack prior state after closing a 'q' with internal 'cm' ops"""
171+
self.font, self.font_size = self.font_stack.pop(-1)
170172
self.transform_stack = self.reset_tm()
171173
self.transform_stack.maps = self.transform_stack.maps[
172174
self.q_queue.pop(self.q_depth.pop(), 0) :
@@ -175,6 +177,7 @@ def remove_q(self) -> TextStateManagerChainMapType:
175177

176178
def add_q(self) -> None:
177179
"""Add another level to q_queue"""
180+
self.font_stack.append((self.font, self.font_size))
178181
self.q_depth.append(len(self.q_depth))
179182

180183
def add_cm(self, *args: Any) -> TextStateManagerChainMapType:

tests/test_text_extraction.py

+15
Original file line numberDiff line numberDiff line change
@@ -338,3 +338,18 @@ def test_iss3074():
338338
# pypdf.errors.PdfReadError: ZeroDivisionError: float division by zero
339339
txt = reader.pages[0].extract_text(extraction_mode="layout")
340340
assert txt.strip().startswith("AAAAAA")
341+
342+
343+
@pytest.mark.enable_socket
344+
def test_layout_mode_text_state():
345+
"""Ensure the text state is stored and reset with q/Q operators."""
346+
# Get the PDF from issue #3212
347+
url = "https://github.com/user-attachments/files/19396790/garbled.pdf"
348+
name = "garbled-font.pdf"
349+
reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
350+
# Get the txt from issue #3212 and normalize line endings
351+
txt_url = "https://github.com/user-attachments/files/19510731/garbled-font.layout.txt"
352+
txt_name = "garbled-font.layout.txt"
353+
expected = get_data_from_url(txt_url, name=txt_name).decode("utf-8").replace("\r\n", "\n")
354+
355+
assert expected == reader.pages[0].extract_text(extraction_mode="layout")

0 commit comments

Comments
 (0)