Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,12 @@

All notable changes to PDFOxide are documented here.

## [Unreleased]

### Fixed

- **Watermark annotations rendered as nothing in compliant viewers** — a watermark's `/AP` appearance was serialized as a stream nested *directly* inside the annotation dictionary (`/AP <</N <<…>> stream … endstream>>`). A PDF stream must be an indirect object (ISO 32000-1:2008 §7.3.8); the inline form is invalid, so spec-compliant readers (e.g. MuPDF/PyMuPDF) rejected the annotation with "invalid key in dict" and the watermark never appeared — even though the bytes were present in the file. A shared `hoist_appearance_streams` helper now lifts nested `/N`, `/D`, and `/R` appearance streams (including named-state sub-dictionaries) into freshly allocated indirect objects and replaces the slot with a reference, applied on both the `DocumentBuilder` writer and the existing-page `DocumentEditor::save_page` paths. Verified end-to-end with MuPDF: the watermark now parses and renders on both paths.

## [0.3.63] - 2026-06-09

> CJK extraction-quality fixes — vertical-CJK (tategaki) reading order no longer mis-fires on horizontal Japanese text, CJK glyphs no longer surface as Kangxi-radical codepoints, and Korean number spacing is preserved — plus recovery of dropped inter-word spaces on tightly-typeset PDFs, and routine dependency updates.
Expand Down
76 changes: 50 additions & 26 deletions src/editor/document_editor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ use crate::error::{Error, Result};
use crate::extractors::HierarchicalExtractor;
use crate::geometry::Rect;
use crate::object::{Object, ObjectRef};
use crate::writer::{ContentStreamBuilder, ObjectSerializer};
use crate::writer::{hoist_appearance_streams, ContentStreamBuilder, ObjectSerializer};
use std::collections::{HashMap, HashSet};
use std::fs::File;
#[cfg(not(target_arch = "wasm32"))]
Expand Down Expand Up @@ -3375,34 +3375,58 @@ impl DocumentEditor {
// Get page refs for building annotations (needed for link destinations)
let page_refs = self.get_page_refs().unwrap_or_default();

if let Some(annotations) =
self.modified_annotations.get(&source_page_index)
{
let new_annotations: Vec<_> =
annotations.iter().filter(|a| a.is_new()).collect();

for (annot_id, annot_wrapper) in
new_annotation_ids.iter().zip(new_annotations.iter())
// Build the annotation dictionaries first, in
// a scope that releases the `modified_annotations`
// borrow before we allocate ids / mutate `self`.
let built_annots: Vec<(u32, HashMap<String, Object>)> =
if let Some(annotations) =
self.modified_annotations.get(&source_page_index)
{
if let Some(writer_annot) =
annot_wrapper.writer_annotation()
{
// Build the annotation dictionary
let annot_dict = writer_annot.build(&page_refs);
new_annotation_ids
.iter()
.zip(annotations.iter().filter(|a| a.is_new()))
.filter_map(|(annot_id, annot_wrapper)| {
annot_wrapper
.writer_annotation()
.map(|wa| (*annot_id, wa.build(&page_refs)))
})
.collect()
} else {
Vec::new()
};

// Write the annotation object
let offset = writer.stream_position()?;
let bytes = serialize_obj(
&serializer,
*annot_id,
0,
&Object::Dictionary(annot_dict),
&encryption_handler,
);
writer.write_all(&bytes)?;
xref_entries.push((*annot_id, offset, 0, true));
}
for (annot_id, mut annot_dict) in built_annots {
// Hoist any inline appearance stream (e.g. a
// watermark's `/AP /N`) into its own indirect
// object; a stream may not be a direct dict value.
let hoisted = hoist_appearance_streams(
&mut annot_dict,
&mut self.next_object_id,
);
for (stream_id, stream_obj) in hoisted {
let offset = writer.stream_position()?;
let bytes = serialize_obj(
&serializer,
stream_id,
0,
&stream_obj,
&encryption_handler,
);
writer.write_all(&bytes)?;
xref_entries.push((stream_id, offset, 0, true));
}

// Write the annotation object
let offset = writer.stream_position()?;
let bytes = serialize_obj(
&serializer,
annot_id,
0,
&Object::Dictionary(annot_dict),
&encryption_handler,
);
writer.write_all(&bytes)?;
xref_entries.push((annot_id, offset, 0, true));
}
}

Expand Down
37 changes: 37 additions & 0 deletions src/writer/document_builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3716,6 +3716,43 @@ mod tests {
assert!(content.contains("%%EOF"));
}

#[test]
fn test_watermark_appearance_is_indirect_not_inline() {
// Regression: a watermark's `/AP /N` appearance stream must be an
// indirect object (`/N <id> 0 R`), not an inline stream embedded in
// the annotation dict (which is invalid PDF and renders to nothing).
let mut builder = DocumentBuilder::new();
let wm = crate::writer::WatermarkAnnotation::new("CONFIDENTIAL")
.with_rect(crate::geometry::Rect::new(50.0, 50.0, 200.0, 400.0));
builder
.letter_page()
.paragraph("body")
.watermark_custom(wm)
.done();

let bytes = builder.build().unwrap();
let content = String::from_utf8_lossy(&bytes);

assert!(content.contains("/Subtype /Watermark"), "watermark missing");
// `/N` references an indirect object...
let re = regex_lite_n_ref(&content);
assert!(re, "appearance `/N` is not an indirect reference: not found");
// ...and the appearance dict is never opened inline after `/N`.
assert!(
!content.contains("/N <<"),
"inline appearance stream leaked into annotation dict"
);
}

// Minimal check for `/N <digits> 0 R` without pulling in a regex crate.
fn regex_lite_n_ref(s: &str) -> bool {
s.match_indices("/N ").any(|(i, _)| {
let rest = &s[i + 3..];
let digits: String = rest.chars().take_while(|c| c.is_ascii_digit()).collect();
!digits.is_empty() && rest[digits.len()..].starts_with(" 0 R")
})
}

#[test]
fn test_document_builder_with_metadata() {
let mut builder = DocumentBuilder::new().metadata(
Expand Down
1 change: 1 addition & 0 deletions src/writer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,7 @@ pub use linearization::{
SharedObjectHeader,
};
pub use movie::{MovieActivation, MovieAnnotation, MovieData, MoviePlayMode};
pub(crate) use object_serializer::hoist_appearance_streams;
pub use object_serializer::ObjectSerializer;
pub use outline_builder::{
FitMode, OutlineBuildResult, OutlineBuilder, OutlineDestination, OutlineItem, OutlineStyle,
Expand Down
124 changes: 124 additions & 0 deletions src/writer/object_serializer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -404,6 +404,54 @@ impl ObjectSerializer {
}
}

/// Hoist inline appearance streams out of an annotation dictionary into
/// indirect objects, as the PDF spec requires.
///
/// An annotation's `/AP` appearance can carry stream objects under `/N`,
/// `/D` and `/R` (each either a single stream or a sub-dictionary of named
/// appearance states). Some builders construct these as a direct
/// [`Object::Stream`] nested inside the dictionary — which serializes to an
/// illegal inline `<< … >> stream … endstream` *inside* the annotation dict.
/// A stream **must** be an indirect object (ISO 32000-1 §7.3.8), so this
/// replaces every nested stream with an [`Object::Reference`] to a freshly
/// allocated id (drawn from `next_id`, which is advanced) and returns the
/// `(id, stream)` pairs for the caller to write as separate indirect objects.
///
/// Returns an empty vector when there is nothing to hoist, so it is safe to
/// call on every annotation dictionary unconditionally.
pub(crate) fn hoist_appearance_streams(
annot_dict: &mut HashMap<String, Object>,
next_id: &mut u32,
) -> Vec<(u32, Object)> {
let mut extracted = Vec::new();
let mut hoist = |slot: &mut Object, out: &mut Vec<(u32, Object)>| {
if matches!(slot, Object::Stream { .. }) {
let id = *next_id;
*next_id += 1;
let stream = std::mem::replace(slot, Object::Reference(ObjectRef::new(id, 0)));
out.push((id, stream));
}
};

if let Some(Object::Dictionary(ap)) = annot_dict.get_mut("AP") {
for key in ["N", "D", "R"] {
match ap.get_mut(key) {
Some(slot @ Object::Stream { .. }) => hoist(slot, &mut extracted),
// A sub-dictionary of named appearance states (e.g. a
// checkbox's `/On` and `/Off`); hoist each state's stream.
Some(Object::Dictionary(states)) => {
for state in states.values_mut() {
hoist(state, &mut extracted);
}
},
_ => {},
}
}
}

extracted
}

#[cfg(test)]
mod tests {
use super::*;
Expand All @@ -414,6 +462,82 @@ mod tests {
assert_eq!(s.serialize_to_string(&Object::Null), "null");
}

fn stream(tag: &str) -> Object {
let mut d = HashMap::new();
d.insert("Subtype".to_string(), Object::Name("Form".to_string()));
Object::Stream {
dict: d,
data: bytes::Bytes::from(tag.to_string()),
}
}

#[test]
fn test_hoist_appearance_stream_replaces_inline_n_with_reference() {
// An annotation whose `/AP /N` is an inline stream must come back with
// `/N` rewritten to an indirect reference and the stream returned for
// separate writing -- a stream is illegal as a direct dict value.
let mut ap = HashMap::new();
ap.insert("N".to_string(), stream("appearance"));
let mut annot = HashMap::new();
annot.insert("Subtype".to_string(), Object::Name("Watermark".to_string()));
annot.insert("AP".to_string(), Object::Dictionary(ap));

let mut next_id = 42;
let hoisted = hoist_appearance_streams(&mut annot, &mut next_id);

assert_eq!(next_id, 43, "one id consumed");
assert_eq!(hoisted.len(), 1);
assert_eq!(hoisted[0].0, 42);
assert!(matches!(hoisted[0].1, Object::Stream { .. }));

let Object::Dictionary(ap) = &annot["AP"] else {
panic!("AP not a dict")
};
assert_eq!(ap["N"], Object::Reference(ObjectRef::new(42, 0)));

// And it must serialize as an indirect ref, never as an inline stream.
let rendered = ObjectSerializer::new().serialize_to_string(&annot["AP"]);
assert!(rendered.contains("42 0 R"), "got: {rendered}");
assert!(!rendered.contains("stream"), "inline stream leaked: {rendered}");
}

#[test]
fn test_hoist_appearance_handles_named_state_substreams() {
// `/N` may be a dict of named states (e.g. checkbox /On, /Off); each
// state's stream must be hoisted independently.
let mut states = HashMap::new();
states.insert("On".to_string(), stream("on"));
states.insert("Off".to_string(), stream("off"));
let mut ap = HashMap::new();
ap.insert("N".to_string(), Object::Dictionary(states));
let mut annot = HashMap::new();
annot.insert("AP".to_string(), Object::Dictionary(ap));

let mut next_id = 100;
let hoisted = hoist_appearance_streams(&mut annot, &mut next_id);

assert_eq!(hoisted.len(), 2);
assert_eq!(next_id, 102);
let Object::Dictionary(ap) = &annot["AP"] else {
panic!("AP not a dict")
};
let Object::Dictionary(states) = &ap["N"] else {
panic!("N not a dict")
};
assert!(matches!(states["On"], Object::Reference(_)));
assert!(matches!(states["Off"], Object::Reference(_)));
}

#[test]
fn test_hoist_appearance_noop_without_streams() {
// No `/AP`, or an `/AP` without streams, must consume no ids.
let mut annot = HashMap::new();
annot.insert("Subtype".to_string(), Object::Name("Link".to_string()));
let mut next_id = 7;
assert!(hoist_appearance_streams(&mut annot, &mut next_id).is_empty());
assert_eq!(next_id, 7);
}

#[test]
fn test_serialize_boolean() {
let s = ObjectSerializer::new();
Expand Down
15 changes: 14 additions & 1 deletion src/writer/pdf_writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ use super::form_fields::{
};
use super::freetext::FreeTextAnnotation;
use super::ink::InkAnnotation;
use super::object_serializer::ObjectSerializer;
use super::object_serializer::{hoist_appearance_streams, ObjectSerializer};
use super::shape_annotations::{LineAnnotation, PolygonAnnotation, ShapeAnnotation};
use super::special_annotations::{
CaretAnnotation, FileAttachmentAnnotation, FileAttachmentIcon, PopupAnnotation,
Expand Down Expand Up @@ -1984,6 +1984,19 @@ impl PdfWriter {
output.extend_from_slice(&serializer.serialize_indirect(*obj_id, 0, obj));
}

// Hoist any inline appearance streams (e.g. a watermark's `/AP /N`)
// into their own indirect objects — a stream may not be a direct
// dictionary value. New ids come from `next_obj_id` and the extracted
// streams are appended so they get xref entries below.
let mut hoisted_ap_objects: Vec<(u32, Object)> = Vec::new();
for (_, annot_obj) in annotation_objects.iter_mut() {
if let Object::Dictionary(annot_dict) = annot_obj {
hoisted_ap_objects
.extend(hoist_appearance_streams(annot_dict, &mut self.next_obj_id));
}
}
annotation_objects.extend(hoisted_ap_objects);

// Annotation objects
for (annot_id, annot_obj) in &annotation_objects {
xref_offsets.push((*annot_id, output.len()));
Expand Down
Loading