diff --git a/CHANGELOG.md b/CHANGELOG.md index e943dbdbb..1e63a220d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ All notable changes to PDFOxide are documented here. ### Fixed +- **Watermark annotations rendered as nothing in compliant viewers** — a watermark's `/AP` appearance was serialized as a stream nested *directly* inside the annotation dictionary (`/AP <> stream … endstream>>`). A PDF stream must be an indirect object (ISO 32000-1:2008 §7.3.8); the inline form is invalid, so spec-compliant readers (e.g. MuPDF/PyMuPDF) rejected the annotation with "invalid key in dict" and the watermark never appeared — even though the bytes were present in the file. A shared `hoist_appearance_streams` helper now lifts nested `/N`, `/D`, and `/R` appearance streams (including named-state sub-dictionaries) into freshly allocated indirect objects and replaces the slot with a reference, applied on both the `DocumentBuilder` writer and the existing-page `DocumentEditor::save_page` paths. Verified end-to-end with MuPDF: the watermark now parses and renders on both paths. - **Fixed Python type stubs leaking the pyo3 `Py` receiver as a positional parameter** — methods implemented in Rust with a by-value receiver (`fn page(slf_handle: Py, …)` — the idiom pyo3 uses to hand a method an owned handle to its own instance) were emitted by the rylai stub generator with that receiver re-exposed *alongside* the injected `self`. ## [0.3.63] - 2026-06-09 diff --git a/src/editor/document_editor.rs b/src/editor/document_editor.rs index b9f02e04e..fce5cc3bf 100644 --- a/src/editor/document_editor.rs +++ b/src/editor/document_editor.rs @@ -10,7 +10,7 @@ use crate::error::{Error, Result}; use crate::extractors::HierarchicalExtractor; use crate::geometry::Rect; use crate::object::{Object, ObjectRef}; -use crate::writer::{ContentStreamBuilder, ObjectSerializer}; +use crate::writer::{hoist_appearance_streams, ContentStreamBuilder, ObjectSerializer}; use std::collections::{HashMap, HashSet}; use std::fs::File; #[cfg(not(target_arch = "wasm32"))] @@ -3375,34 +3375,58 @@ impl DocumentEditor { // Get page refs for building annotations (needed for link destinations) let page_refs = self.get_page_refs().unwrap_or_default(); - if let Some(annotations) = - self.modified_annotations.get(&source_page_index) - { - let new_annotations: Vec<_> = - annotations.iter().filter(|a| a.is_new()).collect(); - - for (annot_id, annot_wrapper) in - new_annotation_ids.iter().zip(new_annotations.iter()) + // Build the annotation dictionaries first, in + // a scope that releases the `modified_annotations` + // borrow before we allocate ids / mutate `self`. + let built_annots: Vec<(u32, HashMap)> = + if let Some(annotations) = + self.modified_annotations.get(&source_page_index) { - if let Some(writer_annot) = - annot_wrapper.writer_annotation() - { - // Build the annotation dictionary - let annot_dict = writer_annot.build(&page_refs); + new_annotation_ids + .iter() + .zip(annotations.iter().filter(|a| a.is_new())) + .filter_map(|(annot_id, annot_wrapper)| { + annot_wrapper + .writer_annotation() + .map(|wa| (*annot_id, wa.build(&page_refs))) + }) + .collect() + } else { + Vec::new() + }; - // Write the annotation object - let offset = writer.stream_position()?; - let bytes = serialize_obj( - &serializer, - *annot_id, - 0, - &Object::Dictionary(annot_dict), - &encryption_handler, - ); - writer.write_all(&bytes)?; - xref_entries.push((*annot_id, offset, 0, true)); - } + for (annot_id, mut annot_dict) in built_annots { + // Hoist any inline appearance stream (e.g. a + // watermark's `/AP /N`) into its own indirect + // object; a stream may not be a direct dict value. + let hoisted = hoist_appearance_streams( + &mut annot_dict, + &mut self.next_object_id, + ); + for (stream_id, stream_obj) in hoisted { + let offset = writer.stream_position()?; + let bytes = serialize_obj( + &serializer, + stream_id, + 0, + &stream_obj, + &encryption_handler, + ); + writer.write_all(&bytes)?; + xref_entries.push((stream_id, offset, 0, true)); } + + // Write the annotation object + let offset = writer.stream_position()?; + let bytes = serialize_obj( + &serializer, + annot_id, + 0, + &Object::Dictionary(annot_dict), + &encryption_handler, + ); + writer.write_all(&bytes)?; + xref_entries.push((annot_id, offset, 0, true)); } } diff --git a/src/writer/document_builder.rs b/src/writer/document_builder.rs index 369243f5c..27e4b6a8d 100644 --- a/src/writer/document_builder.rs +++ b/src/writer/document_builder.rs @@ -3716,6 +3716,43 @@ mod tests { assert!(content.contains("%%EOF")); } + #[test] + fn test_watermark_appearance_is_indirect_not_inline() { + // Regression: a watermark's `/AP /N` appearance stream must be an + // indirect object (`/N 0 R`), not an inline stream embedded in + // the annotation dict (which is invalid PDF and renders to nothing). + let mut builder = DocumentBuilder::new(); + let wm = crate::writer::WatermarkAnnotation::new("CONFIDENTIAL") + .with_rect(crate::geometry::Rect::new(50.0, 50.0, 200.0, 400.0)); + builder + .letter_page() + .paragraph("body") + .watermark_custom(wm) + .done(); + + let bytes = builder.build().unwrap(); + let content = String::from_utf8_lossy(&bytes); + + assert!(content.contains("/Subtype /Watermark"), "watermark missing"); + // `/N` references an indirect object... + let re = regex_lite_n_ref(&content); + assert!(re, "appearance `/N` is not an indirect reference: not found"); + // ...and the appearance dict is never opened inline after `/N`. + assert!( + !content.contains("/N <<"), + "inline appearance stream leaked into annotation dict" + ); + } + + // Minimal check for `/N 0 R` without pulling in a regex crate. + fn regex_lite_n_ref(s: &str) -> bool { + s.match_indices("/N ").any(|(i, _)| { + let rest = &s[i + 3..]; + let digits: String = rest.chars().take_while(|c| c.is_ascii_digit()).collect(); + !digits.is_empty() && rest[digits.len()..].starts_with(" 0 R") + }) + } + #[test] fn test_document_builder_with_metadata() { let mut builder = DocumentBuilder::new().metadata( diff --git a/src/writer/mod.rs b/src/writer/mod.rs index 93bffd34a..67cc9ad9d 100644 --- a/src/writer/mod.rs +++ b/src/writer/mod.rs @@ -131,6 +131,7 @@ pub use linearization::{ SharedObjectHeader, }; pub use movie::{MovieActivation, MovieAnnotation, MovieData, MoviePlayMode}; +pub(crate) use object_serializer::hoist_appearance_streams; pub use object_serializer::ObjectSerializer; pub use outline_builder::{ FitMode, OutlineBuildResult, OutlineBuilder, OutlineDestination, OutlineItem, OutlineStyle, diff --git a/src/writer/object_serializer.rs b/src/writer/object_serializer.rs index 0935f5022..4775ed3d6 100644 --- a/src/writer/object_serializer.rs +++ b/src/writer/object_serializer.rs @@ -404,6 +404,54 @@ impl ObjectSerializer { } } +/// Hoist inline appearance streams out of an annotation dictionary into +/// indirect objects, as the PDF spec requires. +/// +/// An annotation's `/AP` appearance can carry stream objects under `/N`, +/// `/D` and `/R` (each either a single stream or a sub-dictionary of named +/// appearance states). Some builders construct these as a direct +/// [`Object::Stream`] nested inside the dictionary — which serializes to an +/// illegal inline `<< … >> stream … endstream` *inside* the annotation dict. +/// A stream **must** be an indirect object (ISO 32000-1 §7.3.8), so this +/// replaces every nested stream with an [`Object::Reference`] to a freshly +/// allocated id (drawn from `next_id`, which is advanced) and returns the +/// `(id, stream)` pairs for the caller to write as separate indirect objects. +/// +/// Returns an empty vector when there is nothing to hoist, so it is safe to +/// call on every annotation dictionary unconditionally. +pub(crate) fn hoist_appearance_streams( + annot_dict: &mut HashMap, + next_id: &mut u32, +) -> Vec<(u32, Object)> { + let mut extracted = Vec::new(); + let mut hoist = |slot: &mut Object, out: &mut Vec<(u32, Object)>| { + if matches!(slot, Object::Stream { .. }) { + let id = *next_id; + *next_id += 1; + let stream = std::mem::replace(slot, Object::Reference(ObjectRef::new(id, 0))); + out.push((id, stream)); + } + }; + + if let Some(Object::Dictionary(ap)) = annot_dict.get_mut("AP") { + for key in ["N", "D", "R"] { + match ap.get_mut(key) { + Some(slot @ Object::Stream { .. }) => hoist(slot, &mut extracted), + // A sub-dictionary of named appearance states (e.g. a + // checkbox's `/On` and `/Off`); hoist each state's stream. + Some(Object::Dictionary(states)) => { + for state in states.values_mut() { + hoist(state, &mut extracted); + } + }, + _ => {}, + } + } + } + + extracted +} + #[cfg(test)] mod tests { use super::*; @@ -414,6 +462,82 @@ mod tests { assert_eq!(s.serialize_to_string(&Object::Null), "null"); } + fn stream(tag: &str) -> Object { + let mut d = HashMap::new(); + d.insert("Subtype".to_string(), Object::Name("Form".to_string())); + Object::Stream { + dict: d, + data: bytes::Bytes::from(tag.to_string()), + } + } + + #[test] + fn test_hoist_appearance_stream_replaces_inline_n_with_reference() { + // An annotation whose `/AP /N` is an inline stream must come back with + // `/N` rewritten to an indirect reference and the stream returned for + // separate writing -- a stream is illegal as a direct dict value. + let mut ap = HashMap::new(); + ap.insert("N".to_string(), stream("appearance")); + let mut annot = HashMap::new(); + annot.insert("Subtype".to_string(), Object::Name("Watermark".to_string())); + annot.insert("AP".to_string(), Object::Dictionary(ap)); + + let mut next_id = 42; + let hoisted = hoist_appearance_streams(&mut annot, &mut next_id); + + assert_eq!(next_id, 43, "one id consumed"); + assert_eq!(hoisted.len(), 1); + assert_eq!(hoisted[0].0, 42); + assert!(matches!(hoisted[0].1, Object::Stream { .. })); + + let Object::Dictionary(ap) = &annot["AP"] else { + panic!("AP not a dict") + }; + assert_eq!(ap["N"], Object::Reference(ObjectRef::new(42, 0))); + + // And it must serialize as an indirect ref, never as an inline stream. + let rendered = ObjectSerializer::new().serialize_to_string(&annot["AP"]); + assert!(rendered.contains("42 0 R"), "got: {rendered}"); + assert!(!rendered.contains("stream"), "inline stream leaked: {rendered}"); + } + + #[test] + fn test_hoist_appearance_handles_named_state_substreams() { + // `/N` may be a dict of named states (e.g. checkbox /On, /Off); each + // state's stream must be hoisted independently. + let mut states = HashMap::new(); + states.insert("On".to_string(), stream("on")); + states.insert("Off".to_string(), stream("off")); + let mut ap = HashMap::new(); + ap.insert("N".to_string(), Object::Dictionary(states)); + let mut annot = HashMap::new(); + annot.insert("AP".to_string(), Object::Dictionary(ap)); + + let mut next_id = 100; + let hoisted = hoist_appearance_streams(&mut annot, &mut next_id); + + assert_eq!(hoisted.len(), 2); + assert_eq!(next_id, 102); + let Object::Dictionary(ap) = &annot["AP"] else { + panic!("AP not a dict") + }; + let Object::Dictionary(states) = &ap["N"] else { + panic!("N not a dict") + }; + assert!(matches!(states["On"], Object::Reference(_))); + assert!(matches!(states["Off"], Object::Reference(_))); + } + + #[test] + fn test_hoist_appearance_noop_without_streams() { + // No `/AP`, or an `/AP` without streams, must consume no ids. + let mut annot = HashMap::new(); + annot.insert("Subtype".to_string(), Object::Name("Link".to_string())); + let mut next_id = 7; + assert!(hoist_appearance_streams(&mut annot, &mut next_id).is_empty()); + assert_eq!(next_id, 7); + } + #[test] fn test_serialize_boolean() { let s = ObjectSerializer::new(); diff --git a/src/writer/pdf_writer.rs b/src/writer/pdf_writer.rs index 76d0a42c9..7fdc534bc 100644 --- a/src/writer/pdf_writer.rs +++ b/src/writer/pdf_writer.rs @@ -12,7 +12,7 @@ use super::form_fields::{ }; use super::freetext::FreeTextAnnotation; use super::ink::InkAnnotation; -use super::object_serializer::ObjectSerializer; +use super::object_serializer::{hoist_appearance_streams, ObjectSerializer}; use super::shape_annotations::{LineAnnotation, PolygonAnnotation, ShapeAnnotation}; use super::special_annotations::{ CaretAnnotation, FileAttachmentAnnotation, FileAttachmentIcon, PopupAnnotation, @@ -1984,6 +1984,19 @@ impl PdfWriter { output.extend_from_slice(&serializer.serialize_indirect(*obj_id, 0, obj)); } + // Hoist any inline appearance streams (e.g. a watermark's `/AP /N`) + // into their own indirect objects — a stream may not be a direct + // dictionary value. New ids come from `next_obj_id` and the extracted + // streams are appended so they get xref entries below. + let mut hoisted_ap_objects: Vec<(u32, Object)> = Vec::new(); + for (_, annot_obj) in annotation_objects.iter_mut() { + if let Object::Dictionary(annot_dict) = annot_obj { + hoisted_ap_objects + .extend(hoist_appearance_streams(annot_dict, &mut self.next_obj_id)); + } + } + annotation_objects.extend(hoisted_ap_objects); + // Annotation objects for (annot_id, annot_obj) in &annotation_objects { xref_offsets.push((*annot_id, output.len()));