Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,14 @@

All notable changes to PDFOxide are documented here.

## [Unreleased]

### Fixed
- **#595** — Cross-document font cache poisoning when subset fonts
(e.g. `AAAAAA+Arial`) from different PDFs collide on BaseFont name.
The global cache now skips subset fonts entirely, and fonts without
a `BaseFont` entry default to `is_subset = true` as a fail-safe.

## [0.3.55] - 2026-05-25

> Ruby + PHP language bindings + multi-line heading reading-order fix
Expand Down
125 changes: 87 additions & 38 deletions src/document.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10687,17 +10687,30 @@ impl PdfDocument {
/// Uses only inline fields (no reference resolution / load_object calls) to keep
/// the cost at ~200ns. Relies on BaseFont + Subtype + Encoding (when inline) to
/// uniquely identify fonts within a document. For reference-only fields (ToUnicode,
/// FontDescriptor, DescendantFonts), hashes their presence to avoid false positives
/// between fonts with vs without these features.
fn font_identity_hash_cheap(font_obj: &Object) -> u64 {
/// FontDescriptor, DescendantFonts), hashes their values to avoid false positives
/// between fonts with vs without these features. Indirect references (Encoding,
/// ToUnicode, DescendantFonts) are hashed by object ID, not just presence.
/// Returns `(hash, is_subset)`.
///
/// Subset fonts (e.g. `AAAAAA+ArialUnicodeMS`) have document-specific
/// glyph subsets and ToUnicode mappings that are NOT safe to share across
/// documents even when the BaseFont name matches. The caller must skip
/// the global cross-document cache for subset fonts.
fn font_identity_hash_cheap(font_obj: &Object) -> (u64, bool) {
use std::hash::{Hash, Hasher};
let mut hasher = std::collections::hash_map::DefaultHasher::new();
let mut is_subset = true;

if let Some(d) = font_obj.as_dict() {
// BaseFont: primary identity — unique per font within a document
if let Some(Object::Name(n)) = d.get("BaseFont") {
1u8.hash(&mut hasher);
n.hash(&mut hasher);
// Detect subset prefix: 6 uppercase ASCII letters followed by '+'
let name_bytes = n.as_bytes();
is_subset = name_bytes.len() > 7
&& name_bytes[6] == b'+'
&& name_bytes[..6].iter().all(|b| b.is_ascii_uppercase());
}
// Subtype: Type1, TrueType, Type0, CIDFontType0, CIDFontType2
if let Some(Object::Name(n)) = d.get("Subtype") {
Expand All @@ -10709,12 +10722,16 @@ impl PdfDocument {
3u8.hash(&mut hasher);
match enc {
Object::Name(n) => n.hash(&mut hasher),
Object::Reference(_) => b"enc_ref".hash(&mut hasher),
Object::Reference(r) => {
b"enc_ref".hash(&mut hasher);
r.id.hash(&mut hasher);
r.gen.hash(&mut hasher);
},
Object::Dictionary(_) => b"enc_dict".hash(&mut hasher),
_ => {},
}
}
// ToUnicode: hash content via reference or inline presence
// ToUnicode: hash reference ID (unique within a document)
if let Some(to_unicode) = d.get("ToUnicode") {
4u8.hash(&mut hasher);
if let Some(r) = to_unicode.as_reference() {
Expand All @@ -10737,7 +10754,7 @@ impl PdfDocument {
}
}
}
hasher.finish()
(hasher.finish(), is_subset)
}

/// Load fonts from a Resources dictionary into the extractor.
Expand Down Expand Up @@ -10872,7 +10889,8 @@ impl PdfDocument {
if let Some(font_ref) = font_obj.as_reference() {
if let Ok(font) = self.load_object(font_ref) {
name.as_str().hash(&mut h);
Self::font_identity_hash_cheap(&font).hash(&mut h);
let (id_hash, _) = Self::font_identity_hash_cheap(&font);
id_hash.hash(&mut h);
}
}
}
Expand Down Expand Up @@ -10913,10 +10931,11 @@ impl PdfDocument {
let font = self.load_object(font_ref)?;

// Compute identity hash (cheap: 3-6 dict lookups, ~200ns)
let id_hash = Self::font_identity_hash_cheap(&font);
let (id_hash, is_subset) = Self::font_identity_hash_cheap(&font);

// Layer 5: Per-font identity cache — skip from_dict when a
// structurally identical font was already parsed elsewhere.
// structurally identical font was already parsed elsewhere
// in the SAME document.
let cached_identity_opt = self
.font_identity_cache
.lock_or_recover()
Expand All @@ -10932,27 +10951,34 @@ impl PdfDocument {

// Layer 6: Global cross-document font cache — reuse fonts
// parsed by previous PdfDocument instances in this process.
if let Some(cached) =
crate::fonts::global_cache::global_font_cache_get(id_hash)
{
self.font_identity_cache
.lock_or_recover()
.insert(id_hash, Arc::clone(&cached));
self.font_cache
.lock_or_recover()
.insert(font_ref, Arc::clone(&cached));
extractor.add_font_shared((*name).clone(), cached);
continue;
// Skip for subset fonts: their ToUnicode mappings are
// document-specific and unsafe to share across documents.
if !is_subset {
if let Some(cached) =
crate::fonts::global_cache::global_font_cache_get(id_hash)
{
self.font_identity_cache
.lock_or_recover()
.insert(id_hash, Arc::clone(&cached));
self.font_cache
.lock_or_recover()
.insert(font_ref, Arc::clone(&cached));
extractor.add_font_shared((*name).clone(), cached);
continue;
}
}

match FontInfo::from_dict(&font, self) {
Ok(font_info) => {
let arc = Arc::new(font_info);
// Populate both document-level and global caches
crate::fonts::global_cache::global_font_cache_insert(
id_hash,
Arc::clone(&arc),
);
// Global cache: only for non-subset fonts
if !is_subset {
crate::fonts::global_cache::global_font_cache_insert(
id_hash,
Arc::clone(&arc),
);
}
// Document-level caches: always populate
self.font_identity_cache
.lock_or_recover()
.insert(id_hash, Arc::clone(&arc));
Expand Down Expand Up @@ -11024,7 +11050,8 @@ impl PdfDocument {
if let Some(font_ref) = font_obj.as_reference() {
if let Ok(font) = self.load_object(font_ref) {
name.as_str().hash(&mut h);
Self::font_identity_hash_cheap(&font).hash(&mut h);
let (id_hash, _) = Self::font_identity_hash_cheap(&font);
id_hash.hash(&mut h);
}
}
}
Expand Down Expand Up @@ -15818,8 +15845,8 @@ mod tests {
dict2.insert("BaseFont".to_string(), Object::Name("Helvetica".to_string()));
dict2.insert("Subtype".to_string(), Object::Name("Type1".to_string()));

let hash1 = PdfDocument::font_identity_hash_cheap(&Object::Dictionary(dict1));
let hash2 = PdfDocument::font_identity_hash_cheap(&Object::Dictionary(dict2));
let (hash1, _) = PdfDocument::font_identity_hash_cheap(&Object::Dictionary(dict1));
let (hash2, _) = PdfDocument::font_identity_hash_cheap(&Object::Dictionary(dict2));
assert_eq!(hash1, hash2);
}

Expand All @@ -15831,16 +15858,16 @@ mod tests {
let mut dict2 = std::collections::HashMap::new();
dict2.insert("BaseFont".to_string(), Object::Name("Times-Roman".to_string()));

let hash1 = PdfDocument::font_identity_hash_cheap(&Object::Dictionary(dict1));
let hash2 = PdfDocument::font_identity_hash_cheap(&Object::Dictionary(dict2));
let (hash1, _) = PdfDocument::font_identity_hash_cheap(&Object::Dictionary(dict1));
let (hash2, _) = PdfDocument::font_identity_hash_cheap(&Object::Dictionary(dict2));
assert_ne!(hash1, hash2);
}

#[test]
fn test_font_identity_hash_null_object() {
    // A non-dictionary object must not panic. The function returns
    // `(hash, is_subset)`; only the subset flag is checked here.
    let (_, is_subset) = PdfDocument::font_identity_hash_cheap(&Object::Null);
    // No BaseFont key → is_subset defaults to true (fail-safe)
    assert!(is_subset);
}

// ========================================================================
Expand Down Expand Up @@ -17103,27 +17130,33 @@ mod tests {
let mut enc = std::collections::HashMap::new();
enc.insert("Type".to_string(), Object::Name("Encoding".to_string()));
font_dict.insert("Encoding".to_string(), Object::Dictionary(enc));
assert_ne!(PdfDocument::font_identity_hash_cheap(&Object::Dictionary(font_dict)), 0);
let (hash, is_subset) =
PdfDocument::font_identity_hash_cheap(&Object::Dictionary(font_dict));
assert_ne!(hash, 0);
assert!(!is_subset);
}

#[test]
fn test_font_identity_hash_with_encoding_ref() {
    // Font with an unprefixed BaseFont whose Encoding is an indirect reference.
    let mut font_dict = std::collections::HashMap::new();
    font_dict.insert("BaseFont".to_string(), Object::Name("Helvetica".to_string()));
    font_dict.insert("Encoding".to_string(), Object::Reference(ObjectRef::new(99, 0)));
    // The function returns `(hash, is_subset)`. `font_dict` is moved into the
    // `Object::Dictionary`, so the function is called exactly once and the
    // tuple is destructured instead of being compared to an integer.
    let (hash, is_subset) =
        PdfDocument::font_identity_hash_cheap(&Object::Dictionary(font_dict));
    assert_ne!(hash, 0);
    // "Helvetica" has no six-letter `+` prefix, so it is not a subset font.
    assert!(!is_subset);
}

#[test]
fn test_font_identity_hash_tounicode_changes_hash() {
    // Same BaseFont, with a ToUnicode reference ...
    let mut d1 = std::collections::HashMap::new();
    d1.insert("BaseFont".to_string(), Object::Name("Arial".to_string()));
    d1.insert("ToUnicode".to_string(), Object::Reference(ObjectRef::new(50, 0)));
    // Each dictionary is moved into its `Object::Dictionary`, so the function
    // is called once per dictionary; only the hash half of the returned
    // `(hash, is_subset)` tuple is compared.
    let (h1, _) = PdfDocument::font_identity_hash_cheap(&Object::Dictionary(d1));

    // ... and without one.
    let mut d2 = std::collections::HashMap::new();
    d2.insert("BaseFont".to_string(), Object::Name("Arial".to_string()));
    let (h2, _) = PdfDocument::font_identity_hash_cheap(&Object::Dictionary(d2));

    // The presence of ToUnicode must change the identity hash.
    assert_ne!(h1, h2);
}

Expand All @@ -17136,7 +17169,23 @@ mod tests {
"DescendantFonts".to_string(),
Object::Array(vec![Object::Reference(ObjectRef::new(20, 0))]),
);
assert_ne!(PdfDocument::font_identity_hash_cheap(&Object::Dictionary(d)), 0);
let (hash, is_subset) = PdfDocument::font_identity_hash_cheap(&Object::Dictionary(d));
assert_ne!(hash, 0);
assert!(!is_subset);
}

#[test]
fn test_font_identity_hash_detects_subset_prefix() {
    // BaseFont carrying a six-uppercase-letter tag followed by '+'.
    let tagged_font = Object::Dictionary(std::collections::HashMap::from([
        ("BaseFont".to_string(), Object::Name("AAAAAA+ArialUnicodeMS".to_string())),
        ("Subtype".to_string(), Object::Name("Type0".to_string())),
    ]));
    let (_, tagged_is_subset) = PdfDocument::font_identity_hash_cheap(&tagged_font);
    assert!(tagged_is_subset, "Should detect AAAAAA+ as subset prefix");

    // The same family name without the tag.
    let plain_font = Object::Dictionary(std::collections::HashMap::from([(
        "BaseFont".to_string(),
        Object::Name("ArialUnicodeMS".to_string()),
    )]));
    let (_, plain_is_subset) = PdfDocument::font_identity_hash_cheap(&plain_font);
    assert!(!plain_is_subset, "No prefix should not be detected as subset");
}

// ========================================================================
Expand Down
1 change: 1 addition & 0 deletions src/fonts/global_cache.rs
Original file line number Diff line number Diff line change
Expand Up @@ -358,6 +358,7 @@ mod tests {

#[test]
fn test_global_api_insert_get_clear_stats() {
// Relies on process-global state — may interact with parallel tests
// Use very high unique keys to avoid collisions with other tests
let key_base = 9_000_000u64;

Expand Down
Loading