diff --git a/src/iTextSharp.LGPLv2.Core.FunctionalTests/PdfSmartCopyTests.cs b/src/iTextSharp.LGPLv2.Core.FunctionalTests/PdfSmartCopyTests.cs index 37ac2a7..460e98d 100644 --- a/src/iTextSharp.LGPLv2.Core.FunctionalTests/PdfSmartCopyTests.cs +++ b/src/iTextSharp.LGPLv2.Core.FunctionalTests/PdfSmartCopyTests.cs @@ -1,4 +1,6 @@ +using System.Collections.Generic; using System.IO; +using System.Linq; using iTextSharp.text; using iTextSharp.text.pdf; using Microsoft.VisualStudio.TestTools.UnitTesting; @@ -9,7 +11,7 @@ namespace iTextSharp.LGPLv2.Core.FunctionalTests; public class PdfSmartCopyTests { [TestMethod] - public void Verify_Remove_Duplicate_Objects_Works() + public void Verify_Remove_Duplicate_Streams_Works() { var inputFile = CreateALargePdfFile(); var outFile = TestUtils.GetOutputFileName(); @@ -20,25 +22,29 @@ public void Verify_Remove_Duplicate_Objects_Works() Assert.IsTrue(new FileInfo(inputFile).Length > new FileInfo(outFile).Length); } - private static void CompressPdfFileRemoveDuplicateObjects(string inputFile, string outFile) + [TestMethod] + public void Verify_Remove_Duplicate_Dictionaries_Works() { - using var fileStream = new FileStream(outFile, FileMode.Create); - using var pdfDoc = new Document(); - var pdfSmartCopy = new PdfSmartCopy(pdfDoc, fileStream); - pdfSmartCopy.SetFullCompression(); + var inputFile = CreatePdfFileWithEmbeddedFont(); + var outFile = TestUtils.GetOutputFileName(); - pdfDoc.AddAuthor(TestUtils.Author); - pdfDoc.Open(); + CompressMultiplePdfFilesRemoveDuplicateObjects(inputFile, outFile); - using var reader = new PdfReader(inputFile); + TestUtils.VerifyPdfFileIsReadable(outFile); - var n = reader.NumberOfPages; - for (var page = 0; page < n;) - { - pdfSmartCopy.AddPage(pdfSmartCopy.GetImportedPage(reader, ++page)); - } + using var reader = new PdfReader(outFile); + var fontCount = GetPdfObjects(reader) + .OfType<PdfDictionary>() + .Select(d => d.GetDirectObject(PdfName.TYPE)) + .Where(PdfName.Fontdescriptor.Equals) + .Count(); - 
pdfSmartCopy.FreeReader(reader); + Assert.AreEqual(1, fontCount); + } + + private static void CompressPdfFileRemoveDuplicateObjects(string inputFile, string outFile) + { + CompressMultiplePdfFilesRemoveDuplicateObjects(inputFile, outFile, 1); } private string CreateALargePdfFile() @@ -48,7 +54,7 @@ private string CreateALargePdfFile() { using (var pdfDoc = new Document(PageSize.A4)) { - var pdfWriter = PdfWriter.GetInstance(pdfDoc, fileStream); + PdfWriter.GetInstance(pdfDoc, fileStream); pdfDoc.AddAuthor(TestUtils.Author); pdfDoc.Open(); @@ -66,4 +72,57 @@ private string CreateALargePdfFile() TestUtils.VerifyPdfFileIsReadable(pdfFilePath); return pdfFilePath; } + + private string CreatePdfFileWithEmbeddedFont() + { + var pdfFilePath = TestUtils.GetOutputFileName(); + using (var fileStream = new FileStream(pdfFilePath, FileMode.Create)) + { + using (var pdfDoc = new Document(PageSize.A4)) + { + PdfWriter.GetInstance(pdfDoc, fileStream); + pdfDoc.AddAuthor(TestUtils.Author); + pdfDoc.Open(); + + var font = TestUtils.GetUnicodeFont("Tahoma", TestUtils.GetTahomaFontPath(), 10, Font.NORMAL, BaseColor.Black); + pdfDoc.Add(new Paragraph("Document with embedded font", font)); + } + } + + TestUtils.VerifyPdfFileIsReadable(pdfFilePath); + return pdfFilePath; + } + + + private static void CompressMultiplePdfFilesRemoveDuplicateObjects(string inputFile, string outFile, int times = 10) + { + using var fileStream = new FileStream(outFile, FileMode.Create); + using var pdfDoc = new Document(); + var pdfSmartCopy = new PdfSmartCopy(pdfDoc, fileStream); + pdfSmartCopy.SetFullCompression(); + + pdfDoc.AddAuthor(TestUtils.Author); + pdfDoc.Open(); + + // The same document has been added multiple times + // This will cause duplicate dictionaries (ex: FontDescriptors) + for (var i = 0; i < times; ++i) + { + using var reader = new PdfReader(inputFile); + + var n = reader.NumberOfPages; + for (var page = 0; page < n;) + { + pdfSmartCopy.AddPage(pdfSmartCopy.GetImportedPage(reader, 
++page)); + } + + pdfSmartCopy.FreeReader(reader); + } + } + + private IEnumerable<PdfObject> GetPdfObjects(PdfReader reader) + { + for (var idx = 0; idx < reader.XrefSize; ++idx) + yield return reader.GetPdfObjectRelease(idx); + } } \ No newline at end of file diff --git a/src/iTextSharp.LGPLv2.Core/iTextSharp/text/pdf/PdfCopy.cs b/src/iTextSharp.LGPLv2.Core/iTextSharp/text/pdf/PdfCopy.cs index 5520d8b..872e3d8 100644 --- a/src/iTextSharp.LGPLv2.Core/iTextSharp/text/pdf/PdfCopy.cs +++ b/src/iTextSharp.LGPLv2.Core/iTextSharp/text/pdf/PdfCopy.cs @@ -937,7 +937,7 @@ public override bool Equals(object obj) return false; } - return Gen == other.Gen && Num == other.Num; + return Num == other.Num && Gen == other.Gen; } public override int GetHashCode() => (Gen << 16) + Num; diff --git a/src/iTextSharp.LGPLv2.Core/iTextSharp/text/pdf/PdfSmartCopy.cs b/src/iTextSharp.LGPLv2.Core/iTextSharp/text/pdf/PdfSmartCopy.cs index 3b64959..4814e33 100644 --- a/src/iTextSharp.LGPLv2.Core/iTextSharp/text/pdf/PdfSmartCopy.cs +++ b/src/iTextSharp.LGPLv2.Core/iTextSharp/text/pdf/PdfSmartCopy.cs @@ -47,20 +47,6 @@ protected override PdfIndirectReference CopyIndirect(PrIndirectReference inp) return null; } - ByteStore streamKey = null; - var validStream = false; - - if (srcObj.IsStream()) - { - streamKey = new ByteStore(srcObj); - validStream = true; - var streamRef = _streamMap[streamKey]; - if (streamRef != null) - { - return streamRef; - } - } - PdfIndirectReference theRef; var key = new RefKey(inp); var iRef = Indirects[key]; @@ -74,9 +60,25 @@ protected override PdfIndirectReference CopyIndirect(PrIndirectReference inp) } else { + ByteStore streamKey = null; + if (srcObj.IsStream() || srcObj.IsDictionary()) + { + streamKey = new ByteStore(srcObj); + var streamRef = _streamMap[streamKey]; + if (streamRef != null) + { + return streamRef; + } + } + theRef = Body.PdfIndirectReference; iRef = new IndirectReferences(theRef); Indirects[key] = iRef; + + if (streamKey != null) + { + _streamMap[streamKey] 
= theRef; + } } if (srcObj.IsDictionary()) @@ -90,11 +92,6 @@ protected override PdfIndirectReference CopyIndirect(PrIndirectReference inp) iRef.SetCopied(); - if (validStream) - { - _streamMap[streamKey] = theRef; - } - var obj = CopyObject(srcObj); AddToBody(obj, theRef); return theRef; @@ -103,10 +100,12 @@ protected override PdfIndirectReference CopyIndirect(PrIndirectReference inp) internal class ByteStore { private readonly byte[] _b; + private List<RefKey> _references; internal ByteStore(PdfObject str) { var bb = new ByteBuffer(); + _references = new List<RefKey>(); var level = 100; serObject(str, level, bb); _b = bb.ToByteArray(); @@ -199,18 +198,29 @@ private void serObject(PdfObject obj, int level, ByteBuffer bb) return; } + if (obj.IsIndirect()) + { + var refKey = new RefKey((PdfIndirectReference)obj); + var refIdx = _references.IndexOf(refKey); + if (refIdx >= 0) + { + // Already seen, print relative reference label only + bb.Append($"$R{refIdx}"); + return; + } + + // First occurrence, print relative reference label and process content + bb.Append($"$R{_references.Count}"); + _references.Add(refKey); + } + obj = PdfReader.GetPdfObject(obj); if (obj.IsStream()) { bb.Append("$B"); serDic((PdfDictionary)obj, level - 1, bb); - if (level > 0) - { - using (var md5 = MD5BouncyCastle.Create()) - { - bb.Append(md5.ComputeHash(PdfReader.GetStreamBytesRaw((PrStream)obj))); - } - } + using var md5 = MD5BouncyCastle.Create(); + bb.Append(md5.ComputeHash(PdfReader.GetStreamBytesRaw((PrStream)obj))); } else if (obj.IsDictionary()) {