Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve SmartPdfCopy compression and performance #132

Merged
merged 3 commits into from
Nov 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 75 additions & 16 deletions src/iTextSharp.LGPLv2.Core.FunctionalTests/PdfSmartCopyTests.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
using System.Collections.Generic;
using System.IO;
using System.Linq;
using iTextSharp.text;
using iTextSharp.text.pdf;
using Microsoft.VisualStudio.TestTools.UnitTesting;
Expand All @@ -9,7 +11,7 @@ namespace iTextSharp.LGPLv2.Core.FunctionalTests;
public class PdfSmartCopyTests
{
[TestMethod]
public void Verify_Remove_Duplicate_Objects_Works()
public void Verify_Remove_Duplicate_Streams_Works()
{
var inputFile = CreateALargePdfFile();
var outFile = TestUtils.GetOutputFileName();
Expand All @@ -20,25 +22,29 @@ public void Verify_Remove_Duplicate_Objects_Works()
Assert.IsTrue(new FileInfo(inputFile).Length > new FileInfo(outFile).Length);
}

private static void CompressPdfFileRemoveDuplicateObjects(string inputFile, string outFile)
[TestMethod]
public void Verify_Remove_Duplicate_Dictionaries_Works()
{
using var fileStream = new FileStream(outFile, FileMode.Create);
using var pdfDoc = new Document();
var pdfSmartCopy = new PdfSmartCopy(pdfDoc, fileStream);
pdfSmartCopy.SetFullCompression();
var inputFile = CreatePdfFileWithEmbeddedFont();
var outFile = TestUtils.GetOutputFileName();

pdfDoc.AddAuthor(TestUtils.Author);
pdfDoc.Open();
CompressMultiplePdfFilesRemoveDuplicateObjects(inputFile, outFile);

using var reader = new PdfReader(inputFile);
TestUtils.VerifyPdfFileIsReadable(outFile);

var n = reader.NumberOfPages;
for (var page = 0; page < n;)
{
pdfSmartCopy.AddPage(pdfSmartCopy.GetImportedPage(reader, ++page));
}
using var reader = new PdfReader(outFile);
var fontCount = GetPdfObjects(reader)
.OfType<PdfDictionary>()
.Select(d => d.GetDirectObject(PdfName.TYPE))
.Where(PdfName.Fontdescriptor.Equals)
.Count();

pdfSmartCopy.FreeReader(reader);
Assert.AreEqual(1, fontCount);
}

/// <summary>
/// Single-pass variant of the smart-copy helper: forwards to
/// <c>CompressMultiplePdfFilesRemoveDuplicateObjects</c> with a repeat count of one.
/// </summary>
/// <param name="inputFile">Path of the PDF to read.</param>
/// <param name="outFile">Path of the PDF to create.</param>
private static void CompressPdfFileRemoveDuplicateObjects(string inputFile, string outFile)
    => CompressMultiplePdfFilesRemoveDuplicateObjects(inputFile, outFile, times: 1);

private string CreateALargePdfFile()
Expand All @@ -48,7 +54,7 @@ private string CreateALargePdfFile()
{
using (var pdfDoc = new Document(PageSize.A4))
{
var pdfWriter = PdfWriter.GetInstance(pdfDoc, fileStream);
PdfWriter.GetInstance(pdfDoc, fileStream);

pdfDoc.AddAuthor(TestUtils.Author);
pdfDoc.Open();
Expand All @@ -66,4 +72,57 @@ private string CreateALargePdfFile()
TestUtils.VerifyPdfFileIsReadable(pdfFilePath);
return pdfFilePath;
}

/// <summary>
/// Writes a single-paragraph A4 PDF whose text uses the Tahoma font supplied by
/// <c>TestUtils.GetUnicodeFont</c>, verifies the file can be read back, and returns its path.
/// </summary>
/// <returns>The path of the generated PDF file.</returns>
private string CreatePdfFileWithEmbeddedFont()
{
    var outputPath = TestUtils.GetOutputFileName();

    // Using statements (not using declarations) so that the document and the
    // stream are both disposed before the readability check below runs.
    using (var output = new FileStream(outputPath, FileMode.Create))
    using (var document = new Document(PageSize.A4))
    {
        PdfWriter.GetInstance(document, output);
        document.AddAuthor(TestUtils.Author);
        document.Open();

        var tahoma = TestUtils.GetUnicodeFont("Tahoma", TestUtils.GetTahomaFontPath(), 10, Font.NORMAL, BaseColor.Black);
        var paragraph = new Paragraph("Document with embedded font", tahoma);
        document.Add(paragraph);
    }

    TestUtils.VerifyPdfFileIsReadable(outputPath);
    return outputPath;
}


/// <summary>
/// Copies every page of <paramref name="inputFile"/> into <paramref name="outFile"/>
/// through a fully compressed <c>PdfSmartCopy</c>, repeating the import
/// <paramref name="times"/> times.
/// </summary>
/// <param name="inputFile">Path of the PDF whose pages are imported.</param>
/// <param name="outFile">Path of the PDF to create.</param>
/// <param name="times">How many times the input file is opened and imported.</param>
private static void CompressMultiplePdfFilesRemoveDuplicateObjects(string inputFile, string outFile, int times = 10)
{
    using var output = new FileStream(outFile, FileMode.Create);
    using var document = new Document();
    var smartCopy = new PdfSmartCopy(document, output);
    smartCopy.SetFullCompression();

    document.AddAuthor(TestUtils.Author);
    document.Open();

    // Importing the same source over and over is deliberate: it produces
    // duplicate dictionaries (e.g. FontDescriptors) in the copy.
    for (var pass = 0; pass < times; pass++)
    {
        // A fresh reader per pass; it is disposed when the iteration ends.
        using var source = new PdfReader(inputFile);

        var pageCount = source.NumberOfPages;
        for (var pageNumber = 1; pageNumber <= pageCount; pageNumber++)
        {
            var importedPage = smartCopy.GetImportedPage(source, pageNumber);
            smartCopy.AddPage(importedPage);
        }

        smartCopy.FreeReader(source);
    }
}

/// <summary>
/// Lazily yields the result of <c>reader.GetPdfObjectRelease</c> for every index
/// from 0 up to, but not including, <c>reader.XrefSize</c>.
/// </summary>
/// <param name="reader">The reader whose cross-reference entries are enumerated.</param>
/// <returns>One item per index, in ascending index order.</returns>
private IEnumerable<object> GetPdfObjects(PdfReader reader)
{
    var index = 0;
    while (index < reader.XrefSize)
    {
        yield return reader.GetPdfObjectRelease(index);
        index++;
    }
}
}
2 changes: 1 addition & 1 deletion src/iTextSharp.LGPLv2.Core/iTextSharp/text/pdf/PdfCopy.cs
Original file line number Diff line number Diff line change
Expand Up @@ -937,7 +937,7 @@ public override bool Equals(object obj)
return false;
}

return Gen == other.Gen && Num == other.Num;
return Num == other.Num && Gen == other.Gen;
}

// Hash combines the same two fields Equals compares: Gen shifted left by 16 bits, plus Num.
public override int GetHashCode() => (Gen << 16) + Num;
Expand Down
62 changes: 36 additions & 26 deletions src/iTextSharp.LGPLv2.Core/iTextSharp/text/pdf/PdfSmartCopy.cs
Original file line number Diff line number Diff line change
Expand Up @@ -47,20 +47,6 @@
return null;
}

ByteStore streamKey = null;
var validStream = false;

if (srcObj.IsStream())
{
streamKey = new ByteStore(srcObj);
validStream = true;
var streamRef = _streamMap[streamKey];
if (streamRef != null)
{
return streamRef;
}
}

PdfIndirectReference theRef;
var key = new RefKey(inp);
var iRef = Indirects[key];
Expand All @@ -74,9 +60,25 @@
}
else
{
ByteStore streamKey = null;
if (srcObj.IsStream() || srcObj.IsDictionary())
{
streamKey = new ByteStore(srcObj);
var streamRef = _streamMap[streamKey];
if (streamRef != null)
{
return streamRef;
}
}

theRef = Body.PdfIndirectReference;
iRef = new IndirectReferences(theRef);
Indirects[key] = iRef;

if (streamKey != null)
{
_streamMap[streamKey] = theRef;
}
}

if (srcObj.IsDictionary())
Expand All @@ -90,11 +92,6 @@

iRef.SetCopied();

if (validStream)
{
_streamMap[streamKey] = theRef;
}

var obj = CopyObject(srcObj);
AddToBody(obj, theRef);
return theRef;
Expand All @@ -103,10 +100,12 @@
internal class ByteStore
{
private readonly byte[] _b;
private List<RefKey> _references;

Check notice

Code scanning / CodeQL

Missed 'readonly' opportunity Note

Field '_references' can be 'readonly'.

internal ByteStore(PdfObject str)
{
var bb = new ByteBuffer();
_references = new List<RefKey>();
var level = 100;
serObject(str, level, bb);
_b = bb.ToByteArray();
Expand Down Expand Up @@ -199,18 +198,29 @@
return;
}

if (obj.IsIndirect())
{
var refKey = new RefKey((PdfIndirectReference)obj);
var refIdx = _references.IndexOf(refKey);
if (refIdx >= 0)
{
// Already seen, print relative reference label only
bb.Append($"$R{refIdx}");
return;
}

// First occurrence, print relative reference label and process content
bb.Append($"$R{_references.Count}");
_references.Add(refKey);
}

obj = PdfReader.GetPdfObject(obj);
if (obj.IsStream())
{
bb.Append("$B");
serDic((PdfDictionary)obj, level - 1, bb);
if (level > 0)
{
using (var md5 = MD5BouncyCastle.Create())
{
bb.Append(md5.ComputeHash(PdfReader.GetStreamBytesRaw((PrStream)obj)));
}
}
using var md5 = MD5BouncyCastle.Create();
bb.Append(md5.ComputeHash(PdfReader.GetStreamBytesRaw((PrStream)obj)));
}
else if (obj.IsDictionary())
{
Expand Down