Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for DEFLATE64 algorithm when extracting from zip archive #818

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -108,6 +108,24 @@ public void SetInflaterInput(Inflater inflater)
}
}

/// <summary>
/// Resize input buffer according to a specific array size
/// </summary>
/// <param name="bufferSize"></param>
public void ResizeBuffer(int bufferSize)
{
if (available == 0)
{
int oldSize = rawData.Length;
byte[] resized = rawData;
Array.Resize(ref resized, bufferSize);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You are aware that is a costly copy operation?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I am. But in this particular case I don't think that it is significant.

rawData = resized;
rawLength = rawData.Length;
clearText = rawData;
clearTextLength = clearText.Length;
}
}

/// <summary>
/// Fill the buffer from the underlying input stream.
/// </summary>
@@ -696,7 +714,7 @@ public override int Read(byte[] buffer, int offset, int count)
/// <summary>
/// Base stream the inflater reads from.
/// </summary>
private Stream baseInputStream;
protected Stream baseInputStream;

/// <summary>
/// The compressed size
157 changes: 157 additions & 0 deletions src/ICSharpCode.SharpZipLib/Zip/Deflate64/Deflate64OutputWindow.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
// The content of this class is borrowed from the DEFLATE64 support implementation in DotNetZip,
// which in turn contains modified code from the .NET Core Libraries (CoreFX, System.IO.Compression/DeflateManaged),
// where Deflate64 decompression is implemented.
// https://github.com/haf/DotNetZip.Semverd/blob/master/src/Zip.Shared/Deflate64/OutputWindow.cs

using System;
using System.Diagnostics;

namespace ICSharpCode.SharpZipLib.Zip.Deflate64
{
/// <summary>
/// Circular history buffer for decompressed Deflate64 output.
/// Decoded data is either a literal byte or a length/distance pair; resolving a
/// pair means looking back into bytes that were already produced and copying
/// them forward, so recent output has to be retained here until it is consumed.
/// The backing array holds <c>WindowSize</c> bytes and is addressed modulo that size.
/// </summary>
internal sealed class Deflate64OutputWindow
{
	// The longest Deflate64 match plus the farthest look-back together need at least
	// 131074 bytes, so that a full copy can be placed without overwriting the data it
	// is read from. Positions are wrapped with a bit mask, which only works for a
	// power-of-two size; the next power of two above that requirement is 2^18.
	private const int WindowSize = 1 << 18;
	private const int WindowMask = WindowSize - 1;

	// Backing storage, used circularly.
	private readonly byte[] _window = new byte[WindowSize];

	// Index at which the next produced byte will be stored.
	private int _end;

	// Count of produced bytes that have not yet been handed out through CopyTo.
	private int _bytesUsed;

	/// <summary>
	/// Marks every buffered byte as consumed. The write position is left where it is.
	/// </summary>
	internal void ClearBytesUsed()
	{
		_bytesUsed = 0;
	}

	/// <summary>Appends a single literal byte to the window.</summary>
	/// <param name="b">The byte to store at the current write position.</param>
	public void Write(byte b)
	{
		Debug.Assert(_bytesUsed < WindowSize, "Can't add byte when window is full!");

		_window[_end] = b;
		_end = (_end + 1) & WindowMask;
		_bytesUsed++;
	}

	/// <summary>
	/// Resolves a length/distance pair: steps back <paramref name="distance"/> bytes from the
	/// write position and appends <paramref name="length"/> bytes read from there.
	/// </summary>
	/// <param name="length">Number of bytes to append.</param>
	/// <param name="distance">How far behind the write position the source starts.</param>
	public void WriteLengthDistance(int length, int distance)
	{
		Debug.Assert((_bytesUsed + length) <= WindowSize, "No Enough space");

		_bytesUsed += length;

		int source = (_end - distance) & WindowMask;

		// When neither the source run nor the destination run reaches the physical
		// end of the array, no index wrapping is needed while copying.
		int limit = WindowSize - length;
		bool noWrapNeeded = source <= limit && _end < limit;

		if (noWrapNeeded && length <= distance)
		{
			// Source and destination do not overlap: one block copy is enough.
			Array.Copy(_window, source, _window, _end, length);
			_end += length;
			return;
		}

		if (noWrapNeeded)
		{
			// The source run overlaps the bytes being written, so bytes produced by
			// this very copy are read again later in the loop. Example: after X,Y a
			// reference with length 5 and distance 2 appends X,Y,X,Y,X.
			for (int remaining = length; remaining > 0; remaining--)
			{
				_window[_end++] = _window[source++];
			}
			return;
		}

		// At least one of the two runs crosses the end of the array:
		// copy one byte at a time and wrap both positions after every step.
		for (int remaining = length; remaining > 0; remaining--)
		{
			_window[_end++] = _window[source++];
			_end &= WindowMask;
			source &= WindowMask;
		}
	}

	/// <summary>
	/// Moves bytes straight from <paramref name="input"/> into the window.
	/// Used for uncompressed (stored) blocks.
	/// </summary>
	/// <param name="input">Source of the raw bytes.</param>
	/// <param name="length">Upper bound on the number of bytes to move.</param>
	/// <returns>
	/// The number of bytes actually moved; this is limited by the free space in the
	/// window and by the bytes the input reports as available.
	/// </returns>
	public int CopyFrom(InputBuffer input, int length)
	{
		int room = Math.Min(length, FreeBytes);
		int toCopy = Math.Min(room, input.AvailableBytes);

		// Space between the write position and the physical end of the array.
		int untilWrap = WindowSize - _end;

		int copied;
		if (toCopy <= untilWrap)
		{
			// Everything fits before the end of the array.
			copied = input.CopyTo(_window, _end, toCopy);
		}
		else
		{
			// Fill up to the end of the array first ...
			copied = input.CopyTo(_window, _end, untilWrap);
			if (copied == untilWrap)
			{
				// ... and continue at index 0, but only if the first part was delivered in full.
				copied += input.CopyTo(_window, 0, toCopy - untilWrap);
			}
		}

		_end = (_end + copied) & WindowMask;
		_bytesUsed += copied;
		return copied;
	}

	/// <summary>Free space in output window.</summary>
	public int FreeBytes
	{
		get { return WindowSize - _bytesUsed; }
	}

	/// <summary>Bytes not consumed in output window.</summary>
	public int AvailableBytes
	{
		get { return _bytesUsed; }
	}

	/// <summary>Hands out buffered decompressed bytes, oldest first.</summary>
	/// <param name="output">Destination array.</param>
	/// <param name="offset">Index in <paramref name="output"/> at which to start writing.</param>
	/// <param name="length">Maximum number of bytes to hand out.</param>
	/// <returns>The number of bytes written to <paramref name="output"/>.</returns>
	public int CopyTo(byte[] output, int offset, int length)
	{
		int total; // bytes that will be handed out
		int stop;  // window position just past the last byte handed out

		if (length <= _bytesUsed)
		{
			// Only part of the buffered data is requested.
			total = length;
			stop = (_end - _bytesUsed + length) & WindowMask;
		}
		else
		{
			// The request covers everything that is buffered.
			total = _bytesUsed;
			stop = _end;
		}

		// A positive value means the requested run starts near the physical end of
		// the array and continues at index 0, so it has to be copied in two pieces.
		int wrapped = total - stop;
		int headCount = total;
		if (wrapped > 0)
		{
			Array.Copy(_window, WindowSize - wrapped, output, offset, wrapped);
			offset += wrapped;
			headCount = stop;
		}

		Array.Copy(_window, stop - headCount, output, offset, headCount);

		_bytesUsed -= total;
		Debug.Assert(_bytesUsed >= 0, "check this function and find why we copied more bytes than we have");
		return total;
	}
}
}
479 changes: 479 additions & 0 deletions src/ICSharpCode.SharpZipLib/Zip/Deflate64/Deflate64Stream.cs

Large diffs are not rendered by default.

323 changes: 323 additions & 0 deletions src/ICSharpCode.SharpZipLib/Zip/Deflate64/HuffmanTree.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,323 @@
// The content of this class is borrowed from the DEFLATE64 support implementation in DotNetZip,
// which in turn contains modified code from the .NET Core Libraries (CoreFX, System.IO.Compression/DeflateManaged),
// where Deflate64 decompression is implemented.
// https://github.com/haf/DotNetZip.Semverd/blob/master/src/Zip.Shared/Deflate64/HuffmanTree.cs

using System.Diagnostics;
using System.IO;

namespace ICSharpCode.SharpZipLib.Zip.Deflate64
{
// Strictly speaking this class is not a HuffmanTree, this class is
// a lookup table combined with a HuffmanTree. The idea is to speed up
// the lookup for short symbols (they should appear more frequently ideally.)
// However we don't want to create a huge table since it might take longer to
// build the table than decoding (Deflate usually generates new tables frequently.)
// Jean-loup Gailly and Mark Adler gave a very good explanation about this.
// The full text (algorithm.txt) can be found inside
// ftp://ftp.uu.net/pub/archiving/zip/zlib/zlib.zip.
// The following paper explains decoding in detail:
// Hirschberg and Lelewer, "Efficient decoding of prefix codes,"
// Comm. ACM, 33,4, April 1990, pp. 449-459.

internal sealed class HuffmanTree
{
	internal const int MaxLiteralTreeElements = 288;
	internal const int MaxDistTreeElements = 32;
	internal const int EndOfBlockCode = 256;
	internal const int NumberOfCodeLengthTreeElements = 19;

	// Number of code bits resolved by a single lookup in _table.
	private readonly int _tableBits;

	// Direct lookup table indexed by the low _tableBits bits of the bit buffer.
	// An entry is either a symbol or, when negative, a pointer into _left/_right.
	private readonly short[] _table;

	// Children of the overflow tree nodes, selected by a 0 bit (_left) or a 1 bit (_right).
	private readonly short[] _left;
	private readonly short[] _right;

	// Code length in bits of every symbol; 0 means the symbol is unused.
	private readonly byte[] _codeLengthArray;
#if DEBUG
	private uint[] _codeArrayDebug;
#endif

	// Mask that extracts the low _tableBits bits.
	private readonly int _tableMask;

	/// <summary>Literal/length tree used by blocks with the static Huffman codes.</summary>
	public static HuffmanTree StaticLiteralLengthTree { get; } = new HuffmanTree(GetStaticLiteralTreeLength());

	/// <summary>Distance tree used by blocks with the static Huffman codes.</summary>
	public static HuffmanTree StaticDistanceTree { get; } = new HuffmanTree(GetStaticDistanceTreeLength());

	/// <summary>
	/// Builds the decoding structures for the given code lengths.
	/// </summary>
	/// <param name="codeLengths">
	/// Code length of each symbol. The array is stored, not copied. Its length is expected to be
	/// one of <see cref="MaxLiteralTreeElements"/>, <see cref="MaxDistTreeElements"/> or
	/// <see cref="NumberOfCodeLengthTreeElements"/>.
	/// </param>
	/// <exception cref="InvalidDataException">The code lengths do not describe a usable set of codes.</exception>
	public HuffmanTree(byte[] codeLengths)
	{
		Debug.Assert(
			codeLengths.Length == MaxLiteralTreeElements ||
			codeLengths.Length == MaxDistTreeElements ||
			codeLengths.Length == NumberOfCodeLengthTreeElements,
			"we only expect three kinds of Length here");

		_codeLengthArray = codeLengths;

		// The literal/length tree gets a 9 bit lookup table; the distance tree
		// and the code length tree get a 7 bit one.
		_tableBits = (_codeLengthArray.Length == MaxLiteralTreeElements) ? 9 : 7;
		_tableMask = (1 << _tableBits) - 1;
		_table = new short[1 << _tableBits];

		// Sized on the assumption that twice the symbol count is always enough
		// for the overflow tree; the original author noted this is unproven.
		int nodeCapacity = 2 * _codeLengthArray.Length;
		_left = new short[nodeCapacity];
		_right = new short[nodeCapacity];

		CreateTable();
	}

	// Code lengths of the static literal/length tree, as given in RFC 1951:
	// symbols 0-143 use 8 bits, 144-255 use 9, 256-279 use 7 and 280-287 use 8.
	private static byte[] GetStaticLiteralTreeLength()
	{
		byte[] lengths = new byte[MaxLiteralTreeElements];

		for (int symbol = 0; symbol < lengths.Length; symbol++)
		{
			byte bits;
			if (symbol < 144)
			{
				bits = 8;
			}
			else if (symbol < 256)
			{
				bits = 9;
			}
			else if (symbol < 280)
			{
				bits = 7;
			}
			else
			{
				bits = 8;
			}

			lengths[symbol] = bits;
		}

		return lengths;
	}

	// Code lengths of the static distance tree: every symbol uses 5 bits.
	private static byte[] GetStaticDistanceTreeLength()
	{
		byte[] lengths = new byte[MaxDistTreeElements];

		int symbol = 0;
		while (symbol < MaxDistTreeElements)
		{
			lengths[symbol] = 5;
			symbol++;
		}

		return lengths;
	}

	// Returns the low 'length' bits of 'code' in reversed order.
	private static uint BitReverse(uint code, int length)
	{
		Debug.Assert(length > 0 && length <= 16, "Invalid len");

		uint accumulator = 0;
		int remaining = length;
		do
		{
			// Take the lowest remaining bit and make room for the next one.
			accumulator = (accumulator | (code & 1)) << 1;
			code >>= 1;
		} while (--remaining > 0);

		// Undo the extra shift performed by the last iteration.
		return accumulator >> 1;
	}

	// Assigns a Huffman code to every symbol from the symbol code lengths,
	// following the algorithm described in RFC 1951. The returned codes are
	// already bit-reversed; symbols with length 0 keep code 0.
	private uint[] CalculateHuffmanCode()
	{
		const int MaxBits = 16;

		// Step 1: count how many symbols use each code length.
		uint[] countPerLength = new uint[MaxBits + 1];
		for (int i = 0; i < _codeLengthArray.Length; i++)
		{
			countPerLength[_codeLengthArray[i]]++;
		}
		countPerLength[0] = 0; // unused symbols do not take part

		// Step 2: determine the first code of every length.
		uint[] nextCode = new uint[MaxBits + 1];
		uint runningCode = 0;
		for (int bits = 1; bits <= MaxBits; bits++)
		{
			runningCode = (runningCode + countPerLength[bits - 1]) << 1;
			nextCode[bits] = runningCode;
		}

		// Step 3: hand out consecutive codes to the symbols of each length.
		uint[] codes = new uint[MaxLiteralTreeElements];
		for (int symbol = 0; symbol < _codeLengthArray.Length; symbol++)
		{
			int bits = _codeLengthArray[symbol];
			if (bits == 0)
			{
				continue;
			}

			codes[symbol] = BitReverse(nextCode[bits], bits);
			nextCode[bits]++;
		}

		return codes;
	}

	// Populates _table, _left and _right from the code lengths.
	private void CreateTable()
	{
		uint[] codes = CalculateHuffmanCode();
#if DEBUG
		_codeArrayDebug = codes;
#endif

		// Overflow tree nodes are numbered upwards, starting at the symbol count.
		short nextNode = (short)_codeLengthArray.Length;

		for (int symbol = 0; symbol < _codeLengthArray.Length; symbol++)
		{
			int bits = _codeLengthArray[symbol];
			if (bits <= 0)
			{
				continue; // symbol is not used
			}

			// the code of this symbol (bit reversed)
			int reversedCode = (int)codes[symbol];

			if (bits <= _tableBits)
			{
				FillLookupEntries(symbol, bits, reversedCode);
			}
			else
			{
				nextNode = LinkOverflowCode(symbol, bits, reversedCode, nextNode);
			}
		}
	}

	// Handles a code that fits into the lookup table.
	//
	// A code shorter than the table width is duplicated into every entry whose low
	// bits equal the code. With a nine-bit table, a four-bit code appears 32 times
	// and a nine-bit code appears once.
	//
	// The last entry written is reversedCode + tableSize - stride, which stays
	// inside the table only if reversedCode < stride; that is checked up front.
	private void FillLookupEntries(int symbol, int bits, int reversedCode)
	{
		int stride = 1 << bits;
		if (reversedCode >= stride)
		{
			throw new InvalidDataException("InvalidHuffmanData");
		}

		// Note that the bits in the table are reversed.
		int copies = 1 << (_tableBits - bits);
		int slot = reversedCode;
		for (int written = 0; written < copies; written++)
		{
			_table[slot] = (short)symbol;
			slot += stride;
		}
	}

	// Handles a code that is longer than the lookup table is wide (bits > _tableBits).
	//
	// The first _tableBits bits select a table entry; every further bit selects the
	// left or right child of a tree node, and the symbol is stored at the end of the
	// path. This avoids bloating the table for the few codes that are long.
	// Pointers to tree nodes are stored as negative numbers.
	//
	// Returns the next unused node number.
	private short LinkOverflowCode(int symbol, int bits, int reversedCode, short nextNode)
	{
		short[] nodes = _table;
		int slot = reversedCode & _tableMask;

		// selects the current bit among those that did not fit in the table
		int bitSelector = 1 << _tableBits;

		for (int remaining = bits - _tableBits; remaining > 0; remaining--)
		{
			short link = nodes[slot];

			if (link == 0)
			{
				// Nothing hangs here yet: claim the next free node.
				link = (short)-nextNode;
				nodes[slot] = link;
				nextNode++;
			}

			if (link > 0)
			{
				// A symbol already sits where a pointer is needed; following it
				// would end in an IndexOutOfRangeException.
				throw new InvalidDataException("InvalidHuffmanData");
			}

			Debug.Assert(link < 0, "CreateTable: Only negative numbers are used for tree pointers!");

			// A 0 bit continues in the left array, a 1 bit in the right array.
			nodes = (reversedCode & bitSelector) == 0 ? _left : _right;
			slot = -link; // go to next node

			bitSelector <<= 1;
		}

		nodes[slot] = (short)symbol;
		return nextNode;
	}

	/// <summary>
	/// Tries to decode the next symbol from <paramref name="input"/>.
	/// </summary>
	/// <param name="input">Bit source to decode from.</param>
	/// <returns>
	/// The decoded symbol, or -1 when the input does not hold enough bits for a complete code.
	/// </returns>
	/// <exception cref="InvalidDataException">The bits lead to a symbol whose code length is zero.</exception>
	public int GetNextSymbol(InputBuffer input)
	{
		// Ask for up to 16 bits; if fewer are available we get whatever is there.
		uint bitBuffer = input.TryLoad16Bits();
		if (input.AvailableBits == 0)
		{
			// running out of input
			return -1;
		}

		int symbol = _table[bitBuffer & _tableMask];

		// A negative entry is the root of a subtree: follow one branch per
		// additional bit until a non-negative entry is reached.
		for (uint bit = 1u << _tableBits; symbol < 0; bit <<= 1)
		{
			short[] branch = (bitBuffer & bit) == 0 ? _left : _right;
			symbol = branch[-symbol];
		}

		int codeLength = _codeLengthArray[symbol];

		// huffman code lengths must be at least 1 bit long
		if (codeLength <= 0)
		{
			throw new InvalidDataException("InvalidHuffmanData");
		}

		// If only part of a code was in the bit buffer, the lookup may have landed
		// on the entry of a different symbol. That symbol's length will then exceed
		// the number of bits actually available. Since 16 bits were requested and
		// the maximum length is 15, this means the input is running out.
		if (codeLength > input.AvailableBits)
		{
			return -1;
		}

		input.SkipBits(codeLength);
		return symbol;
	}
}
}
794 changes: 794 additions & 0 deletions src/ICSharpCode.SharpZipLib/Zip/Deflate64/InflaterManaged.cs

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions src/ICSharpCode.SharpZipLib/Zip/ZipEntry.cs
Original file line number Diff line number Diff line change
@@ -1085,6 +1085,7 @@ public object Clone()
/// <returns>Returns true if the compression method is supported; false otherwise</returns>
public static bool IsCompressionMethodSupported(CompressionMethod method)
{
	// Each supported method is accepted by its own guard, in the same order
	// in which the methods were originally compared.
	if (method == CompressionMethod.Deflated)
	{
		return true;
	}

	if (method == CompressionMethod.Deflate64)
	{
		return true;
	}

	if (method == CompressionMethod.Stored)
	{
		return true;
	}

	// BZip2 is the last supported method; anything else is rejected.
	return method == CompressionMethod.BZip2;
}

40 changes: 39 additions & 1 deletion src/ICSharpCode.SharpZipLib/Zip/ZipInputStream.cs
Original file line number Diff line number Diff line change
@@ -5,6 +5,7 @@
using System;
using System.Diagnostics;
using System.IO;
using ICSharpCode.SharpZipLib.Zip.Deflate64;

namespace ICSharpCode.SharpZipLib.Zip
{
@@ -73,6 +74,9 @@ public class ZipInputStream : InflaterInputStream
private Crc32 crc = new Crc32();
private ZipEntry entry;

Deflate64Stream inputDeflate64Stream;
byte[] buffer;

private long size;
private CompressionMethod method;
private int flags;
@@ -197,6 +201,12 @@ public ZipEntry GetNextEntry()
CloseEntry();
}

int bufferSize = inputBuffer.RawData.Length;
//Resize the input buffer in order to read file information only and keep the correct position in the stream
//needed for forward-only stream support
//At first step is needed to read the header and after that the file info
inputBuffer.ResizeBuffer(ZipConstants.LocalHeaderBaseSize);

if (!SkipUntilNextEntry())
{
Dispose();
@@ -216,13 +226,18 @@ public ZipEntry GetNextEntry()

bool isCrypted = (flags & 1) == 1;

//Resize to read the file name and extra data if available
inputBuffer.ResizeBuffer(nameLen + extraLen);
byte[] buffer = new byte[nameLen];
inputBuffer.ReadRawBuffer(buffer);

var entryEncoding = _stringCodec.ZipInputEncoding(flags);
string name = entryEncoding.GetString(buffer);
var unicode = entryEncoding.IsZipUnicode();

//Back to the original size
inputBuffer.ResizeBuffer(bufferSize);

entry = new ZipEntry(name, versionRequiredToExtract, ZipConstants.VersionMadeBy, method, unicode)
{
Flags = flags,
@@ -286,6 +301,12 @@ public ZipEntry GetNextEntry()
{
throw new ZipException("Stored, but compressed != uncompressed");
}
else if (method == CompressionMethod.Deflate64)
{
//All the needed information for decompression is gathered, no need to proceed
this.inputDeflate64Stream = null;
return entry;
}

// Determine how to handle reading of data if this is attempted.
if (IsEntryCompressionMethodSupported(entry))
@@ -425,6 +446,12 @@ public void CloseEntry()
return;
}

if (entry.CompressionMethod == CompressionMethod.Deflate64)
{
//There is no need of inputBuffer processing, all information is available; this would move the stream position
return;
}

if (method == CompressionMethod.Deflated)
{
if ((flags & 8) != 0)
@@ -658,7 +685,18 @@ public override int Read(byte[] buffer, int offset, int count)
throw new ArgumentException("Invalid offset/count combination");
}

return internalReader(buffer, offset, count);
if (entry.CompressionMethod == CompressionMethod.Deflate64)
{
if (inputDeflate64Stream == null)
{
inputDeflate64Stream = new Deflate64Stream(base.baseInputStream, entry.CompressedSize);
}
return inputDeflate64Stream.Read(buffer, 0, count);
}
else
{
return internalReader(buffer, offset, count);
}
}

/// <summary>