Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 54 additions & 8 deletions src/CsvHelper/CsvParser.cs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
using CsvHelper.Delegates;
using System.Diagnostics;
using System.Globalization;
using System.Runtime.InteropServices;
using System.Text;

namespace CsvHelper;
Expand All @@ -22,6 +23,7 @@ public class CsvParser : IParser, IDisposable
private readonly char escape;
private readonly bool countBytes;
private readonly Encoding encoding;
private readonly Encoder? encoder;
private readonly bool ignoreBlankLines;
private readonly char comment;
private readonly bool allowComments;
Expand Down Expand Up @@ -184,6 +186,8 @@ public CsvParser(TextReader reader, IParserConfiguration configuration, bool lea
delimiterValues = configuration.DetectDelimiterValues;
detectDelimiter = configuration.DetectDelimiter;
encoding = configuration.Encoding;
// The encoder is only used when counting bytes; skipping it otherwise avoids a NullReferenceException when configuration.Encoding is null.
encoder = countBytes ? encoding.GetEncoder() : null;
escape = configuration.Escape;
ignoreBlankLines = configuration.IgnoreBlankLines;
isNewLineSet = configuration.IsNewLineSet;
Expand Down Expand Up @@ -223,7 +227,14 @@ public bool Read()
{
if (!FillBuffer())
{
return ReadEndOfFile();
bool haveMoreData = ReadEndOfFile();

if (countBytes && !haveMoreData)
{
byteCount += FlushEncoder();
}

return haveMoreData;
}

if (row == 1 && detectDelimiter)
Expand Down Expand Up @@ -258,7 +269,14 @@ public async Task<bool> ReadAsync()
{
if (!await FillBufferAsync().ConfigureAwait(false))
{
return ReadEndOfFile();
bool haveMoreData = ReadEndOfFile();

if (countBytes && !haveMoreData)
{
byteCount += FlushEncoder();
}

return haveMoreData;
}

if (row == 1 && detectDelimiter)
Expand Down Expand Up @@ -340,7 +358,7 @@ private ReadLineResult ReadLine(ref char c, ref char cPrev)

if (countBytes)
{
byteCount += encoding.GetByteCount(new char[] { c });
byteCount += PushCharToEncoder(c);
}

if (maxFieldSize > 0 && bufferPosition - fieldStartPosition - 1 > maxFieldSize)
Expand Down Expand Up @@ -519,7 +537,7 @@ private ReadLineResult ReadSpaces(ref char c)
charCount++;
if (countBytes)
{
byteCount += encoding.GetByteCount(new char[] { c });
byteCount += PushCharToEncoder(c);
}
}

Expand Down Expand Up @@ -549,7 +567,7 @@ private ReadLineResult ReadBlankLine(ref char c)
charCount++;
if (countBytes)
{
byteCount += encoding.GetByteCount(new char[] { c });
byteCount += PushCharToEncoder(c);
}
}

Expand Down Expand Up @@ -580,7 +598,7 @@ private ReadLineResult ReadDelimiter(ref char c)
charCount++;
if (countBytes)
{
byteCount += encoding.GetByteCount(new[] { c });
byteCount += PushCharToEncoder(c);
}

if (bufferPosition >= charsRead)
Expand Down Expand Up @@ -618,7 +636,7 @@ private ReadLineResult ReadLineEnding(ref char c)
charCount++;
if (countBytes)
{
byteCount += encoding.GetByteCount(new char[] { c });
byteCount += PushCharToEncoder(c);
}
}
}
Expand Down Expand Up @@ -657,7 +675,7 @@ private ReadLineResult ReadNewLine(ref char c)
charCount++;
if (countBytes)
{
byteCount += encoding.GetByteCount(new[] { c });
byteCount += PushCharToEncoder(c);
}

if (bufferPosition >= charsRead)
Expand Down Expand Up @@ -1089,6 +1107,34 @@ protected ProcessedField ProcessNoEscapeField(int start, int length)
return new ProcessedField(newStart, newLength, buffer);
}

/// <summary>
/// Feeds a single character to the stateful <see cref="encoder"/> and reports how many
/// bytes that call produced.
/// </summary>
/// <param name="c">The character to push through the encoder.</param>
/// <returns>The number of bytes written by the encoder for this call.</returns>
private int PushCharToEncoder(char c)
{
    Debug.Assert(encoder is not null);

    // GetByteCount leaves the encoder's internal state untouched, whereas GetBytes updates it.
    // The state must advance, so the character is actually encoded into a scratch buffer
    // whose contents are discarded; only the byte count is kept.
    int maxBytesPerChar = encoding.GetMaxByteCount(1);
#if NETSTANDARD2_1 || NET
    // Small scratch buffers live on the stack; anything larger falls back to the heap.
    Span<byte> scratch = maxBytesPerChar <= 16 ? stackalloc byte[16] : new byte[maxBytesPerChar];
    ReadOnlySpan<char> input = [c];
    return encoder.GetBytes(input, scratch, flush: false);
#else
    char[] input = [c];
    byte[] scratch = new byte[maxBytesPerChar];
    return encoder!.GetBytes(input, 0, input.Length, scratch, 0, flush: false);
#endif
}

/// <summary>
/// Flushes the stateful <see cref="encoder"/> without supplying any further characters.
/// </summary>
/// <returns>The number of bytes the encoder emitted while flushing.</returns>
private int FlushEncoder()
{
    Debug.Assert(encoder is not null);

    // No new input is provided: the call exists only so the encoder emits whatever it is
    // still holding back. The encoded bytes are discarded; only their count matters.
    char[] noChars = Array.Empty<char>();
    int maxBytesPerChar = encoding.GetMaxByteCount(1);
#if NETSTANDARD2_1 || NET
    // Small scratch buffers live on the stack; anything larger falls back to the heap.
    Span<byte> scratch = maxBytesPerChar <= 16 ? stackalloc byte[16] : new byte[maxBytesPerChar];
    return encoder.GetBytes(noChars, scratch, flush: true);
#else
    byte[] scratch = new byte[maxBytesPerChar];
    return encoder!.GetBytes(noChars, 0, 0, scratch, 0, flush: true);
#endif
}

/// <inheritdoc/>
public void Dispose()
{
Expand Down
53 changes: 53 additions & 0 deletions tests/CsvHelper.Tests/Parsing/ByteCountTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -133,5 +133,58 @@ public void Read_Trimmed_WhiteSpaceCorrect()
}
}

/// <summary>
/// Reads every row from the supplied characters with byte counting enabled and UTF-8
/// configured, then checks the parser's reported byte count.
/// </summary>
/// <param name="chars">The raw characters served to the parser.</param>
/// <param name="expectedByteCount">The byte count expected once all input has been read.</param>
[Theory]
[MemberData(nameof(Utf8CharsData))]
public void UTF8_ByteCounts(char[] chars, long expectedByteCount)
{
    // Arrange
    var config = new CsvConfiguration(CultureInfo.InvariantCulture)
    {
        Encoding = Encoding.UTF8,
        CountBytes = true,
    };
    using var reader = new CharsReader(chars);
    using var parser = new CsvParser(reader, config);

    // Act: keep reading until the parser reports there is nothing left.
    bool moreRows = true;
    while (moreRows)
    {
        moreRows = parser.Read();
    }

    // Assert
    Assert.Equal(expectedByteCount, parser.ByteCount);
}

/// <summary>
/// Test cases for <c>UTF8_ByteCounts</c>: each entry is a character array paired with the
/// number of UTF-8 bytes expected for it.
/// </summary>
public static IEnumerable<object[]> Utf8CharsData
{
    get
    {
        const string asciiAndEmoji = "ABC✋😉👍";
        const string surrogatePairText = "𐓏𐓘𐓻𐓘𐓻𐓟 𐒻𐓟";

        var cases = new List<object[]>();

        // Whole strings: the expected count comes straight from Encoding.UTF8.
        cases.Add(new object[] { asciiAndEmoji.ToCharArray(), Encoding.UTF8.GetByteCount(asciiAndEmoji) });
        cases.Add(new object[] { surrogatePairText.ToCharArray(), Encoding.UTF8.GetByteCount(surrogatePairText) });

        // Single code points with hand-written expected byte counts.
        cases.Add(new object[] { new char[] { '\u0232' }, 2 }); // U+0232 (Ȳ - LATIN CAPITAL LETTER Y WITH MACRON)
        cases.Add(new object[] { new char[] { '\u0985' }, 3 }); // U+0985 (অ - BENGALI LETTER A)
        cases.Add(new object[] { new char[] { '\ud83d', '\ude17' }, 4 }); // U+1F617 (😗 - KISSING FACE)

        // Checks that the encoder is flushed correctly. When the supplied TextReader ends on an
        // unpaired (high) surrogate, ByteCount only grows once the encoder is flushed. Here it
        // grows by 3, the number of UTF-8 bytes in the replacement character U+FFFD (the default
        // fallback behaviour of the static Encoding.UTF8).
        cases.Add(new object[] { new char[] { '\ud800' }, 3 });

        return cases;
    }
}

/// <summary>
/// Minimal <see cref="TextReader"/> that serves characters one at a time from a fixed array.
/// </summary>
private class CharsReader : TextReader
{
    private readonly char[] _chars;

    // Index of the next character to hand out.
    private int _next;

    /// <summary>
    /// Creates a reader over the given characters.
    /// </summary>
    /// <param name="chars">The characters to serve, in order.</param>
    public CharsReader(char[] chars)
    {
        _chars = chars;
        _next = 0;
    }

    /// <summary>
    /// Returns the next character without consuming it, or -1 when none remain.
    /// </summary>
    public override int Peek()
    {
        if (_next >= _chars.Length)
        {
            return -1;
        }

        return _chars[_next];
    }

    /// <summary>
    /// Returns the next character and advances past it, or -1 when none remain.
    /// </summary>
    public override int Read()
    {
        if (_next >= _chars.Length)
        {
            return -1;
        }

        int current = _chars[_next];
        _next++;
        return current;
    }
}
}
}
Loading