diff --git a/src/CsvHelper/CsvParser.cs b/src/CsvHelper/CsvParser.cs
index 83ad97b22..87ee63c80 100644
--- a/src/CsvHelper/CsvParser.cs
+++ b/src/CsvHelper/CsvParser.cs
@@ -6,6 +6,7 @@
 using CsvHelper.Delegates;
 using System.Diagnostics;
 using System.Globalization;
+using System.Runtime.InteropServices;
 using System.Text;
 
 namespace CsvHelper;
@@ -22,6 +23,7 @@ public class CsvParser : IParser, IDisposable
 	private readonly char escape;
 	private readonly bool countBytes;
 	private readonly Encoding encoding;
+	private readonly Encoder? encoder;
 	private readonly bool ignoreBlankLines;
 	private readonly char comment;
 	private readonly bool allowComments;
@@ -184,6 +186,8 @@ public CsvParser(TextReader reader, IParserConfiguration configuration, bool lea
 		delimiterValues = configuration.DetectDelimiterValues;
 		detectDelimiter = configuration.DetectDelimiter;
 		encoding = configuration.Encoding;
+		// encoder only used when counting bytes, so avoid NRE when configuration.Encoding is null
+		encoder = countBytes ? encoding.GetEncoder() : null;
 		escape = configuration.Escape;
 		ignoreBlankLines = configuration.IgnoreBlankLines;
 		isNewLineSet = configuration.IsNewLineSet;
@@ -223,7 +227,14 @@ public bool Read()
 			{
 				if (!FillBuffer())
 				{
-					return ReadEndOfFile();
+					bool haveMoreData = ReadEndOfFile();
+
+					if (countBytes && !haveMoreData)
+					{
+						byteCount += FlushEncoder();
+					}
+
+					return haveMoreData;
 				}
 
 				if (row == 1 && detectDelimiter)
@@ -258,7 +269,14 @@ public async Task<bool> ReadAsync()
 			{
 				if (!await FillBufferAsync().ConfigureAwait(false))
 				{
-					return ReadEndOfFile();
+					bool haveMoreData = ReadEndOfFile();
+
+					if (countBytes && !haveMoreData)
+					{
+						byteCount += FlushEncoder();
+					}
+
+					return haveMoreData;
 				}
 
 				if (row == 1 && detectDelimiter)
@@ -340,7 +358,7 @@ private ReadLineResult ReadLine(ref char c, ref char cPrev)
 
 			if (countBytes)
 			{
-				byteCount += encoding.GetByteCount(new char[] { c });
+				byteCount += PushCharToEncoder(c);
 			}
 
 			if (maxFieldSize > 0 && bufferPosition - fieldStartPosition - 1 > maxFieldSize)
@@ -519,7 +537,7 @@ private ReadLineResult ReadSpaces(ref char c)
 			charCount++;
 			if (countBytes)
 			{
-				byteCount += encoding.GetByteCount(new char[] { c });
+				byteCount += PushCharToEncoder(c);
 			}
 		}
 
@@ -549,7 +567,7 @@ private ReadLineResult ReadBlankLine(ref char c)
 			charCount++;
 			if (countBytes)
 			{
-				byteCount += encoding.GetByteCount(new char[] { c });
+				byteCount += PushCharToEncoder(c);
 			}
 		}
 
@@ -580,7 +598,7 @@ private ReadLineResult ReadDelimiter(ref char c)
 			charCount++;
 			if (countBytes)
 			{
-				byteCount += encoding.GetByteCount(new[] { c });
+				byteCount += PushCharToEncoder(c);
 			}
 
 			if (bufferPosition >= charsRead)
@@ -618,7 +636,7 @@ private ReadLineResult ReadLineEnding(ref char c)
 				charCount++;
 				if (countBytes)
 				{
-					byteCount += encoding.GetByteCount(new char[] { c });
+					byteCount += PushCharToEncoder(c);
 				}
 			}
 		}
@@ -657,7 +675,7 @@ private ReadLineResult ReadNewLine(ref char c)
 			charCount++;
 			if (countBytes)
 			{
-				byteCount += encoding.GetByteCount(new[] { c });
+				byteCount += PushCharToEncoder(c);
 			}
 
 			if (bufferPosition >= charsRead)
@@ -1089,6 +1107,53 @@ protected ProcessedField ProcessNoEscapeField(int start, int length)
 		return new ProcessedField(newStart, newLength, buffer);
 	}
 
+	/// <summary>
+	/// Feeds one character to the stateful encoder and reports how many bytes it emitted.
+	/// </summary>
+	/// <param name="c">The character that was just read from the buffer.</param>
+	/// <returns>
+	/// The number of bytes produced by this call. Because the encoder keeps state between calls,
+	/// a character it holds back (for example a high surrogate waiting for its pair) is only
+	/// counted by a later call or by <see cref="FlushEncoder"/>.
+	/// </returns>
+	private int PushCharToEncoder(char c)
+	{
+		Debug.Assert(encoder is not null);
+
+		// We use GetBytes instead of GetByteCount because the former updates the internal state
+		// of the encoder and the latter doesn't. We use a throwaway buffer for the encoded bytes.
+		var maxByteCount = encoding.GetMaxByteCount(1);
+#if NETSTANDARD2_1 || NET
+		Span<byte> bytesBuffer = maxByteCount <= 16 ? stackalloc byte[16] : new byte[maxByteCount];
+		return encoder.GetBytes([c], bytesBuffer, flush: false);
+#else
+		byte[] bytes = new byte[maxByteCount];
+		return encoder!.GetBytes([c], 0, 1, bytes, 0, flush: false);
+#endif
+	}
+
+	/// <summary>
+	/// Flushes the encoder once the input is exhausted and reports how many bytes that emitted.
+	/// </summary>
+	/// <returns>
+	/// The number of bytes produced by the flush, i.e. the bytes for any character the encoder
+	/// was still holding back.
+	/// </returns>
+	private int FlushEncoder()
+	{
+		Debug.Assert(encoder is not null);
+
+		// No new characters are supplied; flush: true makes the encoder emit its pending state.
+		var maxByteCount = encoding.GetMaxByteCount(1);
+#if NETSTANDARD2_1 || NET
+		Span<byte> bytesBuffer = maxByteCount <= 16 ? stackalloc byte[16] : new byte[maxByteCount];
+		return encoder.GetBytes(Array.Empty<char>(), bytesBuffer, flush: true);
+#else
+		byte[] bytes = new byte[maxByteCount];
+		return encoder!.GetBytes(Array.Empty<char>(), 0, 0, bytes, 0, flush: true);
+#endif
+	}
+
 	/// <inheritdoc/>
 	public void Dispose()
 	{
diff --git a/tests/CsvHelper.Tests/Parsing/ByteCountTests.cs b/tests/CsvHelper.Tests/Parsing/ByteCountTests.cs
index 659cba571..907092ab8 100644
--- a/tests/CsvHelper.Tests/Parsing/ByteCountTests.cs
+++ b/tests/CsvHelper.Tests/Parsing/ByteCountTests.cs
@@ -133,5 +133,69 @@ public void Read_Trimmed_WhiteSpaceCorrect()
 			}
 		}
 
+		/// <summary>
+		/// Verifies that ByteCount matches the UTF-8 encoded size of the input, including
+		/// surrogate pairs and a trailing unpaired surrogate.
+		/// </summary>
+		[Theory]
+		[MemberData(nameof(Utf8CharsData))]
+		public void UTF8_ByteCounts(char[] chars, long expectedByteCount)
+		{
+			var config = new CsvConfiguration(CultureInfo.InvariantCulture)
+			{
+				Encoding = Encoding.UTF8,
+				CountBytes = true,
+			};
+			using (var reader = new CharsReader(chars))
+			using (var parser = new CsvParser(reader, config))
+			{
+				// Drain the parser; the encoder is flushed once Read reports no more data.
+				while (parser.Read()) { }
+
+				Assert.Equal(expectedByteCount, parser.ByteCount);
+			}
+		}
+
+		/// <summary>
+		/// Test cases for <see cref="UTF8_ByteCounts"/>: the input characters and the expected UTF-8 byte count.
+		/// </summary>
+		public static IEnumerable<object[]> Utf8CharsData =>
+			new List<object[]>
+			{
+				new object[] { "ABC✋😉👍".ToCharArray(), Encoding.UTF8.GetByteCount("ABC✋😉👍") },
+				new object[] { "𐓏𐓘𐓻𐓘𐓻𐓟 𐒻𐓟".ToCharArray(), Encoding.UTF8.GetByteCount("𐓏𐓘𐓻𐓘𐓻𐓟 𐒻𐓟") },
+				new object[] { new char[] { '\u0232' }, 2 }, // U+0232 (Ȳ - LATIN CAPITAL LETTER Y WITH MACRON)
+				new object[] { new char[] { '\u0985' }, 3 }, // U+0985 (অ - BENGALI LETTER A)
+				new object[] { new char[] { '\ud83d', '\ude17' }, 4 }, // U+1F617 (😗 - KISSING FACE)
+				// The next line tests the encoder is flushed correctly: if the supplied TextReader terminates
+				// on an unpaired (high) surrogate character then only upon flushing the encoder will the
+				// ByteCount be increased, in this case by 3 corresponding to the number of UTF8 bytes
+				// of the replacement character U+FFFD (the default fallback behaviour of the static Encoding.UTF8).
+				new object[] { new char[] { '\ud800' }, 3 },
+			};
+
+		/// <summary>
+		/// Minimal <see cref="TextReader"/> over a char array; only Peek and Read are overridden.
+		/// </summary>
+		private class CharsReader : TextReader
+		{
+			private readonly char[] chars;
+			private int next;
+
+			public CharsReader(char[] chars)
+			{
+				this.chars = chars;
+			}
+
+			public override int Peek()
+			{
+				return next >= chars.Length ? -1 : chars[next];
+			}
+
+			public override int Read()
+			{
+				return next >= chars.Length ? -1 : chars[next++];
+			}
+		}
 	}
 }