From abda326998849e743b6ed18e0e3ee3bdf7210f17 Mon Sep 17 00:00:00 2001 From: Robert Hague Date: Mon, 26 Dec 2022 18:31:41 +0000 Subject: [PATCH 1/2] Use Encoder instead of Encoding in CsvParser This improves the ByteCount logic when consuming surrogate characters because the Encoder maintains state across calls. Previously the default behaviour was that the Encoding would return the byte count having replaced the surrogate character with U+FFFD REPLACEMENT CHARACTER because on its own a surrogate character is invalid. For example, when reading the UTF-16 surrogate pair \uD83D\uDE17 corresponding to the Unicode character U+1F617, the (UTF-8) ByteCount would be 6 (the byte count of the sequence \uFFFD\uFFFD) instead of the correct value of 4 UTF-8 bytes. --- src/CsvHelper/CsvParser.cs | 62 ++++++++++++++++--- .../CsvHelper.Tests/Parsing/ByteCountTests.cs | 53 ++++++++++++++++ 2 files changed, 105 insertions(+), 10 deletions(-) diff --git a/src/CsvHelper/CsvParser.cs b/src/CsvHelper/CsvParser.cs index 187db6317..a6397b8a1 100644 --- a/src/CsvHelper/CsvParser.cs +++ b/src/CsvHelper/CsvParser.cs @@ -11,6 +11,7 @@ using System.IO; using System.Linq; using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; using System.Text; using System.Text.RegularExpressions; using System.Threading.Tasks; @@ -28,7 +29,7 @@ public class CsvParser : IParser, IDisposable private readonly char quote; private readonly char escape; private readonly bool countBytes; - private readonly Encoding encoding; + private readonly Encoder encoder; private readonly bool ignoreBlankLines; private readonly char comment; private readonly bool allowComments; @@ -190,7 +191,8 @@ public CsvParser(TextReader reader, IParserConfiguration configuration, bool lea delimiterFirstChar = configuration.Delimiter[0]; delimiterValues = configuration.DetectDelimiterValues; detectDelimiter = configuration.DetectDelimiter; - encoding = configuration.Encoding; + // encoder only used when counting bytes, so avoid NRE when configuration.Encoding is null + encoder = countBytes ? configuration.Encoding.GetEncoder() : null; escape = configuration.Escape; ignoreBlankLines = configuration.IgnoreBlankLines; isNewLineSet = configuration.IsNewLineSet; @@ -230,7 +232,14 @@ public bool Read() { if (!FillBuffer()) { - return ReadEndOfFile(); + bool haveMoreData = ReadEndOfFile(); + + if (countBytes && !haveMoreData) + { + byteCount += FlushEncoder(encoder); + } + + return haveMoreData; } if (row == 1 && detectDelimiter) @@ -265,7 +274,14 @@ public async Task ReadAsync() { if (!await FillBufferAsync().ConfigureAwait(false)) { - return ReadEndOfFile(); + bool haveMoreData = ReadEndOfFile(); + + if (countBytes && !haveMoreData) + { + byteCount += FlushEncoder(encoder); + } + + return haveMoreData; } if (row == 1 && detectDelimiter) @@ -347,7 +363,7 @@ private ReadLineResult ReadLine(ref char c, ref char cPrev) if (countBytes) { - byteCount += encoding.GetByteCount(new char[] { c }); + byteCount += PushCharToEncoder(encoder, c); } if (maxFieldSize > 0 && bufferPosition - fieldStartPosition - 1 > maxFieldSize) @@ -504,7 +520,7 @@ private ReadLineResult ReadSpaces(ref char c) charCount++; if (countBytes) { - byteCount += encoding.GetByteCount(new char[] { c }); + byteCount += PushCharToEncoder(encoder, c); } } @@ -534,7 +550,7 @@ private ReadLineResult ReadBlankLine(ref char c) charCount++; if (countBytes) { - byteCount += encoding.GetByteCount(new char[] { c }); + byteCount += PushCharToEncoder(encoder, c); } } @@ -565,7 +581,7 @@ private ReadLineResult ReadDelimiter(ref char c) charCount++; if (countBytes) { - byteCount += encoding.GetByteCount(new[] { c }); + byteCount += PushCharToEncoder(encoder, c); } if (bufferPosition >= charsRead) @@ -603,7 +619,7 @@ private ReadLineResult ReadLineEnding(ref char c) charCount++; if (countBytes) { - byteCount += encoding.GetByteCount(new char[] { c }); + byteCount += PushCharToEncoder(encoder, c); } } } @@ -642,7 +658,7 @@ private ReadLineResult ReadNewLine(ref char c) charCount++; if (countBytes) { - byteCount += encoding.GetByteCount(new[] { c }); + byteCount += PushCharToEncoder(encoder, c); } if (bufferPosition >= charsRead) @@ -1085,6 +1101,32 @@ protected ProcessedField ProcessNoEscapeField(int start, int length) return new ProcessedField(newStart, newLength, buffer); } + private static int PushCharToEncoder(Encoder encoder, char c) + { + // We use GetBytes instead of GetByteCount because the former updates the internal state + // of the encoder and the latter doesn't. We use a throwaway buffer for the encoded bytes, + // whose length of 16 *should* comfortably hold the bytes for a single character in a given encoding. +#if NETSTANDARD2_1 || NET + Span bytesBuffer = stackalloc byte[16]; + return encoder.GetBytes(MemoryMarshal.CreateReadOnlySpan(ref c, 1), bytesBuffer, flush: false); +#else + char[] chars = { c }; + byte[] bytes = new byte[16]; + return encoder.GetBytes(chars, 0, 1, bytes, 0, flush: false); +#endif + } + + private static int FlushEncoder(Encoder encoder) + { +#if NETSTANDARD2_1 || NET + Span bytesBuffer = stackalloc byte[16]; + return encoder.GetBytes(Array.Empty(), bytesBuffer, flush: true); +#else + byte[] bytes = new byte[16]; + return encoder.GetBytes(new char[]{ }, 0, 0, bytes, 0, flush: true); +#endif + } + /// public void Dispose() { diff --git a/tests/CsvHelper.Tests/Parsing/ByteCountTests.cs b/tests/CsvHelper.Tests/Parsing/ByteCountTests.cs index dc3bd8da5..79ae6c200 100644 --- a/tests/CsvHelper.Tests/Parsing/ByteCountTests.cs +++ b/tests/CsvHelper.Tests/Parsing/ByteCountTests.cs @@ -133,5 +133,58 @@ public void Read_Trimmed_WhiteSpaceCorrect() } } + [Theory] + [MemberData(nameof(Utf8CharsData))] + public void UTF8_ByteCounts(char[] chars, long expectedByteCount) + { + var config = new CsvConfiguration(CultureInfo.InvariantCulture) + { + Encoding = Encoding.UTF8, + CountBytes = true, + }; + using (var reader = new CharsReader(chars)) + using (var parser = new CsvParser(reader, config)) + { + while (parser.Read()) { } + + Assert.Equal(expectedByteCount, parser.ByteCount); + } + } + + public static IEnumerable Utf8CharsData => + new List + { + new object[] { "ABC✋😉👍".ToCharArray(), Encoding.UTF8.GetByteCount("ABC✋😉👍") }, + new object[] { "𐓏𐓘𐓻𐓘𐓻𐓟 𐒻𐓟".ToCharArray(), Encoding.UTF8.GetByteCount("𐓏𐓘𐓻𐓘𐓻𐓟 𐒻𐓟") }, + new object[] { new char[] { '\u0232' }, 2 }, // U+0232 (Ȳ - LATIN CAPITAL LETTER Y WITH MACRON) + new object[] { new char[] { '\u0985' }, 3 }, // U+0985 (অ - BENGALI LETTER A) + new object[] { new char[] { '\ud83d', '\ude17' }, 4 }, // U+1F617 (😗 - KISSING FACE) + // The next line tests the encoder is flushed correctly: if the supplied TextReader terminates + // on an unpaired (high) surrogate character then only upon flushing the encoder will the + // ByteCount be increased, in this case by 3 corresponding to the number of UTF8 bytes + // of the replacement character U+FFFD (the default fallback behaviour of the static Encoding.UTF8). + new object[] { new char[] { '\ud800' }, 3 }, + }; + + private class CharsReader : TextReader + { + private readonly char[] _chars; + private int idx = -1; + + public CharsReader(char[] chars) + { + _chars = chars; + } + + public override int Peek() + { + return idx + 1 >= _chars.Length ? -1 : _chars[idx + 1]; + } + + public override int Read() + { + return idx + 1 >= _chars.Length ? -1 : _chars[++idx]; + } + } } } From 52c2a9357e3ab9aa85ca24d44fff9314534e8b96 Mon Sep 17 00:00:00 2001 From: Robert Hague Date: Tue, 24 Jan 2023 13:38:49 +0000 Subject: [PATCH 2/2] Be less hopeful about the size of the throwaway bytes buffer An encoder with a custom EncoderReplacementFallback could easily require a larger buffer, for example if its replacement string is "{LONG REPLACEMENT STRING}". So the upper-bound of 16 bytes is not correct. Note that in this case CsvParser.ByteCount would be nonsensical for input requiring the fallback, and configuration.Encoding probably does not match the actual encoding, e.g we are reading from a UTF-16 byte stream containing non-ASCII characters and we have set configuration.Encoding to ASCII. Nevertheless the safeguard is more sensible to have than not. --- src/CsvHelper/CsvParser.cs | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/src/CsvHelper/CsvParser.cs b/src/CsvHelper/CsvParser.cs index a6397b8a1..5b120e03c 100644 --- a/src/CsvHelper/CsvParser.cs +++ b/src/CsvHelper/CsvParser.cs @@ -29,6 +29,7 @@ public class CsvParser : IParser, IDisposable private readonly char quote; private readonly char escape; private readonly bool countBytes; + private readonly Encoding encoding; private readonly Encoder encoder; private readonly bool ignoreBlankLines; private readonly char comment; @@ -191,8 +192,9 @@ public CsvParser(TextReader reader, IParserConfiguration configuration, bool lea delimiterFirstChar = configuration.Delimiter[0]; delimiterValues = configuration.DetectDelimiterValues; detectDelimiter = configuration.DetectDelimiter; - // encoder only used when counting bytes, so avoid NRE when configuration.Encoding is null - encoder = countBytes ? configuration.Encoding.GetEncoder() : null; + encoding = configuration.Encoding; + // encoder only used when counting bytes, so avoid NRE when configuration.Encoding is null + encoder = countBytes ? encoding.GetEncoder() : null; escape = configuration.Escape; ignoreBlankLines = configuration.IgnoreBlankLines; isNewLineSet = configuration.IsNewLineSet; @@ -236,7 +238,7 @@ public bool Read() if (countBytes && !haveMoreData) { - byteCount += FlushEncoder(encoder); + byteCount += FlushEncoder(); } return haveMoreData; @@ -278,7 +280,7 @@ public async Task ReadAsync() if (countBytes && !haveMoreData) { - byteCount += FlushEncoder(encoder); + byteCount += FlushEncoder(); } return haveMoreData; @@ -363,7 +365,7 @@ private ReadLineResult ReadLine(ref char c, ref char cPrev) if (countBytes) { - byteCount += PushCharToEncoder(encoder, c); + byteCount += PushCharToEncoder(c); } if (maxFieldSize > 0 && bufferPosition - fieldStartPosition - 1 > maxFieldSize) @@ -520,7 +522,7 @@ private ReadLineResult ReadSpaces(ref char c) charCount++; if (countBytes) { - byteCount += PushCharToEncoder(encoder, c); + byteCount += PushCharToEncoder(c); } } @@ -550,7 +552,7 @@ private ReadLineResult ReadBlankLine(ref char c) charCount++; if (countBytes) { - byteCount += PushCharToEncoder(encoder, c); + byteCount += PushCharToEncoder(c); } } @@ -581,7 +583,7 @@ private ReadLineResult ReadDelimiter(ref char c) charCount++; if (countBytes) { - byteCount += PushCharToEncoder(encoder, c); + byteCount += PushCharToEncoder(c); } if (bufferPosition >= charsRead) @@ -619,7 +621,7 @@ private ReadLineResult ReadLineEnding(ref char c) charCount++; if (countBytes) { - byteCount += PushCharToEncoder(encoder, c); + byteCount += PushCharToEncoder(c); } } } @@ -658,7 +660,7 @@ private ReadLineResult ReadNewLine(ref char c) charCount++; if (countBytes) { - byteCount += PushCharToEncoder(encoder, c); + byteCount += PushCharToEncoder(c); } if (bufferPosition >= charsRead) @@ -1101,28 +1103,27 @@ protected ProcessedField ProcessNoEscapeField(int start, int length) return new ProcessedField(newStart, newLength, buffer); } - private static int PushCharToEncoder(Encoder encoder, char c) + private int PushCharToEncoder(char c) { // We use GetBytes instead of GetByteCount because the former updates the internal state - // of the encoder and the latter doesn't. We use a throwaway buffer for the encoded bytes, - // whose length of 16 *should* comfortably hold the bytes for a single character in a given encoding. + // of the encoder and the latter doesn't. We use a throwaway buffer for the encoded bytes. #if NETSTANDARD2_1 || NET - Span bytesBuffer = stackalloc byte[16]; + Span bytesBuffer = encoding.GetMaxByteCount(1) <= 16 ? stackalloc byte[16] : new byte[encoding.GetMaxByteCount(1)]; return encoder.GetBytes(MemoryMarshal.CreateReadOnlySpan(ref c, 1), bytesBuffer, flush: false); #else char[] chars = { c }; - byte[] bytes = new byte[16]; + byte[] bytes = new byte[encoding.GetMaxByteCount(1)]; return encoder.GetBytes(chars, 0, 1, bytes, 0, flush: false); #endif } - private static int FlushEncoder(Encoder encoder) + private int FlushEncoder() { #if NETSTANDARD2_1 || NET - Span bytesBuffer = stackalloc byte[16]; + Span bytesBuffer = encoding.GetMaxByteCount(1) <= 16 ? stackalloc byte[16] : new byte[encoding.GetMaxByteCount(1)]; return encoder.GetBytes(Array.Empty(), bytesBuffer, flush: true); #else - byte[] bytes = new byte[16]; + byte[] bytes = new byte[encoding.GetMaxByteCount(1)]; return encoder.GetBytes(new char[]{ }, 0, 0, bytes, 0, flush: true); #endif }