From abda326998849e743b6ed18e0e3ee3bdf7210f17 Mon Sep 17 00:00:00 2001
From: Robert Hague <rh@johnstreetcapital.com>
Date: Mon, 26 Dec 2022 18:31:41 +0000
Subject: [PATCH 1/2] Use Encoder instead of Encoding in CsvParser

This improves the ByteCount logic when consuming surrogate characters because
the Encoder maintains state across calls. Previously the default behaviour was
that the Encoding would return the byte count having replaced the surrogate
character with U+FFFD REPLACEMENT CHARACTER because on its own a surrogate
character is invalid.

For example, when reading the UTF-16 surrogate pair \uD83D\uDE17 corresponding
to the Unicode character U+1F617, the (UTF-8) ByteCount would be 6 (the byte
count of the sequence \uFFFD\uFFFD) instead of the correct value of 4 UTF-8
bytes.
---
 src/CsvHelper/CsvParser.cs                    | 62 ++++++++++++++++---
 .../CsvHelper.Tests/Parsing/ByteCountTests.cs | 53 ++++++++++++++++
 2 files changed, 105 insertions(+), 10 deletions(-)
diff --git a/src/CsvHelper/CsvParser.cs b/src/CsvHelper/CsvParser.cs
index 187db6317..a6397b8a1 100644
--- a/src/CsvHelper/CsvParser.cs
+++ b/src/CsvHelper/CsvParser.cs
@@ -11,6 +11,7 @@
 using System.IO;
 using System.Linq;
 using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
 using System.Text;
 using System.Text.RegularExpressions;
 using System.Threading.Tasks;
@@ -28,7 +29,7 @@ public class CsvParser : IParser, IDisposable
 		private readonly char quote;
 		private readonly char escape;
 		private readonly bool countBytes;
-		private readonly Encoding encoding;
+		private readonly Encoder encoder;
 		private readonly bool ignoreBlankLines;
 		private readonly char comment;
 		private readonly bool allowComments;
@@ -190,7 +191,8 @@ public CsvParser(TextReader reader, IParserConfiguration configuration, bool lea
 			delimiterFirstChar = configuration.Delimiter[0];
 			delimiterValues = configuration.DetectDelimiterValues;
 			detectDelimiter = configuration.DetectDelimiter;
-			encoding = configuration.Encoding;
+			// encoder only used when counting bytes, so avoid NRE when configuration.Encoding is null 
+			encoder = countBytes ? configuration.Encoding.GetEncoder() : null;
 			escape = configuration.Escape;
 			ignoreBlankLines = configuration.IgnoreBlankLines;
 			isNewLineSet = configuration.IsNewLineSet;
@@ -230,7 +232,14 @@ public bool Read()
 				{
 					if (!FillBuffer())
 					{
-						return ReadEndOfFile();
+						bool haveMoreData = ReadEndOfFile();
+
+						if (countBytes && !haveMoreData)
+						{
+							byteCount += FlushEncoder(encoder);
+						}
+
+						return haveMoreData;
 					}
 
 					if (row == 1 && detectDelimiter)
@@ -265,7 +274,14 @@ public async Task<bool> ReadAsync()
 				{
 					if (!await FillBufferAsync().ConfigureAwait(false))
 					{
-						return ReadEndOfFile();
+						bool haveMoreData = ReadEndOfFile();
+
+						if (countBytes && !haveMoreData)
+						{
+							byteCount += FlushEncoder(encoder);
+						}
+
+						return haveMoreData;
 					}
 
 					if (row == 1 && detectDelimiter)
@@ -347,7 +363,7 @@ private ReadLineResult ReadLine(ref char c, ref char cPrev)
 
 				if (countBytes)
 				{
-					byteCount += encoding.GetByteCount(new char[] { c });
+					byteCount += PushCharToEncoder(encoder, c);
 				}
 
 				if (maxFieldSize > 0 && bufferPosition - fieldStartPosition - 1 > maxFieldSize)
@@ -504,7 +520,7 @@ private ReadLineResult ReadSpaces(ref char c)
 				charCount++;
 				if (countBytes)
 				{
-					byteCount += encoding.GetByteCount(new char[] { c });
+					byteCount += PushCharToEncoder(encoder, c);
 				}
 			}
 
@@ -534,7 +550,7 @@ private ReadLineResult ReadBlankLine(ref char c)
 				charCount++;
 				if (countBytes)
 				{
-					byteCount += encoding.GetByteCount(new char[] { c });
+					byteCount += PushCharToEncoder(encoder, c);
 				}
 			}
 
@@ -565,7 +581,7 @@ private ReadLineResult ReadDelimiter(ref char c)
 				charCount++;
 				if (countBytes)
 				{
-					byteCount += encoding.GetByteCount(new[] { c });
+					byteCount += PushCharToEncoder(encoder, c);
 				}
 
 				if (bufferPosition >= charsRead)
@@ -603,7 +619,7 @@ private ReadLineResult ReadLineEnding(ref char c)
 					charCount++;
 					if (countBytes)
 					{
-						byteCount += encoding.GetByteCount(new char[] { c });
+						byteCount += PushCharToEncoder(encoder, c);
 					}
 				}
 			}
@@ -642,7 +658,7 @@ private ReadLineResult ReadNewLine(ref char c)
 				charCount++;
 				if (countBytes)
 				{
-					byteCount += encoding.GetByteCount(new[] { c });
+					byteCount += PushCharToEncoder(encoder, c);
 				}
 
 				if (bufferPosition >= charsRead)
@@ -1085,6 +1101,32 @@ protected ProcessedField ProcessNoEscapeField(int start, int length)
 			return new ProcessedField(newStart, newLength, buffer);
 		}
 
+		private static int PushCharToEncoder(Encoder encoder, char c)
+		{
+			// We use GetBytes instead of GetByteCount because the former updates the internal state
+			// of the encoder and the latter doesn't. We use a throwaway buffer for the encoded bytes,
+			// whose length of 16 *should* comfortably hold the bytes for a single character in a given encoding.
+#if NETSTANDARD2_1 || NET
+			Span<byte> bytesBuffer = stackalloc byte[16];
+			return encoder.GetBytes(MemoryMarshal.CreateReadOnlySpan(ref c, 1), bytesBuffer, flush: false);
+#else
+			char[] chars = { c };
+			byte[] bytes = new byte[16];
+			return encoder.GetBytes(chars, 0, 1, bytes, 0, flush: false);
+#endif
+		}
+
+		private static int FlushEncoder(Encoder encoder)
+		{
+#if NETSTANDARD2_1 || NET
+			Span<byte> bytesBuffer = stackalloc byte[16];
+			return encoder.GetBytes(Array.Empty<char>(), bytesBuffer, flush: true);
+#else
+			byte[] bytes = new byte[16];
+			return encoder.GetBytes(new char[]{ }, 0, 0, bytes, 0, flush: true);
+#endif
+		}
+
 		/// <inheritdoc/>
 		public void Dispose()
 		{
diff --git a/tests/CsvHelper.Tests/Parsing/ByteCountTests.cs b/tests/CsvHelper.Tests/Parsing/ByteCountTests.cs
index dc3bd8da5..79ae6c200 100644
--- a/tests/CsvHelper.Tests/Parsing/ByteCountTests.cs
+++ b/tests/CsvHelper.Tests/Parsing/ByteCountTests.cs
@@ -133,5 +133,58 @@ public void Read_Trimmed_WhiteSpaceCorrect()
 			}
 		}
 
+		[Theory]
+		[MemberData(nameof(Utf8CharsData))]
+		public void UTF8_ByteCounts(char[] chars, long expectedByteCount)
+		{
+			var config = new CsvConfiguration(CultureInfo.InvariantCulture)
+			{
+				Encoding = Encoding.UTF8,
+				CountBytes = true,
+			};
+			using (var reader = new CharsReader(chars))
+			using (var parser = new CsvParser(reader, config))
+			{
+				while (parser.Read()) { }
+
+				Assert.Equal(expectedByteCount, parser.ByteCount);
+			}
+		}
+
+		public static IEnumerable<object[]> Utf8CharsData =>
+		   new List<object[]>
+		   {
+				new object[] { "ABC✋😉👍".ToCharArray(), Encoding.UTF8.GetByteCount("ABC✋😉👍") },
+				new object[] { "𐓏𐓘𐓻𐓘𐓻𐓟 𐒻𐓟".ToCharArray(), Encoding.UTF8.GetByteCount("𐓏𐓘𐓻𐓘𐓻𐓟 𐒻𐓟") },
+				new object[] { new char[] { '\u0232' }, 2 }, // U+0232 (Ȳ - LATIN CAPITAL LETTER Y WITH MACRON)
+				new object[] { new char[] { '\u0985' }, 3 }, // U+0985 (অ - BENGALI LETTER A)
+				new object[] { new char[] { '\ud83d', '\ude17' }, 4 }, // U+1F617 (😗 - KISSING FACE)
+				// The next line tests the encoder is flushed correctly: if the supplied TextReader terminates
+				// on an unpaired (high) surrogate character then only upon flushing the encoder will the
+				// ByteCount be increased, in this case by 3 corresponding to the number of UTF8 bytes
+				// of the replacement character U+FFFD (the default fallback behaviour of the static Encoding.UTF8).
+				new object[] { new char[] { '\ud800' }, 3 },
+		   };
+
+		private class CharsReader : TextReader
+		{
+			private readonly char[] _chars;
+			private int idx = -1;
+
+			public CharsReader(char[] chars)
+			{
+				_chars = chars;
+			}
+
+			public override int Peek()
+			{
+				return idx + 1 >= _chars.Length ? -1 : _chars[idx + 1];
+			}
+
+			public override int Read()
+			{
+				return idx + 1 >= _chars.Length ? -1 : _chars[++idx];
+			}
+		}
 	}
 }

From 52c2a9357e3ab9aa85ca24d44fff9314534e8b96 Mon Sep 17 00:00:00 2001
From: Robert Hague <rh@johnstreetcapital.com>
Date: Tue, 24 Jan 2023 13:38:49 +0000
Subject: [PATCH 2/2] Be less hopeful about the size of the throwaway bytes
 buffer

An encoder with a custom EncoderReplacementFallback could easily require
a larger buffer, for example if its replacement string is "{LONG REPLACEMENT STRING}".
So the upper-bound of 16 bytes is not correct. Note that in this case
CsvParser.ByteCount would be nonsensical for input requiring the fallback, and
configuration.Encoding probably does not match the actual encoding, e.g we are reading
from a UTF-16 byte stream containing non-ASCII characters and we have set
configuration.Encoding to ASCII.

Nevertheless the safeguard is more sensible to have than not.
---
 src/CsvHelper/CsvParser.cs | 37 +++++++++++++++++++------------------
 1 file changed, 19 insertions(+), 18 deletions(-)

diff --git a/src/CsvHelper/CsvParser.cs b/src/CsvHelper/CsvParser.cs
index a6397b8a1..5b120e03c 100644
--- a/src/CsvHelper/CsvParser.cs
+++ b/src/CsvHelper/CsvParser.cs
@@ -29,6 +29,7 @@ public class CsvParser : IParser, IDisposable
 		private readonly char quote;
 		private readonly char escape;
 		private readonly bool countBytes;
+		private readonly Encoding encoding;
 		private readonly Encoder encoder;
 		private readonly bool ignoreBlankLines;
 		private readonly char comment;
@@ -191,8 +192,9 @@ public CsvParser(TextReader reader, IParserConfiguration configuration, bool lea
 			delimiterFirstChar = configuration.Delimiter[0];
 			delimiterValues = configuration.DetectDelimiterValues;
 			detectDelimiter = configuration.DetectDelimiter;
-			// encoder only used when counting bytes, so avoid NRE when configuration.Encoding is null 
-			encoder = countBytes ? configuration.Encoding.GetEncoder() : null;
+			encoding = configuration.Encoding;
+			// encoder only used when counting bytes, so avoid NRE when configuration.Encoding is null
+			encoder = countBytes ? encoding.GetEncoder() : null;
 			escape = configuration.Escape;
 			ignoreBlankLines = configuration.IgnoreBlankLines;
 			isNewLineSet = configuration.IsNewLineSet;
@@ -236,7 +238,7 @@ public bool Read()
 
 						if (countBytes && !haveMoreData)
 						{
-							byteCount += FlushEncoder(encoder);
+							byteCount += FlushEncoder();
 						}
 
 						return haveMoreData;
@@ -278,7 +280,7 @@ public async Task<bool> ReadAsync()
 
 						if (countBytes && !haveMoreData)
 						{
-							byteCount += FlushEncoder(encoder);
+							byteCount += FlushEncoder();
 						}
 
 						return haveMoreData;
@@ -363,7 +365,7 @@ private ReadLineResult ReadLine(ref char c, ref char cPrev)
 
 				if (countBytes)
 				{
-					byteCount += PushCharToEncoder(encoder, c);
+					byteCount += PushCharToEncoder(c);
 				}
 
 				if (maxFieldSize > 0 && bufferPosition - fieldStartPosition - 1 > maxFieldSize)
@@ -520,7 +522,7 @@ private ReadLineResult ReadSpaces(ref char c)
 				charCount++;
 				if (countBytes)
 				{
-					byteCount += PushCharToEncoder(encoder, c);
+					byteCount += PushCharToEncoder(c);
 				}
 			}
 
@@ -550,7 +552,7 @@ private ReadLineResult ReadBlankLine(ref char c)
 				charCount++;
 				if (countBytes)
 				{
-					byteCount += PushCharToEncoder(encoder, c);
+					byteCount += PushCharToEncoder(c);
 				}
 			}
 
@@ -581,7 +583,7 @@ private ReadLineResult ReadDelimiter(ref char c)
 				charCount++;
 				if (countBytes)
 				{
-					byteCount += PushCharToEncoder(encoder, c);
+					byteCount += PushCharToEncoder(c);
 				}
 
 				if (bufferPosition >= charsRead)
@@ -619,7 +621,7 @@ private ReadLineResult ReadLineEnding(ref char c)
 					charCount++;
 					if (countBytes)
 					{
-						byteCount += PushCharToEncoder(encoder, c);
+						byteCount += PushCharToEncoder(c);
 					}
 				}
 			}
@@ -658,7 +660,7 @@ private ReadLineResult ReadNewLine(ref char c)
 				charCount++;
 				if (countBytes)
 				{
-					byteCount += PushCharToEncoder(encoder, c);
+					byteCount += PushCharToEncoder(c);
 				}
 
 				if (bufferPosition >= charsRead)
@@ -1101,28 +1103,27 @@ protected ProcessedField ProcessNoEscapeField(int start, int length)
 			return new ProcessedField(newStart, newLength, buffer);
 		}
 
-		private static int PushCharToEncoder(Encoder encoder, char c)
+		private int PushCharToEncoder(char c)
 		{
 			// We use GetBytes instead of GetByteCount because the former updates the internal state
-			// of the encoder and the latter doesn't. We use a throwaway buffer for the encoded bytes,
-			// whose length of 16 *should* comfortably hold the bytes for a single character in a given encoding.
+			// of the encoder and the latter doesn't. We use a throwaway buffer for the encoded bytes.
 #if NETSTANDARD2_1 || NET
-			Span<byte> bytesBuffer = stackalloc byte[16];
+			Span<byte> bytesBuffer = encoding.GetMaxByteCount(1) <= 16 ? stackalloc byte[16] : new byte[encoding.GetMaxByteCount(1)];
 			return encoder.GetBytes(MemoryMarshal.CreateReadOnlySpan(ref c, 1), bytesBuffer, flush: false);
 #else
 			char[] chars = { c };
-			byte[] bytes = new byte[16];
+			byte[] bytes = new byte[encoding.GetMaxByteCount(1)];
 			return encoder.GetBytes(chars, 0, 1, bytes, 0, flush: false);
 #endif
 		}
 
-		private static int FlushEncoder(Encoder encoder)
+		private int FlushEncoder()
 		{
 #if NETSTANDARD2_1 || NET
-			Span<byte> bytesBuffer = stackalloc byte[16];
+			Span<byte> bytesBuffer = encoding.GetMaxByteCount(1) <= 16 ? stackalloc byte[16] : new byte[encoding.GetMaxByteCount(1)];
 			return encoder.GetBytes(Array.Empty<char>(), bytesBuffer, flush: true);
 #else
-			byte[] bytes = new byte[16];
+			byte[] bytes = new byte[encoding.GetMaxByteCount(1)];
 			return encoder.GetBytes(new char[]{ }, 0, 0, bytes, 0, flush: true);
 #endif
 		}