JoshClose · Rob-Hague · Dec 26, 2022 · Jan 24, 2023 · Feb 24, 2025
diff --git a/src/CsvHelper/CsvParser.cs b/src/CsvHelper/CsvParser.cs
@@ -6,6 +6,7 @@
 using CsvHelper.Delegates;
 using System.Diagnostics;
 using System.Globalization;
+using System.Runtime.InteropServices;
 using System.Text;
 
 namespace CsvHelper;
@@ -22,6 +23,7 @@ public class CsvParser : IParser, IDisposable
 	private readonly char escape;
 	private readonly bool countBytes;
 	private readonly Encoding encoding;
+	private readonly Encoder? encoder;
 	private readonly bool ignoreBlankLines;
 	private readonly char comment;
 	private readonly bool allowComments;
@@ -184,6 +186,8 @@ public CsvParser(TextReader reader, IParserConfiguration configuration, bool lea
 		delimiterValues = configuration.DetectDelimiterValues;
 		detectDelimiter = configuration.DetectDelimiter;
 		encoding = configuration.Encoding;
+		// encoder only used when counting bytes, so avoid NRE when configuration.Encoding is null
+		encoder = countBytes ? encoding.GetEncoder() : null;
 		escape = configuration.Escape;
 		ignoreBlankLines = configuration.IgnoreBlankLines;
 		isNewLineSet = configuration.IsNewLineSet;
@@ -223,7 +227,14 @@ public bool Read()
 			{
 				if (!FillBuffer())
 				{
-					return ReadEndOfFile();
+					bool haveMoreData = ReadEndOfFile();
+
+					if (countBytes && !haveMoreData)
+					{
+						byteCount += FlushEncoder();
+					}
+
+					return haveMoreData;
 				}
 
 				if (row == 1 && detectDelimiter)
@@ -258,7 +269,14 @@ public async Task<bool> ReadAsync()
 			{
 				if (!await FillBufferAsync().ConfigureAwait(false))
 				{
-					return ReadEndOfFile();
+					bool haveMoreData = ReadEndOfFile();
+
+					if (countBytes && !haveMoreData)
+					{
+						byteCount += FlushEncoder();
+					}
+
+					return haveMoreData;
 				}
 
 				if (row == 1 && detectDelimiter)
@@ -340,7 +358,7 @@ private ReadLineResult ReadLine(ref char c, ref char cPrev)
 
 			if (countBytes)
 			{
-				byteCount += encoding.GetByteCount(new char[] { c });
+				byteCount += PushCharToEncoder(c);
 			}
 
 			if (maxFieldSize > 0 && bufferPosition - fieldStartPosition - 1 > maxFieldSize)
@@ -519,7 +537,7 @@ private ReadLineResult ReadSpaces(ref char c)
 			charCount++;
 			if (countBytes)
 			{
-				byteCount += encoding.GetByteCount(new char[] { c });
+				byteCount += PushCharToEncoder(c);
 			}
 		}
 
@@ -549,7 +567,7 @@ private ReadLineResult ReadBlankLine(ref char c)
 			charCount++;
 			if (countBytes)
 			{
-				byteCount += encoding.GetByteCount(new char[] { c });
+				byteCount += PushCharToEncoder(c);
 			}
 		}
 
@@ -580,7 +598,7 @@ private ReadLineResult ReadDelimiter(ref char c)
 			charCount++;
 			if (countBytes)
 			{
-				byteCount += encoding.GetByteCount(new[] { c });
+				byteCount += PushCharToEncoder(c);
 			}
 
 			if (bufferPosition >= charsRead)
@@ -618,7 +636,7 @@ private ReadLineResult ReadLineEnding(ref char c)
 				charCount++;
 				if (countBytes)
 				{
-					byteCount += encoding.GetByteCount(new char[] { c });
+					byteCount += PushCharToEncoder(c);
 				}
 			}
 		}
@@ -657,7 +675,7 @@ private ReadLineResult ReadNewLine(ref char c)
 			charCount++;
 			if (countBytes)
 			{
-				byteCount += encoding.GetByteCount(new[] { c });
+				byteCount += PushCharToEncoder(c);
 			}
 
 			if (bufferPosition >= charsRead)
@@ -1089,6 +1107,34 @@ protected ProcessedField ProcessNoEscapeField(int start, int length)
 		return new ProcessedField(newStart, newLength, buffer);
 	}
 
+	private int PushCharToEncoder(char c)
+	{
+		// Runs one char through the stateful encoder and returns the number of bytes it produced.
+		// GetBytes is used rather than GetByteCount because only GetBytes updates the encoder's
+		// internal state; the encoded bytes land in a scratch buffer and are discarded.
+		Debug.Assert(encoder is not null);
+		int maxBytes = encoding.GetMaxByteCount(1);
+#if NETSTANDARD2_1 || NET
+		Span<byte> scratch = maxBytes <= 16 ? stackalloc byte[16] : new byte[maxBytes];
+		return encoder.GetBytes([c], scratch, flush: false);
+#else
+		return encoder!.GetBytes([c], 0, 1, new byte[maxBytes], 0, flush: false);
+#endif
+	}
+
+	private int FlushEncoder()
+	{
+		// Flushes the encoder with no new input and returns the number of bytes that produced.
+		Debug.Assert(encoder is not null);
+		int maxBytes = encoding.GetMaxByteCount(1);
+#if NETSTANDARD2_1 || NET
+		Span<byte> scratch = maxBytes <= 16 ? stackalloc byte[16] : new byte[maxBytes];
+		return encoder.GetBytes(Array.Empty<char>(), scratch, flush: true);
+#else
+		return encoder!.GetBytes(Array.Empty<char>(), 0, 0, new byte[maxBytes], 0, flush: true);
+#endif
+	}
+
 	/// <inheritdoc/>
 	public void Dispose()
 	{

diff --git a/tests/CsvHelper.Tests/Parsing/ByteCountTests.cs b/tests/CsvHelper.Tests/Parsing/ByteCountTests.cs
@@ -133,5 +133,58 @@ public void Read_Trimmed_WhiteSpaceCorrect()
 			}
 		}
 
+		[Theory]
+		[MemberData(nameof(Utf8CharsData))]
+		public void UTF8_ByteCounts(char[] chars, long expectedByteCount)
+		{
+			// Parser configured to count bytes as UTF-8 over a reader backed by the supplied chars.
+			var config = new CsvConfiguration(CultureInfo.InvariantCulture) { Encoding = Encoding.UTF8, CountBytes = true };
+			using (CharsReader reader = new CharsReader(chars))
+			using (CsvParser parser = new CsvParser(reader, config))
+			{
+				// Drain the parser; ByteCount is only compared once Read reports no more data.
+				while (parser.Read())
+				{
+				}
+
+				Assert.Equal(expectedByteCount, parser.ByteCount);
+			}
+		}
+
+		public static IEnumerable<object[]> Utf8CharsData => new[]
+		{
+			// Each row is { input chars, expected byte count }; the string rows take their expected count from Encoding.UTF8.
+			new object[] { "ABC✋😉👍".ToCharArray(), Encoding.UTF8.GetByteCount("ABC✋😉👍") },
+			new object[] { "𐓏𐓘𐓻𐓘𐓻𐓟 𐒻𐓟".ToCharArray(), Encoding.UTF8.GetByteCount("𐓏𐓘𐓻𐓘𐓻𐓟 𐒻𐓟") },
+			new object[] { new[] { '\u0232' }, 2 }, // U+0232 (Ȳ - LATIN CAPITAL LETTER Y WITH MACRON)
+			new object[] { new[] { '\u0985' }, 3 }, // U+0985 (অ - BENGALI LETTER A)
+			new object[] { new[] { '\ud83d', '\ude17' }, 4 }, // U+1F617 (😗 - KISSING FACE)
+			// The last row checks that the encoder is flushed: when the supplied TextReader ends on an
+			// unpaired (high) surrogate, ByteCount only grows once the encoder is flushed. The increase
+			// is 3, the number of UTF-8 bytes of the replacement character U+FFFD (the default fallback
+			// behaviour of the static Encoding.UTF8).
+			new object[] { new[] { '\ud800' }, 3 },
+		};
+
+		private class CharsReader : TextReader
+		{
+			// TextReader over a fixed char array; only Peek() and Read() are overridden.
+			private readonly char[] _chars;
+			private int _next; // index of the next char to hand out
+
+			public CharsReader(char[] chars) => _chars = chars;
+
+			// Returns the next char without consuming it, or -1 when the array is exhausted.
+			public override int Peek()
+			{
+				return _next < _chars.Length ? _chars[_next] : -1;
+			}
+
+			// Returns the next char and advances past it, or -1 when the array is exhausted.
+			public override int Read()
+			{
+				return _next < _chars.Length ? _chars[_next++] : -1;
+			}
+		}
 	}
 }