Merge pull request #198 from mayawarrier/main

Add opt-in SIMD support for char16_t
fastfloat · Jun 9, 2023 · 8139e16 · 8139e16
2 parents 127a6c7 + b711947
commit 8139e16
Show file tree

Hide file tree

Showing 6 changed files with 173 additions and 61 deletions.
diff --git a/.gitignore b/.gitignore
@@ -3,10 +3,11 @@ Testing/*
 .cache/
 compile_commands.json
 
-# Visual Studio
+# Visual studio
 .vs/
 Debug/
 Release/
+/out/
 *.sln
 *.vcxproj
 *.vcxproj.filters

diff --git a/CONTRIBUTORS b/CONTRIBUTORS
@@ -5,4 +5,5 @@ Neal Richardson
 Tim Paine
 Fabio Pellacini
 Lénárd Szolnoki
-Jan Pharago
+Jan Pharago
+Maya Warrier
diff --git a/include/fast_float/ascii_number.h b/include/fast_float/ascii_number.h
@@ -5,11 +5,26 @@
 #include <cstdint>
 #include <cstring>
 #include <iterator>
+#include <type_traits>
 
 #include "float_common.h"
 
+#ifdef FASTFLOAT_SSE2
+#include <emmintrin.h>
+#endif
+
+
 namespace fast_float {
 
+template <typename UC>
+fastfloat_really_inline constexpr bool has_simd_opt() {
+#ifdef FASTFLOAT_HAS_SIMD
+  return std::is_same<UC, char16_t>::value;
+#else
+  return false;
+#endif
+}
+
 // Next function can be micro-optimized, but compilers are entirely
 // able to optimize it well.
 template <typename UC>
@@ -28,12 +43,14 @@ fastfloat_really_inline constexpr uint64_t byteswap(uint64_t val) {
     | (val & 0x00000000000000FF) << 56;
 }
 
+// Reads 8 code units of type UC into a uint64_t, one per byte. When UC is not char, each unit is truncated to its low 8 bits.
+template <typename UC>
 fastfloat_really_inline FASTFLOAT_CONSTEXPR20
-uint64_t read_u64(const char *chars) {
-  if (cpp20_and_in_constexpr()) {
+uint64_t read8_to_u64(const UC *chars) {
+  if (cpp20_and_in_constexpr() || !std::is_same<UC, char>::value) {
     uint64_t val = 0;
     for(int i = 0; i < 8; ++i) {
-      val |= uint64_t(*chars) << (i*8);
+      val |= uint64_t(uint8_t(*chars)) << (i*8);
       ++chars;
     }
     return val;
@@ -47,6 +64,39 @@ uint64_t read_u64(const char *chars) {
   return val;
 }
 
+#ifdef FASTFLOAT_SSE2
+
+fastfloat_really_inline
+uint64_t simd_read8_to_u64(const __m128i data) {
+FASTFLOAT_SIMD_DISABLE_WARNINGS
+  const __m128i packed = _mm_packus_epi16(data, data);
+#ifdef FASTFLOAT_64BIT
+  return uint64_t(_mm_cvtsi128_si64(packed));
+#else
+  uint64_t value;
+  // Visual Studio + older versions of GCC don't support _mm_storeu_si64
+  _mm_storel_epi64(reinterpret_cast<__m128i*>(&value), packed);
+  return value;
+#endif
+FASTFLOAT_SIMD_RESTORE_WARNINGS
+}
+
+fastfloat_really_inline
+uint64_t simd_read8_to_u64(const char16_t* chars) {
+FASTFLOAT_SIMD_DISABLE_WARNINGS
+  return simd_read8_to_u64(_mm_loadu_si128(reinterpret_cast<const __m128i*>(chars)));
+FASTFLOAT_SIMD_RESTORE_WARNINGS
+}
+
+#endif
+
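+// Dummy overload so that generic callers compile for UC types without a SIMD path; parse_eight_digits_unrolled() branches away before reaching it.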
+template <typename UC, FASTFLOAT_ENABLE_IF(!has_simd_opt<UC>())>
+uint64_t simd_read8_to_u64(UC const*) {
+  return 0;
+}
+
+
 fastfloat_really_inline FASTFLOAT_CONSTEXPR20
 void write_u64(uint8_t *chars, uint64_t val) {
   if (cpp20_and_in_constexpr()) {
@@ -76,40 +126,80 @@ uint32_t parse_eight_digits_unrolled(uint64_t val) {
   return uint32_t(val);
 }
 
-fastfloat_really_inline constexpr
-uint32_t parse_eight_digits_unrolled(const char16_t *)  noexcept  {
-  return 0;
-}
-
-fastfloat_really_inline constexpr
-uint32_t parse_eight_digits_unrolled(const char32_t *)  noexcept  {
-  return 0;
-}
 
+// Call this if chars are definitely 8 digits.
+template <typename UC>
 fastfloat_really_inline FASTFLOAT_CONSTEXPR20
-uint32_t parse_eight_digits_unrolled(const char *chars)  noexcept  {
-  return parse_eight_digits_unrolled(read_u64(chars));
+uint32_t parse_eight_digits_unrolled(UC const * chars)  noexcept {
+  if (cpp20_and_in_constexpr() || !has_simd_opt<UC>()) {
+    return parse_eight_digits_unrolled(read8_to_u64(chars)); // truncation okay
+  }
+  return parse_eight_digits_unrolled(simd_read8_to_u64(chars));
 }
 
+
 // credit @aqrit
-fastfloat_really_inline constexpr bool is_made_of_eight_digits_fast(uint64_t val)  noexcept  {
+fastfloat_really_inline constexpr bool is_made_of_eight_digits_fast(uint64_t val)  noexcept {
   return !((((val + 0x4646464646464646) | (val - 0x3030303030303030)) &
      0x8080808080808080));
 }
 
-fastfloat_really_inline constexpr
-bool is_made_of_eight_digits_fast(const char16_t *)  noexcept  {
-  return false;
+
+#ifdef FASTFLOAT_HAS_SIMD
+
+// Call this if chars might not be 8 digits.
+// Using this style (instead of is_made_of_eight_digits_fast() then parse_eight_digits_unrolled())
+// ensures we don't load SIMD registers twice.
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20
+bool simd_parse_if_eight_digits_unrolled(const char16_t* chars, uint64_t& i) noexcept {
+  if (cpp20_and_in_constexpr()) {
+    return false;
+  }   
+#ifdef FASTFLOAT_SSE2
+FASTFLOAT_SIMD_DISABLE_WARNINGS
+  const __m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i*>(chars));
+
+  // (x - '0') <= 9
+  // http://0x80.pl/articles/simd-parsing-int-sequences.html
+  const __m128i t0 = _mm_add_epi16(data, _mm_set1_epi16(32720));
+  const __m128i t1 = _mm_cmpgt_epi16(t0, _mm_set1_epi16(-32759));
+
+  if (_mm_movemask_epi8(t1) == 0) {
+    i = i * 100000000 + parse_eight_digits_unrolled(simd_read8_to_u64(data));
+    return true;
+  }
+  else return false;
+FASTFLOAT_SIMD_RESTORE_WARNINGS
+#endif
 }
 
-fastfloat_really_inline constexpr
-bool is_made_of_eight_digits_fast(const char32_t *)  noexcept  {
-  return false;
+#endif
+
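+// Dummy overload so that generic callers compile for UC types without a SIMD path; loop_parse_if_eight_digits() returns early before reaching it.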
+template <typename UC, FASTFLOAT_ENABLE_IF(!has_simd_opt<UC>())>
+uint64_t simd_parse_if_eight_digits_unrolled(UC const*, uint64_t&) {
+  return 0;
+}
+
+
+template <typename UC, FASTFLOAT_ENABLE_IF(!std::is_same<UC, char>::value)>
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20
+void loop_parse_if_eight_digits(const UC*& p, const UC* const pend, uint64_t& i) {
+  if (!has_simd_opt<UC>()) {
+    return;
+  }
+  while ((std::distance(p, pend) >= 8) && simd_parse_if_eight_digits_unrolled(p, i)) { // in rare cases, this will overflow, but that's ok
+    p += 8;
+  }
 }
 
 fastfloat_really_inline FASTFLOAT_CONSTEXPR20
-bool is_made_of_eight_digits_fast(const char *chars)  noexcept  {
-  return is_made_of_eight_digits_fast(read_u64(chars));
+void loop_parse_if_eight_digits(const char*& p, const char* const pend, uint64_t& i) {
+  // For UC = char, checking with is_made_of_eight_digits_fast() and then parsing optimizes better than a combined parse-if-eight-digits helper.
+  while ((std::distance(p, pend) >= 8) && is_made_of_eight_digits_fast(read8_to_u64(p))) {
+    i = i * 100000000 + parse_eight_digits_unrolled(read8_to_u64(p)); // in rare cases, this will overflow, but that's ok
+    p += 8;
+  }
 }
 
 template <typename UC>
@@ -124,8 +214,10 @@ struct parsed_number_string_t {
   span<const UC> integer{};  // non-nullable
   span<const UC> fraction{}; // nullable
 };
-using byte_span = span<char>;
+
+using byte_span = span<const char>;
 using parsed_number_string = parsed_number_string_t<char>;
+
 // Assuming that you use no more than 19 digits, this will
 // parse an ASCII string.
 template <typename UC>
@@ -171,12 +263,8 @@ parsed_number_string_t<UC> parse_number_string(UC const *p, UC const * pend, par
     UC const * before = p;
     // can occur at most twice without overflowing, but let it occur more, since
     // for integers with many digits, digit parsing is the primary bottleneck.
-    if (std::is_same<UC,char>::value) {
-      while ((std::distance(p, pend) >= 8) && is_made_of_eight_digits_fast(p)) {
-        i = i * 100000000 + parse_eight_digits_unrolled(p); // in rare cases, this will overflow, but that's ok
-        p += 8;
-      }
-    }
+    loop_parse_if_eight_digits(p, pend, i);
+
     while ((p != pend) && is_integer(*p)) {
       uint8_t digit = uint8_t(*p - UC('0'));
       ++p;
@@ -241,29 +329,31 @@ parsed_number_string_t<UC> parse_number_string(UC const *p, UC const * pend, par
       if(*start == UC('0')) { digit_count --; }
       start++;
     }
+
     if (digit_count > 19) {
       answer.too_many_digits = true;
       // Let us start again, this time, avoiding overflows.
       // We don't need to check if is_integer, since we use the
       // pre-tokenized spans from above.
       i = 0;
       p = answer.integer.ptr;
-      UC const * int_end = p + answer.integer.len();
-      const uint64_t minimal_nineteen_digit_integer{1000000000000000000};
-      while((i < minimal_nineteen_digit_integer) && (p != int_end)) {
+      UC const* int_end = p + answer.integer.len();
+      const uint64_t minimal_nineteen_digit_integer{ 1000000000000000000 };
+      while ((i < minimal_nineteen_digit_integer) && (p != int_end)) {
         i = i * 10 + uint64_t(*p - UC('0'));
         ++p;
       }
       if (i >= minimal_nineteen_digit_integer) { // We have a big integer
         exponent = end_of_integer_part - p + exp_number;
-      } else { // We have a value with a fractional component.
-          p = answer.fraction.ptr;
-          UC const * frac_end = p + answer.fraction.len();
-          while((i < minimal_nineteen_digit_integer) && (p != frac_end)) {
-            i = i * 10 + uint64_t(*p - UC('0'));
-            ++p;
-          }
-          exponent = answer.fraction.ptr - p + exp_number;
+      }
+      else { // We have a value with a fractional component.
+        p = answer.fraction.ptr;
+        UC const* frac_end = p + answer.fraction.len();
+        while ((i < minimal_nineteen_digit_integer) && (p != frac_end)) {
+          i = i * 10 + uint64_t(*p - UC('0'));
+          ++p;
+        }
+        exponent = answer.fraction.ptr - p + exp_number;
       }
       // We have now corrected both exponent and i, to a truncated value
     }

diff --git a/include/fast_float/digit_comparison.h b/include/fast_float/digit_comparison.h
@@ -201,18 +201,10 @@ bool is_truncated(span<const UC> s) noexcept {
   return is_truncated(s.ptr, s.ptr + s.len());
 }
 
-fastfloat_really_inline FASTFLOAT_CONSTEXPR20
-void parse_eight_digits(const char16_t*& , limb& , size_t& , size_t& ) noexcept {
-  // currently unused
-}
-
-fastfloat_really_inline FASTFLOAT_CONSTEXPR20
-void parse_eight_digits(const char32_t*& , limb& , size_t& , size_t& ) noexcept {
-  // currently unused
-}
 
+template <typename UC>
 fastfloat_really_inline FASTFLOAT_CONSTEXPR20
-void parse_eight_digits(const char*& p, limb& value, size_t& counter, size_t& count) noexcept {
+void parse_eight_digits(const UC*& p, limb& value, size_t& counter, size_t& count) noexcept {
   value = value * 100000000 + parse_eight_digits_unrolled(p);
   p += 8;
   counter += 8;
@@ -264,10 +256,8 @@ void parse_mantissa(bigint& result, parsed_number_string_t<UC>& num, size_t max_
   skip_zeros(p, pend);
   // process all digits, in increments of step per loop
   while (p != pend) {
-    if (std::is_same<UC,char>::value) {
-      while ((std::distance(p, pend) >= 8) && (step - counter >= 8) && (max_digits - digits >= 8)) {
-        parse_eight_digits(p, value, counter, digits);
-      }
+    while ((std::distance(p, pend) >= 8) && (step - counter >= 8) && (max_digits - digits >= 8)) {
+      parse_eight_digits(p, value, counter, digits);
     }
     while (counter < step && p != pend && digits < max_digits) {
       parse_one_digit(p, value, counter, digits);
@@ -299,10 +289,8 @@ void parse_mantissa(bigint& result, parsed_number_string_t<UC>& num, size_t max_
     }
     // process all digits, in increments of step per loop
     while (p != pend) {
-      if (std::is_same<UC,char>::value) {
-        while ((std::distance(p, pend) >= 8) && (step - counter >= 8) && (max_digits - digits >= 8)) {
-          parse_eight_digits(p, value, counter, digits);
-        }
+      while ((std::distance(p, pend) >= 8) && (step - counter >= 8) && (max_digits - digits >= 8)) {
+        parse_eight_digits(p, value, counter, digits);
       }
       while (counter < step && p != pend && digits < max_digits) {
         parse_one_digit(p, value, counter, digits);

diff --git a/include/fast_float/float_common.h b/include/fast_float/float_common.h
@@ -115,6 +115,34 @@ using parse_options = parse_options_t<char>;
 #endif
 #endif
 
+#if defined(__SSE2__) || \
+  (defined(FASTFLOAT_VISUAL_STUDIO) && \
+    (defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP == 2)))
+#define FASTFLOAT_SSE2 1
+#endif
+
+#ifdef FASTFLOAT_SSE2
+#define FASTFLOAT_HAS_SIMD 1
+#endif
+
+#if defined(__GNUC__)
+// disable -Wcast-align=strict (GCC only)
+#define FASTFLOAT_SIMD_DISABLE_WARNINGS \
+  _Pragma("GCC diagnostic push") \
+  _Pragma("GCC diagnostic ignored \"-Wcast-align\"")
+#else
+#define FASTFLOAT_SIMD_DISABLE_WARNINGS
+#endif
+
+#if defined(__GNUC__)
+#define FASTFLOAT_SIMD_RESTORE_WARNINGS \
+  _Pragma("GCC diagnostic pop")
+#else
+#define FASTFLOAT_SIMD_RESTORE_WARNINGS
+#endif
+
+
+
 #ifdef FASTFLOAT_VISUAL_STUDIO
 #define fastfloat_really_inline __forceinline
 #else
@@ -132,6 +160,9 @@ using parse_options = parse_options_t<char>;
 // rust style `try!()` macro, or `?` operator
 #define FASTFLOAT_TRY(x) { if (!(x)) return false; }
 
+#define FASTFLOAT_ENABLE_IF(...) typename std::enable_if<(__VA_ARGS__), int>::type = 0
+
+
 namespace fast_float {
 
 fastfloat_really_inline constexpr bool cpp20_and_in_constexpr() {

diff --git a/include/fast_float/parse_number.h b/include/fast_float/parse_number.h
@@ -166,6 +166,7 @@ from_chars_result_t<UC> from_chars_advanced(UC const * first, UC const * last,
   if (!pns.valid) {
     return detail::parse_infnan(first, last, value);
   }
+
   answer.ec = std::errc(); // be optimistic
   answer.ptr = pns.lastmatch;
   // The implementation of the Clinger's fast path is convoluted because