Skip to content

Commit

Permalink
noinline version of is_valid_utf8
Browse files Browse the repository at this point in the history
  • Loading branch information
grisumbras committed Mar 6, 2024
1 parent a284d0d commit d79a586
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 88 deletions.
4 changes: 2 additions & 2 deletions include/boost/json/detail/sse2.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ count_valid<false>(
uint8_t len = first & 0xFF;
if(BOOST_JSON_UNLIKELY(end - p < len))
break;
if(BOOST_JSON_UNLIKELY(! is_valid_utf8(p, first)))
if(BOOST_JSON_UNLIKELY(! is_valid_utf8_no_inline(p, first)))
break;
p += len;
}
Expand Down Expand Up @@ -185,7 +185,7 @@ count_valid<false>(
uint8_t len = first & 0xFF;
if(BOOST_JSON_UNLIKELY(end - p < len))
break;
if(BOOST_JSON_UNLIKELY(! is_valid_utf8(p, first)))
if(BOOST_JSON_UNLIKELY(! is_valid_utf8_no_inline(p, first)))
break;
p += len;
}
Expand Down
128 changes: 42 additions & 86 deletions include/boost/json/detail/utf8.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,58 +21,12 @@ namespace boost {
namespace json {
namespace detail {

// Packs four bytes into one 32-bit word: b4 lands in bits 31..24, b3 in
// bits 23..16, b2 in bits 15..8 and b1 in bits 7..0.
//
// This is the primary template; it is what the defaulted argument
// (endian::order::little) and every order without its own specialization
// select.
template<endian::order = endian::order::little>
constexpr
std::uint32_t
make_u32_impl(std::uint8_t b4, std::uint8_t b3, std::uint8_t b2, std::uint8_t b1)
{
    // Widen before shifting. A bare `b4 << 24` is evaluated in (signed) int
    // after integral promotion, so any byte >= 0x80 would be shifted into the
    // sign bit; doing the arithmetic in std::uint32_t keeps it well defined
    // under every language revision.
    return ( static_cast<std::uint32_t>(b4) << 24 )
        | ( static_cast<std::uint32_t>(b3) << 16 )
        | ( static_cast<std::uint32_t>(b2) << 8 )
        | static_cast<std::uint32_t>(b1);
}

// Specialization for endian::order::big: the argument order is mirrored
// relative to the primary template, so b1 lands in bits 31..24, b2 in bits
// 23..16, b3 in bits 15..8 and b4 in bits 7..0.
template<>
constexpr
std::uint32_t
make_u32_impl<endian::order::big>(
    std::uint8_t b4, std::uint8_t b3, std::uint8_t b2, std::uint8_t b1)
{
    // Widen before shifting. A bare `b1 << 24` is evaluated in (signed) int
    // after integral promotion, so any byte >= 0x80 would be shifted into the
    // sign bit; doing the arithmetic in std::uint32_t keeps it well defined
    // under every language revision.
    return ( static_cast<std::uint32_t>(b1) << 24 )
        | ( static_cast<std::uint32_t>(b2) << 16 )
        | ( static_cast<std::uint32_t>(b3) << 8 )
        | static_cast<std::uint32_t>(b4);
}

// Builds a 32-bit constant from four bytes by forwarding them, unchanged and
// in the same order, to the make_u32_impl variant selected by
// endian::order::native.
constexpr
std::uint32_t
make_u32(
    std::uint8_t b4,
    std::uint8_t b3,
    std::uint8_t b2,
    std::uint8_t b1)
{
    return make_u32_impl< endian::order::native >( b4, b3, b2, b1 );
}

// Checks the trailing bytes of a 4-byte UTF-8 sequence whose second byte must
// lie in [90, BF] and whose third and fourth bytes must lie in [80, BF].
// `v` is the 4-byte sequence copied verbatim into a 32-bit word. This primary
// template is the variant used when the native order is not big-endian.
//
// How the single comparison works (byte positions as produced by make_u32):
//  * the mask keeps the high nibble of the second byte and the top two bits
//    of the third and fourth bytes; the lead byte is cleared;
//  * adding 0x70 to the second byte carries out exactly when its high nibble
//    is >= 0x9, leaving (nibble - 0x9) << 4 behind;
//  * adding 0x7F plus that carry to the third byte yields 0 (and carries
//    again) only when its top bits are exactly 10; likewise for the fourth.
// A fully valid sequence therefore collapses to 0x0000, 0x1000 or 0x2000,
// while any invalid byte leaves a larger value.
//
// The previous form `(x | 0x3000) == 0x3000` also accepted 0x3000, i.e. a
// second byte in [C0, CF] (for example F0 C0 80 80); comparing with `<`
// excludes it.
template<endian::order = endian::order::little>
constexpr
bool
utf8_case5(std::uint32_t v)
{
    return ( ( v & make_u32(0xC0,0xC0,0xF0,0x00) )
        + make_u32(0x7F,0x7F,0x70,0x00) )
        < make_u32(0x00,0x00,0x30,0x00);
}

// Big-endian variant of the check for a 4-byte UTF-8 sequence whose second
// byte must lie in [90, BF] and whose third and fourth bytes must lie in
// [80, BF]. `v` is the 4-byte sequence copied verbatim into a 32-bit word.
//
// First term (byte positions as produced by make_u32):
//  * the mask keeps the high nibble of the second byte and the top two bits
//    of the third and fourth bytes; the lead byte is cleared;
//  * the lead-byte slot is preloaded with 0xFF and the second byte gets
//    +0x70, so the lead slot wraps to 0 only when the second byte's high
//    nibble is >= 0x9;
//  * the OR/equality then requires the third and fourth bytes to have top
//    bits exactly 10 and the leftover second-byte nibble to be within 0x30.
// On its own that term still lets a leftover of exactly 0x30 through, i.e. a
// second byte in [C0, CF] (for example F0 C0 80 80).
//
// Second term: explicitly rejects a second byte whose high nibble is 0xC,
// closing that gap without touching the rest of the check.
template<>
constexpr
bool
utf8_case5<endian::order::big>(std::uint32_t v)
{
    return ( ( ( ( v & make_u32(0xC0,0xC0,0xF0,0x00) )
        + make_u32(0x00,0x00,0x70,0xFF) ) | make_u32(0x00,0x00,0x30,0x00) )
        == make_u32(0x80,0x80,0x30,0x00) )
        && ( v & make_u32(0x00,0x00,0xF0,0x00) )
            != make_u32(0x00,0x00,0xC0,0x00);
}

// Reads the first N bytes at `p` and returns them as a 32-bit value,
// interpreting the bytes as little-endian (the byte at `p` is the least
// significant). Bytes of the result not covered by N are zero.
//
// N is the number of bytes to read and must be in [0, 4].
template<int N>
std::uint32_t
load_little_endian(void const* p)
{
    // The bytes are copied into a 4-byte local, so a larger N would write
    // past it; reject that at compile time.
    static_assert(
        N >= 0 && N <= static_cast<int>(sizeof(std::uint32_t)),
        "load_little_endian: N must be between 0 and 4");

    std::uint32_t v = 0;
    // Copy the raw bytes to the start of the zero-initialised word, then let
    // the endian library reorder them into a native value.
    std::memcpy(&v, p, N);
    endian::little_to_native_inplace(v);
    return v;
}

inline
Expand Down Expand Up @@ -122,47 +76,49 @@ is_valid_utf8(const char* p, uint16_t first)

// 2 bytes, second byte [80, BF]
case 1:
std::memcpy(&v, p, 2);
return ( v & make_u32(0x00,0x00,0xC0,0x00) )
== make_u32(0x00,0x00,0x80,0x00);

// 3 bytes, second byte [A0, BF]
case 2:
std::memcpy(&v, p, 3);
return ( v & make_u32(0x00,0xC0,0xE0,0x00) )
== make_u32(0x00,0x80,0xA0,0x00);

// 3 bytes, second byte [80, BF]
case 3:
std::memcpy(&v, p, 3);
return ( v & make_u32(0x00,0xC0,0xC0,0x00) )
== make_u32(0x00,0x80,0x80,0x00);

// 3 bytes, second byte [80, 9F]
case 4:
std::memcpy(&v, p, 3);
return ( v & make_u32(0x00,0xC0,0xE0,0x00) )
== make_u32(0x00,0x80,0x80,0x00);

// 4 bytes, second byte [90, BF]
case 5:
std::memcpy(&v, p, 4);
return utf8_case5<endian::order::native>(v);

// 4 bytes, second byte [80, BF]
case 6:
std::memcpy(&v, p, 4);
return ( v & make_u32(0xC0,0xC0,0xC0,0x00) )
== make_u32(0x80,0x80,0x80,0x00);

// 4 bytes, second byte [80, 8F]
case 7:
std::memcpy(&v, p, 4);
return ( v & make_u32(0xC0,0xC0,0xF0,0x00) )
== make_u32(0x80,0x80,0x80,0x00);
v = load_little_endian<2>(p);
return (v & 0xC000) == 0x8000;

// 3 bytes, second byte [A0, BF]
case 2:
v = load_little_endian<3>(p);
return (v & 0xC0E000) == 0x80A000;

// 3 bytes, second byte [80, BF]
case 3:
v = load_little_endian<3>(p);
return (v & 0xC0C000) == 0x808000;

// 3 bytes, second byte [80, 9F]
case 4:
v = load_little_endian<3>(p);
return (v & 0xC0E000) == 0x808000;

// 4 bytes, second byte [90, BF]
case 5:
v = load_little_endian<4>(p);
return (v & 0xC0C0FF00) + 0x7F7F7000 <= 0x2F00;

// 4 bytes, second byte [80, BF]
case 6:
v = load_little_endian<4>(p);
return (v & 0xC0C0C000) == 0x80808000;

// 4 bytes, second byte [80, 8F]
case 7:
v = load_little_endian<4>(p);
return (v & 0xC0C0F000) == 0x80808000;
}
}

// Out-of-line entry point for is_valid_utf8: takes the same arguments,
// forwards them untouched and returns its verdict. The function is marked
// BOOST_NOINLINE, so callers get a real call instead of an inlined copy of
// the validation switch.
BOOST_NOINLINE
inline
bool
is_valid_utf8_no_inline(const char* p, uint16_t first)
{
    bool const valid = is_valid_utf8(p, first);
    return valid;
}

class utf8_sequence
{
char seq_[4];
Expand Down

0 comments on commit d79a586

Please sign in to comment.