From 662b1d8da12f47a268eeea2035138c40438542df Mon Sep 17 00:00:00 2001 From: Yagiz Nizipli Date: Mon, 30 Sep 2024 21:18:32 -0400 Subject: [PATCH] improve stringwrapper bytestring unwrap --- src/workerd/jsg/BUILD.bazel | 2 +- src/workerd/jsg/value.h | 28 ++++++++++++++++------------ 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/src/workerd/jsg/BUILD.bazel b/src/workerd/jsg/BUILD.bazel index 8cce150449b..b84b00b48b7 100644 --- a/src/workerd/jsg/BUILD.bazel +++ b/src/workerd/jsg/BUILD.bazel @@ -52,7 +52,6 @@ wd_cc_library( "web-idl.h", "wrappable.h", ], - implementation_deps = ["@simdutf"], visibility = ["//visibility:public"], deps = [ ":exception", @@ -65,6 +64,7 @@ wd_cc_library( "//src/workerd/util:thread-scopes", "//src/workerd/util:uuid", "@capnp-cpp//src/kj", + "@simdutf", "@workerd-v8//:v8", ], ) diff --git a/src/workerd/jsg/value.h b/src/workerd/jsg/value.h index a71a0947747..54c7fca2d63 100644 --- a/src/workerd/jsg/value.h +++ b/src/workerd/jsg/value.h @@ -8,6 +8,7 @@ // Handling of various basic value types: numbers, booleans, strings, optionals, maybes, variants, // arrays, buffers, dicts. +#include "simdutf.h" #include "util.h" #include "web-idl.h" #include "wrappable.h" @@ -468,20 +469,23 @@ class StringWrapper { kj::Maybe> parentObject) { // TODO(cleanup): Move to a HeaderStringWrapper in the api directory. v8::Local str = check(handle->ToString(context)); - auto result = - ByteString(KJ_ASSERT_NONNULL(tryUnwrap(context, str, (kj::String*)nullptr, parentObject))); - if (str->ContainsOnlyOneByte()) { - auto buf = kj::heapArray(str->Length() + 1); - str->WriteOneByte(context->GetIsolate(), buf.begin()); - for (auto b: buf) { - if (b >= 128) { - result.warning = ByteString::Warning::CONTAINS_EXTENDED_ASCII; - break; - } + auto result = ByteString(KJ_ASSERT_NONNULL( + tryUnwrap(context, str, static_cast(nullptr), parentObject))); + + if (!simdutf::validate_ascii(result.begin(), result.size())) { + // If storage is one-byte or the string contains only one-byte + // characters, we know that it contains extended ASCII characters. + // + // The order of execution matters, since ContainsOnlyOneByte() + // will scan the whole string for two-byte storage. + if (str->ContainsOnlyOneByte()) { + result.warning = ByteString::Warning::CONTAINS_EXTENDED_ASCII; + } else { + // Storage is two-bytes and it contains two-byte characters. + result.warning = ByteString::Warning::CONTAINS_UNICODE; } - } else { - result.warning = ByteString::Warning::CONTAINS_UNICODE; } + return kj::mv(result); } };