diff --git a/Data/ByteString/Builder/Internal.hs b/Data/ByteString/Builder/Internal.hs index 8bb6278bd..3ea7aeb24 100644 --- a/Data/ByteString/Builder/Internal.hs +++ b/Data/ByteString/Builder/Internal.hs @@ -1,6 +1,8 @@ {-# LANGUAGE Unsafe #-} {-# LANGUAGE TypeFamilies #-} +{-# LANGUAGE TypeApplications #-} {-# LANGUAGE NoMonoLocalBinds #-} +{-# LANGUAGE ViewPatterns #-} {-# OPTIONS_HADDOCK not-home #-} @@ -87,6 +89,8 @@ module Data.ByteString.Builder.Internal ( -- , sizedChunksInsert , byteStringCopy + , asciiLiteralCopy + , modUtf8LiteralCopy , byteStringInsert , byteStringThreshold @@ -816,6 +820,7 @@ ensureFree :: Int -> Builder ensureFree minFree = builder step where + step :: forall r. BuildStep r -> BuildStep r step k br@(BufferRange op ope) | ope `minusPtr` op < minFree = return $ bufferFull minFree op k | otherwise = k br @@ -839,6 +844,25 @@ wrappedBytesCopyStep bs0 k = where outRemaining = ope `minusPtr` op +-- | Copy the bytes from a 'BufferRange' into the output stream. +wrappedBufferRangeCopyStep :: BufferRange -- ^ Input 'BufferRange'. + -> BuildStep a -> BuildStep a +wrappedBufferRangeCopyStep (BufferRange ip0 ipe) k = + go ip0 + where + go !ip (BufferRange op ope) + | inpRemaining <= outRemaining = do + copyBytes op ip inpRemaining + let !br' = BufferRange (op `plusPtr` inpRemaining) ope + k br' + | otherwise = do + copyBytes op ip outRemaining + let !ip' = ip `plusPtr` outRemaining + return $ bufferFull 1 ope (go ip') + where + outRemaining = ope `minusPtr` op + inpRemaining = ipe `minusPtr` ip + -- Strict ByteStrings ------------------------------------------------------------------------------ @@ -858,6 +882,7 @@ byteStringThreshold :: Int -> S.StrictByteString -> Builder byteStringThreshold maxCopySize = \bs -> builder $ step bs where + step :: forall r. S.ByteString -> BuildStep r -> BuildStep r step bs@(S.BS _ len) k br@(BufferRange !op _) | len <= maxCopySize = byteStringCopyStep bs k br | otherwise = return $ insertChunk op bs k @@ -949,6 +974,88 @@ byteStringInsert :: S.StrictByteString -> Builder byteStringInsert = \bs -> builder $ \k (BufferRange op _) -> return $ insertChunk op bs k + +------------------------------------------------------------------------------ +-- Raw CString encoding +------------------------------------------------------------------------------ + +-- | Builder for raw pointers to static data of known length that will never be +-- moved or freed. (This is used with the static buffers GHC uses to implement +-- ASCII string literals that do not contain null characters.) +-- +-- @since 0.13.0.0 +{-# INLINABLE asciiLiteralCopy #-} +asciiLiteralCopy :: Ptr Word8 -> Int -> Builder +asciiLiteralCopy = \ !ip !len -> builder $ \k br@(BufferRange op ope) -> + if len <= ope `minusPtr` op + then copyBytes op ip len >> k (BufferRange (op `plusPtr` len) ope) + else wrappedBufferRangeCopyStep (BufferRange ip (ip `plusPtr` len)) k br + +-- | Builder for pointers to /null-terminated/ primitive UTF-8 encoded strings +-- that may contain embedded overlong two-byte encodings of the NUL character +-- as @0xC0 0x80@. Other deviations from strict UTF-8 are tolerated, but the +-- result is not well defined. +-- +-- @since 0.13.0.0 +{-# INLINABLE modUtf8LiteralCopy #-} +modUtf8LiteralCopy :: Ptr Word8 -> Int -> Builder +modUtf8LiteralCopy !ip !len + | len > 0 = builder (modUtf8_step ip len) + | otherwise = builder id + +-- | Copy a /non-empty/ UTF-8 input possibly containing overlong 2-octet +-- sequences. While only the NUL byte should ever encoded that way (as @0xC0 +-- 80@), this handles other overlong @0xC0 0x??@ sequences by keeping the +-- bottom 6 bits of the second byte. If the input is non-UTF8 garbage, the +-- result may not be what the user expected. +-- +modUtf8_step :: Ptr Word8 -> Int -> BuildStep r -> BuildStep r +modUtf8_step !ip !len k (BufferRange op ope) + | op == ope = return $ bufferFull 1 op (modUtf8_step ip len k) + | otherwise = do + let !avail = ope `minusPtr` op + !usable = avail `min` len + -- null-termination makes it possible to read one more byte than the + -- nominal input length, with any unexpected 0xC000 ending interpreted + -- as a NUL. More typically, this simplifies hanlding of inputs where + -- 0xC0 0x80 might otherwise be split across the "usable" input window. + !ch <- peekElemOff ip (usable - 1) + let !use | ch /= 0xC0 = usable + | otherwise = usable + 1 + !n <- utf8_copyBytes (ip `plusPtr` use) ip op + let !op' = op `plusPtr` n + !len' = len - use + ip' = ip `plusPtr` use + if | len' <= 0 -> k (BufferRange op' ope) + | op' < ope -> modUtf8_step ip' len' k (BufferRange op' ope) + | otherwise -> return $ bufferFull 1 op' (modUtf8_step ip' len' k) + +-- | Consume the supplied input returning the number of bytes written +utf8_copyBytes :: Ptr Word8 -> Ptr Word8 -> Ptr Word8 -> IO Int +utf8_copyBytes !ipe = \ ip op -> go 0 ip op + where + go :: Int -> Ptr Word8 -> Ptr Word8 -> IO Int + go !n !ip@((< ipe) -> True) !op = do + !ch <- peek ip + let !ip' = ip `plusPtr` 1 + !op' = op `plusPtr` 1 + if | ch /= 0xC0 -> do + poke op ch + let !cnt = ipe `minusPtr` ip' + !runend <- S.memchr ip' 0xC0 (fromIntegral @Int cnt) + let !runlen | runend == nullPtr = cnt + | otherwise = runend `minusPtr` ip' + if (runlen == 0) + then go (n + 1) ip' op' + else do + copyBytes op' ip' runlen + go (n + 1 + runlen) (ip' `plusPtr` runlen) (op' `plusPtr` runlen) + | otherwise -> do + !ch' <- peek ip' + poke op (ch' .&. 0x3f) + go (n + 1) (ip' `plusPtr` 1) op' + go !n _ _ = pure n + -- Short bytestrings ------------------------------------------------------------------------------ diff --git a/Data/ByteString/Builder/Prim.hs b/Data/ByteString/Builder/Prim.hs index 82f5d18a9..088d06729 100644 --- a/Data/ByteString/Builder/Prim.hs +++ b/Data/ByteString/Builder/Prim.hs @@ -453,6 +453,7 @@ import Data.ByteString.Builder.Internal import qualified Data.ByteString as S import qualified Data.ByteString.Internal as S +import qualified Data.ByteString.Internal.Type as S import qualified Data.ByteString.Lazy.Internal as L import Data.Char (ord) @@ -464,9 +465,7 @@ import Data.ByteString.Builder.Prim.ASCII import Foreign import Foreign.ForeignPtr.Unsafe (unsafeForeignPtrToPtr) -import GHC.Word (Word8 (..)) import GHC.Exts -import GHC.IO ------------------------------------------------------------------------------ -- Creating Builders from bounded primitives @@ -658,59 +657,22 @@ primMapLazyByteStringBounded w = L.foldrChunks (\x b -> primMapByteStringBounded w x `mappend` b) mempty ------------------------------------------------------------------------------- --- Raw CString encoding ------------------------------------------------------------------------------- - --- | A null-terminated ASCII encoded 'Foreign.C.String.CString'. --- Null characters are not representable. +-- | Builder for raw 'Addr#' pointers to null-terminated primitive ASCII +-- strings that are free of embedded null characters. -- -- @since 0.11.0.0 cstring :: Addr# -> Builder -cstring = - \addr0 -> builder $ step addr0 - where - step :: Addr# -> BuildStep r -> BuildStep r - step !addr !k br@(BufferRange op0@(Ptr op0#) ope) - | W8# ch == 0 = k br - | op0 == ope = - return $ bufferFull 1 op0 (step addr k) - | otherwise = do - IO $ \s -> case writeWord8OffAddr# op0# 0# ch s of - s' -> (# s', () #) - let br' = BufferRange (op0 `plusPtr` 1) ope - step (addr `plusAddr#` 1#) k br' - where - !ch = indexWord8OffAddr# addr 0# +cstring s = asciiLiteralCopy (Ptr s) (S.byteCountLiteral s) +{-# INLINE cstring #-} --- | A null-terminated UTF-8 encoded 'Foreign.C.String.CString'. --- Null characters can be encoded as @0xc0 0x80@. +-- | Builder for raw 'Addr#' pointers to null-terminated primitive UTF-8 +-- encoded strings in which any emebded null characters are represented via +-- the two-byte overlong-encoding: @0xC0 0x80@. -- -- @since 0.11.0.0 cstringUtf8 :: Addr# -> Builder -cstringUtf8 = - \addr0 -> builder $ step addr0 - where - step :: Addr# -> BuildStep r -> BuildStep r - step !addr !k br@(BufferRange op0@(Ptr op0#) ope) - | W8# ch == 0 = k br - | op0 == ope = - return $ bufferFull 1 op0 (step addr k) - -- NULL is encoded as 0xc0 0x80 - | W8# ch == 0xc0 - , W8# (indexWord8OffAddr# addr 1#) == 0x80 = do - let !(W8# nullByte#) = 0 - IO $ \s -> case writeWord8OffAddr# op0# 0# nullByte# s of - s' -> (# s', () #) - let br' = BufferRange (op0 `plusPtr` 1) ope - step (addr `plusAddr#` 2#) k br' - | otherwise = do - IO $ \s -> case writeWord8OffAddr# op0# 0# ch s of - s' -> (# s', () #) - let br' = BufferRange (op0 `plusPtr` 1) ope - step (addr `plusAddr#` 1#) k br' - where - !ch = indexWord8OffAddr# addr 0# +cstringUtf8 s = modUtf8LiteralCopy (Ptr s) (S.byteCountLiteral s) +{-# INLINE cstringUtf8 #-} ------------------------------------------------------------------------------ -- Char8 encoding diff --git a/Data/ByteString/Internal/Type.hs b/Data/ByteString/Internal/Type.hs index 31e449b7d..e99e7c51e 100644 --- a/Data/ByteString/Internal/Type.hs +++ b/Data/ByteString/Internal/Type.hs @@ -42,6 +42,7 @@ module Data.ByteString.Internal.Type ( unpackChars, unpackAppendCharsLazy, unpackAppendCharsStrict, unsafePackAddress, unsafePackLenAddress, unsafePackLiteral, unsafePackLenLiteral, + byteCountLiteral, -- * Low level imperative construction empty, @@ -475,6 +476,18 @@ unsafePackLenAddress len addr# = do #endif {-# INLINE unsafePackLenAddress #-} +-- | Byte count of null-terminated primitive literal string excluding the +-- terminating null byte. +byteCountLiteral :: Addr# -> Int +byteCountLiteral addr# = +#if HS_cstringLength_AND_FinalPtr_AVAILABLE + I# (cstringLength# addr#) +#else + fromIntegral @CSize @Int $ + accursedUnutterablePerformIO (c_strlen (Ptr addr#)) +#endif +{-# INLINE byteCountLiteral #-} + -- | See 'unsafePackAddress'. This function has similar behavior. Prefer -- this function when the address in known to be an @Addr#@ literal. In -- that context, there is no need for the sequencing guarantees that 'IO' diff --git a/bench/BenchAll.hs b/bench/BenchAll.hs index 7f95a3e6f..e81f0e8d2 100644 --- a/bench/BenchAll.hs +++ b/bench/BenchAll.hs @@ -327,6 +327,10 @@ main = do , benchB'_ "ASCII String (12B)" $ asciiLit (Ptr "hello wurld!"#) , benchB' "ASCII String (64B, naive)" asciiStr fromString , benchB'_ "ASCII String (64B)" $ asciiLit asciiBuf + , benchB'_ "strLit" $ string8 asciiStr + , benchB'_ "stringUtf8" $ stringUtf8 utf8Str + , benchB'_ "strLitInline" $ string8 "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX" + , benchB'_ "utf8LitInline" $ stringUtf8 "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX\0XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX" ] , bgroup "Encoding wrappers" diff --git a/tests/builder/Data/ByteString/Builder/Prim/Tests.hs b/tests/builder/Data/ByteString/Builder/Prim/Tests.hs index 60e0cf4a8..b5c734e5c 100644 --- a/tests/builder/Data/ByteString/Builder/Prim/Tests.hs +++ b/tests/builder/Data/ByteString/Builder/Prim/Tests.hs @@ -14,6 +14,7 @@ import Data.Char (ord) import qualified Data.ByteString.Lazy as L import qualified Data.ByteString.Lazy.Char8 as LC import Data.ByteString.Builder +import Data.ByteString.Builder.Extra as BE import qualified Data.ByteString.Builder.Prim as BP import Data.ByteString.Builder.Prim.TestUtils @@ -22,17 +23,28 @@ import Test.Tasty.QuickCheck tests :: [TestTree] tests = concat [ testsBinary, testsASCII, testsChar8, testsUtf8 - , testsCombinatorsB, [testCString, testCStringUtf8] ] + , testsCombinatorsB + , [ testCString + , testCStringUtf8 1 + , testCStringUtf8 6 + , testCStringUtf8 64 + ] + ] testCString :: TestTree testCString = testProperty "cstring" $ toLazyByteString (BP.cstring "hello world!"#) == LC.pack "hello" `L.append` L.singleton 0x20 `L.append` LC.pack "world!" -testCStringUtf8 :: TestTree -testCStringUtf8 = testProperty "cstringUtf8" $ - toLazyByteString (BP.cstringUtf8 "hello\xc0\x80world!"#) == - LC.pack "hello" `L.append` L.singleton 0x00 `L.append` LC.pack "world!" +testCStringUtf8 :: Int -> TestTree +testCStringUtf8 sz = testProperty ("cstringUtf8 (chunk size " ++ shows sz ")") $ + BE.toLazyByteStringWith (BE.untrimmedStrategy sz sz) L.empty + (BP.cstringUtf8 "hello\xc0\x80\xc0\x80\xd0\xbc\xd0\xb8\xd1\x80\xc0\x80\xC0"#) == + LC.pack "hello" `L.append` L.singleton 0x00 + `L.append` L.singleton 0x00 + `L.append` LC.pack "\xd0\xbc\xd0\xb8\xd1\x80" + `L.append` L.singleton 0x00 + `L.append` L.singleton 0x00 ------------------------------------------------------------------------------ -- Binary