Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
107 changes: 107 additions & 0 deletions Data/ByteString/Builder/Internal.hs
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
{-# LANGUAGE Unsafe #-}
{-# LANGUAGE TypeFamilies #-}
{-# LANGUAGE TypeApplications #-}
{-# LANGUAGE NoMonoLocalBinds #-}
{-# LANGUAGE ViewPatterns #-}

{-# OPTIONS_HADDOCK not-home #-}

Expand Down Expand Up @@ -87,6 +89,8 @@ module Data.ByteString.Builder.Internal (
-- , sizedChunksInsert

, byteStringCopy
, asciiLiteralCopy
, modUtf8LiteralCopy
, byteStringInsert
, byteStringThreshold

Expand Down Expand Up @@ -816,6 +820,7 @@ ensureFree :: Int -> Builder
ensureFree minFree =
builder step
where
step :: forall r. BuildStep r -> BuildStep r
step k br@(BufferRange op ope)
| ope `minusPtr` op < minFree = return $ bufferFull minFree op k
| otherwise = k br
Expand All @@ -839,6 +844,25 @@ wrappedBytesCopyStep bs0 k =
where
outRemaining = ope `minusPtr` op

-- | Copy the bytes from a 'BufferRange' into the output stream.
wrappedBufferRangeCopyStep :: BufferRange -- ^ Input 'BufferRange'.
-> BuildStep a -> BuildStep a
wrappedBufferRangeCopyStep (BufferRange ip0 ipe) k =
go ip0
where
go !ip (BufferRange op ope)
| inpRemaining <= outRemaining = do
copyBytes op ip inpRemaining
let !br' = BufferRange (op `plusPtr` inpRemaining) ope
k br'
| otherwise = do
copyBytes op ip outRemaining
let !ip' = ip `plusPtr` outRemaining
return $ bufferFull 1 ope (go ip')
where
outRemaining = ope `minusPtr` op
inpRemaining = ipe `minusPtr` ip


-- Strict ByteStrings
------------------------------------------------------------------------------
Expand All @@ -858,6 +882,7 @@ byteStringThreshold :: Int -> S.StrictByteString -> Builder
byteStringThreshold maxCopySize =
\bs -> builder $ step bs
where
step :: forall r. S.ByteString -> BuildStep r -> BuildStep r
step bs@(S.BS _ len) k br@(BufferRange !op _)
| len <= maxCopySize = byteStringCopyStep bs k br
| otherwise = return $ insertChunk op bs k
Expand Down Expand Up @@ -949,6 +974,88 @@ byteStringInsert :: S.StrictByteString -> Builder
byteStringInsert =
\bs -> builder $ \k (BufferRange op _) -> return $ insertChunk op bs k


------------------------------------------------------------------------------
-- Raw CString encoding
------------------------------------------------------------------------------

-- | Builder for raw pointers to static data of known length that will never be
-- moved or freed. (This is used with the static buffers GHC uses to implement
-- ASCII string literals that do not contain null characters.)
--
-- @since 0.13.0.0
{-# INLINABLE asciiLiteralCopy #-}
asciiLiteralCopy :: Ptr Word8 -> Int -> Builder
asciiLiteralCopy = \ !ip !len -> builder $ \k br@(BufferRange op ope) ->
if len <= ope `minusPtr` op
then copyBytes op ip len >> k (BufferRange (op `plusPtr` len) ope)
else wrappedBufferRangeCopyStep (BufferRange ip (ip `plusPtr` len)) k br

-- | Builder for pointers to /null-terminated/ primitive UTF-8 encoded strings
-- that may contain embedded overlong two-byte encodings of the NUL character
-- as @0xC0 0x80@. Other deviations from strict UTF-8 are tolerated, but the
-- result is not well defined.
--
-- @since 0.13.0.0
{-# INLINABLE modUtf8LiteralCopy #-}
modUtf8LiteralCopy :: Ptr Word8 -> Int -> Builder
modUtf8LiteralCopy !ip !len
| len > 0 = builder (modUtf8_step ip len)
| otherwise = builder id

-- | Copy a /non-empty/ UTF-8 input possibly containing overlong 2-octet
-- sequences. While only the NUL byte should ever encoded that way (as @0xC0
-- 80@), this handles other overlong @0xC0 0x??@ sequences by keeping the
-- bottom 6 bits of the second byte. If the input is non-UTF8 garbage, the
-- result may not be what the user expected.
--
modUtf8_step :: Ptr Word8 -> Int -> BuildStep r -> BuildStep r
modUtf8_step !ip !len k (BufferRange op ope)
| op == ope = return $ bufferFull 1 op (modUtf8_step ip len k)
| otherwise = do
let !avail = ope `minusPtr` op
!usable = avail `min` len
-- null-termination makes it possible to read one more byte than the
-- nominal input length, with any unexpected 0xC000 ending interpreted
-- as a NUL. More typically, this simplifies hanlding of inputs where
-- 0xC0 0x80 might otherwise be split across the "usable" input window.
!ch <- peekElemOff ip (usable - 1)
let !use | ch /= 0xC0 = usable
| otherwise = usable + 1
!n <- utf8_copyBytes (ip `plusPtr` use) ip op
let !op' = op `plusPtr` n
!len' = len - use
ip' = ip `plusPtr` use
if | len' <= 0 -> k (BufferRange op' ope)
| op' < ope -> modUtf8_step ip' len' k (BufferRange op' ope)
| otherwise -> return $ bufferFull 1 op' (modUtf8_step ip' len' k)

-- | Consume the supplied input returning the number of bytes written
utf8_copyBytes :: Ptr Word8 -> Ptr Word8 -> Ptr Word8 -> IO Int
utf8_copyBytes !ipe = \ ip op -> go 0 ip op
where
go :: Int -> Ptr Word8 -> Ptr Word8 -> IO Int
go !n !ip@((< ipe) -> True) !op = do
!ch <- peek ip
let !ip' = ip `plusPtr` 1
!op' = op `plusPtr` 1
if | ch /= 0xC0 -> do
poke op ch
let !cnt = ipe `minusPtr` ip'
!runend <- S.memchr ip' 0xC0 (fromIntegral @Int cnt)
let !runlen | runend == nullPtr = cnt
| otherwise = runend `minusPtr` ip'
if (runlen == 0)
then go (n + 1) ip' op'
else do
copyBytes op' ip' runlen
go (n + 1 + runlen) (ip' `plusPtr` runlen) (op' `plusPtr` runlen)
| otherwise -> do
!ch' <- peek ip'
poke op (ch' .&. 0x3f)
go (n + 1) (ip' `plusPtr` 1) op'
go !n _ _ = pure n

-- Short bytestrings
------------------------------------------------------------------------------

Expand Down
58 changes: 10 additions & 48 deletions Data/ByteString/Builder/Prim.hs
Original file line number Diff line number Diff line change
Expand Up @@ -453,6 +453,7 @@ import Data.ByteString.Builder.Internal

import qualified Data.ByteString as S
import qualified Data.ByteString.Internal as S
import qualified Data.ByteString.Internal.Type as S
import qualified Data.ByteString.Lazy.Internal as L

import Data.Char (ord)
Expand All @@ -464,9 +465,7 @@ import Data.ByteString.Builder.Prim.ASCII

import Foreign
import Foreign.ForeignPtr.Unsafe (unsafeForeignPtrToPtr)
import GHC.Word (Word8 (..))
import GHC.Exts
import GHC.IO

------------------------------------------------------------------------------
-- Creating Builders from bounded primitives
Expand Down Expand Up @@ -658,59 +657,22 @@ primMapLazyByteStringBounded w =
L.foldrChunks (\x b -> primMapByteStringBounded w x `mappend` b) mempty


------------------------------------------------------------------------------
-- Raw CString encoding
------------------------------------------------------------------------------

-- | A null-terminated ASCII encoded 'Foreign.C.String.CString'.
-- Null characters are not representable.
-- | Builder for raw 'Addr#' pointers to null-terminated primitive ASCII
-- strings that are free of embedded null characters.
--
-- @since 0.11.0.0
cstring :: Addr# -> Builder
cstring =
\addr0 -> builder $ step addr0
where
step :: Addr# -> BuildStep r -> BuildStep r
step !addr !k br@(BufferRange op0@(Ptr op0#) ope)
| W8# ch == 0 = k br
| op0 == ope =
return $ bufferFull 1 op0 (step addr k)
| otherwise = do
IO $ \s -> case writeWord8OffAddr# op0# 0# ch s of
s' -> (# s', () #)
let br' = BufferRange (op0 `plusPtr` 1) ope
step (addr `plusAddr#` 1#) k br'
where
!ch = indexWord8OffAddr# addr 0#
cstring s = asciiLiteralCopy (Ptr s) (S.byteCountLiteral s)
{-# INLINE cstring #-}

-- | A null-terminated UTF-8 encoded 'Foreign.C.String.CString'.
-- Null characters can be encoded as @0xc0 0x80@.
-- | Builder for raw 'Addr#' pointers to null-terminated primitive UTF-8
-- encoded strings in which any emebded null characters are represented via
-- the two-byte overlong-encoding: @0xC0 0x80@.
--
-- @since 0.11.0.0
cstringUtf8 :: Addr# -> Builder
cstringUtf8 =
\addr0 -> builder $ step addr0
where
step :: Addr# -> BuildStep r -> BuildStep r
step !addr !k br@(BufferRange op0@(Ptr op0#) ope)
| W8# ch == 0 = k br
| op0 == ope =
return $ bufferFull 1 op0 (step addr k)
-- NULL is encoded as 0xc0 0x80
| W8# ch == 0xc0
, W8# (indexWord8OffAddr# addr 1#) == 0x80 = do
let !(W8# nullByte#) = 0
IO $ \s -> case writeWord8OffAddr# op0# 0# nullByte# s of
s' -> (# s', () #)
let br' = BufferRange (op0 `plusPtr` 1) ope
step (addr `plusAddr#` 2#) k br'
| otherwise = do
IO $ \s -> case writeWord8OffAddr# op0# 0# ch s of
s' -> (# s', () #)
let br' = BufferRange (op0 `plusPtr` 1) ope
step (addr `plusAddr#` 1#) k br'
where
!ch = indexWord8OffAddr# addr 0#
cstringUtf8 s = modUtf8LiteralCopy (Ptr s) (S.byteCountLiteral s)
{-# INLINE cstringUtf8 #-}

------------------------------------------------------------------------------
-- Char8 encoding
Expand Down
13 changes: 13 additions & 0 deletions Data/ByteString/Internal/Type.hs
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ module Data.ByteString.Internal.Type (
unpackChars, unpackAppendCharsLazy, unpackAppendCharsStrict,
unsafePackAddress, unsafePackLenAddress,
unsafePackLiteral, unsafePackLenLiteral,
byteCountLiteral,

-- * Low level imperative construction
empty,
Expand Down Expand Up @@ -475,6 +476,18 @@ unsafePackLenAddress len addr# = do
#endif
{-# INLINE unsafePackLenAddress #-}

-- | Byte count of null-terminated primitive literal string excluding the
-- terminating null byte.
byteCountLiteral :: Addr# -> Int
byteCountLiteral addr# =
#if HS_cstringLength_AND_FinalPtr_AVAILABLE
I# (cstringLength# addr#)
#else
fromIntegral @CSize @Int $
accursedUnutterablePerformIO (c_strlen (Ptr addr#))
#endif
{-# INLINE byteCountLiteral #-}

-- | See 'unsafePackAddress'. This function has similar behavior. Prefer
-- this function when the address in known to be an @Addr#@ literal. In
-- that context, there is no need for the sequencing guarantees that 'IO'
Expand Down
4 changes: 4 additions & 0 deletions bench/BenchAll.hs
Original file line number Diff line number Diff line change
Expand Up @@ -327,6 +327,10 @@ main = do
, benchB'_ "ASCII String (12B)" $ asciiLit (Ptr "hello wurld!"#)
, benchB' "ASCII String (64B, naive)" asciiStr fromString
, benchB'_ "ASCII String (64B)" $ asciiLit asciiBuf
, benchB'_ "strLit" $ string8 asciiStr
, benchB'_ "stringUtf8" $ stringUtf8 utf8Str
, benchB'_ "strLitInline" $ string8 "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
, benchB'_ "utf8LitInline" $ stringUtf8 "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX\0XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
]

, bgroup "Encoding wrappers"
Expand Down
22 changes: 17 additions & 5 deletions tests/builder/Data/ByteString/Builder/Prim/Tests.hs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import Data.Char (ord)
import qualified Data.ByteString.Lazy as L
import qualified Data.ByteString.Lazy.Char8 as LC
import Data.ByteString.Builder
import Data.ByteString.Builder.Extra as BE
import qualified Data.ByteString.Builder.Prim as BP
import Data.ByteString.Builder.Prim.TestUtils

Expand All @@ -22,17 +23,28 @@ import Test.Tasty.QuickCheck

tests :: [TestTree]
tests = concat [ testsBinary, testsASCII, testsChar8, testsUtf8
, testsCombinatorsB, [testCString, testCStringUtf8] ]
, testsCombinatorsB
, [ testCString
, testCStringUtf8 1
, testCStringUtf8 6
, testCStringUtf8 64
]
]

testCString :: TestTree
testCString = testProperty "cstring" $
toLazyByteString (BP.cstring "hello world!"#) ==
LC.pack "hello" `L.append` L.singleton 0x20 `L.append` LC.pack "world!"

testCStringUtf8 :: TestTree
testCStringUtf8 = testProperty "cstringUtf8" $
toLazyByteString (BP.cstringUtf8 "hello\xc0\x80world!"#) ==
LC.pack "hello" `L.append` L.singleton 0x00 `L.append` LC.pack "world!"
testCStringUtf8 :: Int -> TestTree
testCStringUtf8 sz = testProperty ("cstringUtf8 (chunk size " ++ shows sz ")") $
BE.toLazyByteStringWith (BE.untrimmedStrategy sz sz) L.empty
(BP.cstringUtf8 "hello\xc0\x80\xc0\x80\xd0\xbc\xd0\xb8\xd1\x80\xc0\x80\xC0"#) ==
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
(BP.cstringUtf8 "hello\xc0\x80\xc0\x80\xd0\xbc\xd0\xb8\xd1\x80\xc0\x80\xC0"#) ==
(BP.cstringUtf8 "hello\xc0\x80\xc0\x80\xd0\xbc\xd0\xb8\xd1\x80\xc0\x80\xC0\x80"#) ==

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I do think it is best to test that the code does not blow up with input that unexpectedly ends with just \xC0 before the raw (implicit) NUL terminator. So this is more of a robustness test, that then also encodes the implemented handling, than a promise to users that this is how that's handled.

LC.pack "hello" `L.append` L.singleton 0x00
`L.append` L.singleton 0x00
`L.append` LC.pack "\xd0\xbc\xd0\xb8\xd1\x80"
`L.append` L.singleton 0x00
`L.append` L.singleton 0x00

------------------------------------------------------------------------------
-- Binary
Expand Down