Data.ByteString.Lazy.dropEnd: Use two-pointers technique (#629)
* Data.ByteString.Lazy.dropEnd: Use two-pointers technique

This can be seen as using the input itself as an implicit
queue; we formerly copied its chunks into an explicit queue.

By writing the key logic as a polymorphic `splitAtEndFold`,
it was easy to reuse it for `takeEnd`; the latter function
should now operate in constant stack space and can release
the initial chunks of a very long input string sooner.
(A small list-based sketch of the two-pointers idea follows
these notes.)

* Fix a very silly bug, and strengthen tests

...so that a plain 'cabal test' finds the bug almost every try
instead of finding it only every few dozen tries

* Actually work around the poison instance

(Some re-compilation check somewhere set a trap for me.)

This also replaces fromIntegral with intToIndexTy in a few places.

* Rewrite the poison instance using TypeError

* Rename "bsL" -> "toSplit" and "bsR" -> "toScan"

* Add basic benchmarks for lazy takeEnd/dropEnd

According to these benchmarks, the new implementation of takeEnd
is somewhat faster, and the new implementation of dropEnd is roughly
3.5x to 4x as fast as its predecessor.
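
To make the "two-pointers" idea concrete, here is a small list-based
sketch (an illustration of mine, not code from this patch): the
yet-unscanned tail runs n elements ahead of the split point, so the
input itself serves as the queue and no separate buffer is needed.

    -- Illustrative only: a list model of the two-pointers technique.
    dropEndList :: Int -> [a] -> [a]
    dropEndList n xs = go xs (drop n xs)
      where
        go (y:ys) (_:zs) = y : go ys zs   -- scanner still has input: emit
        go _      _      = []             -- scanner exhausted: stop

    takeEndList :: Int -> [a] -> [a]
    takeEndList n xs = go xs (drop n xs)
      where
        go (_:ys) (_:zs) = go ys zs       -- advance both pointers in lock step
        go ys     _      = ys             -- scanner exhausted: keep the rest

The chunked splitAtEndFold in Data/ByteString/Lazy.hs below works the
same way, except that its two pointers move a chunk at a time and their
distance is tracked numerically as undershoot/overshoot.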
clyring authored Feb 4, 2024
1 parent cff4421 commit 2bbc97e
Showing 7 changed files with 156 additions and 133 deletions.
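
Not part of the diff: a minimal usage check of the two reworked
functions, relying only on their documented semantics (takeEnd n
keeps the last n bytes, dropEnd n drops them).

    {-# LANGUAGE OverloadedStrings #-}
    import qualified Data.ByteString.Lazy as L

    main :: IO ()
    main = do
      -- 20 bytes spread over 4 chunks
      let bs = L.fromChunks ["abcde", "abcde", "abcde", "abcde"]
      print (L.takeEnd 7 bs)   -- "deabcde"
      print (L.dropEnd 7 bs)   -- "abcdeabcdeabc"
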
129 changes: 80 additions & 49 deletions Data/ByteString/Lazy.hs
@@ -1,8 +1,11 @@
{-# LANGUAGE BangPatterns #-}
{-# OPTIONS_GHC -fno-warn-incomplete-patterns #-}
{-# OPTIONS_HADDOCK prune #-}
{-# LANGUAGE Trustworthy #-}

{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE TypeApplications #-}
{-# LANGUAGE ScopedTypeVariables #-}

-- |
-- Module : Data.ByteString.Lazy
-- Copyright : (c) Don Stewart 2006
@@ -237,9 +240,9 @@ import qualified Data.ByteString as P (ByteString) -- type name only
import qualified Data.ByteString as S -- S for strict (hmm...)
import qualified Data.ByteString.Internal.Type as S
import qualified Data.ByteString.Unsafe as S
import qualified Data.ByteString.Lazy.Internal.Deque as D
import Data.ByteString.Lazy.Internal

import Control.Exception (assert)
import Control.Monad (mplus)
import Data.Word (Word8)
import Data.Int (Int64)
@@ -790,15 +793,75 @@ take i cs0 = take' i cs0
--
-- @since 0.11.2.0
takeEnd :: Int64 -> ByteString -> ByteString
takeEnd i _ | i <= 0 = Empty
takeEnd i cs0 = takeEnd' i cs0
where takeEnd' 0 _ = Empty
takeEnd' n cs =
snd $ foldrChunks takeTuple (n,Empty) cs
takeTuple _ (0, cs) = (0, cs)
takeTuple c (n, cs)
| n > fromIntegral (S.length c) = (n - fromIntegral (S.length c), Chunk c cs)
| otherwise = (0, Chunk (S.takeEnd (fromIntegral n) c) cs)
takeEnd i bs
| i <= 0 = Empty
| otherwise = splitAtEndFold (\_ res -> res) id i bs

-- | Helper function for implementing 'takeEnd' and 'dropEnd'
splitAtEndFold
:: forall result
. (S.StrictByteString -> result -> result)
-- ^ What to do when one chunk of output is ready
-- (The StrictByteString will not be empty.)
-> (ByteString -> result)
-- ^ What to do when the split-point is reached
-> Int64
-- ^ Number of bytes to leave at the end (must be strictly positive)
-> ByteString -- ^ Input ByteString
-> result
{-# INLINE splitAtEndFold #-}
splitAtEndFold step end len bs0 = assert (len > 0) $ case bs0 of
Empty -> end Empty
Chunk c t -> goR len c t t
where
-- Idea: Keep two references into the input ByteString:
-- "toSplit" tracks the current split point,
-- "toScan" tracks the yet-unprocessed tail.
-- When they are closer than "len" bytes apart, process more input. ("goR")
-- When they are at least "len" bytes apart, produce more output. ("goL")
-- We always have that "toScan" is a suffix of "toSplit",
-- and "toSplit" is a suffix of the original input (bs0).
goR :: Int64 -> S.StrictByteString -> ByteString -> ByteString -> result
goR !undershoot nextOutput@(S.BS noFp noLen) toSplit toScan =
assert (undershoot > 0) $
-- INVARIANT: length toSplit == length toScan + len - undershoot
-- (not 'assert'ed because that would break our laziness properties)
case toScan of
Empty
| undershoot >= intToInt64 noLen
-> end (Chunk nextOutput toSplit)
| undershootW <- fromIntegral @Int64 @Int undershoot
-- conversion Int64->Int is OK because 0 < undershoot < noLen
, splitIndex <- noLen - undershootW
, beforeSplit <- S.BS noFp splitIndex
, afterSplit <- S.BS (noFp `S.plusForeignPtr` splitIndex) undershootW
-> step beforeSplit $ end (Chunk afterSplit toSplit)

Chunk (S.BS _ cLen) newBsR
| cLen64 <- intToInt64 cLen
, undershoot > cLen64
-> goR (undershoot - cLen64) nextOutput toSplit newBsR
| undershootW <- fromIntegral @Int64 @Int undershoot
-> step nextOutput $ goL (cLen - undershootW) toSplit newBsR

goL :: Int -> ByteString -> ByteString -> result
goL !overshoot toSplit toScan =
assert (overshoot >= 0) $
-- INVARIANT: length toSplit == length toScan + len + intToInt64 overshoot
-- (not 'assert'ed because that would break our laziness properties)
case toSplit of
Empty -> splitAtEndFoldInvariantFailed
Chunk c@(S.BS _ cLen) newBsL
| overshoot >= cLen
-> step c $ goL (overshoot - cLen) newBsL toScan
| otherwise
-> goR (intToInt64 $ cLen - overshoot) c newBsL toScan

splitAtEndFoldInvariantFailed :: a
-- See Note [Float error calls out of INLINABLE things] in D.B.Internal.Type
splitAtEndFoldInvariantFailed =
moduleError "splitAtEndFold"
"internal error: toSplit not longer than toScan"

-- | /O(n\/c)/ 'drop' @n xs@ returns the suffix of @xs@ after the first @n@
-- elements, or 'empty' if @n > 'length' xs@.
@@ -824,44 +887,9 @@ drop i cs0 = drop' i cs0
--
-- @since 0.11.2.0
dropEnd :: Int64 -> ByteString -> ByteString
dropEnd i p | i <= 0 = p
dropEnd i p = go D.empty p
where go :: D.Deque -> ByteString -> ByteString
go deque (Chunk c cs)
| D.byteLength deque < i = go (D.snoc c deque) cs
| otherwise =
let (output, deque') = getOutput empty (D.snoc c deque)
in foldrChunks Chunk (go deque' cs) output
go deque Empty = fromDeque $ dropEndBytes deque i

len c = fromIntegral (S.length c)

-- get a `ByteString` from all the front chunks of the accumulating deque
-- for which we know they won't be dropped
getOutput :: ByteString -> D.Deque -> (ByteString, D.Deque)
getOutput out deque = case D.popFront deque of
Nothing -> (reverseChunks out, deque)
Just (x, deque') | D.byteLength deque' >= i ->
getOutput (Chunk x out) deque'
_ -> (reverseChunks out, deque)

-- reverse a `ByteString`s chunks, keeping all internal `S.StrictByteString`s
-- unchanged
reverseChunks = foldlChunks (flip Chunk) empty

-- drop n elements from the rear of the accumulating `deque`
dropEndBytes :: D.Deque -> Int64 -> D.Deque
dropEndBytes deque n = case D.popRear deque of
Nothing -> deque
Just (deque', x) | len x <= n -> dropEndBytes deque' (n - len x)
| otherwise ->
D.snoc (S.dropEnd (fromIntegral n) x) deque'

-- build a lazy ByteString from an accumulating `deque`
fromDeque :: D.Deque -> ByteString
fromDeque deque =
List.foldr chunk Empty (D.front deque) `append`
List.foldl' (flip chunk) Empty (D.rear deque)
dropEnd i p
| i <= 0 = p
| otherwise = splitAtEndFold Chunk (const Empty) i p

-- | /O(n\/c)/ 'splitAt' @n xs@ is equivalent to @('take' n xs, 'drop' n xs)@.
splitAt :: Int64 -> ByteString -> (ByteString, ByteString)
@@ -1688,6 +1716,9 @@ revNonEmptyChunks = List.foldl' (flip Chunk) Empty
revChunks :: [P.ByteString] -> ByteString
revChunks = List.foldl' (flip chunk) Empty

intToInt64 :: Int -> Int64
intToInt64 = fromIntegral @Int @Int64

-- $IOChunk
--
-- ⚠ Using lazy I\/O functions like 'readFile' or 'hGetContents'
65 changes: 0 additions & 65 deletions Data/ByteString/Lazy/Internal/Deque.hs

This file was deleted.

24 changes: 21 additions & 3 deletions bench/BenchAll.hs
@@ -20,6 +20,8 @@ import Data.Semigroup
import Data.String
import Test.Tasty.Bench
import Prelude hiding (words)
import qualified Data.List as List
import Control.DeepSeq

import qualified Data.ByteString as S
import qualified Data.ByteString.Char8 as S8
@@ -99,9 +101,12 @@ lazyByteStringData = case S.splitAt (nRepl `div` 2) byteStringData of

{-# NOINLINE smallChunksData #-}
smallChunksData :: L.ByteString
smallChunksData
= L.fromChunks [S.take sz (S.drop n byteStringData)
| let sz = 48, n <- [0, sz .. S.length byteStringData]]
smallChunksData = L.fromChunks $ List.unfoldr step (byteStringData, 1)
where
step (!s, !i)
| S.null s = Nothing
| otherwise = case S.splitAt i s of
(!s1, !s2) -> Just (s1, (s2, i * 71 `mod` 97))

{-# NOINLINE byteStringChunksData #-}
byteStringChunksData :: [S.ByteString]
@@ -419,6 +424,19 @@ main = do
[ bench "strict" $ nf S.tails byteStringData
, bench "lazy" $ nf L.tails lazyByteStringData
]
, bgroup "splitAtEnd (lazy)" $ let
testSAE op = \bs -> [op i bs | i <- [0,5..L.length bs]] `deepseq` ()
{-# INLINE testSAE #-}
in
[ bench "takeEnd" $
nf (testSAE L.takeEnd) lazyByteStringData
, bench "takeEnd (small chunks)" $
nf (testSAE L.takeEnd) smallChunksData
, bench "dropEnd" $
nf (testSAE L.dropEnd) lazyByteStringData
, bench "dropEnd (small chunks)" $
nf (testSAE L.dropEnd) smallChunksData
]
, bgroup "sort" $ map (\s -> bench (S8.unpack s) $ nf S.sort s) sortInputs
, bgroup "stimes" $ let st = stimes :: Int -> S.ByteString -> S.ByteString
in
1 change: 0 additions & 1 deletion bytestring.cabal
@@ -106,7 +106,6 @@ library
Data.ByteString.Builder.RealFloat.Internal
Data.ByteString.Builder.RealFloat.TableGenerator
Data.ByteString.Internal.Type
Data.ByteString.Lazy.Internal.Deque
Data.ByteString.Lazy.ReadInt
Data.ByteString.Lazy.ReadNat
Data.ByteString.ReadInt
38 changes: 26 additions & 12 deletions tests/Properties/ByteString.hs
@@ -80,6 +80,10 @@ import Test.Tasty
import Test.Tasty.QuickCheck
import QuickCheckUtils

#ifdef BYTESTRING_LAZY
import Data.Int
#endif

#ifndef BYTESTRING_CHAR8
toElem :: Word8 -> Word8
toElem = id
@@ -114,16 +118,25 @@ instance Arbitrary Natural where

testRdInt :: forall a. (Arbitrary a, RdInt a) => String -> TestTree
testRdInt s = testGroup s $
[ testProperty "from string" $ \ prefix value suffix ->
[ testProperty "from string" $ int64OK $ \value prefix suffix ->
let si = show @a value
b = prefix <> B.pack si <> suffix
in fmap (second B.unpack) (bread @a b)
=== sread @a (B.unpack prefix ++ si ++ B.unpack suffix)
, testProperty "from number" $ \n ->
, testProperty "from number" $ int64OK $ \n ->
bread @a (B.pack (show n)) === Just (n, B.empty)
]
#endif

intToIndexTy :: Int -> IndexTy
#ifdef BYTESTRING_LAZY
type IndexTy = Int64
intToIndexTy = fromIntegral @Int @Int64
#else
type IndexTy = Int
intToIndexTy = id
#endif

tests :: [TestTree]
tests =
[ testProperty "pack . unpack" $
@@ -308,7 +321,7 @@ tests =
#endif

, testProperty "drop" $
\n x -> B.unpack (B.drop n x) === List.genericDrop n (B.unpack x)
\(intToIndexTy -> n) x -> B.unpack (B.drop n x) === List.genericDrop n (B.unpack x)
, testProperty "drop 10" $
\x -> let n = 10 in B.unpack (B.drop n x) === List.genericDrop n (B.unpack x)
, testProperty "drop 2^31" $
@@ -325,7 +338,7 @@ tests =
#endif

, testProperty "take" $
\n x -> B.unpack (B.take n x) === List.genericTake n (B.unpack x)
\(intToIndexTy -> n) x -> B.unpack (B.take n x) === List.genericTake n (B.unpack x)
, testProperty "take 10" $
\x -> let n = 10 in B.unpack (B.take n x) === List.genericTake n (B.unpack x)
, testProperty "take 2^31" $
@@ -342,11 +355,11 @@ tests =
#endif

, testProperty "dropEnd" $
\n x -> B.dropEnd n x === B.take (B.length x - n) x
\(intToIndexTy -> n) x -> B.dropEnd n x === B.take (B.length x - n) x
, testProperty "dropWhileEnd" $
\f x -> B.dropWhileEnd f x === B.reverse (B.dropWhile f (B.reverse x))
, testProperty "takeEnd" $
\n x -> B.takeEnd n x === B.drop (B.length x - n) x
\(intToIndexTy -> n) x -> B.takeEnd n x === B.drop (B.length x - n) x
, testProperty "takeWhileEnd" $
\f x -> B.takeWhileEnd f x === B.reverse (B.takeWhile f (B.reverse x))

@@ -366,7 +379,7 @@ tests =
, testProperty "compareLength 4" $
\x (toElem -> c) -> B.compareLength (B.snoc x c <> undefined) (B.length x) === GT
, testProperty "compareLength 5" $
\x n -> B.compareLength x n === compare (B.length x) n
\x (intToIndexTy -> n) -> B.compareLength x n === compare (B.length x) n
, testProperty "dropEnd lazy" $
\(toElem -> c) -> B.take 1 (B.dropEnd 1 (B.singleton c <> B.singleton c <> B.singleton c <> undefined)) === B.singleton c
, testProperty "dropWhileEnd lazy" $
@@ -470,7 +483,8 @@ tests =
(l1 == l2 || l1 == l2 + 1) && sum (map B.length splits) + l2 == B.length x

, testProperty "splitAt" $
\n x -> (B.unpack *** B.unpack) (B.splitAt n x) === List.genericSplitAt n (B.unpack x)
\(intToIndexTy -> n) x -> (B.unpack *** B.unpack) (B.splitAt n x)
=== List.genericSplitAt n (B.unpack x)
, testProperty "splitAt 10" $
\x -> let n = 10 in (B.unpack *** B.unpack) (B.splitAt n x) === List.genericSplitAt n (B.unpack x)
, testProperty "splitAt (2^31)" $
@@ -594,13 +608,13 @@ tests =
#endif

, testProperty "index" $
\(NonNegative n) x -> fromIntegral n < B.length x ==> B.index x (fromIntegral n) === B.unpack x !! n
\(NonNegative n) x -> intToIndexTy n < B.length x ==> B.index x (intToIndexTy n) === B.unpack x !! n
, testProperty "indexMaybe" $
\(NonNegative n) x -> fromIntegral n < B.length x ==> B.indexMaybe x (fromIntegral n) === Just (B.unpack x !! n)
\(NonNegative n) x -> intToIndexTy n < B.length x ==> B.indexMaybe x (intToIndexTy n) === Just (B.unpack x !! n)
, testProperty "indexMaybe Nothing" $
\n x -> (n :: Int) < 0 || fromIntegral n >= B.length x ==> B.indexMaybe x (fromIntegral n) === Nothing
\n x -> n < 0 || intToIndexTy n >= B.length x ==> B.indexMaybe x (intToIndexTy n) === Nothing
, testProperty "!?" $
\n x -> B.indexMaybe x (fromIntegral (n :: Int)) === x B.!? (fromIntegral n)
\(intToIndexTy -> n) x -> B.indexMaybe x n === x B.!? n

#ifdef BYTESTRING_CHAR8
, testProperty "isString" $
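
The strengthened properties above relate dropEnd/takeEnd to take/drop.
The same laws can be checked standalone with QuickCheck; this sketch
(mine, outside the test suite's CPP machinery) uses plain Int64 counts
and hypothetical property names.

    import Data.Int (Int64)
    import Data.Word (Word8)
    import qualified Data.ByteString.Lazy as L
    import Test.QuickCheck

    -- The laws mirror the "dropEnd" and "takeEnd" properties in the
    -- diff above.
    prop_dropEnd :: Int64 -> [Word8] -> Bool
    prop_dropEnd n ws = let x = L.pack ws
                        in L.dropEnd n x == L.take (L.length x - n) x

    prop_takeEnd :: Int64 -> [Word8] -> Bool
    prop_takeEnd n ws = let x = L.pack ws
                        in L.takeEnd n x == L.drop (L.length x - n) x

    main :: IO ()
    main = quickCheck prop_dropEnd >> quickCheck prop_takeEnd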