From 1ae86be323ff0561a313810303f779add0f7da76 Mon Sep 17 00:00:00 2001 From: Bryan Richter Date: Wed, 12 Jul 2023 19:33:34 +0300 Subject: [PATCH] Implement pure-Haskell reverse --- cbits/measure_off.c | 3 + src/Data/Text.hs | 27 +------- src/Data/Text/Internal/Reverse.hs | 102 ++++++++++++++++++++++++++++++ text.cabal | 26 +++++++- 4 files changed, 131 insertions(+), 27 deletions(-) create mode 100644 src/Data/Text/Internal/Reverse.hs diff --git a/cbits/measure_off.c b/cbits/measure_off.c index e44088fd..5f098f85 100644 --- a/cbits/measure_off.c +++ b/cbits/measure_off.c @@ -70,6 +70,9 @@ static inline const ssize_t measure_off_naive(const uint8_t *src, const uint8_t while (src < srcend - 7){ uint64_t w64; memcpy(&w64, src, sizeof(uint64_t)); + // find leading bytes by finding every byte that is not a continuation + // byte. The bit twiddle only results in a 0 if the original byte starts + // with 0b11... w64 = ((w64 << 1) | ~w64) & 0x8080808080808080ULL; // compute the popcount of w64 with two bit shifts and a multiplication size_t leads = ( (w64 >> 7) // w64 >> 7 = Sum{0<= i <= 7} x_i * 256^i (x_i \in {0,1}) diff --git a/src/Data/Text.hs b/src/Data/Text.hs index 0fca9bd9..60b1aeda 100644 --- a/src/Data/Text.hs +++ b/src/Data/Text.hs @@ -213,7 +213,7 @@ import Prelude (Char, Bool(..), Int, Maybe(..), String, Eq, (==), (/=), Ord(..), Ordering(..), (++), Monad(..), pure, Read(..), (&&), (||), (+), (-), (.), ($), ($!), (>>), - not, return, otherwise, quot, IO) + not, return, otherwise, quot) import Control.DeepSeq (NFData(rnf)) #if defined(ASSERTS) import Control.Exception (assert) @@ -224,13 +224,13 @@ import Data.Data (Data(gfoldl, toConstr, gunfold, dataTypeOf), constrIndex, Constr, mkConstr, DataType, mkDataType, Fixity(Prefix)) import Control.Monad (foldM) import Control.Monad.ST (ST, runST) -import Control.Monad.ST.Unsafe (unsafeIOToST) import qualified Data.Text.Array as A import qualified Data.List as L hiding (head, tail) import Data.Binary (Binary(get, put)) import Data.Monoid (Monoid(..)) import Data.Semigroup (Semigroup(..)) import Data.String (IsString(..)) +import Data.Text.Internal.Reverse (reverse) import Data.Text.Internal.Encoding.Utf8 (utf8Length, utf8LengthByLeader, chr2, chr3, chr4, ord2, ord3, ord4) import qualified Data.Text.Internal.Fusion as S import Data.Text.Internal.Fusion.CaseMapping (foldMapping, lowerMapping, upperMapping) @@ -746,29 +746,6 @@ intersperse c t@(Text src o l) = if l == 0 then mempty else runST $ do return (Text arr 0 (dstLen - cLen)) {-# INLINE [1] intersperse #-} --- | /O(n)/ Reverse the characters of a string. --- --- Example: --- --- >>> T.reverse "desrever" --- "reversed" -reverse :: -#if defined(ASSERTS) - HasCallStack => -#endif - Text -> Text -reverse (Text (A.ByteArray ba) off len) = runST $ do - marr@(A.MutableByteArray mba) <- A.new len - unsafeIOToST $ c_reverse mba ba (intToCSize off) (intToCSize len) - brr <- A.unsafeFreeze marr - return $ Text brr 0 len -{-# INLINE reverse #-} - --- | The input buffer (src :: ByteArray#, off :: CSize, len :: CSize) --- must specify a valid UTF-8 sequence, this condition is not checked. -foreign import ccall unsafe "_hs_text_reverse" c_reverse - :: Exts.MutableByteArray# s -> ByteArray# -> CSize -> CSize -> IO () - -- | /O(m+n)/ Replace every non-overlapping occurrence of @needle@ in -- @haystack@ with @replacement@. -- diff --git a/src/Data/Text/Internal/Reverse.hs b/src/Data/Text/Internal/Reverse.hs new file mode 100644 index 00000000..f3b91b46 --- /dev/null +++ b/src/Data/Text/Internal/Reverse.hs @@ -0,0 +1,102 @@ +{-# LANGUAGE CPP #-} +{-# LANGUAGE MagicHash #-} +{-# LANGUAGE UnliftedFFITypes #-} +{-# LANGUAGE NoImplicitPrelude #-} +#if defined(PURE_HASKELL) +{-# LANGUAGE BangPatterns #-} +#endif + +{-# OPTIONS_HADDOCK not-home #-} + +{-# OPTIONS_GHC -ddump-to-file -ddump-simpl -dsuppress-all -dno-suppress-type-signatures #-} + +-- | Implements 'reverse', using efficient C routines by default. +module Data.Text.Internal.Reverse (reverse) where + +#if !defined(PURE_HASKELL) +import GHC.Exts as Exts +import Control.Monad.ST.Unsafe (unsafeIOToST) +import Foreign.C.Types (CSize(..)) +#else +import Control.Monad.ST (ST) +import Data.Text.Internal.Encoding.Utf8 (utf8LengthByLeader) +#endif +#if defined(ASSERTS) +import GHC.Stack (HasCallStack) +#endif +import Prelude hiding (reverse) +import Data.Text.Internal (Text(..)) +import Control.Monad.ST (runST) +import qualified Data.Text.Array as A + +-- | /O(n)/ Reverse the characters of a string. +-- +-- Example: +-- +-- $setup +-- >>> T.reverse "desrever" +-- "reversed" +reverse :: +#if defined(ASSERTS) + HasCallStack => +#endif + Text -> Text +#if defined(PURE_HASKELL) +reverse (Text src off len) = runST $ do + dest <- A.new len + _ <- reversePoints src off dest len + result <- A.unsafeFreeze dest + pure $ Text result 0 len + +-- Step 0: +-- +-- Input: R E D R U M +-- ^ +-- x +-- Output: _ _ _ _ _ _ +-- ^ +-- y +-- +-- Step 1: +-- +-- Input: R E D R U M +-- ^ +-- x +-- +-- Output: _ _ _ _ _ R +-- ^ +-- y +reversePoints + :: A.Array -- ^ Input array + -> Int -- ^ Input index + -> A.MArray s -- ^ Output array + -> Int -- ^ Output index + -> ST s () +reversePoints src xx dest yy = go xx yy where + go !_ y | y <= 0 = pure () + go x y = + let pLen = utf8LengthByLeader (A.unsafeIndex src x) + -- The next y is also the start of the current point in the output + yNext = y - pLen + in do + A.copyI pLen dest yNext src x + go (x + pLen) yNext +#else +reverse (Text (A.ByteArray ba) off len) = runST $ do + marr@(A.MutableByteArray mba) <- A.new len + unsafeIOToST $ c_reverse mba ba (fromIntegral off) (fromIntegral len) + brr <- A.unsafeFreeze marr + return $ Text brr 0 len +#endif +{-# INLINE reverse #-} + +#if !defined(PURE_HASKELL) +-- | The input buffer (src :: ByteArray#, off :: CSize, len :: CSize) +-- must specify a valid UTF-8 sequence, this condition is not checked. +foreign import ccall unsafe "_hs_text_reverse" c_reverse + :: Exts.MutableByteArray# s -> ByteArray# -> CSize -> CSize -> IO () +#endif + +-- $setup +-- >>> :set -XOverloadedStrings +-- >>> import qualified Data.Text.Internal.Reverse as T diff --git a/text.cabal b/text.cabal index 15aa1d4d..bad26bb4 100644 --- a/text.cabal +++ b/text.cabal @@ -79,14 +79,35 @@ flag simdutf default: True manual: True +flag pure-haskell + description: Don't use text's standard C routines + NB: This feature is not fully implemented. Several C routines are still in + use. + + When this flag is true, text will use pure Haskell variants of the + routines. This is not recommended except for use with GHC's JavaScript + backend. + + This flag also disables simdutf. + + default: False + manual: True + library - c-sources: cbits/is_ascii.c + if arch(javascript) || flag(pure-haskell) + cpp-options: -DPURE_HASKELL + c-sources: cbits/is_ascii.c + cbits/measure_off.c + cbits/utils.c + else + c-sources: cbits/is_ascii.c cbits/measure_off.c cbits/reverse.c cbits/utils.c + hs-source-dirs: src - if flag(simdutf) + if flag(simdutf) && !(arch(javascript) || flag(pure-haskell)) exposed-modules: Data.Text.Internal.Validate.Simd include-dirs: simdutf cxx-sources: simdutf/simdutf.cpp @@ -185,6 +206,7 @@ library other-modules: Data.Text.Show + Data.Text.Internal.Reverse build-depends: array >= 0.3 && < 0.6,