Skip to content

Commit

Permalink
[Evaluation] [Names] Define all lookups in terms of 'contIndexZero' (#…
Browse files Browse the repository at this point in the history
…6702)

Instead of 4 different implementations, this will make it 1 main implementation and 3 derivative ones.

This appears to make the validation benchmarks faster by a percent and the nofib benchmarks faster by a couple of percent while simplifying the code, so a clear win.
  • Loading branch information
effectfully authored Dec 4, 2024
1 parent b570f32 commit 34e9bf2
Show file tree
Hide file tree
Showing 3 changed files with 65 additions and 88 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
### Changed

- In #6702, made variable lookup faster, increasing overall performance of the evaluator by 1%.
148 changes: 61 additions & 87 deletions plutus-core/index-envs/src/Data/RandomAccessList/SkewBinary.hs
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,11 @@
{-# LANGUAGE ViewPatterns #-}
module Data.RandomAccessList.SkewBinary
( RAList(Cons,Nil)
, contIndexZero
, contIndexOne
, safeIndexZero
, unsafeIndexZero
, safeIndexOne
, safeIndexOneCont
, unsafeIndexOne
, Data.RandomAccessList.SkewBinary.null
, uncons
Expand Down Expand Up @@ -48,7 +49,7 @@ data RAList a = BHead
null :: RAList a -> Bool
null Nil = True
null _ = False
{-# INLINABLE null #-}
{-# INLINE null #-}

{-# complete Cons, Nil #-}
{-# complete BHead, Nil #-}
Expand All @@ -63,6 +64,7 @@ cons :: a -> RAList a -> RAList a
cons x = \case
(BHead w1 t1 (BHead w2 t2 ts')) | w1 == w2 -> BHead (2*w1+1) (Node x t1 t2) ts'
ts -> BHead 1 (Leaf x) ts
{-# INLINE cons #-}

-- /O(1)/
uncons :: RAList a -> Maybe (a, RAList a)
Expand All @@ -74,122 +76,94 @@ uncons = \case
-- split the node in two)
in Just (x, BHead halfSize t1 $ BHead halfSize t2 ts)
Nil -> Nothing
{-# INLINE uncons #-}

-- | Look up the element at a 0-based index.
-- Calls 'error' with "out of bounds" if the index is not present in the list.
unsafeIndexZero :: RAList a -> Word64 -> a
unsafeIndexZero Nil _ = error "out of bounds"
unsafeIndexZero (BHead w t ts) !i =
-- Indices below 'w' are looked up in the head tree 't'; otherwise skip that tree and
-- continue in the tail 'ts' with the index reduced by 'w'.
if i < w
then indexTree w i t
else unsafeIndexZero ts (i-w)
where
-- Find the element at the given offset inside a tree of the given size.
indexTree :: Word64 -> Word64 -> Tree a -> a
indexTree 1 0 (Leaf x) = x
-- Any other size/offset combination on a leaf is treated as out of bounds.
indexTree _ _ (Leaf _) = error "out of bounds"
-- Offset 0 of a node is the node's own element.
indexTree _ 0 (Node x _ _) = x
indexTree treeSize offset (Node _ t1 t2) =
let halfSize = unsafeShiftR treeSize 1 -- probably faster than `div w 2`
-- After the node's own element, offsets 1..halfSize go to 't1' and the rest to 't2'.
in if offset <= halfSize
then indexTree halfSize (offset - 1) t1
else indexTree halfSize (offset - 1 - halfSize) t2

-- | Look up the element at a 0-based index.
-- Returns 'Nothing' if the index is not present in the list, 'Just' the element otherwise.
safeIndexZero :: RAList a -> Word64 -> Maybe a
safeIndexZero Nil _ = Nothing
safeIndexZero (BHead w t ts) !i =
-- Indices below 'w' are looked up in the head tree 't'; otherwise skip that tree and
-- continue in the tail 'ts' with the index reduced by 'w'.
if i < w
then indexTree w i t
else safeIndexZero ts (i-w)
where
-- Find the element at the given offset inside a tree of the given size.
indexTree :: Word64 -> Word64 -> Tree a -> Maybe a
indexTree 1 0 (Leaf x) = Just x
-- Any other size/offset combination on a leaf yields no result.
indexTree _ _ (Leaf _) = Nothing
-- Offset 0 of a node is the node's own element.
indexTree _ 0 (Node x _ _) = Just x
indexTree treeSize offset (Node _ t1 t2) =
let halfSize = unsafeShiftR treeSize 1 -- probably faster than `div w 2`
-- After the node's own element, offsets 1..halfSize go to 't1' and the rest to 't2'.
in if offset <= halfSize
then indexTree halfSize (offset - 1) t1
else indexTree halfSize (offset - 1 - halfSize) t2

-- | Look up the element at a 1-based index.
-- Calls 'error' with "index zero" when given index 0 on a non-empty list, and with
-- "out of bounds" when the index is not present in the list.
unsafeIndexOne :: RAList a -> Word64 -> a
unsafeIndexOne Nil _ = error "out of bounds"
unsafeIndexOne (BHead w t ts) !i =
-- With 1-based indices the head tree 't' covers 1..w, hence '<=' rather than '<'.
if i <= w
then indexTree w i t
else unsafeIndexOne ts (i-w)
where
-- Find the element at the given 1-based offset inside a tree of the given size.
indexTree :: Word64 -> Word64 -> Tree a -> a
indexTree _ 0 _ = error "index zero"
indexTree 1 1 (Leaf x) = x
-- Any other size/offset combination on a leaf is treated as out of bounds.
indexTree _ _ (Leaf _) = error "out of bounds"
-- Offset 1 of a node is the node's own element.
indexTree _ 1 (Node x _ _) = x
indexTree treeSize offset (Node _ t1 t2) =
let halfSize = unsafeShiftR treeSize 1 -- probably faster than `div w 2`
-- Drop the node's own element; what remains is a 1-based offset into 't1' followed by 't2'.
offset' = offset - 1
in if offset' <= halfSize
then indexTree halfSize offset' t1
else indexTree halfSize (offset' - halfSize) t2

{- Note [Optimizations of safeIndexOneCont]
Bangs in the local definitions of 'safeIndexOneCont' are needed to tell GHC that the functions are
{- Note [Optimizations of contIndexZero]
Bangs in the local definitions of 'contIndexZero' are needed to tell GHC that the functions are
strict in the 'Word64' argument, so that GHC produces workers operating on @Word64#@.
The function itself is CPS-ed, so that the arguments force the local definitions to be retained
within 'safeIndexOneCont' instead of being pulled out via full-laziness or some other optimization
pass. This ensures that when 'safeIndexOneCont' gets inlined, the local definitions appear directly
in the GHC Core, allowing GHC to inline the arguments of 'safeIndexOneCont' and transform the whole
within 'contIndexZero' instead of being pulled out via full-laziness or some other optimization
pass. This ensures that when 'contIndexZero' gets inlined, the local definitions appear directly
in the GHC Core, allowing GHC to inline the arguments of 'contIndexZero' and transform the whole
thing into a beautiful recursive join point full of @Word64#@s, i.e. allocating very little if
anything at all.
-}

-- See Note [Optimizations of safeIndexOneCont].
safeIndexOneCont :: forall a b. b -> (a -> b) -> RAList a -> Word64 -> b
safeIndexOneCont z f = findTree where
-- See Note [Optimizations of contIndexZero].
contIndexZero :: forall a b. b -> (a -> b) -> RAList a -> Word64 -> b
contIndexZero z f = findTree where
findTree :: RAList a -> Word64 -> b
-- See Note [Optimizations of safeIndexOneCont].
-- See Note [Optimizations of contIndexZero].
findTree Nil !_ = z
findTree (BHead w t ts) i =
if i <= w
if i < w
then indexTree w i t
else findTree ts (i-w)

indexTree :: Word64 -> Word64 -> Tree a -> b
-- See Note [Optimizations of safeIndexOneCont].
indexTree !w 1 t = case t of
-- See Note [Optimizations of contIndexZero].
indexTree !w 0 t = case t of
Node x _ _ -> f x
Leaf x -> if w == 1 then f x else z
indexTree _ 0 _ = z -- "index zero"
indexTree _ _ (Leaf _) = z
indexTree treeSize offset (Node _ t1 t2) =
let halfSize = unsafeShiftR treeSize 1 -- probably faster than `div w 2`
offset' = offset - 1
in if offset' <= halfSize
then indexTree halfSize offset' t1
else indexTree halfSize (offset' - halfSize) t2
{-# INLINE safeIndexOneCont #-}
in if offset <= halfSize
then indexTree halfSize (offset - 1) t1
else indexTree halfSize (offset - 1 - halfSize) t2
{-# INLINE contIndexZero #-}

-- | 1-based counterpart of 'contIndexZero'.
-- Index 0 is never valid and yields the fallback directly; any other index is shifted
-- down by one and handed to 'contIndexZero' together with the same fallback and continuation.
contIndexOne :: forall a b. b -> (a -> b) -> RAList a -> Word64 -> b
contIndexOne fallback onFound list ix
  -- Checked before subtracting: 'Word64' is unsigned, so @0 - 1@ would wrap around.
  | ix == 0 = fallback
  | otherwise = contIndexZero fallback onFound list (ix - 1)
{-# INLINE contIndexOne #-}

-- | 0-based lookup defined via 'contIndexZero': the failure result is
-- @error "out of bounds"@ and the success continuation is 'id', so a found element is
-- returned as is.
unsafeIndexZero :: RAList a -> Word64 -> a
unsafeIndexZero = contIndexZero (error "out of bounds") id
{-# INLINE unsafeIndexZero #-}

-- | 0-based lookup defined via 'contIndexZero': the failure result is 'Nothing' and a
-- found element is wrapped in 'Just'.
safeIndexZero :: RAList a -> Word64 -> Maybe a
safeIndexZero = contIndexZero Nothing Just
{-# INLINE safeIndexZero #-}

-- | 1-based lookup defined via 'contIndexOne': the failure result is
-- @error "out of bounds"@ and the success continuation is 'id', so a found element is
-- returned as is.
unsafeIndexOne :: RAList a -> Word64 -> a
unsafeIndexOne = contIndexOne (error "out of bounds") id
{-# INLINE unsafeIndexOne #-}

-- 1-based
safeIndexOne :: RAList a -> Word64 -> Maybe a
safeIndexOne = safeIndexOneCont Nothing Just
safeIndexOne = contIndexOne Nothing Just
{-# INLINE safeIndexOne #-}

instance RAL.RandomAccessList (RAList a) where
type Element (RAList a) = a

{-# INLINABLE empty #-}
empty = Nil
{-# INLINABLE cons #-}
{-# INLINE empty #-}

cons = Cons
{-# INLINABLE uncons #-}
{-# INLINE cons #-}

uncons = uncons
{-# INLINABLE length #-}
length Nil = 0
length (BHead sz _ tl) = sz + RAL.length tl
{-# INLINABLE indexZero #-}
{-# INLINE uncons #-}

length = go 0 where
go !acc Nil = acc
go !acc (BHead sz _ tl) = go (acc + sz) tl
{-# INLINE length #-}

indexZero = safeIndexZero
{-# INLINABLE indexOne #-}
{-# INLINE indexZero #-}

indexOne = safeIndexOne
{-# INLINABLE unsafeIndexZero #-}
{-# INLINE indexOne #-}

unsafeIndexZero = unsafeIndexZero
{-# INLINABLE unsafeIndexOne #-}
{-# INLINE unsafeIndexZero #-}

unsafeIndexOne = unsafeIndexOne
{-# INLINE unsafeIndexOne #-}
Original file line number Diff line number Diff line change
Expand Up @@ -878,7 +878,7 @@ enterComputeCek = computeCek
-- | Look up a variable name in the environment.
lookupVarName :: NamedDeBruijn -> CekValEnv uni fun ann -> CekM uni fun s (CekValue uni fun ann)
lookupVarName varName@(NamedDeBruijn _ varIx) varEnv =
Env.safeIndexOneCont
Env.contIndexOne
(throwingWithCause _MachineError OpenTermEvaluatedMachineError . Just $ Var () varName)
pure
varEnv
Expand Down

1 comment on commit 34e9bf2

@github-actions
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Performance Alert ⚠️

Possible performance regression was detected for benchmark 'Plutus Benchmarks'.
Benchmark result of this commit is worse than the previous benchmark result exceeding threshold 1.05.

Benchmark suite Current: 34e9bf2 Previous: b570f32 Ratio
validation-auction_1-1 180.3 μs 164.3 μs 1.10
validation-auction_1-2 585.8 μs 533.6 μs 1.10
validation-auction_1-3 587.8 μs 534.4 μs 1.10
validation-auction_1-4 234.4 μs 219.4 μs 1.07
validation-auction_2-1 180.4 μs 164.3 μs 1.10
validation-auction_2-2 582.6 μs 532.1 μs 1.09
validation-auction_2-3 758.1 μs 692.4 μs 1.09
validation-auction_2-4 584.7 μs 531.1 μs 1.10
validation-auction_2-5 232.9 μs 211.7 μs 1.10
validation-crowdfunding-success-1 211 μs 190.2 μs 1.11
validation-crowdfunding-success-2 211.9 μs 191.4 μs 1.11
validation-crowdfunding-success-3 212.1 μs 189.8 μs 1.12
validation-currency-1 232.7 μs 212.6 μs 1.09
validation-escrow-redeem_1-1 331.9 μs 300.7 μs 1.10
validation-escrow-redeem_2-1 386.7 μs 348.7 μs 1.11
validation-future-increase-margin-2 484.6 μs 453.4 μs 1.07
validation-future-increase-margin-3 486.6 μs 442.7 μs 1.10
validation-future-increase-margin-4 449.3 μs 414.9 μs 1.08
validation-future-increase-margin-5 741.2 μs 684.1 μs 1.08
validation-future-pay-out-1 233.1 μs 212.1 μs 1.10
validation-future-settle-early-3 485.5 μs 442.8 μs 1.10
validation-game-sm-success_1-2 192.6 μs 181.5 μs 1.06
validation-stablecoin_1-2 196.4 μs 178.8 μs 1.10
validation-decode-auction_1-1 211 μs 192.5 μs 1.10
validation-decode-multisig-sm-4 621.2 μs 574.9 μs 1.08
validation-decode-multisig-sm-5 622.5 μs 576.6 μs 1.08
validation-decode-multisig-sm-6 622.6 μs 567.8 μs 1.10
validation-decode-multisig-sm-7 620.5 μs 575.8 μs 1.08
validation-decode-multisig-sm-8 622.3 μs 574.2 μs 1.08
validation-decode-multisig-sm-9 621.9 μs 574.2 μs 1.08
validation-decode-multisig-sm-10 620.3 μs 574.2 μs 1.08
validation-decode-ping-pong-1 521.7 μs 482.8 μs 1.08
validation-decode-ping-pong-2 522.3 μs 481.7 μs 1.08
validation-decode-ping-pong_2-1 522.1 μs 471.5 μs 1.11
validation-decode-prism-1 174.7 μs 158.2 μs 1.10
validation-decode-prism-2 557.1 μs 515.3 μs 1.08
validation-decode-prism-3 255.1 μs 234.5 μs 1.09
validation-decode-stablecoin_1-1 921.1 μs 834.2 μs 1.10
validation-decode-stablecoin_1-2 178.7 μs 160.5 μs 1.11
validation-decode-stablecoin_1-3 920.4 μs 832.6 μs 1.11
validation-decode-stablecoin_1-4 178.7 μs 167.2 μs 1.07
marlowe-semantics/5e274e0f593511543d41570a4b03646c1d7539062b5728182e073e5760561a66 988.4 μs 905.9 μs 1.09
marlowe-semantics/5e2c68ac9f62580d626636679679b97109109df7ac1a8ce86d3e43dfb5e4f6bc 512 μs 472.1 μs 1.08
marlowe-semantics/5f130d19918807b60eab4c03119d67878fb6c6712c28c54f5a25792049294acc 305.5 μs 280.4 μs 1.09
marlowe-semantics/5f306b4b24ff2b39dab6cdc9ac6ca9bb442c1dc6f4e7e412eeb5a3ced42fb642 744.3 μs 700.7 μs 1.06
marlowe-semantics/66af9e473d75e3f464971f6879cc0f2ef84bafcb38fbfa1dbc31ac2053628a38 1268 μs 1162 μs 1.09
marlowe-semantics/675d63836cad11b547d1b4cddd498f04c919d4342612accf40913f9ae9419fac 1010 μs 929.9 μs 1.09
marlowe-semantics/67ba5a9a0245ee3aff4f34852b9889b8c810fccd3dce2a23910bddd35c503b71 6053 μs 5602 μs 1.08
marlowe-semantics/6d88f7294dd2b5ce02c3dc609bc7715bd508009738401d264bf9b3eb7c6f49c1 481.4 μs 440 μs 1.09
marlowe-semantics/70f65b21b77ddb451f3df9d9fb403ced3d10e1e953867cc4900cc25e5b9dec47 784.1 μs 719.9 μs 1.09
marlowe-semantics/71965c9ccae31f1ffc1d85aa20a356d4ed97a420954018d8301ec4f9783be0d7 468.4 μs 429.3 μs 1.09
marlowe-semantics/74c67f2f182b9a0a66c62b95d6fac5ace3f7e71ea3abfc52ffbe3ecb93436ea2 774.6 μs 711.8 μs 1.09
marlowe-semantics/7529b206a78becb793da74b78c04d9d33a2540a1abd79718e681228f4057403a 781.3 μs 712.2 μs 1.10
marlowe-semantics/75a8bb183688bce447e00f435a144c835435e40a5defc6f3b9be68b70b4a3db6 685.9 μs 630.6 μs 1.09
marlowe-semantics/7a758e17486d1a30462c32a5d5309bd1e98322a9dcbe277c143ed3aede9d265f 525.1 μs 479 μs 1.10
marlowe-semantics/7cbc5644b745f4ea635aca42cce5e4a4b9d2e61afdb3ac18128e1688c07071ba 462.2 μs 423.7 μs 1.09
marlowe-semantics/82213dfdb6a812b40446438767c61a388d2c0cfd0cbf7fd4a372b0dc59fa17e1 1317 μs 1205 μs 1.09
marlowe-semantics/8c7fdc3da6822b5112074380003524f50fb3a1ce6db4e501df1086773c6c0201 1128 μs 1026 μs 1.10
marlowe-semantics/8d9ae67656a2911ab15a8e5301c960c69aa2517055197aff6b60a87ff718d66c 350.2 μs 320.8 μs 1.09
marlowe-semantics/96e1a2fa3ceb9a402f2a5841a0b645f87b4e8e75beb636692478ec39f74ee221 304.9 μs 279 μs 1.09
marlowe-role-payout/c99ecc2146ce2066ba6dffc734923264f8794815acbc2ec74c2c2c42ba272e4d 213.5 μs 195.7 μs 1.09
marlowe-role-payout/caa409c40e39aed9b0f59214b4baa178c375526dea6026b4552b88d2cc729716 170.1 μs 154.9 μs 1.10
marlowe-role-payout/cb2ab8e22d1f64e8d204dece092e90e9bf1fa8b2a6e9cba5012dbe4978065832 180.8 μs 164.6 μs 1.10
marlowe-role-payout/cc1e82927f6c65b3e912200ae30588793d2066e1d4a6627c21955944ac9bd528 199.3 μs 181.9 μs 1.10
marlowe-role-payout/d5cda74eb0947e025e02fb8ed365df39d0a43e4b42cd3573ac2d8fcb29115997 195.1 μs 178.9 μs 1.09
marlowe-role-payout/d6bc8ac4155e22300085784148bbc9d9bbfea896e1009dd396610a90e3943032 198.3 μs 181.4 μs 1.09
marlowe-role-payout/da353bf9219801fa1bf703fc161497570954e9af7e10ffe95c911a9ef97e77bd 182.8 μs 166.4 μs 1.10
marlowe-role-payout/dc45c5f1b700b1334db99f50823321daaef0e6925b9b2fabbc9df7cde65af62e 185.9 μs 169.7 μs 1.10
marlowe-role-payout/df487b2fd5c1583fa33644423849bc1ab5f02f37edc0c235f34ef01cb12604f6 185.1 μs 168.3 μs 1.10
marlowe-role-payout/eabeeae18131af89fa57936c0e9eb8d2c7adba534f7e1a517d75410028fa0d6c 179.1 μs 161.9 μs 1.11
marlowe-role-payout/ec4712ee820eb959a43ebedfab6735f2325fa52994747526ffd2a4f4f84dd58e 201.7 μs 183.8 μs 1.10
marlowe-role-payout/ee3962fbd7373360f46decef3c9bda536a0b1daf6cda3b8a4bcfd6deeb5b4c53 200.8 μs 184.3 μs 1.09
marlowe-role-payout/f1a1e6a487f91feca5606f72bbb1e948c71abf043c6a0ea83bfea9ec6a0f08d8 178.4 μs 162.2 μs 1.10
marlowe-role-payout/f2932e4ca4bbb94b0a9ffbe95fcb7bd5639d9751d75d56d5e14efa5bbed981df 175.9 μs 160.3 μs 1.10
marlowe-role-payout/f53e8cafe26647ccce51e4c31db13608aea1f39034c0f52dee2e5634ef66e747 193.1 μs 176.8 μs 1.09
marlowe-role-payout/f7275afb60e33a550df13a132102e7e925dd28965a4efbe510a89b077ff9417f 178.3 μs 161.7 μs 1.10
marlowe-role-payout/fc8c5f45ffcdb024c21e0f34b22c23de8045a94d5e1a5bda1555c45ddb059f82 184.8 μs 168.1 μs 1.10
marlowe-role-payout/ff38b1ec89952d0247630f107a90cbbeb92ecbfcd19b284f60255718e4ec7548 210.6 μs 191.8 μs 1.10

This comment was automatically generated by workflow using github-action-benchmark.

CC: @IntersectMBO/plutus-core

Please sign in to comment.