Skip to content

Commit

Permalink
Add a concrete syntax for macaw
Browse files Browse the repository at this point in the history
This will enable caching analysis results in a text format that can be parsed
much more efficiently than re-running the entire fixed point analysis.
  • Loading branch information
travitch committed Nov 29, 2021
1 parent 952fe55 commit a56553b
Show file tree
Hide file tree
Showing 4 changed files with 612 additions and 0 deletions.
4 changes: 4 additions & 0 deletions base/macaw-base.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ library
galois-dwarf >= 0.2.2,
IntervalMap >= 0.5,
lens >= 4.7,
megaparsec >= 7 && < 10,
mtl,
parameterized-utils >= 2.1.0 && < 2.2,
prettyprinter >= 1.7.0,
Expand Down Expand Up @@ -78,6 +79,9 @@ library
Data.Macaw.Memory.LoadCommon
Data.Macaw.Memory.Permissions
Data.Macaw.Memory.Symbols
Data.Macaw.Syntax.Atom
Data.Macaw.Syntax.Parser
Data.Macaw.Syntax.SExpr
Data.Macaw.Types
Data.Macaw.Utils.Changed
Data.Macaw.Utils.IncComp
Expand Down
108 changes: 108 additions & 0 deletions base/src/Data/Macaw/Syntax/Atom.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
{-# LANGUAGE OverloadedStrings #-}
-- | The atoms of the concrete syntax for macaw
module Data.Macaw.Syntax.Atom (
Keyword(..)
, keywords
, AtomName(..)
, Atom(..)
)
where

import qualified Data.Map.Strict as Map
import qualified Data.Text as T
import Numeric.Natural ( Natural )

-- | Macaw syntax keywords
--
-- These are the keywords for the *base* macaw IR (i.e., without
-- architecture-specific extensions). The architecture-specific operations are
-- parsed as 'AtomName's initially and resolved by architecture-specific parsers
-- at the atom level.
data Keyword = BVAdd
| BVSub
| BVMul
| BVSDiv
| BVUDiv
| Lt
| Le
| Sle
| Slt
-- Syntax
| Assign
-- Statements
| Comment
| InstructionStart
| WriteMemory
| CondWriteMemory
-- Expressions
| ReadMemory
-- Boolean operations
| Not_
| And_
| Or_
| Xor_
-- Endianness
| BigEndian
| LittleEndian
-- MemRepr
| BVMemRepr
-- Types
| Bool_
| BV_
| Float_
| Tuple_
| Vec_
-- Values
| True_
| False_
| BV
| Undefined
deriving (Eq, Ord, Show)

-- | Uninterpreted atoms
newtype AtomName = AtomText T.Text
deriving (Eq, Ord, Show)

data Atom = Keyword !Keyword -- ^ Keywords include all of the built-in expressions and operators
| AtomName !AtomName -- ^ Non-keyword syntax atoms (to be interpreted at parse time)
| Register !Natural -- ^ A numbered local register (e.g., @r12@)
| AddressWord !Natural -- ^ An arbitrary address rendered in hex ('ArchAddrWord')
| SegmentOffset !Natural -- ^ A segment offset address rendered in hex (validation against the Memory object is required)
| Integer_ !Integer -- ^ Literal integers
| Natural_ !Natural -- ^ Literal naturals
| String_ !T.Text -- ^ Literal strings
deriving (Eq, Ord, Show)

keywords :: Map.Map T.Text Keyword
keywords = Map.fromList [ ("bv-add", BVAdd)
, ("bv-sub", BVSub)
, ("bv-mul", BVMul)
, ("bv-sdiv", BVSDiv)
, ("bv-udiv", BVUDiv)
, ("<", Lt)
, ("<=", Le)
, ("<$", Slt)
, ("<=$", Sle)
, ("not", Not_)
, ("and", And_)
, ("or", Or_)
, ("xor", Xor_)
, ("Bool", Bool_)
, ("BV", BV_)
, ("Float", Float_)
, ("Tuple", Tuple_)
, ("Vec", Vec_)
, ("true", True_)
, ("false", False_)
, ("bv", BV)
, ("undefined", Undefined)
, ("read-memory", ReadMemory)
, (":=", Assign)
, ("comment", Comment)
, ("instruction-start", InstructionStart)
, ("write-memory", WriteMemory)
, ("cond-write-memory", CondWriteMemory)
, ("big-endian", BigEndian)
, ("little-endian", LittleEndian)
, ("bv-mem", BVMemRepr)
]
269 changes: 269 additions & 0 deletions base/src/Data/Macaw/Syntax/Parser.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,269 @@
{-# LANGUAGE DataKinds #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE GADTs #-}
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TypeApplications #-}
{-# LANGUAGE TypeOperators #-}
{-# LANGUAGE ViewPatterns #-}
-- | This module defines a concrete syntax parser for macaw, which is meant to
-- make it easy to persist analysis results and reuse them without recomputing
-- everything
--
-- The persisted artifact is a zip file with one file per function, plus a
-- manifest file that records metadata about the binary to enable the loader to
-- check to ensure that the loaded macaw IR actually matches the given binary.
module Data.Macaw.Syntax.Parser (
parseDiscoveryFunInfo
) where

import Control.Applicative ( (<|>) )
import qualified Data.ByteString as BS
import qualified Data.Map as Map
import qualified Data.Parameterized.Classes as PC
import qualified Data.Parameterized.NatRepr as PN
import Data.Parameterized.Some ( Some(..) )
import qualified Data.Text as T
import GHC.TypeLits ( type (<=) )
import Numeric.Natural ( Natural )
import qualified Text.Megaparsec as TM
import qualified Text.Megaparsec.Char as TMC
import qualified Text.Megaparsec.Char.Lexer as TMCL

import qualified Data.Macaw.CFG as MC
import qualified Data.Macaw.Discovery as MD
import qualified Data.Macaw.Discovery.State as MDS
import qualified Data.Macaw.Discovery.ParsedContents as Parsed
import qualified Data.Macaw.Memory as MM
import Data.Macaw.Syntax.Atom
import qualified Data.Macaw.Syntax.SExpr as SExpr
import qualified Data.Macaw.Types as MT


parse
:: (SExpr.Syntax Atom -> Either SExpr.MacawSyntaxError a)
-> SExpr.Parser arch ids a
parse asSomething = do
at <- SExpr.sexp parseAtom
case asSomething at of
Left err -> TM.customFailure err
Right r -> return r

parseSExpr
:: forall arch ids a
. SExpr.Syntax Atom
-> (SExpr.Syntax Atom -> Either SExpr.MacawSyntaxError a)
-> SExpr.Parser arch ids a
parseSExpr sexp asSomething =
case asSomething sexp of
Left err -> TM.customFailure err
Right r -> return r

data WidthRepr where
WidthRepr :: (1 <= n) => PN.NatRepr n -> WidthRepr

-- | Attempt to convert a 'Natural' into a non-zero 'PN.NatRepr'
asWidthRepr :: Natural -> Maybe WidthRepr
asWidthRepr nat =
case PN.mkNatRepr nat of
Some nr
| Right PN.LeqProof <- PN.isZeroOrGT1 nr -> Just (WidthRepr nr)
| otherwise -> Nothing

asEndianness :: SExpr.Syntax Atom -> Either SExpr.MacawSyntaxError MM.Endianness
asEndianness at =
case at of
SExpr.A (Keyword BigEndian) -> Right MM.BigEndian
SExpr.A (Keyword LittleEndian) -> Right MM.LittleEndian
_ -> Left (SExpr.InvalidEndianness at)

asMemRepr :: SExpr.Syntax Atom -> Either SExpr.MacawSyntaxError (SomeTyped MC.MemRepr)
asMemRepr at =
case at of
SExpr.L [ SExpr.A (Keyword BVMemRepr), SExpr.A (Natural_ w), send ]
| Just (WidthRepr nr) <- asWidthRepr w -> do
end <- asEndianness send
let nr8 = PN.natMultiply (PN.knownNat @8) nr
-- This cannot fail because we already checked that @w@ is >= 1 above
case PN.isZeroOrGT1 nr8 of
Left _ -> error ("Impossible; w was >= 1, so w * 8 must be >= 1: " ++ show nr8)
Right PN.LeqProof -> do
let tyRep = MT.BVTypeRepr nr8
return (SomeTyped tyRep (MC.BVMemRepr nr end))

data SomeTyped f where
SomeTyped :: MT.TypeRepr tp -> f tp -> SomeTyped f

asValue :: SExpr.Syntax Atom -> SExpr.Parser arch ids (SomeTyped (MC.Value arch ids))
asValue at =
case at of
SExpr.A (Keyword True_) -> return (SomeTyped MT.BoolTypeRepr (MC.BoolValue True))
SExpr.A (Keyword False_) -> return (SomeTyped MT.BoolTypeRepr (MC.BoolValue False))
SExpr.L [SExpr.A (Keyword BV), SExpr.A (Natural_ w), SExpr.A (Integer_ i)]
| Just (WidthRepr nr) <- asWidthRepr w -> return (SomeTyped (MT.BVTypeRepr nr) (MC.BVValue nr i))
| otherwise -> TM.customFailure SExpr.InvalidZeroWidthBV
SExpr.A (Register rnum) -> do
Some val <- SExpr.valueForRegisterNumber rnum
return (SomeTyped (MT.typeRepr val) val)

parseIdentifier :: SExpr.Parser arch ids T.Text
parseIdentifier = undefined

parseKeywordOrAtom :: SExpr.Parser arch ids Atom
parseKeywordOrAtom = do
x <- parseIdentifier
return $! maybe (AtomName (AtomText x)) Keyword (Map.lookup x keywords)

parseNatural :: SExpr.Parser arch ids Atom
parseNatural = Natural_ <$> TMCL.decimal

parseInteger :: SExpr.Parser arch ids Atom
parseInteger = TM.try (Integer_ <$> (TMC.char '+' >> TMCL.decimal))
<|> (Integer_ <$> (TMC.char '-' >> TMCL.decimal))

parseAtom :: SExpr.Parser arch ids Atom
parseAtom = TM.try parseKeywordOrAtom
<|> TM.try parseNatural
<|> TM.try parseInteger

asApp :: SExpr.Syntax Atom -> SExpr.Parser arch ids (SomeTyped (MC.App (MC.Value arch ids)))
asApp a =
case a of
SExpr.L [ SExpr.A (Keyword BVAdd), slhs, srhs ] -> do
SomeTyped lrep lhs <- asValue slhs
SomeTyped rrep rhs <- asValue srhs
case (PC.testEquality lrep rrep, lrep) of
(Just PC.Refl, MT.BVTypeRepr nr) -> return (SomeTyped lrep (MC.BVAdd nr lhs rhs))

-- | Parse a single type as a 'MT.TypeRepr'
--
-- The type forms are:
--
-- * @Bool@
-- * @(BV n)@
-- * @(Vec n ty)@
asTypeRepr :: SExpr.Syntax Atom -> Either SExpr.MacawSyntaxError (Some MT.TypeRepr)
asTypeRepr at =
case at of
SExpr.A (Keyword Bool_) -> Right (Some MT.BoolTypeRepr)
SExpr.L [SExpr.A (Keyword BV_), SExpr.A (Natural_ w)]
| Just (WidthRepr nr) <- asWidthRepr w -> Right (Some (MT.BVTypeRepr nr))
| otherwise -> Left SExpr.InvalidZeroWidthBV
SExpr.L [SExpr.A (Keyword Vec_), SExpr.A (Natural_ w), mty] ->
case PN.mkNatRepr w of
Some nr ->
-- Note that zero-width vectors are technically allowed
case asTypeRepr mty of
Right (Some ty) -> Right (Some (MT.VecTypeRepr nr ty))
Left _errs -> Left (SExpr.InvalidVectorPayload mty)
_ -> Left (SExpr.InvalidType at)

-- | Parse the right-hand side of an assignment
--
-- Note that it is important that the EvalApp case is last, as there are many
-- syntactic forms that we might need to dispatch to.
asRHS
:: forall arch ids
. SExpr.Syntax Atom
-> SExpr.Parser arch ids (Some (MC.AssignRhs arch (MC.Value arch ids)))
asRHS a =
case a of
SExpr.L [ SExpr.A (Keyword Undefined), srep ] -> do
Some rep <- parseSExpr srep asTypeRepr
return (Some (MC.SetUndefined rep))
SExpr.L [ SExpr.A (Keyword ReadMemory), saddr, smemRepr ] -> do
SomeTyped addrTp addr <- asValue saddr
SomeTyped _rep memRepr <- parseSExpr smemRepr asMemRepr

memWidth <- SExpr.archMemWidth
let addrRepr = MT.BVTypeRepr memWidth

case PC.testEquality addrTp addrRepr of
Just PC.Refl -> return (Some (MC.ReadMem addr memRepr))
Nothing -> TM.customFailure (SExpr.InvalidAddressWidth a)
_ -> do
SomeTyped _tp app <- asApp a
return (Some (MC.EvalApp app))

-- | Forms:
--
-- * @(comment "msg")@
-- * @(instruction-start addr decoded-asm-text)@
-- * @(write-memory addr mem-rep value)@
-- * @(cond-write-memory cond addr mem-rep value)@
-- * @(reg := rhs)@
parseStmt :: forall arch ids . SExpr.Parser arch ids (MC.Stmt arch ids)
parseStmt = do
at <- SExpr.sexp parseAtom
case at of
SExpr.L [ SExpr.A (Keyword Comment), SExpr.A (String_ txt) ] ->
return (MC.Comment txt)
SExpr.L [ SExpr.A (Keyword InstructionStart), SExpr.A (AddressWord addr), SExpr.A (String_ txt) ] ->
return (MC.InstructionStart (MM.memWord (fromIntegral addr)) txt)
SExpr.L [ SExpr.A (Keyword WriteMemory), stargetAddr, smemRepr, svalue ] -> do
SomeTyped addrTy addr <- asValue stargetAddr
SomeTyped vtp value <- asValue svalue
SomeTyped mtp memRepr <- parseSExpr smemRepr asMemRepr
memWidth <- SExpr.archMemWidth
let addrRepr = MT.BVTypeRepr memWidth
case (PC.testEquality vtp mtp, PC.testEquality addrTy addrRepr) of
(Just PC.Refl, Just PC.Refl) -> do
return (MC.WriteMem addr memRepr value)
-- FIXME: Make a more-specific error
_ -> TM.customFailure (SExpr.InvalidStatement at)
SExpr.L [ SExpr.A (Keyword CondWriteMemory), scond, stargetAddr, smemRepr, svalue ] -> do
SomeTyped condTy cond <- asValue scond
SomeTyped addrTy addr <- asValue stargetAddr
SomeTyped vtp value <- asValue svalue
SomeTyped mtp memRepr <- parseSExpr smemRepr asMemRepr
memWidth <- SExpr.archMemWidth
let addrRepr = MT.BVTypeRepr memWidth
case (PC.testEquality vtp mtp, PC.testEquality addrTy addrRepr, PC.testEquality condTy MT.BoolTypeRepr) of
(Just PC.Refl, Just PC.Refl, Just PC.Refl) -> do
return (MC.CondWriteMem cond addr memRepr value)
-- FIXME: Make a more-specific error
_ -> TM.customFailure (SExpr.InvalidStatement at)
SExpr.L [ SExpr.A (Register r), SExpr.A (Keyword Assign), srhs ] -> do
Some rhs <- asRHS srhs
Some asgn <- SExpr.defineRegister r rhs
return (MC.AssignStmt asgn)
_ -> TM.customFailure (SExpr.InvalidStatement at)

parseBlock :: SExpr.Parser arch ids (Parsed.ParsedBlock arch ids)
parseBlock = do

stmts <- TM.many parseStmt

return Parsed.ParsedBlock { Parsed.pblockAddr = undefined
, Parsed.pblockPrecond = undefined
, Parsed.blockSize = undefined
, Parsed.blockReason = undefined
, Parsed.blockAbstractState = undefined
, Parsed.blockJumpBounds = undefined
, Parsed.pblockStmts = stmts
, Parsed.pblockTermStmt = undefined
}

parseFunction :: SExpr.Parser arch ids (Some (MD.DiscoveryFunInfo arch))
parseFunction = do

blocks <- TM.many parseBlock

let dfi = MDS.DiscoveryFunInfo { MDS.discoveredFunReason = undefined
, MDS.discoveredFunAddr = undefined
, MDS.discoveredFunSymbol = undefined
, MDS._parsedBlocks = Map.fromList [ (Parsed.pblockAddr pb, pb)
| pb <- blocks
]
, MDS.discoveredClassifyFailureResolutions = undefined
}
return (Some dfi)

parseDiscoveryFunInfo
:: (MC.ArchConstraints arch)
=> MM.Memory (MC.ArchAddrWidth arch)
-- ^ The memory of the binary we are loading results for
-> BS.ByteString
-- ^ The bytes of the persisted file
-> Either SExpr.MacawParseError (Some (MD.DiscoveryFunInfo arch))
parseDiscoveryFunInfo mem bytes = SExpr.runParser mem bytes parseFunction
Loading

0 comments on commit a56553b

Please sign in to comment.