Skip to content

Commit

Permalink
Merge pull request #17577 from yoff/python/add-comprehension-capture-…
Browse files Browse the repository at this point in the history
…flow

python: capture flow through comprehensions
  • Loading branch information
yoff authored Oct 4, 2024
2 parents 04a4fb2 + 6f5b949 commit 6bb98b0
Show file tree
Hide file tree
Showing 17 changed files with 170 additions and 60 deletions.
5 changes: 5 additions & 0 deletions python/ql/lib/change-notes/2024-10-01-comprehension-flow.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
category: minorAnalysis
---
* More precise modelling of the dataflow through comprehensions. In particular, captured variables are now handled correctly.
* Dataflow out of yield is added, allowing proper tracing through generators.
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,10 @@ newtype TArgumentPosition =
// position, we need to ensure we make these available (these are specified as
// parameters in the flow-summary spec)
FlowSummaryImpl::ParsePositions::isParsedPositionalParameterPosition(_, index)
or
// the generated function inside a comprehension has a positional argument at index 0
exists(Comp c) and
index = 0
} or
TKeywordArgumentPosition(string name) {
exists(any(CallNode c).getArgByName(name))
Expand Down Expand Up @@ -314,12 +318,11 @@ newtype TDataFlowCallable =
* class instantiations, and (in the future) special methods.
*/
TFunction(Function func) {
// For generators/list-comprehensions we create a synthetic function. In the
// points-to call-graph these were not considered callable, and instead we added
// data-flow steps (read/write) for these. As an easy solution for now, we do the
// same to keep things easy to reason about (and therefore exclude things that do
// not have a definition)
// Functions with an explicit definition
exists(func.getDefinition())
or
// For generators/list-comprehensions we create a synthetic function.
exists(Comp c | c.getFunction() = func)
} or
/** see QLDoc for `DataFlowModuleScope` for why we need this. */
TModule(Module m) or
Expand Down Expand Up @@ -1381,6 +1384,8 @@ private predicate sameEnclosingCallable(Node node1, Node node2) {
// =============================================================================
newtype TDataFlowCall =
TNormalCall(CallNode call, Function target, CallType type) { resolveCall(call, target, type) } or
/** A call to the generated function inside a comprehension */
TComprehensionCall(Comp c) or
TPotentialLibraryCall(CallNode call) or
/** A synthesized call inside a summarized callable */
TSummaryCall(
Expand Down Expand Up @@ -1467,6 +1472,34 @@ class NormalCall extends ExtractedDataFlowCall, TNormalCall {
CallType getCallType() { result = type }
}

/** A call to the generated function inside a comprhension */
class ComprehensionCall extends ExtractedDataFlowCall, TComprehensionCall {
Comp c;
Function target;

ComprehensionCall() {
this = TComprehensionCall(c) and
target = c.getFunction()
}

Comp getComprehension() { result = c }

override string toString() { result = "comprehension call" }

override ControlFlowNode getNode() { result.getNode() = c }

override Scope getScope() { result = c.getScope() }

override DataFlowCallable getCallable() { result.(DataFlowFunction).getScope() = target }

override ArgumentNode getArgument(ArgumentPosition apos) {
result.asExpr() = c.getIterable() and
apos.isPositional(0)
}

override Location getLocation() { result = c.getLocation() }
}

/**
* A potential call to a summarized callable, a `LibraryCallable`.
*
Expand Down Expand Up @@ -1697,6 +1730,47 @@ class CapturedVariablesArgumentNode extends CfgNode, ArgumentNode {
}
}

/** A synthetic node representing the values of variables captured by a comprehension. */
class SynthCompCapturedVariablesArgumentNode extends Node, TSynthCompCapturedVariablesArgumentNode,
ArgumentNode
{
Comp comp;

SynthCompCapturedVariablesArgumentNode() { this = TSynthCompCapturedVariablesArgumentNode(comp) }

override string toString() { result = "Capturing closure argument (comp)" }

override Scope getScope() { result = comp.getScope() }

override Location getLocation() { result = comp.getLocation() }

Comp getComprehension() { result = comp }

override predicate argumentOf(DataFlowCall call, ArgumentPosition pos) {
call.(ComprehensionCall).getComprehension() = comp and
pos.isLambdaSelf()
}
}

/** A synthetic node representing the values of variables captured by a comprehension after the output has been computed. */
class SynthCompCapturedVariablesArgumentPostUpdateNode extends PostUpdateNodeImpl,
TSynthCompCapturedVariablesArgumentPostUpdateNode
{
Comp comp;

SynthCompCapturedVariablesArgumentPostUpdateNode() {
this = TSynthCompCapturedVariablesArgumentPostUpdateNode(comp)
}

override string toString() { result = "[post] Capturing closure argument (comp)" }

override Scope getScope() { result = comp.getScope() }

override Location getLocation() { result = comp.getLocation() }

override Node getPreUpdateNode() { result = TSynthCompCapturedVariablesArgumentNode(comp) }
}

/** Gets a viable run-time target for the call `call`. */
DataFlowCallable viableCallable(DataFlowCall call) {
call instanceof ExtractedDataFlowCall and
Expand Down Expand Up @@ -1736,7 +1810,10 @@ abstract class ReturnNode extends Node {
/** A data flow node that represents a value returned by a callable. */
class ExtractedReturnNode extends ReturnNode, CfgNode {
// See `TaintTrackingImplementation::returnFlowStep`
ExtractedReturnNode() { node = any(Return ret).getValue().getAFlowNode() }
ExtractedReturnNode() {
node = any(Return ret).getValue().getAFlowNode() or
node = any(Yield yield).getAFlowNode()
}

override ReturnKind getKind() { any() }
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,26 @@ private predicate synthDictSplatArgumentNodeStoreStep(
)
}

/**
* Holds if `nodeFrom` is the value yielded by the `yield` found at `nodeTo`.
*
* For example, in
* ```python
* for x in l:
* yield x.name
* ```
* data from `x.name` is stored into the `yield` (and can subsequently be read out of the iterable).
*/
predicate yieldStoreStep(Node nodeFrom, Content c, Node nodeTo) {
exists(Yield yield |
nodeTo.asCfgNode() = yield.getAFlowNode() and
nodeFrom.asCfgNode() = yield.getValue().getAFlowNode() and
// TODO: Consider if this will also need to transfer dictionary content
// once dictionary comprehensions are supported.
c instanceof ListElementContent
)
}

/**
* Ensures that the a `**kwargs` parameter will not contain elements with names of
* keyword parameters.
Expand Down Expand Up @@ -256,6 +276,12 @@ abstract class PostUpdateNodeImpl extends Node {
abstract Node getPreUpdateNode();
}

/**
* A post-update node synthesised for an existing control flow node.
* Add to `TSyntheticPostUpdateNode` to get the synthetic post-update node synthesised.
*
* Synthetic post-update nodes for synthetic nodes need to be listed one by one.
*/
class SyntheticPostUpdateNode extends PostUpdateNodeImpl, TSyntheticPostUpdateNode {
ControlFlowNode node;

Expand All @@ -270,6 +296,11 @@ class SyntheticPostUpdateNode extends PostUpdateNodeImpl, TSyntheticPostUpdateNo
override Location getLocation() { result = node.getLocation() }
}

/**
* An existsing control flow node being the post-update node of a synthetic pre-update node.
*
* Synthetic post-update nodes for synthetic nodes need to be listed one by one.
*/
class NonSyntheticPostUpdateNode extends PostUpdateNodeImpl, CfgNode {
SyntheticPreUpdateNode pre;

Expand Down Expand Up @@ -668,8 +699,6 @@ predicate storeStep(Node nodeFrom, ContentSet c, Node nodeTo) {
or
setStoreStep(nodeFrom, c, nodeTo)
or
comprehensionStoreStep(nodeFrom, c, nodeTo)
or
attributeStoreStep(nodeFrom, c, nodeTo)
or
matchStoreStep(nodeFrom, c, nodeTo)
Expand All @@ -683,6 +712,8 @@ predicate storeStep(Node nodeFrom, ContentSet c, Node nodeTo) {
or
synthDictSplatArgumentNodeStoreStep(nodeFrom, c, nodeTo)
or
yieldStoreStep(nodeFrom, c, nodeTo)
or
VariableCapture::storeStep(nodeFrom, c, nodeTo)
}

Expand Down Expand Up @@ -843,31 +874,6 @@ predicate dictClearStep(Node node, DictionaryElementContent c) {
)
}

/** Data flows from an element expression in a comprehension to the comprehension. */
predicate comprehensionStoreStep(CfgNode nodeFrom, Content c, CfgNode nodeTo) {
// Comprehension
// `[x+1 for x in l]`
// nodeFrom is `x+1`, cfg node
// nodeTo is `[x+1 for x in l]`, cfg node
// c denotes list or set or dictionary without index
//
// List
nodeTo.getNode().getNode().(ListComp).getElt() = nodeFrom.getNode().getNode() and
c instanceof ListElementContent
or
// Set
nodeTo.getNode().getNode().(SetComp).getElt() = nodeFrom.getNode().getNode() and
c instanceof SetElementContent
or
// Dictionary
nodeTo.getNode().getNode().(DictComp).getElt() = nodeFrom.getNode().getNode() and
c instanceof DictionaryElementAnyContent
or
// Generator
nodeTo.getNode().getNode().(GeneratorExp).getElt() = nodeFrom.getNode().getNode() and
c instanceof ListElementContent
}

/**
* Holds if `nodeFrom` flows into the attribute `c` of `nodeTo` via an attribute assignment.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,9 @@ newtype TNode =
def.getDefiningNode() = node and
def.getParameter() = func.getArg(0)
)
or
// the iterable argument to the implicit comprehension function
node.getNode() = any(Comp c).getIterable()
} or
/** A node representing a global (module-level) variable in a specific module. */
TModuleVariableNode(Module m, GlobalVariable v) {
Expand Down Expand Up @@ -124,9 +127,16 @@ newtype TNode =
/** A synthetic node representing the heap of a function. Used for variable capture. */
TSynthCapturedVariablesParameterNode(Function f) {
f = any(VariableCapture::CapturedVariable v).getACapturingScope() and
// TODO: Remove this restriction when adding proper support for captured variables in the body of the function we generate for comprehensions
exists(TFunction(f))
} or
/** A synthetic node representing the values of variables captured by a comprehension. */
TSynthCompCapturedVariablesArgumentNode(Comp comp) {
comp.getFunction() = any(VariableCapture::CapturedVariable v).getACapturingScope()
} or
/** A synthetic node representing the values of variables captured by a comprehension after the output has been computed. */
TSynthCompCapturedVariablesArgumentPostUpdateNode(Comp comp) {
comp.getFunction() = any(VariableCapture::CapturedVariable v).getACapturingScope()
} or
/** An empty, unused node type that exists to prevent unwanted dependencies on data flow nodes. */
TForbiddenRecursionGuard() {
none() and
Expand Down Expand Up @@ -350,6 +360,9 @@ class ExtractedArgumentNode extends ArgumentNode {
or
// and self arguments
this.asCfgNode() = any(CallNode c).getFunction().(AttrNode).getObject()
or
// for comprehensions, we allow the synthetic `iterable` argument
this.asExpr() = any(Comp c).getIterable()
}

final override predicate argumentOf(DataFlowCall call, ArgumentPosition pos) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@ class ForTarget extends ControlFlowNode {
)
or
exists(Comp comp |
source = comp.getIterable() and
source = comp.getFunction().getArg(0) and
this.getNode() = comp.getNthInnerLoop(0).getTarget()
)
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ predicate containerStep(DataFlow::Node nodeFrom, DataFlow::Node nodeTo) {
// TODO: once we have proper flow-summary modeling, we might not need this step any
// longer -- but there needs to be a matching read-step for the store-step, and we
// don't provide that right now.
DataFlowPrivate::comprehensionStoreStep(nodeFrom, _, nodeTo)
DataFlowPrivate::yieldStoreStep(nodeFrom, _, nodeTo)
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,11 @@ module Flow = Shared::Flow<Location, CaptureInput>;
private Flow::ClosureNode asClosureNode(Node n) {
result = n.(SynthCaptureNode).getSynthesizedCaptureNode()
or
exists(Comp comp | n = TSynthCompCapturedVariablesArgumentNode(comp) |
result.(Flow::ExprNode).getExpr().getNode() = comp
)
or
// TODO: Should the `Comp`s above be excluded here?
result.(Flow::ExprNode).getExpr() = n.(CfgNode).getNode()
or
result.(Flow::VariableWriteSourceNode).getVariableWrite() = n.(CfgNode).getNode()
Expand Down
9 changes: 7 additions & 2 deletions python/ql/lib/semmle/python/frameworks/Stdlib.qll
Original file line number Diff line number Diff line change
Expand Up @@ -4251,22 +4251,27 @@ module StdlibPrivate {

override predicate propagatesFlow(string input, string output, boolean preservesValue) {
// The positional argument contains a mapping.
// TODO: Add the list-of-pairs version
// TODO: these values can be overwritten by keyword arguments
// - dict mapping
exists(DataFlow::DictionaryElementContent dc, string key | key = dc.getKey() |
input = "Argument[0].DictionaryElement[" + key + "]" and
output = "ReturnValue.DictionaryElement[" + key + "]" and
preservesValue = true
)
or
// - list-of-pairs mapping
input = "Argument[0].ListElement.TupleElement[1]" and
output = "ReturnValue.DictionaryElementAny" and
preservesValue = true
or
// The keyword arguments are added to the dictionary.
exists(DataFlow::DictionaryElementContent dc, string key | key = dc.getKey() |
input = "Argument[" + key + ":]" and
output = "ReturnValue.DictionaryElement[" + key + "]" and
preservesValue = true
)
or
// Imprecise content in any argument ends up on the container itself.
// Imprecise content in the first argument ends up on the container itself.
input = "Argument[0]" and
output = "ReturnValue" and
preservesValue = false
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,9 @@ edges
| TarSlipImprov.py:142:9:142:13 | ControlFlowNode for entry | TarSlipImprov.py:143:36:143:40 | ControlFlowNode for entry | provenance | |
| TarSlipImprov.py:151:14:151:50 | ControlFlowNode for closing() | TarSlipImprov.py:151:55:151:56 | ControlFlowNode for tf | provenance | |
| TarSlipImprov.py:151:22:151:49 | ControlFlowNode for Attribute() | TarSlipImprov.py:151:14:151:50 | ControlFlowNode for closing() | provenance | Config |
| TarSlipImprov.py:151:55:151:56 | ControlFlowNode for tf | TarSlipImprov.py:152:13:152:20 | ControlFlowNode for Yield | provenance | |
| TarSlipImprov.py:151:55:151:56 | ControlFlowNode for tf | TarSlipImprov.py:152:19:152:20 | ControlFlowNode for tf | provenance | |
| TarSlipImprov.py:152:13:152:20 | ControlFlowNode for Yield | TarSlipImprov.py:157:18:157:40 | ControlFlowNode for py2_tarxz() | provenance | |
| TarSlipImprov.py:152:19:152:20 | ControlFlowNode for tf | TarSlipImprov.py:157:18:157:40 | ControlFlowNode for py2_tarxz() | provenance | |
| TarSlipImprov.py:157:9:157:14 | ControlFlowNode for tar_cm | TarSlipImprov.py:162:20:162:23 | ControlFlowNode for tarc | provenance | |
| TarSlipImprov.py:157:18:157:40 | ControlFlowNode for py2_tarxz() | TarSlipImprov.py:157:9:157:14 | ControlFlowNode for tar_cm | provenance | |
Expand Down Expand Up @@ -131,6 +133,7 @@ nodes
| TarSlipImprov.py:151:14:151:50 | ControlFlowNode for closing() | semmle.label | ControlFlowNode for closing() |
| TarSlipImprov.py:151:22:151:49 | ControlFlowNode for Attribute() | semmle.label | ControlFlowNode for Attribute() |
| TarSlipImprov.py:151:55:151:56 | ControlFlowNode for tf | semmle.label | ControlFlowNode for tf |
| TarSlipImprov.py:152:13:152:20 | ControlFlowNode for Yield | semmle.label | ControlFlowNode for Yield |
| TarSlipImprov.py:152:19:152:20 | ControlFlowNode for tf | semmle.label | ControlFlowNode for tf |
| TarSlipImprov.py:157:9:157:14 | ControlFlowNode for tar_cm | semmle.label | ControlFlowNode for tar_cm |
| TarSlipImprov.py:157:18:157:40 | ControlFlowNode for py2_tarxz() | semmle.label | ControlFlowNode for py2_tarxz() |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,4 @@
| test.py:208:1:208:53 | Entry definition for SsaSourceVariable SINK | test.py:210:5:210:8 | ControlFlowNode for SINK |
| test.py:208:1:208:53 | Entry definition for SsaSourceVariable SOURCE | test.py:209:25:209:30 | ControlFlowNode for SOURCE |
| test.py:209:5:209:5 | ControlFlowNode for x | test.py:210:10:210:10 | ControlFlowNode for x |
| test.py:209:9:209:68 | ControlFlowNode for .0 | test.py:209:9:209:68 | ControlFlowNode for .0 |
| test.py:209:9:209:68 | ControlFlowNode for ListComp | test.py:209:5:209:5 | ControlFlowNode for x |
| test.py:209:16:209:16 | ControlFlowNode for v | test.py:209:45:209:45 | ControlFlowNode for v |
| test.py:209:40:209:40 | ControlFlowNode for u | test.py:209:56:209:56 | ControlFlowNode for u |
| test.py:209:51:209:51 | ControlFlowNode for z | test.py:209:67:209:67 | ControlFlowNode for z |
| test.py:209:62:209:62 | ControlFlowNode for y | test.py:209:10:209:10 | ControlFlowNode for y |
6 changes: 3 additions & 3 deletions python/ql/test/library-tests/dataflow/coverage/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ def test_list_comprehension_with_tuple_result():
s = SOURCE
ns = NONSOURCE
l3 = [(s, ns) for _ in [1]]
SINK(l3[0][0]) # $ MISSING: flow="SOURCE, l:-3 -> l3[0][0]"
SINK(l3[0][0]) # $ flow="SOURCE, l:-3 -> l3[0][0]"
SINK_F(l3[0][1])


Expand Down Expand Up @@ -245,7 +245,7 @@ def gen(x):

def test_yield():
g = gen(SOURCE)
SINK(next(g)) #$ MISSING:flow="SOURCE, l:-1 -> next()"
SINK(next(g)) #$ flow="SOURCE, l:-1 -> next(..)"


def gen_from(x):
Expand All @@ -260,7 +260,7 @@ def test_yield_from():
# a statement rather than an expression, but related to generators
def test_for():
for x in gen(SOURCE):
SINK(x) #$ MISSING:flow="SOURCE, l:-1 -> x"
SINK(x) #$ flow="SOURCE, l:-1 -> x"


# 6.2.9.1. Generator-iterator methods
Expand Down
Loading

0 comments on commit 6bb98b0

Please sign in to comment.