Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

python: capture flow through comprehensions #17577

Merged
merged 21 commits into from
Oct 4, 2024
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions python/ql/lib/change-notes/2024-10-01-comprehension-flow.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
```yaml
---
category: minorAnalysis
---
* More precise modelling of the dataflow through comprehensions. In particular, captured variables are now handled correctly.
* Dataflow out of yield is added, allowing proper tracing through generators.
```
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,10 @@ newtype TArgumentPosition =
// position, we need to ensure we make these available (these are specified as
// parameters in the flow-summary spec)
FlowSummaryImpl::ParsePositions::isParsedPositionalParameterPosition(_, index)
or
// the generated function inside a comprehension has a positional argument at index 0
exists(Comp c) and
index = 0
} or
TKeywordArgumentPosition(string name) {
exists(any(CallNode c).getArgByName(name))
Expand Down Expand Up @@ -314,12 +318,11 @@ newtype TDataFlowCallable =
* class instantiations, and (in the future) special methods.
*/
TFunction(Function func) {
// For generators/list-comprehensions we create a synthetic function. In the
// points-to call-graph these were not considered callable, and instead we added
// data-flow steps (read/write) for these. As an easy solution for now, we do the
// same to keep things easy to reason about (and therefore exclude things that do
// not have a definition)
// Functions with an explicit definition
exists(func.getDefinition())
or
// For generators/list-comprehensions we create a synthetic function.
exists(Comp c | c.getFunction() = func)
RasmusWL marked this conversation as resolved.
Show resolved Hide resolved
} or
/** see QLDoc for `DataFlowModuleScope` for why we need this. */
TModule(Module m) or
Expand Down Expand Up @@ -1382,6 +1385,8 @@ private predicate sameEnclosingCallable(Node node1, Node node2) {
// =============================================================================
newtype TDataFlowCall =
TNormalCall(CallNode call, Function target, CallType type) { resolveCall(call, target, type) } or
/** A call to the generated function inside a comprhension */
yoff marked this conversation as resolved.
Show resolved Hide resolved
TComprehensionCall(Comp c) or
TPotentialLibraryCall(CallNode call) or
/** A synthesized call inside a summarized callable */
TSummaryCall(
Expand Down Expand Up @@ -1468,6 +1473,34 @@ class NormalCall extends ExtractedDataFlowCall, TNormalCall {
CallType getCallType() { result = type }
}

/** A call to the generated function inside a comprhension */
class ComprehensionCall extends ExtractedDataFlowCall, TComprehensionCall {
Comp c;
Function target;

ComprehensionCall() {
this = TComprehensionCall(c) and
target = c.getFunction()
}

Comp getComprehension() { result = c }

override string toString() { result = "comprehension call" }

override ControlFlowNode getNode() { result.getNode() = c }

override Scope getScope() { result = c.getScope() }

override DataFlowCallable getCallable() { result.(DataFlowFunction).getScope() = target }

override ArgumentNode getArgument(ArgumentPosition apos) {
result.asExpr() = c.getIterable() and
apos.isPositional(0)
}

override Location getLocation() { result = c.getLocation() }
}

/**
* A potential call to a summarized callable, a `LibraryCallable`.
*
Expand Down Expand Up @@ -1698,6 +1731,47 @@ class CapturedVariablesArgumentNode extends CfgNode, ArgumentNode {
}
}

/** A synthetic node representing the values of variables captured by a comprehension. */
class SynthCompCapturedVariablesArgumentNode extends Node, TSynthCompCapturedVariablesArgumentNode,
ArgumentNode
{
Comp comp;

SynthCompCapturedVariablesArgumentNode() { this = TSynthCompCapturedVariablesArgumentNode(comp) }

override string toString() { result = "Capturing closure argument (comp)" }

override Scope getScope() { result = comp.getScope() }

override Location getLocation() { result = comp.getLocation() }

Comp getComprehension() { result = comp }

override predicate argumentOf(DataFlowCall call, ArgumentPosition pos) {
call.(ComprehensionCall).getComprehension() = comp and
pos.isLambdaSelf()
}
}

/** A synthetic node representing the values of variables captured by a comprehension after the output has been computed. */
class SynthCompCapturedVariablesArgumentPostUpdateNode extends PostUpdateNodeImpl,
TSynthCompCapturedVariablesArgumentPostUpdateNode
{
Comp comp;

SynthCompCapturedVariablesArgumentPostUpdateNode() {
this = TSynthCompCapturedVariablesArgumentPostUpdateNode(comp)
}

override string toString() { result = "[post] Capturing closure argument (comp)" }

override Scope getScope() { result = comp.getScope() }

override Location getLocation() { result = comp.getLocation() }

override Node getPreUpdateNode() { result = TSynthCompCapturedVariablesArgumentNode(comp) }
}

/** Gets a viable run-time target for the call `call`. */
DataFlowCallable viableCallable(DataFlowCall call) {
call instanceof ExtractedDataFlowCall and
Expand Down Expand Up @@ -1737,7 +1811,10 @@ abstract class ReturnNode extends Node {
/** A data flow node that represents a value returned by a callable. */
class ExtractedReturnNode extends ReturnNode, CfgNode {
// See `TaintTrackingImplementation::returnFlowStep`
ExtractedReturnNode() { node = any(Return ret).getValue().getAFlowNode() }
ExtractedReturnNode() {
node = any(Return ret).getValue().getAFlowNode() or
node = any(Yield yield).getAFlowNode()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm surprised that we could just add yields as a ExtractedReturnNode... Did you consider using different ReturnKind to model return/yield statements? (my impression was we would need to do that)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

With the new yield store step, where the yielded element is stored to the yield expression, it seems fine to simply use the yield as a normal return. We are returning a new thing and we can control its content. I have seen C# use a different return kind for variables that already exist but are written to.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I understood that as: since I got yield working without messing with the ReturnKinds, I didn't really investigate that path much.

I guess that's fine 👍 I'm still a little curious how the approach would have worked out 🤔 (but it's not super important so 🤷)

}

override ReturnKind getKind() { any() }
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,47 @@ private predicate synthDictSplatArgumentNodeStoreStep(
)
}

/**
* Holds if `nodeFrom` is the value yielded by the `yield` found at `nodeTo`.
*
* For example, in
* ```python
* for x in l:
* yield x.name
* ```
* data from `x.name` is stored into the `yield` (and can subsequently be read out of the iterable).
*/
predicate yieldStoreStep(Node nodeFrom, Content c, Node nodeTo) {
exists(Yield yield, Function func |
nodeTo.asCfgNode() = yield.getAFlowNode() and
nodeFrom.asCfgNode() = yield.getValue().getAFlowNode() and
func.containsInScope(yield)
|
exists(Comp comp | func = comp.getFunction() |
(
comp instanceof ListComp or
comp instanceof GeneratorExp
) and
c instanceof ListElementContent
or
comp instanceof SetComp and
c instanceof SetElementContent
or
comp instanceof DictComp and
c instanceof DictionaryElementContent
)
or
not exists(Comp comp | func = comp.getFunction()) and
(
c instanceof ListElementContent
or
c instanceof SetElementContent
or
c instanceof DictionaryElementContent
)
)
}

/**
* Ensures that the a `**kwargs` parameter will not contain elements with names of
* keyword parameters.
Expand Down Expand Up @@ -256,6 +297,12 @@ abstract class PostUpdateNodeImpl extends Node {
abstract Node getPreUpdateNode();
}

/**
* A post-update node synthesised for an existing control flow node.
* Add to `TSyntheticPostUpdateNode` to get the synthetic post-update node synthesised.
*
* Synthetic post-update nodes for synthetic nodes need to be listed one by one.
*/
class SyntheticPostUpdateNode extends PostUpdateNodeImpl, TSyntheticPostUpdateNode {
ControlFlowNode node;

Expand All @@ -270,6 +317,11 @@ class SyntheticPostUpdateNode extends PostUpdateNodeImpl, TSyntheticPostUpdateNo
override Location getLocation() { result = node.getLocation() }
}

/**
* An existsing control flow node being the post-update node of a synthetic pre-update node.
*
* Synthetic post-update nodes for synthetic nodes need to be listed one by one.
*/
class NonSyntheticPostUpdateNode extends PostUpdateNodeImpl, CfgNode {
SyntheticPreUpdateNode pre;

Expand Down Expand Up @@ -668,8 +720,6 @@ predicate storeStep(Node nodeFrom, ContentSet c, Node nodeTo) {
or
setStoreStep(nodeFrom, c, nodeTo)
or
comprehensionStoreStep(nodeFrom, c, nodeTo)
or
attributeStoreStep(nodeFrom, c, nodeTo)
or
matchStoreStep(nodeFrom, c, nodeTo)
Expand All @@ -683,6 +733,8 @@ predicate storeStep(Node nodeFrom, ContentSet c, Node nodeTo) {
or
synthDictSplatArgumentNodeStoreStep(nodeFrom, c, nodeTo)
or
yieldStoreStep(nodeFrom, c, nodeTo)
or
VariableCapture::storeStep(nodeFrom, c, nodeTo)
}

Expand Down Expand Up @@ -843,31 +895,6 @@ predicate dictClearStep(Node node, DictionaryElementContent c) {
)
}

/** Data flows from an element expression in a comprehension to the comprehension. */
predicate comprehensionStoreStep(CfgNode nodeFrom, Content c, CfgNode nodeTo) {
// Comprehension
// `[x+1 for x in l]`
// nodeFrom is `x+1`, cfg node
// nodeTo is `[x+1 for x in l]`, cfg node
// c denotes list or set or dictionary without index
//
// List
nodeTo.getNode().getNode().(ListComp).getElt() = nodeFrom.getNode().getNode() and
c instanceof ListElementContent
or
// Set
nodeTo.getNode().getNode().(SetComp).getElt() = nodeFrom.getNode().getNode() and
c instanceof SetElementContent
or
// Dictionary
nodeTo.getNode().getNode().(DictComp).getElt() = nodeFrom.getNode().getNode() and
c instanceof DictionaryElementAnyContent
or
// Generator
nodeTo.getNode().getNode().(GeneratorExp).getElt() = nodeFrom.getNode().getNode() and
c instanceof ListElementContent
}

/**
* Holds if `nodeFrom` flows into the attribute `c` of `nodeTo` via an attribute assignment.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,9 @@ newtype TNode =
def.getDefiningNode() = node and
def.getParameter() = func.getArg(0)
)
or
// the iterable argument to the implicit comprehension function
node.getNode() = any(Comp c).getIterable()
} or
/** A node representing a global (module-level) variable in a specific module. */
TModuleVariableNode(Module m, GlobalVariable v) {
Expand Down Expand Up @@ -124,9 +127,16 @@ newtype TNode =
/** A synthetic node representing the heap of a function. Used for variable capture. */
TSynthCapturedVariablesParameterNode(Function f) {
f = any(VariableCapture::CapturedVariable v).getACapturingScope() and
// TODO: Remove this restriction when adding proper support for captured variables in the body of the function we generate for comprehensions
exists(TFunction(f))
} or
/** A synthetic node representing the values of variables captured by a comprehension. */
TSynthCompCapturedVariablesArgumentNode(Comp comp) {
comp.getFunction() = any(VariableCapture::CapturedVariable v).getACapturingScope()
} or
/** A synthetic node representing the values of variables captured by a comprehension after the output has been computed. */
TSynthCompCapturedVariablesArgumentPostUpdateNode(Comp comp) {
comp.getFunction() = any(VariableCapture::CapturedVariable v).getACapturingScope()
} or
/** An empty, unused node type that exists to prevent unwanted dependencies on data flow nodes. */
TForbiddenRecursionGuard() {
none() and
Expand Down Expand Up @@ -350,6 +360,9 @@ class ExtractedArgumentNode extends ArgumentNode {
or
// and self arguments
this.asCfgNode() = any(CallNode c).getFunction().(AttrNode).getObject()
or
// for comprehensions, we allow the synthetic `iterable` argument
this.asExpr() = any(Comp c).getIterable()
}

final override predicate argumentOf(DataFlowCall call, ArgumentPosition pos) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@ class ForTarget extends ControlFlowNode {
)
or
exists(Comp comp |
source = comp.getIterable() and
source = comp.getFunction().getArg(0) and
this.getNode() = comp.getNthInnerLoop(0).getTarget()
)
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@ predicate containerStep(DataFlow::Node nodeFrom, DataFlow::Node nodeTo) {
// TODO: once we have proper flow-summary modeling, we might not need this step any
// longer -- but there needs to be a matching read-step for the store-step, and we
// don't provide that right now.
DataFlowPrivate::comprehensionStoreStep(nodeFrom, _, nodeTo)
DataFlowPrivate::yieldStoreStep(nodeFrom, _, nodeTo)
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,11 @@ module Flow = Shared::Flow<Location, CaptureInput>;
private Flow::ClosureNode asClosureNode(Node n) {
result = n.(SynthCaptureNode).getSynthesizedCaptureNode()
or
exists(Comp comp | n = TSynthCompCapturedVariablesArgumentNode(comp) |
result.(Flow::ExprNode).getExpr().getNode() = comp
)
or
// TODO: Should the `Comp`s above be excluded here?
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

did you resolve this TODO?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I did not. My intention was to leave it for later if we did not observe weird flow in the tests or bad performance.

result.(Flow::ExprNode).getExpr() = n.(CfgNode).getNode()
or
result.(Flow::VariableWriteSourceNode).getVariableWrite() = n.(CfgNode).getNode()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,9 @@ edges
| TarSlipImprov.py:142:9:142:13 | ControlFlowNode for entry | TarSlipImprov.py:143:36:143:40 | ControlFlowNode for entry | provenance | |
| TarSlipImprov.py:151:14:151:50 | ControlFlowNode for closing() | TarSlipImprov.py:151:55:151:56 | ControlFlowNode for tf | provenance | |
| TarSlipImprov.py:151:22:151:49 | ControlFlowNode for Attribute() | TarSlipImprov.py:151:14:151:50 | ControlFlowNode for closing() | provenance | Config |
| TarSlipImprov.py:151:55:151:56 | ControlFlowNode for tf | TarSlipImprov.py:152:13:152:20 | ControlFlowNode for Yield | provenance | |
| TarSlipImprov.py:151:55:151:56 | ControlFlowNode for tf | TarSlipImprov.py:152:19:152:20 | ControlFlowNode for tf | provenance | |
| TarSlipImprov.py:152:13:152:20 | ControlFlowNode for Yield | TarSlipImprov.py:157:18:157:40 | ControlFlowNode for py2_tarxz() | provenance | |
| TarSlipImprov.py:152:19:152:20 | ControlFlowNode for tf | TarSlipImprov.py:157:18:157:40 | ControlFlowNode for py2_tarxz() | provenance | |
| TarSlipImprov.py:157:9:157:14 | ControlFlowNode for tar_cm | TarSlipImprov.py:162:20:162:23 | ControlFlowNode for tarc | provenance | |
| TarSlipImprov.py:157:18:157:40 | ControlFlowNode for py2_tarxz() | TarSlipImprov.py:157:9:157:14 | ControlFlowNode for tar_cm | provenance | |
Expand Down Expand Up @@ -131,6 +133,7 @@ nodes
| TarSlipImprov.py:151:14:151:50 | ControlFlowNode for closing() | semmle.label | ControlFlowNode for closing() |
| TarSlipImprov.py:151:22:151:49 | ControlFlowNode for Attribute() | semmle.label | ControlFlowNode for Attribute() |
| TarSlipImprov.py:151:55:151:56 | ControlFlowNode for tf | semmle.label | ControlFlowNode for tf |
| TarSlipImprov.py:152:13:152:20 | ControlFlowNode for Yield | semmle.label | ControlFlowNode for Yield |
| TarSlipImprov.py:152:19:152:20 | ControlFlowNode for tf | semmle.label | ControlFlowNode for tf |
| TarSlipImprov.py:157:9:157:14 | ControlFlowNode for tar_cm | semmle.label | ControlFlowNode for tar_cm |
| TarSlipImprov.py:157:18:157:40 | ControlFlowNode for py2_tarxz() | semmle.label | ControlFlowNode for py2_tarxz() |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,4 @@
| test.py:208:1:208:53 | Entry definition for SsaSourceVariable SINK | test.py:210:5:210:8 | ControlFlowNode for SINK |
| test.py:208:1:208:53 | Entry definition for SsaSourceVariable SOURCE | test.py:209:25:209:30 | ControlFlowNode for SOURCE |
| test.py:209:5:209:5 | ControlFlowNode for x | test.py:210:10:210:10 | ControlFlowNode for x |
| test.py:209:9:209:68 | ControlFlowNode for .0 | test.py:209:9:209:68 | ControlFlowNode for .0 |
| test.py:209:9:209:68 | ControlFlowNode for ListComp | test.py:209:5:209:5 | ControlFlowNode for x |
| test.py:209:16:209:16 | ControlFlowNode for v | test.py:209:45:209:45 | ControlFlowNode for v |
| test.py:209:40:209:40 | ControlFlowNode for u | test.py:209:56:209:56 | ControlFlowNode for u |
| test.py:209:51:209:51 | ControlFlowNode for z | test.py:209:67:209:67 | ControlFlowNode for z |
| test.py:209:62:209:62 | ControlFlowNode for y | test.py:209:10:209:10 | ControlFlowNode for y |
6 changes: 3 additions & 3 deletions python/ql/test/library-tests/dataflow/coverage/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ def test_list_comprehension_with_tuple_result():
s = SOURCE
ns = NONSOURCE
l3 = [(s, ns) for _ in [1]]
SINK(l3[0][0]) # $ MISSING: flow="SOURCE, l:-3 -> l3[0][0]"
SINK(l3[0][0]) # $ flow="SOURCE, l:-3 -> l3[0][0]"
SINK_F(l3[0][1])


Expand Down Expand Up @@ -245,7 +245,7 @@ def gen(x):

def test_yield():
g = gen(SOURCE)
SINK(next(g)) #$ MISSING:flow="SOURCE, l:-1 -> next()"
SINK(next(g)) #$ flow="SOURCE, l:-1 -> next(..)"


def gen_from(x):
Expand All @@ -260,7 +260,7 @@ def test_yield_from():
# a statement rather than an expression, but related to generators
def test_for():
for x in gen(SOURCE):
SINK(x) #$ MISSING:flow="SOURCE, l:-1 -> x"
SINK(x) #$ flow="SOURCE, l:-1 -> x"


# 6.2.9.1. Generator-iterator methods
Expand Down
Loading
Loading