From b53e0fd5d97df328c53495f6555eb52bf66f9369 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 15 Jan 2025 09:32:46 +0100 Subject: [PATCH 01/94] Let append tree return None --- src/databricks/labs/ucx/source_code/python/python_ast.py | 6 +----- tests/unit/source_code/python/test_python_ast.py | 6 +++--- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/python/python_ast.py b/src/databricks/labs/ucx/source_code/python/python_ast.py index 1bfcd85b48..57d90beaea 100644 --- a/src/databricks/labs/ucx/source_code/python/python_ast.py +++ b/src/databricks/labs/ucx/source_code/python/python_ast.py @@ -211,16 +211,12 @@ def __repr__(self): code = code[0:truncate_after] + "..." return f"" - def append_tree(self, tree: Tree) -> Tree: - """returns the appended tree, not the consolidated one!""" + def append_tree(self, tree: Tree) -> None: if not isinstance(tree.node, Module): raise NotImplementedError(f"Can't append tree from {type(tree.node).__name__}") tree_module: Module = cast(Module, tree.node) self.append_nodes(tree_module.body) self.append_globals(tree_module.globals) - # the following may seem strange but it's actually ok to use the original module as tree root - # because each node points to the correct parent (practically, the tree is now only a list of statements) - return tree def append_globals(self, globs: dict[str, list[Expr]]) -> None: if not isinstance(self.node, Module): diff --git a/tests/unit/source_code/python/test_python_ast.py b/tests/unit/source_code/python/test_python_ast.py index e65abe4b8e..53a7fb77df 100644 --- a/tests/unit/source_code/python/test_python_ast.py +++ b/tests/unit/source_code/python/test_python_ast.py @@ -148,9 +148,9 @@ def test_appends_statements() -> None: maybe_tree_2 = Tree.maybe_normalized_parse(source_2) assert maybe_tree_2.tree is not None, maybe_tree_2.failure tree_2 = maybe_tree_2.tree - tree_3 = tree_1.append_tree(tree_2) - nodes = tree_3.locate(Assign, []) - tree = Tree(nodes[0].value) # tree_3 only contains tree_2 statements + tree_1.append_tree(tree_2) + nodes = tree_2.locate(Assign, []) + tree = Tree(nodes[0].value) values = list(InferredValue.infer_from_node(tree.node)) strings = list(value.as_string() for value in values) assert strings == ["Hello John!"] From 108dd7b233c287533650c9f2417828eda191671f Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 15 Jan 2025 09:43:02 +0100 Subject: [PATCH 02/94] Test bidirectionality of appended trees --- tests/unit/source_code/python/test_python_ast.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tests/unit/source_code/python/test_python_ast.py b/tests/unit/source_code/python/test_python_ast.py index 53a7fb77df..06217176ce 100644 --- a/tests/unit/source_code/python/test_python_ast.py +++ b/tests/unit/source_code/python/test_python_ast.py @@ -149,8 +149,15 @@ def test_appends_statements() -> None: assert maybe_tree_2.tree is not None, maybe_tree_2.failure tree_2 = maybe_tree_2.tree tree_1.append_tree(tree_2) + + nodes = tree_1.locate(Assign, []) + tree = Tree(nodes[1].value) # Starting from tree_1, we want the last assign + values = list(InferredValue.infer_from_node(tree.node)) + strings = list(value.as_string() for value in values) + assert strings == ["Hello John!"] + nodes = tree_2.locate(Assign, []) - tree = Tree(nodes[0].value) + tree = Tree(nodes[0].value) # Starting from tree_2, we want the first assign values = list(InferredValue.infer_from_node(tree.node)) strings = list(value.as_string() for value in values) assert strings == ["Hello John!"] From 8a22d904e39cdabe5a3b802c473bf82b329e974b Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 15 Jan 2025 09:48:44 +0100 Subject: [PATCH 03/94] Rename append_tree to attach tree --- src/databricks/labs/ucx/source_code/graph.py | 2 +- src/databricks/labs/ucx/source_code/jobs.py | 2 +- .../labs/ucx/source_code/python/python_ast.py | 15 +++++++++++---- tests/unit/source_code/python/test_python_ast.py | 8 ++++---- 4 files changed, 17 insertions(+), 10 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/graph.py b/src/databricks/labs/ucx/source_code/graph.py index 070b09c828..1df993cfa7 100644 --- a/src/databricks/labs/ucx/source_code/graph.py +++ b/src/databricks/labs/ucx/source_code/graph.py @@ -559,7 +559,7 @@ def append(self, context: InheritedContext, copy_found: bool) -> InheritedContex if tree is None: return InheritedContext(self.tree, found, context.problems) new_tree = self.tree or Tree.new_module() - new_tree.append_tree(tree) + new_tree.attach_child_tree(tree) new_problems = itertools.chain(self.problems, context.problems) return InheritedContext(new_tree, found, new_problems) diff --git a/src/databricks/labs/ucx/source_code/jobs.py b/src/databricks/labs/ucx/source_code/jobs.py index 9db98e5671..bd3857fb20 100644 --- a/src/databricks/labs/ucx/source_code/jobs.py +++ b/src/databricks/labs/ucx/source_code/jobs.py @@ -662,7 +662,7 @@ def _collect_from_notebook( logger.warning(maybe_tree.failure.message) continue assert maybe_tree.tree is not None - inherited_tree.append_tree(maybe_tree.tree) + inherited_tree.attach_child_tree(maybe_tree.tree) def _collect_from_source( self, diff --git a/src/databricks/labs/ucx/source_code/python/python_ast.py b/src/databricks/labs/ucx/source_code/python/python_ast.py index 57d90beaea..06a46887e0 100644 --- a/src/databricks/labs/ucx/source_code/python/python_ast.py +++ b/src/databricks/labs/ucx/source_code/python/python_ast.py @@ -211,9 +211,16 @@ def __repr__(self): code = code[0:truncate_after] + "..." return f"" - def append_tree(self, tree: Tree) -> None: + def attach_child_tree(self, tree: Tree) -> None: + """Attach a child tree. + + Attaching a child tree is a **stateful** operation for both the parent and child tree. After attaching a child + tree, a tree can be traversed starting from the parent or child tree. From both starting points all nodes in + both trees can be reached, though, the order of nodes will be different as that is relative to the starting + point. + """ if not isinstance(tree.node, Module): - raise NotImplementedError(f"Can't append tree from {type(tree.node).__name__}") + raise NotImplementedError(f"Cannot attach child tree: {type(tree.node).__name__}") tree_module: Module = cast(Module, tree.node) self.append_nodes(tree_module.body) self.append_globals(tree_module.globals) @@ -689,7 +696,7 @@ def _parse_and_append(self, code: str) -> MaybeTree: return maybe_tree def append_tree(self, tree: Tree) -> None: - self._make_tree().append_tree(tree) + self._make_tree().attach_child_tree(tree) def append_nodes(self, nodes: list[NodeNG]) -> None: self._make_tree().append_nodes(nodes) @@ -705,7 +712,7 @@ def process_child_cell(self, code: str) -> None: logger.warning(maybe_tree.failure.message) return assert maybe_tree.tree is not None - this_tree.append_tree(maybe_tree.tree) + this_tree.attach_child_tree(maybe_tree.tree) def collect_dfsas(self, source_code: str) -> Iterable[DirectFsAccess]: maybe_tree = self._parse_and_append(source_code) diff --git a/tests/unit/source_code/python/test_python_ast.py b/tests/unit/source_code/python/test_python_ast.py index 06217176ce..ad18da7486 100644 --- a/tests/unit/source_code/python/test_python_ast.py +++ b/tests/unit/source_code/python/test_python_ast.py @@ -148,7 +148,7 @@ def test_appends_statements() -> None: maybe_tree_2 = Tree.maybe_normalized_parse(source_2) assert maybe_tree_2.tree is not None, maybe_tree_2.failure tree_2 = maybe_tree_2.tree - tree_1.append_tree(tree_2) + tree_1.attach_child_tree(tree_2) nodes = tree_1.locate(Assign, []) tree = Tree(nodes[1].value) # Starting from tree_1, we want the last assign @@ -217,11 +217,11 @@ def test_supports_recursive_refs_when_checking_module() -> None: maybe_tree_2 = Tree.maybe_normalized_parse(source_2) assert maybe_tree_2.tree is not None, maybe_tree_2.failure tree_2 = maybe_tree_2.tree - main_tree.append_tree(tree_2) + main_tree.attach_child_tree(tree_2) maybe_tree_3 = Tree.maybe_normalized_parse(source_3) assert maybe_tree_3.tree is not None, maybe_tree_3.failure tree_3 = maybe_tree_3.tree - main_tree.append_tree(tree_3) + main_tree.attach_child_tree(tree_3) assign = tree_3.locate(Assign, [])[0] assert Tree(assign.value).is_from_module("spark") @@ -320,7 +320,7 @@ def test_repr_is_truncated() -> None: def test_append_tree_fails() -> None: with pytest.raises(NotImplementedError): - Tree(Const("xyz")).append_tree(Tree(Const("xyz"))) + Tree(Const("xyz")).attach_child_tree(Tree(Const("xyz"))) def test_append_node_fails() -> None: From 6c0ebcf2e69e79c732d362548763f9f35eeb4009 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 15 Jan 2025 09:53:27 +0100 Subject: [PATCH 04/94] Clean test for attaching child tree --- .../source_code/python/test_python_ast.py | 43 +++++++++---------- 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/tests/unit/source_code/python/test_python_ast.py b/tests/unit/source_code/python/test_python_ast.py index ad18da7486..4b1bcb2fc9 100644 --- a/tests/unit/source_code/python/test_python_ast.py +++ b/tests/unit/source_code/python/test_python_ast.py @@ -139,28 +139,27 @@ def test_ignores_magic_marker_in_multiline_comment() -> None: assert True -def test_appends_statements() -> None: - source_1 = "a = 'John'" - maybe_tree_1 = Tree.maybe_normalized_parse(source_1) - assert maybe_tree_1.tree is not None, maybe_tree_1.failure - tree_1 = maybe_tree_1.tree - source_2 = 'b = f"Hello {a}!"' - maybe_tree_2 = Tree.maybe_normalized_parse(source_2) - assert maybe_tree_2.tree is not None, maybe_tree_2.failure - tree_2 = maybe_tree_2.tree - tree_1.attach_child_tree(tree_2) - - nodes = tree_1.locate(Assign, []) - tree = Tree(nodes[1].value) # Starting from tree_1, we want the last assign - values = list(InferredValue.infer_from_node(tree.node)) - strings = list(value.as_string() for value in values) - assert strings == ["Hello John!"] - - nodes = tree_2.locate(Assign, []) - tree = Tree(nodes[0].value) # Starting from tree_2, we want the first assign - values = list(InferredValue.infer_from_node(tree.node)) - strings = list(value.as_string() for value in values) - assert strings == ["Hello John!"] +def test_tree_attach_child_tree_infers_value() -> None: + """Attaching trees allows traversing from both parent and child.""" + inferred_string = "Hello John!" + parent_source, child_source = "a = 'John'", 'b = f"Hello {a}!"' + parent_maybe_tree = Tree.maybe_normalized_parse(parent_source) + child_maybe_tree = Tree.maybe_normalized_parse(child_source) + + assert parent_maybe_tree.tree is not None, parent_maybe_tree.failure + assert child_maybe_tree.tree is not None, child_maybe_tree.failure + + parent_maybe_tree.tree.attach_child_tree(child_maybe_tree.tree) + + nodes = parent_maybe_tree.tree.locate(Assign, []) + tree = Tree(nodes[1].value) # Starting from the parent, we are looking for the last assign + strings = [value.as_string() for value in InferredValue.infer_from_node(tree.node)] + assert strings == [inferred_string] + + nodes = child_maybe_tree.tree.locate(Assign, []) + tree = Tree(nodes[0].value) # Starting from child, we are looking for the first assign + strings = [value.as_string() for value in InferredValue.infer_from_node(tree.node)] + assert strings == [inferred_string] def test_is_from_module() -> None: From fd2c76141c6ee27beef022e9d96f3e5d6b84bc45 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 15 Jan 2025 10:23:23 +0100 Subject: [PATCH 05/94] Rewrite test for module propagation --- .../source_code/python/test_python_ast.py | 42 +++++++++---------- 1 file changed, 20 insertions(+), 22 deletions(-) diff --git a/tests/unit/source_code/python/test_python_ast.py b/tests/unit/source_code/python/test_python_ast.py index 4b1bcb2fc9..50a12006ca 100644 --- a/tests/unit/source_code/python/test_python_ast.py +++ b/tests/unit/source_code/python/test_python_ast.py @@ -200,28 +200,26 @@ def test_is_instance_of(source, name, class_name) -> None: assert Tree(var[0]).is_instance_of(class_name) -def test_supports_recursive_refs_when_checking_module() -> None: - source_1 = """ - df = spark.read.csv("hi") - """ - source_2 = """ - df = df.withColumn(stuff) - """ - source_3 = """ - df = df.withColumn(stuff2) - """ - maybe_tree = Tree.maybe_normalized_parse(source_1) - assert maybe_tree.tree is not None, maybe_tree.failure - main_tree = maybe_tree.tree - maybe_tree_2 = Tree.maybe_normalized_parse(source_2) - assert maybe_tree_2.tree is not None, maybe_tree_2.failure - tree_2 = maybe_tree_2.tree - main_tree.attach_child_tree(tree_2) - maybe_tree_3 = Tree.maybe_normalized_parse(source_3) - assert maybe_tree_3.tree is not None, maybe_tree_3.failure - tree_3 = maybe_tree_3.tree - main_tree.attach_child_tree(tree_3) - assign = tree_3.locate(Assign, [])[0] +def test_tree_attach_child_tree_propagates_module_reference() -> None: + """The spark module should propagate from the parent tree.""" + source = """ +df = spark.read.csv("hi") +df = df.withColumn(stuff) +df = df.withColumn(stuff2) +""".lstrip() # Remove first new line character + sources = source.split("\n") + first_line_maybe_tree = Tree.maybe_normalized_parse(sources[0]) + second_line_maybe_tree = Tree.maybe_normalized_parse(sources[1]) + third_line_maybe_tree = Tree.maybe_normalized_parse(sources[2]) + + assert first_line_maybe_tree.tree, first_line_maybe_tree.failure + assert second_line_maybe_tree.tree, second_line_maybe_tree.failure + assert third_line_maybe_tree.tree, third_line_maybe_tree.failure + + first_line_maybe_tree.tree.attach_child_tree(second_line_maybe_tree.tree) + first_line_maybe_tree.tree.attach_child_tree(third_line_maybe_tree.tree) + + assign = third_line_maybe_tree.tree.locate(Assign, [])[0] assert Tree(assign.value).is_from_module("spark") From 2c46a01420e8fcb216fecc61325125ce8b0c99bc Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 15 Jan 2025 10:25:28 +0100 Subject: [PATCH 06/94] Rewrite test for not implemented error --- tests/unit/source_code/python/test_python_ast.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/source_code/python/test_python_ast.py b/tests/unit/source_code/python/test_python_ast.py index 50a12006ca..794a2d04e9 100644 --- a/tests/unit/source_code/python/test_python_ast.py +++ b/tests/unit/source_code/python/test_python_ast.py @@ -315,8 +315,8 @@ def test_repr_is_truncated() -> None: assert len(repr(Tree(Const("xyz")))) <= (32 + len("...") + len("")) -def test_append_tree_fails() -> None: - with pytest.raises(NotImplementedError): +def test_tree_attach_child_tree_raises_not_implemented_error_for_constant_node() -> None: + with pytest.raises(NotImplementedError, match="Cannot attach child tree: .*"): Tree(Const("xyz")).attach_child_tree(Tree(Const("xyz"))) From ca30dcf071903c2e4b765c518d20b7977439bc54 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 15 Jan 2025 10:51:21 +0100 Subject: [PATCH 07/94] Test append globals --- .../unit/source_code/python/test_python_ast.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/tests/unit/source_code/python/test_python_ast.py b/tests/unit/source_code/python/test_python_ast.py index 794a2d04e9..f9ee568beb 100644 --- a/tests/unit/source_code/python/test_python_ast.py +++ b/tests/unit/source_code/python/test_python_ast.py @@ -1,5 +1,6 @@ import pytest -from astroid import Assign, Attribute, Call, Const, Expr, Module, Name # type: ignore +import astroid # type: ignore +from astroid import Assign, AssignName, Attribute, Call, Const, Expr, Module, Name # type: ignore from databricks.labs.ucx.source_code.python.python_ast import Tree, TreeHelper from databricks.labs.ucx.source_code.python.python_infer import InferredValue @@ -306,6 +307,20 @@ def test_is_builtin(source, name, is_builtin) -> None: assert False # could not locate call +def test_tree_append_globals_adds_assign_name_to_tree() -> None: + maybe_tree = Tree.maybe_normalized_parse("a = 1") + assert maybe_tree.tree, maybe_tree.failure + + node = astroid.extract_node("b = a + 2") + assign_name = next(node.get_children()) + assert isinstance(assign_name, AssignName) + + maybe_tree.tree.append_globals({"b": [assign_name]}) + + assert isinstance(maybe_tree.tree.node, Module) + assert maybe_tree.tree.node.globals.get("b") == [assign_name] + + def test_first_statement_is_none() -> None: node = Const("xyz") assert not Tree(node).first_statement() From 1d220848ea2bf424158198c5ff73eaae2102bcfa Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 15 Jan 2025 10:52:48 +0100 Subject: [PATCH 08/94] Narrow not implemented test --- src/databricks/labs/ucx/source_code/python/python_ast.py | 2 +- tests/unit/source_code/python/test_python_ast.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/python/python_ast.py b/src/databricks/labs/ucx/source_code/python/python_ast.py index 06a46887e0..3525d737e0 100644 --- a/src/databricks/labs/ucx/source_code/python/python_ast.py +++ b/src/databricks/labs/ucx/source_code/python/python_ast.py @@ -227,7 +227,7 @@ def attach_child_tree(self, tree: Tree) -> None: def append_globals(self, globs: dict[str, list[Expr]]) -> None: if not isinstance(self.node, Module): - raise NotImplementedError(f"Can't append globals to {type(self.node).__name__}") + raise NotImplementedError(f"Cannot append globals: {type(self.node).__name__}") self_module: Module = cast(Module, self.node) for name, values in globs.items(): statements: list[Expr] = self_module.globals.get(name, None) diff --git a/tests/unit/source_code/python/test_python_ast.py b/tests/unit/source_code/python/test_python_ast.py index f9ee568beb..362df7766a 100644 --- a/tests/unit/source_code/python/test_python_ast.py +++ b/tests/unit/source_code/python/test_python_ast.py @@ -349,8 +349,8 @@ def test_has_global_fails() -> None: assert not Tree.new_module().has_global("xyz") -def test_append_globals_fails() -> None: - with pytest.raises(NotImplementedError): +def test_append_globals_raises_not_implemented_error_for_constant_node() -> None: + with pytest.raises(NotImplementedError, match="Cannot append globals: .*"): Tree(Const("xyz")).append_globals({}) From 48ddf4eb78d2e2e0cf06e5b0d8f162f730478d4b Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 15 Jan 2025 10:57:45 +0100 Subject: [PATCH 09/94] Test appending globals during attach tree --- tests/unit/source_code/python/test_python_ast.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tests/unit/source_code/python/test_python_ast.py b/tests/unit/source_code/python/test_python_ast.py index 362df7766a..03291e6cee 100644 --- a/tests/unit/source_code/python/test_python_ast.py +++ b/tests/unit/source_code/python/test_python_ast.py @@ -321,6 +321,19 @@ def test_tree_append_globals_adds_assign_name_to_tree() -> None: assert maybe_tree.tree.node.globals.get("b") == [assign_name] +def test_tree_attach_child_tree_appends_globals_to_parent_tree() -> None: + parent_tree = Tree.maybe_normalized_parse("a = 1") + child_tree = Tree.maybe_normalized_parse("b = a + 2") + + assert parent_tree.tree, parent_tree.failure + assert child_tree.tree, child_tree.failure + + parent_tree.tree.attach_child_tree(child_tree.tree) + + assert set(parent_tree.tree.node.globals.keys()) == {"a", "b"} + assert set(child_tree.tree.node.globals.keys()) == {"b"} + + def test_first_statement_is_none() -> None: node = Const("xyz") assert not Tree(node).first_statement() From 1211ddae96c98468a4408b2ee51bd55aede60fd7 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 15 Jan 2025 11:43:40 +0100 Subject: [PATCH 10/94] Refactor `append_globals` to `extend_globals` --- .../ucx/source_code/python/python_analyzer.py | 4 ++-- .../labs/ucx/source_code/python/python_ast.py | 21 +++++++++---------- .../source_code/python/test_python_ast.py | 10 ++++----- 3 files changed, 17 insertions(+), 18 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/python/python_analyzer.py b/src/databricks/labs/ucx/source_code/python/python_analyzer.py index 1b66f53faf..416d2d2cd7 100644 --- a/src/databricks/labs/ucx/source_code/python/python_analyzer.py +++ b/src/databricks/labs/ucx/source_code/python/python_analyzer.py @@ -75,7 +75,7 @@ def build_inherited_context(self, child_path: Path) -> InheritedContext: nodes = tree.nodes_between(last_line + 1, node_line - 1) context.tree.append_nodes(nodes) globs = tree.globals_between(last_line + 1, node_line - 1) - context.tree.append_globals(globs) + context.tree.extend_globals(globs) last_line = node_line # process node child_context = self._build_inherited_context_from_node(base_node, child_path) @@ -88,7 +88,7 @@ def build_inherited_context(self, child_path: Path) -> InheritedContext: nodes = tree.nodes_between(last_line + 1, line_count) context.tree.append_nodes(nodes) globs = tree.globals_between(last_line + 1, line_count) - context.tree.append_globals(globs) + context.tree.extend_globals(globs) return context def _parse_and_extract_nodes(self) -> tuple[Tree | None, list[NodeBase], Iterable[DependencyProblem]]: diff --git a/src/databricks/labs/ucx/source_code/python/python_ast.py b/src/databricks/labs/ucx/source_code/python/python_ast.py index 3525d737e0..37c84aa663 100644 --- a/src/databricks/labs/ucx/source_code/python/python_ast.py +++ b/src/databricks/labs/ucx/source_code/python/python_ast.py @@ -16,7 +16,6 @@ Call, ClassDef, Const, - Expr, Import, ImportFrom, Instance, @@ -223,18 +222,18 @@ def attach_child_tree(self, tree: Tree) -> None: raise NotImplementedError(f"Cannot attach child tree: {type(tree.node).__name__}") tree_module: Module = cast(Module, tree.node) self.append_nodes(tree_module.body) - self.append_globals(tree_module.globals) + self.extend_globals(tree_module.globals) - def append_globals(self, globs: dict[str, list[Expr]]) -> None: + def extend_globals(self, globs: dict[str, list[NodeNG]]) -> None: + """Extend globals by extending the global values for each global key. + + Extending globals is a stateful operation for this `Tree` (`self`), similarly to extending a list. + """ if not isinstance(self.node, Module): - raise NotImplementedError(f"Cannot append globals: {type(self.node).__name__}") + raise NotImplementedError(f"Cannot extend globals: {type(self.node).__name__}") self_module: Module = cast(Module, self.node) - for name, values in globs.items(): - statements: list[Expr] = self_module.globals.get(name, None) - if statements is None: - self_module.globals[name] = list(values) # clone the source list to avoid side-effects - continue - statements.extend(values) + for global_key, global_values in globs.items(): + self_module.globals[global_key] = self_module.globals.get(global_key, []) + global_values def append_nodes(self, nodes: list[NodeNG]) -> None: if not isinstance(self.node, Module): @@ -702,7 +701,7 @@ def append_nodes(self, nodes: list[NodeNG]) -> None: self._make_tree().append_nodes(nodes) def append_globals(self, globs: dict) -> None: - self._make_tree().append_globals(globs) + self._make_tree().extend_globals(globs) def process_child_cell(self, code: str) -> None: this_tree = self._make_tree() diff --git a/tests/unit/source_code/python/test_python_ast.py b/tests/unit/source_code/python/test_python_ast.py index 03291e6cee..9a67a1750a 100644 --- a/tests/unit/source_code/python/test_python_ast.py +++ b/tests/unit/source_code/python/test_python_ast.py @@ -307,7 +307,7 @@ def test_is_builtin(source, name, is_builtin) -> None: assert False # could not locate call -def test_tree_append_globals_adds_assign_name_to_tree() -> None: +def test_tree_extend_globals_adds_assign_name_to_tree() -> None: maybe_tree = Tree.maybe_normalized_parse("a = 1") assert maybe_tree.tree, maybe_tree.failure @@ -315,7 +315,7 @@ def test_tree_append_globals_adds_assign_name_to_tree() -> None: assign_name = next(node.get_children()) assert isinstance(assign_name, AssignName) - maybe_tree.tree.append_globals({"b": [assign_name]}) + maybe_tree.tree.extend_globals({"b": [assign_name]}) assert isinstance(maybe_tree.tree.node, Module) assert maybe_tree.tree.node.globals.get("b") == [assign_name] @@ -362,9 +362,9 @@ def test_has_global_fails() -> None: assert not Tree.new_module().has_global("xyz") -def test_append_globals_raises_not_implemented_error_for_constant_node() -> None: - with pytest.raises(NotImplementedError, match="Cannot append globals: .*"): - Tree(Const("xyz")).append_globals({}) +def test_extend_globals_raises_not_implemented_error_for_constant_node() -> None: + with pytest.raises(NotImplementedError, match="Cannot extend globals: .*"): + Tree(Const("xyz")).extend_globals({}) def test_globals_between_fails() -> None: From 043fc6b83305e51871be7c25881d141ad5c079f4 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 15 Jan 2025 11:48:20 +0100 Subject: [PATCH 11/94] Test appending nodes sets parent on node --- tests/unit/source_code/python/test_python_ast.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/unit/source_code/python/test_python_ast.py b/tests/unit/source_code/python/test_python_ast.py index 9a67a1750a..7c142d2682 100644 --- a/tests/unit/source_code/python/test_python_ast.py +++ b/tests/unit/source_code/python/test_python_ast.py @@ -307,6 +307,16 @@ def test_is_builtin(source, name, is_builtin) -> None: assert False # could not locate call +def test_tree_append_nodes_sets_parent() -> None: + node = astroid.extract_node("b = a + 2") + maybe_tree = Tree.maybe_normalized_parse("a = 1") + assert maybe_tree.tree, maybe_tree.failure + + maybe_tree.tree.append_nodes([node]) + + assert node.parent == maybe_tree.tree.node + + def test_tree_extend_globals_adds_assign_name_to_tree() -> None: maybe_tree = Tree.maybe_normalized_parse("a = 1") assert maybe_tree.tree, maybe_tree.failure From b0e39efd6857aa697ef738d99f853a0155b4e8e4 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 15 Jan 2025 11:49:27 +0100 Subject: [PATCH 12/94] Test appending nodes adds nodes to end of body --- tests/unit/source_code/python/test_python_ast.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/unit/source_code/python/test_python_ast.py b/tests/unit/source_code/python/test_python_ast.py index 7c142d2682..b3106eee3a 100644 --- a/tests/unit/source_code/python/test_python_ast.py +++ b/tests/unit/source_code/python/test_python_ast.py @@ -317,6 +317,16 @@ def test_tree_append_nodes_sets_parent() -> None: assert node.parent == maybe_tree.tree.node +def test_tree_append_nodes_adds_node_to_body() -> None: + node = astroid.extract_node("b = a + 2") + maybe_tree = Tree.maybe_normalized_parse("a = 1") + assert maybe_tree.tree, maybe_tree.failure + + maybe_tree.tree.append_nodes([node]) + + assert maybe_tree.tree.node.body[-1] == node + + def test_tree_extend_globals_adds_assign_name_to_tree() -> None: maybe_tree = Tree.maybe_normalized_parse("a = 1") assert maybe_tree.tree, maybe_tree.failure From 51358f21520b199bef0323f64bf2c43ee04bd3b1 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 15 Jan 2025 11:49:46 +0100 Subject: [PATCH 13/94] Move append_nodes method up --- .../labs/ucx/source_code/python/python_ast.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/python/python_ast.py b/src/databricks/labs/ucx/source_code/python/python_ast.py index 37c84aa663..70e8541924 100644 --- a/src/databricks/labs/ucx/source_code/python/python_ast.py +++ b/src/databricks/labs/ucx/source_code/python/python_ast.py @@ -224,6 +224,14 @@ def attach_child_tree(self, tree: Tree) -> None: self.append_nodes(tree_module.body) self.extend_globals(tree_module.globals) + def append_nodes(self, nodes: list[NodeNG]) -> None: + if not isinstance(self.node, Module): + raise NotImplementedError(f"Can't append statements to {type(self.node).__name__}") + self_module: Module = cast(Module, self.node) + for node in nodes: + node.parent = self_module + self_module.body.append(node) + def extend_globals(self, globs: dict[str, list[NodeNG]]) -> None: """Extend globals by extending the global values for each global key. @@ -235,14 +243,6 @@ def extend_globals(self, globs: dict[str, list[NodeNG]]) -> None: for global_key, global_values in globs.items(): self_module.globals[global_key] = self_module.globals.get(global_key, []) + global_values - def append_nodes(self, nodes: list[NodeNG]) -> None: - if not isinstance(self.node, Module): - raise NotImplementedError(f"Can't append statements to {type(self.node).__name__}") - self_module: Module = cast(Module, self.node) - for node in nodes: - node.parent = self_module - self_module.body.append(node) - def is_instance_of(self, class_name: str) -> bool: for inferred in self.node.inferred(): if inferred is Uninferable: From 460e88fd8e1548143ef89b5cc21490a63dd66f7b Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 15 Jan 2025 11:50:48 +0100 Subject: [PATCH 14/94] Rename append_nodes to attach_nodes --- .../labs/ucx/source_code/python/python_analyzer.py | 4 ++-- .../labs/ucx/source_code/python/python_ast.py | 6 +++--- tests/unit/source_code/python/test_python_ast.py | 12 ++++++------ 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/python/python_analyzer.py b/src/databricks/labs/ucx/source_code/python/python_analyzer.py index 416d2d2cd7..9fa8fb5118 100644 --- a/src/databricks/labs/ucx/source_code/python/python_analyzer.py +++ b/src/databricks/labs/ucx/source_code/python/python_analyzer.py @@ -73,7 +73,7 @@ def build_inherited_context(self, child_path: Path) -> InheritedContext: # append nodes node_line = base_node.node.lineno nodes = tree.nodes_between(last_line + 1, node_line - 1) - context.tree.append_nodes(nodes) + context.tree.attach_nodes(nodes) globs = tree.globals_between(last_line + 1, node_line - 1) context.tree.extend_globals(globs) last_line = node_line @@ -86,7 +86,7 @@ def build_inherited_context(self, child_path: Path) -> InheritedContext: assert context.tree is not None, "Tree should be initialized" if last_line < line_count: nodes = tree.nodes_between(last_line + 1, line_count) - context.tree.append_nodes(nodes) + context.tree.attach_nodes(nodes) globs = tree.globals_between(last_line + 1, line_count) context.tree.extend_globals(globs) return context diff --git a/src/databricks/labs/ucx/source_code/python/python_ast.py b/src/databricks/labs/ucx/source_code/python/python_ast.py index 70e8541924..414383203c 100644 --- a/src/databricks/labs/ucx/source_code/python/python_ast.py +++ b/src/databricks/labs/ucx/source_code/python/python_ast.py @@ -221,10 +221,10 @@ def attach_child_tree(self, tree: Tree) -> None: if not isinstance(tree.node, Module): raise NotImplementedError(f"Cannot attach child tree: {type(tree.node).__name__}") tree_module: Module = cast(Module, tree.node) - self.append_nodes(tree_module.body) + self.attach_nodes(tree_module.body) self.extend_globals(tree_module.globals) - def append_nodes(self, nodes: list[NodeNG]) -> None: + def attach_nodes(self, nodes: list[NodeNG]) -> None: if not isinstance(self.node, Module): raise NotImplementedError(f"Can't append statements to {type(self.node).__name__}") self_module: Module = cast(Module, self.node) @@ -698,7 +698,7 @@ def append_tree(self, tree: Tree) -> None: self._make_tree().attach_child_tree(tree) def append_nodes(self, nodes: list[NodeNG]) -> None: - self._make_tree().append_nodes(nodes) + self._make_tree().attach_nodes(nodes) def append_globals(self, globs: dict) -> None: self._make_tree().extend_globals(globs) diff --git a/tests/unit/source_code/python/test_python_ast.py b/tests/unit/source_code/python/test_python_ast.py index b3106eee3a..f071adab29 100644 --- a/tests/unit/source_code/python/test_python_ast.py +++ b/tests/unit/source_code/python/test_python_ast.py @@ -307,22 +307,22 @@ def test_is_builtin(source, name, is_builtin) -> None: assert False # could not locate call -def test_tree_append_nodes_sets_parent() -> None: +def test_tree_attach_nodes_sets_parent() -> None: node = astroid.extract_node("b = a + 2") maybe_tree = Tree.maybe_normalized_parse("a = 1") assert maybe_tree.tree, maybe_tree.failure - maybe_tree.tree.append_nodes([node]) + maybe_tree.tree.attach_nodes([node]) assert node.parent == maybe_tree.tree.node -def test_tree_append_nodes_adds_node_to_body() -> None: +def test_tree_attach_nodes_adds_node_to_body() -> None: node = astroid.extract_node("b = a + 2") maybe_tree = Tree.maybe_normalized_parse("a = 1") assert maybe_tree.tree, maybe_tree.failure - maybe_tree.tree.append_nodes([node]) + maybe_tree.tree.attach_nodes([node]) assert maybe_tree.tree.node.body[-1] == node @@ -368,9 +368,9 @@ def test_tree_attach_child_tree_raises_not_implemented_error_for_constant_node() Tree(Const("xyz")).attach_child_tree(Tree(Const("xyz"))) -def test_append_node_fails() -> None: +def test_tree_attach_nodes_raises_not_implemented_error_for_constant_node() -> None: with pytest.raises(NotImplementedError): - Tree(Const("xyz")).append_nodes([]) + Tree(Const("xyz")).attach_nodes([]) def test_nodes_between_fails() -> None: From 357a2e58acb4b6443b83e41d432deda5b269e3c8 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 15 Jan 2025 11:52:23 +0100 Subject: [PATCH 15/94] Narrow raising not implemented error test --- .../labs/ucx/source_code/python/python_ast.py | 4 ++-- tests/unit/source_code/python/test_python_ast.py | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/python/python_ast.py b/src/databricks/labs/ucx/source_code/python/python_ast.py index 414383203c..dac4b7fa29 100644 --- a/src/databricks/labs/ucx/source_code/python/python_ast.py +++ b/src/databricks/labs/ucx/source_code/python/python_ast.py @@ -226,7 +226,7 @@ def attach_child_tree(self, tree: Tree) -> None: def attach_nodes(self, nodes: list[NodeNG]) -> None: if not isinstance(self.node, Module): - raise NotImplementedError(f"Can't append statements to {type(self.node).__name__}") + raise NotImplementedError(f"Cannot attach nodes to: {type(self.node).__name__}") self_module: Module = cast(Module, self.node) for node in nodes: node.parent = self_module @@ -238,7 +238,7 @@ def extend_globals(self, globs: dict[str, list[NodeNG]]) -> None: Extending globals is a stateful operation for this `Tree` (`self`), similarly to extending a list. """ if not isinstance(self.node, Module): - raise NotImplementedError(f"Cannot extend globals: {type(self.node).__name__}") + raise NotImplementedError(f"Cannot extend globals to: {type(self.node).__name__}") self_module: Module = cast(Module, self.node) for global_key, global_values in globs.items(): self_module.globals[global_key] = self_module.globals.get(global_key, []) + global_values diff --git a/tests/unit/source_code/python/test_python_ast.py b/tests/unit/source_code/python/test_python_ast.py index f071adab29..8700e00769 100644 --- a/tests/unit/source_code/python/test_python_ast.py +++ b/tests/unit/source_code/python/test_python_ast.py @@ -369,10 +369,15 @@ def test_tree_attach_child_tree_raises_not_implemented_error_for_constant_node() def test_tree_attach_nodes_raises_not_implemented_error_for_constant_node() -> None: - with pytest.raises(NotImplementedError): + with pytest.raises(NotImplementedError, match="Cannot attach nodes to: .*"): Tree(Const("xyz")).attach_nodes([]) +def test_extend_globals_raises_not_implemented_error_for_constant_node() -> None: + with pytest.raises(NotImplementedError, match="Cannot extend globals to: .*"): + Tree(Const("xyz")).extend_globals({}) + + def test_nodes_between_fails() -> None: with pytest.raises(NotImplementedError): Tree(Const("xyz")).nodes_between(0, 100) @@ -382,11 +387,6 @@ def test_has_global_fails() -> None: assert not Tree.new_module().has_global("xyz") -def test_extend_globals_raises_not_implemented_error_for_constant_node() -> None: - with pytest.raises(NotImplementedError, match="Cannot extend globals: .*"): - Tree(Const("xyz")).extend_globals({}) - - def test_globals_between_fails() -> None: with pytest.raises(NotImplementedError): Tree(Const("xyz")).line_count() From d9ffd91c23279e33d4bc7860cedf499d1e9bd356 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 15 Jan 2025 11:56:06 +0100 Subject: [PATCH 16/94] Add docstring for attach nodes --- src/databricks/labs/ucx/source_code/python/python_ast.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/databricks/labs/ucx/source_code/python/python_ast.py b/src/databricks/labs/ucx/source_code/python/python_ast.py index dac4b7fa29..d1f80b0a4f 100644 --- a/src/databricks/labs/ucx/source_code/python/python_ast.py +++ b/src/databricks/labs/ucx/source_code/python/python_ast.py @@ -225,6 +225,12 @@ def attach_child_tree(self, tree: Tree) -> None: self.extend_globals(tree_module.globals) def attach_nodes(self, nodes: list[NodeNG]) -> None: + """Attach nodes. + + Attaching nodes is a **stateful** operation for both this tree's node, the parent node, and the child nodes. + After attaching the nodes, the parent node has the nodes in its body and the child nodes have this tree's node + as parent node. + """ if not isinstance(self.node, Module): raise NotImplementedError(f"Cannot attach nodes to: {type(self.node).__name__}") self_module: Module = cast(Module, self.node) From a3c023bfd43824b31ce4762a27442efa120ff6c0 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Thu, 16 Jan 2025 13:47:01 +0100 Subject: [PATCH 17/94] Change defining sources in test --- tests/unit/source_code/python/test_python_ast.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/tests/unit/source_code/python/test_python_ast.py b/tests/unit/source_code/python/test_python_ast.py index 8700e00769..cd7e795b7c 100644 --- a/tests/unit/source_code/python/test_python_ast.py +++ b/tests/unit/source_code/python/test_python_ast.py @@ -203,12 +203,11 @@ def test_is_instance_of(source, name, class_name) -> None: def test_tree_attach_child_tree_propagates_module_reference() -> None: """The spark module should propagate from the parent tree.""" - source = """ -df = spark.read.csv("hi") -df = df.withColumn(stuff) -df = df.withColumn(stuff2) -""".lstrip() # Remove first new line character - sources = source.split("\n") + sources = ( + "df = spark.read.csv('hi')", + "df = df.withColumn(stuff)", + "df = df.withColumn(stuff2)", + ) first_line_maybe_tree = Tree.maybe_normalized_parse(sources[0]) second_line_maybe_tree = Tree.maybe_normalized_parse(sources[1]) third_line_maybe_tree = Tree.maybe_normalized_parse(sources[2]) From 1e7a4b9179d482cbfcd1623f25a59bc63535a540 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Fri, 17 Jan 2025 11:44:59 +0100 Subject: [PATCH 18/94] Update constructing sources --- tests/unit/source_code/python/test_python_ast.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/tests/unit/source_code/python/test_python_ast.py b/tests/unit/source_code/python/test_python_ast.py index cd7e795b7c..38a61da3fe 100644 --- a/tests/unit/source_code/python/test_python_ast.py +++ b/tests/unit/source_code/python/test_python_ast.py @@ -203,14 +203,12 @@ def test_is_instance_of(source, name, class_name) -> None: def test_tree_attach_child_tree_propagates_module_reference() -> None: """The spark module should propagate from the parent tree.""" - sources = ( - "df = spark.read.csv('hi')", - "df = df.withColumn(stuff)", - "df = df.withColumn(stuff2)", - ) - first_line_maybe_tree = Tree.maybe_normalized_parse(sources[0]) - second_line_maybe_tree = Tree.maybe_normalized_parse(sources[1]) - third_line_maybe_tree = Tree.maybe_normalized_parse(sources[2]) + source_1 = "df = spark.read.csv('hi')" + source_2 = "df = df.withColumn(stuff)" + source_3 = "df = df.withColumn(stuff2)" + first_line_maybe_tree = Tree.maybe_normalized_parse(source_1) + second_line_maybe_tree = Tree.maybe_normalized_parse(source_2) + third_line_maybe_tree = Tree.maybe_normalized_parse(source_3) assert first_line_maybe_tree.tree, first_line_maybe_tree.failure assert second_line_maybe_tree.tree, second_line_maybe_tree.failure From 249857e99900c2dfdb70224928234691d9714cf1 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Thu, 16 Jan 2025 13:09:26 +0100 Subject: [PATCH 19/94] Test PythonLinter with dummy advices --- .../source_code/python/test_python_ast.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/tests/unit/source_code/python/test_python_ast.py b/tests/unit/source_code/python/test_python_ast.py index 38a61da3fe..e826fbc94c 100644 --- a/tests/unit/source_code/python/test_python_ast.py +++ b/tests/unit/source_code/python/test_python_ast.py @@ -1,8 +1,11 @@ +from collections.abc import Iterable + import pytest import astroid # type: ignore from astroid import Assign, AssignName, Attribute, Call, Const, Expr, Module, Name # type: ignore -from databricks.labs.ucx.source_code.python.python_ast import Tree, TreeHelper +from databricks.labs.ucx.source_code.base import Advice, Failure +from databricks.labs.ucx.source_code.python.python_ast import PythonLinter, PythonSequentialLinter, Tree, TreeHelper from databricks.labs.ucx.source_code.python.python_infer import InferredValue @@ -401,3 +404,17 @@ def test_renumber_fails() -> None: def test_const_is_not_from_module() -> None: assert not Tree(Const("xyz")).is_from_module("spark") + + +class DummyPythonLinter(PythonLinter): + """Dummy python linter yielding dummy advices for testing purpose.""" + + def lint_tree(self, tree: Tree) -> Iterable[Advice]: + yield Advice("dummy", "dummy advice", 0, 0, 0, 0) + yield Advice("dummy", "dummy advice", 1, 1, 1, 1) + + +def test_dummy_python_linter_lint_lints_tree() -> None: + linter = DummyPythonLinter() + advices = list(linter.lint("print(1)")) + assert advices == [Advice("dummy", "dummy advice", 0, 0, 0, 0), Advice("dummy", "dummy advice", 1, 1, 1, 1)] From ba04281f696cd8a86596193f72421dc4d4c4975a Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Thu, 16 Jan 2025 13:10:11 +0100 Subject: [PATCH 20/94] Test linting unparsable python code --- tests/unit/source_code/python/test_python_ast.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tests/unit/source_code/python/test_python_ast.py b/tests/unit/source_code/python/test_python_ast.py index e826fbc94c..54135f0f57 100644 --- a/tests/unit/source_code/python/test_python_ast.py +++ b/tests/unit/source_code/python/test_python_ast.py @@ -418,3 +418,18 @@ def test_dummy_python_linter_lint_lints_tree() -> None: linter = DummyPythonLinter() advices = list(linter.lint("print(1)")) assert advices == [Advice("dummy", "dummy advice", 0, 0, 0, 0), Advice("dummy", "dummy advice", 1, 1, 1, 1)] + + +def test_dummy_python_linter_lint_yields_failure_due_to_parse_error() -> None: + linter = DummyPythonLinter() + advices = list(linter.lint("print(1")) # Closing parenthesis is missing on purpose + assert advices == [ + Failure( + code="python-parse-error", + message="Failed to parse code due to invalid syntax: print(1", + start_line=0, + start_col=5, + end_line=0, + end_col=1, + ) + ] From 18e9c98d7b71c2db63382364801b2254e13ad38b Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Thu, 16 Jan 2025 13:10:33 +0100 Subject: [PATCH 21/94] Test sequential linter with dummy advices --- tests/unit/source_code/python/test_python_ast.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/unit/source_code/python/test_python_ast.py b/tests/unit/source_code/python/test_python_ast.py index 54135f0f57..9e03af6758 100644 --- a/tests/unit/source_code/python/test_python_ast.py +++ b/tests/unit/source_code/python/test_python_ast.py @@ -433,3 +433,9 @@ def test_dummy_python_linter_lint_yields_failure_due_to_parse_error() -> None: end_col=1, ) ] + + +def test_python_sequential_linter_lint_lints_tree() -> None: + linter = PythonSequentialLinter([DummyPythonLinter()], [], []) + advices = list(linter.lint("print(1)")) + assert advices == [Advice("dummy", "dummy advice", 0, 0, 0, 0), Advice("dummy", "dummy advice", 1, 1, 1, 1)] From fb8d6ee7cfd4ad01ce093163b1e7ef8981dde8d8 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Thu, 16 Jan 2025 13:11:08 +0100 Subject: [PATCH 22/94] Test linting print(1) sets no globals --- tests/unit/source_code/python/test_python_ast.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tests/unit/source_code/python/test_python_ast.py b/tests/unit/source_code/python/test_python_ast.py index 9e03af6758..5f6dc63544 100644 --- a/tests/unit/source_code/python/test_python_ast.py +++ b/tests/unit/source_code/python/test_python_ast.py @@ -439,3 +439,19 @@ def test_python_sequential_linter_lint_lints_tree() -> None: linter = PythonSequentialLinter([DummyPythonLinter()], [], []) advices = list(linter.lint("print(1)")) assert advices == [Advice("dummy", "dummy advice", 0, 0, 0, 0), Advice("dummy", "dummy advice", 1, 1, 1, 1)] + + +class NodeGlobalsLinter(PythonLinter): + """Get the node globals""" + + def lint_tree(self, tree: Tree) -> Iterable[Advice]: + globs = "" + if isinstance(tree.node, Module): + globs = ",".join(tree.node.globals.keys()) + yield Advice("globals", globs, 0, 0, 0, 0) + + +def test_python_sequential_linter_lint_has_no_globals() -> None: + linter = PythonSequentialLinter([NodeGlobalsLinter()], [], []) + advices = list(linter.lint("print(1)")) + assert advices == [Advice("globals", "", 0, 0, 0, 0)] From 96ef1ecb3fc7a0cadc82323da145471175f47636 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Thu, 16 Jan 2025 13:11:27 +0100 Subject: [PATCH 23/94] Test linting with one global --- tests/unit/source_code/python/test_python_ast.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/unit/source_code/python/test_python_ast.py b/tests/unit/source_code/python/test_python_ast.py index 5f6dc63544..37c0d13b4f 100644 --- a/tests/unit/source_code/python/test_python_ast.py +++ b/tests/unit/source_code/python/test_python_ast.py @@ -455,3 +455,9 @@ def test_python_sequential_linter_lint_has_no_globals() -> None: linter = PythonSequentialLinter([NodeGlobalsLinter()], [], []) advices = list(linter.lint("print(1)")) assert advices == [Advice("globals", "", 0, 0, 0, 0)] + + +def test_python_sequential_linter_lint_has_one_global() -> None: + linter = PythonSequentialLinter([NodeGlobalsLinter()], [], []) + advices = list(linter.lint("a = 1")) + assert advices == [Advice("globals", "a", 0, 0, 0, 0)] From e5365efd9dc07a0ef8f6e0fa9a6d157447ac2ee6 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Thu, 16 Jan 2025 13:11:40 +0100 Subject: [PATCH 24/94] Test linting with two globals --- tests/unit/source_code/python/test_python_ast.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/unit/source_code/python/test_python_ast.py b/tests/unit/source_code/python/test_python_ast.py index 37c0d13b4f..22af2f4ab6 100644 --- a/tests/unit/source_code/python/test_python_ast.py +++ b/tests/unit/source_code/python/test_python_ast.py @@ -461,3 +461,9 @@ def test_python_sequential_linter_lint_has_one_global() -> None: linter = PythonSequentialLinter([NodeGlobalsLinter()], [], []) advices = list(linter.lint("a = 1")) assert advices == [Advice("globals", "a", 0, 0, 0, 0)] + + +def test_python_sequential_linter_lint_has_two_globals() -> None: + linter = PythonSequentialLinter([NodeGlobalsLinter()], [], []) + advices = list(linter.lint("a = 1;b = 2")) + assert advices == [Advice("globals", "a,b", 0, 0, 0, 0)] From aed3b12e4fa75a3bc66bedf00170d3eb4a271d29 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Thu, 16 Jan 2025 13:12:14 +0100 Subject: [PATCH 25/94] Test linting separate code sources separates globals --- tests/unit/source_code/python/test_python_ast.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/unit/source_code/python/test_python_ast.py b/tests/unit/source_code/python/test_python_ast.py index 22af2f4ab6..44ee722d41 100644 --- a/tests/unit/source_code/python/test_python_ast.py +++ b/tests/unit/source_code/python/test_python_ast.py @@ -467,3 +467,11 @@ def test_python_sequential_linter_lint_has_two_globals() -> None: linter = PythonSequentialLinter([NodeGlobalsLinter()], [], []) advices = list(linter.lint("a = 1;b = 2")) assert advices == [Advice("globals", "a,b", 0, 0, 0, 0)] + + +def test_python_sequential_linter_lint_is_stateless() -> None: + """Globals from previous lint calls should not be part of later calls""" + linter = PythonSequentialLinter([NodeGlobalsLinter()], [], []) + list(linter.lint("a = 1")) + advices = list(linter.lint("b = 2")) + assert advices == [Advice("globals", "b", 0, 0, 0, 0)] From ee148ebac443701e09e8078b92099d2692f66273 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Thu, 16 Jan 2025 13:12:33 +0100 Subject: [PATCH 26/94] Test appending globals sets global --- tests/unit/source_code/python/test_python_ast.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tests/unit/source_code/python/test_python_ast.py b/tests/unit/source_code/python/test_python_ast.py index 44ee722d41..14a41d4787 100644 --- a/tests/unit/source_code/python/test_python_ast.py +++ b/tests/unit/source_code/python/test_python_ast.py @@ -475,3 +475,15 @@ def test_python_sequential_linter_lint_is_stateless() -> None: list(linter.lint("a = 1")) advices = list(linter.lint("b = 2")) assert advices == [Advice("globals", "b", 0, 0, 0, 0)] + + +def test_python_sequential_linter_extend_globals() -> None: + linter = PythonSequentialLinter([NodeGlobalsLinter()], [], []) + + node = astroid.extract_node("b = a + 2") + assign_name = next(node.get_children()) + assert isinstance(assign_name, AssignName) + linter.append_globals({"b": [assign_name]}) + + advices = list(linter.lint("a = 1")) + assert advices == [Advice("globals", "a,b", 0, 0, 0, 0)] From 84b5f69012034218dbe27ec1e9d06cc06190dbd7 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Thu, 16 Jan 2025 13:13:03 +0100 Subject: [PATCH 27/94] Remove SquentialLinter.make_tree --- .../labs/ucx/source_code/python/python_ast.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/python/python_ast.py b/src/databricks/labs/ucx/source_code/python/python_ast.py index d1f80b0a4f..286b3b31f1 100644 --- a/src/databricks/labs/ucx/source_code/python/python_ast.py +++ b/src/databricks/labs/ucx/source_code/python/python_ast.py @@ -678,7 +678,9 @@ def __init__( self._linters = linters self._dfsa_collectors = dfsa_collectors self._table_collectors = table_collectors - self._tree: Tree | None = None + + # This tree collects all the trees parsed from source code by attaching them as child trees + self._tree: Tree = Tree.new_module() def lint(self, code: str) -> Iterable[Advice]: maybe_tree = self._parse_and_append(code) @@ -701,23 +703,22 @@ def _parse_and_append(self, code: str) -> MaybeTree: return maybe_tree def append_tree(self, tree: Tree) -> None: - self._make_tree().attach_child_tree(tree) + self._tree.attach_child_tree(tree) def append_nodes(self, nodes: list[NodeNG]) -> None: - self._make_tree().attach_nodes(nodes) + self._tree.attach_nodes(nodes) def append_globals(self, globs: dict) -> None: - self._make_tree().extend_globals(globs) + self._tree.extend_globals(globs) def process_child_cell(self, code: str) -> None: - this_tree = self._make_tree() maybe_tree = Tree.maybe_normalized_parse(code) if maybe_tree.failure: # TODO: bubble up this error logger.warning(maybe_tree.failure.message) return assert maybe_tree.tree is not None - this_tree.attach_child_tree(maybe_tree.tree) + self._tree.attach_child_tree(maybe_tree.tree) def collect_dfsas(self, source_code: str) -> Iterable[DirectFsAccess]: maybe_tree = self._parse_and_append(source_code) @@ -744,8 +745,3 @@ def collect_tables(self, source_code: str) -> Iterable[UsedTable]: def collect_tables_from_tree(self, tree: Tree) -> Iterable[UsedTableNode]: for collector in self._table_collectors: yield from collector.collect_tables_from_tree(tree) - - def _make_tree(self) -> Tree: - if self._tree is None: - self._tree = Tree.new_module() - return self._tree From d142f7c44d5cb5a34ac44b9db513df1a91977416 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Thu, 16 Jan 2025 13:17:36 +0100 Subject: [PATCH 28/94] Refactor globals linter to fetch globals from body nodes --- .../source_code/python/test_python_ast.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/tests/unit/source_code/python/test_python_ast.py b/tests/unit/source_code/python/test_python_ast.py index 14a41d4787..f05bf7f482 100644 --- a/tests/unit/source_code/python/test_python_ast.py +++ b/tests/unit/source_code/python/test_python_ast.py @@ -441,44 +441,45 @@ def test_python_sequential_linter_lint_lints_tree() -> None: assert advices == [Advice("dummy", "dummy advice", 0, 0, 0, 0), Advice("dummy", "dummy advice", 1, 1, 1, 1)] -class NodeGlobalsLinter(PythonLinter): +class BodyNodesGlobalsLinter(PythonLinter): """Get the node globals""" def lint_tree(self, tree: Tree) -> Iterable[Advice]: - globs = "" + globs = set() if isinstance(tree.node, Module): - globs = ",".join(tree.node.globals.keys()) - yield Advice("globals", globs, 0, 0, 0, 0) + for node in tree.node.body: + globs |= set(node.parent.globals.keys()) + yield Advice("globals", ",".join(globs), 0, 0, 0, 0) def test_python_sequential_linter_lint_has_no_globals() -> None: - linter = PythonSequentialLinter([NodeGlobalsLinter()], [], []) + linter = PythonSequentialLinter([BodyNodesGlobalsLinter()], [], []) advices = list(linter.lint("print(1)")) assert advices == [Advice("globals", "", 0, 0, 0, 0)] def test_python_sequential_linter_lint_has_one_global() -> None: - linter = PythonSequentialLinter([NodeGlobalsLinter()], [], []) + linter = PythonSequentialLinter([BodyNodesGlobalsLinter()], [], []) advices = list(linter.lint("a = 1")) assert advices == [Advice("globals", "a", 0, 0, 0, 0)] def test_python_sequential_linter_lint_has_two_globals() -> None: - linter = PythonSequentialLinter([NodeGlobalsLinter()], [], []) + linter = PythonSequentialLinter([BodyNodesGlobalsLinter()], [], []) advices = list(linter.lint("a = 1;b = 2")) assert advices == [Advice("globals", "a,b", 0, 0, 0, 0)] def test_python_sequential_linter_lint_is_stateless() -> None: """Globals from previous lint calls should not be part of later calls""" - linter = PythonSequentialLinter([NodeGlobalsLinter()], [], []) + linter = PythonSequentialLinter([BodyNodesGlobalsLinter()], [], []) list(linter.lint("a = 1")) advices = list(linter.lint("b = 2")) assert advices == [Advice("globals", "b", 0, 0, 0, 0)] def test_python_sequential_linter_extend_globals() -> None: - linter = PythonSequentialLinter([NodeGlobalsLinter()], [], []) + linter = PythonSequentialLinter([BodyNodesGlobalsLinter()], [], []) node = astroid.extract_node("b = a + 2") assign_name = next(node.get_children()) From 00aad283b7a2470a5d938eeaff3bb8f627195cf3 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Thu, 16 Jan 2025 13:33:16 +0100 Subject: [PATCH 29/94] Sort globals for consistent testing --- tests/unit/source_code/python/test_python_ast.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/source_code/python/test_python_ast.py b/tests/unit/source_code/python/test_python_ast.py index f05bf7f482..fe8d656696 100644 --- a/tests/unit/source_code/python/test_python_ast.py +++ b/tests/unit/source_code/python/test_python_ast.py @@ -449,7 +449,7 @@ def lint_tree(self, tree: Tree) -> Iterable[Advice]: if isinstance(tree.node, Module): for node in tree.node.body: globs |= set(node.parent.globals.keys()) - yield Advice("globals", ",".join(globs), 0, 0, 0, 0) + yield Advice("globals", ",".join(sorted(globs)), 0, 0, 0, 0) def test_python_sequential_linter_lint_has_no_globals() -> None: From 4c1e79e577df0fc8ab5b0319242834f91457d903 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Thu, 16 Jan 2025 14:03:40 +0100 Subject: [PATCH 30/94] Test dummy DFSA Python collector --- .../source_code/python/test_python_ast.py | 36 +++++++++++++++++-- 1 file changed, 33 insertions(+), 3 deletions(-) diff --git a/tests/unit/source_code/python/test_python_ast.py b/tests/unit/source_code/python/test_python_ast.py index fe8d656696..949d72d15e 100644 --- a/tests/unit/source_code/python/test_python_ast.py +++ b/tests/unit/source_code/python/test_python_ast.py @@ -1,11 +1,12 @@ +import logging from collections.abc import Iterable import pytest import astroid # type: ignore -from astroid import Assign, AssignName, Attribute, Call, Const, Expr, Module, Name # type: ignore +from astroid import Assign, AssignName, Attribute, Call, Const, Expr, Module, Name, NodeNG # type: ignore -from databricks.labs.ucx.source_code.base import Advice, Failure -from databricks.labs.ucx.source_code.python.python_ast import PythonLinter, PythonSequentialLinter, Tree, TreeHelper +from databricks.labs.ucx.source_code.base import Advice, DirectFsAccess, DirectFsAccessNode, Failure +from databricks.labs.ucx.source_code.python.python_ast import DfsaPyCollector, PythonLinter, PythonSequentialLinter, Tree, TreeHelper from databricks.labs.ucx.source_code.python.python_infer import InferredValue @@ -488,3 +489,32 @@ def test_python_sequential_linter_extend_globals() -> None: advices = list(linter.lint("a = 1")) assert advices == [Advice("globals", "a,b", 0, 0, 0, 0)] + + +class DummyDfsaPyCollector(DfsaPyCollector): + """Dummy direct filesystem access collector yielding dummy advices for testing purpose.""" + + def collect_dfsas_from_tree(self, tree: Tree) -> Iterable[DirectFsAccessNode]: + dfsa = DirectFsAccess(path="test.py") + node = NodeNG(0, 0, None, end_lineno=0, end_col_offset=0) + yield DirectFsAccessNode(dfsa, node) + + +def test_dummy_dfsa_python_collector_collect_dfsas() -> None: + linter = DummyDfsaPyCollector() + dfsas = list(linter.collect_dfsas("print(1)")) + assert dfsas == [DirectFsAccess(path="test.py")] + + +def test_dummy_dfsa_python_collector_collect_dfsas_warns_failure_due_to_parse_error(caplog) -> None: + linter = DummyDfsaPyCollector() + with caplog.at_level(logging.WARNING, logger="databricks.labs.ucx.source_code.python.python_ast"): + dfsas = list(linter.collect_dfsas("print(1")) # Closing parenthesis is missing on purpose + assert not dfsas + assert "Failed to parse code due to invalid syntax: print(1" in caplog.messages + + +def test_python_sequential_linter_collect_dfsas() -> None: + linter = PythonSequentialLinter([], [DummyDfsaPyCollector()], []) + dfsas = list(linter.collect_dfsas("print(1)")) + assert dfsas == [DirectFsAccess(path="test.py")] From f860a6769c62193635951710ff690cc69309c3d0 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Thu, 16 Jan 2025 15:46:11 +0100 Subject: [PATCH 31/94] Test dummy used table Python collector --- .../source_code/python/test_python_ast.py | 33 +++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/tests/unit/source_code/python/test_python_ast.py b/tests/unit/source_code/python/test_python_ast.py index 949d72d15e..c5d215317c 100644 --- a/tests/unit/source_code/python/test_python_ast.py +++ b/tests/unit/source_code/python/test_python_ast.py @@ -5,8 +5,8 @@ import astroid # type: ignore from astroid import Assign, AssignName, Attribute, Call, Const, Expr, Module, Name, NodeNG # type: ignore -from databricks.labs.ucx.source_code.base import Advice, DirectFsAccess, DirectFsAccessNode, Failure -from databricks.labs.ucx.source_code.python.python_ast import DfsaPyCollector, PythonLinter, PythonSequentialLinter, Tree, TreeHelper +from databricks.labs.ucx.source_code.base import Advice, DirectFsAccess, DirectFsAccessNode, Failure, UsedTable, UsedTableNode +from databricks.labs.ucx.source_code.python.python_ast import DfsaPyCollector, PythonLinter, PythonSequentialLinter, TablePyCollector, Tree, TreeHelper from databricks.labs.ucx.source_code.python.python_infer import InferredValue @@ -518,3 +518,32 @@ def test_python_sequential_linter_collect_dfsas() -> None: linter = PythonSequentialLinter([], [DummyDfsaPyCollector()], []) dfsas = list(linter.collect_dfsas("print(1)")) assert dfsas == [DirectFsAccess(path="test.py")] + + +class DummyTablePyCollector(TablePyCollector): + """Dummy table collector yielding dummy used tables for testing purpose.""" + + def collect_tables_from_tree(self, tree: Tree) -> Iterable[UsedTableNode]: + dfsa = UsedTable(schema_name="test", table_name="test") + node = NodeNG(0, 0, None, end_lineno=0, end_col_offset=0) + yield UsedTableNode(dfsa, node) + + +def test_dummy_table_python_collector_collect_tables() -> None: + linter = DummyTablePyCollector() + used_tables = list(linter.collect_tables("print(1)")) + assert used_tables == [UsedTable(schema_name="test", table_name="test")] + + +def test_dummy_table_python_collector_collect_tables_warns_failure_due_to_parse_error(caplog) -> None: + linter = DummyTablePyCollector() + with caplog.at_level(logging.WARNING, logger="databricks.labs.ucx.source_code.python.python_ast"): + used_tables = list(linter.collect_tables("print(1")) # Closing parenthesis is missing on purpose + assert not used_tables + assert "Failed to parse code due to invalid syntax: print(1" in caplog.messages + + +def test_python_sequential_linter_collect_tables() -> None: + linter = PythonSequentialLinter([], [], [DummyTablePyCollector()]) + used_tables = list(linter.collect_tables("print(1)")) + assert used_tables == [UsedTable(schema_name="test", table_name="test")] From 906ba879da07e2795aa549267d06ef24c55aacdd Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Thu, 16 Jan 2025 15:47:48 +0100 Subject: [PATCH 32/94] Delete dead code `PythonSequentialLinter.process_child_cell` --- src/databricks/labs/ucx/source_code/python/python_ast.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/python/python_ast.py b/src/databricks/labs/ucx/source_code/python/python_ast.py index 286b3b31f1..8120ba7b85 100644 --- a/src/databricks/labs/ucx/source_code/python/python_ast.py +++ b/src/databricks/labs/ucx/source_code/python/python_ast.py @@ -711,15 +711,6 @@ def append_nodes(self, nodes: list[NodeNG]) -> None: def append_globals(self, globs: dict) -> None: self._tree.extend_globals(globs) - def process_child_cell(self, code: str) -> None: - maybe_tree = Tree.maybe_normalized_parse(code) - if maybe_tree.failure: - # TODO: bubble up this error - logger.warning(maybe_tree.failure.message) - return - assert maybe_tree.tree is not None - self._tree.attach_child_tree(maybe_tree.tree) - def collect_dfsas(self, source_code: str) -> Iterable[DirectFsAccess]: maybe_tree = self._parse_and_append(source_code) if maybe_tree.failure: From feb0a8cab0c41263cc0d74128fd5e49d8a1dbaa1 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Thu, 16 Jan 2025 15:48:57 +0100 Subject: [PATCH 33/94] Format imports --- .../unit/source_code/python/test_python_ast.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/tests/unit/source_code/python/test_python_ast.py b/tests/unit/source_code/python/test_python_ast.py index c5d215317c..7a3b32fbe5 100644 --- a/tests/unit/source_code/python/test_python_ast.py +++ b/tests/unit/source_code/python/test_python_ast.py @@ -5,8 +5,22 @@ import astroid # type: ignore from astroid import Assign, AssignName, Attribute, Call, Const, Expr, Module, Name, NodeNG # type: ignore -from databricks.labs.ucx.source_code.base import Advice, DirectFsAccess, DirectFsAccessNode, Failure, UsedTable, UsedTableNode -from databricks.labs.ucx.source_code.python.python_ast import DfsaPyCollector, PythonLinter, PythonSequentialLinter, TablePyCollector, Tree, TreeHelper +from databricks.labs.ucx.source_code.base import ( + Advice, + DirectFsAccess, + DirectFsAccessNode, + Failure, + UsedTable, + UsedTableNode, +) +from databricks.labs.ucx.source_code.python.python_ast import ( + DfsaPyCollector, + PythonLinter, + PythonSequentialLinter, + TablePyCollector, + Tree, + TreeHelper, +) from databricks.labs.ucx.source_code.python.python_infer import InferredValue From 73991aed9f3c0ea0942bd5cca4c9b8e86b753849 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Thu, 16 Jan 2025 15:55:13 +0100 Subject: [PATCH 34/94] Remove Tree from python sequential linter --- .../labs/ucx/source_code/notebooks/sources.py | 20 +------------------ .../labs/ucx/source_code/python/python_ast.py | 13 ------------ .../source_code/python/test_python_ast.py | 12 ----------- 3 files changed, 1 insertion(+), 44 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/notebooks/sources.py b/src/databricks/labs/ucx/source_code/notebooks/sources.py index 1b8a4e4528..0a69a75e38 100644 --- a/src/databricks/labs/ucx/source_code/notebooks/sources.py +++ b/src/databricks/labs/ucx/source_code/notebooks/sources.py @@ -149,7 +149,6 @@ def __init__( path_lookup: PathLookup, session_state: CurrentSessionState, notebook: Notebook, - inherited_tree: Tree | None = None, ): self._context: LinterContext = context self._path_lookup = path_lookup @@ -158,8 +157,6 @@ def __init__( # reuse Python linter across related files and notebook cells # this is required in order to accumulate statements for improved inference self._python_linter: PythonSequentialLinter = cast(PythonSequentialLinter, context.linter(Language.PYTHON)) - if inherited_tree is not None: - self._python_linter.append_tree(inherited_tree) self._python_trees: dict[PythonCell, Tree] = {} # the original trees to be linted def lint(self) -> Iterable[Advice]: @@ -214,17 +211,11 @@ def _load_children_from_tree(self, tree: Tree) -> Iterable[Advice]: base_nodes.extend(self._list_run_magic_lines(tree)) base_nodes.extend(SysPathChange.extract_from_tree(self._session_state, tree)) if len(base_nodes) == 0: - self._python_linter.append_tree(tree) return - # append globals - globs = cast(Module, tree.node).globals - self._python_linter.append_globals(globs) # need to execute things in intertwined sequence so concat and sort them nodes = list(cast(Module, tree.node).body) base_nodes = sorted(base_nodes, key=lambda node: (node.node.lineno, node.node.col_offset)) yield from self._load_children_with_base_nodes(nodes, base_nodes) - # append remaining nodes - self._python_linter.append_nodes(nodes) @staticmethod def _list_run_magic_lines(tree: Tree) -> Iterable[MagicLine]: @@ -244,7 +235,6 @@ def _load_children_with_base_nodes(self, nodes: list[NodeNG], base_nodes: list[N def _load_children_with_base_node(self, nodes: list[NodeNG], base_node: NodeBase) -> Iterable[Advice]: while len(nodes) > 0: node = nodes.pop(0) - self._python_linter.append_nodes([node]) if node.lineno < base_node.node.lineno: continue yield from self._load_children_from_base_node(base_node) @@ -417,8 +407,6 @@ def _lint_file(self) -> Iterable[Advice]: else: try: linter = self._ctx.linter(language) - if self._inherited_tree is not None and isinstance(linter, PythonSequentialLinter): - linter.append_tree(self._inherited_tree) yield from linter.lint(self._content) except ValueError as err: failure_message = f"Error while parsing content of {self._path.as_posix()}: {err}" @@ -431,11 +419,5 @@ def _lint_notebook(self) -> Iterable[Advice]: yield Failure("unknown-language", f"Cannot detect language for {self._path}", 0, 0, 1, 1) return notebook = Notebook.parse(self._path, self._content, language) - notebook_linter = NotebookLinter( - self._ctx, - self._path_lookup, - self._session_state, - notebook, - self._inherited_tree, - ) + notebook_linter = NotebookLinter(self._ctx, self._path_lookup, self._session_state, notebook) yield from notebook_linter.lint() diff --git a/src/databricks/labs/ucx/source_code/python/python_ast.py b/src/databricks/labs/ucx/source_code/python/python_ast.py index 8120ba7b85..1648e57dc6 100644 --- a/src/databricks/labs/ucx/source_code/python/python_ast.py +++ b/src/databricks/labs/ucx/source_code/python/python_ast.py @@ -679,9 +679,6 @@ def __init__( self._dfsa_collectors = dfsa_collectors self._table_collectors = table_collectors - # This tree collects all the trees parsed from source code by attaching them as child trees - self._tree: Tree = Tree.new_module() - def lint(self, code: str) -> Iterable[Advice]: maybe_tree = self._parse_and_append(code) if maybe_tree.failure: @@ -699,18 +696,8 @@ def _parse_and_append(self, code: str) -> MaybeTree: if maybe_tree.failure: return maybe_tree assert maybe_tree.tree is not None - self.append_tree(maybe_tree.tree) return maybe_tree - def append_tree(self, tree: Tree) -> None: - self._tree.attach_child_tree(tree) - - def append_nodes(self, nodes: list[NodeNG]) -> None: - self._tree.attach_nodes(nodes) - - def append_globals(self, globs: dict) -> None: - self._tree.extend_globals(globs) - def collect_dfsas(self, source_code: str) -> Iterable[DirectFsAccess]: maybe_tree = self._parse_and_append(source_code) if maybe_tree.failure: diff --git a/tests/unit/source_code/python/test_python_ast.py b/tests/unit/source_code/python/test_python_ast.py index 7a3b32fbe5..e714aec486 100644 --- a/tests/unit/source_code/python/test_python_ast.py +++ b/tests/unit/source_code/python/test_python_ast.py @@ -493,18 +493,6 @@ def test_python_sequential_linter_lint_is_stateless() -> None: assert advices == [Advice("globals", "b", 0, 0, 0, 0)] -def test_python_sequential_linter_extend_globals() -> None: - linter = PythonSequentialLinter([BodyNodesGlobalsLinter()], [], []) - - node = astroid.extract_node("b = a + 2") - assign_name = next(node.get_children()) - assert isinstance(assign_name, AssignName) - linter.append_globals({"b": [assign_name]}) - - advices = list(linter.lint("a = 1")) - assert advices == [Advice("globals", "a,b", 0, 0, 0, 0)] - - class DummyDfsaPyCollector(DfsaPyCollector): """Dummy direct filesystem access collector yielding dummy advices for testing purpose.""" From 4cf6d8aa957c1c81b5be532d66e5a473930cdcc4 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Thu, 16 Jan 2025 16:53:16 +0100 Subject: [PATCH 35/94] Fix type hinting for classmethod with child classes --- src/databricks/labs/ucx/source_code/base.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/base.py b/src/databricks/labs/ucx/source_code/base.py index d02dbcdb70..fe3f06c9f1 100644 --- a/src/databricks/labs/ucx/source_code/base.py +++ b/src/databricks/labs/ucx/source_code/base.py @@ -10,7 +10,7 @@ from dataclasses import dataclass, field from datetime import datetime from pathlib import Path -from typing import Any, BinaryIO, TextIO +from typing import Any, BinaryIO, TextIO, Type, TypeVar from astroid import NodeNG # type: ignore from sqlglot import Expression, parse as parse_sql @@ -40,6 +40,9 @@ logger = logging.getLogger(__name__) +T = TypeVar("T", bound="Advice") + + @dataclass class Advice: code: str @@ -66,7 +69,7 @@ def for_path(self, path: Path) -> LocatedAdvice: return LocatedAdvice(self, path) @classmethod - def from_node(cls, *, code: str, message: str, node: NodeNG) -> Advice: + def from_node(cls: Type[T], *, code: str, message: str, node: NodeNG) -> T: # Astroid lines are 1-based. return cls( code=code, From 04fda6fd69b427b65d434b20bfaec85eda7b8c5f Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Thu, 16 Jan 2025 16:53:48 +0100 Subject: [PATCH 36/94] Let tree loading return failure --- .../labs/ucx/source_code/notebooks/sources.py | 92 ++++++++++--------- 1 file changed, 49 insertions(+), 43 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/notebooks/sources.py b/src/databricks/labs/ucx/source_code/notebooks/sources.py index 0a69a75e38..95fedaa6fb 100644 --- a/src/databricks/labs/ucx/source_code/notebooks/sources.py +++ b/src/databricks/labs/ucx/source_code/notebooks/sources.py @@ -18,7 +18,6 @@ safe_read_text, read_text, Advice, - Advisory, CurrentSessionState, Failure, Linter, @@ -36,7 +35,7 @@ UnresolvedPath, ) from databricks.labs.ucx.source_code.notebooks.magic import MagicLine -from databricks.labs.ucx.source_code.python.python_ast import Tree, NodeBase, PythonSequentialLinter +from databricks.labs.ucx.source_code.python.python_ast import NodeBase, PythonSequentialLinter, Tree from databricks.labs.ucx.source_code.notebooks.cells import ( CellLanguage, Cell, @@ -160,12 +159,9 @@ def __init__( self._python_trees: dict[PythonCell, Tree] = {} # the original trees to be linted def lint(self) -> Iterable[Advice]: - has_failure = False - for advice in self._load_tree_from_notebook(self._notebook, True): - if isinstance(advice, Failure): # happens when a cell is unparseable - has_failure = True - yield advice - if has_failure: + failure = self._load_tree_from_notebook(self._notebook, True) + if failure: + yield failure return for cell in self._notebook.cells: if not self._context.is_supported(cell.language.language): @@ -182,40 +178,42 @@ def lint(self) -> Iterable[Advice]: start_line=advice.start_line + cell.original_offset, end_line=advice.end_line + cell.original_offset, ) + return - def _load_tree_from_notebook(self, notebook: Notebook, register_trees: bool) -> Iterable[Advice]: + def _load_tree_from_notebook(self, notebook: Notebook, register_trees: bool) -> Failure | None: for cell in notebook.cells: + failure = None if isinstance(cell, RunCell): - yield from self._load_tree_from_run_cell(cell) - continue - if isinstance(cell, PythonCell): - yield from self._load_tree_from_python_cell(cell, register_trees) - continue - - def _load_tree_from_python_cell(self, python_cell: PythonCell, register_trees: bool) -> Iterable[Advice]: + failure = self._load_tree_from_run_cell(cell) + elif isinstance(cell, PythonCell): + failure = self._load_tree_from_python_cell(cell, register_trees) + if failure: + return failure + return None + + def _load_tree_from_python_cell(self, python_cell: PythonCell, register_trees: bool) -> Failure | None: maybe_tree = Tree.maybe_normalized_parse(python_cell.original_code) if maybe_tree.failure: - yield maybe_tree.failure - tree = maybe_tree.tree - # a cell with only comments will not produce a tree + return maybe_tree.failure if register_trees: - self._python_trees[python_cell] = tree or Tree.new_module() - if not tree: - return - yield from self._load_children_from_tree(tree) + # A cell with only comments will not produce a tree + self._python_trees[python_cell] = maybe_tree.tree or Tree.new_module() + if maybe_tree.tree: + return self._load_children_from_tree(maybe_tree.tree) + return None - def _load_children_from_tree(self, tree: Tree) -> Iterable[Advice]: + def _load_children_from_tree(self, tree: Tree) -> Failure | None: assert isinstance(tree.node, Module) # look for child notebooks (and sys.path changes that might affect their loading) base_nodes: list[NodeBase] = [] base_nodes.extend(self._list_run_magic_lines(tree)) base_nodes.extend(SysPathChange.extract_from_tree(self._session_state, tree)) if len(base_nodes) == 0: - return + return None # need to execute things in intertwined sequence so concat and sort them nodes = list(cast(Module, tree.node).body) base_nodes = sorted(base_nodes, key=lambda node: (node.node.lineno, node.node.col_offset)) - yield from self._load_children_with_base_nodes(nodes, base_nodes) + return self._load_children_with_base_nodes(nodes, base_nodes) @staticmethod def _list_run_magic_lines(tree: Tree) -> Iterable[MagicLine]: @@ -228,52 +226,60 @@ def _ignore_problem(_code: str, _message: str, _node: NodeNG) -> None: if isinstance(command.as_magic(), RunCommand): yield command - def _load_children_with_base_nodes(self, nodes: list[NodeNG], base_nodes: list[NodeBase]) -> Iterable[Advice]: + def _load_children_with_base_nodes(self, nodes: list[NodeNG], base_nodes: list[NodeBase]) -> Failure | None: for base_node in base_nodes: - yield from self._load_children_with_base_node(nodes, base_node) + failure = self._load_children_with_base_node(nodes, base_node) + if failure: + return failure + return None - def _load_children_with_base_node(self, nodes: list[NodeNG], base_node: NodeBase) -> Iterable[Advice]: + def _load_children_with_base_node(self, nodes: list[NodeNG], base_node: NodeBase) -> Failure | None: while len(nodes) > 0: node = nodes.pop(0) if node.lineno < base_node.node.lineno: continue - yield from self._load_children_from_base_node(base_node) + failure = self._load_children_from_base_node(base_node) + if failure: + return failure + return None - def _load_children_from_base_node(self, base_node: NodeBase) -> Iterable[Advice]: + def _load_children_from_base_node(self, base_node: NodeBase) -> Failure | None: if isinstance(base_node, SysPathChange): - yield from self._mutate_path_lookup(base_node) - return + failure = self._mutate_path_lookup(base_node) + if failure: + return failure if isinstance(base_node, MagicLine): magic = base_node.as_magic() assert isinstance(magic, RunCommand) notebook = self._load_source_from_path(magic.notebook_path) if notebook is None: - yield Advisory.from_node( + failure = Failure.from_node( code='dependency-not-found', message=f"Can't locate dependency: {magic.notebook_path}", node=base_node.node, ) - return - yield from self._load_tree_from_notebook(notebook, False) - return + return failure + return self._load_tree_from_notebook(notebook, False) + return None - def _mutate_path_lookup(self, change: SysPathChange) -> Iterable[Advice]: + def _mutate_path_lookup(self, change: SysPathChange) -> Failure | None: if isinstance(change, UnresolvedPath): - yield Advisory.from_node( + return Failure.from_node( code='sys-path-cannot-compute-value', message=f"Can't update sys.path from {change.node.as_string()} because the expression cannot be computed", node=change.node, ) - return change.apply_to(self._path_lookup) + return None - def _load_tree_from_run_cell(self, run_cell: RunCell) -> Iterable[Advice]: + def _load_tree_from_run_cell(self, run_cell: RunCell) -> Failure | None: path = run_cell.maybe_notebook_path() if path is None: - return # malformed run cell already reported + return None # malformed run cell already reported notebook = self._load_source_from_path(path) if notebook is not None: - yield from self._load_tree_from_notebook(notebook, False) + return self._load_tree_from_notebook(notebook, False) + return None def _load_source_from_path(self, path: Path | None) -> Notebook | None: if path is None: From 7e16589ff0fb12b57310f2a31e46c66b2816d18b Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Thu, 16 Jan 2025 17:39:45 +0100 Subject: [PATCH 37/94] Connect cells using parents --- .../labs/ucx/source_code/notebooks/sources.py | 20 +++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/notebooks/sources.py b/src/databricks/labs/ucx/source_code/notebooks/sources.py index 95fedaa6fb..9411d4859a 100644 --- a/src/databricks/labs/ucx/source_code/notebooks/sources.py +++ b/src/databricks/labs/ucx/source_code/notebooks/sources.py @@ -180,21 +180,29 @@ def lint(self) -> Iterable[Advice]: ) return - def _load_tree_from_notebook(self, notebook: Notebook, register_trees: bool) -> Failure | None: + def _load_tree_from_notebook( + self, notebook: Notebook, register_trees: bool, *, parent_tree: Tree | None = None + ) -> Failure | None: for cell in notebook.cells: failure = None if isinstance(cell, RunCell): - failure = self._load_tree_from_run_cell(cell) + failure = self._load_tree_from_run_cell(cell, parent_tree=parent_tree) elif isinstance(cell, PythonCell): - failure = self._load_tree_from_python_cell(cell, register_trees) + failure = self._load_tree_from_python_cell(cell, register_trees, parent_tree=parent_tree) + parent_tree = self._python_trees.get(cell) if failure: return failure return None - def _load_tree_from_python_cell(self, python_cell: PythonCell, register_trees: bool) -> Failure | None: + def _load_tree_from_python_cell( + self, python_cell: PythonCell, register_trees: bool, parent_tree: Tree | None = None + ) -> Failure | None: maybe_tree = Tree.maybe_normalized_parse(python_cell.original_code) if maybe_tree.failure: return maybe_tree.failure + assert maybe_tree.tree is not None + if parent_tree: + parent_tree.attach_child_tree(maybe_tree.tree) if register_trees: # A cell with only comments will not produce a tree self._python_trees[python_cell] = maybe_tree.tree or Tree.new_module() @@ -272,13 +280,13 @@ def _mutate_path_lookup(self, change: SysPathChange) -> Failure | None: change.apply_to(self._path_lookup) return None - def _load_tree_from_run_cell(self, run_cell: RunCell) -> Failure | None: + def _load_tree_from_run_cell(self, run_cell: RunCell, *, parent_tree: Tree | None = None) -> Failure | None: path = run_cell.maybe_notebook_path() if path is None: return None # malformed run cell already reported notebook = self._load_source_from_path(path) if notebook is not None: - return self._load_tree_from_notebook(notebook, False) + return self._load_tree_from_notebook(notebook, False, parent_tree=parent_tree) return None def _load_source_from_path(self, path: Path | None) -> Notebook | None: From d2db4b6606942858273260e1e0e1df762c56cf5e Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Fri, 17 Jan 2025 09:30:19 +0100 Subject: [PATCH 38/94] Pass inherited tree to notebook linter --- src/databricks/labs/ucx/source_code/notebooks/sources.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/notebooks/sources.py b/src/databricks/labs/ucx/source_code/notebooks/sources.py index 9411d4859a..3ddbc9d94a 100644 --- a/src/databricks/labs/ucx/source_code/notebooks/sources.py +++ b/src/databricks/labs/ucx/source_code/notebooks/sources.py @@ -148,18 +148,21 @@ def __init__( path_lookup: PathLookup, session_state: CurrentSessionState, notebook: Notebook, + inherited_tree: Tree | None = None, ): self._context: LinterContext = context self._path_lookup = path_lookup self._session_state = session_state self._notebook: Notebook = notebook + self._inherited_tree = inherited_tree + # reuse Python linter across related files and notebook cells # this is required in order to accumulate statements for improved inference self._python_linter: PythonSequentialLinter = cast(PythonSequentialLinter, context.linter(Language.PYTHON)) self._python_trees: dict[PythonCell, Tree] = {} # the original trees to be linted def lint(self) -> Iterable[Advice]: - failure = self._load_tree_from_notebook(self._notebook, True) + failure = self._load_tree_from_notebook(self._notebook, True, parent_tree=self._inherited_tree) if failure: yield failure return @@ -433,5 +436,5 @@ def _lint_notebook(self) -> Iterable[Advice]: yield Failure("unknown-language", f"Cannot detect language for {self._path}", 0, 0, 1, 1) return notebook = Notebook.parse(self._path, self._content, language) - notebook_linter = NotebookLinter(self._ctx, self._path_lookup, self._session_state, notebook) + notebook_linter = NotebookLinter(self._ctx, self._path_lookup, self._session_state, notebook, self._inherited_tree) yield from notebook_linter.lint() From 14b8c4506f1a4abf18921ef63de6e76a5d79b837 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Fri, 17 Jan 2025 10:07:29 +0100 Subject: [PATCH 39/94] Format --- src/databricks/labs/ucx/source_code/notebooks/sources.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/databricks/labs/ucx/source_code/notebooks/sources.py b/src/databricks/labs/ucx/source_code/notebooks/sources.py index 3ddbc9d94a..eb6eea63e1 100644 --- a/src/databricks/labs/ucx/source_code/notebooks/sources.py +++ b/src/databricks/labs/ucx/source_code/notebooks/sources.py @@ -436,5 +436,7 @@ def _lint_notebook(self) -> Iterable[Advice]: yield Failure("unknown-language", f"Cannot detect language for {self._path}", 0, 0, 1, 1) return notebook = Notebook.parse(self._path, self._content, language) - notebook_linter = NotebookLinter(self._ctx, self._path_lookup, self._session_state, notebook, self._inherited_tree) + notebook_linter = NotebookLinter( + self._ctx, self._path_lookup, self._session_state, notebook, self._inherited_tree + ) yield from notebook_linter.lint() From 5c28fc9355b299d6570345b3beb5feb06bd8467c Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Fri, 17 Jan 2025 10:10:18 +0100 Subject: [PATCH 40/94] Disable test that does not reflect a realistic scenario --- tests/unit/source_code/test_functional.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/unit/source_code/test_functional.py b/tests/unit/source_code/test_functional.py index b9df497aca..e8c79bfe86 100644 --- a/tests/unit/source_code/test_functional.py +++ b/tests/unit/source_code/test_functional.py @@ -244,7 +244,10 @@ def test_functional(sample: Functional, mock_path_lookup, simple_dependency_reso ("_child_that_uses_missing_value.py", "parent_that_dbutils_runs_child_that_misses_value_from_parent.py"), ("_child_that_uses_value_from_parent.py", "grand_parent_that_dbutils_runs_parent_that_magic_runs_child.py"), ("_child_that_uses_missing_value.py", "parent_that_imports_child_that_misses_value_from_parent.py"), - ("_child_that_uses_value_from_parent.py", "grand_parent_that_imports_parent_that_magic_runs_child.py"), + # TODO: Confirm to delete this test as it is a file that imports a notebook that runs the child, which is not a + # real case and it results in the imported notebook to be treated as a file as the ImportResolver only returns + # containers for files not for notebooks (as it should) + # ("_child_that_uses_value_from_parent.py", "grand_parent_that_imports_parent_that_magic_runs_child.py"), ], ) def test_functional_with_parent( From 1b8f4b56b1daa8c3d6e8d31aa6b92a5b9aee113c Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Fri, 17 Jan 2025 10:22:54 +0100 Subject: [PATCH 41/94] Pass run cell's tree as parent to the notebook it is running --- .../labs/ucx/source_code/notebooks/sources.py | 22 +++++++++++-------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/notebooks/sources.py b/src/databricks/labs/ucx/source_code/notebooks/sources.py index eb6eea63e1..915e7e729c 100644 --- a/src/databricks/labs/ucx/source_code/notebooks/sources.py +++ b/src/databricks/labs/ucx/source_code/notebooks/sources.py @@ -184,7 +184,7 @@ def lint(self) -> Iterable[Advice]: return def _load_tree_from_notebook( - self, notebook: Notebook, register_trees: bool, *, parent_tree: Tree | None = None + self, notebook: Notebook, register_trees: bool, *, parent_tree: Tree | None ) -> Failure | None: for cell in notebook.cells: failure = None @@ -198,7 +198,7 @@ def _load_tree_from_notebook( return None def _load_tree_from_python_cell( - self, python_cell: PythonCell, register_trees: bool, parent_tree: Tree | None = None + self, python_cell: PythonCell, register_trees: bool, *, parent_tree: Tree | None ) -> Failure | None: maybe_tree = Tree.maybe_normalized_parse(python_cell.original_code) if maybe_tree.failure: @@ -224,7 +224,7 @@ def _load_children_from_tree(self, tree: Tree) -> Failure | None: # need to execute things in intertwined sequence so concat and sort them nodes = list(cast(Module, tree.node).body) base_nodes = sorted(base_nodes, key=lambda node: (node.node.lineno, node.node.col_offset)) - return self._load_children_with_base_nodes(nodes, base_nodes) + return self._load_children_with_base_nodes(nodes, base_nodes, parent_tree=tree) @staticmethod def _list_run_magic_lines(tree: Tree) -> Iterable[MagicLine]: @@ -237,24 +237,28 @@ def _ignore_problem(_code: str, _message: str, _node: NodeNG) -> None: if isinstance(command.as_magic(), RunCommand): yield command - def _load_children_with_base_nodes(self, nodes: list[NodeNG], base_nodes: list[NodeBase]) -> Failure | None: + def _load_children_with_base_nodes( + self, nodes: list[NodeNG], base_nodes: list[NodeBase], *, parent_tree: Tree | None + ) -> Failure | None: for base_node in base_nodes: - failure = self._load_children_with_base_node(nodes, base_node) + failure = self._load_children_with_base_node(nodes, base_node, parent_tree=parent_tree) if failure: return failure return None - def _load_children_with_base_node(self, nodes: list[NodeNG], base_node: NodeBase) -> Failure | None: + def _load_children_with_base_node( + self, nodes: list[NodeNG], base_node: NodeBase, *, parent_tree: Tree | None + ) -> Failure | None: while len(nodes) > 0: node = nodes.pop(0) if node.lineno < base_node.node.lineno: continue - failure = self._load_children_from_base_node(base_node) + failure = self._load_children_from_base_node(base_node, parent_tree=parent_tree) if failure: return failure return None - def _load_children_from_base_node(self, base_node: NodeBase) -> Failure | None: + def _load_children_from_base_node(self, base_node: NodeBase, *, parent_tree: Tree | None) -> Failure | None: if isinstance(base_node, SysPathChange): failure = self._mutate_path_lookup(base_node) if failure: @@ -270,7 +274,7 @@ def _load_children_from_base_node(self, base_node: NodeBase) -> Failure | None: node=base_node.node, ) return failure - return self._load_tree_from_notebook(notebook, False) + return self._load_tree_from_notebook(notebook, False, parent_tree=parent_tree) return None def _mutate_path_lookup(self, change: SysPathChange) -> Failure | None: From b7998bc160de28d6f75825277e09eaffbb129b5e Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Fri, 17 Jan 2025 10:57:38 +0100 Subject: [PATCH 42/94] Do not append child nodes to parents body --- .../labs/ucx/source_code/python/python_ast.py | 23 ++++++++-------- .../source_code/python/test_python_ast.py | 26 ++++--------------- 2 files changed, 17 insertions(+), 32 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/python/python_ast.py b/src/databricks/labs/ucx/source_code/python/python_ast.py index 1648e57dc6..df40993d29 100644 --- a/src/databricks/labs/ucx/source_code/python/python_ast.py +++ b/src/databricks/labs/ucx/source_code/python/python_ast.py @@ -213,30 +213,31 @@ def __repr__(self): def attach_child_tree(self, tree: Tree) -> None: """Attach a child tree. - Attaching a child tree is a **stateful** operation for both the parent and child tree. After attaching a child - tree, a tree can be traversed starting from the parent or child tree. From both starting points all nodes in - both trees can be reached, though, the order of nodes will be different as that is relative to the starting - point. + 1. Make parent tree of the nodes in the child tree + 2. Extend parents globals with child globals + + Attaching a child tree is a **stateful** operation for the child tree. After attaching a child + tree, the tree can be traversed starting from the child tree as a child knows its parent. However, the tree can + not be traversed from the parent tree as that node object does not contain a list with children trees. """ if not isinstance(tree.node, Module): raise NotImplementedError(f"Cannot attach child tree: {type(tree.node).__name__}") tree_module: Module = cast(Module, tree.node) - self.attach_nodes(tree_module.body) + self.attach_child_nodes(tree_module.body) self.extend_globals(tree_module.globals) - def attach_nodes(self, nodes: list[NodeNG]) -> None: - """Attach nodes. + def attach_child_nodes(self, nodes: list[NodeNG]) -> None: + """Attach child nodes. - Attaching nodes is a **stateful** operation for both this tree's node, the parent node, and the child nodes. - After attaching the nodes, the parent node has the nodes in its body and the child nodes have this tree's node - as parent node. + Attaching a child tree is a **stateful** operation for the child tree. After attaching a child + tree, the tree can be traversed starting from the child tree as a child knows its parent. However, the tree can + not be traversed from the parent tree as that node object does not contain a list with children trees. """ if not isinstance(self.node, Module): raise NotImplementedError(f"Cannot attach nodes to: {type(self.node).__name__}") self_module: Module = cast(Module, self.node) for node in nodes: node.parent = self_module - self_module.body.append(node) def extend_globals(self, globs: dict[str, list[NodeNG]]) -> None: """Extend globals by extending the global values for each global key. diff --git a/tests/unit/source_code/python/test_python_ast.py b/tests/unit/source_code/python/test_python_ast.py index e714aec486..17da5ee3d9 100644 --- a/tests/unit/source_code/python/test_python_ast.py +++ b/tests/unit/source_code/python/test_python_ast.py @@ -158,8 +158,7 @@ def test_ignores_magic_marker_in_multiline_comment() -> None: assert True -def test_tree_attach_child_tree_infers_value() -> None: - """Attaching trees allows traversing from both parent and child.""" +def test_tree_attach_child_tree_allows_to_infer_value() -> None: inferred_string = "Hello John!" parent_source, child_source = "a = 'John'", 'b = f"Hello {a}!"' parent_maybe_tree = Tree.maybe_normalized_parse(parent_source) @@ -170,11 +169,6 @@ def test_tree_attach_child_tree_infers_value() -> None: parent_maybe_tree.tree.attach_child_tree(child_maybe_tree.tree) - nodes = parent_maybe_tree.tree.locate(Assign, []) - tree = Tree(nodes[1].value) # Starting from the parent, we are looking for the last assign - strings = [value.as_string() for value in InferredValue.infer_from_node(tree.node)] - assert strings == [inferred_string] - nodes = child_maybe_tree.tree.locate(Assign, []) tree = Tree(nodes[0].value) # Starting from child, we are looking for the first assign strings = [value.as_string() for value in InferredValue.infer_from_node(tree.node)] @@ -322,26 +316,16 @@ def test_is_builtin(source, name, is_builtin) -> None: assert False # could not locate call -def test_tree_attach_nodes_sets_parent() -> None: +def test_tree_attach_child_nodes_sets_parent() -> None: node = astroid.extract_node("b = a + 2") maybe_tree = Tree.maybe_normalized_parse("a = 1") assert maybe_tree.tree, maybe_tree.failure - maybe_tree.tree.attach_nodes([node]) + maybe_tree.tree.attach_child_nodes([node]) assert node.parent == maybe_tree.tree.node -def test_tree_attach_nodes_adds_node_to_body() -> None: - node = astroid.extract_node("b = a + 2") - maybe_tree = Tree.maybe_normalized_parse("a = 1") - assert maybe_tree.tree, maybe_tree.failure - - maybe_tree.tree.attach_nodes([node]) - - assert maybe_tree.tree.node.body[-1] == node - - def test_tree_extend_globals_adds_assign_name_to_tree() -> None: maybe_tree = Tree.maybe_normalized_parse("a = 1") assert maybe_tree.tree, maybe_tree.failure @@ -383,9 +367,9 @@ def test_tree_attach_child_tree_raises_not_implemented_error_for_constant_node() Tree(Const("xyz")).attach_child_tree(Tree(Const("xyz"))) -def test_tree_attach_nodes_raises_not_implemented_error_for_constant_node() -> None: +def test_tree_attach_child_nodes_raises_not_implemented_error_for_constant_node() -> None: with pytest.raises(NotImplementedError, match="Cannot attach nodes to: .*"): - Tree(Const("xyz")).attach_nodes([]) + Tree(Const("xyz")).attach_child_nodes([]) def test_extend_globals_raises_not_implemented_error_for_constant_node() -> None: From 70aa5bff0f3a175eafab65d1b4e7a2cb1479f1cf Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Fri, 17 Jan 2025 11:55:11 +0100 Subject: [PATCH 43/94] Use type over Type from type hinting --- src/databricks/labs/ucx/source_code/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/base.py b/src/databricks/labs/ucx/source_code/base.py index fe3f06c9f1..609f1b4397 100644 --- a/src/databricks/labs/ucx/source_code/base.py +++ b/src/databricks/labs/ucx/source_code/base.py @@ -10,7 +10,7 @@ from dataclasses import dataclass, field from datetime import datetime from pathlib import Path -from typing import Any, BinaryIO, TextIO, Type, TypeVar +from typing import Any, BinaryIO, TextIO, TypeVar from astroid import NodeNG # type: ignore from sqlglot import Expression, parse as parse_sql @@ -69,7 +69,7 @@ def for_path(self, path: Path) -> LocatedAdvice: return LocatedAdvice(self, path) @classmethod - def from_node(cls: Type[T], *, code: str, message: str, node: NodeNG) -> T: + def from_node(cls: type[T], *, code: str, message: str, node: NodeNG) -> T: # Astroid lines are 1-based. return cls( code=code, From c81e6b1b0d2d835f328b0995a96ac78da2b261fc Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Fri, 17 Jan 2025 11:56:54 +0100 Subject: [PATCH 44/94] Rename attach_nodes to attach_child_nodes in Python analyzer --- src/databricks/labs/ucx/source_code/python/python_analyzer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/python/python_analyzer.py b/src/databricks/labs/ucx/source_code/python/python_analyzer.py index 9fa8fb5118..b837e529c1 100644 --- a/src/databricks/labs/ucx/source_code/python/python_analyzer.py +++ b/src/databricks/labs/ucx/source_code/python/python_analyzer.py @@ -73,7 +73,7 @@ def build_inherited_context(self, child_path: Path) -> InheritedContext: # append nodes node_line = base_node.node.lineno nodes = tree.nodes_between(last_line + 1, node_line - 1) - context.tree.attach_nodes(nodes) + context.tree.attach_child_nodes(nodes) globs = tree.globals_between(last_line + 1, node_line - 1) context.tree.extend_globals(globs) last_line = node_line @@ -86,7 +86,7 @@ def build_inherited_context(self, child_path: Path) -> InheritedContext: assert context.tree is not None, "Tree should be initialized" if last_line < line_count: nodes = tree.nodes_between(last_line + 1, line_count) - context.tree.attach_nodes(nodes) + context.tree.attach_child_nodes(nodes) globs = tree.globals_between(last_line + 1, line_count) context.tree.extend_globals(globs) return context From 3fb49a754318962bb22e14403a347871a552174f Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Fri, 17 Jan 2025 13:42:54 +0100 Subject: [PATCH 45/94] Delete test for unrealistic scenario --- tests/unit/source_code/test_functional.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/unit/source_code/test_functional.py b/tests/unit/source_code/test_functional.py index e8c79bfe86..a06b71dc40 100644 --- a/tests/unit/source_code/test_functional.py +++ b/tests/unit/source_code/test_functional.py @@ -244,10 +244,6 @@ def test_functional(sample: Functional, mock_path_lookup, simple_dependency_reso ("_child_that_uses_missing_value.py", "parent_that_dbutils_runs_child_that_misses_value_from_parent.py"), ("_child_that_uses_value_from_parent.py", "grand_parent_that_dbutils_runs_parent_that_magic_runs_child.py"), ("_child_that_uses_missing_value.py", "parent_that_imports_child_that_misses_value_from_parent.py"), - # TODO: Confirm to delete this test as it is a file that imports a notebook that runs the child, which is not a - # real case and it results in the imported notebook to be treated as a file as the ImportResolver only returns - # containers for files not for notebooks (as it should) - # ("_child_that_uses_value_from_parent.py", "grand_parent_that_imports_parent_that_magic_runs_child.py"), ], ) def test_functional_with_parent( From 1ff98911c2020bd53decf9dc536b8cb8b6cbc3af Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Fri, 17 Jan 2025 13:45:44 +0100 Subject: [PATCH 46/94] Rename method to parse trees --- src/databricks/labs/ucx/source_code/notebooks/sources.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/notebooks/sources.py b/src/databricks/labs/ucx/source_code/notebooks/sources.py index 915e7e729c..fac25b9996 100644 --- a/src/databricks/labs/ucx/source_code/notebooks/sources.py +++ b/src/databricks/labs/ucx/source_code/notebooks/sources.py @@ -162,7 +162,7 @@ def __init__( self._python_trees: dict[PythonCell, Tree] = {} # the original trees to be linted def lint(self) -> Iterable[Advice]: - failure = self._load_tree_from_notebook(self._notebook, True, parent_tree=self._inherited_tree) + failure = self._parse_trees(self._notebook, True, parent_tree=self._inherited_tree) if failure: yield failure return @@ -183,7 +183,7 @@ def lint(self) -> Iterable[Advice]: ) return - def _load_tree_from_notebook( + def _parse_trees( self, notebook: Notebook, register_trees: bool, *, parent_tree: Tree | None ) -> Failure | None: for cell in notebook.cells: @@ -274,7 +274,7 @@ def _load_children_from_base_node(self, base_node: NodeBase, *, parent_tree: Tre node=base_node.node, ) return failure - return self._load_tree_from_notebook(notebook, False, parent_tree=parent_tree) + return self._parse_trees(notebook, False, parent_tree=parent_tree) return None def _mutate_path_lookup(self, change: SysPathChange) -> Failure | None: @@ -293,7 +293,7 @@ def _load_tree_from_run_cell(self, run_cell: RunCell, *, parent_tree: Tree | Non return None # malformed run cell already reported notebook = self._load_source_from_path(path) if notebook is not None: - return self._load_tree_from_notebook(notebook, False, parent_tree=parent_tree) + return self._parse_trees(notebook, False, parent_tree=parent_tree) return None def _load_source_from_path(self, path: Path | None) -> Notebook | None: From 273a40dd848e735665a82fcb6f8f7669fd01a557 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Fri, 17 Jan 2025 13:53:12 +0100 Subject: [PATCH 47/94] Add tests for notebook linter --- .../source_code/notebooks/test_sources.py | 33 ++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/tests/unit/source_code/notebooks/test_sources.py b/tests/unit/source_code/notebooks/test_sources.py index 91fa16950b..bfd42b2e00 100644 --- a/tests/unit/source_code/notebooks/test_sources.py +++ b/tests/unit/source_code/notebooks/test_sources.py @@ -4,10 +4,11 @@ from unittest.mock import create_autospec import pytest +from databricks.sdk.service.workspace import Language from databricks.labs.ucx.source_code.base import CurrentSessionState from databricks.labs.ucx.source_code.linters.context import LinterContext -from databricks.labs.ucx.source_code.notebooks.sources import FileLinter +from databricks.labs.ucx.source_code.notebooks.sources import FileLinter, Notebook, NotebookLinter @pytest.mark.parametrize("path, content", [("xyz.py", "a = 3"), ("xyz.sql", "select * from dual")]) @@ -113,3 +114,33 @@ def test_file_linter_lints_file_with_missing_read_permission(migration_index, mo assert len(advices) == 1 assert advices[0].code == "file-permission" assert advices[0].message == f"Missing read permission for {path}" + + +def test_notebook_linter_lints_source_yielding_no_advices(migration_index, mock_path_lookup) -> None: + linter = NotebookLinter.from_source( + migration_index, + mock_path_lookup, + CurrentSessionState(), + "# Databricks notebook source\nprint(1)", + Language.PYTHON, + ) + + advices = list(linter.lint()) + + assert not advices, "Expected no advices" + + +def test_notebook_linter_lints_parent_child_context_from_grand_parent(migration_index, mock_path_lookup) -> None: + """Verify the NotebookLinter can resolve %run""" + path = Path(__file__).parent.parent / "samples" / "parent-child-context" / "grand_parent.py" + notebook = Notebook.parse(path, path.read_text(), Language.PYTHON) + linter = NotebookLinter( + LinterContext(migration_index), + mock_path_lookup.change_directory(path.parent), + CurrentSessionState(), + notebook, + ) + + advices = list(linter.lint()) + + assert not advices, "Expected no advices" From 7728719f33b82752a0e901bd34e8ed292de6a9a2 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Fri, 17 Jan 2025 14:02:11 +0100 Subject: [PATCH 48/94] Test for a table migration deprecation advice to be given --- .../source_code/notebooks/test_sources.py | 34 ++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/tests/unit/source_code/notebooks/test_sources.py b/tests/unit/source_code/notebooks/test_sources.py index bfd42b2e00..0c05200a54 100644 --- a/tests/unit/source_code/notebooks/test_sources.py +++ b/tests/unit/source_code/notebooks/test_sources.py @@ -6,7 +6,7 @@ import pytest from databricks.sdk.service.workspace import Language -from databricks.labs.ucx.source_code.base import CurrentSessionState +from databricks.labs.ucx.source_code.base import CurrentSessionState, Deprecation from databricks.labs.ucx.source_code.linters.context import LinterContext from databricks.labs.ucx.source_code.notebooks.sources import FileLinter, Notebook, NotebookLinter @@ -144,3 +144,35 @@ def test_notebook_linter_lints_parent_child_context_from_grand_parent(migration_ advices = list(linter.lint()) assert not advices, "Expected no advices" + + +def test_notebook_linter_lints_migrated_table(migration_index, mock_path_lookup) -> None: + """Regression test with the tests below.""" + source = """ +# Databricks notebook source + +table_name = "old.things" # Migrated table according to the migration index + +# COMMAND ---------- + +spark.table(table_name) +""".lstrip() + linter = NotebookLinter.from_source( + migration_index, + mock_path_lookup, + CurrentSessionState(), + source, + Language.PYTHON, + ) + + advices = list(linter.lint()) + + assert advices + assert advices[0] == Deprecation( + code='table-migrated-to-uc', + message='Table old.things is migrated to brand.new.stuff in Unity Catalog', + start_line=6, + start_col=0, + end_line=6, + end_col=23, + ) From 772684c8a4968078799f3e26b1a88b5aa91d38d6 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Fri, 17 Jan 2025 14:07:20 +0100 Subject: [PATCH 49/94] Test for notebook cells to consider only code above --- .../source_code/notebooks/test_sources.py | 39 +++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/tests/unit/source_code/notebooks/test_sources.py b/tests/unit/source_code/notebooks/test_sources.py index 0c05200a54..3315ec5ea2 100644 --- a/tests/unit/source_code/notebooks/test_sources.py +++ b/tests/unit/source_code/notebooks/test_sources.py @@ -176,3 +176,42 @@ def test_notebook_linter_lints_migrated_table(migration_index, mock_path_lookup) end_line=6, end_col=23, ) + + +def test_notebook_linter_lints_migrated_table_with_rename(migration_index, mock_path_lookup) -> None: + """The spark.table should read the table defined above the call not below. + + This is a regression test with the test above and below. + """ + source = """ +# Databricks notebook source + +table_name = "old.things" # Migrated table according to the migration index + +# COMMAND ---------- + +spark.table(table_name) + +# COMMAND ---------- + +table_name = "not_migrated.table" # NOT a migrated table according to the migration index +""".lstrip() + linter = NotebookLinter.from_source( + migration_index, + mock_path_lookup, + CurrentSessionState(), + source, + Language.PYTHON, + ) + + advices = list(linter.lint()) + + assert advices + assert advices[0] == Deprecation( + code='table-migrated-to-uc', + message='Table old.things is migrated to brand.new.stuff in Unity Catalog', + start_line=6, + start_col=0, + end_line=6, + end_col=23, + ) From 2f4c7faf1fc5943c28f419593d56826a15b6aacf Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Fri, 17 Jan 2025 14:08:27 +0100 Subject: [PATCH 50/94] Test inverse of previous commit --- .../source_code/notebooks/test_sources.py | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/tests/unit/source_code/notebooks/test_sources.py b/tests/unit/source_code/notebooks/test_sources.py index 3315ec5ea2..b87c942e8e 100644 --- a/tests/unit/source_code/notebooks/test_sources.py +++ b/tests/unit/source_code/notebooks/test_sources.py @@ -215,3 +215,34 @@ def test_notebook_linter_lints_migrated_table_with_rename(migration_index, mock_ end_line=6, end_col=23, ) + + +def test_notebook_linter_lints_not_migrated_table_with_rename(migration_index, mock_path_lookup) -> None: + """The spark.table should read the table defined above the call not below. + + This is a regression test with the tests above. + """ + source = """ +# Databricks notebook source + +table_name = "not_migrated.table" # NOT a migrated table according to the migration index + +# COMMAND ---------- + +spark.table(table_name) + +# COMMAND ---------- + +table_name = "old.things" # Migrated table according to the migration index +""".lstrip() + linter = NotebookLinter.from_source( + migration_index, + mock_path_lookup, + CurrentSessionState(), + source, + Language.PYTHON, + ) + + advices = list(linter.lint()) + + assert not [advice for advice in advices if advice.code == "table-migrated-to-uc"] From 5441ec554686798b150e3589c9be772f8a5bf79f Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Fri, 17 Jan 2025 14:10:23 +0100 Subject: [PATCH 51/94] Test inverse of reading table from other cell --- .../source_code/notebooks/test_sources.py | 26 ++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/tests/unit/source_code/notebooks/test_sources.py b/tests/unit/source_code/notebooks/test_sources.py index b87c942e8e..cab3c9b842 100644 --- a/tests/unit/source_code/notebooks/test_sources.py +++ b/tests/unit/source_code/notebooks/test_sources.py @@ -178,10 +178,34 @@ def test_notebook_linter_lints_migrated_table(migration_index, mock_path_lookup) ) +def test_notebook_linter_lints_not_migrated_table(migration_index, mock_path_lookup) -> None: + """Regression test with the tests above and below.""" + source = """ +# Databricks notebook source + +table_name = "not_migrated.table" # NOT a migrated table according to the migration index + +# COMMAND ---------- + +spark.table(table_name) +""".lstrip() + linter = NotebookLinter.from_source( + migration_index, + mock_path_lookup, + CurrentSessionState(), + source, + Language.PYTHON, + ) + + advices = list(linter.lint()) + + assert not [advice for advice in advices if advice.code == "table-migrated-to-uc"] + + def test_notebook_linter_lints_migrated_table_with_rename(migration_index, mock_path_lookup) -> None: """The spark.table should read the table defined above the call not below. - This is a regression test with the test above and below. + This is a regression test with the tests above and below. """ source = """ # Databricks notebook source From d6c0afc3f6bbbb5f601556be4618d4f9c7edfc08 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Fri, 17 Jan 2025 14:17:07 +0100 Subject: [PATCH 52/94] Format --- src/databricks/labs/ucx/source_code/notebooks/sources.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/notebooks/sources.py b/src/databricks/labs/ucx/source_code/notebooks/sources.py index fac25b9996..59326134e3 100644 --- a/src/databricks/labs/ucx/source_code/notebooks/sources.py +++ b/src/databricks/labs/ucx/source_code/notebooks/sources.py @@ -183,9 +183,7 @@ def lint(self) -> Iterable[Advice]: ) return - def _parse_trees( - self, notebook: Notebook, register_trees: bool, *, parent_tree: Tree | None - ) -> Failure | None: + def _parse_trees(self, notebook: Notebook, register_trees: bool, *, parent_tree: Tree | None) -> Failure | None: for cell in notebook.cells: failure = None if isinstance(cell, RunCell): From 435f6c8f55014162a8179737ce9929069c2781c8 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Fri, 17 Jan 2025 14:37:03 +0100 Subject: [PATCH 53/94] Let PythonSequentialLinter inherit correctly --- .../labs/ucx/source_code/python/python_ast.py | 36 ++----------------- 1 file changed, 2 insertions(+), 34 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/python/python_ast.py b/src/databricks/labs/ucx/source_code/python/python_ast.py index df40993d29..9fef90b3f4 100644 --- a/src/databricks/labs/ucx/source_code/python/python_ast.py +++ b/src/databricks/labs/ucx/source_code/python/python_ast.py @@ -668,7 +668,8 @@ def collect_dfsas(self, source_code: str) -> Iterable[DirectFsAccess]: def collect_dfsas_from_tree(self, tree: Tree) -> Iterable[DirectFsAccessNode]: ... -class PythonSequentialLinter(Linter, DfsaCollector, TableCollector): +class PythonSequentialLinter(PythonLinter, DfsaPyCollector, TablePyCollector): + """A linter for sequencing python linters and collectors.""" def __init__( self, @@ -680,47 +681,14 @@ def __init__( self._dfsa_collectors = dfsa_collectors self._table_collectors = table_collectors - def lint(self, code: str) -> Iterable[Advice]: - maybe_tree = self._parse_and_append(code) - if maybe_tree.failure: - yield maybe_tree.failure - return - assert maybe_tree.tree is not None - yield from self.lint_tree(maybe_tree.tree) - def lint_tree(self, tree: Tree) -> Iterable[Advice]: for linter in self._linters: yield from linter.lint_tree(tree) - def _parse_and_append(self, code: str) -> MaybeTree: - maybe_tree = Tree.maybe_normalized_parse(code) - if maybe_tree.failure: - return maybe_tree - assert maybe_tree.tree is not None - return maybe_tree - - def collect_dfsas(self, source_code: str) -> Iterable[DirectFsAccess]: - maybe_tree = self._parse_and_append(source_code) - if maybe_tree.failure: - logger.warning(maybe_tree.failure.message) - return - assert maybe_tree.tree is not None - for dfsa_node in self.collect_dfsas_from_tree(maybe_tree.tree): - yield dfsa_node.dfsa - def collect_dfsas_from_tree(self, tree: Tree) -> Iterable[DirectFsAccessNode]: for collector in self._dfsa_collectors: yield from collector.collect_dfsas_from_tree(tree) - def collect_tables(self, source_code: str) -> Iterable[UsedTable]: - maybe_tree = self._parse_and_append(source_code) - if maybe_tree.failure: - logger.warning(maybe_tree.failure.message) - return - assert maybe_tree.tree is not None - for table_node in self.collect_tables_from_tree(maybe_tree.tree): - yield table_node.table - def collect_tables_from_tree(self, tree: Tree) -> Iterable[UsedTableNode]: for collector in self._table_collectors: yield from collector.collect_tables_from_tree(tree) From fd8f04d6a9b9bd455f2a49ebfdc9e66915d6052e Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Fri, 17 Jan 2025 14:47:14 +0100 Subject: [PATCH 54/94] Remove PythonSequentialLinter initialization from NotebookLinter init TODO https://github.com/databrickslabs/ucx/issues/3544 --- .../labs/ucx/source_code/notebooks/sources.py | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/notebooks/sources.py b/src/databricks/labs/ucx/source_code/notebooks/sources.py index 59326134e3..6b87b58228 100644 --- a/src/databricks/labs/ucx/source_code/notebooks/sources.py +++ b/src/databricks/labs/ucx/source_code/notebooks/sources.py @@ -20,7 +20,6 @@ Advice, CurrentSessionState, Failure, - Linter, ) from databricks.labs.ucx.source_code.graph import ( @@ -35,7 +34,7 @@ UnresolvedPath, ) from databricks.labs.ucx.source_code.notebooks.magic import MagicLine -from databricks.labs.ucx.source_code.python.python_ast import NodeBase, PythonSequentialLinter, Tree +from databricks.labs.ucx.source_code.python.python_ast import NodeBase, PythonLinter, Tree from databricks.labs.ucx.source_code.notebooks.cells import ( CellLanguage, Cell, @@ -156,9 +155,6 @@ def __init__( self._notebook: Notebook = notebook self._inherited_tree = inherited_tree - # reuse Python linter across related files and notebook cells - # this is required in order to accumulate statements for improved inference - self._python_linter: PythonSequentialLinter = cast(PythonSequentialLinter, context.linter(Language.PYTHON)) self._python_trees: dict[PythonCell, Tree] = {} # the original trees to be linted def lint(self) -> Iterable[Advice]: @@ -167,13 +163,15 @@ def lint(self) -> Iterable[Advice]: yield failure return for cell in self._notebook.cells: - if not self._context.is_supported(cell.language.language): + try: + linter = self._context.linter(cell.language.language) + except ValueError: # Language is not supported (yet) continue if isinstance(cell, PythonCell): + linter = cast(PythonLinter, linter) tree = self._python_trees[cell] - advices = self._python_linter.lint_tree(tree) + advices = linter.lint_tree(tree) else: - linter = self._linter(cell.language.language) advices = linter.lint(cell.original_code) for advice in advices: yield dataclasses.replace( @@ -311,11 +309,6 @@ def _load_source_from_path(self, path: Path | None) -> Notebook | None: return None return Notebook.parse(path, source, language) - def _linter(self, language: Language) -> Linter: - if language is Language.PYTHON: - return self._python_linter - return self._context.linter(language) - @staticmethod def name() -> str: return "notebook-linter" From 7209aacb5abc20573aa3ad9ec9e8d9d6cebe71fb Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Fri, 17 Jan 2025 14:59:08 +0100 Subject: [PATCH 55/94] Test NotebookLinter to lint parse failure --- .../source_code/notebooks/test_sources.py | 25 ++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/tests/unit/source_code/notebooks/test_sources.py b/tests/unit/source_code/notebooks/test_sources.py index cab3c9b842..164376daeb 100644 --- a/tests/unit/source_code/notebooks/test_sources.py +++ b/tests/unit/source_code/notebooks/test_sources.py @@ -6,7 +6,7 @@ import pytest from databricks.sdk.service.workspace import Language -from databricks.labs.ucx.source_code.base import CurrentSessionState, Deprecation +from databricks.labs.ucx.source_code.base import CurrentSessionState, Deprecation, Failure from databricks.labs.ucx.source_code.linters.context import LinterContext from databricks.labs.ucx.source_code.notebooks.sources import FileLinter, Notebook, NotebookLinter @@ -130,6 +130,29 @@ def test_notebook_linter_lints_source_yielding_no_advices(migration_index, mock_ assert not advices, "Expected no advices" +def test_notebook_linter_lints_source_yielding_parse_failure(migration_index, mock_path_lookup) -> None: + linter = NotebookLinter.from_source( + migration_index, + mock_path_lookup, + CurrentSessionState(), + "# Databricks notebook source\nprint(1", # Missing parenthesis is on purpose + Language.PYTHON, + ) + + advices = list(linter.lint()) + + assert advices == [ + Failure( + code='python-parse-error', + message='Failed to parse code due to invalid syntax: print(1', + start_line=0, + start_col=5, + end_line=0, + end_col=1 + ) + ] + + def test_notebook_linter_lints_parent_child_context_from_grand_parent(migration_index, mock_path_lookup) -> None: """Verify the NotebookLinter can resolve %run""" path = Path(__file__).parent.parent / "samples" / "parent-child-context" / "grand_parent.py" From f2dbc56a0da1a8792ff8cdd111e1a54c493af6a3 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Sat, 18 Jan 2025 08:10:50 +0100 Subject: [PATCH 56/94] Remove redundant if statement --- src/databricks/labs/ucx/source_code/notebooks/sources.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/notebooks/sources.py b/src/databricks/labs/ucx/source_code/notebooks/sources.py index 6b87b58228..cc9cf4f916 100644 --- a/src/databricks/labs/ucx/source_code/notebooks/sources.py +++ b/src/databricks/labs/ucx/source_code/notebooks/sources.py @@ -205,9 +205,7 @@ def _load_tree_from_python_cell( if register_trees: # A cell with only comments will not produce a tree self._python_trees[python_cell] = maybe_tree.tree or Tree.new_module() - if maybe_tree.tree: - return self._load_children_from_tree(maybe_tree.tree) - return None + return self._load_children_from_tree(maybe_tree.tree) def _load_children_from_tree(self, tree: Tree) -> Failure | None: assert isinstance(tree.node, Module) From 2603d1486121616831e81f56314ab5b44f3de0f2 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Mon, 20 Jan 2025 14:31:14 +0100 Subject: [PATCH 57/94] Format --- tests/unit/source_code/notebooks/test_sources.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/source_code/notebooks/test_sources.py b/tests/unit/source_code/notebooks/test_sources.py index 164376daeb..fa74d5eb8c 100644 --- a/tests/unit/source_code/notebooks/test_sources.py +++ b/tests/unit/source_code/notebooks/test_sources.py @@ -148,7 +148,7 @@ def test_notebook_linter_lints_source_yielding_parse_failure(migration_index, mo start_line=0, start_col=5, end_line=0, - end_col=1 + end_col=1, ) ] From 23e5a497137e2e62dfea2960085f2a543d59e6b2 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Mon, 20 Jan 2025 14:39:52 +0100 Subject: [PATCH 58/94] Rewrite load children from tree --- .../labs/ucx/source_code/notebooks/sources.py | 42 +++++++++---------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/notebooks/sources.py b/src/databricks/labs/ucx/source_code/notebooks/sources.py index cc9cf4f916..22d5bc991a 100644 --- a/src/databricks/labs/ucx/source_code/notebooks/sources.py +++ b/src/databricks/labs/ucx/source_code/notebooks/sources.py @@ -34,7 +34,7 @@ UnresolvedPath, ) from databricks.labs.ucx.source_code.notebooks.magic import MagicLine -from databricks.labs.ucx.source_code.python.python_ast import NodeBase, PythonLinter, Tree +from databricks.labs.ucx.source_code.python.python_ast import PythonLinter, Tree from databricks.labs.ucx.source_code.notebooks.cells import ( CellLanguage, Cell, @@ -208,31 +208,29 @@ def _load_tree_from_python_cell( return self._load_children_from_tree(maybe_tree.tree) def _load_children_from_tree(self, tree: Tree) -> Failure | None: - assert isinstance(tree.node, Module) - # look for child notebooks (and sys.path changes that might affect their loading) - base_nodes: list[NodeBase] = [] - base_nodes.extend(self._list_run_magic_lines(tree)) - base_nodes.extend(SysPathChange.extract_from_tree(self._session_state, tree)) - if len(base_nodes) == 0: + """Load children from tree by looking for child notebooks and path changes that might affect their loading.""" + code_path_nodes = self._list_magic_lines_with_run_command(tree) + SysPathChange.extract_from_tree( + self._session_state, tree + ) + if len(code_path_nodes) == 0: return None - # need to execute things in intertwined sequence so concat and sort them nodes = list(cast(Module, tree.node).body) - base_nodes = sorted(base_nodes, key=lambda node: (node.node.lineno, node.node.col_offset)) + # Sys path changes require to load children in order of reading + base_nodes = sorted(code_path_nodes, key=lambda node: (node.node.lineno, node.node.col_offset)) return self._load_children_with_base_nodes(nodes, base_nodes, parent_tree=tree) @staticmethod - def _list_run_magic_lines(tree: Tree) -> Iterable[MagicLine]: - - def _ignore_problem(_code: str, _message: str, _node: NodeNG) -> None: - return None - - commands, _ = MagicLine.extract_from_tree(tree, _ignore_problem) - for command in commands: - if isinstance(command.as_magic(), RunCommand): - yield command + def _list_magic_lines_with_run_command(tree: Tree) -> list[MagicLine]: + """List the magic lines with a run command""" + run_commands = [] + magic_lines, _ = MagicLine.extract_from_tree(tree, lambda code, message, node: None) + for magic_line in magic_lines: + if isinstance(magic_line.as_magic(), RunCommand): + run_commands.append(magic_line) + return run_commands def _load_children_with_base_nodes( - self, nodes: list[NodeNG], base_nodes: list[NodeBase], *, parent_tree: Tree | None + self, nodes: list[NodeNG], base_nodes: list[SysPathChange | MagicLine], *, parent_tree: Tree | None ) -> Failure | None: for base_node in base_nodes: failure = self._load_children_with_base_node(nodes, base_node, parent_tree=parent_tree) @@ -241,7 +239,7 @@ def _load_children_with_base_nodes( return None def _load_children_with_base_node( - self, nodes: list[NodeNG], base_node: NodeBase, *, parent_tree: Tree | None + self, nodes: list[NodeNG], base_node: SysPathChange | MagicLine, *, parent_tree: Tree | None ) -> Failure | None: while len(nodes) > 0: node = nodes.pop(0) @@ -252,7 +250,9 @@ def _load_children_with_base_node( return failure return None - def _load_children_from_base_node(self, base_node: NodeBase, *, parent_tree: Tree | None) -> Failure | None: + def _load_children_from_base_node( + self, base_node: SysPathChange | MagicLine, *, parent_tree: Tree | None + ) -> Failure | None: if isinstance(base_node, SysPathChange): failure = self._mutate_path_lookup(base_node) if failure: From 8cef0ef68c79334aa86f8436a83470c9c896d819 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Mon, 20 Jan 2025 14:45:09 +0100 Subject: [PATCH 59/94] Remove redundant for-loops --- .../labs/ucx/source_code/notebooks/sources.py | 32 +++---------------- 1 file changed, 5 insertions(+), 27 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/notebooks/sources.py b/src/databricks/labs/ucx/source_code/notebooks/sources.py index 22d5bc991a..65186222c0 100644 --- a/src/databricks/labs/ucx/source_code/notebooks/sources.py +++ b/src/databricks/labs/ucx/source_code/notebooks/sources.py @@ -7,7 +7,6 @@ from pathlib import Path from typing import cast -from astroid import Module, NodeNG # type: ignore from databricks.sdk.service.workspace import Language @@ -212,12 +211,12 @@ def _load_children_from_tree(self, tree: Tree) -> Failure | None: code_path_nodes = self._list_magic_lines_with_run_command(tree) + SysPathChange.extract_from_tree( self._session_state, tree ) - if len(code_path_nodes) == 0: - return None - nodes = list(cast(Module, tree.node).body) # Sys path changes require to load children in order of reading - base_nodes = sorted(code_path_nodes, key=lambda node: (node.node.lineno, node.node.col_offset)) - return self._load_children_with_base_nodes(nodes, base_nodes, parent_tree=tree) + for base_node in sorted(code_path_nodes, key=lambda node: (node.node.lineno, node.node.col_offset)): + failure = self._load_children_from_base_node(base_node, parent_tree=tree) + if failure: + return failure + return None @staticmethod def _list_magic_lines_with_run_command(tree: Tree) -> list[MagicLine]: @@ -229,27 +228,6 @@ def _list_magic_lines_with_run_command(tree: Tree) -> list[MagicLine]: run_commands.append(magic_line) return run_commands - def _load_children_with_base_nodes( - self, nodes: list[NodeNG], base_nodes: list[SysPathChange | MagicLine], *, parent_tree: Tree | None - ) -> Failure | None: - for base_node in base_nodes: - failure = self._load_children_with_base_node(nodes, base_node, parent_tree=parent_tree) - if failure: - return failure - return None - - def _load_children_with_base_node( - self, nodes: list[NodeNG], base_node: SysPathChange | MagicLine, *, parent_tree: Tree | None - ) -> Failure | None: - while len(nodes) > 0: - node = nodes.pop(0) - if node.lineno < base_node.node.lineno: - continue - failure = self._load_children_from_base_node(base_node, parent_tree=parent_tree) - if failure: - return failure - return None - def _load_children_from_base_node( self, base_node: SysPathChange | MagicLine, *, parent_tree: Tree | None ) -> Failure | None: From 82f3ad1f7115379112f17a20e66b13bb5f16d5c1 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Mon, 20 Jan 2025 14:46:27 +0100 Subject: [PATCH 60/94] Remove unused name method --- .../labs/ucx/source_code/notebooks/sources.py | 4 ---- tests/unit/source_code/test_notebook_linter.py | 13 ------------- 2 files changed, 17 deletions(-) delete mode 100644 tests/unit/source_code/test_notebook_linter.py diff --git a/src/databricks/labs/ucx/source_code/notebooks/sources.py b/src/databricks/labs/ucx/source_code/notebooks/sources.py index 65186222c0..99e282aba1 100644 --- a/src/databricks/labs/ucx/source_code/notebooks/sources.py +++ b/src/databricks/labs/ucx/source_code/notebooks/sources.py @@ -285,10 +285,6 @@ def _load_source_from_path(self, path: Path | None) -> Notebook | None: return None return Notebook.parse(path, source, language) - @staticmethod - def name() -> str: - return "notebook-linter" - class FileLinter: _NOT_YET_SUPPORTED_SUFFIXES = { diff --git a/tests/unit/source_code/test_notebook_linter.py b/tests/unit/source_code/test_notebook_linter.py deleted file mode 100644 index 3c8e352a39..0000000000 --- a/tests/unit/source_code/test_notebook_linter.py +++ /dev/null @@ -1,13 +0,0 @@ -from databricks.sdk.service.workspace import Language - -from databricks.labs.ucx.hive_metastore.table_migration_status import TableMigrationIndex -from databricks.labs.ucx.source_code.base import CurrentSessionState -from databricks.labs.ucx.source_code.notebooks.sources import NotebookLinter - -index = TableMigrationIndex([]) - - -def test_notebook_linter_name(mock_path_lookup) -> None: - source = """-- Databricks notebook source""" - linter = NotebookLinter.from_source(index, mock_path_lookup, CurrentSessionState(), source, Language.SQL) - assert linter.name() == "notebook-linter" From ce23e4f56efa98bc8bd9faf28bfd080eb815ed12 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Mon, 20 Jan 2025 14:51:29 +0100 Subject: [PATCH 61/94] Move load tree from run cell up --- .../labs/ucx/source_code/notebooks/sources.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/notebooks/sources.py b/src/databricks/labs/ucx/source_code/notebooks/sources.py index 99e282aba1..8a21970105 100644 --- a/src/databricks/labs/ucx/source_code/notebooks/sources.py +++ b/src/databricks/labs/ucx/source_code/notebooks/sources.py @@ -192,6 +192,15 @@ def _parse_trees(self, notebook: Notebook, register_trees: bool, *, parent_tree: return failure return None + def _load_tree_from_run_cell(self, run_cell: RunCell, *, parent_tree: Tree | None = None) -> Failure | None: + path = run_cell.maybe_notebook_path() + if path is None: + return None # malformed run cell already reported + notebook = self._load_source_from_path(path) + if notebook is not None: + return self._parse_trees(notebook, False, parent_tree=parent_tree) + return None + def _load_tree_from_python_cell( self, python_cell: PythonCell, register_trees: bool, *, parent_tree: Tree | None ) -> Failure | None: @@ -259,15 +268,6 @@ def _mutate_path_lookup(self, change: SysPathChange) -> Failure | None: change.apply_to(self._path_lookup) return None - def _load_tree_from_run_cell(self, run_cell: RunCell, *, parent_tree: Tree | None = None) -> Failure | None: - path = run_cell.maybe_notebook_path() - if path is None: - return None # malformed run cell already reported - notebook = self._load_source_from_path(path) - if notebook is not None: - return self._parse_trees(notebook, False, parent_tree=parent_tree) - return None - def _load_source_from_path(self, path: Path | None) -> Notebook | None: if path is None: return None # already reported during dependency building From 71762769403036abc8e3f1a4510f8481e3967f3b Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Mon, 20 Jan 2025 14:56:27 +0100 Subject: [PATCH 62/94] Rename methods for consistency --- .../labs/ucx/source_code/notebooks/sources.py | 36 +++++++++---------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/notebooks/sources.py b/src/databricks/labs/ucx/source_code/notebooks/sources.py index 8a21970105..1e4735b58d 100644 --- a/src/databricks/labs/ucx/source_code/notebooks/sources.py +++ b/src/databricks/labs/ucx/source_code/notebooks/sources.py @@ -157,7 +157,7 @@ def __init__( self._python_trees: dict[PythonCell, Tree] = {} # the original trees to be linted def lint(self) -> Iterable[Advice]: - failure = self._parse_trees(self._notebook, True, parent_tree=self._inherited_tree) + failure = self._parse_trees_from_notebook(self._notebook, True, parent_tree=self._inherited_tree) if failure: yield failure return @@ -180,28 +180,30 @@ def lint(self) -> Iterable[Advice]: ) return - def _parse_trees(self, notebook: Notebook, register_trees: bool, *, parent_tree: Tree | None) -> Failure | None: + def _parse_trees_from_notebook( + self, notebook: Notebook, register_trees: bool, *, parent_tree: Tree | None + ) -> Failure | None: for cell in notebook.cells: failure = None if isinstance(cell, RunCell): - failure = self._load_tree_from_run_cell(cell, parent_tree=parent_tree) + failure = self._parse_trees_from_run_cell(cell, parent_tree=parent_tree) elif isinstance(cell, PythonCell): - failure = self._load_tree_from_python_cell(cell, register_trees, parent_tree=parent_tree) + failure = self._parse_trees_from_python_cell(cell, register_trees, parent_tree=parent_tree) parent_tree = self._python_trees.get(cell) if failure: return failure return None - def _load_tree_from_run_cell(self, run_cell: RunCell, *, parent_tree: Tree | None = None) -> Failure | None: + def _parse_trees_from_run_cell(self, run_cell: RunCell, *, parent_tree: Tree | None = None) -> Failure | None: path = run_cell.maybe_notebook_path() if path is None: return None # malformed run cell already reported - notebook = self._load_source_from_path(path) + notebook = self._parse_notebook_from_path(path) if notebook is not None: - return self._parse_trees(notebook, False, parent_tree=parent_tree) + return self._parse_trees_from_notebook(notebook, False, parent_tree=parent_tree) return None - def _load_tree_from_python_cell( + def _parse_trees_from_python_cell( self, python_cell: PythonCell, register_trees: bool, *, parent_tree: Tree | None ) -> Failure | None: maybe_tree = Tree.maybe_normalized_parse(python_cell.original_code) @@ -213,16 +215,16 @@ def _load_tree_from_python_cell( if register_trees: # A cell with only comments will not produce a tree self._python_trees[python_cell] = maybe_tree.tree or Tree.new_module() - return self._load_children_from_tree(maybe_tree.tree) + return self._parse_child_trees_from_tree(maybe_tree.tree) - def _load_children_from_tree(self, tree: Tree) -> Failure | None: - """Load children from tree by looking for child notebooks and path changes that might affect their loading.""" + def _parse_child_trees_from_tree(self, tree: Tree) -> Failure | None: + """Parse child trees by looking for referred notebooks and path changes that might affect loading notebooks.""" code_path_nodes = self._list_magic_lines_with_run_command(tree) + SysPathChange.extract_from_tree( self._session_state, tree ) # Sys path changes require to load children in order of reading for base_node in sorted(code_path_nodes, key=lambda node: (node.node.lineno, node.node.col_offset)): - failure = self._load_children_from_base_node(base_node, parent_tree=tree) + failure = self._process_code_node(base_node, parent_tree=tree) if failure: return failure return None @@ -237,9 +239,7 @@ def _list_magic_lines_with_run_command(tree: Tree) -> list[MagicLine]: run_commands.append(magic_line) return run_commands - def _load_children_from_base_node( - self, base_node: SysPathChange | MagicLine, *, parent_tree: Tree | None - ) -> Failure | None: + def _process_code_node(self, base_node: SysPathChange | MagicLine, *, parent_tree: Tree | None) -> Failure | None: if isinstance(base_node, SysPathChange): failure = self._mutate_path_lookup(base_node) if failure: @@ -247,7 +247,7 @@ def _load_children_from_base_node( if isinstance(base_node, MagicLine): magic = base_node.as_magic() assert isinstance(magic, RunCommand) - notebook = self._load_source_from_path(magic.notebook_path) + notebook = self._parse_notebook_from_path(magic.notebook_path) if notebook is None: failure = Failure.from_node( code='dependency-not-found', @@ -255,7 +255,7 @@ def _load_children_from_base_node( node=base_node.node, ) return failure - return self._parse_trees(notebook, False, parent_tree=parent_tree) + return self._parse_trees_from_notebook(notebook, False, parent_tree=parent_tree) return None def _mutate_path_lookup(self, change: SysPathChange) -> Failure | None: @@ -268,7 +268,7 @@ def _mutate_path_lookup(self, change: SysPathChange) -> Failure | None: change.apply_to(self._path_lookup) return None - def _load_source_from_path(self, path: Path | None) -> Notebook | None: + def _parse_notebook_from_path(self, path: Path | None) -> Notebook | None: if path is None: return None # already reported during dependency building resolved = self._path_lookup.resolve(path) From 768706ceb8bf7c092ff329390ce50d1d71d308d4 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Mon, 20 Jan 2025 14:58:22 +0100 Subject: [PATCH 63/94] Rename Python tree cache for clarity --- src/databricks/labs/ucx/source_code/notebooks/sources.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/notebooks/sources.py b/src/databricks/labs/ucx/source_code/notebooks/sources.py index 1e4735b58d..62d8b99005 100644 --- a/src/databricks/labs/ucx/source_code/notebooks/sources.py +++ b/src/databricks/labs/ucx/source_code/notebooks/sources.py @@ -154,7 +154,8 @@ def __init__( self._notebook: Notebook = notebook self._inherited_tree = inherited_tree - self._python_trees: dict[PythonCell, Tree] = {} # the original trees to be linted + # Python trees are constructed during notebook parsing and cached for later usage + self._python_tree_cache: dict[PythonCell, Tree] = {} def lint(self) -> Iterable[Advice]: failure = self._parse_trees_from_notebook(self._notebook, True, parent_tree=self._inherited_tree) @@ -168,7 +169,7 @@ def lint(self) -> Iterable[Advice]: continue if isinstance(cell, PythonCell): linter = cast(PythonLinter, linter) - tree = self._python_trees[cell] + tree = self._python_tree_cache[cell] advices = linter.lint_tree(tree) else: advices = linter.lint(cell.original_code) @@ -189,7 +190,7 @@ def _parse_trees_from_notebook( failure = self._parse_trees_from_run_cell(cell, parent_tree=parent_tree) elif isinstance(cell, PythonCell): failure = self._parse_trees_from_python_cell(cell, register_trees, parent_tree=parent_tree) - parent_tree = self._python_trees.get(cell) + parent_tree = self._python_tree_cache.get(cell) if failure: return failure return None @@ -214,7 +215,7 @@ def _parse_trees_from_python_cell( parent_tree.attach_child_tree(maybe_tree.tree) if register_trees: # A cell with only comments will not produce a tree - self._python_trees[python_cell] = maybe_tree.tree or Tree.new_module() + self._python_tree_cache[python_cell] = maybe_tree.tree or Tree.new_module() return self._parse_child_trees_from_tree(maybe_tree.tree) def _parse_child_trees_from_tree(self, tree: Tree) -> Failure | None: From 1f44031eed1a0cb543a31e58e5048323fdf6a29f Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Mon, 20 Jan 2025 15:04:30 +0100 Subject: [PATCH 64/94] Always cache Python trees https://github.com/databrickslabs/ucx/issues/3552 --- .../labs/ucx/source_code/notebooks/sources.py | 24 ++++++++----------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/notebooks/sources.py b/src/databricks/labs/ucx/source_code/notebooks/sources.py index 62d8b99005..472b1bc8dd 100644 --- a/src/databricks/labs/ucx/source_code/notebooks/sources.py +++ b/src/databricks/labs/ucx/source_code/notebooks/sources.py @@ -155,10 +155,10 @@ def __init__( self._inherited_tree = inherited_tree # Python trees are constructed during notebook parsing and cached for later usage - self._python_tree_cache: dict[PythonCell, Tree] = {} + self._python_tree_cache: dict[tuple[Path, PythonCell], Tree] = {} # Path in key is the notebook's path def lint(self) -> Iterable[Advice]: - failure = self._parse_trees_from_notebook(self._notebook, True, parent_tree=self._inherited_tree) + failure = self._parse_trees_from_notebook(self._notebook, parent_tree=self._inherited_tree) if failure: yield failure return @@ -169,7 +169,7 @@ def lint(self) -> Iterable[Advice]: continue if isinstance(cell, PythonCell): linter = cast(PythonLinter, linter) - tree = self._python_tree_cache[cell] + tree = self._python_tree_cache[(self._notebook.path, cell)] advices = linter.lint_tree(tree) else: advices = linter.lint(cell.original_code) @@ -181,16 +181,14 @@ def lint(self) -> Iterable[Advice]: ) return - def _parse_trees_from_notebook( - self, notebook: Notebook, register_trees: bool, *, parent_tree: Tree | None - ) -> Failure | None: + def _parse_trees_from_notebook(self, notebook: Notebook, *, parent_tree: Tree | None) -> Failure | None: for cell in notebook.cells: failure = None if isinstance(cell, RunCell): failure = self._parse_trees_from_run_cell(cell, parent_tree=parent_tree) elif isinstance(cell, PythonCell): - failure = self._parse_trees_from_python_cell(cell, register_trees, parent_tree=parent_tree) - parent_tree = self._python_tree_cache.get(cell) + failure = self._parse_trees_from_python_cell(cell, notebook.path, parent_tree=parent_tree) + parent_tree = self._python_tree_cache.get((notebook.path, cell)) if failure: return failure return None @@ -201,11 +199,11 @@ def _parse_trees_from_run_cell(self, run_cell: RunCell, *, parent_tree: Tree | N return None # malformed run cell already reported notebook = self._parse_notebook_from_path(path) if notebook is not None: - return self._parse_trees_from_notebook(notebook, False, parent_tree=parent_tree) + return self._parse_trees_from_notebook(notebook, parent_tree=parent_tree) return None def _parse_trees_from_python_cell( - self, python_cell: PythonCell, register_trees: bool, *, parent_tree: Tree | None + self, python_cell: PythonCell, notebook_path: Path, *, parent_tree: Tree | None ) -> Failure | None: maybe_tree = Tree.maybe_normalized_parse(python_cell.original_code) if maybe_tree.failure: @@ -213,9 +211,7 @@ def _parse_trees_from_python_cell( assert maybe_tree.tree is not None if parent_tree: parent_tree.attach_child_tree(maybe_tree.tree) - if register_trees: - # A cell with only comments will not produce a tree - self._python_tree_cache[python_cell] = maybe_tree.tree or Tree.new_module() + self._python_tree_cache[(notebook_path, python_cell)] = maybe_tree.tree or Tree.new_module() return self._parse_child_trees_from_tree(maybe_tree.tree) def _parse_child_trees_from_tree(self, tree: Tree) -> Failure | None: @@ -256,7 +252,7 @@ def _process_code_node(self, base_node: SysPathChange | MagicLine, *, parent_tre node=base_node.node, ) return failure - return self._parse_trees_from_notebook(notebook, False, parent_tree=parent_tree) + return self._parse_trees_from_notebook(notebook, parent_tree=parent_tree) return None def _mutate_path_lookup(self, change: SysPathChange) -> Failure | None: From b80f2d385ee2e263f24570a7a0086c72c52cfdf1 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Mon, 20 Jan 2025 15:12:10 +0100 Subject: [PATCH 65/94] Rename methods for clarity --- .../labs/ucx/source_code/notebooks/sources.py | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/notebooks/sources.py b/src/databricks/labs/ucx/source_code/notebooks/sources.py index 472b1bc8dd..c9b34c44b2 100644 --- a/src/databricks/labs/ucx/source_code/notebooks/sources.py +++ b/src/databricks/labs/ucx/source_code/notebooks/sources.py @@ -158,7 +158,7 @@ def __init__( self._python_tree_cache: dict[tuple[Path, PythonCell], Tree] = {} # Path in key is the notebook's path def lint(self) -> Iterable[Advice]: - failure = self._parse_trees_from_notebook(self._notebook, parent_tree=self._inherited_tree) + failure = self._parse_notebook(self._notebook, parent_tree=self._inherited_tree) if failure: yield failure return @@ -181,28 +181,28 @@ def lint(self) -> Iterable[Advice]: ) return - def _parse_trees_from_notebook(self, notebook: Notebook, *, parent_tree: Tree | None) -> Failure | None: + def _parse_notebook(self, notebook: Notebook, *, parent_tree: Tree | None) -> Failure | None: for cell in notebook.cells: failure = None if isinstance(cell, RunCell): - failure = self._parse_trees_from_run_cell(cell, parent_tree=parent_tree) + failure = self._resolve_and_parse_run_cell(cell, parent_tree=parent_tree) elif isinstance(cell, PythonCell): - failure = self._parse_trees_from_python_cell(cell, notebook.path, parent_tree=parent_tree) + failure = self._parse_python_cell(cell, notebook.path, parent_tree=parent_tree) parent_tree = self._python_tree_cache.get((notebook.path, cell)) if failure: return failure return None - def _parse_trees_from_run_cell(self, run_cell: RunCell, *, parent_tree: Tree | None = None) -> Failure | None: + def _resolve_and_parse_run_cell(self, run_cell: RunCell, *, parent_tree: Tree | None = None) -> Failure | None: path = run_cell.maybe_notebook_path() if path is None: return None # malformed run cell already reported - notebook = self._parse_notebook_from_path(path) + notebook = self._resolve_and_parse_path(path) if notebook is not None: - return self._parse_trees_from_notebook(notebook, parent_tree=parent_tree) + return self._parse_notebook(notebook, parent_tree=parent_tree) return None - def _parse_trees_from_python_cell( + def _parse_python_cell( self, python_cell: PythonCell, notebook_path: Path, *, parent_tree: Tree | None ) -> Failure | None: maybe_tree = Tree.maybe_normalized_parse(python_cell.original_code) @@ -212,10 +212,10 @@ def _parse_trees_from_python_cell( if parent_tree: parent_tree.attach_child_tree(maybe_tree.tree) self._python_tree_cache[(notebook_path, python_cell)] = maybe_tree.tree or Tree.new_module() - return self._parse_child_trees_from_tree(maybe_tree.tree) + return self._parse_tree(maybe_tree.tree) - def _parse_child_trees_from_tree(self, tree: Tree) -> Failure | None: - """Parse child trees by looking for referred notebooks and path changes that might affect loading notebooks.""" + def _parse_tree(self, tree: Tree) -> Failure | None: + """Parse tree by looking for referred notebooks and path changes that might affect loading notebooks.""" code_path_nodes = self._list_magic_lines_with_run_command(tree) + SysPathChange.extract_from_tree( self._session_state, tree ) @@ -244,7 +244,7 @@ def _process_code_node(self, base_node: SysPathChange | MagicLine, *, parent_tre if isinstance(base_node, MagicLine): magic = base_node.as_magic() assert isinstance(magic, RunCommand) - notebook = self._parse_notebook_from_path(magic.notebook_path) + notebook = self._resolve_and_parse_path(magic.notebook_path) if notebook is None: failure = Failure.from_node( code='dependency-not-found', @@ -252,7 +252,7 @@ def _process_code_node(self, base_node: SysPathChange | MagicLine, *, parent_tre node=base_node.node, ) return failure - return self._parse_trees_from_notebook(notebook, parent_tree=parent_tree) + return self._parse_notebook(notebook, parent_tree=parent_tree) return None def _mutate_path_lookup(self, change: SysPathChange) -> Failure | None: @@ -265,7 +265,7 @@ def _mutate_path_lookup(self, change: SysPathChange) -> Failure | None: change.apply_to(self._path_lookup) return None - def _parse_notebook_from_path(self, path: Path | None) -> Notebook | None: + def _resolve_and_parse_path(self, path: Path | None) -> Notebook | None: if path is None: return None # already reported during dependency building resolved = self._path_lookup.resolve(path) From 54895fbb8017dcf78162e95b7e0b8d39d67fe3b7 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Mon, 20 Jan 2025 15:12:28 +0100 Subject: [PATCH 66/94] Rename variable --- .../labs/ucx/source_code/notebooks/sources.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/notebooks/sources.py b/src/databricks/labs/ucx/source_code/notebooks/sources.py index c9b34c44b2..ed779014fc 100644 --- a/src/databricks/labs/ucx/source_code/notebooks/sources.py +++ b/src/databricks/labs/ucx/source_code/notebooks/sources.py @@ -236,20 +236,20 @@ def _list_magic_lines_with_run_command(tree: Tree) -> list[MagicLine]: run_commands.append(magic_line) return run_commands - def _process_code_node(self, base_node: SysPathChange | MagicLine, *, parent_tree: Tree | None) -> Failure | None: - if isinstance(base_node, SysPathChange): - failure = self._mutate_path_lookup(base_node) + def _process_code_node(self, node: SysPathChange | MagicLine, *, parent_tree: Tree | None) -> Failure | None: + if isinstance(node, SysPathChange): + failure = self._mutate_path_lookup(node) if failure: return failure - if isinstance(base_node, MagicLine): - magic = base_node.as_magic() + if isinstance(node, MagicLine): + magic = node.as_magic() assert isinstance(magic, RunCommand) notebook = self._resolve_and_parse_path(magic.notebook_path) if notebook is None: failure = Failure.from_node( code='dependency-not-found', message=f"Can't locate dependency: {magic.notebook_path}", - node=base_node.node, + node=node.node, ) return failure return self._parse_notebook(notebook, parent_tree=parent_tree) From 3a8bb374ae46c31ce9751e485b24ba4a0e6a8fee Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Mon, 20 Jan 2025 15:17:14 +0100 Subject: [PATCH 67/94] Add docstrings --- .../labs/ucx/source_code/notebooks/sources.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/notebooks/sources.py b/src/databricks/labs/ucx/source_code/notebooks/sources.py index ed779014fc..4f7da2e8be 100644 --- a/src/databricks/labs/ucx/source_code/notebooks/sources.py +++ b/src/databricks/labs/ucx/source_code/notebooks/sources.py @@ -182,6 +182,7 @@ def lint(self) -> Iterable[Advice]: return def _parse_notebook(self, notebook: Notebook, *, parent_tree: Tree | None) -> Failure | None: + """Parse a notebook by parsing its cells.""" for cell in notebook.cells: failure = None if isinstance(cell, RunCell): @@ -194,10 +195,11 @@ def _parse_notebook(self, notebook: Notebook, *, parent_tree: Tree | None) -> Fa return None def _resolve_and_parse_run_cell(self, run_cell: RunCell, *, parent_tree: Tree | None = None) -> Failure | None: + """Resolve the path in the run cell and parse the notebook it refers.""" path = run_cell.maybe_notebook_path() if path is None: return None # malformed run cell already reported - notebook = self._resolve_and_parse_path(path) + notebook = self._resolve_and_parse_notebook_path(path) if notebook is not None: return self._parse_notebook(notebook, parent_tree=parent_tree) return None @@ -205,6 +207,7 @@ def _resolve_and_parse_run_cell(self, run_cell: RunCell, *, parent_tree: Tree | def _parse_python_cell( self, python_cell: PythonCell, notebook_path: Path, *, parent_tree: Tree | None ) -> Failure | None: + """Parse the Python cell.""" maybe_tree = Tree.maybe_normalized_parse(python_cell.original_code) if maybe_tree.failure: return maybe_tree.failure @@ -236,7 +239,12 @@ def _list_magic_lines_with_run_command(tree: Tree) -> list[MagicLine]: run_commands.append(magic_line) return run_commands - def _process_code_node(self, node: SysPathChange | MagicLine, *, parent_tree: Tree | None) -> Failure | None: + def _process_code_node(self, node: SysPathChange | RunCommand, *, parent_tree: Tree | None) -> Failure | None: + """Process a code node. + + 1. `SysPathChange` mutate the path lookup. + 2. `MagicLine` containing a `RunCommand` run other notebooks that should be parsed. + """ if isinstance(node, SysPathChange): failure = self._mutate_path_lookup(node) if failure: @@ -244,7 +252,7 @@ def _process_code_node(self, node: SysPathChange | MagicLine, *, parent_tree: Tr if isinstance(node, MagicLine): magic = node.as_magic() assert isinstance(magic, RunCommand) - notebook = self._resolve_and_parse_path(magic.notebook_path) + notebook = self._resolve_and_parse_notebook_path(magic.notebook_path) if notebook is None: failure = Failure.from_node( code='dependency-not-found', @@ -256,6 +264,7 @@ def _process_code_node(self, node: SysPathChange | MagicLine, *, parent_tree: Tr return None def _mutate_path_lookup(self, change: SysPathChange) -> Failure | None: + """Mutate the path lookup.""" if isinstance(change, UnresolvedPath): return Failure.from_node( code='sys-path-cannot-compute-value', @@ -265,7 +274,8 @@ def _mutate_path_lookup(self, change: SysPathChange) -> Failure | None: change.apply_to(self._path_lookup) return None - def _resolve_and_parse_path(self, path: Path | None) -> Notebook | None: + def _resolve_and_parse_notebook_path(self, path: Path | None) -> Notebook | None: + """Resolve and parse notebook path.""" if path is None: return None # already reported during dependency building resolved = self._path_lookup.resolve(path) From f46955eb3f52af369c534b1f73e3fdf51b2f17d9 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Mon, 20 Jan 2025 15:17:33 +0100 Subject: [PATCH 68/94] Remove redundant tree initialization --- src/databricks/labs/ucx/source_code/notebooks/sources.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/databricks/labs/ucx/source_code/notebooks/sources.py b/src/databricks/labs/ucx/source_code/notebooks/sources.py index 4f7da2e8be..edde7223e0 100644 --- a/src/databricks/labs/ucx/source_code/notebooks/sources.py +++ b/src/databricks/labs/ucx/source_code/notebooks/sources.py @@ -214,7 +214,7 @@ def _parse_python_cell( assert maybe_tree.tree is not None if parent_tree: parent_tree.attach_child_tree(maybe_tree.tree) - self._python_tree_cache[(notebook_path, python_cell)] = maybe_tree.tree or Tree.new_module() + self._python_tree_cache[(notebook_path, python_cell)] = maybe_tree.tree return self._parse_tree(maybe_tree.tree) def _parse_tree(self, tree: Tree) -> Failure | None: From be03c3b7759f73238ce33e64e7706dffa4fb7d2c Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Mon, 20 Jan 2025 15:32:59 +0100 Subject: [PATCH 69/94] Return failures for each notebook cell --- .../labs/ucx/source_code/notebooks/sources.py | 56 ++++++++++--------- .../source_code/notebooks/test_sources.py | 40 +++++++++++++ 2 files changed, 69 insertions(+), 27 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/notebooks/sources.py b/src/databricks/labs/ucx/source_code/notebooks/sources.py index edde7223e0..232d2500da 100644 --- a/src/databricks/labs/ucx/source_code/notebooks/sources.py +++ b/src/databricks/labs/ucx/source_code/notebooks/sources.py @@ -158,9 +158,9 @@ def __init__( self._python_tree_cache: dict[tuple[Path, PythonCell], Tree] = {} # Path in key is the notebook's path def lint(self) -> Iterable[Advice]: - failure = self._parse_notebook(self._notebook, parent_tree=self._inherited_tree) - if failure: - yield failure + failures = self._parse_notebook(self._notebook, parent_tree=self._inherited_tree) + if failures: + yield from failures return for cell in self._notebook.cells: try: @@ -181,53 +181,55 @@ def lint(self) -> Iterable[Advice]: ) return - def _parse_notebook(self, notebook: Notebook, *, parent_tree: Tree | None) -> Failure | None: + def _parse_notebook(self, notebook: Notebook, *, parent_tree: Tree | None) -> list[Failure]: """Parse a notebook by parsing its cells.""" + failures = [] for cell in notebook.cells: - failure = None if isinstance(cell, RunCell): - failure = self._resolve_and_parse_run_cell(cell, parent_tree=parent_tree) + failures.extend(self._resolve_and_parse_run_cell(cell, parent_tree=parent_tree)) elif isinstance(cell, PythonCell): - failure = self._parse_python_cell(cell, notebook.path, parent_tree=parent_tree) + failures.extend(self._parse_python_cell(cell, notebook.path, parent_tree=parent_tree)) parent_tree = self._python_tree_cache.get((notebook.path, cell)) - if failure: - return failure - return None + return failures - def _resolve_and_parse_run_cell(self, run_cell: RunCell, *, parent_tree: Tree | None = None) -> Failure | None: + def _resolve_and_parse_run_cell(self, run_cell: RunCell, *, parent_tree: Tree | None = None) -> list[Failure]: """Resolve the path in the run cell and parse the notebook it refers.""" path = run_cell.maybe_notebook_path() if path is None: - return None # malformed run cell already reported + return [] # malformed run cell already reported notebook = self._resolve_and_parse_notebook_path(path) - if notebook is not None: - return self._parse_notebook(notebook, parent_tree=parent_tree) - return None + if not notebook: + return [] + return self._parse_notebook(notebook, parent_tree=parent_tree) def _parse_python_cell( self, python_cell: PythonCell, notebook_path: Path, *, parent_tree: Tree | None - ) -> Failure | None: + ) -> list[Failure]: """Parse the Python cell.""" maybe_tree = Tree.maybe_normalized_parse(python_cell.original_code) if maybe_tree.failure: - return maybe_tree.failure + failure = dataclasses.replace( + maybe_tree.failure, + start_line=maybe_tree.failure.start_line + python_cell.original_offset, + end_line=maybe_tree.failure.end_line + python_cell.original_offset, + ) + return [failure] assert maybe_tree.tree is not None if parent_tree: parent_tree.attach_child_tree(maybe_tree.tree) self._python_tree_cache[(notebook_path, python_cell)] = maybe_tree.tree return self._parse_tree(maybe_tree.tree) - def _parse_tree(self, tree: Tree) -> Failure | None: + def _parse_tree(self, tree: Tree) -> list[Failure]: """Parse tree by looking for referred notebooks and path changes that might affect loading notebooks.""" code_path_nodes = self._list_magic_lines_with_run_command(tree) + SysPathChange.extract_from_tree( self._session_state, tree ) # Sys path changes require to load children in order of reading + failures = [] for base_node in sorted(code_path_nodes, key=lambda node: (node.node.lineno, node.node.col_offset)): - failure = self._process_code_node(base_node, parent_tree=tree) - if failure: - return failure - return None + failures.extend(self._process_code_node(base_node, parent_tree=tree)) + return failures @staticmethod def _list_magic_lines_with_run_command(tree: Tree) -> list[MagicLine]: @@ -239,7 +241,7 @@ def _list_magic_lines_with_run_command(tree: Tree) -> list[MagicLine]: run_commands.append(magic_line) return run_commands - def _process_code_node(self, node: SysPathChange | RunCommand, *, parent_tree: Tree | None) -> Failure | None: + def _process_code_node(self, node: SysPathChange | RunCommand, *, parent_tree: Tree | None) -> list[Failure]: """Process a code node. 1. `SysPathChange` mutate the path lookup. @@ -248,8 +250,8 @@ def _process_code_node(self, node: SysPathChange | RunCommand, *, parent_tree: T if isinstance(node, SysPathChange): failure = self._mutate_path_lookup(node) if failure: - return failure - if isinstance(node, MagicLine): + return [failure] + elif isinstance(node, MagicLine): magic = node.as_magic() assert isinstance(magic, RunCommand) notebook = self._resolve_and_parse_notebook_path(magic.notebook_path) @@ -259,9 +261,9 @@ def _process_code_node(self, node: SysPathChange | RunCommand, *, parent_tree: T message=f"Can't locate dependency: {magic.notebook_path}", node=node.node, ) - return failure + return [failure] return self._parse_notebook(notebook, parent_tree=parent_tree) - return None + return [] def _mutate_path_lookup(self, change: SysPathChange) -> Failure | None: """Mutate the path lookup.""" diff --git a/tests/unit/source_code/notebooks/test_sources.py b/tests/unit/source_code/notebooks/test_sources.py index fa74d5eb8c..71eab15dd3 100644 --- a/tests/unit/source_code/notebooks/test_sources.py +++ b/tests/unit/source_code/notebooks/test_sources.py @@ -153,6 +153,46 @@ def test_notebook_linter_lints_source_yielding_parse_failure(migration_index, mo ] +def test_notebook_linter_lints_source_yielding_parse_failures(migration_index, mock_path_lookup) -> None: + source = """ +# Databricks notebook source + +print(1 + +# COMMAND ---------- + +print(2 +""".lstrip() # Missing parentheses is on purpose + linter = NotebookLinter.from_source( + migration_index, + mock_path_lookup, + CurrentSessionState(), + source, + Language.PYTHON, + ) + + advices = list(linter.lint()) + + assert advices == [ + Failure( + code='python-parse-error', + message='Failed to parse code due to invalid syntax: print(1', + start_line=2, + start_col=5, + end_line=2, + end_col=1, + ), + Failure( + code='python-parse-error', + message='Failed to parse code due to invalid syntax: print(2', + start_line=6, + start_col=5, + end_line=6, + end_col=1, + ), + ] + + def test_notebook_linter_lints_parent_child_context_from_grand_parent(migration_index, mock_path_lookup) -> None: """Verify the NotebookLinter can resolve %run""" path = Path(__file__).parent.parent / "samples" / "parent-child-context" / "grand_parent.py" From 3646140ca05f3421568629657da033cc258f96f0 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Mon, 20 Jan 2025 15:33:28 +0100 Subject: [PATCH 70/94] Fix expected start and end line --- tests/unit/source_code/notebooks/test_sources.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/source_code/notebooks/test_sources.py b/tests/unit/source_code/notebooks/test_sources.py index 71eab15dd3..723eb2e733 100644 --- a/tests/unit/source_code/notebooks/test_sources.py +++ b/tests/unit/source_code/notebooks/test_sources.py @@ -145,9 +145,9 @@ def test_notebook_linter_lints_source_yielding_parse_failure(migration_index, mo Failure( code='python-parse-error', message='Failed to parse code due to invalid syntax: print(1', - start_line=0, + start_line=1, start_col=5, - end_line=0, + end_line=1, end_col=1, ) ] From b7990769947c9befb59aa00269f55604cff70927 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Mon, 20 Jan 2025 15:34:23 +0100 Subject: [PATCH 71/94] Fix type hint --- src/databricks/labs/ucx/source_code/notebooks/sources.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/notebooks/sources.py b/src/databricks/labs/ucx/source_code/notebooks/sources.py index 232d2500da..6d23cf4181 100644 --- a/src/databricks/labs/ucx/source_code/notebooks/sources.py +++ b/src/databricks/labs/ucx/source_code/notebooks/sources.py @@ -225,8 +225,8 @@ def _parse_tree(self, tree: Tree) -> list[Failure]: code_path_nodes = self._list_magic_lines_with_run_command(tree) + SysPathChange.extract_from_tree( self._session_state, tree ) - # Sys path changes require to load children in order of reading failures = [] + # Sys path changes require to load children in order of reading for base_node in sorted(code_path_nodes, key=lambda node: (node.node.lineno, node.node.col_offset)): failures.extend(self._process_code_node(base_node, parent_tree=tree)) return failures @@ -241,7 +241,7 @@ def _list_magic_lines_with_run_command(tree: Tree) -> list[MagicLine]: run_commands.append(magic_line) return run_commands - def _process_code_node(self, node: SysPathChange | RunCommand, *, parent_tree: Tree | None) -> list[Failure]: + def _process_code_node(self, node: SysPathChange | MagicLine, *, parent_tree: Tree | None) -> list[Failure]: """Process a code node. 1. `SysPathChange` mutate the path lookup. From da3eb20bfc5054852f2c29908c48e6e030a46118 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Mon, 20 Jan 2025 15:41:10 +0100 Subject: [PATCH 72/94] Move from_source_code class method to tester class It is only used during testing and should not be used outside --- .../source_code/notebooks/test_sources.py | 34 +++++++++++++++---- 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/tests/unit/source_code/notebooks/test_sources.py b/tests/unit/source_code/notebooks/test_sources.py index 723eb2e733..583b59a7a9 100644 --- a/tests/unit/source_code/notebooks/test_sources.py +++ b/tests/unit/source_code/notebooks/test_sources.py @@ -6,9 +6,11 @@ import pytest from databricks.sdk.service.workspace import Language +from databricks.labs.ucx.hive_metastore.table_migration_status import TableMigrationIndex from databricks.labs.ucx.source_code.base import CurrentSessionState, Deprecation, Failure from databricks.labs.ucx.source_code.linters.context import LinterContext from databricks.labs.ucx.source_code.notebooks.sources import FileLinter, Notebook, NotebookLinter +from databricks.labs.ucx.source_code.path_lookup import PathLookup @pytest.mark.parametrize("path, content", [("xyz.py", "a = 3"), ("xyz.sql", "select * from dual")]) @@ -116,8 +118,26 @@ def test_file_linter_lints_file_with_missing_read_permission(migration_index, mo assert advices[0].message == f"Missing read permission for {path}" +class _NotebookLinter(NotebookLinter): + """A helper class to construct the notebook linter from source code for testing simplification.""" + + @classmethod + def from_source_code( + cls, + index: TableMigrationIndex, + path_lookup: PathLookup, + session_state: CurrentSessionState, + source: str, + default_language: Language, + ) -> NotebookLinter: + ctx = LinterContext(index) + notebook = Notebook.parse(Path(""), source, default_language) + assert notebook is not None + return cls(ctx, path_lookup, session_state, notebook) + + def test_notebook_linter_lints_source_yielding_no_advices(migration_index, mock_path_lookup) -> None: - linter = NotebookLinter.from_source( + linter = _NotebookLinter.from_source_code( migration_index, mock_path_lookup, CurrentSessionState(), @@ -131,7 +151,7 @@ def test_notebook_linter_lints_source_yielding_no_advices(migration_index, mock_ def test_notebook_linter_lints_source_yielding_parse_failure(migration_index, mock_path_lookup) -> None: - linter = NotebookLinter.from_source( + linter = _NotebookLinter.from_source_code( migration_index, mock_path_lookup, CurrentSessionState(), @@ -163,7 +183,7 @@ def test_notebook_linter_lints_source_yielding_parse_failures(migration_index, m print(2 """.lstrip() # Missing parentheses is on purpose - linter = NotebookLinter.from_source( + linter = _NotebookLinter.from_source_code( migration_index, mock_path_lookup, CurrentSessionState(), @@ -220,7 +240,7 @@ def test_notebook_linter_lints_migrated_table(migration_index, mock_path_lookup) spark.table(table_name) """.lstrip() - linter = NotebookLinter.from_source( + linter = _NotebookLinter.from_source_code( migration_index, mock_path_lookup, CurrentSessionState(), @@ -252,7 +272,7 @@ def test_notebook_linter_lints_not_migrated_table(migration_index, mock_path_loo spark.table(table_name) """.lstrip() - linter = NotebookLinter.from_source( + linter = _NotebookLinter.from_source_code( migration_index, mock_path_lookup, CurrentSessionState(), @@ -283,7 +303,7 @@ def test_notebook_linter_lints_migrated_table_with_rename(migration_index, mock_ table_name = "not_migrated.table" # NOT a migrated table according to the migration index """.lstrip() - linter = NotebookLinter.from_source( + linter = _NotebookLinter.from_source_code( migration_index, mock_path_lookup, CurrentSessionState(), @@ -322,7 +342,7 @@ def test_notebook_linter_lints_not_migrated_table_with_rename(migration_index, m table_name = "old.things" # Migrated table according to the migration index """.lstrip() - linter = NotebookLinter.from_source( + linter = _NotebookLinter.from_source_code( migration_index, mock_path_lookup, CurrentSessionState(), From cf5679dae327967626f854d959742820a471f4cc Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Mon, 20 Jan 2025 15:41:21 +0100 Subject: [PATCH 73/94] Change elif to if --- .../labs/ucx/source_code/notebooks/sources.py | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/notebooks/sources.py b/src/databricks/labs/ucx/source_code/notebooks/sources.py index 6d23cf4181..5b56ffb548 100644 --- a/src/databricks/labs/ucx/source_code/notebooks/sources.py +++ b/src/databricks/labs/ucx/source_code/notebooks/sources.py @@ -10,7 +10,6 @@ from databricks.sdk.service.workspace import Language -from databricks.labs.ucx.hive_metastore.table_migration_status import TableMigrationIndex from databricks.labs.ucx.source_code.base import ( file_language, is_a_notebook, @@ -126,20 +125,6 @@ class NotebookLinter: to the code cells according to the language of the cell. """ - @classmethod - def from_source( - cls, - index: TableMigrationIndex, - path_lookup: PathLookup, - session_state: CurrentSessionState, - source: str, - default_language: Language, - ) -> NotebookLinter: - ctx = LinterContext(index) - notebook = Notebook.parse(Path(""), source, default_language) - assert notebook is not None - return cls(ctx, path_lookup, session_state, notebook) - def __init__( self, context: LinterContext, @@ -251,7 +236,7 @@ def _process_code_node(self, node: SysPathChange | MagicLine, *, parent_tree: Tr failure = self._mutate_path_lookup(node) if failure: return [failure] - elif isinstance(node, MagicLine): + if isinstance(node, MagicLine): magic = node.as_magic() assert isinstance(magic, RunCommand) notebook = self._resolve_and_parse_notebook_path(magic.notebook_path) From 8113828f4f0594455f8e66ba0b7c0dca9b194842 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 21 Jan 2025 08:28:32 +0100 Subject: [PATCH 74/94] Rename inherited tree to parent tree --- src/databricks/labs/ucx/source_code/notebooks/sources.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/notebooks/sources.py b/src/databricks/labs/ucx/source_code/notebooks/sources.py index 5b56ffb548..82cdca033a 100644 --- a/src/databricks/labs/ucx/source_code/notebooks/sources.py +++ b/src/databricks/labs/ucx/source_code/notebooks/sources.py @@ -131,19 +131,19 @@ def __init__( path_lookup: PathLookup, session_state: CurrentSessionState, notebook: Notebook, - inherited_tree: Tree | None = None, + parent_tree: Tree | None = None, ): self._context: LinterContext = context self._path_lookup = path_lookup self._session_state = session_state self._notebook: Notebook = notebook - self._inherited_tree = inherited_tree + self._parent_tree = parent_tree # Python trees are constructed during notebook parsing and cached for later usage self._python_tree_cache: dict[tuple[Path, PythonCell], Tree] = {} # Path in key is the notebook's path def lint(self) -> Iterable[Advice]: - failures = self._parse_notebook(self._notebook, parent_tree=self._inherited_tree) + failures = self._parse_notebook(self._notebook, parent_tree=self._parent_tree) if failures: yield from failures return From ae206b8f7a368b7f24ace72d86313ca8256b80b9 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 21 Jan 2025 08:56:18 +0100 Subject: [PATCH 75/94] Test creating run cell from notebook --- tests/unit/source_code/samples/simple_notebook.py | 4 ++++ tests/unit/source_code/samples/simple_notebook.yml | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/tests/unit/source_code/samples/simple_notebook.py b/tests/unit/source_code/samples/simple_notebook.py index 9dfdb02369..b23cc09926 100644 --- a/tests/unit/source_code/samples/simple_notebook.py +++ b/tests/unit/source_code/samples/simple_notebook.py @@ -59,3 +59,7 @@ # COMMAND ---------- # MAGIC %lsmagic + +# COMMAND ---------- + +# MAGIC %run ./test diff --git a/tests/unit/source_code/samples/simple_notebook.yml b/tests/unit/source_code/samples/simple_notebook.yml index c3fbb14ae1..78c78bd5b4 100644 --- a/tests/unit/source_code/samples/simple_notebook.yml +++ b/tests/unit/source_code/samples/simple_notebook.yml @@ -63,3 +63,7 @@ cells: starts_at_line: 60 content: | %lsmagic + - type: Run + starts_at_line: 64 + content: | + %run ./test From 4b1528fc898986c7c1158d3e2eca01a82191cd5b Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 21 Jan 2025 10:33:56 +0100 Subject: [PATCH 76/94] Test infer value from parent's child --- .../source_code/python/test_python_ast.py | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tests/unit/source_code/python/test_python_ast.py b/tests/unit/source_code/python/test_python_ast.py index da5acab70f..ee6d008c3f 100644 --- a/tests/unit/source_code/python/test_python_ast.py +++ b/tests/unit/source_code/python/test_python_ast.py @@ -175,6 +175,26 @@ def test_tree_attach_child_tree_infers_value() -> None: assert strings == [inferred_string] +def test_tree_attach_parent_with_child_tree_infers_value() -> None: + inferred_string = "Hello John!" + parent_source, child_a_source, child_b_source = "name = 'John'", "greeting = 'Hello'", 'say = f"{greeting} {name}!"' + parent_maybe_tree = Tree.maybe_normalized_parse(parent_source) + child_a_maybe_tree = Tree.maybe_normalized_parse(child_a_source) + child_b_maybe_tree = Tree.maybe_normalized_parse(child_b_source) + + assert parent_maybe_tree.tree is not None, parent_maybe_tree.failure + assert child_a_maybe_tree.tree is not None, child_a_maybe_tree.failure + assert child_b_maybe_tree.tree is not None, child_b_maybe_tree.failure + + parent_maybe_tree.tree.attach_child_tree(child_a_maybe_tree.tree) + parent_maybe_tree.tree.attach_child_tree(child_b_maybe_tree.tree) + + nodes = child_b_maybe_tree.tree.locate(Assign, []) + tree = Tree(nodes[0].value) # Starting from child, we are looking for the first assign + strings = [value.as_string() for value in InferredValue.infer_from_node(tree.node)] + assert strings == [inferred_string] + + def test_is_from_module() -> None: source = """ df = spark.read.csv("hi") From 5b98b4465d6f5bab9c10cf81ef7cca36a561dbac Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 21 Jan 2025 10:56:31 +0100 Subject: [PATCH 77/94] Test Python trees simulating notebook running other notebook --- .../labs/ucx/source_code/python/python_ast.py | 6 +++++ .../source_code/python/test_python_ast.py | 25 ++++++++++++++++++- 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/src/databricks/labs/ucx/source_code/python/python_ast.py b/src/databricks/labs/ucx/source_code/python/python_ast.py index f108961eb0..61339f13d1 100644 --- a/src/databricks/labs/ucx/source_code/python/python_ast.py +++ b/src/databricks/labs/ucx/source_code/python/python_ast.py @@ -20,6 +20,7 @@ Import, ImportFrom, Instance, + JoinedStr, Module, Name, NodeNG, @@ -554,6 +555,11 @@ def visit_importfrom(self, node: ImportFrom) -> None: return self._matched_nodes.append(node) + def visit_joinedstr(self, node: JoinedStr) -> None: + if self._node_type is not JoinedStr: + return + self._matched_nodes.append(node) + def _matches(self, node: NodeNG, depth: int) -> bool: if depth >= len(self._match_nodes): return False diff --git a/tests/unit/source_code/python/test_python_ast.py b/tests/unit/source_code/python/test_python_ast.py index ee6d008c3f..8f55e21365 100644 --- a/tests/unit/source_code/python/test_python_ast.py +++ b/tests/unit/source_code/python/test_python_ast.py @@ -3,7 +3,7 @@ import pytest import astroid # type: ignore -from astroid import Assign, AssignName, Attribute, Call, Const, Expr, Module, Name, NodeNG # type: ignore +from astroid import Assign, AssignName, Attribute, Call, Const, Expr, JoinedStr, Module, Name, NodeNG # type: ignore from databricks.labs.ucx.source_code.base import ( Advice, @@ -195,6 +195,29 @@ def test_tree_attach_parent_with_child_tree_infers_value() -> None: assert strings == [inferred_string] +def test_tree_attach_child_tree_with_notebook_using_variable_from_other_notebook() -> None: + """Simulating a notebook where it uses a variable from another notebook.""" + inferred_string = "catalog.schema.table" + child_source = "table_name = 'schema.table'" + parent_cell_1_source = "%run ./child" + parent_cell_2_source = "spark.table(f'catalog.{table_name}')" + child_maybe_tree = Tree.maybe_normalized_parse(child_source) + parent_cell_1_maybe_tree = Tree.maybe_normalized_parse(parent_cell_1_source) + parent_cell_2_maybe_tree = Tree.maybe_normalized_parse(parent_cell_2_source) + + assert child_maybe_tree.tree is not None, child_maybe_tree.failure + assert parent_cell_1_maybe_tree.tree is not None, parent_cell_1_maybe_tree.failure + assert parent_cell_2_maybe_tree.tree is not None, parent_cell_2_maybe_tree.failure + + parent_cell_1_maybe_tree.tree.attach_child_tree(child_maybe_tree.tree) + # Subsequent notebook cell is child of previous cell + parent_cell_1_maybe_tree.tree.attach_child_tree(parent_cell_2_maybe_tree.tree) + + nodes = parent_cell_2_maybe_tree.tree.locate(JoinedStr, []) + strings = [value.as_string() for value in InferredValue.infer_from_node(nodes[0])] + assert strings == [inferred_string] + + def test_is_from_module() -> None: source = """ df = spark.read.csv("hi") From 2e7663f99c1dc977afda7de5c9fb594a09f10e77 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 21 Jan 2025 11:23:15 +0100 Subject: [PATCH 78/94] Test inferring value from grand parent --- .../source_code/python/test_python_ast.py | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tests/unit/source_code/python/test_python_ast.py b/tests/unit/source_code/python/test_python_ast.py index 8f55e21365..e10a414e5a 100644 --- a/tests/unit/source_code/python/test_python_ast.py +++ b/tests/unit/source_code/python/test_python_ast.py @@ -175,6 +175,26 @@ def test_tree_attach_child_tree_infers_value() -> None: assert strings == [inferred_string] +def test_tree_attach_child_tree_infers_value_from_grand_parent() -> None: + inferred_string = "Hello John!" + grand_parent_source, parent_source, child_source = "name = 'John'", "greeting = 'Hello'", 'say = f"Hello {name}!"' + grand_parent_maybe_tree = Tree.maybe_normalized_parse(grand_parent_source) + parent_maybe_tree = Tree.maybe_normalized_parse(parent_source) + child_maybe_tree = Tree.maybe_normalized_parse(child_source) + + assert grand_parent_maybe_tree.tree is not None, grand_parent_maybe_tree.failure + assert parent_maybe_tree.tree is not None, parent_maybe_tree.failure + assert child_maybe_tree.tree is not None, child_maybe_tree.failure + + grand_parent_maybe_tree.tree.attach_child_tree(parent_maybe_tree.tree) + parent_maybe_tree.tree.attach_child_tree(child_maybe_tree.tree) + + nodes = child_maybe_tree.tree.locate(Assign, []) + tree = Tree(nodes[0].value) # Starting from child, we are looking for the first assign + strings = [value.as_string() for value in InferredValue.infer_from_node(tree.node)] + assert strings == [inferred_string] + + def test_tree_attach_parent_with_child_tree_infers_value() -> None: inferred_string = "Hello John!" parent_source, child_a_source, child_b_source = "name = 'John'", "greeting = 'Hello'", 'say = f"{greeting} {name}!"' From 9701f1794c3c665eecba1525cb2b37158e371816 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 21 Jan 2025 11:26:54 +0100 Subject: [PATCH 79/94] Test using variable from ran child notebook --- .../source_code/python/test_python_ast.py | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/tests/unit/source_code/python/test_python_ast.py b/tests/unit/source_code/python/test_python_ast.py index e10a414e5a..25eca587ed 100644 --- a/tests/unit/source_code/python/test_python_ast.py +++ b/tests/unit/source_code/python/test_python_ast.py @@ -238,6 +238,29 @@ def test_tree_attach_child_tree_with_notebook_using_variable_from_other_notebook assert strings == [inferred_string] +def test_tree_attach_child_tree_with_notebook_using_variable_from_parent_notebook() -> None: + """Simulating a notebook where it uses a variable from its parent notebook.""" + inferred_string = "catalog.schema.table" + child_source = "spark.table(f'catalog.{table_name}')" + parent_cell_1_source = "table_name = 'schema.table'" + parent_cell_2_source = "%run ./child" + child_maybe_tree = Tree.maybe_normalized_parse(child_source) + parent_cell_1_maybe_tree = Tree.maybe_normalized_parse(parent_cell_1_source) + parent_cell_2_maybe_tree = Tree.maybe_normalized_parse(parent_cell_2_source) + + assert child_maybe_tree.tree is not None, child_maybe_tree.failure + assert parent_cell_1_maybe_tree.tree is not None, parent_cell_1_maybe_tree.failure + assert parent_cell_2_maybe_tree.tree is not None, parent_cell_2_maybe_tree.failure + + # Subsequent notebook cell is child of previous cell + parent_cell_1_maybe_tree.tree.attach_child_tree(parent_cell_2_maybe_tree.tree) + parent_cell_2_maybe_tree.tree.attach_child_tree(child_maybe_tree.tree) + + nodes = child_maybe_tree.tree.locate(JoinedStr, []) + strings = [value.as_string() for value in InferredValue.infer_from_node(nodes[0])] + assert strings == [inferred_string] + + def test_is_from_module() -> None: source = """ df = spark.read.csv("hi") From 02e8cb3be18b5ae81d2f7822a1958851dcb57c4d Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 21 Jan 2025 12:02:14 +0100 Subject: [PATCH 80/94] Test infer from parent using extend globals --- tests/unit/source_code/python/test_python_ast.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/source_code/python/test_python_ast.py b/tests/unit/source_code/python/test_python_ast.py index 25eca587ed..ecfa0dfc8f 100644 --- a/tests/unit/source_code/python/test_python_ast.py +++ b/tests/unit/source_code/python/test_python_ast.py @@ -158,7 +158,7 @@ def test_ignores_magic_marker_in_multiline_comment() -> None: assert True -def test_tree_attach_child_tree_infers_value() -> None: +def test_tree_child_tree_infers_value() -> None: inferred_string = "Hello John!" parent_source, child_source = "a = 'John'", 'b = f"Hello {a}!"' parent_maybe_tree = Tree.maybe_normalized_parse(parent_source) @@ -167,7 +167,7 @@ def test_tree_attach_child_tree_infers_value() -> None: assert parent_maybe_tree.tree is not None, parent_maybe_tree.failure assert child_maybe_tree.tree is not None, child_maybe_tree.failure - parent_maybe_tree.tree.attach_child_tree(child_maybe_tree.tree) + child_maybe_tree.tree.extend_globals(parent_maybe_tree.tree.node.globals) nodes = child_maybe_tree.tree.locate(Assign, []) tree = Tree(nodes[0].value) # Starting from child, we are looking for the first assign From 6f8a8193f97152f3539805d2f3b7e58355c8efac Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 21 Jan 2025 12:02:23 +0100 Subject: [PATCH 81/94] Test infer from grand parent using extend globals --- tests/unit/source_code/python/test_python_ast.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/unit/source_code/python/test_python_ast.py b/tests/unit/source_code/python/test_python_ast.py index ecfa0dfc8f..d63fb6258a 100644 --- a/tests/unit/source_code/python/test_python_ast.py +++ b/tests/unit/source_code/python/test_python_ast.py @@ -175,7 +175,7 @@ def test_tree_child_tree_infers_value() -> None: assert strings == [inferred_string] -def test_tree_attach_child_tree_infers_value_from_grand_parent() -> None: +def test_tree_infers_value_from_grand_parent() -> None: inferred_string = "Hello John!" grand_parent_source, parent_source, child_source = "name = 'John'", "greeting = 'Hello'", 'say = f"Hello {name}!"' grand_parent_maybe_tree = Tree.maybe_normalized_parse(grand_parent_source) @@ -186,8 +186,8 @@ def test_tree_attach_child_tree_infers_value_from_grand_parent() -> None: assert parent_maybe_tree.tree is not None, parent_maybe_tree.failure assert child_maybe_tree.tree is not None, child_maybe_tree.failure - grand_parent_maybe_tree.tree.attach_child_tree(parent_maybe_tree.tree) - parent_maybe_tree.tree.attach_child_tree(child_maybe_tree.tree) + parent_maybe_tree.tree.extend_globals(grand_parent_maybe_tree.tree.node.globals) + child_maybe_tree.tree.extend_globals(parent_maybe_tree.tree.node.globals) nodes = child_maybe_tree.tree.locate(Assign, []) tree = Tree(nodes[0].value) # Starting from child, we are looking for the first assign From 47e327376cd87522f82a425eb0e5d3bc245ad3fa Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 21 Jan 2025 12:02:54 +0100 Subject: [PATCH 82/94] Fix test name --- tests/unit/source_code/python/test_python_ast.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/source_code/python/test_python_ast.py b/tests/unit/source_code/python/test_python_ast.py index d63fb6258a..8bfa44cfb0 100644 --- a/tests/unit/source_code/python/test_python_ast.py +++ b/tests/unit/source_code/python/test_python_ast.py @@ -158,7 +158,7 @@ def test_ignores_magic_marker_in_multiline_comment() -> None: assert True -def test_tree_child_tree_infers_value() -> None: +def test_tree_extend_globals_child_tree_infers_value() -> None: inferred_string = "Hello John!" parent_source, child_source = "a = 'John'", 'b = f"Hello {a}!"' parent_maybe_tree = Tree.maybe_normalized_parse(parent_source) @@ -175,7 +175,7 @@ def test_tree_child_tree_infers_value() -> None: assert strings == [inferred_string] -def test_tree_infers_value_from_grand_parent() -> None: +def test_tree_extend_globals_infers_value_from_grand_parent() -> None: inferred_string = "Hello John!" grand_parent_source, parent_source, child_source = "name = 'John'", "greeting = 'Hello'", 'say = f"Hello {name}!"' grand_parent_maybe_tree = Tree.maybe_normalized_parse(grand_parent_source) From e800b7ed39e7fc645a330bdbe9c6f82877533ccd Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 21 Jan 2025 13:21:13 +0100 Subject: [PATCH 83/94] Test inferring from sibling tree --- tests/unit/source_code/python/test_python_ast.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/unit/source_code/python/test_python_ast.py b/tests/unit/source_code/python/test_python_ast.py index 8bfa44cfb0..1a951f6b2b 100644 --- a/tests/unit/source_code/python/test_python_ast.py +++ b/tests/unit/source_code/python/test_python_ast.py @@ -195,8 +195,9 @@ def test_tree_extend_globals_infers_value_from_grand_parent() -> None: assert strings == [inferred_string] -def test_tree_attach_parent_with_child_tree_infers_value() -> None: - inferred_string = "Hello John!" +def test_tree_extend_globals_for_parent_with_children_cannot_infer_value() -> None: + """A tree cannot infer the value from its parent's child (aka sibling tree).""" + inferred_string = "" # Nothing inferred parent_source, child_a_source, child_b_source = "name = 'John'", "greeting = 'Hello'", 'say = f"{greeting} {name}!"' parent_maybe_tree = Tree.maybe_normalized_parse(parent_source) child_a_maybe_tree = Tree.maybe_normalized_parse(child_a_source) @@ -206,8 +207,8 @@ def test_tree_attach_parent_with_child_tree_infers_value() -> None: assert child_a_maybe_tree.tree is not None, child_a_maybe_tree.failure assert child_b_maybe_tree.tree is not None, child_b_maybe_tree.failure - parent_maybe_tree.tree.attach_child_tree(child_a_maybe_tree.tree) - parent_maybe_tree.tree.attach_child_tree(child_b_maybe_tree.tree) + child_a_maybe_tree.tree.extend_globals(parent_maybe_tree.tree.node.globals) + child_b_maybe_tree.tree.extend_globals(parent_maybe_tree.tree.node.globals) nodes = child_b_maybe_tree.tree.locate(Assign, []) tree = Tree(nodes[0].value) # Starting from child, we are looking for the first assign From b648b2f578b48bb35a690f919e4b40f90b7ac26c Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 21 Jan 2025 13:21:56 +0100 Subject: [PATCH 84/94] Test simulate using value from child notebook --- tests/unit/source_code/python/test_python_ast.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/unit/source_code/python/test_python_ast.py b/tests/unit/source_code/python/test_python_ast.py index 1a951f6b2b..37097ad569 100644 --- a/tests/unit/source_code/python/test_python_ast.py +++ b/tests/unit/source_code/python/test_python_ast.py @@ -216,7 +216,7 @@ def test_tree_extend_globals_for_parent_with_children_cannot_infer_value() -> No assert strings == [inferred_string] -def test_tree_attach_child_tree_with_notebook_using_variable_from_other_notebook() -> None: +def test_tree_extend_globals_with_notebook_using_variable_from_other_notebook() -> None: """Simulating a notebook where it uses a variable from another notebook.""" inferred_string = "catalog.schema.table" child_source = "table_name = 'schema.table'" @@ -230,9 +230,9 @@ def test_tree_attach_child_tree_with_notebook_using_variable_from_other_notebook assert parent_cell_1_maybe_tree.tree is not None, parent_cell_1_maybe_tree.failure assert parent_cell_2_maybe_tree.tree is not None, parent_cell_2_maybe_tree.failure - parent_cell_1_maybe_tree.tree.attach_child_tree(child_maybe_tree.tree) - # Subsequent notebook cell is child of previous cell - parent_cell_1_maybe_tree.tree.attach_child_tree(parent_cell_2_maybe_tree.tree) + parent_cell_1_maybe_tree.tree.extend_globals(child_maybe_tree.tree.node.globals) + # Subsequent notebook cell gets globals from previous cell + parent_cell_2_maybe_tree.tree.extend_globals(parent_cell_1_maybe_tree.tree.node.globals) nodes = parent_cell_2_maybe_tree.tree.locate(JoinedStr, []) strings = [value.as_string() for value in InferredValue.infer_from_node(nodes[0])] From 25549ec5ac4227f075d2a71b0a35556cf53514b9 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 21 Jan 2025 13:22:14 +0100 Subject: [PATCH 85/94] Test simulate using value from parent notebook --- tests/unit/source_code/python/test_python_ast.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/unit/source_code/python/test_python_ast.py b/tests/unit/source_code/python/test_python_ast.py index 37097ad569..3e6d37dfa7 100644 --- a/tests/unit/source_code/python/test_python_ast.py +++ b/tests/unit/source_code/python/test_python_ast.py @@ -239,7 +239,7 @@ def test_tree_extend_globals_with_notebook_using_variable_from_other_notebook() assert strings == [inferred_string] -def test_tree_attach_child_tree_with_notebook_using_variable_from_parent_notebook() -> None: +def test_tree_extend_globals_with_notebook_using_variable_from_parent_notebook() -> None: """Simulating a notebook where it uses a variable from its parent notebook.""" inferred_string = "catalog.schema.table" child_source = "spark.table(f'catalog.{table_name}')" @@ -253,9 +253,8 @@ def test_tree_attach_child_tree_with_notebook_using_variable_from_parent_noteboo assert parent_cell_1_maybe_tree.tree is not None, parent_cell_1_maybe_tree.failure assert parent_cell_2_maybe_tree.tree is not None, parent_cell_2_maybe_tree.failure - # Subsequent notebook cell is child of previous cell - parent_cell_1_maybe_tree.tree.attach_child_tree(parent_cell_2_maybe_tree.tree) - parent_cell_2_maybe_tree.tree.attach_child_tree(child_maybe_tree.tree) + parent_cell_2_maybe_tree.tree.extend_globals(parent_cell_1_maybe_tree.tree.node.globals) + child_maybe_tree.tree.extend_globals(parent_cell_2_maybe_tree.tree.node.globals) nodes = child_maybe_tree.tree.locate(JoinedStr, []) strings = [value.as_string() for value in InferredValue.infer_from_node(nodes[0])] From 09470e613d1c8241d6659941578701e374c65cb1 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 21 Jan 2025 13:25:00 +0100 Subject: [PATCH 86/94] Test propagating module with extend globals --- tests/unit/source_code/python/test_python_ast.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/unit/source_code/python/test_python_ast.py b/tests/unit/source_code/python/test_python_ast.py index 3e6d37dfa7..7266ea12d8 100644 --- a/tests/unit/source_code/python/test_python_ast.py +++ b/tests/unit/source_code/python/test_python_ast.py @@ -299,7 +299,7 @@ def test_is_instance_of(source, name, class_name) -> None: assert Tree(var[0]).is_instance_of(class_name) -def test_tree_attach_child_tree_propagates_module_reference() -> None: +def test_tree_extend_globals_propagates_module_reference() -> None: """The spark module should propagate from the parent tree.""" source_1 = "df = spark.read.csv('hi')" source_2 = "df = df.withColumn(stuff)" @@ -312,8 +312,8 @@ def test_tree_attach_child_tree_propagates_module_reference() -> None: assert second_line_maybe_tree.tree, second_line_maybe_tree.failure assert third_line_maybe_tree.tree, third_line_maybe_tree.failure - first_line_maybe_tree.tree.attach_child_tree(second_line_maybe_tree.tree) - first_line_maybe_tree.tree.attach_child_tree(third_line_maybe_tree.tree) + second_line_maybe_tree.tree.extend_globals(first_line_maybe_tree.tree.node.globals) + third_line_maybe_tree.tree.extend_globals(second_line_maybe_tree.tree.node.globals) assign = third_line_maybe_tree.tree.locate(Assign, [])[0] assert Tree(assign.value).is_from_module("spark") From 2a16d902f659152a1ea816163c0efe8866b8f41a Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 21 Jan 2025 14:51:03 +0100 Subject: [PATCH 87/94] Let NotebookLinter fail early while parsing --- .../labs/ucx/source_code/notebooks/sources.py | 45 ++++++++++--------- .../source_code/notebooks/test_sources.py | 1 + 2 files changed, 25 insertions(+), 21 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/notebooks/sources.py b/src/databricks/labs/ucx/source_code/notebooks/sources.py index 82cdca033a..f56843d9b8 100644 --- a/src/databricks/labs/ucx/source_code/notebooks/sources.py +++ b/src/databricks/labs/ucx/source_code/notebooks/sources.py @@ -143,9 +143,9 @@ def __init__( self._python_tree_cache: dict[tuple[Path, PythonCell], Tree] = {} # Path in key is the notebook's path def lint(self) -> Iterable[Advice]: - failures = self._parse_notebook(self._notebook, parent_tree=self._parent_tree) - if failures: - yield from failures + failure = self._parse_notebook(self._notebook, parent_tree=self._parent_tree) + if failure: + yield failure return for cell in self._notebook.cells: try: @@ -166,30 +166,32 @@ def lint(self) -> Iterable[Advice]: ) return - def _parse_notebook(self, notebook: Notebook, *, parent_tree: Tree | None) -> list[Failure]: + def _parse_notebook(self, notebook: Notebook, *, parent_tree: Tree | None) -> Failure | None: """Parse a notebook by parsing its cells.""" - failures = [] for cell in notebook.cells: + failure = None if isinstance(cell, RunCell): - failures.extend(self._resolve_and_parse_run_cell(cell, parent_tree=parent_tree)) + failure = self._resolve_and_parse_run_cell(cell, parent_tree=parent_tree) elif isinstance(cell, PythonCell): - failures.extend(self._parse_python_cell(cell, notebook.path, parent_tree=parent_tree)) + failure = self._parse_python_cell(cell, notebook.path, parent_tree=parent_tree) parent_tree = self._python_tree_cache.get((notebook.path, cell)) - return failures + if failure: + return failure + return None - def _resolve_and_parse_run_cell(self, run_cell: RunCell, *, parent_tree: Tree | None = None) -> list[Failure]: + def _resolve_and_parse_run_cell(self, run_cell: RunCell, *, parent_tree: Tree | None = None) -> Failure | None: """Resolve the path in the run cell and parse the notebook it refers.""" path = run_cell.maybe_notebook_path() if path is None: - return [] # malformed run cell already reported + return None # malformed run cell already reported notebook = self._resolve_and_parse_notebook_path(path) if not notebook: - return [] + return None return self._parse_notebook(notebook, parent_tree=parent_tree) def _parse_python_cell( self, python_cell: PythonCell, notebook_path: Path, *, parent_tree: Tree | None - ) -> list[Failure]: + ) -> Failure | None: """Parse the Python cell.""" maybe_tree = Tree.maybe_normalized_parse(python_cell.original_code) if maybe_tree.failure: @@ -198,23 +200,24 @@ def _parse_python_cell( start_line=maybe_tree.failure.start_line + python_cell.original_offset, end_line=maybe_tree.failure.end_line + python_cell.original_offset, ) - return [failure] + return failure assert maybe_tree.tree is not None if parent_tree: parent_tree.attach_child_tree(maybe_tree.tree) self._python_tree_cache[(notebook_path, python_cell)] = maybe_tree.tree return self._parse_tree(maybe_tree.tree) - def _parse_tree(self, tree: Tree) -> list[Failure]: + def _parse_tree(self, tree: Tree) -> Failure | None: """Parse tree by looking for referred notebooks and path changes that might affect loading notebooks.""" code_path_nodes = self._list_magic_lines_with_run_command(tree) + SysPathChange.extract_from_tree( self._session_state, tree ) - failures = [] # Sys path changes require to load children in order of reading for base_node in sorted(code_path_nodes, key=lambda node: (node.node.lineno, node.node.col_offset)): - failures.extend(self._process_code_node(base_node, parent_tree=tree)) - return failures + failure = self._process_code_node(base_node, parent_tree=tree) + if failure: + return failure + return None @staticmethod def _list_magic_lines_with_run_command(tree: Tree) -> list[MagicLine]: @@ -226,7 +229,7 @@ def _list_magic_lines_with_run_command(tree: Tree) -> list[MagicLine]: run_commands.append(magic_line) return run_commands - def _process_code_node(self, node: SysPathChange | MagicLine, *, parent_tree: Tree | None) -> list[Failure]: + def _process_code_node(self, node: SysPathChange | MagicLine, *, parent_tree: Tree | None) -> Failure | None: """Process a code node. 1. `SysPathChange` mutate the path lookup. @@ -235,7 +238,7 @@ def _process_code_node(self, node: SysPathChange | MagicLine, *, parent_tree: Tr if isinstance(node, SysPathChange): failure = self._mutate_path_lookup(node) if failure: - return [failure] + return failure if isinstance(node, MagicLine): magic = node.as_magic() assert isinstance(magic, RunCommand) @@ -246,9 +249,9 @@ def _process_code_node(self, node: SysPathChange | MagicLine, *, parent_tree: Tr message=f"Can't locate dependency: {magic.notebook_path}", node=node.node, ) - return [failure] + return failure return self._parse_notebook(notebook, parent_tree=parent_tree) - return [] + return None def _mutate_path_lookup(self, change: SysPathChange) -> Failure | None: """Mutate the path lookup.""" diff --git a/tests/unit/source_code/notebooks/test_sources.py b/tests/unit/source_code/notebooks/test_sources.py index 583b59a7a9..51819ae740 100644 --- a/tests/unit/source_code/notebooks/test_sources.py +++ b/tests/unit/source_code/notebooks/test_sources.py @@ -173,6 +173,7 @@ def test_notebook_linter_lints_source_yielding_parse_failure(migration_index, mo ] +@pytest.mark.xfail(reason="https://github.com/databrickslabs/ucx/issues/3556") def test_notebook_linter_lints_source_yielding_parse_failures(migration_index, mock_path_lookup) -> None: source = """ # Databricks notebook source From 5bebad485dc3648b566c5aaf8f227b542fdf8f59 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 21 Jan 2025 16:56:38 +0100 Subject: [PATCH 88/94] Rewrite notebook linter to only extend globals --- .../labs/ucx/source_code/notebooks/sources.py | 93 +++++++++++-------- 1 file changed, 54 insertions(+), 39 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/notebooks/sources.py b/src/databricks/labs/ucx/source_code/notebooks/sources.py index f56843d9b8..0d14af535c 100644 --- a/src/databricks/labs/ucx/source_code/notebooks/sources.py +++ b/src/databricks/labs/ucx/source_code/notebooks/sources.py @@ -32,7 +32,7 @@ UnresolvedPath, ) from databricks.labs.ucx.source_code.notebooks.magic import MagicLine -from databricks.labs.ucx.source_code.python.python_ast import PythonLinter, Tree +from databricks.labs.ucx.source_code.python.python_ast import MaybeTree, PythonLinter, Tree from databricks.labs.ucx.source_code.notebooks.cells import ( CellLanguage, Cell, @@ -137,15 +137,15 @@ def __init__( self._path_lookup = path_lookup self._session_state = session_state self._notebook: Notebook = notebook - self._parent_tree = parent_tree + self._parent_tree = parent_tree or Tree.new_module() # Python trees are constructed during notebook parsing and cached for later usage - self._python_tree_cache: dict[tuple[Path, PythonCell], Tree] = {} # Path in key is the notebook's path + self._python_tree_cache: dict[tuple[Path, Cell], Tree] = {} # Path in key is the notebook's path def lint(self) -> Iterable[Advice]: - failure = self._parse_notebook(self._notebook, parent_tree=self._parent_tree) - if failure: - yield failure + maybe_tree = self._parse_notebook(self._notebook, parent_tree=self._parent_tree) + if maybe_tree.failure: + yield maybe_tree.failure return for cell in self._notebook.cells: try: @@ -166,32 +166,42 @@ def lint(self) -> Iterable[Advice]: ) return - def _parse_notebook(self, notebook: Notebook, *, parent_tree: Tree | None) -> Failure | None: - """Parse a notebook by parsing its cells.""" + def _parse_notebook(self, notebook: Notebook, *, parent_tree: Tree) -> MaybeTree: + """Parse a notebook by parsing its cells. + + Returns : + MaybeTree : The tree or failure belonging to the **last** cell. + """ + maybe_tree = MaybeTree(None, None) for cell in notebook.cells: - failure = None if isinstance(cell, RunCell): - failure = self._resolve_and_parse_run_cell(cell, parent_tree=parent_tree) + maybe_tree = self._resolve_and_parse_run_cell(cell, parent_tree=parent_tree) elif isinstance(cell, PythonCell): - failure = self._parse_python_cell(cell, notebook.path, parent_tree=parent_tree) - parent_tree = self._python_tree_cache.get((notebook.path, cell)) - if failure: - return failure - return None - - def _resolve_and_parse_run_cell(self, run_cell: RunCell, *, parent_tree: Tree | None = None) -> Failure | None: + maybe_tree = self._parse_python_cell(cell) + if maybe_tree.failure: + return maybe_tree + if maybe_tree.tree: + # The subsequent cell gets the globals from the previous cell + maybe_tree.tree.extend_globals(parent_tree.node.globals) + self._python_tree_cache[(notebook.path, cell)] = maybe_tree.tree + parent_tree = maybe_tree.tree + return maybe_tree + + def _resolve_and_parse_run_cell(self, run_cell: RunCell, *, parent_tree: Tree) -> MaybeTree: """Resolve the path in the run cell and parse the notebook it refers.""" path = run_cell.maybe_notebook_path() if path is None: - return None # malformed run cell already reported + return MaybeTree(None, None) # malformed run cell already reported notebook = self._resolve_and_parse_notebook_path(path) if not notebook: - return None - return self._parse_notebook(notebook, parent_tree=parent_tree) - - def _parse_python_cell( - self, python_cell: PythonCell, notebook_path: Path, *, parent_tree: Tree | None - ) -> Failure | None: + return MaybeTree(None, None) + maybe_tree = self._parse_notebook(notebook, parent_tree=parent_tree) + if maybe_tree.tree: + # From the perspective of this cell, a run cell pulls the globals from the child notebook in + parent_tree.extend_globals(maybe_tree.tree.node.globals) + return maybe_tree + + def _parse_python_cell(self, python_cell: PythonCell) -> MaybeTree: """Parse the Python cell.""" maybe_tree = Tree.maybe_normalized_parse(python_cell.original_code) if maybe_tree.failure: @@ -200,24 +210,25 @@ def _parse_python_cell( start_line=maybe_tree.failure.start_line + python_cell.original_offset, end_line=maybe_tree.failure.end_line + python_cell.original_offset, ) - return failure + return MaybeTree(None, failure) assert maybe_tree.tree is not None - if parent_tree: - parent_tree.attach_child_tree(maybe_tree.tree) - self._python_tree_cache[(notebook_path, python_cell)] = maybe_tree.tree - return self._parse_tree(maybe_tree.tree) + maybe_child_tree = self._parse_tree(maybe_tree.tree) + if maybe_child_tree.failure: + return maybe_child_tree + return maybe_tree - def _parse_tree(self, tree: Tree) -> Failure | None: + def _parse_tree(self, tree: Tree) -> MaybeTree: """Parse tree by looking for referred notebooks and path changes that might affect loading notebooks.""" code_path_nodes = self._list_magic_lines_with_run_command(tree) + SysPathChange.extract_from_tree( self._session_state, tree ) + maybe_tree = MaybeTree(None, None) # Sys path changes require to load children in order of reading for base_node in sorted(code_path_nodes, key=lambda node: (node.node.lineno, node.node.col_offset)): - failure = self._process_code_node(base_node, parent_tree=tree) - if failure: - return failure - return None + maybe_tree = self._process_code_node(base_node, parent_tree=tree) + if maybe_tree.failure: + return maybe_tree + return maybe_tree @staticmethod def _list_magic_lines_with_run_command(tree: Tree) -> list[MagicLine]: @@ -229,7 +240,7 @@ def _list_magic_lines_with_run_command(tree: Tree) -> list[MagicLine]: run_commands.append(magic_line) return run_commands - def _process_code_node(self, node: SysPathChange | MagicLine, *, parent_tree: Tree | None) -> Failure | None: + def _process_code_node(self, node: SysPathChange | MagicLine, *, parent_tree: Tree) -> MaybeTree: """Process a code node. 1. `SysPathChange` mutate the path lookup. @@ -238,7 +249,7 @@ def _process_code_node(self, node: SysPathChange | MagicLine, *, parent_tree: Tr if isinstance(node, SysPathChange): failure = self._mutate_path_lookup(node) if failure: - return failure + return MaybeTree(None, failure) if isinstance(node, MagicLine): magic = node.as_magic() assert isinstance(magic, RunCommand) @@ -249,9 +260,13 @@ def _process_code_node(self, node: SysPathChange | MagicLine, *, parent_tree: Tr message=f"Can't locate dependency: {magic.notebook_path}", node=node.node, ) - return failure - return self._parse_notebook(notebook, parent_tree=parent_tree) - return None + return MaybeTree(None, failure) + maybe_tree = self._parse_notebook(notebook, parent_tree=parent_tree) + if maybe_tree.tree: + # From the perspective of this node, a run node pulls the globals from the child notebook in + parent_tree.extend_globals(maybe_tree.tree.node.globals) + return maybe_tree + return MaybeTree(None, None) def _mutate_path_lookup(self, change: SysPathChange) -> Failure | None: """Mutate the path lookup.""" From f25174ab8b27a1692f8c4e87a90fad943d6a2ed8 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 22 Jan 2025 09:31:16 +0100 Subject: [PATCH 89/94] Add test showing unresolvable node issue --- .../source_code/python/test_python_ast.py | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/tests/unit/source_code/python/test_python_ast.py b/tests/unit/source_code/python/test_python_ast.py index 7266ea12d8..fee06b26b0 100644 --- a/tests/unit/source_code/python/test_python_ast.py +++ b/tests/unit/source_code/python/test_python_ast.py @@ -216,6 +216,33 @@ def test_tree_extend_globals_for_parent_with_children_cannot_infer_value() -> No assert strings == [inferred_string] +def test_tree_extend_globals_for_unresolvable_parent_cannot_infer_value() -> None: + """A tree cannot infer the value from a parent that has an unresolvable node. + + This test shows a learning when working with Astroid. The unresolvable variable is irrelevant for the variable we + are trying to resolve, still, the unresolvable variable makes that we cannot resolve to searched value. + """ + inferred_string = "" # Nothing inferred + grand_parent_source = "name = 'John'" + parent_source = "print(unknown)\ngreeting = 'Hello'" # Unresolvable variable `unknown` + child_source = "say = f'{greeting} {name}!'" + grand_parent_maybe_tree = Tree.maybe_normalized_parse(grand_parent_source) + parent_maybe_tree = Tree.maybe_normalized_parse(parent_source) + child_maybe_tree = Tree.maybe_normalized_parse(child_source) + + assert grand_parent_maybe_tree.tree is not None, grand_parent_maybe_tree.failure + assert parent_maybe_tree.tree is not None, parent_maybe_tree.failure + assert child_maybe_tree.tree is not None, child_maybe_tree.failure + + parent_maybe_tree.tree.extend_globals(grand_parent_maybe_tree.tree.node.globals) + child_maybe_tree.tree.extend_globals(parent_maybe_tree.tree.node.globals) + + nodes = child_maybe_tree.tree.locate(Assign, []) + tree = Tree(nodes[0].value) # Starting from child, we are looking for the first assign + strings = [value.as_string() for value in InferredValue.infer_from_node(tree.node)] + assert strings == [inferred_string] + + def test_tree_extend_globals_with_notebook_using_variable_from_other_notebook() -> None: """Simulating a notebook where it uses a variable from another notebook.""" inferred_string = "catalog.schema.table" From 0c67f3eb96a06afbc9272a21ff421c16a15a79f3 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 22 Jan 2025 09:46:55 +0100 Subject: [PATCH 90/94] Pass tree globals to next cells tree --- src/databricks/labs/ucx/source_code/notebooks/sources.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/notebooks/sources.py b/src/databricks/labs/ucx/source_code/notebooks/sources.py index 0d14af535c..3ac5f1cf91 100644 --- a/src/databricks/labs/ucx/source_code/notebooks/sources.py +++ b/src/databricks/labs/ucx/source_code/notebooks/sources.py @@ -177,14 +177,12 @@ def _parse_notebook(self, notebook: Notebook, *, parent_tree: Tree) -> MaybeTree if isinstance(cell, RunCell): maybe_tree = self._resolve_and_parse_run_cell(cell, parent_tree=parent_tree) elif isinstance(cell, PythonCell): - maybe_tree = self._parse_python_cell(cell) + maybe_tree = self._parse_python_cell(cell, parent_tree=parent_tree) if maybe_tree.failure: return maybe_tree if maybe_tree.tree: - # The subsequent cell gets the globals from the previous cell - maybe_tree.tree.extend_globals(parent_tree.node.globals) self._python_tree_cache[(notebook.path, cell)] = maybe_tree.tree - parent_tree = maybe_tree.tree + parent_tree = maybe_tree.tree # The subsequent cell gets the globals from the previous cell return maybe_tree def _resolve_and_parse_run_cell(self, run_cell: RunCell, *, parent_tree: Tree) -> MaybeTree: @@ -201,7 +199,7 @@ def _resolve_and_parse_run_cell(self, run_cell: RunCell, *, parent_tree: Tree) - parent_tree.extend_globals(maybe_tree.tree.node.globals) return maybe_tree - def _parse_python_cell(self, python_cell: PythonCell) -> MaybeTree: + def _parse_python_cell(self, python_cell: PythonCell, *, parent_tree: Tree) -> MaybeTree: """Parse the Python cell.""" maybe_tree = Tree.maybe_normalized_parse(python_cell.original_code) if maybe_tree.failure: @@ -212,6 +210,7 @@ def _parse_python_cell(self, python_cell: PythonCell) -> MaybeTree: ) return MaybeTree(None, failure) assert maybe_tree.tree is not None + maybe_tree.tree.extend_globals(parent_tree.node.globals) maybe_child_tree = self._parse_tree(maybe_tree.tree) if maybe_child_tree.failure: return maybe_child_tree From 6360e7bd68f2a7ac8a07cd8669e8e13e90fae7fd Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Thu, 23 Jan 2025 12:06:43 +0100 Subject: [PATCH 91/94] Add assumption to docstring --- src/databricks/labs/ucx/source_code/notebooks/sources.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/databricks/labs/ucx/source_code/notebooks/sources.py b/src/databricks/labs/ucx/source_code/notebooks/sources.py index 3ac5f1cf91..ffd86295b8 100644 --- a/src/databricks/labs/ucx/source_code/notebooks/sources.py +++ b/src/databricks/labs/ucx/source_code/notebooks/sources.py @@ -169,6 +169,11 @@ def lint(self) -> Iterable[Advice]: def _parse_notebook(self, notebook: Notebook, *, parent_tree: Tree) -> MaybeTree: """Parse a notebook by parsing its cells. + The notebook linter is designed to parse a valid tree for its notebook **only**. Possible child notebooks + referenced by run cells are brought into the scope of this notebook, however, their trees are not valid complete + for linting. The child notebooks are linted with another call to the notebook linter that includes the + context(s) from which these notebooks are ran. + Returns : MaybeTree : The tree or failure belonging to the **last** cell. """ From 4afb7e9d635c64bd4a1af9dc186053c5daad55ea Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 28 Jan 2025 16:35:49 +0100 Subject: [PATCH 92/94] Add new line to test sources --- tests/unit/source_code/notebooks/test_sources.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/source_code/notebooks/test_sources.py b/tests/unit/source_code/notebooks/test_sources.py index 51819ae740..1a51378c61 100644 --- a/tests/unit/source_code/notebooks/test_sources.py +++ b/tests/unit/source_code/notebooks/test_sources.py @@ -141,7 +141,7 @@ def test_notebook_linter_lints_source_yielding_no_advices(migration_index, mock_ migration_index, mock_path_lookup, CurrentSessionState(), - "# Databricks notebook source\nprint(1)", + "# Databricks notebook source\nprint(1)\n", Language.PYTHON, ) @@ -155,7 +155,7 @@ def test_notebook_linter_lints_source_yielding_parse_failure(migration_index, mo migration_index, mock_path_lookup, CurrentSessionState(), - "# Databricks notebook source\nprint(1", # Missing parenthesis is on purpose + "# Databricks notebook source\nprint(1\n", # Missing parenthesis is on purpose Language.PYTHON, ) From 2c0f32b3f866e9cbecfb3095e8c7f7f4272b310d Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 28 Jan 2025 16:37:50 +0100 Subject: [PATCH 93/94] Get first advice with next --- tests/unit/source_code/notebooks/test_sources.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/unit/source_code/notebooks/test_sources.py b/tests/unit/source_code/notebooks/test_sources.py index 1a51378c61..1fd1142803 100644 --- a/tests/unit/source_code/notebooks/test_sources.py +++ b/tests/unit/source_code/notebooks/test_sources.py @@ -312,10 +312,9 @@ def test_notebook_linter_lints_migrated_table_with_rename(migration_index, mock_ Language.PYTHON, ) - advices = list(linter.lint()) + first_advice = next(iter(linter.lint())) - assert advices - assert advices[0] == Deprecation( + assert first_advice == Deprecation( code='table-migrated-to-uc', message='Table old.things is migrated to brand.new.stuff in Unity Catalog', start_line=6, From c5a7f2848d85f6f0a3cfa96be25b2a09977bfeac Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 28 Jan 2025 17:17:02 +0100 Subject: [PATCH 94/94] Let process code node return failure --- .../labs/ucx/source_code/notebooks/sources.py | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/notebooks/sources.py b/src/databricks/labs/ucx/source_code/notebooks/sources.py index 320a7a99e2..88b8b0d2b3 100644 --- a/src/databricks/labs/ucx/source_code/notebooks/sources.py +++ b/src/databricks/labs/ucx/source_code/notebooks/sources.py @@ -207,6 +207,7 @@ def _resolve_and_parse_run_cell(self, run_cell: RunCell, *, parent_tree: Tree) - def _parse_python_cell(self, python_cell: PythonCell, *, parent_tree: Tree) -> MaybeTree: """Parse the Python cell.""" + failure: Failure | None maybe_tree = MaybeTree.from_source_code(python_cell.original_code) if maybe_tree.failure: failure = dataclasses.replace( @@ -217,23 +218,22 @@ def _parse_python_cell(self, python_cell: PythonCell, *, parent_tree: Tree) -> M return MaybeTree(None, failure) assert maybe_tree.tree is not None maybe_tree.tree.extend_globals(parent_tree.node.globals) - maybe_child_tree = self._parse_tree(maybe_tree.tree) - if maybe_child_tree and maybe_child_tree.failure: - return maybe_child_tree + failure = self._parse_tree(maybe_tree.tree) + if failure: + return MaybeTree(None, failure) return maybe_tree - def _parse_tree(self, tree: Tree) -> MaybeTree | None: + def _parse_tree(self, tree: Tree) -> Failure | None: """Parse tree by looking for referred notebooks and path changes that might affect loading notebooks.""" code_path_nodes = self._list_magic_lines_with_run_command(tree) + SysPathChange.extract_from_tree( self._session_state, tree ) - maybe_tree = None # Sys path changes require to load children in order of reading for base_node in sorted(code_path_nodes, key=lambda node: (node.node.lineno, node.node.col_offset)): - maybe_tree = self._process_code_node(base_node, parent_tree=tree) - if maybe_tree and maybe_tree.failure: - return maybe_tree - return maybe_tree + failure = self._process_code_node(base_node, parent_tree=tree) + if failure: + return failure + return None @staticmethod def _list_magic_lines_with_run_command(tree: Tree) -> list[MagicLine]: @@ -245,16 +245,14 @@ def _list_magic_lines_with_run_command(tree: Tree) -> list[MagicLine]: run_commands.append(magic_line) return run_commands - def _process_code_node(self, node: SysPathChange | MagicLine, *, parent_tree: Tree) -> MaybeTree | None: + def _process_code_node(self, node: SysPathChange | MagicLine, *, parent_tree: Tree) -> Failure | None: """Process a code node. 1. `SysPathChange` mutate the path lookup. 2. `MagicLine` containing a `RunCommand` run other notebooks that should be parsed. """ if isinstance(node, SysPathChange): - failure = self._mutate_path_lookup(node) - if failure: - return MaybeTree(None, failure) + return self._mutate_path_lookup(node) if isinstance(node, MagicLine): magic = node.as_magic() assert isinstance(magic, RunCommand) @@ -265,12 +263,14 @@ def _process_code_node(self, node: SysPathChange | MagicLine, *, parent_tree: Tr message=f"Can't locate dependency: {magic.notebook_path}", node=node.node, ) - return MaybeTree(None, failure) + return failure maybe_tree = self._parse_notebook(notebook, parent_tree=parent_tree) - if maybe_tree and maybe_tree.tree: + if not maybe_tree: + return None + if maybe_tree.tree: # From the perspective of this node, a run node pulls the globals from the child notebook in parent_tree.extend_globals(maybe_tree.tree.node.globals) - return maybe_tree + return maybe_tree.failure return None def _mutate_path_lookup(self, change: SysPathChange) -> Failure | None: