-
Notifications
You must be signed in to change notification settings - Fork 87
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Remove tree from PythonSequentialLinter
#3535
Changes from 89 commits
b53e0fd
108dd7b
8a22d90
6c0ebcf
fd2c761
2c46a01
ca30dcf
1d22084
48ddf4e
1211dda
043fc6b
b0e39ef
51358f2
460e88f
357a2e5
d9ffd91
a3c023b
1e7a4b9
249857e
ba04281
18e9c98
fb8d6ee
96ef1ec
e5365ef
aed3b12
ee148eb
84b5f69
d142f7c
00aad28
4c1e79e
f860a67
906ba87
feb0a8c
73991ae
4cf6d8a
04fda6f
7e16589
d2db4b6
14b8c45
5c28fc9
1b8f4b5
b7998bc
70aa5bf
c81e6b1
3fb49a7
1ff9891
273a40d
7728719
772684c
2f4c7fa
5441ec5
d6c0afc
435f6c8
fd8f04d
7209aac
f2dbc56
2603d14
23e5a49
8cef0ef
82f3ad1
ce23e4f
7176276
768706c
1f44031
b80f2d3
54895fb
3a8bb37
f46955e
be03c3b
3646140
b799076
da3eb20
cf5679d
63a5d22
8113828
ae206b8
4b1528f
5b98b44
2e7663f
9701f17
02e8cb3
6f8a819
47e3273
e800b7e
b648b2f
25549ec
09470e6
2a16d90
5bebad4
f25174a
0c67f3e
a273997
6360e7b
54607a6
4afb7e9
2c0f32b
c5a7f28
02e2254
35285e3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -7,21 +7,17 @@ | |||||
from pathlib import Path | ||||||
from typing import cast | ||||||
|
||||||
from astroid import Module, NodeNG # type: ignore | ||||||
|
||||||
from databricks.sdk.service.workspace import Language | ||||||
|
||||||
from databricks.labs.ucx.hive_metastore.table_migration_status import TableMigrationIndex | ||||||
from databricks.labs.ucx.source_code.base import ( | ||||||
file_language, | ||||||
is_a_notebook, | ||||||
safe_read_text, | ||||||
read_text, | ||||||
Advice, | ||||||
Advisory, | ||||||
CurrentSessionState, | ||||||
Failure, | ||||||
Linter, | ||||||
) | ||||||
|
||||||
from databricks.labs.ucx.source_code.graph import ( | ||||||
|
@@ -36,7 +32,7 @@ | |||||
UnresolvedPath, | ||||||
) | ||||||
from databricks.labs.ucx.source_code.notebooks.magic import MagicLine | ||||||
from databricks.labs.ucx.source_code.python.python_ast import Tree, NodeBase, PythonSequentialLinter | ||||||
from databricks.labs.ucx.source_code.python.python_ast import MaybeTree, PythonLinter, Tree | ||||||
from databricks.labs.ucx.source_code.notebooks.cells import ( | ||||||
CellLanguage, | ||||||
Cell, | ||||||
|
@@ -129,163 +125,162 @@ class NotebookLinter: | |||||
to the code cells according to the language of the cell. | ||||||
""" | ||||||
|
||||||
@classmethod | ||||||
def from_source( | ||||||
cls, | ||||||
index: TableMigrationIndex, | ||||||
path_lookup: PathLookup, | ||||||
session_state: CurrentSessionState, | ||||||
source: str, | ||||||
default_language: Language, | ||||||
) -> NotebookLinter: | ||||||
ctx = LinterContext(index) | ||||||
notebook = Notebook.parse(Path(""), source, default_language) | ||||||
assert notebook is not None | ||||||
return cls(ctx, path_lookup, session_state, notebook) | ||||||
|
||||||
def __init__( | ||||||
self, | ||||||
context: LinterContext, | ||||||
path_lookup: PathLookup, | ||||||
session_state: CurrentSessionState, | ||||||
notebook: Notebook, | ||||||
inherited_tree: Tree | None = None, | ||||||
parent_tree: Tree | None = None, | ||||||
): | ||||||
self._context: LinterContext = context | ||||||
self._path_lookup = path_lookup | ||||||
self._session_state = session_state | ||||||
self._notebook: Notebook = notebook | ||||||
# reuse Python linter across related files and notebook cells | ||||||
# this is required in order to accumulate statements for improved inference | ||||||
self._python_linter: PythonSequentialLinter = cast(PythonSequentialLinter, context.linter(Language.PYTHON)) | ||||||
if inherited_tree is not None: | ||||||
self._python_linter.append_tree(inherited_tree) | ||||||
self._python_trees: dict[PythonCell, Tree] = {} # the original trees to be linted | ||||||
self._parent_tree = parent_tree or Tree.new_module() | ||||||
|
||||||
# Python trees are constructed during notebook parsing and cached for later usage | ||||||
self._python_tree_cache: dict[tuple[Path, Cell], Tree] = {} # Path in key is the notebook's path | ||||||
|
||||||
def lint(self) -> Iterable[Advice]: | ||||||
has_failure = False | ||||||
for advice in self._load_tree_from_notebook(self._notebook, True): | ||||||
if isinstance(advice, Failure): # happens when a cell is unparseable | ||||||
has_failure = True | ||||||
yield advice | ||||||
if has_failure: | ||||||
maybe_tree = self._parse_notebook(self._notebook, parent_tree=self._parent_tree) | ||||||
if maybe_tree.failure: | ||||||
yield maybe_tree.failure | ||||||
return | ||||||
for cell in self._notebook.cells: | ||||||
if not self._context.is_supported(cell.language.language): | ||||||
try: | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is why: #3544 |
||||||
linter = self._context.linter(cell.language.language) | ||||||
except ValueError: # Language is not supported (yet) | ||||||
continue | ||||||
if isinstance(cell, PythonCell): | ||||||
tree = self._python_trees[cell] | ||||||
advices = self._python_linter.lint_tree(tree) | ||||||
linter = cast(PythonLinter, linter) | ||||||
tree = self._python_tree_cache[(self._notebook.path, cell)] | ||||||
advices = linter.lint_tree(tree) | ||||||
else: | ||||||
linter = self._linter(cell.language.language) | ||||||
advices = linter.lint(cell.original_code) | ||||||
for advice in advices: | ||||||
yield dataclasses.replace( | ||||||
advice, | ||||||
start_line=advice.start_line + cell.original_offset, | ||||||
end_line=advice.end_line + cell.original_offset, | ||||||
) | ||||||
return | ||||||
JCZuurmond marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
|
||||||
def _load_tree_from_notebook(self, notebook: Notebook, register_trees: bool) -> Iterable[Advice]: | ||||||
def _parse_notebook(self, notebook: Notebook, *, parent_tree: Tree) -> MaybeTree: | ||||||
"""Parse a notebook by parsing its cells. | ||||||
|
||||||
Returns : | ||||||
MaybeTree : The tree or failure belonging to the **last** cell. | ||||||
""" | ||||||
maybe_tree = MaybeTree(None, None) | ||||||
JCZuurmond marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
for cell in notebook.cells: | ||||||
if isinstance(cell, RunCell): | ||||||
yield from self._load_tree_from_run_cell(cell) | ||||||
continue | ||||||
if isinstance(cell, PythonCell): | ||||||
yield from self._load_tree_from_python_cell(cell, register_trees) | ||||||
continue | ||||||
|
||||||
def _load_tree_from_python_cell(self, python_cell: PythonCell, register_trees: bool) -> Iterable[Advice]: | ||||||
maybe_tree = self._resolve_and_parse_run_cell(cell, parent_tree=parent_tree) | ||||||
elif isinstance(cell, PythonCell): | ||||||
maybe_tree = self._parse_python_cell(cell) | ||||||
if maybe_tree.failure: | ||||||
return maybe_tree | ||||||
if maybe_tree.tree: | ||||||
# The subsequent cell gets the globals from the previous cell | ||||||
maybe_tree.tree.extend_globals(parent_tree.node.globals) | ||||||
self._python_tree_cache[(notebook.path, cell)] = maybe_tree.tree | ||||||
parent_tree = maybe_tree.tree | ||||||
return maybe_tree | ||||||
|
||||||
def _resolve_and_parse_run_cell(self, run_cell: RunCell, *, parent_tree: Tree) -> MaybeTree: | ||||||
"""Resolve the path in the run cell and parse the notebook it refers.""" | ||||||
path = run_cell.maybe_notebook_path() | ||||||
if path is None: | ||||||
return MaybeTree(None, None) # malformed run cell already reported | ||||||
notebook = self._resolve_and_parse_notebook_path(path) | ||||||
if not notebook: | ||||||
return MaybeTree(None, None) | ||||||
JCZuurmond marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
maybe_tree = self._parse_notebook(notebook, parent_tree=parent_tree) | ||||||
if maybe_tree.tree: | ||||||
# From the perspective of this cell, a run cell pulls the globals from the child notebook in | ||||||
parent_tree.extend_globals(maybe_tree.tree.node.globals) | ||||||
return maybe_tree | ||||||
|
||||||
def _parse_python_cell(self, python_cell: PythonCell) -> MaybeTree: | ||||||
"""Parse the Python cell.""" | ||||||
maybe_tree = Tree.maybe_normalized_parse(python_cell.original_code) | ||||||
if maybe_tree.failure: | ||||||
yield maybe_tree.failure | ||||||
tree = maybe_tree.tree | ||||||
# a cell with only comments will not produce a tree | ||||||
if register_trees: | ||||||
self._python_trees[python_cell] = tree or Tree.new_module() | ||||||
if not tree: | ||||||
return | ||||||
yield from self._load_children_from_tree(tree) | ||||||
|
||||||
def _load_children_from_tree(self, tree: Tree) -> Iterable[Advice]: | ||||||
assert isinstance(tree.node, Module) | ||||||
# look for child notebooks (and sys.path changes that might affect their loading) | ||||||
base_nodes: list[NodeBase] = [] | ||||||
base_nodes.extend(self._list_run_magic_lines(tree)) | ||||||
base_nodes.extend(SysPathChange.extract_from_tree(self._session_state, tree)) | ||||||
if len(base_nodes) == 0: | ||||||
self._python_linter.append_tree(tree) | ||||||
return | ||||||
# append globals | ||||||
globs = cast(Module, tree.node).globals | ||||||
self._python_linter.append_globals(globs) | ||||||
# need to execute things in intertwined sequence so concat and sort them | ||||||
nodes = list(cast(Module, tree.node).body) | ||||||
base_nodes = sorted(base_nodes, key=lambda node: (node.node.lineno, node.node.col_offset)) | ||||||
yield from self._load_children_with_base_nodes(nodes, base_nodes) | ||||||
# append remaining nodes | ||||||
self._python_linter.append_nodes(nodes) | ||||||
failure = dataclasses.replace( | ||||||
maybe_tree.failure, | ||||||
start_line=maybe_tree.failure.start_line + python_cell.original_offset, | ||||||
end_line=maybe_tree.failure.end_line + python_cell.original_offset, | ||||||
) | ||||||
return MaybeTree(None, failure) | ||||||
assert maybe_tree.tree is not None | ||||||
JCZuurmond marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
maybe_child_tree = self._parse_tree(maybe_tree.tree) | ||||||
if maybe_child_tree.failure: | ||||||
return maybe_child_tree | ||||||
return maybe_tree | ||||||
|
||||||
def _parse_tree(self, tree: Tree) -> MaybeTree: | ||||||
"""Parse tree by looking for referred notebooks and path changes that might affect loading notebooks.""" | ||||||
code_path_nodes = self._list_magic_lines_with_run_command(tree) + SysPathChange.extract_from_tree( | ||||||
self._session_state, tree | ||||||
) | ||||||
maybe_tree = MaybeTree(None, None) | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This should also be:
Suggested change
That will then let the linter expose a problem: if there are no What should we return in that case? (Can it be ruled out… do we always have nodes to iterate over?) Finally, why do we return the last tree? Is it somehow more important than any earlier ones? (If we don't care, why return it? In that case the return type should probably be There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. On your first point, I updated the signature to be On the "why do we return the last tree", that logic is t.b.d.. At least, this PR resolves a bug with the notebook linting and adds (much) more unit tests. At the same time, I do not know exactly (yet) how the Python trees should be linked/merged for all to work as expected. So, currently, it returns the last tree as the trees are sort of chained, e.g. the second notebook can go "up" to the first cell and the third cell can go "up" to the second (and via the second cell "find" the first cell). It becomes more complicated when introducing new notebooks with the |
||||||
# Sys path changes require to load children in order of reading | ||||||
for base_node in sorted(code_path_nodes, key=lambda node: (node.node.lineno, node.node.col_offset)): | ||||||
maybe_tree = self._process_code_node(base_node, parent_tree=tree) | ||||||
if maybe_tree.failure: | ||||||
return maybe_tree | ||||||
return maybe_tree | ||||||
|
||||||
@staticmethod | ||||||
def _list_run_magic_lines(tree: Tree) -> Iterable[MagicLine]: | ||||||
|
||||||
def _ignore_problem(_code: str, _message: str, _node: NodeNG) -> None: | ||||||
return None | ||||||
|
||||||
commands, _ = MagicLine.extract_from_tree(tree, _ignore_problem) | ||||||
for command in commands: | ||||||
if isinstance(command.as_magic(), RunCommand): | ||||||
yield command | ||||||
|
||||||
def _load_children_with_base_nodes(self, nodes: list[NodeNG], base_nodes: list[NodeBase]) -> Iterable[Advice]: | ||||||
for base_node in base_nodes: | ||||||
yield from self._load_children_with_base_node(nodes, base_node) | ||||||
|
||||||
def _load_children_with_base_node(self, nodes: list[NodeNG], base_node: NodeBase) -> Iterable[Advice]: | ||||||
while len(nodes) > 0: | ||||||
node = nodes.pop(0) | ||||||
self._python_linter.append_nodes([node]) | ||||||
if node.lineno < base_node.node.lineno: | ||||||
continue | ||||||
yield from self._load_children_from_base_node(base_node) | ||||||
|
||||||
def _load_children_from_base_node(self, base_node: NodeBase) -> Iterable[Advice]: | ||||||
if isinstance(base_node, SysPathChange): | ||||||
yield from self._mutate_path_lookup(base_node) | ||||||
return | ||||||
if isinstance(base_node, MagicLine): | ||||||
magic = base_node.as_magic() | ||||||
def _list_magic_lines_with_run_command(tree: Tree) -> list[MagicLine]: | ||||||
"""List the magic lines with a run command""" | ||||||
run_commands = [] | ||||||
magic_lines, _ = MagicLine.extract_from_tree(tree, lambda code, message, node: None) | ||||||
for magic_line in magic_lines: | ||||||
if isinstance(magic_line.as_magic(), RunCommand): | ||||||
run_commands.append(magic_line) | ||||||
return run_commands | ||||||
|
||||||
def _process_code_node(self, node: SysPathChange | MagicLine, *, parent_tree: Tree) -> MaybeTree: | ||||||
"""Process a code node. | ||||||
|
||||||
1. `SysPathChange` mutate the path lookup. | ||||||
2. `MagicLine` containing a `RunCommand` run other notebooks that should be parsed. | ||||||
""" | ||||||
if isinstance(node, SysPathChange): | ||||||
failure = self._mutate_path_lookup(node) | ||||||
if failure: | ||||||
return MaybeTree(None, failure) | ||||||
if isinstance(node, MagicLine): | ||||||
magic = node.as_magic() | ||||||
assert isinstance(magic, RunCommand) | ||||||
notebook = self._load_source_from_path(magic.notebook_path) | ||||||
notebook = self._resolve_and_parse_notebook_path(magic.notebook_path) | ||||||
if notebook is None: | ||||||
yield Advisory.from_node( | ||||||
failure = Failure.from_node( | ||||||
code='dependency-not-found', | ||||||
message=f"Can't locate dependency: {magic.notebook_path}", | ||||||
node=base_node.node, | ||||||
node=node.node, | ||||||
) | ||||||
return | ||||||
yield from self._load_tree_from_notebook(notebook, False) | ||||||
return | ||||||
|
||||||
def _mutate_path_lookup(self, change: SysPathChange) -> Iterable[Advice]: | ||||||
return MaybeTree(None, failure) | ||||||
maybe_tree = self._parse_notebook(notebook, parent_tree=parent_tree) | ||||||
if maybe_tree.tree: | ||||||
# From the perspective of this node, a run node pulls the globals from the child notebook in | ||||||
parent_tree.extend_globals(maybe_tree.tree.node.globals) | ||||||
return maybe_tree | ||||||
return MaybeTree(None, None) | ||||||
JCZuurmond marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
|
||||||
def _mutate_path_lookup(self, change: SysPathChange) -> Failure | None: | ||||||
"""Mutate the path lookup.""" | ||||||
if isinstance(change, UnresolvedPath): | ||||||
yield Advisory.from_node( | ||||||
return Failure.from_node( | ||||||
code='sys-path-cannot-compute-value', | ||||||
message=f"Can't update sys.path from {change.node.as_string()} because the expression cannot be computed", | ||||||
node=change.node, | ||||||
) | ||||||
return | ||||||
change.apply_to(self._path_lookup) | ||||||
return None | ||||||
|
||||||
def _load_tree_from_run_cell(self, run_cell: RunCell) -> Iterable[Advice]: | ||||||
path = run_cell.maybe_notebook_path() | ||||||
if path is None: | ||||||
return # malformed run cell already reported | ||||||
notebook = self._load_source_from_path(path) | ||||||
if notebook is not None: | ||||||
yield from self._load_tree_from_notebook(notebook, False) | ||||||
|
||||||
def _load_source_from_path(self, path: Path | None) -> Notebook | None: | ||||||
def _resolve_and_parse_notebook_path(self, path: Path | None) -> Notebook | None: | ||||||
"""Resolve and parse notebook path.""" | ||||||
if path is None: | ||||||
return None # already reported during dependency building | ||||||
resolved = self._path_lookup.resolve(path) | ||||||
|
@@ -302,15 +297,6 @@ def _load_source_from_path(self, path: Path | None) -> Notebook | None: | |||||
return None | ||||||
return Notebook.parse(path, source, language) | ||||||
|
||||||
def _linter(self, language: Language) -> Linter: | ||||||
if language is Language.PYTHON: | ||||||
return self._python_linter | ||||||
return self._context.linter(language) | ||||||
|
||||||
@staticmethod | ||||||
def name() -> str: | ||||||
return "notebook-linter" | ||||||
|
||||||
|
||||||
class FileLinter: | ||||||
_NOT_YET_SUPPORTED_SUFFIXES = { | ||||||
|
@@ -417,8 +403,6 @@ def _lint_file(self) -> Iterable[Advice]: | |||||
else: | ||||||
try: | ||||||
linter = self._ctx.linter(language) | ||||||
if self._inherited_tree is not None and isinstance(linter, PythonSequentialLinter): | ||||||
linter.append_tree(self._inherited_tree) | ||||||
yield from linter.lint(self._content) | ||||||
except ValueError as err: | ||||||
failure_message = f"Error while parsing content of {self._path.as_posix()}: {err}" | ||||||
|
@@ -432,10 +416,6 @@ def _lint_notebook(self) -> Iterable[Advice]: | |||||
return | ||||||
notebook = Notebook.parse(self._path, self._content, language) | ||||||
notebook_linter = NotebookLinter( | ||||||
self._ctx, | ||||||
self._path_lookup, | ||||||
self._session_state, | ||||||
notebook, | ||||||
self._inherited_tree, | ||||||
self._ctx, self._path_lookup, self._session_state, notebook, self._inherited_tree | ||||||
) | ||||||
yield from notebook_linter.lint() |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This was only used by tests, it should not be the way to create the notebook linter in UCX