Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[TECH DEBT] Improve file handling by FileLoader #3640

Merged
merged 8 commits into from
Feb 5, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 18 additions & 16 deletions src/databricks/labs/ucx/source_code/files.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

from databricks.sdk.service.workspace import Language

from databricks.labs.ucx.source_code.base import file_language
from databricks.labs.ucx.source_code.base import file_language, safe_read_text
from databricks.labs.ucx.source_code.graph import (
SourceContainer,
DependencyGraph,
Expand Down Expand Up @@ -34,6 +34,11 @@ def __init__(self, path: Path, source: str, language: Language):
self._original_code = source
self._language = language

@property
def content(self) -> str:
"""The file content"""
return self._original_code

def build_dependency_graph(self, parent: DependencyGraph) -> list[DependencyProblem]:
"""The dependency graph for the local file."""
if self._language == Language.PYTHON:
Expand Down Expand Up @@ -76,23 +81,20 @@ def build_dependency_graph(self, parent: DependencyGraph) -> list[DependencyProb


class FileLoader(DependencyLoader):
def load_dependency(self, path_lookup: PathLookup, dependency: Dependency) -> SourceContainer | None:
absolute_path = path_lookup.resolve(dependency.path)
if not absolute_path:
"""Loader for a file dependency."""

def load_dependency(self, path_lookup: PathLookup, dependency: Dependency) -> LocalFile | StubContainer | None:
"""Load the dependency."""
resolved_path = path_lookup.resolve(dependency.path)
if not resolved_path:
return None
language = file_language(absolute_path)
language = file_language(resolved_path)
if not language:
return StubContainer(absolute_path)
for encoding in ("utf-8", "ascii"):
try:
code = absolute_path.read_text(encoding)
return LocalFile(absolute_path, code, language)
except UnicodeDecodeError:
pass
return None

def exists(self, path: Path) -> bool:
return path.exists()
return StubContainer(resolved_path)
content = safe_read_text(resolved_path)
if content is None:
return None
return LocalFile(resolved_path, content, language)

def __repr__(self):
return "FileLoader()"
Expand Down
13 changes: 10 additions & 3 deletions src/databricks/labs/ucx/source_code/folders.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,13 @@

from databricks.labs.ucx.source_code.base import is_a_notebook
from databricks.labs.ucx.source_code.files import FileLoader
from databricks.labs.ucx.source_code.graph import SourceContainer, DependencyGraph, DependencyProblem, Dependency
from databricks.labs.ucx.source_code.graph import (
Dependency,
DependencyGraph,
DependencyLoader,
DependencyProblem,
SourceContainer,
)
from databricks.labs.ucx.source_code.notebooks.loaders import NotebookLoader
from databricks.labs.ucx.source_code.path_lookup import PathLookup

Expand Down Expand Up @@ -46,13 +52,14 @@ def __repr__(self):
return f"<Folder {self._path}>"


class FolderLoader(FileLoader):
class FolderLoader(DependencyLoader):
"""Load a folder."""

def __init__(self, notebook_loader: NotebookLoader, file_loader: FileLoader):
self._notebook_loader = notebook_loader
self._file_loader = file_loader

def load_dependency(self, path_lookup: PathLookup, dependency: Dependency) -> SourceContainer | None:
def load_dependency(self, path_lookup: PathLookup, dependency: Dependency) -> Folder | None:
absolute_path = path_lookup.resolve(dependency.path)
if not absolute_path:
return None
Expand Down
30 changes: 25 additions & 5 deletions tests/unit/conftest.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from __future__ import annotations

import os
from pathlib import Path
import sys
Expand Down Expand Up @@ -189,11 +191,29 @@ def acc_client():


class MockPathLookup(PathLookup):
def __init__(self, cwd='source_code/samples', sys_paths: list[Path] | None = None):
super().__init__(Path(__file__).parent / cwd, sys_paths or [])

def change_directory(self, new_working_directory: Path) -> 'MockPathLookup':
return MockPathLookup(new_working_directory, self._sys_paths)
"""A path look up for the testing code samples."""

def __init__(
self,
cwd=Path(__file__).parent / "source_code/samples",
sys_paths: list[Path] | None = None,
):
super().__init__(cwd, sys_paths or [])

self.successfully_resolved_paths = set[Path]() # The paths that were successfully resolved
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note that this attribute should be used in tests instead of passing a set for the linted paths as this is correct and it actually was a bug that the paths were not resolved


def resolve(self, path: Path) -> Path | None:
"""Resolve a path from the context of the lookup."""
resolved_path = super().resolve(path)
if resolved_path:
self.successfully_resolved_paths.add(path)
return resolved_path

def change_directory(self, new_working_directory: Path) -> MockPathLookup:
path_lookup = MockPathLookup(new_working_directory, self._sys_paths)
# For testing, we want to keep of the successfully resolved paths after directory changes
path_lookup.successfully_resolved_paths = self.successfully_resolved_paths
return path_lookup

def __repr__(self):
return f"<MockPathLookup {self._cwd}, sys.path: {self._sys_paths}>"
Expand Down
2 changes: 1 addition & 1 deletion tests/unit/source_code/test_dependencies.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ def test_dependency_resolver_raises_problem_with_unresolved_root_notebook(simple
def test_dependency_resolver_raises_problem_with_unloadable_root_file(mock_path_lookup, mock_notebook_resolver) -> None:

class FailingFileLoader(FileLoader):
def load_dependency(self, path_lookup: PathLookup, dependency: Dependency) -> SourceContainer | None:
def load_dependency(self, path_lookup: PathLookup, dependency: Dependency) -> None:
return None

file_loader = FailingFileLoader()
Expand Down
91 changes: 90 additions & 1 deletion tests/unit/source_code/test_files.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,20 @@
import codecs
from pathlib import Path
from unittest.mock import create_autospec

import pytest
from databricks.sdk.service.workspace import Language

from databricks.labs.ucx.source_code.base import CurrentSessionState
from databricks.labs.ucx.source_code.files import FileLoader, LocalFile
from databricks.labs.ucx.source_code.graph import Dependency, DependencyGraph, DependencyProblem
from databricks.labs.ucx.source_code.graph import Dependency
from databricks.labs.ucx.source_code.path_lookup import PathLookup
from databricks.labs.ucx.source_code.graph import DependencyGraph, DependencyProblem


def test_local_file_content_is_accessible() -> None:
local_file = LocalFile(Path("test.py"), "print(1)", Language.PYTHON)
assert local_file.content == "print(1)"


@pytest.mark.parametrize("language", [Language.SQL, Language.SCALA, Language.R])
Expand Down Expand Up @@ -89,3 +98,83 @@ def test_local_file_builds_inherited_context_with_python_parse_error_problem(
Path("test.py"),
)
]


def test_file_loader_loads_file_without_permission() -> None:
path = create_autospec(Path)
path.suffix = ".py"
path.open.side_effect = PermissionError("Permission denied")
dependency = Dependency(FileLoader(), path)
path_lookup = create_autospec(PathLookup)
path_lookup.resolve.return_value = path

local_file = dependency.load(path_lookup)

# TODO: Test specific error while loading: https://github.com/databrickslabs/ucx/issues/3584
assert local_file is None
path.open.assert_called_once()
path_lookup.resolve.assert_called_once_with(path)


def test_file_loader_loads_non_ascii_file(mock_path_lookup) -> None:
dependency = Dependency(FileLoader(), Path("nonascii.py"))

local_file = dependency.load(mock_path_lookup)

# TODO: Test specific error while loading: https://github.com/databrickslabs/ucx/issues/3584
assert local_file is None
assert Path("nonascii.py") in mock_path_lookup.successfully_resolved_paths


def test_file_loader_loads_non_existing_file() -> None:
path = create_autospec(Path)
path.suffix = ".py"
path.open.side_effect = FileNotFoundError("No such file or directory: 'test.py'")
path_lookup = create_autospec(PathLookup)
path_lookup.resolve.return_value = path

dependency = Dependency(FileLoader(), path)
local_file = dependency.load(path_lookup)

assert local_file is None
path.open.assert_called_once()
path_lookup.resolve.assert_called_once_with(path)


@pytest.mark.parametrize(
"bom, encoding",
[
(codecs.BOM_UTF8, "utf-8"),
(codecs.BOM_UTF16_LE, "utf-16-le"),
(codecs.BOM_UTF16_BE, "utf-16-be"),
(codecs.BOM_UTF32_LE, "utf-32-le"),
(codecs.BOM_UTF32_BE, "utf-32-be"),
],
)
def test_file_loader_loads_file_with_bom(tmp_path, bom, encoding) -> None:
path = tmp_path / "file.py"
path.write_bytes(bom + "a = 12".encode(encoding))
dependency = Dependency(FileLoader(), path)
path_lookup = create_autospec(PathLookup)
path_lookup.resolve.return_value = path

local_file = dependency.load(path_lookup)

# TODO: Test specific error while loading: https://github.com/databrickslabs/ucx/issues/3584
assert isinstance(local_file, LocalFile)
assert local_file.content == "a = 12"
path_lookup.resolve.assert_called_once_with(path)


def test_file_loader_loads_empty_file(tmp_path) -> None:
path = tmp_path / "empty.py"
path.write_text("")
dependency = Dependency(FileLoader(), path)
path_lookup = create_autospec(PathLookup)
path_lookup.resolve.return_value = path

local_file = dependency.load(path_lookup)

# TODO: Test specific error while loading: https://github.com/databrickslabs/ucx/issues/3584
assert local_file
path_lookup.resolve.assert_called_once_with(path)