diff --git a/src/databricks/labs/ucx/source_code/files.py b/src/databricks/labs/ucx/source_code/files.py index f30ca4bb23..d35f6b138e 100644 --- a/src/databricks/labs/ucx/source_code/files.py +++ b/src/databricks/labs/ucx/source_code/files.py @@ -6,7 +6,7 @@ from databricks.sdk.service.workspace import Language -from databricks.labs.ucx.source_code.base import file_language +from databricks.labs.ucx.source_code.base import file_language, safe_read_text from databricks.labs.ucx.source_code.graph import ( SourceContainer, DependencyGraph, @@ -34,6 +34,11 @@ def __init__(self, path: Path, source: str, language: Language): self._original_code = source self._language = language + @property + def content(self) -> str: + """The file content""" + return self._original_code + def build_dependency_graph(self, parent: DependencyGraph) -> list[DependencyProblem]: """The dependency graph for the local file.""" if self._language == Language.PYTHON: @@ -76,23 +81,20 @@ def build_dependency_graph(self, parent: DependencyGraph) -> list[DependencyProb class FileLoader(DependencyLoader): - def load_dependency(self, path_lookup: PathLookup, dependency: Dependency) -> SourceContainer | None: - absolute_path = path_lookup.resolve(dependency.path) - if not absolute_path: + """Loader for a file dependency.""" + + def load_dependency(self, path_lookup: PathLookup, dependency: Dependency) -> LocalFile | StubContainer | None: + """Load the dependency.""" + resolved_path = path_lookup.resolve(dependency.path) + if not resolved_path: return None - language = file_language(absolute_path) + language = file_language(resolved_path) if not language: - return StubContainer(absolute_path) - for encoding in ("utf-8", "ascii"): - try: - code = absolute_path.read_text(encoding) - return LocalFile(absolute_path, code, language) - except UnicodeDecodeError: - pass - return None - - def exists(self, path: Path) -> bool: - return path.exists() + return StubContainer(resolved_path) + content = safe_read_text(resolved_path) + if content is None: + return None + return LocalFile(resolved_path, content, language) def __repr__(self): return "FileLoader()" diff --git a/src/databricks/labs/ucx/source_code/folders.py b/src/databricks/labs/ucx/source_code/folders.py index be4e2d5a6f..7e71f6847a 100644 --- a/src/databricks/labs/ucx/source_code/folders.py +++ b/src/databricks/labs/ucx/source_code/folders.py @@ -5,7 +5,13 @@ from databricks.labs.ucx.source_code.base import is_a_notebook from databricks.labs.ucx.source_code.files import FileLoader -from databricks.labs.ucx.source_code.graph import SourceContainer, DependencyGraph, DependencyProblem, Dependency +from databricks.labs.ucx.source_code.graph import ( + Dependency, + DependencyGraph, + DependencyLoader, + DependencyProblem, + SourceContainer, +) from databricks.labs.ucx.source_code.notebooks.loaders import NotebookLoader from databricks.labs.ucx.source_code.path_lookup import PathLookup @@ -46,13 +52,14 @@ def __repr__(self): return f"<Folder {self._path}>" -class FolderLoader(FileLoader): +class FolderLoader(DependencyLoader): + """Load a folder.""" def __init__(self, notebook_loader: NotebookLoader, file_loader: FileLoader): self._notebook_loader = notebook_loader self._file_loader = file_loader - def load_dependency(self, path_lookup: PathLookup, dependency: Dependency) -> SourceContainer | None: + def load_dependency(self, path_lookup: PathLookup, dependency: Dependency) -> Folder | None: absolute_path = path_lookup.resolve(dependency.path) if not absolute_path: return None diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index 548a15b003..dae0adbc7b 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import os from pathlib import Path import sys @@ -189,11 +191,29 @@ def acc_client(): class MockPathLookup(PathLookup): - def __init__(self, cwd='source_code/samples', sys_paths: list[Path] | None = None): - super().__init__(Path(__file__).parent / cwd, sys_paths or []) - - def change_directory(self, new_working_directory: Path) -> 'MockPathLookup': - return MockPathLookup(new_working_directory, self._sys_paths) + """A path look up for the testing code samples.""" + + def __init__( + self, + cwd=Path(__file__).parent / "source_code/samples", + sys_paths: list[Path] | None = None, + ): + super().__init__(cwd, sys_paths or []) + + self.successfully_resolved_paths = set[Path]() # The paths that were successfully resolved + + def resolve(self, path: Path) -> Path | None: + """Resolve a path from the context of the lookup.""" + resolved_path = super().resolve(path) + if resolved_path: + self.successfully_resolved_paths.add(path) + return resolved_path + + def change_directory(self, new_working_directory: Path) -> MockPathLookup: + path_lookup = MockPathLookup(new_working_directory, self._sys_paths) + # For testing, we want to keep of the successfully resolved paths after directory changes + path_lookup.successfully_resolved_paths = self.successfully_resolved_paths + return path_lookup def __repr__(self): return f"<MockPathLookup {self._cwd}, sys.path: {self._sys_paths}>" diff --git a/tests/unit/source_code/test_dependencies.py b/tests/unit/source_code/test_dependencies.py index aa1c2c0a0f..d5051a53de 100644 --- a/tests/unit/source_code/test_dependencies.py +++ b/tests/unit/source_code/test_dependencies.py @@ -170,7 +170,7 @@ def test_dependency_resolver_raises_problem_with_unresolved_root_notebook(simple def test_dependency_resolver_raises_problem_with_unloadable_root_file(mock_path_lookup, mock_notebook_resolver) -> None: class FailingFileLoader(FileLoader): - def load_dependency(self, path_lookup: PathLookup, dependency: Dependency) -> SourceContainer | None: + def load_dependency(self, path_lookup: PathLookup, dependency: Dependency) -> None: return None file_loader = FailingFileLoader() diff --git a/tests/unit/source_code/test_files.py b/tests/unit/source_code/test_files.py index 5ce2464ef5..eb81bdb26a 100644 --- a/tests/unit/source_code/test_files.py +++ b/tests/unit/source_code/test_files.py @@ -1,11 +1,20 @@ +import codecs from pathlib import Path +from unittest.mock import create_autospec import pytest from databricks.sdk.service.workspace import Language from databricks.labs.ucx.source_code.base import CurrentSessionState from databricks.labs.ucx.source_code.files import FileLoader, LocalFile -from databricks.labs.ucx.source_code.graph import Dependency, DependencyGraph, DependencyProblem +from databricks.labs.ucx.source_code.graph import Dependency +from databricks.labs.ucx.source_code.path_lookup import PathLookup +from databricks.labs.ucx.source_code.graph import DependencyGraph, DependencyProblem + + +def test_local_file_content_is_accessible() -> None: + local_file = LocalFile(Path("test.py"), "print(1)", Language.PYTHON) + assert local_file.content == "print(1)" @pytest.mark.parametrize("language", [Language.SQL, Language.SCALA, Language.R]) @@ -89,3 +98,83 @@ def test_local_file_builds_inherited_context_with_python_parse_error_problem( Path("test.py"), ) ] + + +def test_file_loader_loads_file_without_permission() -> None: + path = create_autospec(Path) + path.suffix = ".py" + path.open.side_effect = PermissionError("Permission denied") + dependency = Dependency(FileLoader(), path) + path_lookup = create_autospec(PathLookup) + path_lookup.resolve.return_value = path + + local_file = dependency.load(path_lookup) + + # TODO: Test specific error while loading: https://github.com/databrickslabs/ucx/issues/3584 + assert local_file is None + path.open.assert_called_once() + path_lookup.resolve.assert_called_once_with(path) + + +def test_file_loader_loads_non_ascii_file(mock_path_lookup) -> None: + dependency = Dependency(FileLoader(), Path("nonascii.py")) + + local_file = dependency.load(mock_path_lookup) + + # TODO: Test specific error while loading: https://github.com/databrickslabs/ucx/issues/3584 + assert local_file is None + assert Path("nonascii.py") in mock_path_lookup.successfully_resolved_paths + + +def test_file_loader_loads_non_existing_file() -> None: + path = create_autospec(Path) + path.suffix = ".py" + path.open.side_effect = FileNotFoundError("No such file or directory: 'test.py'") + path_lookup = create_autospec(PathLookup) + path_lookup.resolve.return_value = path + + dependency = Dependency(FileLoader(), path) + local_file = dependency.load(path_lookup) + + assert local_file is None + path.open.assert_called_once() + path_lookup.resolve.assert_called_once_with(path) + + +@pytest.mark.parametrize( + "bom, encoding", + [ + (codecs.BOM_UTF8, "utf-8"), + (codecs.BOM_UTF16_LE, "utf-16-le"), + (codecs.BOM_UTF16_BE, "utf-16-be"), + (codecs.BOM_UTF32_LE, "utf-32-le"), + (codecs.BOM_UTF32_BE, "utf-32-be"), + ], +) +def test_file_loader_loads_file_with_bom(tmp_path, bom, encoding) -> None: + path = tmp_path / "file.py" + path.write_bytes(bom + "a = 12".encode(encoding)) + dependency = Dependency(FileLoader(), path) + path_lookup = create_autospec(PathLookup) + path_lookup.resolve.return_value = path + + local_file = dependency.load(path_lookup) + + # TODO: Test specific error while loading: https://github.com/databrickslabs/ucx/issues/3584 + assert isinstance(local_file, LocalFile) + assert local_file.content == "a = 12" + path_lookup.resolve.assert_called_once_with(path) + + +def test_file_loader_loads_empty_file(tmp_path) -> None: + path = tmp_path / "empty.py" + path.write_text("") + dependency = Dependency(FileLoader(), path) + path_lookup = create_autospec(PathLookup) + path_lookup.resolve.return_value = path + + local_file = dependency.load(path_lookup) + + # TODO: Test specific error while loading: https://github.com/databrickslabs/ucx/issues/3584 + assert local_file + path_lookup.resolve.assert_called_once_with(path)