From d7fa66bcc6f3938160566d37b04dbc939527b8da Mon Sep 17 00:00:00 2001 From: Colin Campbell Date: Fri, 23 May 2025 11:59:45 -0400 Subject: [PATCH] Update gitignore implementation to support more cases --- .gitignore | 3 + pyproject.toml | 162 +++++++++++ .../_internal/core/models/repos/local.py | 6 + src/dstack/_internal/utils/ignore.py | 139 +++++----- src/tests/_internal/utils/test_gitignore.py | 256 ++++++++++++++++++ 5 files changed, 501 insertions(+), 65 deletions(-) create mode 100644 pyproject.toml create mode 100644 src/tests/_internal/utils/test_gitignore.py diff --git a/.gitignore b/.gitignore index 344e57de0..c36ec92c3 100644 --- a/.gitignore +++ b/.gitignore @@ -21,3 +21,6 @@ build/ .fleet .env .aider* +uv.lock +.local/ +.claude/settings.local.json diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 000000000..9aba61b00 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,162 @@ +[project] +name = "dstack" +dynamic = ["version", "readme"] +authors = [{ name = "Andrey Cheptsov", email = "andrey@dstack.ai" }] +description = "dstack is an open-source orchestration engine for running AI workloads on any cloud or on-premises." 
+requires-python = ">=3.9" +classifiers = [ + "Development Status :: 4 - Beta", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)", + "Programming Language :: Python :: 3", +] +dependencies = [ + "pyyaml", + "requests", + "typing-extensions>=4.0.0", + "cryptography", + "packaging", + "python-dateutil", + "cachetools", + "gitpython", + "jsonschema", + "paramiko>=3.2.0", + "cursor", + "rich", + "rich-argparse", + "tqdm", + "simple-term-menu", + "pydantic>=1.10.10,<2.0.0", + "pydantic-duality>=1.2.4", + "websocket-client", + "python-multipart>=0.0.16", + "filelock", + "psutil", + "gpuhunt==0.1.6", + "argcomplete>=3.5.0", + "gitignore-parser>=0.1.12", +] + +[project.urls] +Homepage = "https://dstack.ai" +Source = "https://github.com/dstackai/dstack" +Documentation = "https://dstack.ai/docs" +Issues = "https://github.com/dstackai/dstack/issues" +Changelog = "https://github.com/dstackai/dstack/releases" +Discord = "https://discord.gg/u8SmfwPpMd" + +[build-system] +requires = ["hatchling", "hatch-fancy-pypi-readme"] +build-backend = "hatchling.build" + +[project.scripts] +dstack = "dstack._internal.cli.main:main" + +[tool.hatch.version] +path = "src/dstack/version.py" + +[tool.hatch.build.targets.sdist] +artifacts = ["src/dstack/_internal/server/statics/**"] + +[tool.hatch.build.targets.wheel] +artifacts = ["src/dstack/_internal/server/statics/**"] + +[tool.hatch.metadata.hooks.fancy-pypi-readme] +content-type = "text/markdown" + +[[tool.hatch.metadata.hooks.fancy-pypi-readme.fragments]] +path = "README.md" + +[[tool.hatch.metadata.hooks.fancy-pypi-readme.substitutions]] +pattern = '\s*|]*>\s*|\s*|]*>\s*|\s*|### Demo\s*' +replacement = '' +ignore-case = true + +[dependency-groups] +dev = [ + "build>=1.2.2.post1", + "httpx>=0.28.1", + "pre-commit>=4.2.0", + "pytest-asyncio>=0.23.8", + "pytest-httpbin>=2.1.0", + "httpbin>=0.10.2", # indirect to make compatible with Werkzeug 3 + "pytest~=7.2", + 
"pytest-socket>=0.7.0", + "requests-mock>=1.12.1", + "openai>=1.68.2", + "freezegun>=1.5.1", + "ruff==0.11.6", # should match .pre-commit-config.yaml + "testcontainers>=4.9.2", + "pytest-xdist>=3.6.1", +] + +[project.optional-dependencies] +gateway = [ + "fastapi", + "starlette>=0.26.0", + "uvicorn", + "aiorwlock", + "aiocache", + "httpx", + "jinja2", +] +server = [ + "fastapi", + "starlette>=0.26.0", + "uvicorn", + "aiorwlock", + "aiocache", + "httpx", + "jinja2", + "watchfiles", + "sqlalchemy[asyncio]>=2.0.0", + "sqlalchemy_utils>=0.40.0", + "alembic>=1.10.2", + "apscheduler<4", + "aiosqlite", + "docker>=6.0.0", + "python-dxf==12.1.0", + "sentry-sdk[fastapi]", + "alembic-postgresql-enum", + "asyncpg", + "python-json-logger>=3.1.0", + "prometheus-client", + "grpcio>=1.50", + "backports.entry-points-selectable", +] +aws = ["boto3>=1.38.13", "botocore", "dstack[server]"] +azure = [ + "azure-identity>=1.12.0", + "azure-mgmt-subscription>=3.1.1", + "azure-mgmt-compute>=29.1.0", + "azure-mgmt-network>=23.0.0,<28.0.0", + "azure-mgmt-resource>=22.0.0", + "azure-mgmt-authorization>=3.0.0", + "azure-mgmt-msi>=7.0.0", + "dstack[server]", +] +gcp = [ + "google-auth>=2.3.0", + "google-cloud-storage>=2.0.0", + "google-cloud-compute>=1.5.0", + "google-cloud-logging>=2.0.0", + "google-api-python-client>=2.80.0", + "google-cloud-billing>=1.11.0", + "google-cloud-tpu>=1.18.3", + "dstack[server]", +] +datacrunch = ["datacrunch", "dstack[server]"] +kubernetes = ["kubernetes", "dstack[server]"] +lambda = ["boto3>=1.38.13", "botocore", "dstack[server]"] +oci = [ + "oci>=2.150.0", + "cryptography>=44.0.3", + # pyopenssl is indirect to avoid uv falling back to the old version + # due to an upper limit from oci + "pyopenssl>=23.2.0", + "dstack[server]", +] +nebius = ["nebius>=0.2.19,<0.3; python_version >= '3.10'", "dstack[server]"] +all = [ + "dstack[gateway,server,aws,azure,gcp,datacrunch,kubernetes,lambda,nebius,oci]", +] diff --git a/src/dstack/_internal/core/models/repos/local.py 
b/src/dstack/_internal/core/models/repos/local.py index 1bc815f12..cbeb07749 100644 --- a/src/dstack/_internal/core/models/repos/local.py +++ b/src/dstack/_internal/core/models/repos/local.py @@ -5,10 +5,14 @@ from typing_extensions import Literal from dstack._internal.core.models.repos.base import BaseRepoInfo, Repo +from dstack._internal.utils.common import sizeof_fmt from dstack._internal.utils.hash import get_sha256, slugify from dstack._internal.utils.ignore import GitIgnore +from dstack._internal.utils.logging import get_logger from dstack._internal.utils.path import PathLike +logger = get_logger(__name__) + class LocalRepoInfo(BaseRepoInfo): repo_type: Literal["local"] = "local" @@ -75,6 +79,8 @@ def write_code_file(self, fp: BinaryIO) -> str: arcname="", filter=TarIgnore(self.run_repo_data.repo_dir, globs=[".git"]), ) + + logger.debug(f"Code file size: {sizeof_fmt(fp.tell())} bytes") return get_sha256(fp) def get_repo_info(self) -> LocalRepoInfo: diff --git a/src/dstack/_internal/utils/ignore.py b/src/dstack/_internal/utils/ignore.py index cfe83e2e7..2d55e84a3 100644 --- a/src/dstack/_internal/utils/ignore.py +++ b/src/dstack/_internal/utils/ignore.py @@ -1,7 +1,8 @@ -import fnmatch -from itertools import zip_longest +import os from pathlib import Path -from typing import Dict, List, Optional +from typing import List + +from gitignore_parser import parse_gitignore_str from dstack._internal.utils.path import PathLike @@ -16,75 +17,83 @@ def __init__( if ignore_files is not None else [".gitignore", ".git/info/exclude", ".dstackignore"] ) - self.ignore_globs: Dict[str, List[str]] = {".": globs or []} - self.load_recursive() + self.parser = None + self._create_combined_parser(globs or []) - def load_ignore_file(self, path: str, ignore_file: Path): - if path not in self.ignore_globs: - self.ignore_globs[path] = [] - with ignore_file.open("r") as f: - for line in f: - line = self.rstrip(line.rstrip("\n")).rstrip("/") - line = line.replace("\\ ", " ") - if 
def _create_combined_parser(self, additional_globs: List[str]):
    """Build the single matcher that ``ignore()`` delegates to.

    Patterns are gathered from every ignore file found under
    ``self.root_dir``; ``additional_globs`` are appended after them. The
    combined text is parsed once with ``self.root_dir`` as the base
    directory and the resulting callable is stored in ``self.parser``.

    Args:
        additional_globs: Extra gitignore-style patterns supplied by the
            caller (may be empty).
    """
    all_patterns: List[str] = []
    self._collect_patterns_recursive(self.root_dir, all_patterns)
    all_patterns.extend(additional_globs)
    self.parser = parse_gitignore_str("\n".join(all_patterns), self.root_dir)


def _collect_patterns_recursive(self, path: Path, patterns: List[str]):
    """Append the patterns of every ignore file at or below ``path``.

    All patterns end up in one flat list that is later parsed relative to
    ``self.root_dir``, so a pattern read from a nested ignore file is
    rewritten to be rooted at that file's directory:

    * a leading ``!`` (negation) is kept in front of the rewritten pattern;
    * a pattern with a leading or interior ``/`` is anchored to the ignore
      file's directory and becomes ``<dir>/<pattern>``;
    * any other pattern may match at any depth below that directory and
      becomes ``<dir>/<pattern>`` plus ``<dir>/**/<pattern>``.

    Patterns from ignore files in ``self.root_dir`` itself are appended
    verbatim. Blank lines and ``#`` comments are dropped. Unreadable ignore
    files and unreadable directories are skipped silently (best effort).

    Args:
        path: Directory to scan; must be ``self.root_dir`` or below it.
        patterns: Output list, extended in place.
    """
    # Always "/"-separated: gitignore syntax does not accept "\\" as a separator.
    prefix = "" if path == self.root_dir else path.relative_to(self.root_dir).as_posix()

    for ignore_file_name in self.ignore_files:
        try:
            with (path / ignore_file_name).open("r", encoding="utf-8", errors="ignore") as f:
                lines = [line.strip() for line in f]
        except OSError:
            # Missing or unreadable ignore file: nothing to contribute.
            continue

        for line in lines:
            if not line or line.startswith("#"):
                continue
            if not prefix:
                patterns.append(line)
                continue

            # Split off the negation marker so it stays at the very front.
            negation = "!" if line.startswith("!") else ""
            body = line[len(negation):]
            if not body:
                continue

            # A trailing "/" only means "directories only"; it does not anchor.
            anchored = "/" in body.rstrip("/")
            patterns.append(f"{negation}{prefix}/{body.lstrip('/')}")
            if not anchored:
                patterns.append(f"{negation}{prefix}/**/{body}")

    try:
        subdirs = sorted(
            entry
            for entry in path.iterdir()
            # Symlinked directories can form cycles; ".git" holds repository
            # metadata, never ignore files that apply to the work tree.
            if entry.name != ".git" and entry.is_dir() and not entry.is_symlink()
        )
    except OSError:
        # Directory vanished or is not readable.
        return

    # self.ignore() cannot be used to prune here because the parser is still
    # being built; every remaining directory is scanned. Sorting keeps the
    # pattern order (which matters for negation) deterministic.
    for subdir in subdirs:
        self._collect_patterns_recursive(subdir, patterns)


def ignore(self, path: PathLike) -> bool:
    """Return whether ``path`` is matched by the combined ignore patterns.

    Args:
        path: Path relative to ``self.root_dir``, or an absolute path.

    Returns:
        ``False`` for an empty/``None`` path, when no parser has been built,
        or for an absolute path outside ``self.root_dir``; otherwise the
        result of ``self.parser`` for the path.
    """
    if not path or not self.parser:
        return False

    path = Path(path)
    if path.is_absolute():
        try:
            path = path.relative_to(self.root_dir)
        except ValueError:
            # Not under the repository root, so no pattern can apply.
            return False

    # The parser is called with an absolute path under the root.
    return self.parser(str(self.root_dir / path))
+ (subdir / ".gitignore").write_text("*.tmp\n") + + # Create actual files for testing (gitignore_parser may need them) + (test_dir / "test.log").touch() + (subdir / "test.log").touch() + (subdir / "file.tmp").touch() + (test_dir / "file.tmp").touch() + + git_ignore = GitIgnore(test_dir) + + # Test patterns from root .gitignore + assert git_ignore.ignore("test.log") is True + assert git_ignore.ignore("subdir/test.log") is True + + # Test patterns from nested .gitignore + assert git_ignore.ignore("subdir/file.tmp") is True + # Files outside the subdir should not be matched by subdir's .gitignore + assert git_ignore.ignore("file.tmp") is False + + def test_dstackignore_file(self): + """Test that .dstackignore files are processed.""" + with tempfile.TemporaryDirectory() as tmpdir: + test_dir = Path(tmpdir) / "test" + test_dir.mkdir() + + # Create .dstackignore + dstackignore_file = test_dir / ".dstackignore" + dstackignore_file.write_text("*.cache\ndata/\n") + + git_ignore = GitIgnore(test_dir) + + assert git_ignore.ignore("file.cache") is True + assert git_ignore.ignore("data") is True + assert git_ignore.ignore("data/dataset.csv") is True + assert git_ignore.ignore("file.txt") is False + + def test_git_info_exclude(self): + """Test that .git/info/exclude files are processed.""" + with tempfile.TemporaryDirectory() as tmpdir: + test_dir = Path(tmpdir) / "test" + test_dir.mkdir() + + # Create .git/info/exclude in the root directory + git_info_dir = test_dir / ".git" / "info" + git_info_dir.mkdir(parents=True) + exclude_file = git_info_dir / "exclude" + exclude_file.write_text("*.exclude\nbuild/\n") + + git_ignore = GitIgnore(test_dir) + + # .git/info/exclude should apply to the entire repository + assert git_ignore.ignore("file.exclude") is True + assert git_ignore.ignore("build") is True + assert git_ignore.ignore("build/output.txt") is True + assert git_ignore.ignore("subdir/file.exclude") is True + assert git_ignore.ignore("file.txt") is False + + def 
test_custom_ignore_files(self): + """Test custom ignore file names.""" + with tempfile.TemporaryDirectory() as tmpdir: + test_dir = Path(tmpdir) / "test" + test_dir.mkdir() + + # Create custom ignore file + custom_ignore = test_dir / ".myignore" + custom_ignore.write_text("*.custom\n") + + git_ignore = GitIgnore(test_dir, ignore_files=[".myignore"]) + + assert git_ignore.ignore("file.custom") is True + assert git_ignore.ignore("file.txt") is False + + def test_additional_globs(self): + """Test additional glob patterns passed to constructor.""" + with tempfile.TemporaryDirectory() as tmpdir: + test_dir = Path(tmpdir) / "test" + test_dir.mkdir() + + git_ignore = GitIgnore(test_dir, globs=["*.pyc", "node_modules/"]) + + assert git_ignore.ignore("module.pyc") is True + assert git_ignore.ignore("node_modules") is True + assert git_ignore.ignore("node_modules/package.json") is True + assert git_ignore.ignore("script.py") is False + + def test_combined_ignore_sources(self): + """Test combination of .gitignore, custom files, and globs.""" + with tempfile.TemporaryDirectory() as tmpdir: + test_dir = Path(tmpdir) / "test" + test_dir.mkdir() + + # Create .gitignore + (test_dir / ".gitignore").write_text("*.log\n") + + # Create .dstackignore + (test_dir / ".dstackignore").write_text("*.cache\n") + + git_ignore = GitIgnore(test_dir, globs=["*.tmp"]) + + assert git_ignore.ignore("file.log") is True # from .gitignore + assert git_ignore.ignore("file.cache") is True # from .dstackignore + assert git_ignore.ignore("file.tmp") is True # from globs + assert git_ignore.ignore("file.txt") is False + + def test_absolute_paths(self): + """Test handling of absolute paths.""" + with tempfile.TemporaryDirectory() as tmpdir: + test_dir = Path(tmpdir) / "test" + test_dir.mkdir() + + # Create .gitignore + (test_dir / ".gitignore").write_text("*.log\n") + + git_ignore = GitIgnore(test_dir) + + # Test absolute path within repo + abs_path = test_dir / "test.log" + assert 
git_ignore.ignore(abs_path) is True + + # Test absolute path outside repo + outside_path = Path(tmpdir) / "outside.log" + assert git_ignore.ignore(outside_path) is False + + def test_empty_path(self): + """Test handling of empty paths.""" + with tempfile.TemporaryDirectory() as tmpdir: + test_dir = Path(tmpdir) / "test" + test_dir.mkdir() + + git_ignore = GitIgnore(test_dir) + + assert git_ignore.ignore("") is False + assert git_ignore.ignore(None) is False + + def test_nonexistent_ignore_files(self): + """Test that nonexistent ignore files are handled gracefully.""" + with tempfile.TemporaryDirectory() as tmpdir: + test_dir = Path(tmpdir) / "test" + test_dir.mkdir() + + # No ignore files exist + git_ignore = GitIgnore(test_dir) + + # Should not ignore anything + assert git_ignore.ignore("any_file.txt") is False + assert git_ignore.ignore("any_dir/") is False + + def test_malformed_ignore_files(self): + """Test handling of malformed ignore files.""" + with tempfile.TemporaryDirectory() as tmpdir: + test_dir = Path(tmpdir) / "test" + test_dir.mkdir() + + # Create a file that might cause parsing issues + gitignore_file = test_dir / ".gitignore" + gitignore_file.write_text("*.log\n# comment\n\n \n*.tmp\n") + + git_ignore = GitIgnore(test_dir) + + # Should still work for valid patterns + assert git_ignore.ignore("test.log") is True + assert git_ignore.ignore("test.tmp") is True + assert git_ignore.ignore("test.txt") is False + + def test_directory_traversal_stops_at_ignored_dirs(self): + """Test that ignored directories don't have their subdirectories processed.""" + with tempfile.TemporaryDirectory() as tmpdir: + test_dir = Path(tmpdir) / "test" + test_dir.mkdir() + + # Create root .gitignore that ignores 'ignored_dir' + (test_dir / ".gitignore").write_text("ignored_dir/\n") + + # Create ignored directory with its own .gitignore + ignored_dir = test_dir / "ignored_dir" + ignored_dir.mkdir() + (ignored_dir / ".gitignore").write_text("*.should_not_apply\n") + + # Create 
a subdirectory in the ignored directory + subdir = ignored_dir / "subdir" + subdir.mkdir() + (subdir / ".gitignore").write_text("*.also_should_not_apply\n") + + git_ignore = GitIgnore(test_dir) + + # The ignored directory itself should be ignored + assert git_ignore.ignore("ignored_dir") is True + assert git_ignore.ignore("ignored_dir/file.txt") is True + + # Patterns from .gitignore files inside ignored directories should not apply + # to files outside those directories + assert git_ignore.ignore("file.should_not_apply") is False + assert git_ignore.ignore("file.also_should_not_apply") is False + + def test_relative_path_handling(self): + """Test various relative path formats.""" + with tempfile.TemporaryDirectory() as tmpdir: + test_dir = Path(tmpdir) / "test" + test_dir.mkdir() + + (test_dir / ".gitignore").write_text("*.log\ntemp/\n") + + git_ignore = GitIgnore(test_dir) + + # Test different path formats + assert git_ignore.ignore("file.log") is True + assert git_ignore.ignore("./file.log") is True + assert git_ignore.ignore("subdir/file.log") is True + assert git_ignore.ignore("./subdir/file.log") is True + assert git_ignore.ignore("temp") is True + assert git_ignore.ignore("./temp") is True + assert git_ignore.ignore("temp/") is True