NOTE(review): this patch was restored from a copy whose line breaks had been
collapsed. Whitespace inside context lines and the position of blank context
lines are reconstructed from the hunk counts -- confirm against the target
files (or apply with --ignore-whitespace). The index hashes are copied
unchanged and no longer match the edited content.

diff --git a/sql/schema/schema.sql b/sql/schema/schema.sql
index 90e23d0d..0f98217c 100644
--- a/sql/schema/schema.sql
+++ b/sql/schema/schema.sql
@@ -1,12 +1,21 @@
 -- Schema for codegate database using SQLite
 
+-- Workspaces table
+CREATE TABLE workspaces (
+    id TEXT PRIMARY KEY, -- UUID stored as TEXT
+    name TEXT,
+    folder_tree_json TEXT -- JSON stored as TEXT
+);
+
 -- Prompts table
 CREATE TABLE prompts (
     id TEXT PRIMARY KEY, -- UUID stored as TEXT
+    workspace_id TEXT NOT NULL,
     timestamp DATETIME NOT NULL,
     provider TEXT, -- VARCHAR(255)
     request TEXT NOT NULL, -- Record the full request that arrived to the server
-    type TEXT NOT NULL -- VARCHAR(50) (e.g. "fim", "chat")
+    type TEXT NOT NULL, -- VARCHAR(50) (e.g. "fim", "chat")
+    FOREIGN KEY (workspace_id) REFERENCES workspaces(id)
 );
 
 -- Outputs table
@@ -41,6 +50,7 @@ CREATE TABLE settings (
 );
 
 -- Create indexes for foreign keys and frequently queried columns
+CREATE INDEX idx_prompts_workspace_id ON prompts(workspace_id);
 CREATE INDEX idx_outputs_prompt_id ON outputs(prompt_id);
 CREATE INDEX idx_alerts_prompt_id ON alerts(prompt_id);
 CREATE INDEX idx_prompts_timestamp ON prompts(timestamp);
diff --git a/src/codegate/cli.py b/src/codegate/cli.py
index 06456a00..7cd89d55 100644
--- a/src/codegate/cli.py
+++ b/src/codegate/cli.py
@@ -20,6 +20,7 @@
 from codegate.providers.copilot.provider import CopilotProvider
 from codegate.server import init_app
 from codegate.storage.utils import restore_storage_backup
+from codegate.workspaces.workspaces import Workspaces
 
 
 class UvicornServer:
@@ -318,6 +319,7 @@ def serve(
         else:
             click.echo("Existing Certificates are already present.")
 
+        Workspaces().read_workspaces('/app/codegate_workspaces', cfg.ignore_paths_workspaces)
         # Initialize secrets manager and pipeline factory
         secrets_manager = SecretsManager()
         pipeline_factory = PipelineFactory(secrets_manager)
diff --git a/src/codegate/config.py b/src/codegate/config.py
index 3f99fd04..66ea64df 100644
--- a/src/codegate/config.py
+++ b/src/codegate/config.py
@@ -54,6 +54,10 @@ class Config:
     force_certs: bool = False
 
     max_fim_hash_lifetime: int = 60 * 5 # Time in seconds. Default is 5 minutes.
+    # Directory and file names ignored when scanning workspace folders.
+    ignore_paths_workspaces = [
+        ".git", "__pycache__", ".venv", ".DS_Store", "node_modules", ".pytest_cache", ".ruff_cache"
+    ]
 
     # Provider URLs with defaults
     provider_urls: Dict[str, str] = field(default_factory=lambda: DEFAULT_PROVIDER_URLS.copy())
diff --git a/src/codegate/db/connection.py b/src/codegate/db/connection.py
index 443ab008..668d1fb7 100644
--- a/src/codegate/db/connection.py
+++ b/src/codegate/db/connection.py
@@ -15,6 +15,7 @@
     GetPromptWithOutputsRow,
     Output,
     Prompt,
+    Workspace,
 )
 from codegate.pipeline.base import PipelineContext
 
@@ -252,6 +253,39 @@ async def record_context(self, context: Optional[PipelineContext]) -> None:
         except Exception as e:
             logger.error(f"Failed to record context: {context}.", error=str(e))
 
+    async def record_workspaces(self, workspaces: List[Workspace]) -> List[Workspace]:
+        """Insert the given workspaces and return the truthy insert results.
+
+        The inserts are scheduled concurrently in a single task group. An
+        empty input yields an empty list, so callers always receive a list.
+        """
+        if not workspaces:
+            return []
+        sql = text(
+            """
+            INSERT INTO workspaces (id, name, folder_tree_json)
+            VALUES (:id, :name, :folder_tree_json)
+            RETURNING *
+            """
+        )
+        workspaces_tasks = []
+        async with asyncio.TaskGroup() as tg:
+            for workspace in workspaces:
+                try:
+                    result = tg.create_task(self._execute_update_pydantic_model(workspace, sql))
+                    workspaces_tasks.append(result)
+                except Exception as e:
+                    logger.error(f"Failed to record workspace: {workspace}.", error=str(e))
+
+        # Every task has finished once the task group block has exited.
+        recorded_workspaces = []
+        for workspace_task in workspaces_tasks:
+            workspace_recorded = workspace_task.result()
+            if workspace_recorded:
+                recorded_workspaces.append(workspace_recorded)
+
+        return recorded_workspaces
+
 
 class DbReader(DbCodeGate):
 
diff --git a/src/codegate/db/models.py b/src/codegate/db/models.py
index 22859573..d47dda2c 100644
--- a/src/codegate/db/models.py
+++ b/src/codegate/db/models.py
@@ -37,6 +37,14 @@ class Setting(pydantic.BaseModel):
     other_settings: Optional[Any]
 
 
+class Workspace(pydantic.BaseModel):
+    """Row of the ``workspaces`` table."""
+
+    id: Any
+    name: str
+    folder_tree_json: str
+
+
 # Models for select queries
 
 
diff --git a/src/codegate/workspaces/__init__.py b/src/codegate/workspaces/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/codegate/workspaces/workspaces.py b/src/codegate/workspaces/workspaces.py
new file mode 100644
index 00000000..28817413
--- /dev/null
+++ b/src/codegate/workspaces/workspaces.py
@@ -0,0 +1,111 @@
+import asyncio
+import json
+import uuid
+from pathlib import Path
+from typing import Dict, List, Optional, Union
+
+from pydantic import BaseModel
+
+from codegate.db.connection import DbRecorder
+from codegate.db.models import Workspace
+
+
+class Folder(BaseModel):
+    """Names of the files that sit directly inside one folder."""
+
+    files: List[str] = []
+
+
+class Repository(BaseModel):
+    """A discovered repository and its folder tree, keyed by relative path."""
+
+    name: str
+    folder_tree: Dict[str, Folder]
+
+
+class FolderRepoScanner:
+    """Find git repositories under a directory and map their folder layout."""
+
+    def __init__(self, ignore_paths: Optional[List[str]] = None):
+        if ignore_paths is None:
+            ignore_paths = []
+        self.ignore_paths = ignore_paths
+
+    def _should_skip(self, path: Path) -> bool:
+        """Return True when any component of ``path`` is an ignored name."""
+        return any(part in path.parts for part in self.ignore_paths)
+
+    def _read_repository_structure(self, repo_path: Path) -> Dict[str, Folder]:
+        """Map each folder of the repository to the files directly inside it.
+
+        Keys are paths relative to ``repo_path``; files at the repository root
+        are listed under ``"."``.
+        """
+        folder_tree: Dict[str, Folder] = {}
+        for path in repo_path.rglob("*"):
+            relative_path = path.relative_to(repo_path)
+            # Match ignore rules against the in-repo path only, so an ignored
+            # name in a parent directory cannot hide the whole repository.
+            if self._should_skip(relative_path):
+                continue
+
+            if path.is_dir():
+                folder_tree.setdefault(str(relative_path), Folder())
+            else:
+                parent_dir = str(relative_path.parent)
+                folder_tree.setdefault(parent_dir, Folder()).files.append(path.name)
+        return folder_tree
+
+    def read(self, path_str: Union[str, Path]) -> List[Repository]:
+        """Return every repository (a directory holding ``.git``) below ``path_str``.
+
+        An empty list is returned when ``path_str`` is not a directory.
+        """
+        path_dir = Path(path_str)
+        if not path_dir.is_dir():
+            print(f"Path {path_dir} is not a directory")
+            return []
+
+        found_repos = []
+        for child_path in path_dir.rglob("*"):
+            # Do not look for repositories inside ignored directories
+            # (e.g. dependencies vendored under node_modules or .venv).
+            if self._should_skip(child_path.relative_to(path_dir)):
+                continue
+            if child_path.is_dir() and (child_path / ".git").exists():
+                repo_structure = self._read_repository_structure(child_path)
+                new_repo = Repository(name=child_path.name, folder_tree=repo_structure)
+                found_repos.append(new_repo)
+                print(f"Found repository at {child_path}.")
+
+        return found_repos
+
+
+class Workspaces:
+    """Scan a directory for repositories and record them as workspaces."""
+
+    def __init__(self):
+        self._db_recorder = DbRecorder()
+
+    def read_workspaces(self, path: str, ignore_paths: Optional[List[str]] = None) -> None:
+        """Record one workspace per repository found under ``path``.
+
+        Uses ``asyncio.run``, so it must be called from synchronous code.
+        """
+        repos = FolderRepoScanner(ignore_paths).read(path)
+        workspaces = [
+            Workspace(
+                id=str(uuid.uuid4()),
+                name=repo.name,
+                # Folder is a pydantic model, which json.dumps cannot encode
+                # directly; serialize each folder as a plain dict.
+                folder_tree_json=json.dumps(
+                    {
+                        folder_path: {"files": folder.files}
+                        for folder_path, folder in repo.folder_tree.items()
+                    }
+                ),
+            )
+            for repo in repos
+        ]
+        asyncio.run(self._db_recorder.record_workspaces(workspaces))