Skip to content

Commit

Permalink
Replace ref with absolute git commit in output source_info (#72)
Browse files Browse the repository at this point in the history
  • Loading branch information
efajardo-nv authored Feb 6, 2025
1 parent 62ee662 commit d312e7c
Show file tree
Hide file tree
Showing 5 changed files with 115 additions and 43 deletions.
82 changes: 42 additions & 40 deletions configs/schemas/config.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,8 @@
"const": "file",
"default": "file",
"description": "The type of the object",
"title": "Type"
"title": "Type",
"type": "string"
},
"file": {
"title": "File",
Expand All @@ -180,7 +181,8 @@
"const": "file",
"default": "file",
"description": "The type of the object",
"title": "Type"
"title": "Type",
"type": "string"
},
"file_path": {
"title": "File Path",
Expand Down Expand Up @@ -281,7 +283,8 @@
"const": "http",
"default": "http",
"description": "The type of the object",
"title": "Type"
"title": "Type",
"type": "string"
},
"address": {
"default": "127.0.0.1",
Expand All @@ -299,11 +302,7 @@
"type": "integer"
},
"http_method": {
"allOf": [
{
"$ref": "#/$defs/HTTPMethod"
}
],
"$ref": "#/$defs/HTTPMethod",
"default": "POST"
},
"stop_after": {
Expand All @@ -321,7 +320,8 @@
"const": "huggingface",
"default": "huggingface",
"description": "The type of the object",
"title": "Type"
"title": "Type",
"type": "string"
},
"model_name": {
"title": "Model Name",
Expand Down Expand Up @@ -439,7 +439,8 @@
"const": "manual",
"default": "manual",
"description": "The type of the object",
"title": "Type"
"title": "Type",
"type": "string"
},
"message": {
"$ref": "#/$defs/AgentMorpheusInput"
Expand All @@ -463,7 +464,8 @@
"const": "manual",
"default": "manual",
"description": "The type of the object",
"title": "Type"
"title": "Type",
"type": "string"
},
"packages": {
"items": {
Expand All @@ -485,7 +487,8 @@
"const": "nim",
"default": "nim",
"description": "The type of the object",
"title": "Type"
"title": "Type",
"type": "string"
},
"api_key": {
"anyOf": [
Expand Down Expand Up @@ -531,7 +534,8 @@
"const": "nvfoundation",
"default": "nvfoundation",
"description": "The type of the object",
"title": "Type"
"title": "Type",
"type": "string"
},
"service": {
"$ref": "#/$defs/NVFoundationLLMServiceConfig"
Expand All @@ -558,16 +562,9 @@
"type": "number"
},
"top_p": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"title": "Top P"
"default": 0.01,
"title": "Top P",
"type": "number"
},
"max_tokens": {
"default": 300,
Expand Down Expand Up @@ -601,7 +598,8 @@
"const": "nvfoundation",
"default": "nvfoundation",
"description": "The type of the object",
"title": "Type"
"title": "Type",
"type": "string"
},
"api_key": {
"anyOf": [
Expand Down Expand Up @@ -678,7 +676,7 @@
"type": "integer"
},
"top_p": {
"default": 1,
"default": 0.01,
"title": "Top P",
"type": "number"
},
Expand Down Expand Up @@ -738,7 +736,8 @@
"const": "nemo",
"default": "nemo",
"description": "The type of the object",
"title": "Type"
"title": "Type",
"type": "string"
},
"api_key": {
"anyOf": [
Expand Down Expand Up @@ -780,7 +779,8 @@
"const": "openai",
"default": "openai",
"description": "The type of the object",
"title": "Type"
"title": "Type",
"type": "string"
},
"openai_api_key": {
"anyOf": [
Expand Down Expand Up @@ -835,7 +835,7 @@
"type": "number"
},
"top_p": {
"default": 1.0,
"default": 0.01,
"title": "Top P",
"type": "number"
},
Expand Down Expand Up @@ -875,7 +875,8 @@
"const": "openai",
"default": "openai",
"description": "The type of the object",
"title": "Type"
"title": "Type",
"type": "string"
},
"api_key": {
"anyOf": [
Expand Down Expand Up @@ -911,7 +912,8 @@
"const": "file",
"default": "file",
"description": "The type of the object",
"title": "Type"
"title": "Type",
"type": "string"
},
"file_path": {
"anyOf": [
Expand Down Expand Up @@ -952,7 +954,8 @@
"const": "http",
"default": "http",
"description": "The type of the object",
"title": "Type"
"title": "Type",
"type": "string"
},
"url": {
"title": "Url",
Expand All @@ -976,7 +979,8 @@
"const": "plugin",
"default": "plugin",
"description": "The type of the object",
"title": "Type"
"title": "Type",
"type": "string"
},
"plugin_name": {
"title": "Plugin Name",
Expand All @@ -1000,7 +1004,8 @@
"const": "print",
"default": "print",
"description": "The type of the object",
"title": "Type"
"title": "Type",
"type": "string"
}
},
"title": "OutputPrintConfig",
Expand All @@ -1012,7 +1017,8 @@
"const": "plugin",
"default": "plugin",
"description": "The type of the object",
"title": "Type"
"title": "Type",
"type": "string"
},
"plugin_name": {
"title": "Plugin Name",
Expand Down Expand Up @@ -1323,11 +1329,7 @@
},
"properties": {
"general": {
"allOf": [
{
"$ref": "#/$defs/GeneralConfig"
}
],
"$ref": "#/$defs/GeneralConfig",
"default": {
"base_vdb_dir": "/tmp/am_cache/vdb",
"base_git_dir": "/tmp/am_cache/git",
Expand All @@ -1336,7 +1338,7 @@
"ignore_build_vdb_errors": false,
"max_retries": 10,
"model_max_batch_size": 64,
"num_threads": 64,
"num_threads": 12,
"pipeline_batch_size": 1024,
"use_uvloop": true,
"code_search_tool": false
Expand Down
15 changes: 15 additions & 0 deletions src/cve/data_models/input.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@


import typing
from collections import defaultdict
from uuid import uuid4

from pydantic import AliasChoices, BaseModel
Expand Down Expand Up @@ -144,6 +145,20 @@ class ImageInfoInput(HashableModel):
source_info: list[SourceDocumentsInfo]
sbom_info: SBOMInfoInput

@field_validator('source_info', mode='after')
@classmethod
def check_conflicting_refs(cls, source_info: list[SourceDocumentsInfo]) -> list[SourceDocumentsInfo]:
    """
    Reject input in which one git repo is listed with more than one ref.

    Parameters
    ----------
    source_info : list[SourceDocumentsInfo]
        Source entries to check; only `git_repo` and `ref` are read from each.

    Returns
    -------
    list[SourceDocumentsInfo]
        The same list that was passed in, unmodified.

    Raises
    ------
    ValueError
        For the first repo (in input order) that has two or more distinct refs.
    """
    # Group every ref seen under the repo it was specified for.
    refs_by_repo = defaultdict(set)
    for entry in source_info:
        refs_by_repo[entry.git_repo].add(entry.ref)

    # Find the first repo whose ref set is ambiguous, if any.
    conflict = next(((repo, refset) for repo, refset in refs_by_repo.items() if len(refset) > 1), None)
    if conflict is not None:
        repo, refset = conflict
        raise ValueError(f"Conflicting refs specified for git repo: {repo}, refs: {refset}")

    return source_info


class AgentMorpheusInput(HashableModel):
"""
Expand Down
1 change: 1 addition & 0 deletions src/cve/pipeline/input.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,7 @@ def emit_input_object(subscription: mrc.Subscription) -> typing.Generator[AgentM

build_vdb_stage = BuildSourceCodeVdbStage(config,
build_vdb_fn=embedder.build_vdbs,
base_git_dir=run_config.general.base_git_dir,
ignore_errors=run_config.general.ignore_build_vdb_errors,
ignore_code_embedding=run_config.general.code_search_tool)

Expand Down
13 changes: 10 additions & 3 deletions src/cve/stages/build_vdb_stage.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
from ..data_models.input import AgentMorpheusEngineInput
from ..data_models.input import AgentMorpheusInput
from ..data_models.input import SourceDocumentsInfo
from ..utils.git_utils import get_commit_hash

logger = logging.getLogger(f"morpheus.{__name__}")

Expand All @@ -38,11 +39,13 @@ class BuildSourceCodeVdbStage(SinglePortStage):
def __init__(self,
             c: Config,
             build_vdb_fn: typing.Callable[[list[SourceDocumentsInfo], bool], tuple[Path | None, Path | None]],
             base_git_dir: str,
             ignore_errors: bool = False,
             ignore_code_embedding: bool = False):
    """
    Store the VDB build callback and stage options on the instance.

    Parameters
    ----------
    c : Config
        Configuration object forwarded to the base-class constructor.
    build_vdb_fn : Callable
        Callback that builds the vector databases. The stage calls it with the
        list of source infos and the `ignore_code_embedding` flag, and unpacks
        the result as `(vdb_code_path, vdb_doc_path)`; either may be None.
    base_git_dir : str
        Base directory handed to `get_commit_hash` together with each source
        info's `git_repo` when refs are replaced by commit hashes.
    ignore_errors : bool
        Stored as `self._ignore_errors`, default False.
    ignore_code_embedding : bool
        Passed as the second argument to `build_vdb_fn`, default False.
    """
    super().__init__(c)

    self._build_vdb_fn = build_vdb_fn
    self._base_git_dir = base_git_dir
    self._ignore_errors = ignore_errors
    self._ignore_code_embedding = ignore_code_embedding

Expand Down Expand Up @@ -93,9 +96,9 @@ def _build_source_code_vdb_stage(self, message: AgentMorpheusInput) -> AgentMorp
try:
base_image = message.image.name

source_code_repos = message.image.source_info
source_infos = message.image.source_info

vdb_code_path, vdb_doc_path = self._build_vdb_fn(source_code_repos, self._ignore_code_embedding)
vdb_code_path, vdb_doc_path = self._build_vdb_fn(source_infos, self._ignore_code_embedding)

if (vdb_code_path is None):
# Only log warning if we're not ignoring code embeddings
Expand All @@ -113,11 +116,15 @@ def _build_source_code_vdb_stage(self, message: AgentMorpheusInput) -> AgentMorp
else:
vdb_doc_path = str(vdb_doc_path)

for si in source_infos:
si.ref = get_commit_hash(self._base_git_dir,
si.git_repo)

except Exception as e:
# For now just skip the row
logger.error("Failure to build VDB for image, '%s', with source code info: %s\nError: %s",
base_image,
source_code_repos,
source_infos,
e,
exc_info=True)

Expand Down
47 changes: 47 additions & 0 deletions src/cve/utils/git_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os

from pathlib import Path
from pathlib import PurePath

from git import Repo


def get_commit_hash(base_dir: str, git_repo: str = ".git") -> str | None:
"""
Utility function for getting commit hash of Git repo.
Parameters
----------
base_dir : str
Path to base directory containing one or more Git repos
git_repo : str
Relative path to Git repo in base_dir, default is ".git"
Returns
-------
str
Commit hash of Git repo
"""
commit_hash: str | None = None
repo_path = base_dir / PurePath(git_repo)
repo_path = Path(repo_path)
if os.path.exists(repo_path):
repo = Repo(repo_path)
commit_hash = repo.commit().hexsha

return commit_hash

0 comments on commit d312e7c

Please sign in to comment.