From 8f386039078f8a1b8ba17fa3b77e07ce54118f2d Mon Sep 17 00:00:00 2001 From: vedantsahai18 Date: Sun, 8 Dec 2024 18:14:08 -0500 Subject: [PATCH 1/7] feat(agents-api): added mmr search + configurable doc search params in chat.py --- .../agents_api/activities/execute_system.py | 1 - agents-api/agents_api/autogen/Docs.py | 6 + agents-api/agents_api/autogen/Sessions.py | 35 ++++- .../agents_api/models/chat/gather_messages.py | 129 +++++++++++++----- .../workflows/task_execution/transition.py | 1 - agents-api/tests/test_chat_routes.py | 17 ++- agents-api/tests/test_execution_workflow.py | 1 - .../integrations/autogen/Docs.py | 6 + .../integrations/autogen/Sessions.py | 35 ++++- memory-store/Dockerfile | 2 +- typespec/docs/models.tsp | 5 +- typespec/sessions/models.tsp | 31 ++++- .../@typespec/openapi3/openapi-1.0.0.yaml | 78 ++++++++++- 13 files changed, 298 insertions(+), 49 deletions(-) diff --git a/agents-api/agents_api/activities/execute_system.py b/agents-api/agents_api/activities/execute_system.py index 8d85a2639..ca269417d 100644 --- a/agents-api/agents_api/activities/execute_system.py +++ b/agents-api/agents_api/activities/execute_system.py @@ -6,7 +6,6 @@ from beartype import beartype from box import Box, BoxList -from fastapi import HTTPException from fastapi.background import BackgroundTasks from temporalio import activity diff --git a/agents-api/agents_api/autogen/Docs.py b/agents-api/agents_api/autogen/Docs.py index ffed27c1d..ca3371920 100644 --- a/agents-api/agents_api/autogen/Docs.py +++ b/agents-api/agents_api/autogen/Docs.py @@ -14,11 +14,17 @@ class BaseDocSearchRequest(BaseModel): populate_by_name=True, ) limit: Annotated[int, Field(ge=1, le=50)] = 10 + """ + The limit of documents to return + """ lang: Literal["en-US"] = "en-US" """ The language to be used for text-only search. Support for other languages coming soon. 
""" metadata_filter: dict[str, Any] = {} + """ + Metadata filter to apply to the search + """ mmr_strength: Annotated[float, Field(ge=0.0, lt=1.0)] = 0 """ MMR Strength (mmr_strength = 1 - mmr_lambda) diff --git a/agents-api/agents_api/autogen/Sessions.py b/agents-api/agents_api/autogen/Sessions.py index 460fd25ce..c14b8f96e 100644 --- a/agents-api/agents_api/autogen/Sessions.py +++ b/agents-api/agents_api/autogen/Sessions.py @@ -96,10 +96,41 @@ class RecallOptions(BaseModel): populate_by_name=True, ) mode: Literal["hybrid", "vector", "text"] = "vector" + """ + The mode to use for the search. + """ num_search_messages: int = 4 + """ + The number of search messages to use for the search. + """ max_query_length: int = 1000 - hybrid_alpha: float = 0.7 - confidence: float = 0.6 + """ + The maximum query length to use for the search. + """ + alpha: Annotated[float, Field(ge=0.0, le=1.0)] = 0.7 + """ + The weight to apply to BM25 vs Vector search results. 0 => pure BM25; 1 => pure vector; + """ + confidence: Annotated[float, Field(ge=0.0, le=1.0)] = 0.6 + """ + The confidence cutoff level + """ + limit: Annotated[int, Field(ge=1, le=50)] = 10 + """ + The limit of documents to return + """ + lang: Literal["en-US"] = "en-US" + """ + The language to be used for text-only search. Support for other languages coming soon. 
+ """ + metadata_filter: dict[str, Any] = {} + """ + Metadata filter to apply to the search + """ + mmr_strength: Annotated[float, Field(ge=0.0, lt=1.0)] = 0 + """ + MMR Strength (mmr_strength = 1 - mmr_lambda) + """ class RecallOptionsUpdate(RecallOptions): diff --git a/agents-api/agents_api/models/chat/gather_messages.py b/agents-api/agents_api/models/chat/gather_messages.py index 28dc6607f..cf444e921 100644 --- a/agents-api/agents_api/models/chat/gather_messages.py +++ b/agents-api/agents_api/models/chat/gather_messages.py @@ -1,15 +1,18 @@ -from typing import TypeVar +from typing import Any, Dict, List, Optional, Tuple, TypeVar, Union from uuid import UUID +import numpy as np from beartype import beartype from fastapi import HTTPException from pycozo.client import QueryException from pydantic import ValidationError from ...autogen.openapi_model import ChatInput, DocReference, History +from ...autogen.Sessions import RecallOptions from ...clients import litellm from ...common.protocol.developers import Developer from ...common.protocol.sessions import ChatContext +from ...models.docs.mmr import maximal_marginal_relevance from ..docs.search_docs_by_embedding import search_docs_by_embedding from ..docs.search_docs_by_text import search_docs_by_text from ..docs.search_docs_hybrid import search_docs_hybrid @@ -23,6 +26,52 @@ T = TypeVar("T") +def get_search_fn_and_params( + recall_options: RecallOptions, + query_text: str | None, + query_embedding: list[float] | None, +) -> Tuple[ + Any, + Optional[Dict[str, Union[float, int, str, Dict[str, float], List[float], None]]], +]: + search_fn, params = None, None + + match recall_options.mode: + case "text": + search_fn = search_docs_by_text + params = dict( + query=query_text, + k=recall_options.limit, + metadata_filter=recall_options.metadata_filter, + ) + + case "vector": + search_fn = search_docs_by_embedding + params = dict( + query_embedding=query_embedding, + k=recall_options.limit * 3 + if 
recall_options.mmr_strength > 0 + else recall_options.limit, + confidence=recall_options.confidence, + metadata_filter=recall_options.metadata_filter, + ) + + case "hybrid": + search_fn = search_docs_hybrid + params = dict( + query=query_text, + query_embedding=query_embedding, + k=recall_options.limit * 3 + if recall_options.mmr_strength > 0 + else recall_options.limit, + embed_search_options=dict(confidence=recall_options.confidence), + alpha=recall_options.alpha, + metadata_filter=recall_options.metadata_filter, + ) + + return search_fn, params + + @rewrap_exceptions( { QueryException: partialclass(HTTPException, status_code=400), @@ -98,44 +147,62 @@ async def gather_messages( ] ).strip() - [query_embedding, *_] = await litellm.aembedding( - # Truncate on the left to keep the last `search_query_chars` characters - inputs=embed_text[-(recall_options.max_query_length) :], - # TODO: Make this configurable once it's added to the ChatInput model - embed_instruction="Represent the query for retrieving supporting documents: ", - ) + # Set the query text and embedding + query_text, query_embedding = None, None + + # Embed the query + if recall_options.mode != "text": + [query_embedding, *_] = await litellm.aembedding( + # Truncate on the left to keep the last `search_query_chars` characters + inputs=embed_text[-(recall_options.max_query_length) :], + # TODO: Make this configurable once it's added to the ChatInput model + embed_instruction="Represent the query for retrieving supporting documents: ", + ) # Truncate on the right to take only the first `search_query_chars` characters - query_text = search_messages[-1]["content"].strip()[ - : recall_options.max_query_length - ] + if recall_options.mode == "text" or recall_options.mode == "hybrid": + query_text = search_messages[-1]["content"].strip()[ + : recall_options.max_query_length + ] # List all the applicable owners to search docs from active_agent_id = chat_context.get_active_agent().id user_ids = [user.id for user 
in chat_context.users] owners = [("user", user_id) for user_id in user_ids] + [("agent", active_agent_id)] + # Get the search function and parameters + search_fn, params = get_search_fn_and_params( + recall_options=recall_options, + query_text=query_text, + query_embedding=query_embedding, + ) + # Search for doc references - doc_references: list[DocReference] = [] - match recall_options.mode: - case "vector": - doc_references: list[DocReference] = search_docs_by_embedding( - developer_id=developer.id, - owners=owners, - query_embedding=query_embedding, - ) - case "hybrid": - doc_references: list[DocReference] = search_docs_hybrid( - developer_id=developer.id, - owners=owners, - query=query_text, - query_embedding=query_embedding, - ) - case "text": - doc_references: list[DocReference] = search_docs_by_text( - developer_id=developer.id, - owners=owners, - query=query_text, - ) + doc_references: list[DocReference] = search_fn( + developer_id=developer.id, + owners=owners, + **params, + ) + + # Apply MMR if enabled + if ( + # MMR is enabled + recall_options.mmr_strength > 0 + # The number of doc references is greater than the limit + and len(doc_references) > recall_options.limit + # MMR is not applied to text search + and recall_options.mode != "text" + ): + # Apply MMR + indices = maximal_marginal_relevance( + np.asarray(query_embedding), + [doc.snippet.embedding for doc in doc_references], + k=recall_options.limit, + ) + # Apply MMR + doc_references = [ + doc for i, doc in enumerate(doc_references) if i in set(indices) + ] + # Return the past messages and doc references return past_messages, doc_references diff --git a/agents-api/agents_api/workflows/task_execution/transition.py b/agents-api/agents_api/workflows/task_execution/transition.py index a26ac1778..c6197fed1 100644 --- a/agents-api/agents_api/workflows/task_execution/transition.py +++ b/agents-api/agents_api/workflows/task_execution/transition.py @@ -14,7 +14,6 @@ from ...common.retry_policies import 
DEFAULT_RETRY_POLICY from ...env import ( debug, - temporal_activity_after_retry_timeout, temporal_heartbeat_timeout, temporal_schedule_to_close_timeout, testing, diff --git a/agents-api/tests/test_chat_routes.py b/agents-api/tests/test_chat_routes.py index 4838efcd5..2c6567b04 100644 --- a/agents-api/tests/test_chat_routes.py +++ b/agents-api/tests/test_chat_routes.py @@ -87,9 +87,13 @@ async def _( agent=agent.id, situation="test session about", recall_options={ - "mode": "text", - "num_search_messages": 10, - "max_query_length": 1001, + "mode": "hybrid", + "num_search_messages": 6, + "max_query_length": 800, + "confidence": 0.6, + "alpha": 0.7, + "limit": 10, + "mmr_strength": 0.5, }, ), client=client, @@ -135,9 +139,12 @@ async def _( agent=agent.id, situation="test session about", recall_options={ - "mode": "vector", - "num_search_messages": 5, + "mode": "text", + "num_search_messages": 10, "max_query_length": 1001, + "confidence": 0.6, + "limit": 5, + "mmr_strength": 0.5, }, ), client=client, diff --git a/agents-api/tests/test_execution_workflow.py b/agents-api/tests/test_execution_workflow.py index e733f81c0..ae440ff02 100644 --- a/agents-api/tests/test_execution_workflow.py +++ b/agents-api/tests/test_execution_workflow.py @@ -16,7 +16,6 @@ from agents_api.models.task.create_task import create_task from agents_api.routers.tasks.create_task_execution import start_execution from tests.fixtures import ( - async_cozo_client, cozo_client, cozo_clients_with_migrations, test_agent, diff --git a/integrations-service/integrations/autogen/Docs.py b/integrations-service/integrations/autogen/Docs.py index ffed27c1d..ca3371920 100644 --- a/integrations-service/integrations/autogen/Docs.py +++ b/integrations-service/integrations/autogen/Docs.py @@ -14,11 +14,17 @@ class BaseDocSearchRequest(BaseModel): populate_by_name=True, ) limit: Annotated[int, Field(ge=1, le=50)] = 10 + """ + The limit of documents to return + """ lang: Literal["en-US"] = "en-US" """ The language to 
be used for text-only search. Support for other languages coming soon. """ metadata_filter: dict[str, Any] = {} + """ + Metadata filter to apply to the search + """ mmr_strength: Annotated[float, Field(ge=0.0, lt=1.0)] = 0 """ MMR Strength (mmr_strength = 1 - mmr_lambda) diff --git a/integrations-service/integrations/autogen/Sessions.py b/integrations-service/integrations/autogen/Sessions.py index 460fd25ce..c14b8f96e 100644 --- a/integrations-service/integrations/autogen/Sessions.py +++ b/integrations-service/integrations/autogen/Sessions.py @@ -96,10 +96,41 @@ class RecallOptions(BaseModel): populate_by_name=True, ) mode: Literal["hybrid", "vector", "text"] = "vector" + """ + The mode to use for the search. + """ num_search_messages: int = 4 + """ + The number of search messages to use for the search. + """ max_query_length: int = 1000 - hybrid_alpha: float = 0.7 - confidence: float = 0.6 + """ + The maximum query length to use for the search. + """ + alpha: Annotated[float, Field(ge=0.0, le=1.0)] = 0.7 + """ + The weight to apply to BM25 vs Vector search results. 0 => pure BM25; 1 => pure vector; + """ + confidence: Annotated[float, Field(ge=0.0, le=1.0)] = 0.6 + """ + The confidence cutoff level + """ + limit: Annotated[int, Field(ge=1, le=50)] = 10 + """ + The limit of documents to return + """ + lang: Literal["en-US"] = "en-US" + """ + The language to be used for text-only search. Support for other languages coming soon. 
+ """ + metadata_filter: dict[str, Any] = {} + """ + Metadata filter to apply to the search + """ + mmr_strength: Annotated[float, Field(ge=0.0, lt=1.0)] = 0 + """ + MMR Strength (mmr_strength = 1 - mmr_lambda) + """ class RecallOptionsUpdate(RecallOptions): diff --git a/memory-store/Dockerfile b/memory-store/Dockerfile index 3821c2662..6d0d14789 100644 --- a/memory-store/Dockerfile +++ b/memory-store/Dockerfile @@ -6,7 +6,7 @@ # Then copy the run.sh script to the ./run.sh file # First stage: Build the Rust project -FROM rust:1.80.1-bookworm AS builder +FROM rust:1.81.0-bookworm AS builder # Install required dependencies RUN apt-get update && apt-get install -y \ diff --git a/typespec/docs/models.tsp b/typespec/docs/models.tsp index 055fc2003..af0e39292 100644 --- a/typespec/docs/models.tsp +++ b/typespec/docs/models.tsp @@ -90,13 +90,16 @@ model EmbedQueryResponse { } model BaseDocSearchRequest { + /** The limit of documents to return */ @minValue(1) @maxValue(50) limit: uint16 = 10; /** The language to be used for text-only search. Support for other languages coming soon. */ lang: "en-US" = "en-US"; - metadata_filter: MetadataFilter = #{}, + + /** Metadata filter to apply to the search */ + metadata_filter: MetadataFilter = #{}; /** MMR Strength (mmr_strength = 1 - mmr_lambda) */ @minValue(0) diff --git a/typespec/sessions/models.tsp b/typespec/sessions/models.tsp index f15453a5f..1a7023404 100644 --- a/typespec/sessions/models.tsp +++ b/typespec/sessions/models.tsp @@ -19,11 +19,40 @@ enum SearchMode { } model RecallOptions { + /** The mode to use for the search. */ mode: SearchMode = SearchMode.vector; + + /** The number of search messages to use for the search. */ num_search_messages: uint32 = 4; + + /** The maximum query length to use for the search. */ max_query_length: uint32 = 1000; - hybrid_alpha: float = 0.7; + + /** The weight to apply to BM25 vs Vector search results. 
0 => pure BM25; 1 => pure vector; */ + @minValue(0) + @maxValue(1) + alpha: float = 0.7; + + /** The confidence cutoff level */ + @minValue(0) + @maxValue(1) confidence: float = 0.6; + + /** The limit of documents to return */ + @minValue(1) + @maxValue(50) + limit: uint16 = 10; + + /** The language to be used for text-only search. Support for other languages coming soon. */ + lang?: "en-US" = "en-US"; + + /** Metadata filter to apply to the search */ + metadata_filter: MetadataFilter = #{}; + + /** MMR Strength (mmr_strength = 1 - mmr_lambda) */ + @minValue(0) + @maxValueExclusive(1) + mmr_strength?: float = 0.0; } enum ContextOverflowType { diff --git a/typespec/tsp-output/@typespec/openapi3/openapi-1.0.0.yaml b/typespec/tsp-output/@typespec/openapi3/openapi-1.0.0.yaml index eb58eeef2..0e3c66d18 100644 --- a/typespec/tsp-output/@typespec/openapi3/openapi-1.0.0.yaml +++ b/typespec/tsp-output/@typespec/openapi3/openapi-1.0.0.yaml @@ -2756,6 +2756,7 @@ components: format: uint16 minimum: 1 maximum: 50 + description: The limit of documents to return default: 10 lang: type: string @@ -2766,6 +2767,7 @@ components: metadata_filter: type: object additionalProperties: {} + description: Metadata filter to apply to the search default: {} mmr_strength: type: number @@ -4103,48 +4105,118 @@ components: - mode - num_search_messages - max_query_length - - hybrid_alpha + - alpha - confidence + - limit + - metadata_filter properties: mode: allOf: - $ref: '#/components/schemas/Sessions.SearchMode' + description: The mode to use for the search. default: vector num_search_messages: type: integer format: uint32 + description: The number of search messages to use for the search. default: 4 max_query_length: type: integer format: uint32 + description: The maximum query length to use for the search. default: 1000 - hybrid_alpha: + alpha: type: number + minimum: 0 + maximum: 1 + description: The weight to apply to BM25 vs Vector search results. 
0 => pure BM25; 1 => pure vector; default: 0.7 confidence: type: number + minimum: 0 + maximum: 1 + description: The confidence cutoff level default: 0.6 + limit: + type: integer + format: uint16 + minimum: 1 + maximum: 50 + description: The limit of documents to return + default: 10 + lang: + type: string + enum: + - en-US + description: The language to be used for text-only search. Support for other languages coming soon. + default: en-US + metadata_filter: + type: object + additionalProperties: {} + description: Metadata filter to apply to the search + default: {} + mmr_strength: + type: number + minimum: 0 + maximum: 1 + exclusiveMaximum: true + description: MMR Strength (mmr_strength = 1 - mmr_lambda) + default: 0 Sessions.RecallOptionsUpdate: type: object properties: mode: allOf: - $ref: '#/components/schemas/Sessions.SearchMode' + description: The mode to use for the search. default: vector num_search_messages: type: integer format: uint32 + description: The number of search messages to use for the search. default: 4 max_query_length: type: integer format: uint32 + description: The maximum query length to use for the search. default: 1000 - hybrid_alpha: + alpha: type: number + minimum: 0 + maximum: 1 + description: The weight to apply to BM25 vs Vector search results. 0 => pure BM25; 1 => pure vector; default: 0.7 confidence: type: number + minimum: 0 + maximum: 1 + description: The confidence cutoff level default: 0.6 + limit: + type: integer + format: uint16 + minimum: 1 + maximum: 50 + description: The limit of documents to return + default: 10 + lang: + type: string + enum: + - en-US + description: The language to be used for text-only search. Support for other languages coming soon. 
+ default: en-US + metadata_filter: + type: object + additionalProperties: {} + description: Metadata filter to apply to the search + default: {} + mmr_strength: + type: number + minimum: 0 + maximum: 1 + exclusiveMaximum: true + description: MMR Strength (mmr_strength = 1 - mmr_lambda) + default: 0 Sessions.SearchMode: type: string enum: From e2ea7f3a8b0faf3c36945659a0db549c3567ea0f Mon Sep 17 00:00:00 2001 From: vedantsahai18 Date: Tue, 10 Dec 2024 17:04:08 -0500 Subject: [PATCH 2/7] feat(agents-api): added get history system call as tool --- agents-api/agents_api/activities/execute_system.py | 3 ++- agents-api/agents_api/activities/utils.py | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/agents-api/agents_api/activities/execute_system.py b/agents-api/agents_api/activities/execute_system.py index ca269417d..9d4ee1a7a 100644 --- a/agents-api/agents_api/activities/execute_system.py +++ b/agents-api/agents_api/activities/execute_system.py @@ -108,7 +108,8 @@ async def execute_system( ) await bg_runner() return res - + + # Handle create operations if system.operation == "create" and system.resource == "session": developer_id = arguments.pop("developer_id") session_id = arguments.pop("session_id", None) diff --git a/agents-api/agents_api/activities/utils.py b/agents-api/agents_api/activities/utils.py index d9ad1840c..f087b9433 100644 --- a/agents-api/agents_api/activities/utils.py +++ b/agents-api/agents_api/activities/utils.py @@ -308,6 +308,7 @@ def get_handler(system: SystemDef) -> Callable: from ..models.session.get_session import get_session as get_session_query from ..models.session.list_sessions import list_sessions as list_sessions_query from ..models.session.update_session import update_session as update_session_query + from ..models.entry.get_history import get_history as get_history_query from ..models.task.create_task import create_task as create_task_query from ..models.task.delete_task import delete_task as delete_task_query from 
..models.task.get_task import get_task as get_task_query @@ -376,6 +377,8 @@ def get_handler(system: SystemDef) -> Callable: return delete_session_query case ("session", None, "chat"): return chat + case ("session", None, "history"): + return get_history_query # TASKS case ("task", None, "list"): From 5ce27f5b9ac7335d3abe8bb7c384e60814e42b55 Mon Sep 17 00:00:00 2001 From: Vedantsahai18 Date: Tue, 10 Dec 2024 22:05:06 +0000 Subject: [PATCH 3/7] refactor: Lint agents-api (CI) --- agents-api/agents_api/activities/execute_system.py | 2 +- agents-api/agents_api/activities/utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/agents-api/agents_api/activities/execute_system.py b/agents-api/agents_api/activities/execute_system.py index 9d4ee1a7a..bc2bcd8cf 100644 --- a/agents-api/agents_api/activities/execute_system.py +++ b/agents-api/agents_api/activities/execute_system.py @@ -108,7 +108,7 @@ async def execute_system( ) await bg_runner() return res - + # Handle create operations if system.operation == "create" and system.resource == "session": developer_id = arguments.pop("developer_id") diff --git a/agents-api/agents_api/activities/utils.py b/agents-api/agents_api/activities/utils.py index f087b9433..5b12d34e0 100644 --- a/agents-api/agents_api/activities/utils.py +++ b/agents-api/agents_api/activities/utils.py @@ -303,12 +303,12 @@ def get_handler(system: SystemDef) -> Callable: from ..models.agent.update_agent import update_agent as update_agent_query from ..models.docs.delete_doc import delete_doc as delete_doc_query from ..models.docs.list_docs import list_docs as list_docs_query + from ..models.entry.get_history import get_history as get_history_query from ..models.session.create_session import create_session as create_session_query from ..models.session.delete_session import delete_session as delete_session_query from ..models.session.get_session import get_session as get_session_query from ..models.session.list_sessions import 
list_sessions as list_sessions_query from ..models.session.update_session import update_session as update_session_query - from ..models.entry.get_history import get_history as get_history_query from ..models.task.create_task import create_task as create_task_query from ..models.task.delete_task import delete_task as delete_task_query from ..models.task.get_task import get_task as get_task_query From 11a98f994e0c881bab9a64a88e4d46df76373362 Mon Sep 17 00:00:00 2001 From: vedantsahai18 Date: Wed, 11 Dec 2024 18:33:50 -0500 Subject: [PATCH 4/7] build: changelog ci pipeline using julep tasks --- .github/workflows/generate-changelog.yml | 82 ++++++++++ memory-store/Dockerfile | 4 +- scripts/generate_changelog.py | 189 +++++++++++++++++++++++ scripts/templates/authors.md | 9 ++ scripts/templates/changelog.yaml | 111 +++++++++++++ scripts/templates/header.html | 24 +++ 6 files changed, 417 insertions(+), 2 deletions(-) create mode 100644 .github/workflows/generate-changelog.yml create mode 100644 scripts/generate_changelog.py create mode 100644 scripts/templates/authors.md create mode 100644 scripts/templates/changelog.yaml create mode 100644 scripts/templates/header.html diff --git a/.github/workflows/generate-changelog.yml b/.github/workflows/generate-changelog.yml new file mode 100644 index 000000000..1d750f111 --- /dev/null +++ b/.github/workflows/generate-changelog.yml @@ -0,0 +1,82 @@ +name: Generate and Update Changelog + +on: + # schedule: + # - cron: "0 0 */14 * *" # Runs every two weeks at midnight UTC + workflow_dispatch: + +jobs: + changelog_generation: + runs-on: ubuntu-latest + + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: 0 + ref: dev + + - name: Setup GitHub CLI + run: | + echo "${{ secrets.GITHUB_TOKEN }}" | gh auth login --with-token + + - name: Collect merged PRs from the last two weeks + id: collect_prs + run: | + # Set date threshold for fetching PRs + if [[ "$OSTYPE" == "darwin"* ]]; then + date_threshold=$(date 
-v-14d +"%Y-%m-%d") + else + date_threshold=$(date -d '-14 days' +"%Y-%m-%d") + fi + + echo "Fetching merged PRs since $date_threshold..." + + # Find merged PRs from the last two weeks + merged_prs=$(gh pr list --state merged --json number,title,body,author --search "merged:>=$date_threshold" --jq 'map({number, title, body, author: .author.login})') + + if [ -z "$merged_prs" ] || [ "$merged_prs" = "null" ]; then + echo "No merged PRs found in the last two weeks." + echo "pr_data=[]" >> $GITHUB_ENV + echo '{"pr_data": []}' > pr_data.json + exit 0 + fi + + echo "pr_data=$merged_prs" >> $GITHUB_ENV + echo "pr_data=$merged_prs" >> "$GITHUB_OUTPUT" + echo "{\"pr_data\": $merged_prs}" > pr_data.json + + - name: Setup Python v3.10.12 + uses: actions/setup-python@v5 + with: + python-version: "3.10.12" + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install PyYAML julep git+https://github.com/Jwink3101/parmapper + + - name: Send PR data to Python script + if: steps.collect_prs.outputs.pr_data != '[]' + id: generate_changelog + run: | + if ! python scripts/generate_changelog.py; then + echo "Error: Failed to generate changelog" + exit 1 + fi + env: + JULEP_API_KEY: ${{ secrets.JULEP_API_KEY }} + TASK_UUID: ${{ secrets.TASK_UUID }} + AGENT_UUID: ${{ secrets.AGENT_UUID }} + + - name: Create Pull Request + if: success() && steps.collect_prs.outputs.pr_data != '[]' + uses: peter-evans/create-pull-request@v7 + with: + token: ${{ secrets.GITHUB_TOKEN }} + commit-message: "chore(changelog): update CHANGELOG.md" + title: "Update CHANGELOG.md" + body: "This PR updates the changelog with PRs from the last two weeks." 
+ branch: "update-changelog" + delete-branch: true + add-paths: | + CHANGELOG.md \ No newline at end of file diff --git a/memory-store/Dockerfile b/memory-store/Dockerfile index fa384cb12..b6b99b40c 100644 --- a/memory-store/Dockerfile +++ b/memory-store/Dockerfile @@ -16,8 +16,8 @@ RUN apt-get update && apt-get install -y \ # Build cozo-ce-bin from crates.io WORKDIR /usr/src -# RUN cargo install cozo-ce-bin@0.7.13-alpha.3 --features "requests graph-algo storage-new-rocksdb storage-sqlite jemalloc io-uring malloc-usable-size" -RUN cargo install --git https://github.com/cozo-community/cozo.git --branch f/publish-crate --rev 592f49b --profile release -F graph-algo -F jemalloc -F io-uring -F storage-new-rocksdb -F malloc-usable-size --target x86_64-unknown-linux-gnu cozo-ce-bin +RUN cargo install cozo-ce-bin@0.7.13-alpha.3 --features "requests graph-algo storage-new-rocksdb storage-sqlite jemalloc io-uring malloc-usable-size" +# RUN cargo install --git https://github.com/cozo-community/cozo.git --branch f/publish-crate --rev 592f49b --profile release -F graph-algo -F jemalloc -F io-uring -F storage-new-rocksdb -F malloc-usable-size --target x86_64-unknown-linux-gnu cozo-ce-bin # Copy the built binary to /usr/local/bin RUN cp /usr/local/cargo/bin/cozo-ce-bin /usr/local/bin/cozo diff --git a/scripts/generate_changelog.py b/scripts/generate_changelog.py new file mode 100644 index 000000000..3cb56d49b --- /dev/null +++ b/scripts/generate_changelog.py @@ -0,0 +1,189 @@ +# Standard library imports +import sys +import json +import re +import logging +from pathlib import Path +import os +import time +from typing import List, Dict, Any + +# Third-party imports +from julep import Client +import yaml + +# Configure logging with timestamp, level, and message format +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') + +# Constants and configurations +HTML_TAGS_PATTERN = re.compile(r"(<[^>]+>)") # Regex pattern to match HTML tags 
+REQUIRED_ENV_VARS = ['AGENT_UUID', 'TASK_UUID', 'JULEP_API_KEY'] # List of required environment variables + +def load_template(filename: str) -> str: + """Load template content from file""" + return Path(f'./scripts/templates/{filename}').read_text(encoding='utf-8') + +def run_task(pr_data: str) -> str: + """ + Execute the changelog generation task using Julep API. + + Args: + pr_data (str): Formatted PR data to process + + Returns: + str: Generated changelog content + + Raises: + ValueError: If required environment variables are missing + Exception: If task execution fails + """ + # Validate env vars with list comprehension + if missing_vars := [var for var in REQUIRED_ENV_VARS if not os.environ.get(var)]: + raise ValueError(f"Missing required environment variables: {', '.join(missing_vars)}") + + client = Client(api_key=os.environ['JULEP_API_KEY'], environment="dev") + + # Use context manager for file operations + with Path('./scripts/templates/changelog.yaml').open(encoding='utf-8') as f: + task_description = yaml.safe_load(f) + + # Create or update the AI agent + agent = client.agents.create_or_update( + agent_id=os.environ['AGENT_UUID'], + name="Changelog Generator", + about="An AI assistant that can generate a changelog from a list of PRs.", + model="gpt-4o", + ) + + # Create or update the task configuration + task = client.tasks.create_or_update( + task_id=os.environ['TASK_UUID'], + agent_id=os.environ['AGENT_UUID'], + **task_description + ) + + # Create a new execution instance + execution = client.executions.create( + task_id=os.environ['TASK_UUID'], + input={"pr_data": str(pr_data)} + ) + + # Wait for task completion using context manager for proper resource cleanup + with client: + while (result := client.executions.get(execution.id)).status not in ['succeeded', 'failed']: + time.sleep(3) + + if result.status != "succeeded": + raise Exception(result.error) + return result.output + +def preserve_and_update_changelog(new_changelog: str, source: str = 
'./CHANGELOG.md') -> None: + """ + Save the generated changelog while preserving HTML content. + + Args: + new_changelog (str): The new changelog content to save + source (str): Path to the changelog file (default: 'CHANGELOG.md') + """ + path = Path(source) + path.parent.mkdir(parents=True, exist_ok=True) + + # Load templates at runtime + html_content = load_template('header.html') + author_list = load_template('authors.md') + + content = f"{html_content}\n\n{new_changelog}\n\n{author_list}" + path.write_text(content, encoding='utf-8') + +def is_html_tag(segment: str) -> bool: + """ + Check if a given string segment is an HTML tag. + + Args: + segment (str): String to check + + Returns: + bool: True if segment is an HTML tag, False otherwise + """ + return re.fullmatch(HTML_TAGS_PATTERN, segment) is not None + +def process_body(body: str) -> str: + """ + Process PR body text by removing HTML tags and special markers. + + Args: + body (str): PR description body text + + Returns: + str: Cleaned and processed body text + """ + if not body: + return "" + + # Remove HTML tags and clean up the text + segments = [seg for seg in re.split(HTML_TAGS_PATTERN, body) if not is_html_tag(seg)] + processed_body = "".join(segments) + return processed_body.replace(">", "").replace("[!IMPORTANT]", "").strip() + +def process_pr_data(pr_data: str) -> str: + """ + Generate changelog entries from PR data. + + Args: + pr_data (str): JSON string containing PR information + + Returns: + str: Formatted changelog entries + """ + prs: List[Dict[str, Any]] = json.loads(pr_data) + + # Use list comprehension with f-strings + entries = [ + f"""- PR #{pr['number']}: {pr['title']} + Author: {pr['author']} + Body: + {process_body(pr.get('body', ''))} + """ + for pr in prs + ] + return "\n".join(entries) + +def main(pr_data: str) -> None: + """ + Main function to orchestrate changelog generation process. 
+ + Args: + pr_data (str): JSON string containing PR information + + Raises: + Exception: If any step in the process fails + """ + try: + logging.info("Processing PR data...") + processed_pr_data = process_pr_data(pr_data) + + logging.info("Running task...") + final_changelog = run_task(processed_pr_data) + + logging.info("Saving changelog...") + preserve_and_update_changelog(final_changelog) + + logging.info("Successfully saved changelog to CHANGELOG.md") + + # delete the pr_data.json file + os.remove('pr_data.json') + logging.info("Deleted pr_data.json file") + except Exception as e: + logging.error(f"Failed to generate changelog: {str(e)}") + raise + +# Script entry point +if __name__ == "__main__": + try: + # Read PR data from JSON file + with open('pr_data.json', 'r') as file: + pr_data = json.load(file) + main(pr_data) + except Exception as e: + logging.error(f"Script failed: {str(e)}") + sys.exit(1) \ No newline at end of file diff --git a/scripts/templates/authors.md b/scripts/templates/authors.md new file mode 100644 index 000000000..962b95f5e --- /dev/null +++ b/scripts/templates/authors.md @@ -0,0 +1,9 @@ +## Contributors + +Thank you to all our contributors who helped make this release possible! + +- [Dmitry Paramonov](https://github.com/whiterabbit1983) 🐇 +- [Ahmad Haidar](https://github.com/Ahmad-mtos) 🚀 +- [Diwank Tomar](https://github.com/creatorrr) 🌟 +- [Vedant Sahai](https://github.com/Vedantsahai18) 🔥 +- [Hamada Salhab](https://github.com/HamadaSalhab) 💡 \ No newline at end of file diff --git a/scripts/templates/changelog.yaml b/scripts/templates/changelog.yaml new file mode 100644 index 000000000..c6254e634 --- /dev/null +++ b/scripts/templates/changelog.yaml @@ -0,0 +1,111 @@ +name: Changelog Generator +description: Generates a changelog from a list of PRs.
+ +main: +- prompt: + - role: system + content: | + # AI Agent Prompt for Generating a Structured and Engaging Changelog + + Generate a **detailed** and **engaging changelog** based on information from the PR comment, title, author, and any additional context. + Your goal is to make the changelog both **informative** and **appealing** to the user. + + ## 🔄 Steps to Generate the Changelog: + + ### 1. **Extract Relevant Information** 📋 + Gather all the important details from the following sources: + - **PR Title**: What is the overall summary of the change? + - **PR Comment**: Any detailed description of the changes and reasons for the change. + - **PR Author**: Who made the change? (Include name or GitHub handle) + - **Additional Context**: Include any extra context or notes provided for a clearer understanding of the changes. + + ### 2. **Organize the Content** 🗂️ + Structure the changelog into **clear sections**. These sections should be: + - **Features** ✨: New functionalities or major additions. + - **Fixes** 🔧: Bug fixes or issue resolutions. + - **Improvements** 📈: Enhancements or optimizations made. + - **Performance Enhancements** 🚀: Changes that improve speed, efficiency, or scalability. + - **Breaking Changes** 💥: Changes that could potentially break backward compatibility. + + ### 3. **Detail the Changes** 📝 + For each section, include: + - A **concise but clear description** of what has changed. + - The **reason** behind the change (why it was needed). + - The **benefit** or **impact** on the user experience. + + ### 4. **Engage the User** 🗣️ + Write in a conversational and engaging tone. Try to: + - Highlight the **key updates** and their **user impact**. + - Encourage users to **take action** if necessary (e.g., “Please upgrade to this version to experience the new feature!”). + + ### 5. 
**Format the Changelog** 🖋️ + Make sure the changelog is visually clear and easy to read: + - Use **bullet points** for each change. + - Apply **headings** for sections like "Features", "Fixes", etc. + - Use **emojis** to visually separate and highlight sections. + + ### 6. **Output Format** 🗒️ + The final changelog should follow this format: + - **Title**: "Julep AI Changelog for 12th December 2024" (Note: Use `datetime.datetime.now().strftime('%d %B %Y')` for auto-generating the date). + - **Sections**: Start with a header for each section (e.g., `## Features ✨`) and list items under it in bullet points. + - Use **bold** and **italic** where needed to emphasize key points. + + The final changelog should be **engaging**, **well-structured**, and easy to read, making it accessible to both technical and non-technical users. + + Here are certain notes that you should follow: + - The output should be in markdown format as the output will be rendered in a markdown file. + - Wherever possible describe the changes in a way that is more engaging and conversational. + - Make it as visually appealing as possible. Add emojis to make it more engaging. + - Add empty lines between sections and make sure the output is formatted correctly. + - Do not add a footer listing the authors at the bottom; the contributors list is appended to the file separately. + - Ensure that the output is formatted correctly. + - Likewise, there is no need to add an author list or contributors list of your own. + + Please feel free to make changes to the output as you see fit. You only need to return the changelog and nothing else. + Add markdown formatting to the output wherever needed to make it more visually attractive, readable and engaging. + + ### Example Output: + + # **Julep AI Changelog for {{datetime.datetime.now().strftime('%d %B %Y')}}** ✨ + + ## **Features** ✨ + - **Real-Time Analytics Dashboard**: A brand-new dashboard that provides real-time data visualizations of user activity. 
+ - **Why**: We needed a way to track live data for quicker decision-making. + - **Impact**: Users can now see real-time metrics to improve decision-making on-the-go. + + ## **Fixes** 🔧 + - **Fixed an Issue with Data Export**: Resolved a bug preventing users from exporting large data sets. + - **Why**: Export failures were occurring due to improper handling of large files. + - **Impact**: Users can now export data without encountering errors. + + ## **Improvements** 📈 + - **Improved Search Functionality**: The search engine now returns results 20% faster. + - **Why**: To ensure quicker access to information. + - **Impact**: Users will notice faster search results, improving overall efficiency. + + ## **Performance Enhancements** 🚀 + - **Optimized Data Compression**: Reduced the size of stored data by 30%. + - **Why**: To make data storage more efficient. + - **Impact**: Faster data retrieval times and less storage usage. + + ## **Breaking Changes** 💥 + - **Deprecated Legacy API**: The old API version (v1) will no longer be supported from next month. + - **Why**: We’ve moved to a more secure and feature-rich version. + - **Impact**: Developers must upgrade to API v2 to continue using our services. + + ## **Notes** 📝 + - Please **update your version** to avoid issues with deprecated features. + - Known issues: Some users may experience temporary delays due to high traffic on the server. + + ## 🔑 Key Notes: + - **Clarity**: Use clear, non-technical language where possible. + - **Version Specifics**: Mention if a particular version or update is affected by the changes. + - **Known Issues**: Document any known issues, especially those that might impact the user experience. 
+ + - role: user + content: | + Here is the PR data: + + {{inputs[0]['pr_data']}} + + unwrap: true \ No newline at end of file diff --git a/scripts/templates/header.html b/scripts/templates/header.html new file mode 100644 index 000000000..50dd9f569 --- /dev/null +++ b/scripts/templates/header.html @@ -0,0 +1,24 @@ +
+ julep +
+ +

+
+ Explore Docs (wip) + · + Discord + · + 𝕏 + · + LinkedIn +

+ +

+ NPM Version +   + PyPI - Version +   + Docker Image Version +   + GitHub License +

\ No newline at end of file From 5829b6e211adbf00d76a571f99a85f334997131b Mon Sep 17 00:00:00 2001 From: vedantsahai18 Date: Wed, 11 Dec 2024 23:24:25 -0500 Subject: [PATCH 5/7] chore: changed CI changelog name --- .github/workflows/generate-changelog.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/generate-changelog.yml b/.github/workflows/generate-changelog.yml index 1d750f111..fb61f3aba 100644 --- a/.github/workflows/generate-changelog.yml +++ b/.github/workflows/generate-changelog.yml @@ -1,4 +1,5 @@ -name: Generate and Update Changelog +name: Julep Changelog Generation +run-name: ${{ github.actor }} is generating changelog for the last two weeks using Julep on: # schedule: From 84a1a9572012689a4050d5837b57d229ec024239 Mon Sep 17 00:00:00 2001 From: vedantsahai18 Date: Wed, 11 Dec 2024 23:42:52 -0500 Subject: [PATCH 6/7] fix: minor fix for changelog workflow --- .github/workflows/generate-changelog.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/generate-changelog.yml b/.github/workflows/generate-changelog.yml index fb61f3aba..b024726b8 100644 --- a/.github/workflows/generate-changelog.yml +++ b/.github/workflows/generate-changelog.yml @@ -1,9 +1,7 @@ -name: Julep Changelog Generation +name: Julep-Changelog-Generation run-name: ${{ github.actor }} is generating changelog for the last two weeks using Julep on: - # schedule: - # - cron: "0 0 */14 * *" # Runs every two weeks at midnight UTC workflow_dispatch: jobs: From d43cc8b884a823c858308be3a11e8aca3379218a Mon Sep 17 00:00:00 2001 From: vedantsahai18 Date: Thu, 12 Dec 2024 00:02:43 -0500 Subject: [PATCH 7/7] chore: minor fixes --- .github/workflows/generate-changelog.yml | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/.github/workflows/generate-changelog.yml b/.github/workflows/generate-changelog.yml index b024726b8..f23ebc7c8 100644 --- a/.github/workflows/generate-changelog.yml +++ 
b/.github/workflows/generate-changelog.yml @@ -1,8 +1,7 @@ name: Julep-Changelog-Generation -run-name: ${{ github.actor }} is generating changelog for the last two weeks using Julep on: - workflow_dispatch: + workflow_dispatch: {} jobs: changelog_generation: @@ -22,7 +21,6 @@ jobs: - name: Collect merged PRs from the last two weeks id: collect_prs run: | - # Set date threshold for fetching PRs if [[ "$OSTYPE" == "darwin"* ]]; then date_threshold=$(date -v-14d +"%Y-%m-%d") else @@ -31,8 +29,11 @@ jobs: echo "Fetching merged PRs since $date_threshold..." - # Find merged PRs from the last two weeks - merged_prs=$(gh pr list --state merged --json number,title,body,author --search "merged:>=$date_threshold" --jq 'map({number, title, body, author: .author.login})') + merged_prs=$( + gh pr list --state merged --json number,title,body,author \ + --search "merged:>=$date_threshold" \ + --jq 'map({number, title, body, author: .author.login})' + ) if [ -z "$merged_prs" ] || [ "$merged_prs" = "null" ]; then echo "No merged PRs found in the last two weeks." @@ -42,17 +43,17 @@ jobs: fi echo "pr_data=$merged_prs" >> $GITHUB_ENV - echo "pr_data=$merged_prs" >> "$GITHUB_OUTPUT" echo "{\"pr_data\": $merged_prs}" > pr_data.json - name: Setup Python v3.10.12 uses: actions/setup-python@v5 with: python-version: "3.10.12" + - name: Install dependencies run: | python -m pip install --upgrade pip - python -m pip install PyYAML julep git+https://github.com/Jwink3101/parmapper + python -m pip install PyYAML julep - name: Send PR data to Python script if: steps.collect_prs.outputs.pr_data != '[]' @@ -74,8 +75,8 @@ jobs: token: ${{ secrets.GITHUB_TOKEN }} commit-message: "chore(changelog): update CHANGELOG.md" title: "Update CHANGELOG.md" - body: "This PR updates the changelog with PRs from the last two weeks." + body: "This PR updates the changelog with PRs from the last 2 weeks." 
branch: "update-changelog" delete-branch: true add-paths: | - CHANGELOG.md \ No newline at end of file + CHANGELOG.md