Release/v0.1.0 beta #76

Merged 31 commits on Jul 16, 2024
95fb129
Minor ReadMe updates
JanPeterDatakind Jul 12, 2024
933e886
Simplified README
dividor Jul 14, 2024
ed52016
Removed bare OPENAI_API_KEY and replaces with assistant-specific one,…
dividor Jul 14, 2024
5988b08
Removed bare OPENAI_API_KEY and replaces with assistant-specific one,…
dividor Jul 14, 2024
2b54c6f
Removed bare OPENAI_API_KEY and replaces with assistant-specific one,…
dividor Jul 14, 2024
9aa5b19
Minor tweaks to setup for Azure assistants to use correct API ver in …
dividor Jul 14, 2024
0d3d6d6
Added note abount file uploads not working for Azure assistants
dividor Jul 14, 2024
4f20762
Added defensive code to ensure variables populated if deploying
dividor Jul 14, 2024
98db52c
Added defensive code to ensure variables populated if deploying
dividor Jul 14, 2024
81c1814
Merge branch 'develop' into chore/doc-updates2
dividor Jul 15, 2024
318b45f
Merge pull request #73 from datakind/chore/doc-updates2
JanPeterDatakind Jul 15, 2024
915718e
Fixed bug in cli chat
dividor Jul 15, 2024
fe754bc
Merge branch 'develop' of github.com:datakind/data-recipes-ai into de…
dividor Jul 15, 2024
24116c9
Fixed Bug Nicholas found during our call. The error was a missing tag…
dividor Jul 15, 2024
54d4c73
Fixed Bug Nicholas found during our call. The error was a missing tag…
dividor Jul 15, 2024
fe24377
updated db seeding
JanPeterDatakind Jul 15, 2024
097d032
Adjusted expect of servertest bc. answer changed
JanPeterDatakind Jul 16, 2024
35922f5
Adjusted e2e to fire on merge to main
dividor Jul 16, 2024
f6c94b2
Merge branch 'develop' of github.com:datakind/data-recipes-ai into de…
dividor Jul 16, 2024
ede7480
Updated tests per OpenAI
dividor Jul 16, 2024
e3fffb4
DEBUG GH Session
dividor Jul 16, 2024
323d966
Adjusted e2e branch run on push to develop
dividor Jul 16, 2024
7822048
REVERT : DEBUG GH Session
dividor Jul 16, 2024
8e91dec
Flipped to OpenAI, so need to change groundedness model
dividor Jul 16, 2024
fc1521c
Updating docs to be clearer about switching between open ai and azure…
dividor Jul 16, 2024
9e7e87c
Tightening groundedness prompt
dividor Jul 16, 2024
288cc6b
Minor fix for when there are no citations
dividor Jul 16, 2024
b93ff13
Added a bit more detail to prompt for image summarization. TODO Needs…
dividor Jul 16, 2024
509b2b9
Added a bit more detail to prompt for image summarization. TODO Needs…
dividor Jul 16, 2024
ccf2a11
Deactivating promptflow GH to allow release to proceed
dividor Jul 16, 2024
9685aa1
Merge pull request #74 from datakind/develop
JanPeterDatakind Jul 16, 2024
7 changes: 2 additions & 5 deletions .env.example
Original file line number Diff line number Diff line change
@@ -33,7 +33,7 @@ RECIPE_DB_CONN_STRING=postgresql://${POSTGRES_RECIPE_USER}:${POSTGRES_RECIPE_PAS
RECIPES_OPENAI_API_TYPE=<azure or openai>
RECIPES_OPENAI_API_KEY=<API Key>
RECIPES_OPENAI_API_ENDPOINT=<only for Azure, eg https://<YOUR DEPLOYMENT NAME>.openai.azure.com/>
RECIPES_OPENAI_API_VERSION=<only for Azure, eg 2024-02-15-preview>
RECIPES_OPENAI_API_VERSION=<only for Azure, eg 2024-05-01-preview>
RECIPES_MODEL=<On OpenAI, the model name; on Azure, the deployment name you created in Azure, eg gpt-4o>
#
# Leave these as-is for quick start
@@ -72,13 +72,10 @@ ASSISTANTS_API_TYPE=<azure or openai>
ASSISTANTS_API_KEY=<API Key as found on the Azure OpenAI resource>
ASSISTANTS_ID=<ID of the assistant you created in OpenAI. Leave blank if you do not have one yet>
ASSISTANTS_BASE_URL=<for Azure only, eg https://<YOUR DEPLOYMENT NAME>.openai.azure.com/>
ASSISTANTS_API_VERSION=<For Azure only, eg 2024-02-15-preview>
ASSISTANTS_API_VERSION=<For Azure only, eg 2024-05-01-preview>
ASSISTANTS_MODEL=<On OpenAI, the model name; on Azure, the deployment name of the model you created in Azure which the assistant uses, eg gpt-4o>
ASSISTANTS_BOT_NAME=<Your assistant name, eg "Humanitarian AI Assistant">

# Leave as-is
ASSISTANTS_API_KEY=${OPENAI_API_KEY}
ASSISTANTS_BASE_URL=""
#==================================================#
# Deployments Settings #
#==================================================#
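The split in `.env.example` above, between variables shared by both providers and those needed only for Azure, can be captured in a small check. The following is an illustrative sketch, not code from this repo: the variable names follow `.env.example`, but the `missing_recipe_vars` helper and its grouping logic are assumptions.

```python
# Which .env variables are required for each RECIPES_OPENAI_API_TYPE.
# Grouping is inferred from .env.example; this helper is hypothetical.
REQUIRED_COMMON = ["RECIPES_OPENAI_API_TYPE", "RECIPES_OPENAI_API_KEY", "RECIPES_MODEL"]
REQUIRED_AZURE = ["RECIPES_OPENAI_API_ENDPOINT", "RECIPES_OPENAI_API_VERSION"]


def missing_recipe_vars(env: dict) -> list:
    """Return names of required variables that are unset or blank."""
    required = list(REQUIRED_COMMON)
    if env.get("RECIPES_OPENAI_API_TYPE", "").strip() == "azure":
        required += REQUIRED_AZURE  # Azure also needs an endpoint and API version
    return [name for name in required if not env.get(name, "").strip()]
```

A deployment script could run this before `docker compose up` to fail fast on an incomplete `.env`.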
67 changes: 34 additions & 33 deletions .github/workflows/e2e_tests.yml
@@ -1,12 +1,13 @@
name: End-to-End tests

#on: [push, pull_request]

on: [push]

#on:
# pull_request_target:
# types: [labeled]
on:
push:
branches:
- develop
pull_request:
branches:
- master
- main

jobs:
test:
@@ -134,37 +135,37 @@ jobs:
#- name: DEBUG - Setup upterm session
# uses: lhotari/action-upterm@v1

#- name: DEBUG - Run Selenium outside of promptflow
# run: |
# docker exec promptflow python call_assistant.py

- name: Run tests
- name: DEBUG - Run Selenium outside of promptflow
run: |
env > .env
docker exec promptflow pf run create --flow . --data ./data.jsonl --stream --column-mapping query='${data.query}' context='${data.context}' chat_history='${data.chat_history}' --name base_run
docker exec promptflow python call_assistant.py

- name: Check logs post-tests
run: |
docker ps
# - name: Run tests
# run: |
# env > .env
# docker exec promptflow pf run create --flow . --data ./data.jsonl --stream --column-mapping query='${data.query}' context='${data.context}' chat_history='${data.chat_history}' --name base_run

echo "logs datadb ..."
docker compose logs datadb
# - name: Check logs post-tests
# run: |
# docker ps

echo "logs promptflow ..."
docker logs promptflow
# echo "logs datadb ..."
# docker compose logs datadb

echo "logs chat ..."
docker compose logs chat
# echo "logs promptflow ..."
# docker logs promptflow

echo "logs server ..."
docker compose logs server
# echo "logs chat ..."
# docker compose logs chat

- name: Show results
run: |
docker exec promptflow pf run show-details -n base_run
echo "Getting metrics ..."
docker exec promptflow pf run show-metrics -n base_run
##docker exec promptflow pf run visualize -n base_run
echo "Checking results ..."
docker exec promptflow python3 check_evaluation_results.py
# echo "logs server ..."
# docker compose logs server

# - name: Show results
# run: |
# docker exec promptflow pf run show-details -n base_run
# echo "Getting metrics ..."
# docker exec promptflow pf run show-metrics -n base_run
# ##docker exec promptflow pf run visualize -n base_run
# echo "Checking results ..."
# docker exec promptflow python3 check_evaluation_results.py
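The trigger change in this workflow, from a bare `on: [push]` to branch-scoped events, can be summarised as a predicate. A hypothetical Python sketch, not part of the workflow itself:

```python
def should_run_e2e(event: str, branch: str) -> bool:
    """Mirror of the triggers in e2e_tests.yml after this change:
    run on pushes to develop, and on pull requests targeting master or main."""
    if event == "push":
        return branch == "develop"
    if event == "pull_request":
        return branch in ("master", "main")
    return False
```

Note that a direct push to `main` no longer fires the suite; only pull requests into `main`/`master` do.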

13 changes: 11 additions & 2 deletions .github/workflows/get_memory_test.yml
@@ -7,8 +7,7 @@ jobs:
runs-on: ubuntu-latest
environment: "GitHub Actions 1"
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
ASSISTANTS_API_KEY: ${{ secrets.OPENAI_API_KEY }}
ASSISTANTS_API_KEY: ${{ secrets.ASSISTANTS_API_KEY }}
ASSISTANTS_API_TYPE: ${{ secrets.ASSISTANTS_API_TYPE }}
ASSISTANTS_API_VERSION: ${{ secrets.ASSISTANTS_API_VERSION }}
ASSISTANTS_ID: ${{ secrets.ASSISTANTS_ID }}
@@ -102,6 +101,16 @@ jobs:

sleep 10

# Debugging GitHub Actions interactively, by connecting to the runner ...
# Get ssh connection details for runner.
# See here https://github.com/marketplace/actions/debugging-with-ssh
# Basically, uncomment this, then get connection string in actions output, then connect with
#
# ssh -i <YOUR GITHUB SSH KEY> <CONN STRING ON ACTIONS>
#
#- name: DEBUG - Setup upterm session
# uses: lhotari/action-upterm@v1

- name: Run tests
run: |
echo "exec into container ..."
2 changes: 1 addition & 1 deletion CONTRIBUTION.md
@@ -93,7 +93,7 @@ Then ...

#### Changing between Azure OpenAI <> OpenAI

As noted in the README, the repo supports assistants on OpenAI or Azure OpenAI. The README has instructions on how to change in the `.env` file, but you will also have to change the connection in the promptflow groundedness node accordingly.
As noted in the README, the repo supports assistants on OpenAI or Azure OpenAI. The README has instructions on how to switch in the `.env` file; remember to change `ASSISTANTS_ID` as well as the API settings, but you will **also have to change the connection in the promptflow groundedness node accordingly.**
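As a sketch of what that switch typically involves on the client side (variable names follow `.env.example`; the helper itself is hypothetical), Azure needs an API version and endpoint, while plain OpenAI needs only a key:

```python
def assistant_client_settings(env: dict) -> dict:
    """Illustrative only: the settings you would pass to AzureOpenAI()
    vs OpenAI() from the openai Python SDK, keyed off .env values."""
    if env.get("ASSISTANTS_API_TYPE") == "azure":
        return {
            "client": "AzureOpenAI",
            "api_key": env["ASSISTANTS_API_KEY"],
            "api_version": env["ASSISTANTS_API_VERSION"],
            "azure_endpoint": env["ASSISTANTS_BASE_URL"],
        }
    return {"client": "OpenAI", "api_key": env["ASSISTANTS_API_KEY"]}
```

The promptflow groundedness node keeps its own connection, which is why it must be updated separately from `.env`.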

## GitHub Workflow

61 changes: 29 additions & 32 deletions README.md
@@ -13,11 +13,24 @@ For more information on the recipes concept, please see [here](https://towar
- [Demo video](https://www.loom.com/share/e27ae40a73e9470597e2d58176ecc4b9?sid=8421a557-fbdd-47f2-85eb-4e96da96b55f) showing AI-assisted coding for managing recipes
- [Demo video](https://www.loom.com/share/bd63433f977c4f21b2b7d44e7305473f?sid=9ad945ef-cbbd-469d-b4b5-dc8c6859fd75) showing end-user prototype

# Table of Contents
- [Design Concepts](#design-concepts)
- [What's in this repo?](#whats-in-this-repo)
- [What features are supported?](#what-features-are-supported)
- [Quick start](#quick-start)
- [Stopping/Starting the environment](#stopingstarting-the-environment)
- [Using Recipes](#using-recipes)
- [Additional Features](#additional-features)
- [Managing recipes](#managing-recipes)
- [Autogen Studio and autogen agent teams for creating data recipes](#autogen-studio-and-autogen-agent-teams-for-creating-data-recipes)

For additional information, check out [CONTRIBUTION.md](CONTRIBUTION.md)

# Design Concepts

Data recipes have two types: (i) Exact memories, eg '*What is the population of Mali?*' which can be served directly to the user when they ask this question; (ii) Generic skills which can be run when requested for a scenario not in memory, eg a skill for 'What is the population of country X?' which can be called when the user asks something like '*What is the population of Nigeria?*'. In both cases the match to the user's intent is made using semantic search with LLM-reranking.

Given the rapidly changing landscape of LLMs, we have tried as much as possible to implement data recipes in such a way that it can be integrated with various semantic architectures and frameworks. By implementing recipes using a recipes server (powered by FastAPI), it can be called from [Open AI assistant](https://platform.openai.com/docs/assistants/overview) actions and [Copilot Studio](https://www.microsoft.com/en-us/microsoft-copilot/microsoft-copilot-studio) as well from any custom code. Also included in this repo is an example of using recipes via OpenAI format plugins, as supported by frameworks such as [semantic kernel](https://learn.microsoft.com/en-us/semantic-kernel/overview/?tabs=Csharp).
Given the rapidly changing landscape of LLMs, we have tried as much as possible to implement data recipes in such a way that it can be integrated with various semantic architectures and frameworks. By implementing recipes using a recipes server (powered by FastAPI), it can be called from [Open AI assistant](https://platform.openai.com/docs/assistants/overview) actions and [Copilot Studio](https://www.microsoft.com/en-us/microsoft-copilot/microsoft-copilot-studio) as well as from any custom code. Also included in this repo is an example of using recipes via OpenAI format plugins, as supported by frameworks such as [semantic kernel](https://learn.microsoft.com/en-us/semantic-kernel/overview/?tabs=Csharp).

Data recipes support datasources accessed via API, but in some cases it is preferable to ingest data in order to leverage LLM SQL capabilities. We include an initial set of data sources specific to humanitarian response in the ingestion module, which can be extended to include additional sources as required.
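The two-tier matching described above can be sketched in a few lines. This is a toy illustration with a stand-in token-overlap similarity; the real system uses semantic search with LLM re-ranking, and the function and threshold below are assumptions.

```python
def similarity(a: str, b: str) -> float:
    """Toy Jaccard similarity over whitespace tokens (stand-in for embeddings)."""
    ta, tb = set(a.lower().split()), set(b.lower().split())
    return len(ta & tb) / len(ta | tb) if ta | tb else 0.0


def match_recipe(query: str, memories: dict, skills: dict, threshold: float = 0.8):
    """Serve an exact memory on a close match, else fall back to a generic skill."""
    best = max(memories, key=lambda q: similarity(query, q), default=None)
    if best and similarity(query, best) >= threshold:
        return ("memory", memories[best])  # saved result, served directly
    best = max(skills, key=lambda q: similarity(query, q), default=None)
    if best:
        return ("skill", skills[best])  # parameterised recipe, run on demand
    return (None, None)
```

So '*What is the population of Mali?*' hits the saved memory, while '*What is the population of Nigeria?*' falls through to the generic population skill.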

@@ -39,7 +52,7 @@ This repo contains a docker-compose environment that will run the following comp
- (Azure) Open AI Assistant creation tools to create assistants that are aware of the data sources available in the data recipes ai environment
- Autogen studio agent team for helping creating recipes [ In progress ]

# What can features are supported?
# What features are supported?

- Ability to create data recipes using LLMs, these can be served to end users via chat
- Ability for end users to access memories and recipes using a chat interface. Memories will present saved results, recipes will run to get latest results
Expand All @@ -61,43 +74,25 @@ This repo contains a docker-compose environment that will run the following comp

First, copy `.env.example` in your repo to `.env` in the same location, then adjust the following variables.

If using **Azure OpenAI**, you will need to set these in your `.env` ...

```
RECIPES_OPENAI_API_TYPE=azure
RECIPES_OPENAI_API_KEY=<The API key>
RECIPES_OPENAI_API_ENDPOINT=<eg https://<YOUR DEPLOYMENT NAME>.openai.azure.com/>
RECIPES_OPENAI_API_VERSION=<The API version in your deployment, eg 2024-05-01-preview>
RECIPES_MODEL=<The deployment name you created in Azure, eg gpt-4o>
RECIPES_OPENAI_API_TYPE=<azure or openai>
RECIPES_OPENAI_API_KEY=<API Key>
RECIPES_OPENAI_API_ENDPOINT=<only for Azure, eg https://<YOUR DEPLOYMENT NAME>.openai.azure.com/>
RECIPES_OPENAI_API_VERSION=<only for Azure, eg 2024-02-15-preview>
RECIPES_MODEL=<On OpenAI, the model name; on Azure, the deployment name you created in Azure, eg gpt-4o>

ASSISTANTS_API_TYPE=azure
ASSISTANTS_API_TYPE=<azure or openai>
ASSISTANTS_API_KEY=<API Key as found on the Azure OpenAI resource>
ASSISTANTS_ID=<ID of the assistant you created in OpenAI. Leave blank if you do not have one yet>
ASSISTANTS_BASE_URL=<eg https://<YOUR DEPLOYMENT NAME>.openai.azure.com/>
ASSISTANTS_API_VERSION=<The API version in your deployment, eg 2024-05-01-preview>
ASSISTANTS_MODEL=<The deployment name of the model you created in Azure which the assitant uses, eg gpt-4o>
ASSISTANTS_BASE_URL=<for Azure only, eg https://<YOUR DEPLOYMENT NAME>.openai.azure.com/>
ASSISTANTS_API_VERSION=<For Azure only, eg 2024-02-15-preview>
ASSISTANTS_MODEL=<On OpenAI, the model name; on Azure, the deployment name of the model you created in Azure which the assistant uses, eg gpt-4o>
ASSISTANTS_BOT_NAME=<Your assistant name, eg "Humanitarian AI Assistant">

```

Note: In Azure Playground, you can view code for your assistant which provides most of the variables above

If using **OpenAI directly**, you will instead need to set these ...

```
RECIPES_OPENAI_API_TYPE=openai
RECIPES_OPENAI_API_KEY=<The API key you created on OpenAI>
RECIPES_MODEL=<model name, we recommend gpt-4o>
RECIPES_OPENAI_TEXT_COMPLETION_DEPLOYMENT_NAME=text-embedding-ada-002

ASSISTANTS_API_TYPE=openai
OPENAI_API_KEY=<The API key you created on OpenAI>
ASSISTANTS_API_KEY=${OPENAI_API_KEY}
ASSISTANTS_ID=<ID of the assistant you created in OpenAI. Leave blank if you do not have one yet>
ASSISTANTS_MODEL=<The model your assistant uses>
ASSISTANTS_BOT_NAME=<Your assistant name, eg "Humanitarian AI Assistant">
```

Be aware that lower-power models such as GPT-3.5-Turbo can serve recipes and carry out basic chat, but perform poorly for analysis and code generation.

Not needed for quick start, but if you want to run ingestion of data with the new HDX API, then you will need to set ...
@@ -106,7 +101,7 @@ This repo contains a docker-compose environment that will run the following comp

4. Download sample Humanitarian Data Exchange (HDX) API data

For a quick start, we have prepared a sample dataset extracted from the new [HDX API](https://hdx-hapi.readthedocs.io/en/latest/). You can also run the ingestion yourself (see below), but this demo file should get you started quickly.
For a quick start, we have prepared a sample dataset extracted from the new [HDX API](https://hdx-hapi.readthedocs.io/en/latest/). You can also run the ingestion yourself [see below](#analysis-on-ingested-data), but this demo file should get you started quickly.

From [this Google folder](https://drive.google.com/drive/folders/1E4G9HM-QzxdXVNkgP3fQXsuNcABWzdus?usp=sharing), download the file starting with 'datadb' and save it into the 'data' folder of your repo.

Expand All @@ -122,9 +117,11 @@ This repo contains a docker-compose environment that will run the following comp

In a terminal, navigate to the repo top folder and run `docker compose exec chat python create_update_assistant.py`

Make note of the assistant ID, then edit your `.env` file and use it to set the variable `ASSISTANTS_ID`.
Make note of the assistant ID printed, then edit your `.env` file and use it to set the variable `ASSISTANTS_ID`.

Note: (i) If you rerun `create_update_assistant.py` once `ASSISTANTS_ID` is set, the script will update the assistant rather than create a new one. You will need to do this if trying different models; (ii) You can also add your own data, pdf, docx, csv, xlsx files for the assistant to use, see section [Adding your own files for the assistant to analyze](#adding-your-own-files-for-the-assistant-to-analyze) below.

Note: (i) If you rerun `create_update_assistant.py` once `ASSISTANTS_ID` is set, the script will update the assistant rather than create a new one. You will need to do this if trying different models; (ii) You can also add your own data, pdf, docx, csv, xlsx files for the assistant to use, see section 'Adding your own files for the assistant to analyze' below.
*Warning! If using **Azure**, at time of writing (July 2024), [the documented approach](https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/file-search?tabs=python#update-the-assistant-to-use-the-new-vector-store) for uploading files in Python does not work while assistants are in preview mode. Though the Python runs, the files do not appear in the UI. We recommend you upload the files in [./assistants/chat_ui/files](./assistants/chat_ui/files) via the UI yourself.*

7. Restart so the assistant ID is set, `docker compose up -d`

19 changes: 10 additions & 9 deletions assistants/chat_ui/create_update_assistant.py
@@ -221,7 +221,7 @@ def create_update_assistant():
f.write(instructions)

# Upload any local files needed by assistant for file_search (RAG)
vector_store_id = upload_files_to_vector_store("local_files_vectore_store", client)
vector_store_id = upload_files_to_vector_store("local_files_vector_store", client)

# Upload any files that will be used for code_interpretor
code_interpreter_file_ids = upload_files_for_code_interpreter(client)
@@ -258,23 +258,24 @@ def create_update_assistant():
if "code_interpreter" in tool_resources or "file_search" in tool_resources:
params["tool_resources"] = tool_resources

# If we were provided an ID in .env, pass it in to update existing assistant
if assistant_id is not None:
params["assistant_id"] = assistant_id

print(json.dumps(params, indent=4))
print(json.dumps(params, indent=2))

if assistant_id is None:
if (
assistant_id is None
or assistant_id == ""
or assistant_id.replace(" ", "") == ""
):
print(
f"Calling assistant API for ID: {assistant_id}, name: {bot_name} and model {model} ..."
f"Calling CREATE assistant API for ID: {assistant_id}, name: {bot_name} and model {model} ..."
)
assistant = client.beta.assistants.create(**params)
print("Assistant created!! Here is the assistant ID:\n")
print(assistant.id)
print("\nNow update ASSISTANTS_ID in your .env file with this ID")
else:
params["assistant_id"] = assistant_id
print(
f"Calling assistant API for ID: {assistant_id}, name: {bot_name} and model {model} ..."
f"Calling UPDATE assistant API for ID: {assistant_id}, name: {bot_name} and model {model} ..."
)
assistant = client.beta.assistants.update(**params)
print("Assistant updated!!")
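The diff above hardens the branch against an empty or whitespace-only `ASSISTANTS_ID`. Its decision logic can be reduced to a small sketch; the real script then calls `client.beta.assistants.create(**params)` or `client.beta.assistants.update(**params)` accordingly, and this helper is illustrative, not repo code.

```python
def choose_assistant_action(params: dict, assistant_id) -> str:
    """Treat a missing, empty, or whitespace-only assistant ID as 'create';
    otherwise add the ID to params and report 'update'."""
    if assistant_id is None or assistant_id.strip() == "":
        return "create"
    params["assistant_id"] = assistant_id  # the ID is only passed when updating
    return "update"
```

Moving the `params["assistant_id"] = assistant_id` line into the update branch matters: the create API takes no `assistant_id` argument.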
70 changes: 31 additions & 39 deletions db/recipedb/3-demo-data-langchain-embedding.sql

Large diffs are not rendered by default.
