From 390dda9f6d01ae66aec39c093f26d3defe0e7b30 Mon Sep 17 00:00:00 2001
From: Anil Dwarakanath
Date: Sat, 7 Oct 2023 01:11:48 -0700
Subject: [PATCH 1/3] add vectordb

---
 .../ingest/search-indexer-vectordb.py | 191 ++++++++++++++++++
 1 file changed, 191 insertions(+)
 create mode 100644 scenarios/openai_on_custom_dataset/ingest/search-indexer-vectordb.py

diff --git a/scenarios/openai_on_custom_dataset/ingest/search-indexer-vectordb.py b/scenarios/openai_on_custom_dataset/ingest/search-indexer-vectordb.py
new file mode 100644
index 000000000..0b2329448
--- /dev/null
+++ b/scenarios/openai_on_custom_dataset/ingest/search-indexer-vectordb.py
@@ -0,0 +1,191 @@
+import os
+import openai
+from azure.search.documents.indexes import SearchIndexClient
+from azure.search.documents.models import Vector  # pip install ./build/azure_search_documents-11.4.0b4-py3-none-any.whl
+from azure.search.documents.indexes.models import (
+    SearchIndex,
+    SearchField,
+    SearchFieldDataType,
+    SimpleField,
+    SearchableField,
+    SemanticConfiguration,
+    PrioritizedFields,
+    SemanticField,
+    SemanticSettings,
+    VectorSearch,
+    HnswVectorSearchAlgorithmConfiguration,
+)
+from azure.core.credentials import AzureKeyCredential
+from azure.ai.formrecognizer import DocumentAnalysisClient
+import requests
+from dotenv import load_dotenv
+from pathlib import Path  # Python 3.6+ only
+
+
+env_path = Path('.') / 'secrets.env'
+load_dotenv(dotenv_path=env_path)
+
+
+SEARCH_ENDPOINT = os.environ["AZSEARCH_EP"]
+SEARCH_API_KEY = os.environ["AZSEARCH_KEY"]
+SEARCH_INDEX = os.environ["INDEX_NAME"]
+
+# Vector content requires a vector-aware preview REST API version;
+# 2021-04-30-Preview predates vector search.
+api_version = '?api-version=2023-07-01-Preview'
+headers = {'Content-Type': 'application/json',
+           'api-key': SEARCH_API_KEY}
+
+endpoint = os.environ["AFR_ENDPOINT"]
+key = os.environ["AFR_API_KEY"]
+credential = AzureKeyCredential(SEARCH_API_KEY)
+openai.api_type = "azure"
+openai.api_key = os.getenv("OPENAI_API_KEY")
+openai.api_base = os.getenv("OPENAI_API_BASE")
+openai.api_version = os.getenv("OPENAI_API_VERSION")
+
+formUrl = os.environ["FILE_URL"]
+localFolderPath = os.environ["LOCAL_FOLDER_PATH"]
+
+if formUrl == "" and localFolderPath == "":
+    print("Please provide a valid FILE_URL or LOCAL_FOLDER_PATH in the secrets.env file.")
+    exit()
+
+
+document_analysis_client = DocumentAnalysisClient(
+    endpoint=endpoint, credential=AzureKeyCredential(key)
+)
+
+
+def create_vector_index(index_name=SEARCH_INDEX):
+    index_client = SearchIndexClient(
+        endpoint=SEARCH_ENDPOINT, credential=credential)
+
+    try:
+        index_client.get_index(index_name)
+        return  # index already exists
+    except Exception as e:
+        if getattr(e, "status_code", None) == 404:
+            pass  # index not found; fall through and create it
+        else:
+            raise e
+
+    fields = [
+        SimpleField(name="id", type=SearchFieldDataType.String, key=True),
+        SearchableField(name="text", type=SearchFieldDataType.String,
+                        searchable=True, retrievable=True),
+        SearchableField(name="summary", type=SearchFieldDataType.String,
+                        searchable=True, retrievable=True),
+        SearchableField(name="fileName", type=SearchFieldDataType.String,
+                        searchable=False, retrievable=True),
+        SearchableField(name="pageNumber", type=SearchFieldDataType.String,
+                        filterable=False, searchable=False, retrievable=True),
+        SearchField(name="textVector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
+                    searchable=True, vector_search_dimensions=1536, vector_search_configuration="vector-config"),
+        SearchField(name="summaryVector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
+                    searchable=True, vector_search_dimensions=1536, vector_search_configuration="vector-config"),
+    ]
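+    # Both vector fields reference the "vector-config" HNSW algorithm configuration
+    # defined below; 1536 matches the output dimensionality of text-embedding-ada-002.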
+
+    vector_search = VectorSearch(
+        algorithm_configurations=[
+            HnswVectorSearchAlgorithmConfiguration(
+                name="vector-config",
+                kind="hnsw",
+                hnsw_parameters={
+                    "m": 4,
+                    "efConstruction": 400,
+                    "efSearch": 500,
+                    "metric": "cosine"
+                }
+            )
+        ]
+    )
+
+    semantic_config = SemanticConfiguration(
+        name="my-semantic-config",
+        prioritized_fields=PrioritizedFields(
+            title_field=SemanticField(field_name="text"),
+            prioritized_keywords_fields=[SemanticField(field_name="summary")]
+        )
+    )
+    # Create the semantic settings with the configuration
+    semantic_settings = SemanticSettings(configurations=[semantic_config])
+
+    # Create the search index with the semantic settings
+    index = SearchIndex(name=index_name, fields=fields,
+                        vector_search=vector_search, semantic_settings=semantic_settings)
+    result = index_client.create_index(index)
+    return f"{result.name} created"
+
+
+def delete_search_index(index_name=SEARCH_INDEX):
+    try:
+        url = SEARCH_ENDPOINT + "indexes/" + index_name + api_version
+        response = requests.delete(url, headers=headers)
+        response.raise_for_status()  # surface REST errors instead of failing silently
+        return "Index deleted"
+    except Exception as e:
+        return e
+
+
+def add_document_to_index(page_idx, documents, index_name=SEARCH_INDEX):
+    try:
+        url = SEARCH_ENDPOINT + "indexes/" + index_name + "/docs/index" + api_version
+        response = requests.post(url, headers=headers, json=documents)
+        response.raise_for_status()  # surface REST errors instead of failing silently
+        print(f"page_idx is {page_idx} - {len(documents['value'])} documents added")
+    except Exception as e:
+        print(e)
+
+
+def process_afr_result(result, filename):
+    print(f"Processing {filename} with {len(result.pages)} pages into Azure Search....this might take a few minutes depending on number of pages...")
+    for page_idx in range(len(result.pages)):
+        docs = []
+        content_chunk = ""
+        for line_idx, line in enumerate(result.pages[page_idx].lines):
+            # Accumulate page text into ~20-line chunks before embedding and indexing.
+            content_chunk += line.content + "\n"
+            if line_idx != 0 and line_idx % 20 == 0:
+                search_doc = {
+                    "id": f"page-number-{page_idx + 1}-line-number-{line_idx}",
+                    "text": content_chunk,
+                    "textVector": generate_embeddings(content_chunk),
+                    "fileName": filename,
+                    "pageNumber": str(page_idx + 1)
+                }
+                docs.append(search_doc)
+                content_chunk = ""
+        # Index whatever remains after the last full chunk on this page.
+        search_doc = {
+            "id": f"page-number-{page_idx + 1}-line-number-{line_idx}",
+            "text": content_chunk,
+            "textVector": generate_embeddings(content_chunk),
+            "fileName": filename,
+            "pageNumber": str(page_idx + 1)
+        }
+        docs.append(search_doc)
+        add_document_to_index(page_idx, {"value": docs})
+
+
+# Function to generate embeddings for title and content fields, also used for query embeddings
+def generate_embeddings(text):
+    response = openai.Embedding.create(
+        input=text, engine="text-embedding-ada-002")
+    embeddings = response['data'][0]['embedding']
+    return embeddings
+
+
+def main():
+    create_vector_index()
+    if formUrl != "":
+        print(f"Analyzing form from URL {formUrl}...")
+        poller = document_analysis_client.begin_analyze_document_from_url("prebuilt-layout", formUrl)
+        result = poller.result()
+        print("Processing result...this might take a few minutes...")
+        process_afr_result(result, "aml-api-v2.pdf")
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
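Note on patch 1: the Vector class imported at the top of search-indexer-vectordb.py is never used by the indexer itself; it belongs to the query side. Below is a minimal retrieval sketch against the index the patch creates, assuming the 11.4.0b4 beta wheel named in the import comment (the vector-query parameter changed across the 11.4.0 betas, from vector= to vectors=[...], so verify against the installed build):

    # Hypothetical query-side sketch; reuses SEARCH_ENDPOINT, SEARCH_INDEX,
    # credential, and generate_embeddings() from the indexer script.
    from azure.search.documents import SearchClient
    from azure.search.documents.models import Vector

    search_client = SearchClient(endpoint=SEARCH_ENDPOINT,
                                 index_name=SEARCH_INDEX, credential=credential)
    results = search_client.search(
        search_text=None,  # pure vector query; supply text as well for hybrid search
        vector=Vector(value=generate_embeddings("How do I create a compute cluster?"),
                      k=3, fields="textVector"),
        select=["text", "fileName", "pageNumber"],
    )
    for doc in results:
        print(doc["fileName"], doc["pageNumber"])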
From 548580f4c6ff0300a8a1fb1d9354302604870cb8 Mon Sep 17 00:00:00 2001
From: Anil Dwarakanath
Date: Mon, 9 Oct 2023 23:51:46 -0700
Subject: [PATCH 2/3] update

---
 .../ingest/requirements.txt | 6 ++++++
 .../ingest/search-indexer-vectordb.py | 15 ++++++++++++++-
 .../ingest/secrets.rename | 6 ++++++
 .../orchestrator/requirements.txt | 2 +-
 4 files changed, 27 insertions(+), 2 deletions(-)
 create mode 100644 scenarios/openai_on_custom_dataset/ingest/requirements.txt

diff --git a/scenarios/openai_on_custom_dataset/ingest/requirements.txt b/scenarios/openai_on_custom_dataset/ingest/requirements.txt
new file mode 100644
index 000000000..bad7c4572
--- /dev/null
+++ b/scenarios/openai_on_custom_dataset/ingest/requirements.txt
@@ -0,0 +1,6 @@
+azure-core
+openai
+pandas
+#azure-search-documents==11.4.0b9
+azure-ai-formrecognizer
+python-dotenv
\ No newline at end of file
diff --git a/scenarios/openai_on_custom_dataset/ingest/search-indexer-vectordb.py b/scenarios/openai_on_custom_dataset/ingest/search-indexer-vectordb.py
index 0b2329448..8376abacf 100644
--- a/scenarios/openai_on_custom_dataset/ingest/search-indexer-vectordb.py
+++ b/scenarios/openai_on_custom_dataset/ingest/search-indexer-vectordb.py
@@ -171,7 +171,7 @@ def process_afr_result(result, filename):
 # Function to generate embeddings for title and content fields, also used for query embeddings
 def generate_embeddings(text):
     response = openai.Embedding.create(
-        input=text, engine="text-embedding-ada-002")
+        input=text, engine=os.getenv("OPENAI_MODEL_NAME"))
     embeddings = response['data'][0]['embedding']
     return embeddings
 
@@ -186,6 +186,19 @@ def main():
         print("Processing result...this might take a few minutes...")
         process_afr_result(result, "aml-api-v2.pdf")
 
+    if localFolderPath != "":
+        for filename in os.listdir(localFolderPath):
+            file = os.path.join(localFolderPath, filename)
+            with open(file, "rb") as f:
+                print(f"Analyzing file {filename} from directory {localFolderPath}...")
+                poller = document_analysis_client.begin_analyze_document(
+                    "prebuilt-layout", document=f
+                )
+                result = poller.result()
+                print(f"{filename}: processing result...this might take a few minutes...")
+                process_afr_result(result, filename)
+    print("done")
+
 
 if __name__ == "__main__":
     main()
\ No newline at end of file
diff --git a/scenarios/openai_on_custom_dataset/ingest/secrets.rename b/scenarios/openai_on_custom_dataset/ingest/secrets.rename
index 80709087b..71601f8b7 100644
--- a/scenarios/openai_on_custom_dataset/ingest/secrets.rename
+++ b/scenarios/openai_on_custom_dataset/ingest/secrets.rename
@@ -3,5 +3,11 @@ AZSEARCH_KEY=""
 AFR_ENDPOINT="https://westus2.api.cognitive.microsoft.com/"
 AFR_API_KEY=""
 INDEX_NAME="azure-ml-docs"
+OPENAI_API_KEY=b22952ca03934af9953bf842b813682b
+OPENAI_API_BASE="https://anildwaextopenai2.openai.azure.com/"
+OPENAI_API_VERSION=2023-03-15-preview
+OPENAI_API_TYPE="azure"
+OPENAI_MODEL_NAME="text-embedding-ada-002"
+
 FILE_URL="https://github.com/microsoft/OpenAIWorkshop/raw/main/scenarios/data/azure-machine-learning-2-500.pdf"
 LOCAL_FOLDER_PATH="" #for e.g C:\\source\\repos\\tibco\\testfile for Windows and /mnt/source/folder for Linux
\ No newline at end of file
diff --git a/scenarios/openai_on_custom_dataset/orchestrator/requirements.txt b/scenarios/openai_on_custom_dataset/orchestrator/requirements.txt
index 7626ee2f2..2dbaae0c5 100644
--- a/scenarios/openai_on_custom_dataset/orchestrator/requirements.txt
+++ b/scenarios/openai_on_custom_dataset/orchestrator/requirements.txt
@@ -8,6 +8,6 @@ openai==0.27.2
 azure-kusto-data==4.0.2
 pandas
 pyodbc
-azure-search-documents==11.4.0b3
+azure-search-documents==11.4.0b9
 azure-ai-formrecognizer
 python-dotenv
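Note on patch 2: generate_embeddings now reads the deployment name from OPENAI_MODEL_NAME, so a missing or misspelled entry in secrets.env only surfaces once the first embeddings call fails mid-ingestion. A small fail-fast guard could sit right after load_dotenv(); this is a sketch, with the variable name taken from secrets.rename:

    # Hypothetical startup check; OPENAI_MODEL_NAME is the Azure OpenAI
    # deployment name of the embedding model (e.g. text-embedding-ada-002).
    if not os.getenv("OPENAI_MODEL_NAME"):
        raise SystemExit("Set OPENAI_MODEL_NAME in secrets.env before running the indexer.")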
From 75fa284c70ed3c50b2b91d4f948e33e809a89c13 Mon Sep 17 00:00:00 2001
From: Anil Dwarakanath
Date: Mon, 9 Oct 2023 23:53:11 -0700
Subject: [PATCH 3/3] update

---
 scenarios/openai_on_custom_dataset/ingest/secrets.rename | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scenarios/openai_on_custom_dataset/ingest/secrets.rename b/scenarios/openai_on_custom_dataset/ingest/secrets.rename
index 71601f8b7..2b995cb2a 100644
--- a/scenarios/openai_on_custom_dataset/ingest/secrets.rename
+++ b/scenarios/openai_on_custom_dataset/ingest/secrets.rename
@@ -3,7 +3,7 @@ AZSEARCH_KEY=""
 AFR_ENDPOINT="https://westus2.api.cognitive.microsoft.com/"
 AFR_API_KEY=""
 INDEX_NAME="azure-ml-docs"
-OPENAI_API_KEY=b22952ca03934af9953bf842b813682b
+OPENAI_API_KEY=
 OPENAI_API_BASE="https://anildwaextopenai2.openai.azure.com/"
 OPENAI_API_VERSION=2023-03-15-preview
 OPENAI_API_TYPE="azure"
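Note on the series as a whole: ingesting a several-hundred-page PDF issues one embeddings call per ~20-line chunk, so Azure OpenAI rate limiting (HTTP 429) is the most likely failure mode at scale. A minimal backoff wrapper, assuming the openai 0.27-era SDK pinned in the orchestrator requirements (where the exception is openai.error.RateLimitError):

    import time
    import openai

    # Sketch: retry rate-limited embedding calls with linearly increasing backoff.
    def generate_embeddings_with_retry(text, retries=5, base_delay=2.0):
        for attempt in range(retries):
            try:
                return generate_embeddings(text)
            except openai.error.RateLimitError:
                if attempt == retries - 1:
                    raise  # out of retries; propagate the last 429
                time.sleep(base_delay * (attempt + 1))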