From 390dda9f6d01ae66aec39c093f26d3defe0e7b30 Mon Sep 17 00:00:00 2001
From: Anil Dwarakanath
Date: Sat, 7 Oct 2023 01:11:48 -0700
Subject: [PATCH 1/3] add vectordb

---
 .../ingest/search-indexer-vectordb.py | 191 ++++++++++++++++++
 1 file changed, 191 insertions(+)
 create mode 100644 scenarios/openai_on_custom_dataset/ingest/search-indexer-vectordb.py

diff --git a/scenarios/openai_on_custom_dataset/ingest/search-indexer-vectordb.py b/scenarios/openai_on_custom_dataset/ingest/search-indexer-vectordb.py
new file mode 100644
index 000000000..0b2329448
--- /dev/null
+++ b/scenarios/openai_on_custom_dataset/ingest/search-indexer-vectordb.py
@@ -0,0 +1,191 @@
+import os
+import openai
+from azure.search.documents.indexes import SearchIndexClient
+from azure.search.documents.models import Vector  # pip install ./build/azure_search_documents-11.4.0b4-py3-none-any.whl
+from azure.search.documents.indexes.models import (
+    SearchIndex,
+    SearchField,
+    SearchFieldDataType,
+    SimpleField,
+    SearchableField,
+    SemanticConfiguration,
+    PrioritizedFields,
+    SemanticField,
+    SemanticSettings,
+    VectorSearch,
+    HnswVectorSearchAlgorithmConfiguration,
+)
+from azure.core.credentials import AzureKeyCredential
+from azure.ai.formrecognizer import DocumentAnalysisClient
+import requests
+from dotenv import load_dotenv
+from pathlib import Path  # Python 3.6+ only
+
+
+env_path = Path('.') / 'secrets.env'
+load_dotenv(dotenv_path=env_path)
+
+
+SEARCH_ENDPOINT = os.environ["AZSEARCH_EP"]
+SEARCH_API_KEY = os.environ["AZSEARCH_KEY"]
+SEARCH_INDEX = os.environ["INDEX_NAME"]
+
+# Vector content requires a vector-aware preview REST API version;
+# 2021-04-30-Preview predates vector search.
+api_version = '?api-version=2023-07-01-Preview'
+headers = {'Content-Type': 'application/json',
+           'api-key': SEARCH_API_KEY}
+
+endpoint = os.environ["AFR_ENDPOINT"]
+key = os.environ["AFR_API_KEY"]
+credential = AzureKeyCredential(SEARCH_API_KEY)
+openai.api_type = "azure"
+openai.api_key = os.getenv("OPENAI_API_KEY")
+openai.api_base = os.getenv("OPENAI_API_BASE")
+openai.api_version = os.getenv("OPENAI_API_VERSION")
+
+formUrl = os.environ["FILE_URL"]
+localFolderPath = os.environ["LOCAL_FOLDER_PATH"]
+
+if formUrl == "" and localFolderPath == "":
+    print("Please provide a valid FILE_URL or LOCAL_FOLDER_PATH in the secrets.env file.")
+    exit()
+
+
+document_analysis_client = DocumentAnalysisClient(
+    endpoint=endpoint, credential=AzureKeyCredential(key)
+)
+
+
+def create_vector_index(index_name=SEARCH_INDEX):
+    index_client = SearchIndexClient(
+        endpoint=SEARCH_ENDPOINT, credential=credential)
+
+    try:
+        index_client.get_index(index_name)
+        return  # index already exists
+    except Exception as e:
+        if getattr(e, "status_code", None) == 404:
+            pass  # index not found; fall through and create it
+        else:
+            raise e
+
+    fields = [
+        SimpleField(name="id", type=SearchFieldDataType.String, key=True),
+        SearchableField(name="text", type=SearchFieldDataType.String,
+                        searchable=True, retrievable=True),
+        SearchableField(name="summary", type=SearchFieldDataType.String,
+                        searchable=True, retrievable=True),
+        SearchableField(name="fileName", type=SearchFieldDataType.String,
+                        searchable=False, retrievable=True),
+        SearchableField(name="pageNumber", type=SearchFieldDataType.String,
+                        filterable=False, searchable=False, retrievable=True),
+        SearchField(name="textVector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
+                    searchable=True, vector_search_dimensions=1536, vector_search_configuration="vector-config"),
+        SearchField(name="summaryVector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
+                    searchable=True, vector_search_dimensions=1536, vector_search_configuration="vector-config"),
+    ]
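+    # Both vector fields reference the "vector-config" HNSW algorithm configuration
+    # defined below; 1536 matches the output dimensionality of text-embedding-ada-002.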
+
+    vector_search = VectorSearch(
+        algorithm_configurations=[
+            HnswVectorSearchAlgorithmConfiguration(
+                name="vector-config",
+                kind="hnsw",
+                hnsw_parameters={
+                    "m": 4,
+                    "efConstruction": 400,
+                    "efSearch": 500,
+                    "metric": "cosine"
+                }
+            )
+        ]
+    )
+
+    semantic_config = SemanticConfiguration(
+        name="my-semantic-config",
+        prioritized_fields=PrioritizedFields(
+            title_field=SemanticField(field_name="text"),
+            prioritized_keywords_fields=[SemanticField(field_name="summary")]
+        )
+    )
+    # Create the semantic settings with the configuration
+    semantic_settings = SemanticSettings(configurations=[semantic_config])
+
+    # Create the search index with the semantic settings
+    index = SearchIndex(name=index_name, fields=fields,
+                        vector_search=vector_search, semantic_settings=semantic_settings)
+    result = index_client.create_index(index)
+    return f"{result.name} created"
+
+
+def delete_search_index(index_name=SEARCH_INDEX):
+    try:
+        url = SEARCH_ENDPOINT + "indexes/" + index_name + api_version
+        response = requests.delete(url, headers=headers)
+        response.raise_for_status()  # surface REST errors instead of failing silently
+        return "Index deleted"
+    except Exception as e:
+        return e
+
+
+def add_document_to_index(page_idx, documents, index_name=SEARCH_INDEX):
+    try:
+        url = SEARCH_ENDPOINT + "indexes/" + index_name + "/docs/index" + api_version
+        response = requests.post(url, headers=headers, json=documents)
+        response.raise_for_status()  # surface REST errors instead of failing silently
+        print(f"page_idx is {page_idx} - {len(documents['value'])} documents added")
+    except Exception as e:
+        print(e)
+
+
+def process_afr_result(result, filename):
+    print(f"Processing {filename} with {len(result.pages)} pages into Azure Search....this might take a few minutes depending on number of pages...")
+    for page_idx in range(len(result.pages)):
+        docs = []
+        content_chunk = ""
+        for line_idx, line in enumerate(result.pages[page_idx].lines):
+            # Accumulate page text into ~20-line chunks before embedding and indexing.
+            content_chunk += line.content + "\n"
+            if line_idx != 0 and line_idx % 20 == 0:
+                search_doc = {
+                    "id": f"page-number-{page_idx + 1}-line-number-{line_idx}",
+                    "text": content_chunk,
+                    "textVector": generate_embeddings(content_chunk),
+                    "fileName": filename,
+                    "pageNumber": str(page_idx + 1)
+                }
+                docs.append(search_doc)
+                content_chunk = ""
+        # Index whatever remains after the last full chunk on this page.
+        search_doc = {
+            "id": f"page-number-{page_idx + 1}-line-number-{line_idx}",
+            "text": content_chunk,
+            "textVector": generate_embeddings(content_chunk),
+            "fileName": filename,
+            "pageNumber": str(page_idx + 1)
+        }
+        docs.append(search_doc)
+        add_document_to_index(page_idx, {"value": docs})
+
+
+# Function to generate embeddings for title and content fields, also used for query embeddings
+def generate_embeddings(text):
+    response = openai.Embedding.create(
+        input=text, engine="text-embedding-ada-002")
+    embeddings = response['data'][0]['embedding']
+    return embeddings
+
+
+def main():
+    create_vector_index()
+    if formUrl != "":
+        print(f"Analyzing form from URL {formUrl}...")
+        poller = document_analysis_client.begin_analyze_document_from_url("prebuilt-layout", formUrl)
+        result = poller.result()
+        print("Processing result...this might take a few minutes...")
+        process_afr_result(result, "aml-api-v2.pdf")
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
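Note on patch 1: the Vector class imported at the top of search-indexer-vectordb.py is never used by the indexer itself; it belongs to the query side. Below is a minimal retrieval sketch against the index the patch creates, assuming the 11.4.0b4 beta wheel named in the import comment (the vector-query parameter changed across the 11.4.0 betas, from vector= to vectors=[...], so verify against the installed build):

    # Hypothetical query-side sketch; reuses SEARCH_ENDPOINT, SEARCH_INDEX,
    # credential, and generate_embeddings() from the indexer script.
    from azure.search.documents import SearchClient
    from azure.search.documents.models import Vector

    search_client = SearchClient(endpoint=SEARCH_ENDPOINT,
                                 index_name=SEARCH_INDEX, credential=credential)
    results = search_client.search(
        search_text=None,  # pure vector query; supply text as well for hybrid search
        vector=Vector(value=generate_embeddings("How do I create a compute cluster?"),
                      k=3, fields="textVector"),
        select=["text", "fileName", "pageNumber"],
    )
    for doc in results:
        print(doc["fileName"], doc["pageNumber"])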
From 548580f4c6ff0300a8a1fb1d9354302604870cb8 Mon Sep 17 00:00:00 2001
From: Anil Dwarakanath
Date: Mon, 9 Oct 2023 23:51:46 -0700
Subject: [PATCH 2/3] update

---
 .../ingest/requirements.txt | 6 ++++++
 .../ingest/search-indexer-vectordb.py | 15 ++++++++++++++-
 .../ingest/secrets.rename | 6 ++++++
 .../orchestrator/requirements.txt | 2 +-
 4 files changed, 27 insertions(+), 2 deletions(-)
 create mode 100644 scenarios/openai_on_custom_dataset/ingest/requirements.txt

diff --git a/scenarios/openai_on_custom_dataset/ingest/requirements.txt b/scenarios/openai_on_custom_dataset/ingest/requirements.txt
new file mode 100644
index 000000000..bad7c4572
--- /dev/null
+++ b/scenarios/openai_on_custom_dataset/ingest/requirements.txt
@@ -0,0 +1,6 @@
+azure-core
+openai
+pandas
+#azure-search-documents==11.4.0b9
+azure-ai-formrecognizer
+python-dotenv
\ No newline at end of file
diff --git a/scenarios/openai_on_custom_dataset/ingest/search-indexer-vectordb.py b/scenarios/openai_on_custom_dataset/ingest/search-indexer-vectordb.py
index 0b2329448..8376abacf 100644
--- a/scenarios/openai_on_custom_dataset/ingest/search-indexer-vectordb.py
+++ b/scenarios/openai_on_custom_dataset/ingest/search-indexer-vectordb.py
@@ -171,7 +171,7 @@ def process_afr_result(result, filename):
 # Function to generate embeddings for title and content fields, also used for query embeddings
 def generate_embeddings(text):
     response = openai.Embedding.create(
-        input=text, engine="text-embedding-ada-002")
+        input=text, engine=os.getenv("OPENAI_MODEL_NAME"))
     embeddings = response['data'][0]['embedding']
     return embeddings
 
@@ -186,6 +186,19 @@ def main():
         print("Processing result...this might take a few minutes...")
         process_afr_result(result, "aml-api-v2.pdf")
 
+    if localFolderPath != "":
+        for filename in os.listdir(localFolderPath):
+            file = os.path.join(localFolderPath, filename)
+            with open(file, "rb") as f:
+                print(f"Analyzing file {filename} from directory {localFolderPath}...")
+                poller = document_analysis_client.begin_analyze_document(
+                    "prebuilt-layout", document=f
+                )
+                result = poller.result()
+                print(f"{filename}: processing result...this might take a few minutes...")
+                process_afr_result(result, filename)
+    print("done")
+
 
 if __name__ == "__main__":
     main()
\ No newline at end of file
diff --git a/scenarios/openai_on_custom_dataset/ingest/secrets.rename b/scenarios/openai_on_custom_dataset/ingest/secrets.rename
index 80709087b..71601f8b7 100644
--- a/scenarios/openai_on_custom_dataset/ingest/secrets.rename
+++ b/scenarios/openai_on_custom_dataset/ingest/secrets.rename
@@ -3,5 +3,11 @@ AZSEARCH_KEY=""
 AFR_ENDPOINT="https://westus2.api.cognitive.microsoft.com/"
 AFR_API_KEY=""
 INDEX_NAME="azure-ml-docs"
+OPENAI_API_KEY=b22952ca03934af9953bf842b813682b
+OPENAI_API_BASE="https://anildwaextopenai2.openai.azure.com/"
+OPENAI_API_VERSION=2023-03-15-preview
+OPENAI_API_TYPE="azure"
+OPENAI_MODEL_NAME="text-embedding-ada-002"
+
 FILE_URL="https://github.com/microsoft/OpenAIWorkshop/raw/main/scenarios/data/azure-machine-learning-2-500.pdf"
 LOCAL_FOLDER_PATH="" #for e.g C:\\source\\repos\\tibco\\testfile for Windows and /mnt/source/folder for Linux
\ No newline at end of file
diff --git a/scenarios/openai_on_custom_dataset/orchestrator/requirements.txt b/scenarios/openai_on_custom_dataset/orchestrator/requirements.txt
index 7626ee2f2..2dbaae0c5 100644
--- a/scenarios/openai_on_custom_dataset/orchestrator/requirements.txt
+++ b/scenarios/openai_on_custom_dataset/orchestrator/requirements.txt
@@ -8,6 +8,6 @@ openai==0.27.2
 azure-kusto-data==4.0.2
 pandas
 pyodbc
-azure-search-documents==11.4.0b3
+azure-search-documents==11.4.0b9
 azure-ai-formrecognizer
 python-dotenv
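Note on patch 2: generate_embeddings now reads the deployment name from OPENAI_MODEL_NAME, so a missing or misspelled entry in secrets.env only surfaces once the first embeddings call fails mid-ingestion. A small fail-fast guard could sit right after load_dotenv(); this is a sketch, with the variable name taken from secrets.rename:

    # Hypothetical startup check; OPENAI_MODEL_NAME is the Azure OpenAI
    # deployment name of the embedding model (e.g. text-embedding-ada-002).
    if not os.getenv("OPENAI_MODEL_NAME"):
        raise SystemExit("Set OPENAI_MODEL_NAME in secrets.env before running the indexer.")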
From 75fa284c70ed3c50b2b91d4f948e33e809a89c13 Mon Sep 17 00:00:00 2001
From: Anil Dwarakanath
Date: Mon, 9 Oct 2023 23:53:11 -0700
Subject: [PATCH 3/3] update

---
 scenarios/openai_on_custom_dataset/ingest/secrets.rename | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scenarios/openai_on_custom_dataset/ingest/secrets.rename b/scenarios/openai_on_custom_dataset/ingest/secrets.rename
index 71601f8b7..2b995cb2a 100644
--- a/scenarios/openai_on_custom_dataset/ingest/secrets.rename
+++ b/scenarios/openai_on_custom_dataset/ingest/secrets.rename
@@ -3,7 +3,7 @@ AZSEARCH_KEY=""
 AFR_ENDPOINT="https://westus2.api.cognitive.microsoft.com/"
 AFR_API_KEY=""
 INDEX_NAME="azure-ml-docs"
-OPENAI_API_KEY=b22952ca03934af9953bf842b813682b
+OPENAI_API_KEY=
 OPENAI_API_BASE="https://anildwaextopenai2.openai.azure.com/"
 OPENAI_API_VERSION=2023-03-15-preview
 OPENAI_API_TYPE="azure"
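Note on the series as a whole: ingesting a several-hundred-page PDF issues one embeddings call per ~20-line chunk, so Azure OpenAI rate limiting (HTTP 429) is the most likely failure mode at scale. A minimal backoff wrapper, assuming the openai 0.27-era SDK pinned in the orchestrator requirements (where the exception is openai.error.RateLimitError):

    import time
    import openai

    # Sketch: retry rate-limited embedding calls with linearly increasing backoff.
    def generate_embeddings_with_retry(text, retries=5, base_delay=2.0):
        for attempt in range(retries):
            try:
                return generate_embeddings(text)
            except openai.error.RateLimitError:
                if attempt == retries - 1:
                    raise  # out of retries; propagate the last 429
                time.sleep(base_delay * (attempt + 1))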