From cc4b9355388683173f976960d843c5dc357800f4 Mon Sep 17 00:00:00 2001 From: cjackson202 Date: Tue, 5 Nov 2024 07:52:54 -0500 Subject: [PATCH 1/7] Updated streamlit embedding demo --- notebooks/GenAI/.gitignore | 4 + notebooks/GenAI/embedding_demos/Demo_Suite.py | 61 +++++++++ .../GenAI/embedding_demos/acs_embeddings.py | 79 ----------- .../GenAI/embedding_demos/aoai_embeddings.py | 102 -------------- .../embedding_demos/pages/AI_Search_Query.py | 106 +++++++++++++++ .../embedding_demos/pages/AOAI_Embeddings.py | 126 ++++++++++++++++++ notebooks/GenAI/embedding_demos/style.css | 116 ++++++++++++++++ notebooks/GenAI/embedding_demos/styling.py | 7 + notebooks/GenAI/requirements.txt | 9 +- 9 files changed, 422 insertions(+), 188 deletions(-) create mode 100644 notebooks/GenAI/.gitignore create mode 100644 notebooks/GenAI/embedding_demos/Demo_Suite.py delete mode 100644 notebooks/GenAI/embedding_demos/acs_embeddings.py delete mode 100644 notebooks/GenAI/embedding_demos/aoai_embeddings.py create mode 100644 notebooks/GenAI/embedding_demos/pages/AI_Search_Query.py create mode 100644 notebooks/GenAI/embedding_demos/pages/AOAI_Embeddings.py create mode 100644 notebooks/GenAI/embedding_demos/style.css create mode 100644 notebooks/GenAI/embedding_demos/styling.py diff --git a/notebooks/GenAI/.gitignore b/notebooks/GenAI/.gitignore new file mode 100644 index 0000000..ad23308 --- /dev/null +++ b/notebooks/GenAI/.gitignore @@ -0,0 +1,4 @@ +__pycache__ +.venv +.env +microsoft-earnings_embeddings.csv \ No newline at end of file diff --git a/notebooks/GenAI/embedding_demos/Demo_Suite.py b/notebooks/GenAI/embedding_demos/Demo_Suite.py new file mode 100644 index 0000000..327eb8d --- /dev/null +++ b/notebooks/GenAI/embedding_demos/Demo_Suite.py @@ -0,0 +1,61 @@ +import streamlit as st +from styling import global_page_style + +def main(): + # Set page configuration + # st.set_page_config(page_title="Azure OpenAI RAG Demo Suite", layout="wide") + + # Title and subtitle + # Create columns for logo and title + + st.markdown( + f'
', + unsafe_allow_html=True + ) + st.title("Azure OpenAI RAG Demo Suite") + st.markdown("### Demo Overviews") + st.write(""" + Welcome to the Azure OpenAI RAG Demo Suite. On the left side-panel, you will find various demonstrations that showcase the capabilities of Azure OpenAI with a Streamlit frontend. Each demonstration is described in detail below, highlighting their unique features and functionalities. + """) + + # Horizontal divider + st.markdown("---") + + # Chat with Your Data section + st.markdown("### Chat with Your Data using Azure OpenAI API and AI Search Index (AI Search Query)") + st.write(""" + This demo allows users to interact with data stored in their Azure AI Search Index using a combination of semantic and vector search methods. + """) + st.write(""" + - **Semantic Search**: Understands the meaning and context of your queries to deliver more relevant results. + - **Vector Search**: Utilizes numerical representations of text to find similar content based on cosine similarity. + """) + # Ensure the user has created the Azure AI search index already + st.write(""" + **Note**: Users must have created the Azure AI search index already as shown here: [Upload your own data and query over it](https://github.com/STRIDES/NIHCloudLabAzure/blob/main/notebooks/GenAI/Azure_Open_AI_README.md) + """) + + # Horizontal divider + st.markdown("---") + + # Generate & Search with Azure OpenAI Embeddings section + st.markdown("### Generate & Search with Azure OpenAI Embeddings (AOAI Embeddings)") + st.write(""" + This demo enables users to generate embeddings from a pre-chunked CSV file and perform searches over the content using vector search. + """) + st.write(""" + - **Vectorize**: Creates embeddings based on the "microsoft-earnings.csv" file provided in this directory. The embeddings are generated from the "text" column. The CSV file is pre-chunked, meaning the text has already been split and prepared for embedding generation. A new CSV file will be created to store all generated embeddings, forming your vector store. + - **Retrieve**: Generates embeddings based on user queries. The query embedding is then used to search for the most similar document within the vector store using cosine similarity. + """) + st.write(""" + Example questions a user can ask about the microsoft-earnings.csv: + - What was said about the budget? + - How many people utilize GitHub to build software? + - How many points did Microsoft Cloud gross margin percentage increase by? + - What are the expectations for the Q2 cash flow? 
+ """) + + +if __name__ == '__main__': + global_page_style() + main() \ No newline at end of file diff --git a/notebooks/GenAI/embedding_demos/acs_embeddings.py b/notebooks/GenAI/embedding_demos/acs_embeddings.py deleted file mode 100644 index 8a4a68a..0000000 --- a/notebooks/GenAI/embedding_demos/acs_embeddings.py +++ /dev/null @@ -1,79 +0,0 @@ -from langchain.retrievers import AzureCognitiveSearchRetriever -from langchain.embeddings import OpenAIEmbeddings -from langchain.vectorstores import FAISS -from langchain.chains import RetrievalQA -from langchain.chat_models import AzureChatOpenAI -from PIL import Image -import os -import streamlit as st -from dotenv import load_dotenv - -# load in .env variables -load_dotenv() - -def config_keys(): - # set api keys for AOAI and Azure Search - os.environ['OPENAI_API_VERSION'] = os.getenv('AZURE_OPENAI_VERSION') - os.environ['OPENAI_API_KEY'] = os.getenv('AZURE_OPENAI_KEY') - os.environ['OPENAI_API_BASE'] = os.getenv('AZURE_OPENAI_ENDPOINT') - os.environ['OPENAI_EMBEDDING_DEPLOYMENT_NAME'] = os.getenv('AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME') - os.environ['AZURE_COGNITIVE_SEARCH_SERVICE_NAME'] = os.getenv('AZURE_COGNITIVE_SEARCH_SERVICE_NAME') - os.environ['AZURE_COGNITIVE_SEARCH_API_KEY'] = os.getenv('AZURE_COGNITIVE_SEARCH_API_KEY') - os.environ['AZURE_COGNITIVE_SEARCH_INDEX_NAME'] = os.getenv('AZURE_COGNITIVE_SEARCH_INDEX_NAME') - - -def main(): - # Streamlit config - st.title("Demo - Azure OpenAI & Cognitive Search Embeddings") - image = Image.open('image_logo2.png') - st.image(image, caption = '') - st.write('This program is designed to chat over your files in Azure Cognitive Search. \ - Be specific and clear with the questions you ask. \ - Welcome to CHATGPT over your own data !!') - if 'generated' not in st.session_state: - st.session_state.generated = [] - if 'past' not in st.session_state: - st.session_state.past = [] - - # create your LLM and embeddings. Will be conifuring 'azure' in the openai_api_type parameter. 
- llm = AzureChatOpenAI( - deployment_name = "gpt-35-turbo", - openai_api_type = "azure", - model = "gpt-35-turbo", - temperature=0.7, - max_tokens=200 - ) - - embeddings = OpenAIEmbeddings(chunk_size=1, openai_api_type="azure") - - # ask for the user query - query = st.text_input("Enter a search query: ", key='search_term', placeholder="") - - if query: - st.session_state.past.append(query) - - # set up Azure Cognitive Search to retrieve documents - # top_k = 1: we only want first related doc - retriever = AzureCognitiveSearchRetriever(content_key="content", top_k=1) - - # get the relevant document from Azure Cognitive Search that are only relevant to the query being asked - docs = retriever.get_relevant_documents(query) - - # create embedding from the document retrieved and place in a FAISS vector database - db = FAISS.from_documents(documents=docs, embedding=embeddings) - - # set up the chain that will feed the retrieved document to the LLM - chain = RetrievalQA.from_chain_type(llm=llm, retriever = db.as_retriever(), chain_type="stuff") - - # run the chain on the query asked - response = chain.run(query) - st.session_state.generated.append(response) - - with st.expander('Vector Search'): - for i in range(len(st.session_state.generated)-1, -1, -1): - st.info(st.session_state.past[i]) - st.success(st.session_state.generated[i]) - -if __name__ == '__main__': - config_keys() - main() diff --git a/notebooks/GenAI/embedding_demos/aoai_embeddings.py b/notebooks/GenAI/embedding_demos/aoai_embeddings.py deleted file mode 100644 index eb694c7..0000000 --- a/notebooks/GenAI/embedding_demos/aoai_embeddings.py +++ /dev/null @@ -1,102 +0,0 @@ -import openai -from openai.embeddings_utils import get_embedding, cosine_similarity # must pip install openai[embeddings] -import pandas as pd -import numpy as np -import os -import streamlit as st -import time -from PIL import Image -from dotenv import load_dotenv - -# load in .env variables -load_dotenv() - -# configure azure openai keys -openai.api_type = 'azure' -openai.api_version = os.environ['AZURE_OPENAI_VERSION'] -openai.api_base = os.environ['AZURE_OPENAI_ENDPOINT'] -openai.api_key = os.environ['AZURE_OPENAI_KEY'] - -def embedding_create(): - # acquire the filename to be embed - st.subheader("Vector Creation") - st.write('This program is designed to embed your pre-chunked .csv file. \ - By accomplishing this task, you will be able to chat over all cotent in your .csv via vector searching. \ - Just enter the file and the program will take care of the rest (specify file path if not in this directory). \ - Welcome to CHATGPT over your own data !!') - filename = st.text_input("Enter a file: ", key='filename', value="") - - # start the embeddings process if filename provided - if filename: - - # read the data file to be embed - df = pd.read_csv('C:\\src\\AzureOpenAI_Gov_Workshop\\' + filename) - st.write(df) - - # calculate word embeddings - df['embedding'] = df['text'].apply(lambda x:get_embedding(x, engine='text-embedding-ada-002')) - df.to_csv('C:\\src\\AzureOpenAI_Gov_Workshop\\microsoft-earnings_embeddings.csv') - time.sleep(3) - st.subheader("Post Embedding") - st.success('Embeddings Created Sucessfully!!') - st.write(df) - - -def embeddings_search(): - - # Streamlit configuration - st.subheader("Vector Search") - st.write('This program is designed to chat over your vector stored (embedding) .csv file. \ - This Chat Bot works alongside the "Embeddings Bot" Chat Bot. \ - Be specific with the information you want to obtain over your data. 
\ - Welcome to CHATGPT over your own data !!') - if 'answer' not in st.session_state: - st.session_state.answer = [] - if 'score' not in st.session_state: - st.session_state.score = [] - if 'past' not in st.session_state: - st.session_state.past = [] - - # read in the embeddings .csv - # convert elements in 'embedding' column back to numpy array - df = pd.read_csv('C:\\src\\AzureOpenAI_Gov_Workshop\\microsoft-earnings_embeddings.csv') - df['embedding'] = df['embedding'].apply(eval).apply(np.array) - - # caluculate user query embedding - search_term = st.text_input("Enter a search query: ", key='search_term', placeholder="") - if search_term: - st.session_state.past.append(search_term) - search_term_vector = get_embedding(search_term, engine='text-embedding-ada-002') - - # find similiarity between query and vectors - df['similarities'] = df['embedding'].apply(lambda x:cosine_similarity(x, search_term_vector)) - df1 = df.sort_values("similarities", ascending=False).head(5) - - # output the response - answer = df1['text'].loc[df1.index[0]] - score = df1['similarities'].loc[df1.index[0]] - st.session_state.answer.append(answer) - st.session_state.score.append(score) - with st.expander('Vector Search'): - for i in range(len(st.session_state.answer)-1, -1, -1): - st.info(st.session_state.past[i]) - st.write(st.session_state.answer[i]) - st.write('Score: ', st.session_state.score[i]) - - -def main(): - # Streamlit config - st.title("Demo-Azure OpenAI Embeddings") - image = Image.open('image_logo2.png') - st.image(image, caption = '') - st.sidebar.title('Chat Bot Type Selection') - chat_style = st.sidebar.selectbox( - 'Choose between Embeddings Bot or Search Bot', ['Embeddings Bot','Search Bot'] - ) - if chat_style == 'Embeddings Bot': - embedding_create() - elif chat_style == 'Search Bot': - embeddings_search() - -if __name__ == '__main__': - main() diff --git a/notebooks/GenAI/embedding_demos/pages/AI_Search_Query.py b/notebooks/GenAI/embedding_demos/pages/AI_Search_Query.py new file mode 100644 index 0000000..c1c18cd --- /dev/null +++ b/notebooks/GenAI/embedding_demos/pages/AI_Search_Query.py @@ -0,0 +1,106 @@ +from openai import AzureOpenAI +import os +import streamlit as st +from dotenv import load_dotenv +from styling import global_page_style + +# load in .env variables +load_dotenv() + +# Configure Azure OpenAI params, using an Azure OpenAI account with a deployment of an embedding model +azure_endpoint: str = os.getenv('AZURE_OPENAI_BASE') +azure_openai_api_key: str = os.getenv('AZURE_OPENAI_KEY') +azure_openai_api_version: str = os.getenv('AZURE_OPENAI_VERSION') +azure_ada_deployment: str = os.getenv('AZURE_EMBEDDINGS_DEPLOYMENT') +azure_gpt_deployment: str = os.getenv('AZURE_GPT_DEPLOYMENT') + +# Configure Azure AI Search params +search_endpoint: str = os.getenv('AZURE_SEARCH_ENDPOINT') +search_key: str = os.getenv('AZURE_SEARCH_ADMIN_KEY') + +def chat_on_your_data(query, search_index, messages): + messages.append({"role": "user", "content":query}) + with st.chat_message("user"): + st.markdown(query) + with st.spinner('Processing...'): + client = AzureOpenAI( + azure_endpoint=azure_endpoint, + api_key=azure_openai_api_key, + api_version=azure_openai_api_version, + ) + completion = client.chat.completions.create( + model=azure_gpt_deployment, + messages=[ + {"role": "system", "content": "You are an AI assistant that helps people find information. 
\ + Ensure the Markdown responses are correctly formatted before responding."}, + {"role": "user", "content": query} + ], + max_tokens=800, + temperature=0.7, + top_p=0.95, + frequency_penalty=0, + presence_penalty=0, + stop=None, + stream=False, + extra_body={ + "data_sources": [{ + "type": "azure_search", + "parameters": { + "endpoint": f"{search_endpoint}", + "index_name": search_index, + "semantic_configuration": "default", + "query_type": "vector_simple_hybrid", + "fields_mapping": {}, + "in_scope": True, + "role_information": "You are an AI assistant that helps people find information.", + "filter": None, + "strictness": 3, + "top_n_documents": 5, + "authentication": { + "type": "api_key", + "key": f"{search_key}" + }, + "embedding_dependency": { + "type": "deployment_name", + "deployment_name": azure_ada_deployment + } + } + }] + } + ) + print(completion) + response_data = completion.to_dict() + ai_response = response_data['choices'][0]['message']['content'] + messages.append({"role": "assistant", "content":ai_response}) + with st.chat_message("assistant"): + st.markdown(ai_response) + +def main(): + st.markdown( + f'
', + unsafe_allow_html=True + ) + st.title("Demo - Azure OpenAI & AI Search") + # image = Image.open('image_logo2.png') + # st.image(image, caption = '') + st.write('This demo showcases an innovative way for users to engage with data housed in their Azure AI Search Index by leveraging both \ + semantic and vector search techniques. Semantic search enhances the querying process by comprehending the meaning and context of \ + user queries, thereby providing more pertinent results. Vector search, on the other hand, employs numerical representations of \ + text to identify similar content using cosine similarity. ***For users to effectively utilize this demo, it is essential that they \ + have previously created their Azure AI Search Index, following the necessary steps to upload and query their data as outlined [here](https://github.com/STRIDES/NIHCloudLabAzure/blob/main/notebooks/GenAI/Azure_Open_AI_README.md).***') + if 'messages' not in st.session_state: + st.session_state.messages = [] + index_name = st.text_input(label="Azure AI Search index name:", value="") + st.write('-'*50) + if index_name: + query = st.chat_input('Input search query here...') + for message in st.session_state.messages: + with st.chat_message(message["role"]): + st.markdown(message['content']) + if query: + chat_on_your_data(query, index_name, st.session_state.messages) + + +if __name__ == '__main__': + global_page_style() + main() \ No newline at end of file diff --git a/notebooks/GenAI/embedding_demos/pages/AOAI_Embeddings.py b/notebooks/GenAI/embedding_demos/pages/AOAI_Embeddings.py new file mode 100644 index 0000000..77702de --- /dev/null +++ b/notebooks/GenAI/embedding_demos/pages/AOAI_Embeddings.py @@ -0,0 +1,126 @@ +from openai import AzureOpenAI +# from openai.embeddings_utils import get_embedding, cosine_similarity # must pip install openai[embeddings] +import pandas as pd +import numpy as np +import os +import streamlit as st +import time +from PIL import Image +from dotenv import load_dotenv +from styling import global_page_style + +# load in .env variables +load_dotenv() + +# configure azure openai keys +# openai.api_type = 'azure' +# openai.api_version = os.environ['AZURE_OPENAI_VERSION'] +# openai.api_base = os.environ['AZURE_OPENAI_ENDPOINT'] +# openai.api_key = os.environ['AZURE_OPENAI_KEY'] + +def get_embedding(text, engine): + client = AzureOpenAI( + api_key=os.getenv("Azure_OPENAI_KEY"), + api_version=os.getenv('AZURE_OPENAI_VERSION'), + azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT') + ) + + embeddings = client.embeddings.create(input = [text], model=engine).data[0].embedding + return embeddings + +def cosine_similarity(a, b): + return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)) + +def embedding_create(): + # acquire the filename to be embed + st.subheader("Vector Creation") + st.write('The process of vectorization involves creating embeddings from the [microsoft-earnings.csv](https://github.com/STRIDES/NIHCloudLabAzure/blob/main/notebooks/GenAI/microsoft-earnings.csv) \ + file located in the specified directory, utilizing the data in the "text" column. These embeddings are derived from pre-chunked text, \ + indicating that the text has already been divided and formatted for embedding generation. 
The resultant embeddings will be \
+        compiled into a new CSV file, which will serve as a vector store for future reference and utilization.')
+    filename = st.text_input("Enter a file: ", key='filename', value="microsoft-earnings.csv")
+
+    # start the embeddings process if a filename is provided
+    if filename:
+        file_path = os.path.join('..', filename)
+        # read the data file to be embedded
+        df = pd.read_csv(file_path)
+        df_placeholder = st.empty()
+        df_placeholder.dataframe(df, width=2000, height=350)
+        button_placeholder = st.empty()
+        if button_placeholder.button("Generate Embeddings"):
+            # calculate word embeddings
+            df['embedding'] = df['text'].apply(lambda x:get_embedding(x, engine=os.getenv("AZURE_EMBEDDINGS_DEPLOYMENT")))
+            df.to_csv('.\\microsoft-earnings_embeddings.csv')
+            time.sleep(3)
+            button_placeholder.success('Embeddings Created Successfully!!')
+            df_placeholder.dataframe(df)
+
+
+def embeddings_search():
+
+    # Streamlit configuration
+    st.subheader("Vector Search")
+    st.write('This process generates embeddings based on user queries, utilizing the compiled CSV that was created, to search for the most similar\
+        documents within the vector store by employing cosine similarity.')
+    if 'answer' not in st.session_state:
+        st.session_state.answer = []
+    if 'score' not in st.session_state:
+        st.session_state.score = []
+    if 'past' not in st.session_state:
+        st.session_state.past = []
+
+    # read in the embeddings .csv
+    # convert elements in 'embedding' column back to numpy array
+    df = pd.read_csv('.\\microsoft-earnings_embeddings.csv')
+    df['embedding'] = df['embedding'].apply(eval).apply(np.array)
+
+    # calculate the user query embedding
+    search_term = st.text_area("Enter a search query: ", key='search_term', placeholder="")
+    if search_term:
+        st.session_state.past.append(search_term)
+        search_term_vector = get_embedding(search_term, engine=os.getenv("AZURE_EMBEDDINGS_DEPLOYMENT"))
+
+        # find the similarity between the query and the stored vectors
+        df['similarities'] = df['embedding'].apply(lambda x:cosine_similarity(x, search_term_vector))
+        df1 = df.sort_values("similarities", ascending=False).head(5)
+
+        # output the response
+        answer = df1['text'].loc[df1.index[0]]
+        score = df1['similarities'].loc[df1.index[0]]
+        st.session_state.answer.append(answer)
+        st.session_state.score.append(score)
+        with st.expander('Vector Search'):
+            for i in range(len(st.session_state.answer)-1, -1, -1):
+                st.info(st.session_state.past[i])
+                st.write(st.session_state.answer[i])
+                st.write('Score: ', st.session_state.score[i])
+        with st.expander('Top 5 Results'):
+            df1 = df1.reset_index(drop=True)
+            df1.index = df1.index + 1
+            df1 = df1.rename(columns={'Unnamed: 0': 'Row Number'})
+            print(df1)
+            st.dataframe(df1)
+
+
+def main():
+    st.markdown(
+        f'<div style="display: flex; align-items: center;">
',
-tiktoken -faiss-cpu +azure-search-documents +azure-identity From 11efb56494f89321706ece8f213a977b1cef5752 Mon Sep 17 00:00:00 2001 From: cjackson202 Date: Wed, 6 Nov 2024 13:56:50 -0500 Subject: [PATCH 2/7] added ai search doc ingest, updated workshop example scripts --- .gitignore | 1 + .../embedding_demos/pages/AI_Search_Query.py | 324 ++++++++++++++---- notebooks/GenAI/embedding_demos/readme.md | 58 ++++ .../example_scripts/workshop_embedding.py | 35 +- .../GenAI/example_scripts/workshop_search.py | 23 +- notebooks/GenAI/requirements.txt | 1 + 6 files changed, 348 insertions(+), 94 deletions(-) create mode 100644 .gitignore create mode 100644 notebooks/GenAI/embedding_demos/readme.md diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..d38da71 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +notebooks/GenAI/embedding_demos/p1.py diff --git a/notebooks/GenAI/embedding_demos/pages/AI_Search_Query.py b/notebooks/GenAI/embedding_demos/pages/AI_Search_Query.py index c1c18cd..eae28c8 100644 --- a/notebooks/GenAI/embedding_demos/pages/AI_Search_Query.py +++ b/notebooks/GenAI/embedding_demos/pages/AI_Search_Query.py @@ -1,27 +1,50 @@ +import os +import io +import pdfplumber +import streamlit as st +from azure.storage.blob import BlobServiceClient +from azure.core.credentials import AzureKeyCredential +from azure.identity import DefaultAzureCredential +from azure.search.documents import SearchClient +from azure.search.documents.indexes import SearchIndexClient +from azure.search.documents.indexes.models import ( + SimpleField, + SearchFieldDataType, + VectorSearch, + SearchIndex, + SearchableField, + SearchField, + VectorSearchProfile, + HnswAlgorithmConfiguration +) +from dotenv import load_dotenv from openai import AzureOpenAI -import os -import streamlit as st -from dotenv import load_dotenv -from styling import global_page_style - -# load in .env variables -load_dotenv() - -# Configure Azure OpenAI params, using an Azure OpenAI account with a deployment of an embedding model -azure_endpoint: str = os.getenv('AZURE_OPENAI_BASE') -azure_openai_api_key: str = os.getenv('AZURE_OPENAI_KEY') -azure_openai_api_version: str = os.getenv('AZURE_OPENAI_VERSION') -azure_ada_deployment: str = os.getenv('AZURE_EMBEDDINGS_DEPLOYMENT') -azure_gpt_deployment: str = os.getenv('AZURE_GPT_DEPLOYMENT') - -# Configure Azure AI Search params -search_endpoint: str = os.getenv('AZURE_SEARCH_ENDPOINT') -search_key: str = os.getenv('AZURE_SEARCH_ADMIN_KEY') - -def chat_on_your_data(query, search_index, messages): - messages.append({"role": "user", "content":query}) +import tiktoken +from styling import global_page_style + +# Load environment variables +load_dotenv() + +# Configure Azure OpenAI parameters +azure_endpoint = os.getenv('AZURE_OPENAI_BASE') +azure_openai_api_key = os.getenv('AZURE_OPENAI_KEY') +azure_openai_api_version = os.getenv('AZURE_OPENAI_VERSION') +azure_ada_deployment = os.getenv('AZURE_EMBEDDINGS_DEPLOYMENT') +azure_gpt_deployment = os.getenv('AZURE_GPT_DEPLOYMENT') + +# Configure Azure AI Search parameters +search_endpoint = os.getenv('AZURE_SEARCH_ENDPOINT') +search_key = os.getenv('AZURE_SEARCH_ADMIN_KEY') + +def chat_on_your_data(query, search_index, messages): + """ + Perform retrieval queries over documents from the Azure AI Search Index. 
+ """ + messages.append({"role": "user", "content": query}) + with st.chat_message("user"): - st.markdown(query) + st.markdown(query) + with st.spinner('Processing...'): client = AzureOpenAI( azure_endpoint=azure_endpoint, @@ -31,8 +54,7 @@ def chat_on_your_data(query, search_index, messages): completion = client.chat.completions.create( model=azure_gpt_deployment, messages=[ - {"role": "system", "content": "You are an AI assistant that helps people find information. \ - Ensure the Markdown responses are correctly formatted before responding."}, + {"role": "system", "content": "You are an AI assistant that helps people find information. Ensure the Markdown responses are correctly formatted before responding."}, {"role": "user", "content": query} ], max_tokens=800, @@ -46,7 +68,7 @@ def chat_on_your_data(query, search_index, messages): "data_sources": [{ "type": "azure_search", "parameters": { - "endpoint": f"{search_endpoint}", + "endpoint": search_endpoint, "index_name": search_index, "semantic_configuration": "default", "query_type": "vector_simple_hybrid", @@ -55,52 +77,238 @@ def chat_on_your_data(query, search_index, messages): "role_information": "You are an AI assistant that helps people find information.", "filter": None, "strictness": 3, - "top_n_documents": 5, + "top_n_documents": 1, "authentication": { "type": "api_key", - "key": f"{search_key}" + "key": search_key }, "embedding_dependency": { "type": "deployment_name", - "deployment_name": azure_ada_deployment + "deployment_name": azure_ada_deployment } } }] } ) - print(completion) + response_data = completion.to_dict() - ai_response = response_data['choices'][0]['message']['content'] - messages.append({"role": "assistant", "content":ai_response}) + ai_response = response_data['choices'][0]['message']['content'] + messages.append({"role": "assistant", "content": ai_response}) + with st.chat_message("assistant"): - st.markdown(ai_response) - -def main(): + st.markdown(ai_response) + +def setup_azure_openai(log_text): + """ + Sets up Azure OpenAI. + """ + log_text.write("Setting up Azure OpenAI...") + azure_openai = AzureOpenAI( + api_key=os.getenv("Azure_OPENAI_KEY"), + api_version=os.getenv('AZURE_OPENAI_VERSION'), + azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT') + ) + log_text.write("Azure OpenAI setup complete.") + return azure_openai + +def connect_to_blob_storage(log_text, container): + """ + Connects to Azure Blob Storage. + """ + log_text.write("Connecting to Blob Storage...") + blob_service_client = BlobServiceClient.from_connection_string(os.getenv("BLOB_CONNECTION_STRING")) + container_client = blob_service_client.get_container_client(os.getenv("BLOB_CONTAINER_NAME")) + log_text.write("Connected to Blob Storage.") + return container_client + +def split_text_with_metadata(text, metadata, max_length=800, overlap=75, encoding_name='cl100k_base'): + """ + Splits the text into chunks with metadata. + """ + tokenizer = tiktoken.get_encoding(encoding_name) + tokens = tokenizer.encode(text) + chunks = [] + start = 0 + end = max_length + + while start < len(tokens): + chunk = tokens[start:end] + chunk_text = tokenizer.decode(chunk) + chunk_metadata = metadata.copy() + chunk_metadata.update({ + 'start_token': start, + 'end_token': end, + 'chunk_length': len(chunk), + 'chunk_text_preview': chunk_text[:50] + '...' 
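+    # Flow: append the user turn to the running history, then request a chat
+    # completion with the extra_body "data_sources" extension so Azure OpenAI
+    # first retrieves supporting chunks from the Azure AI Search index (hybrid
+    # vector + keyword query) and grounds its answer in them.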
+ }) + chunks.append({ + 'text': chunk_text, + 'metadata': chunk_metadata + }) + start = end - overlap + end = start + max_length + + return chunks + +def load_blob_content(blob_client): + """ + Loads and returns the content of the PDF blob. + """ + blob_name = blob_client.blob_name + if not blob_name.lower().endswith('.pdf'): + raise ValueError(f"Blob {blob_name} is not a PDF file.") + + blob_data = blob_client.download_blob().readall() + pdf_stream = io.BytesIO(blob_data) + document_text = "" + + with pdfplumber.open(pdf_stream) as pdf: + for page in pdf.pages: + document_text += page.extract_text() + "\n" + + return document_text + +def vectorize(log_text): + """ + Main function that orchestrates the vector workflow. + """ + azure_openai = setup_azure_openai(log_text) + container_client = connect_to_blob_storage(log_text) + + # Read and chunk documents with metadata + log_text.write("Listing blobs in container...") + blob_list = container_client.list_blobs() + documents = [] + for blob in blob_list: + if not blob.name.lower().endswith('.pdf'): + log_text.write(f"Skipping non-PDF blob: {blob.name}") + continue + + log_text.write(f"Processing blob: {blob.name}") + blob_client = container_client.get_blob_client(blob) + try: + document = load_blob_content(blob_client) + metadata = {"blob_name": blob.name} + chunks = split_text_with_metadata(document, metadata) + documents.extend(chunks) + except Exception as e: + log_text.write(f"Failed to process blob {blob.name}: {e}") + + log_text.write("Blobs processed and documents chunked.") + + # Generate embeddings + log_text.write("Generating embeddings...") + embeddings = [] + tokenizer = tiktoken.get_encoding("cl100k_base") + max_tokens = 8192 + for i, doc in enumerate(documents): + log_text.write(f"Processing chunk {i + 1}/{len(documents)}") + log_text.write(f"Chunk text: {doc['text']}\n") + tokens = tokenizer.encode(doc["text"]) + if len(tokens) > max_tokens: + log_text.write(f"Skipping document chunk {i + 1} with {len(tokens)} tokens, exceeding max limit of {max_tokens}.") + continue + response = azure_openai.embeddings.create(input=doc["text"], model=os.getenv("AZURE_EMBEDDINGS_DEPLOYMENT")) + embeddings.append({ + "embedding": response.data[0].embedding, + "metadata": doc["metadata"] + }) + log_text.write(f"Embeddings: {response.data[0].embedding}") + + log_text.write("Embeddings generation complete.") + + # Create Search Index + log_text.write("Creating search index...") + credential = AzureKeyCredential(os.getenv("AZURE_SEARCH_ADMIN_KEY")) + search_index_client = SearchIndexClient(endpoint=os.getenv("AZURE_SEARCH_ENDPOINT"), credential=credential) + fields = [ + SimpleField(name="id", type=SearchFieldDataType.String, key=True), + SearchableField(name="content", type=SearchFieldDataType.String), + SearchableField(name="blob_name", type=SearchFieldDataType.String), + SearchField( + name="embedding", + type=SearchFieldDataType.Collection(SearchFieldDataType.Single), + searchable=True, + vector_search_dimensions=1536, + vector_search_profile_name="myHnswProfile" + ) + ] + vector_search = VectorSearch( + algorithms=[ + HnswAlgorithmConfiguration(name="myHnsw") + ], + profiles=[ + VectorSearchProfile( + name="myHnswProfile", + algorithm_configuration_name="myHnsw" + ) + ] + ) + index = SearchIndex(name="documents-index", fields=fields, vector_search=vector_search) + search_index_client.create_index(index) + log_text.write("Search index created.") + + # Upload chunks and embeddings to Azure AI Search + log_text.write("Uploading documents to 
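+            # Sliding-window chunking: each chunk keeps its token offsets and a
+            # 50-character preview in metadata, and the loop advances by
+            # max_length - overlap tokens so consecutive chunks share `overlap`
+            # tokens of context across boundaries.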
search index...") + search_client = SearchClient(endpoint=os.getenv("AZURE_SEARCH_ENDPOINT"), index_name="documents-index", credential=credential) + documents_to_upload = [] + + for i, doc in enumerate(embeddings): + documents_to_upload.append({ + "id": str(i), + "content": documents[i]["text"], + "embedding": doc["embedding"], + "blob_name": doc["metadata"]["blob_name"] + }) + search_client.upload_documents(documents=documents_to_upload) + log_text.success("Documents uploaded to search index.") + +def main(): + """ + Main program execution function. + """ st.markdown( - f'
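+    # Each uploaded record pairs a chunk's text and embedding vector with its
+    # source blob name, keyed by a stable string id, mirroring the id, content,
+    # embedding, and blob_name fields defined on the index above.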
', - unsafe_allow_html=True - ) - st.title("Demo - Azure OpenAI & AI Search") - # image = Image.open('image_logo2.png') - # st.image(image, caption = '') - st.write('This demo showcases an innovative way for users to engage with data housed in their Azure AI Search Index by leveraging both \ - semantic and vector search techniques. Semantic search enhances the querying process by comprehending the meaning and context of \ - user queries, thereby providing more pertinent results. Vector search, on the other hand, employs numerical representations of \ - text to identify similar content using cosine similarity. ***For users to effectively utilize this demo, it is essential that they \ - have previously created their Azure AI Search Index, following the necessary steps to upload and query their data as outlined [here](https://github.com/STRIDES/NIHCloudLabAzure/blob/main/notebooks/GenAI/Azure_Open_AI_README.md).***') - if 'messages' not in st.session_state: - st.session_state.messages = [] - index_name = st.text_input(label="Azure AI Search index name:", value="") - st.write('-'*50) - if index_name: - query = st.chat_input('Input search query here...') + f'
',
+        unsafe_allow_html=True
+    )
+    st.title("Demo - Azure OpenAI & AI Search")
+
+    task = st.sidebar.radio(
+        'Choose a function below:',
+        ['Vectorize', 'Retrieve']
+    )
+
+    # Task for retrieving documents from Azure AI Search in Streamlit UI
+    if task == 'Retrieve':
+        st.write('This demo showcases an innovative way for users to engage with data housed in their Azure AI Search Index by \
+            leveraging both semantic and vector search techniques. Semantic search enhances the querying process by comprehending \
+            the meaning and context of user queries, thereby providing more pertinent results. Vector search, on the other hand, employs \
+            numerical representations of text to identify similar content using cosine similarity. ***For users to effectively \
+            utilize this demo, it is essential that they have previously created their Azure AI Search Index by executing the \
+            "Vectorize" task.***')
+
+        if 'messages' not in st.session_state:
+            st.session_state.messages = []
+
+        index_name = os.getenv('AZURE_SEARCH_INDEX')
+
+        st.write('-'*50)
+        query = st.chat_input('Input search query here...')
     for message in st.session_state.messages:
         with st.chat_message(message["role"]):
             st.markdown(message['content'])
-    if query:
-        chat_on_your_data(query, index_name, st.session_state.messages)
-
-
-if __name__ == '__main__':
-    global_page_style()
-    main()
\ No newline at end of file
+        if query:
+            chat_on_your_data(query, index_name, st.session_state.messages)
+
+    # Task for embedding documents from Azure Blob to Azure AI Search index in Streamlit UI
+    elif task == 'Vectorize':
+        st.write('This demo processes PDF files from Azure Blob Storage, generates embeddings, and uploads them to Azure AI Search for indexing. \
+            ***For users to effectively utilize this demo, it is essential that they upload PDF files from the \
+            "/search_documents" directory to an Azure Blob container. Instructions to do this can be found [here](https://learn.microsoft.com/en-us/azure/storage/blobs/storage-quickstart-blobs-portal).***')
+        if st.button("Start Process"):
+            log_text = st.empty()
+            vectorize(log_text)
+
+if __name__ == '__main__':
+    global_page_style()
+    main()
\ No newline at end of file
diff --git a/notebooks/GenAI/embedding_demos/readme.md b/notebooks/GenAI/embedding_demos/readme.md
new file mode 100644
index 0000000..b240484
--- /dev/null
+++ b/notebooks/GenAI/embedding_demos/readme.md
@@ -0,0 +1,58 @@
+# Azure OpenAI Demo w/ Streamlit Frontend
+
+The Azure OpenAI Demo w/ Streamlit Frontend hosts various demonstrations that showcase the capabilities of Azure OpenAI behind a Streamlit frontend. Demonstrations include utilizing Azure OpenAI to create and chat over data indexes in Azure AI Search, and utilizing the Azure OpenAI SDK to generate embeddings for text and query those embeddings. For documentation on building Streamlit apps, see the links below:
+- [Streamlit Documentation](https://docs.streamlit.io/get-started)
+- [Geeksforgeeks](https://www.geeksforgeeks.org/a-beginners-guide-to-streamlit/)
+
+## Environment Setup
+To execute this demo, be sure to complete the following steps:
+
+1. Create a virtual environment in the /GenAI directory.
+
--- a/notebooks/GenAI/example_scripts/workshop_search.py +++ b/notebooks/GenAI/example_scripts/workshop_search.py @@ -1,25 +1,22 @@ -import openai +from openai import AzureOpenAI import pandas as pd import numpy as np import os -import streamlit as st from dotenv import load_dotenv load_dotenv() -# set keys and configure Azure OpenAI -os.environ["AZURE_OPENAI_ENDPOINT"] = "" -os.environ["AZURE_OPENAI_KEY"] = "" - #create embeddings functions to apply to a given column - -client = AzureOpenAI( - api_key=os.getenv("AZURE_OPENAI_KEY"), - api_version="2023-05-15", - azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT") +def get_embedding(text, engine): + client = AzureOpenAI( + api_key=os.getenv("Azure_OPENAI_KEY"), + api_version=os.getenv('AZURE_OPENAI_VERSION'), + azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT') ) + embeddings = client.embeddings.create(input = [text], model=engine).data[0].embedding + return embeddings #create cosine function def cosine_similarity(a, b): @@ -27,13 +24,13 @@ def cosine_similarity(a, b): # read in the embeddings .csv # convert elements in 'embedding' column back to numpy array -df = pd.read_csv('microsoft-earnings_embeddings.csv') +df = pd.read_csv('.\\microsoft-earnings_embeddings.csv') df['embedding'] = df['embedding'].apply(eval).apply(np.array) # caluculate user query embedding search_term = input("Enter a search term: ") if search_term: - search_term_vector = get_embedding(search_term, engine='text-embedding-ada-002') + search_term_vector = get_embedding(search_term, engine=os.getenv("AZURE_EMBEDDINGS_DEPLOYMENT")) # find similiarity between query and vectors df['similarities'] = df['embedding'].apply(lambda x:cosine_similarity(x, search_term_vector)) diff --git a/notebooks/GenAI/requirements.txt b/notebooks/GenAI/requirements.txt index 56ca87e..da48ce8 100644 --- a/notebooks/GenAI/requirements.txt +++ b/notebooks/GenAI/requirements.txt @@ -5,3 +5,4 @@ numpy streamlit azure-search-documents azure-identity +azure-storage-blob \ No newline at end of file From 9747d8ba2d8a0933237fdfae3bfd3070f922ba7e Mon Sep 17 00:00:00 2001 From: cjackson202 Date: Wed, 6 Nov 2024 14:00:41 -0500 Subject: [PATCH 3/7] update gitignore --- .gitignore | 1 - notebooks/GenAI/.gitignore | 3 ++- 2 files changed, 2 insertions(+), 2 deletions(-) delete mode 100644 .gitignore diff --git a/.gitignore b/.gitignore deleted file mode 100644 index d38da71..0000000 --- a/.gitignore +++ /dev/null @@ -1 +0,0 @@ -notebooks/GenAI/embedding_demos/p1.py diff --git a/notebooks/GenAI/.gitignore b/notebooks/GenAI/.gitignore index ad23308..f61515b 100644 --- a/notebooks/GenAI/.gitignore +++ b/notebooks/GenAI/.gitignore @@ -1,4 +1,5 @@ __pycache__ .venv .env -microsoft-earnings_embeddings.csv \ No newline at end of file +microsoft-earnings_embeddings.csv +embedding_demos/p1.py \ No newline at end of file From 91f11b65a0c7178c14a78a11c516ec96fc3e698e Mon Sep 17 00:00:00 2001 From: cjackson202 Date: Tue, 26 Nov 2024 10:16:19 -0500 Subject: [PATCH 4/7] juypter notebook created for aisearch demo, arm template added for Azure resources --- notebooks/GenAI/azure_infra_setup/README.md | 234 ++++++ .../azure_infra_setup/arm_resources.json | 249 ++++++ notebooks/GenAI/embedding_demos/Demo_Suite.py | 19 +- .../embedding_demos/pages/AI_Search_Query.py | 46 +- .../embedding_demos/pages/AOAI_Embeddings.py | 10 +- notebooks/GenAI/embedding_demos/styling.py | 2 +- .../notebooks/AISearch_RAG_chatbot.ipynb | 719 ++++++++++++++++++ notebooks/GenAI/requirements.txt | 4 +- 8 files changed, 1255 insertions(+), 28 
deletions(-) create mode 100644 notebooks/GenAI/azure_infra_setup/README.md create mode 100644 notebooks/GenAI/azure_infra_setup/arm_resources.json create mode 100644 notebooks/GenAI/notebooks/AISearch_RAG_chatbot.ipynb diff --git a/notebooks/GenAI/azure_infra_setup/README.md b/notebooks/GenAI/azure_infra_setup/README.md new file mode 100644 index 0000000..cf0e16a --- /dev/null +++ b/notebooks/GenAI/azure_infra_setup/README.md @@ -0,0 +1,234 @@ +# Setting Up Azure Environment for Azure GenAI Cloud Lab + +Welcome! This guide will help you set up your Azure environment to complete the activities in the [Azure GenAI](../) directory of the NIH Cloud Lab. We will walk you through the steps required to configure PowerShell, deploy necessary resources using an ARM template, upload local files to Azure Storage Account, and acquire keys and secrets for `.env` variables. + +## Prerequisites + +- An active Azure subscription +- PowerShell installed on your machine (option 1) +- Azure CLI installed (option 2) + +## Steps + +### 1. Setting Up the Azure Module in PowerShell + +First, you need to install the Azure module in PowerShell to connect to your Azure account. + +```powershell +# Install the Az module (if using PowerShell) +Install-Module -Name Az -AllowClobber -Force + +# Import the Az module (if using Azure CLI) +Import-Module Az +``` + +### 2. Logging into Azure + +You can log into your Azure account either using PowerShell or Azure CLI. + +**Using PowerShell** +```powershell +# Log into your Azure account +Connect-AzAccount +``` +**Using Azure CLI** +```powershell +# Log into your Azure account +az login +``` + +### 3. Setting Variables + +Set the following variables, which you'll need throughout the setup process. + +**Using PowerShell** +```powershell +# Variables +$resourceGroupName="nihcloudlabrg" +$location="eastus2" +$templateFilePath="Path To ./arm_resources.json" +$storageAccountName="cloudlabstgacct" +$containerName="cloudlabdocuments" +$localFilePath="Path To ../search_documents" +$searchServiceName="cloudlabsearch" +$openAIResourceName="cloudlabaoai" +``` +**Using Azure CLI** +```bash +# Variables +resourceGroupName="nihcloudlabrg" +location="eastus2" +templateFilePath="Path To ./arm_resources.json" +storageAccountName="cloudlabstgacct" +containerName="cloudlabdocuments" +localFilePath="Path To ../search_documents" +searchServiceName="cloudlabsearch" +openAIResourceName="cloudlabaoai" +``` + +### 4. Creating an Empty Resource Group + +Create an empty resource group where the ARM template will deploy the necessary resources. + +**Using PowerShell** +```powershell +# Create a resource group +New-AzResourceGroup -Name $resourceGroupName -Location $location +``` +**Using Azure CLI** +```bash +# Create a resource group +az group create --name $resourceGroupName --location $location +``` + +### 5. Deploying the ARM Template + +Deploy the [ARM template](/notebooks/GenAI/azure_infra_setup/arm_resources.json) to create the Azure Storage Account, Azure AI Search, and Azure OpenAI resources. + +***Using PowerShell*** +```powershell +# Deploy the ARM template +New-AzResourceGroupDeployment -ResourceGroupName $resourceGroupName -TemplateFile $templateFilePath +``` +***Using Azure CLI*** +```bash +# Deploy the ARM template +az deployment group create --resource-group $resourceGroupName --template-file $templateFilePath +``` + +### 6. Uploading Local Files to Azure Storage + +Upload your local files to the blob container in the Azure Storage Account. 
+ +**Using PowerShell** +```powershell +# Get storage account context +$storageContext = (Get-AzStorageAccount -ResourceGroupName $resourceGroupName -Name $storageAccountName).Context + +# Upload all files in the directory +Get-ChildItem -Path $localFilePath -File | ForEach-Object { + Set-AzStorageBlobContent -File $_.FullName -Container $containerName -Context $storageContext +} +``` +**Using Azure CLI** +```bash +# Get storage account key +storageAccountKey=$(az storage account keys list --resource-group $resourceGroupName --account-name $storageAccountName --query "[0].value" --output tsv) + +# Upload all files in the directory +for file in localFilePath/*; do + az storage blob upload --account-name $storageAccountName --account-key $storageAccountKey --container-name $containerName --file file --name (basename file) +done +``` + +### 7. Retrieving API Keys + +Retrieve the API keys for each service created by the ARM template deployment. These secrets are confidential and should be handled appropriately. Once the output is received, the values will be added to your `.env` file, which should be created in the ./notebooks/GenAI directory. Note that this `.env` file is already added to the `.gitignore`. + +**Azure Storage Account** + +***Using PowerShell*** +```powershell +# Get the storage account key +$storageAccountKey = (Get-AzStorageAccountKey -ResourceGroupName $resourceGroupName -Name $storageAccountName)[0].Value +# Construct the Blob connection string +$connectionString = "DefaultEndpointsProtocol=https;AccountName=$storageAccountName;AccountKey=$storageAccountKey;EndpointSuffix=core.windows.net" +# Output the connection string +Write-Output $connectionString +``` +***Using Azure CLI*** +```bash +# Get the storage account key +storageAccountKey=(az storage account keys list --resource-group $resourceGroupName --account-name $storageAccountName --query '[0].value' --output tsv) +echo $storageAccountKey +# Construct the Blob connection string +connectionString="DefaultEndpointsProtocol=https;AccountName=$storageAccountName;AccountKey=$storageAccountKey;EndpointSuffix=core.windows.net" +echo $connectionString +``` + +You now have the secrets to set the following .env variables in your local file. Copy the values to your `.env`: +- ***BLOB_CONTAINER_NAME*** = Use the value of `$containerName` or `containerName`. +- ***BLOB_CONNECTION_STRING*** = Use the value of `$connectionString ` or `connectionString`. +- ***BLOB_ACCOUNT_NAME*** = Use the value of `$storageAccountName` or `storageAccountName`. + +**Azure AI Search** + +***Using PowerShell*** +```powershell +# Acquire the AI Search Admin Key +$adminKeys = Get-AzSearchAdminKeyPair -ResourceGroupName $resourceGroupName -ServiceName $searchServiceName +Write-Output $adminKeys +# Construct the AI Search Admin Key +$searchServiceEndpoint="https://$searchServiceName.search.windows.net" +Write-Output $searchServiceEndpoint +``` +***Using Azure CLI*** +```bash +# Acquire the AI Search Admin Key +searchServiceKey = az search admin-key show --resource-group resourceGroupName --service-name $searchServiceName --query primaryKey -o tsv +echo $searchServiceKey +# Construct the AI Search endpoint +searchServiceEndpoint="https://$searchServiceName.search.windows.net" +echo $searchServiceEndpoint +``` + +You now have the secrets to set the following .env variables in your local file. Copy the values to your `.env`: +- ***AZURE_SEARCH_ENDPOINT*** = Use the value of `$searchServiceEndpoint` or `searchServiceEndpoint`. 
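+If you prefer Python, the same upload can be scripted with the `azure-storage-blob` package already listed in `requirements.txt`. This is a minimal sketch, assuming the `BLOB_CONNECTION_STRING` value derived in the next section is exported in your environment; the container name and local path mirror the variables above:
+
+```python
+import os
+from azure.storage.blob import BlobServiceClient
+
+# Connect using the storage connection string (assumed to already be set).
+service = BlobServiceClient.from_connection_string(os.environ["BLOB_CONNECTION_STRING"])
+container = service.get_container_client("cloudlabdocuments")
+
+# Upload every file in the local search_documents directory.
+local_dir = "../search_documents"
+for name in os.listdir(local_dir):
+    path = os.path.join(local_dir, name)
+    if os.path.isfile(path):
+        with open(path, "rb") as data:
+            container.upload_blob(name=name, data=data, overwrite=True)
+```
+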
+- ***AZURE_SEARCH_ADMIN_KEY*** = Use the value of `$adminKeys.Primary` (PowerShell) or `searchServiceKey` (Azure CLI).
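+
+As a quick sanity check of these values (a minimal sketch, assuming the `azure-search-documents` package from `requirements.txt` is installed and the two variables above are exported), list the indexes the service exposes:
+
+```python
+import os
+from azure.core.credentials import AzureKeyCredential
+from azure.search.documents.indexes import SearchIndexClient
+
+# Authenticate with the admin key gathered above and list existing index names.
+client = SearchIndexClient(
+    endpoint=os.environ["AZURE_SEARCH_ENDPOINT"],
+    credential=AzureKeyCredential(os.environ["AZURE_SEARCH_ADMIN_KEY"]),
+)
+print([index.name for index in client.list_indexes()])
+```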
\ No newline at end of file diff --git a/notebooks/GenAI/azure_infra_setup/arm_resources.json b/notebooks/GenAI/azure_infra_setup/arm_resources.json new file mode 100644 index 0000000..16b1473 --- /dev/null +++ b/notebooks/GenAI/azure_infra_setup/arm_resources.json @@ -0,0 +1,249 @@ +{ + "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "parameters": { + "searchServices_cloudlabsearch_name": { + "defaultValue": "cloudlabsearch5", + "type": "String" + }, + "storageAccounts_genaicloudlab_name": { + "defaultValue": "cloudlabstgacct5", + "type": "String" + }, + "accounts_cloudlabaoai_name": { + "defaultValue": "cloudlabaoai5", + "type": "String" + } + }, + "variables": {}, + "resources": [ + { + "type": "Microsoft.CognitiveServices/accounts", + "apiVersion": "2024-06-01-preview", + "name": "[parameters('accounts_cloudlabaoai_name')]", + "location": "eastus2", + "sku": { + "name": "S0" + }, + "kind": "OpenAI", + "properties": { + "apiProperties": {}, + "customSubDomainName": "[parameters('accounts_cloudlabaoai_name')]", + "networkAcls": { + "defaultAction": "Allow", + "virtualNetworkRules": [], + "ipRules": [] + }, + "publicNetworkAccess": "Enabled" + } + }, + { + "type": "Microsoft.Search/searchServices", + "apiVersion": "2024-06-01-preview", + "name": "[parameters('searchServices_cloudlabsearch_name')]", + "location": "East US 2", + "sku": { + "name": "basic" + }, + "properties": { + "replicaCount": 1, + "partitionCount": 1, + "hostingMode": "Default", + "publicNetworkAccess": "Enabled", + "networkRuleSet": { + "ipRules": [], + "bypass": "None" + }, + "encryptionWithCmk": { + "enforcement": "Unspecified" + }, + "disableLocalAuth": false, + "authOptions": { + "apiKeyOnly": {} + }, + "disabledDataExfiltrationOptions": [], + "semanticSearch": "disabled" + } + }, + { + "type": "Microsoft.Storage/storageAccounts", + "apiVersion": "2023-05-01", + "name": "[parameters('storageAccounts_genaicloudlab_name')]", + "location": "eastus2", + "sku": { + "name": "Standard_LRS", + "tier": "Standard" + }, + "kind": "StorageV2", + "properties": { + "dnsEndpointType": "Standard", + "defaultToOAuthAuthentication": false, + "publicNetworkAccess": "Enabled", + "allowCrossTenantReplication": false, + "minimumTlsVersion": "TLS1_2", + "allowBlobPublicAccess": false, + "allowSharedKeyAccess": true, + "largeFileSharesState": "Enabled", + "networkAcls": { + "bypass": "AzureServices", + "virtualNetworkRules": [], + "ipRules": [], + "defaultAction": "Allow" + }, + "supportsHttpsTrafficOnly": true, + "encryption": { + "requireInfrastructureEncryption": false, + "services": { + "file": { + "keyType": "Account", + "enabled": true + }, + "blob": { + "keyType": "Account", + "enabled": true + } + }, + "keySource": "Microsoft.Storage" + }, + "accessTier": "Hot" + } + }, + { + "type": "Microsoft.CognitiveServices/accounts/deployments", + "apiVersion": "2024-06-01-preview", + "name": "[concat(parameters('accounts_cloudlabaoai_name'), '/gpt-4o-mini')]", + "dependsOn": [ + "[resourceId('Microsoft.CognitiveServices/accounts', parameters('accounts_cloudlabaoai_name'))]" + ], + "sku": { + "name": "GlobalStandard", + "capacity": 10 + }, + "properties": { + "model": { + "format": "OpenAI", + "name": "gpt-4o-mini", + "version": "2024-07-18" + }, + "versionUpgradeOption": "OnceNewDefaultVersionAvailable", + "currentCapacity": 10, + "raiPolicyName": "Microsoft.DefaultV2" + } + }, + { + "type": "Microsoft.CognitiveServices/accounts/deployments", + "apiVersion": 
"2024-06-01-preview", + "name": "[concat(parameters('accounts_cloudlabaoai_name'), '/text-embedding-3-small')]", + "dependsOn": [ + "[resourceId('Microsoft.CognitiveServices/accounts', parameters('accounts_cloudlabaoai_name'))]" + ], + "sku": { + "name": "Standard", + "capacity": 120 + }, + "properties": { + "model": { + "format": "OpenAI", + "name": "text-embedding-3-small", + "version": "1" + }, + "versionUpgradeOption": "OnceNewDefaultVersionAvailable", + "currentCapacity": 120, + "raiPolicyName": "Microsoft.DefaultV2" + } + }, + { + "type": "Microsoft.Storage/storageAccounts/blobServices", + "apiVersion": "2023-05-01", + "name": "[concat(parameters('storageAccounts_genaicloudlab_name'), '/default')]", + "dependsOn": [ + "[resourceId('Microsoft.Storage/storageAccounts', parameters('storageAccounts_genaicloudlab_name'))]" + ], + "sku": { + "name": "Standard_LRS", + "tier": "Standard" + }, + "properties": { + "containerDeleteRetentionPolicy": { + "enabled": true, + "days": 7 + }, + "cors": { + "corsRules": [] + }, + "deleteRetentionPolicy": { + "allowPermanentDelete": false, + "enabled": true, + "days": 7 + } + } + }, + { + "type": "Microsoft.Storage/storageAccounts/fileServices", + "apiVersion": "2023-05-01", + "name": "[concat(parameters('storageAccounts_genaicloudlab_name'), '/default')]", + "dependsOn": [ + "[resourceId('Microsoft.Storage/storageAccounts', parameters('storageAccounts_genaicloudlab_name'))]" + ], + "sku": { + "name": "Standard_LRS", + "tier": "Standard" + }, + "properties": { + "protocolSettings": { + "smb": {} + }, + "cors": { + "corsRules": [] + }, + "shareDeleteRetentionPolicy": { + "enabled": true, + "days": 7 + } + } + }, + { + "type": "Microsoft.Storage/storageAccounts/queueServices", + "apiVersion": "2023-05-01", + "name": "[concat(parameters('storageAccounts_genaicloudlab_name'), '/default')]", + "dependsOn": [ + "[resourceId('Microsoft.Storage/storageAccounts', parameters('storageAccounts_genaicloudlab_name'))]" + ], + "properties": { + "cors": { + "corsRules": [] + } + } + }, + { + "type": "Microsoft.Storage/storageAccounts/tableServices", + "apiVersion": "2023-05-01", + "name": "[concat(parameters('storageAccounts_genaicloudlab_name'), '/default')]", + "dependsOn": [ + "[resourceId('Microsoft.Storage/storageAccounts', parameters('storageAccounts_genaicloudlab_name'))]" + ], + "properties": { + "cors": { + "corsRules": [] + } + } + }, + { + "type": "Microsoft.Storage/storageAccounts/blobServices/containers", + "apiVersion": "2023-05-01", + "name": "[concat(parameters('storageAccounts_genaicloudlab_name'), '/default/cloudlabdocuments')]", + "dependsOn": [ + "[resourceId('Microsoft.Storage/storageAccounts/blobServices', parameters('storageAccounts_genaicloudlab_name'), 'default')]", + "[resourceId('Microsoft.Storage/storageAccounts', parameters('storageAccounts_genaicloudlab_name'))]" + ], + "properties": { + "immutableStorageWithVersioning": { + "enabled": false + }, + "defaultEncryptionScope": "$account-encryption-key", + "denyEncryptionScopeOverride": false, + "publicAccess": "None" + } + } + ] +} \ No newline at end of file diff --git a/notebooks/GenAI/embedding_demos/Demo_Suite.py b/notebooks/GenAI/embedding_demos/Demo_Suite.py index 327eb8d..937cbf1 100644 --- a/notebooks/GenAI/embedding_demos/Demo_Suite.py +++ b/notebooks/GenAI/embedding_demos/Demo_Suite.py @@ -22,17 +22,22 @@ def main(): st.markdown("---") # Chat with Your Data section - st.markdown("### Chat with Your Data using Azure OpenAI API and AI Search Index (AI Search Query)") + 
st.markdown("### Generate & Search with Azure OpenAI & Azure AI Search (AI Search Query)") st.write(""" - This demo allows users to interact with data stored in their Azure AI Search Index using a combination of semantic and vector search methods. +This demo provides an interactive platform for users to manage documents stored in their Azure Blob Container. +This is accomplished by indexing the documents in Azure AI Search and employing a combination of semantic and vector search techniques. +In this demo, we concentrate on real documents that are housed in an Azure Blob Container. +These documents undergo a process of chunking, after which embeddings from these chunks are stored in Azure AI Search, serving as our vector database. """) st.write(""" - **Semantic Search**: Understands the meaning and context of your queries to deliver more relevant results. - **Vector Search**: Utilizes numerical representations of text to find similar content based on cosine similarity. """) - # Ensure the user has created the Azure AI search index already + # Ensure the user has uploaded documents to an Azure Storage Account Blob container. st.write(""" - **Note**: Users must have created the Azure AI search index already as shown here: [Upload your own data and query over it](https://github.com/STRIDES/NIHCloudLabAzure/blob/main/notebooks/GenAI/Azure_Open_AI_README.md) +**Important**: Ensure that you have already created an Azure Storage Account and [uploaded documents](https://github.com/STRIDES/NIHCloudLabAzure/tree/main/notebooks/GenAI/search_documents) to a Blob container. If you haven't completed this step yet, please follow one of the options below: +1. [Create and upload documents via Azure CLI or PowerShell with an ARM template](https://github.com/STRIDES/NIHCloudLabAzure/blob/main/notebooks/GenAI/azure_infra_setup/README.md) +2. [Manually upload documents via Azure Portal](https://learn.microsoft.com/en-us/azure/storage/blobs/storage-quickstart-blobs-portal) """) # Horizontal divider @@ -41,7 +46,11 @@ def main(): # Generate & Search with Azure OpenAI Embeddings section st.markdown("### Generate & Search with Azure OpenAI Embeddings (AOAI Embeddings)") st.write(""" - This demo enables users to generate embeddings from a pre-chunked CSV file and perform searches over the content using vector search. +This demo empowers users to generate embeddings from a pre-chunked CSV file and execute content searches using vector search. +Our sole focus in this instance is on producing document embeddings with Azure OpenAI and storing them in a local .csv file. +This exercise aids in understanding the functionality of the Azure OpenAI service, specifically its embedding generation process. +This demo is primarily designed as a learning tool, serving as an introduction to working with embeddings in Azure OpenAI. +As such, there is no prerequisite to chunk documents or store them in a vector database. """) st.write(""" - **Vectorize**: Creates embeddings based on the "microsoft-earnings.csv" file provided in this directory. The embeddings are generated from the "text" column. The CSV file is pre-chunked, meaning the text has already been split and prepared for embedding generation. A new CSV file will be created to store all generated embeddings, forming your vector store. 
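To make the search half of this demo concrete, here is a minimal sketch of the cosine-similarity lookup it performs over that generated CSV. It is illustrative only: the file name (`microsoft-earnings_embeddings.csv`) and the "embedding" column are assumptions about what the vectorize step writes out -- adjust them to your output -- and the client uses the same `.env` variables as the rest of the suite:

```python
# Illustrative sketch: cosine-similarity lookup over the embeddings CSV
import ast
import os

import numpy as np
import pandas as pd
from dotenv import load_dotenv
from openai import AzureOpenAI

load_dotenv()
client = AzureOpenAI(
    api_key=os.getenv("AZURE_OPENAI_KEY"),
    api_version=os.getenv("AZURE_OPENAI_VERSION"),
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
)

def cosine_similarity(a, b):
    a, b = np.asarray(a), np.asarray(b)
    return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b)))

# Load the vector store and parse the stringified embedding lists
df = pd.read_csv("microsoft-earnings_embeddings.csv")
df["embedding"] = df["embedding"].apply(ast.literal_eval)

# Embed the query with the same deployment used for the documents
query = "What was said about the budget?"
q_emb = client.embeddings.create(
    input=query, model=os.getenv("AZURE_EMBEDDINGS_DEPLOYMENT")
).data[0].embedding

# Rank chunks by cosine similarity and show the best match
df["score"] = df["embedding"].apply(lambda e: cosine_similarity(e, q_emb))
print(df.sort_values("score", ascending=False)[["text", "score"]].head(1))
```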
diff --git a/notebooks/GenAI/embedding_demos/pages/AI_Search_Query.py b/notebooks/GenAI/embedding_demos/pages/AI_Search_Query.py index eae28c8..27c7410 100644 --- a/notebooks/GenAI/embedding_demos/pages/AI_Search_Query.py +++ b/notebooks/GenAI/embedding_demos/pages/AI_Search_Query.py @@ -21,17 +21,11 @@ from openai import AzureOpenAI import tiktoken from styling import global_page_style +import re # Load environment variables load_dotenv() -# Configure Azure OpenAI parameters -azure_endpoint = os.getenv('AZURE_OPENAI_BASE') -azure_openai_api_key = os.getenv('AZURE_OPENAI_KEY') -azure_openai_api_version = os.getenv('AZURE_OPENAI_VERSION') -azure_ada_deployment = os.getenv('AZURE_EMBEDDINGS_DEPLOYMENT') -azure_gpt_deployment = os.getenv('AZURE_GPT_DEPLOYMENT') - # Configure Azure AI Search parameters search_endpoint = os.getenv('AZURE_SEARCH_ENDPOINT') search_key = os.getenv('AZURE_SEARCH_ADMIN_KEY') @@ -40,6 +34,13 @@ def chat_on_your_data(query, search_index, messages): """ Perform retrieval queries over documents from the Azure AI Search Index. """ + # Configure Azure OpenAI parameters + azure_endpoint = os.getenv('AZURE_OPENAI_ENDPOINT') + azure_openai_api_key = os.getenv('AZURE_OPENAI_KEY') + azure_openai_api_version = os.getenv('AZURE_OPENAI_VERSION') + azure_ada_deployment = os.getenv('AZURE_EMBEDDINGS_DEPLOYMENT') + azure_gpt_deployment = os.getenv('AZURE_GPT_DEPLOYMENT') + messages.append({"role": "user", "content": query}) with st.chat_message("user"): @@ -77,7 +78,7 @@ def chat_on_your_data(query, search_index, messages): "role_information": "You are an AI assistant that helps people find information.", "filter": None, "strictness": 3, - "top_n_documents": 1, + "top_n_documents": 5, "authentication": { "type": "api_key", "key": search_key @@ -91,12 +92,14 @@ def chat_on_your_data(query, search_index, messages): } ) - response_data = completion.to_dict() + response_data = completion.to_dict() ai_response = response_data['choices'][0]['message']['content'] - messages.append({"role": "assistant", "content": ai_response}) - + ai_response_cleaned = re.sub(r'\s+\.$', '.', re.sub(r'\[doc\d+\]', '', ai_response)) + citation = response_data["choices"][0]["message"]["context"]["citations"][0]["url"] + ai_response_final = f"{ai_response_cleaned}\n\nCitation(s):\n{citation}" + messages.append({"role": "assistant", "content": ai_response_final}) with st.chat_message("assistant"): - st.markdown(ai_response) + st.markdown(ai_response_final) def setup_azure_openai(log_text): """ @@ -111,7 +114,7 @@ def setup_azure_openai(log_text): log_text.write("Azure OpenAI setup complete.") return azure_openai -def connect_to_blob_storage(log_text, container): +def connect_to_blob_storage(log_text): """ Connects to Azure Blob Storage. 
""" @@ -188,7 +191,9 @@ def vectorize(log_text): blob_client = container_client.get_blob_client(blob) try: document = load_blob_content(blob_client) - metadata = {"blob_name": blob.name} + document_link = f'https://{os.getenv("BLOB_ACCOUNT_NAME")}.blob.core.windows.net/{os.getenv("BLOB_CONTAINER_NAME")}/{blob.name}' + metadata = {"blob_name": blob.name, + "document_link": document_link} chunks = split_text_with_metadata(document, metadata) documents.extend(chunks) except Exception as e: @@ -225,6 +230,7 @@ def vectorize(log_text): SimpleField(name="id", type=SearchFieldDataType.String, key=True), SearchableField(name="content", type=SearchFieldDataType.String), SearchableField(name="blob_name", type=SearchFieldDataType.String), + SearchableField(name="document_link", type=SearchFieldDataType.String), SearchField( name="embedding", type=SearchFieldDataType.Collection(SearchFieldDataType.Single), @@ -258,7 +264,8 @@ def vectorize(log_text): "id": str(i), "content": documents[i]["text"], "embedding": doc["embedding"], - "blob_name": doc["metadata"]["blob_name"] + "blob_name": doc["metadata"]["blob_name"], + "document_link": doc["metadata"]["document_link"] }) search_client.upload_documents(documents=documents_to_upload) log_text.success("Documents uploaded to search index.") @@ -280,8 +287,8 @@ def main(): # Task for retrieving documents from Azure AI Search in Streamlit UI if task == 'Retrieve': - st.write('This demo showcases an innovative way for users to engage with data housed in their Azure AI Search Index by \ - leveraging both semantic and vector search techniques. Semantic search enhances the querying process by comprehending \ + st.write('This demo allows users to chat over the data in the Azure AI Search Index by \ + leveraging both semantic and vector search techniques alongside the GPT model. Semantic search enhances the querying process by comprehending \ the meaning and context of user queries, thereby providing more pertinent results. Vector search, on the other hand, employs \ numerical representations of text to identify similar content using cosine similarity. ***For users to effectively \ utilize this demo, it is essential that they have previously created their Azure AI Search Index, by executing the \ @@ -303,8 +310,9 @@ def main(): # Task for embedding documents from Azure Blob to Azure AI Search index in Streamlit UI elif task == 'Vectorize': st.write('This demo processes PDF files from Azure Blob Storage, generates embeddings, and uploads them to Azure AI Search for indexing. \ - ***For users to effectively utilize this demo, it is essential that they uploade PDF files from the \ - "/search_documents" directory to Azure Blob container. 
Instructions to do this can be found [here](https://learn.microsoft.com/en-us/azure/storage/blobs/storage-quickstart-blobs-portal).***')
+            **Please complete this process before performing retrieval.** \
+            For users to effectively utilize this demo, it is essential that they upload PDF files from the \
+            [/search_documents](https://github.com/STRIDES/NIHCloudLabAzure/tree/main/notebooks/GenAI/search_documents) directory to Azure Blob container.')
         if st.button("Start Process"):
             log_text = st.empty()
             vectorize(log_text)
diff --git a/notebooks/GenAI/embedding_demos/pages/AOAI_Embeddings.py b/notebooks/GenAI/embedding_demos/pages/AOAI_Embeddings.py
index 77702de..e2b807f 100644
--- a/notebooks/GenAI/embedding_demos/pages/AOAI_Embeddings.py
+++ b/notebooks/GenAI/embedding_demos/pages/AOAI_Embeddings.py
@@ -61,8 +61,14 @@ def embeddings_search():
 
     # Streamlit configuration
     st.subheader("Vector Search")
-    st.write('This process generates embeddings based on user queries, utilizing the compiled CSV that was created, to search for the most similar\
-        documents within the vector store by employing cosine similarity.')
+    st.write('''
+This process generates embeddings based on user queries, utilizing the compiled CSV that was created, to search for the most similar
+documents within the vector store by employing cosine similarity. Example questions a user can ask about the microsoft-earnings.csv:
+- What was said about the budget?
+- How many people utilize GitHub to build software?
+- How many points did Microsoft Cloud gross margin percentage increase by?
+- What are the expectations for the Q2 cash flow?''')
+
     if 'answer' not in st.session_state:
         st.session_state.answer = []
     if 'score' not in st.session_state:
diff --git a/notebooks/GenAI/embedding_demos/styling.py b/notebooks/GenAI/embedding_demos/styling.py
index da99bcf..cbb5036 100644
--- a/notebooks/GenAI/embedding_demos/styling.py
+++ b/notebooks/GenAI/embedding_demos/styling.py
@@ -1,7 +1,7 @@
 import streamlit as st
 
 def global_page_style():
-    st.set_page_config(layout="centered")
+    st.set_page_config(layout="wide")
     with open('style.css') as f:
         css = f.read()
     st.markdown(f'<style>{css}</style>', unsafe_allow_html=True)
\ No newline at end of file
diff --git a/notebooks/GenAI/notebooks/AISearch_RAG_chatbot.ipynb b/notebooks/GenAI/notebooks/AISearch_RAG_chatbot.ipynb
new file mode 100644
index 0000000..6bcd016
--- /dev/null
+++ b/notebooks/GenAI/notebooks/AISearch_RAG_chatbot.ipynb
@@ -0,0 +1,719 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Document Embedding and Indexing with Azure OpenAI and AI Search\n",
+    "\n",
+    "## Introduction \n",
+    "This notebook is designed to help developers build applications that utilize various Azure services to process and retrieve data. The main goal is to pull files from Azure Blob Storage, generate embeddings using Azure OpenAI, store these documents with custom metadata in an Azure AI Index, and then interact with the indexed data via Azure OpenAI. \n",
+    "### Objectives \n",
+    "1. **Vectorize**: \n",
+    "   - Pull files from Azure Blob Containers. \n",
+    "   - Generate embeddings using Azure OpenAI. \n",
+    "   - Store documents with custom metadata in an Azure AI Index. \n",
+    "2. **Retrieve**: \n",
+    "   - Chat over the data indexes with Azure OpenAI. \n",
+    "Each section of the notebook will focus on specific tasks and utilize the Python SDKs provided by each Azure service to accomplish these tasks. 
By the end of this notebook, you will have a comprehensive understanding of how to integrate and use these Azure services to build a robust data processing and retrieval application. \n",
+    "### Prerequisites \n",
+    "Before proceeding with this notebook, please ensure that you have the following Azure services deployed and configured: \n",
+    " \n",
+    "1. **Azure OpenAI Service**: \n",
+    "   - Ensure that you have deployed both a GPT model and an embedding (Ada) model within your Azure OpenAI instance. \n",
+    "2. **Azure AI Search**: \n",
+    "   - Your Azure AI Search service should be a minimum of the Basic tier to ensure compatibility with Azure OpenAI. \n",
+    "3. **Azure Blob Storage Account**: \n",
+    "   - You should have an Azure Blob Storage account with PDF files stored in a blob container. Sample files are provided in the `/search_documents` directory of the `GenAI` directory. "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 0. Environment Setup \n",
+    "This section will guide you through setting up the environment for the notebook. We will import the necessary libraries, load environment variables, and configure Azure AI Search parameters. "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 0.1 Install Python libraries from requirements.txt\n",
+    "\n",
+    "To ensure all necessary Python libraries are installed in the virtual environment for this notebook, we will use `pip` to install the packages specified in the `requirements.txt` file."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%pip install -r ../requirements.txt # Installs all packages from requirements.txt into your virtual environment (.venv)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 0.2 Import Necessary Libraries \n",
+    "Import the packages installed in the virtual environment. This step makes the required functionality available to the rest of the notebook."
+ ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "# Import necessary libraries \n", + " \n", + "# For handling file and directory operations \n", + "import os \n", + " \n", + "# For handling I/O operations \n", + "import io \n", + " \n", + "# For extracting text and tables from PDF files \n", + "import pdfplumber \n", + " \n", + "# For interacting with Azure Blob Storage \n", + "from azure.storage.blob import BlobServiceClient \n", + " \n", + "# For handling Azure credentials \n", + "from azure.core.credentials import AzureKeyCredential \n", + "from azure.identity import DefaultAzureCredential \n", + " \n", + "# For working with Azure Search service \n", + "from azure.search.documents import SearchClient \n", + "from azure.search.documents.indexes import SearchIndexClient \n", + " \n", + "# For configuring search indexes and fields \n", + "from azure.search.documents.indexes.models import ( \n", + " SimpleField, # Represents a simple field in an index \n", + " SearchFieldDataType, # Represents the data type of a field \n", + " VectorSearch, # Enables vector search capabilities \n", + " SearchIndex, # Represents a search index \n", + " SearchableField, # Represents a searchable field \n", + " SearchField, # Represents a field in a search index \n", + " VectorSearchProfile, # Represents a vector search profile \n", + " HnswAlgorithmConfiguration # Configuration for HNSW algorithm in vector search \n", + ") \n", + " \n", + "# For loading environment variables from a .env file \n", + "from dotenv import load_dotenv \n", + " \n", + "# For utilizing OpenAI functionalities within Azure \n", + "from openai import AzureOpenAI \n", + " \n", + "# For tokenization tasks \n", + "import tiktoken \n", + " \n", + "# For regular expression operations \n", + "import re " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 0.3 Load Environment Variables \n", + "Load the environment variables from a `.env` file. Ensure you have a `.env` file with the required Azure service credentials and configurations. This file should contain all necessary keys and connection strings to connect to your Azure services." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load environment variables \n", + "load_dotenv() \n", + " \n", + "# Example .env file format: \n", + "# AZURE_OPENAI_VERSION=your_openai_version \n", + "# AZURE_OPENAI_BASE=your_openai_base_url \n", + "# AZURE_OPENAI_ENDPOINT=your_openai_endpoint \n", + "# AZURE_OPENAI_KEY=your_openai_key \n", + "# AZURE_GPT_DEPLOYMENT=your_gpt_deployment \n", + "# AZURE_EMBEDDINGS_DEPLOYMENT=your_embeddings_deployment \n", + "# AZURE_SEARCH_ENDPOINT=your_search_endpoint \n", + "# AZURE_SEARCH_ADMIN_KEY=your_search_admin_key \n", + "# AZURE_SEARCH_INDEX=your_search_index \n", + "# BLOB_CONTAINER_NAME=your_blob_container_name \n", + "# BLOB_CONNECTION_STRING=your_blob_connection_string \n", + "# BLOB_ACCOUNT_NAME=your_blob_account_name " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- The `load_dotenv()` function reads the key-value pairs from the .env file and adds them to the environment variables.\n", + "- Replace the placeholder values in your .env file with your actual Azure service credentials and configuration details.\n", + "- ***This step is crucial for securely managing your credentials and keeping them out of your main codebase.***" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 0.4 Configure Azure AI Search Parameters\n", + "Configure the Azure AI Search parameters using the loaded environment variables. This allows us to set up the necessary configurations for connecting to the Azure AI Search service." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Configure Azure AI Search parameters \n", + "search_endpoint = os.getenv('AZURE_SEARCH_ENDPOINT') # Get the Azure Search endpoint from environment variables \n", + "search_key = os.getenv('AZURE_SEARCH_ADMIN_KEY') # Get the Azure Search admin key from environment variables " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- `os.getenv('AZURE_SEARCH_ENDPOINT')` retrieves the value of the AZURE_SEARCH_ENDPOINT environment variable, which contains the endpoint URL for your Azure Search service.\n", + "- `os.getenv('AZURE_SEARCH_ADMIN_KEY')` retrieves the value of the AZURE_SEARCH_ADMIN_KEY environment variable, which contains the admin key for your Azure Search service.\n", + "- These configurations are essential for authenticating and connecting to your Azure Search service." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Vectorization \n", + " \n", + "In this section, we will connect to Azure Blob Storage, process PDF documents into text chunks with metadata, generate embeddings using Azure OpenAI, and upload the data to Azure AI Search. \n", + "\n", + "Objectives:\n", + "1. Setup Function for Azure OpenAI\n", + "2. Connecting to Azure Blob Storage\n", + "3. Splitting Text with Metadata\n", + "4. Loading Blob Content\n", + "5. Vectorize Function" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1.1 Setup Function for Azure OpenAI \n", + "This function sets up the Azure OpenAI instance using the provided API key, version, and endpoint from environment variables. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def setup_azure_openai(): \n", + " \"\"\" \n", + " Sets up Azure OpenAI. 
\n", + " \"\"\" \n", + " print(\"Setting up Azure OpenAI...\") \n", + " azure_openai = AzureOpenAI( \n", + " api_key=os.getenv(\"AZURE_OPENAI_KEY\"), \n", + " api_version=os.getenv('AZURE_OPENAI_VERSION'), \n", + " azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT') \n", + " ) \n", + " print(\"Azure OpenAI setup complete.\") \n", + " return azure_openai" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1.2 Connecting to Azure Blob Storage \n", + "The following function connects to the Azure Blob Storage using the provided connection string and container name from the environment variables. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def connect_to_blob_storage(): \n", + " \"\"\" \n", + " Connects to Azure Blob Storage. \n", + " \"\"\" \n", + " print(\"Connecting to Blob Storage...\") \n", + " blob_service_client = BlobServiceClient.from_connection_string(os.getenv(\"BLOB_CONNECTION_STRING\")) \n", + " container_client = blob_service_client.get_container_client(os.getenv(\"BLOB_CONTAINER_NAME\")) \n", + " print(\"Connected to Blob Storage.\") \n", + " return container_client " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1.3 Splitting Text with Metadata \n", + "Split the content from PDF files into chunks with associated metadata. The text will be split by a max token length with additional chunk overlap. This is useful for processing large documents. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def split_text_with_metadata(text, metadata, max_length=800, overlap=75, encoding_name='cl100k_base'): \n", + " \"\"\" \n", + " Splits the text into chunks with metadata. \n", + " \"\"\" \n", + " tokenizer = tiktoken.get_encoding(encoding_name) \n", + " tokens = tokenizer.encode(text) \n", + " chunks = [] \n", + " start = 0 \n", + " end = max_length \n", + " \n", + " while start < len(tokens): \n", + " chunk = tokens[start:end] \n", + " chunk_text = tokenizer.decode(chunk) \n", + " chunk_metadata = metadata.copy() \n", + " chunk_metadata.update({ \n", + " 'start_token': start, \n", + " 'end_token': end, \n", + " 'chunk_length': len(chunk), \n", + " 'chunk_text_preview': chunk_text[:50] + '...' \n", + " }) \n", + " chunks.append({ \n", + " 'text': chunk_text, \n", + " 'metadata': chunk_metadata \n", + " }) \n", + " start = end - overlap \n", + " end = start + max_length \n", + " \n", + " return chunks " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "1. ***Tokenize Text***: The text is encoded into tokens using the tokenizer.\n", + "2. ***Initialize Variables***: Set up initial indices for chunking.\n", + "3. ***Create Chunks***: Loop through the tokens to create chunks:\n", + " - Extract a chunk of tokens.\n", + " - Decode the chunk back into text.\n", + " - Copy and update metadata with chunk-specific information.\n", + " - Append the chunk and its metadata to the list.\n", + "4. ***Overlap Handling***: Move the start index back by the overlap amount to ensure chunks overlap as specified.\n", + "\n", + "**Key Params**:\n", + "- ***Max Chunk Size (max_length)***: Each chunk will have a maximum of `max_length` tokens (default is 800 tokens).\n", + "- ***Chunk Overlap (overlap)***: Consecutive chunks will overlap by `overlap` tokens (default is 75 tokens)." 
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 1.4 Loading Blob Content \n",
+    "Loads and extracts the content of a PDF file from Azure Blob Storage via the blob client. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def load_blob_content(blob_client): \n",
+    "    \"\"\" \n",
+    "    Loads and returns the content of the PDF blob. \n",
+    "    \"\"\" \n",
+    "    blob_name = blob_client.blob_name \n",
+    "    if not blob_name.lower().endswith('.pdf'): \n",
+    "        raise ValueError(f\"Blob {blob_name} is not a PDF file.\") \n",
+    "    \n",
+    "    blob_data = blob_client.download_blob().readall() \n",
+    "    pdf_stream = io.BytesIO(blob_data) \n",
+    "    document_text = \"\" \n",
+    "    \n",
+    "    with pdfplumber.open(pdf_stream) as pdf: \n",
+    "        for page in pdf.pages: \n",
+    "            document_text += page.extract_text() + \"\\n\" \n",
+    "    \n",
+    "    return document_text "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "1. **Check File Type**:\n",
+    "   - The function first checks if the blob is a PDF file by verifying the file extension.\n",
+    "   - If the file is not a PDF, it raises a `ValueError`.\n",
+    "2. **Download Blob Content**:\n",
+    "   - The blob content is downloaded and read into `blob_data`.\n",
+    "3. **Convert to Stream**:\n",
+    "   - The blob data is converted into a byte stream using `io.BytesIO`.\n",
+    "4. **Extract Text from PDF**:\n",
+    "   - The PDF is opened using `pdfplumber`.\n",
+    "   - Text is extracted from each page of the PDF and concatenated into `document_text`.\n",
+    "5. **Return Document Text**:\n",
+    "   - The function returns the extracted text from the PDF."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 1.5 Vectorize Function \n",
+    "Create a function that orchestrates the vector workflow. This function connects to the Azure services, processes blobs, generates embeddings, and uploads the data to the Azure AI Search index. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def vectorize(): \n",
+    "    \"\"\" \n",
+    "    Main function that orchestrates the vector workflow. 
\n", + " \"\"\" \n", + " azure_openai = setup_azure_openai() \n", + " container_client = connect_to_blob_storage() \n", + " \n", + " # Read and chunk documents with metadata \n", + " print(\"Listing blobs in container...\") \n", + " blob_list = container_client.list_blobs() \n", + " documents = [] \n", + " for blob in blob_list: \n", + " if not blob.name.lower().endswith('.pdf'): \n", + " print(f\"Skipping non-PDF blob: {blob.name}\") \n", + " continue \n", + " \n", + " print(f\"Processing blob: {blob.name}\") \n", + " blob_client = container_client.get_blob_client(blob) \n", + " try: \n", + " document = load_blob_content(blob_client) \n", + " document_link = f'https://{os.getenv(\"BLOB_ACCOUNT_NAME\")}.blob.core.windows.net/{os.getenv(\"BLOB_CONTAINER_NAME\")}/{blob.name}' \n", + " \n", + " metadata = {\"blob_name\": blob.name, \"document_link\": document_link} \n", + " chunks = split_text_with_metadata(document, metadata) \n", + " documents.extend(chunks) \n", + " except Exception as e: \n", + " print(f\"Failed to process blob {blob.name}: {e}\") \n", + " \n", + " print(\"Blobs processed and documents chunked.\") \n", + " \n", + " # Generate embeddings \n", + " print(\"Generating embeddings...\") \n", + " embeddings = [] \n", + " tokenizer = tiktoken.get_encoding(\"cl100k_base\") \n", + " max_tokens = 8192 \n", + " for i, doc in enumerate(documents): \n", + " print(f\"Processing chunk {i + 1}/{len(documents)}\") \n", + " print(f\"Chunk text: {doc['text']}\\n\") \n", + " tokens = tokenizer.encode(doc[\"text\"]) \n", + " if len(tokens) > max_tokens: \n", + " print(f\"Skipping document chunk {i + 1} with {len(tokens)} tokens, exceeding max limit of {max_tokens}.\") \n", + " continue \n", + " response = azure_openai.embeddings.create(input=doc[\"text\"], model=os.getenv(\"AZURE_EMBEDDINGS_DEPLOYMENT\")) \n", + " embeddings.append({ \n", + " \"embedding\": response.data[0].embedding, \n", + " \"metadata\": doc[\"metadata\"] \n", + " }) \n", + " print(f\"Embeddings: {response.data[0].embedding}\") \n", + " \n", + " print(\"Embeddings generation complete.\") \n", + " \n", + " # Create Search Index \n", + " print(\"Creating search index...\") \n", + " credential = AzureKeyCredential(os.getenv(\"AZURE_SEARCH_ADMIN_KEY\")) \n", + " search_index_client = SearchIndexClient(endpoint=os.getenv(\"AZURE_SEARCH_ENDPOINT\"), credential=credential) \n", + " fields = [ \n", + " SimpleField(name=\"id\", type=SearchFieldDataType.String, key=True), \n", + " SearchableField(name=\"content\", type=SearchFieldDataType.String), \n", + " SearchableField(name=\"blob_name\", type=SearchFieldDataType.String), \n", + " SearchableField(name=\"document_link\", type=SearchFieldDataType.String), \n", + " SearchField( \n", + " name=\"embedding\", \n", + " type=SearchFieldDataType.Collection(SearchFieldDataType.Single), \n", + " searchable=True, \n", + " vector_search_dimensions=1536, \n", + " vector_search_profile_name=\"myHnswProfile\" \n", + " ) \n", + " ] \n", + " vector_search = VectorSearch( \n", + " algorithms=[ \n", + " HnswAlgorithmConfiguration(name=\"myHnsw\") \n", + " ], \n", + " profiles=[ \n", + " VectorSearchProfile( \n", + " name=\"myHnswProfile\", \n", + " algorithm_configuration_name=\"myHnsw\" \n", + " ) \n", + " ] \n", + " ) \n", + " index = SearchIndex(name=\"documents-index\", fields=fields, vector_search=vector_search) \n", + " search_index_client.create_index(index) \n", + " print(\"Search index created.\") \n", + " \n", + " # Upload chunks and embeddings to Azure AI Search \n", + " print(\"Uploading 
documents to search index...\") \n",
+    "    search_client = SearchClient(endpoint=os.getenv(\"AZURE_SEARCH_ENDPOINT\"), index_name=\"documents-index\", credential=credential) \n",
+    "    documents_to_upload = [] \n",
+    "    \n",
+    "    for i, doc in enumerate(embeddings): \n",
+    "        documents_to_upload.append({ \n",
+    "            \"id\": str(i), \n",
+    "            \"content\": documents[i][\"text\"], \n",
+    "            \"embedding\": doc[\"embedding\"], \n",
+    "            \"blob_name\": doc[\"metadata\"][\"blob_name\"], \n",
+    "            \"document_link\": doc[\"metadata\"][\"document_link\"] \n",
+    "        }) \n",
+    "    search_client.upload_documents(documents=documents_to_upload) \n",
+    "    print(\"Documents uploaded to search index.\") \n",
+    "\n",
+    "vectorize()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "1. **Setup Connections**:\n",
+    "   - Connect to Azure OpenAI and Blob Storage.\n",
+    "2. **Process Blobs**:\n",
+    "   - List blobs in the container.\n",
+    "   - For each PDF blob, load its content and split it into chunks with metadata.\n",
+    "3. **Customize Metadata**:\n",
+    "   - Add custom metadata such as the blob file name and blob URL:\n",
+    "   ```python\n",
+    "   metadata = {\"blob_name\": blob.name, \"document_link\": document_link} \n",
+    "   ```\n",
+    "4. **Generate Embeddings**:\n",
+    "   - For each chunk, generate embeddings using Azure OpenAI.\n",
+    "5. **Create Search Index**:\n",
+    "   - Define and create a search index in Azure AI Search.\n",
+    "6. **Upload Documents**:\n",
+    "   - Upload the chunks and their embeddings to the search index."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 2. Retrieve \n",
+    "\n",
+    "In this section, we will implement a function to perform retrieval queries over documents from the Azure AI Search Index using Azure OpenAI for chat capabilities.\n",
+    "\n",
+    "Objective: \n",
+    "1. Retrieve Function: `chat_on_your_data`"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 2.1 Retrieve Function: chat_on_your_data \n",
+    "Perform retrieval queries over documents from the Azure AI Search Index using Azure OpenAI. This function constructs a search query, interacts with the search index, and processes the results to provide relevant information based on the query."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "User: What year was New York State Route 373 built?\n",
+      "Processing...\n",
+      "GPT Response: New York State Route 373 was designated in 1930.\n",
+      "\n",
+      "Citation(s):\n",
+      "https://syndatastg.blob.core.windows.net/testcloudlab/New_York_State_Route_373.pdf\n"
+     ]
+    }
+   ],
+   "source": [
+    "def chat_on_your_data(): \n",
+    "    \"\"\" \n",
+    "    Perform retrieval queries over documents from the Azure AI Search Index. 
\n",
+    "    \"\"\" \n",
+    "    # Define the query and other parameters \n",
+    "    query = \"What year was New York State Route 373 built?\"  # Example query\n",
+    "    search_index = \"documents-index\" \n",
+    "    messages = [] \n",
+    "    \n",
+    "    # Configure Azure OpenAI parameters \n",
+    "    azure_endpoint = os.getenv('AZURE_OPENAI_ENDPOINT') \n",
+    "    azure_openai_api_key = os.getenv('AZURE_OPENAI_KEY') \n",
+    "    azure_openai_api_version = os.getenv('AZURE_OPENAI_VERSION') \n",
+    "    azure_ada_deployment = os.getenv('AZURE_EMBEDDINGS_DEPLOYMENT') \n",
+    "    azure_gpt_deployment = os.getenv('AZURE_GPT_DEPLOYMENT') \n",
+    "    \n",
+    "    # Append user query to chat messages \n",
+    "    messages.append({\"role\": \"user\", \"content\": query}) \n",
+    "    \n",
+    "    print(f\"User: {query}\") \n",
+    "    \n",
+    "    print('Processing...') \n",
+    "    # Initialize the AzureOpenAI client \n",
+    "    client = AzureOpenAI( \n",
+    "        azure_endpoint=azure_endpoint, \n",
+    "        api_key=azure_openai_api_key, \n",
+    "        api_version=azure_openai_api_version, \n",
+    "    ) \n",
+    "    # Create a chat completion with Azure OpenAI \n",
+    "    completion = client.chat.completions.create( \n",
+    "        model=azure_gpt_deployment, \n",
+    "        messages=[ \n",
+    "            {\"role\": \"system\", \"content\": \"You are an AI assistant that helps people find information. Ensure the Markdown responses are correctly formatted before responding.\"}, \n",
+    "            {\"role\": \"user\", \"content\": query} \n",
+    "        ], \n",
+    "        max_tokens=800, \n",
+    "        temperature=0.7, \n",
+    "        top_p=0.95, \n",
+    "        frequency_penalty=0, \n",
+    "        presence_penalty=0, \n",
+    "        stop=None, \n",
+    "        stream=False, \n",
+    "        extra_body={ \n",
+    "            \"data_sources\": [{ \n",
+    "                \"type\": \"azure_search\", \n",
+    "                \"parameters\": { \n",
+    "                    \"endpoint\": search_endpoint, \n",
+    "                    \"index_name\": search_index, \n",
+    "                    \"semantic_configuration\": \"default\", \n",
+    "                    \"query_type\": \"vector_simple_hybrid\", \n",
+    "                    \"fields_mapping\": {}, \n",
+    "                    \"in_scope\": True, \n",
+    "                    \"role_information\": \"You are an AI assistant that helps people find information.\", \n",
+    "                    \"filter\": None, \n",
+    "                    \"strictness\": 3, \n",
+    "                    \"top_n_documents\": 5, \n",
+    "                    \"authentication\": { \n",
+    "                        \"type\": \"api_key\", \n",
+    "                        \"key\": search_key \n",
+    "                    }, \n",
+    "                    \"embedding_dependency\": { \n",
+    "                        \"type\": \"deployment_name\", \n",
+    "                        \"deployment_name\": azure_ada_deployment \n",
+    "                    } \n",
+    "                } \n",
+    "            }] \n",
+    "        } \n",
+    "    ) \n",
+    "    \n",
+    "    # Extract the response data \n",
+    "    response_data = completion.to_dict() \n",
+    "    ai_response = response_data['choices'][0]['message']['content'] \n",
+    "    # Clean up the AI response \n",
+    "    ai_response_cleaned = re.sub(r'\\s+\\.$', '.', re.sub(r'\\[doc\\d+\\]', '', ai_response)) \n",
+    "    citation = response_data[\"choices\"][0][\"message\"][\"context\"][\"citations\"][0][\"url\"] \n",
+    "    ai_response_final = f\"{ai_response_cleaned}\\n\\nCitation(s):\\n{citation}\" \n",
+    "    \n",
+    "    # Append AI response to chat messages \n",
+    "    messages.append({\"role\": \"assistant\", \"content\": ai_response_final}) \n",
+    "    \n",
+    "    print(f\"GPT Response: {ai_response_final}\") \n",
+    "    \n",
+    "# Call the function to test it \n",
+    "chat_on_your_data() "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "1. 
**Define Query and Search Index**:\n",
+    "   - Set up the user query and the name of the search index to query.\n",
+    "   - Default values are provided:\n",
+    "     - Example query: `\"What year was New York State Route 373 built?\"`\n",
+    "     - Search index: `\"documents-index\"`\n",
+    "2. **Configure Azure OpenAI Parameters**:\n",
+    "   - Retrieve necessary configurations and API keys from environment variables.\n",
+    "3. **Append User Query**:\n",
+    "   - Append the user's query to the chat messages list.\n",
+    "4. **Initialize AzureOpenAI Client**:\n",
+    "   - Initialize the Azure OpenAI client using the provided endpoint, API key, and API version.\n",
+    "5. **Create Chat Completion**:\n",
+    "   - Create a chat completion request using Azure OpenAI.\n",
+    "   - Specify the model deployment, chat messages, and additional parameters like `max_tokens`, `temperature`, etc.\n",
+    "   - Provide extra body parameters to include Azure Search as a data source.\n",
+    "   - Extra Body Parameters:\n",
+    "     - `endpoint`: The Azure Search endpoint.\n",
+    "     - `index_name`: The name of the search index.\n",
+    "     - `semantic_configuration`: The semantic search configuration.\n",
+    "     - `query_type`: Type of query (e.g., `vector_simple_hybrid`).\n",
+    "     - `fields_mapping`: Mapping of fields (if any).\n",
+    "     - `role_information`: Information about the role of the assistant.\n",
+    "     - `filter`: Any filters to apply to the search (if any).\n",
+    "     - `strictness`: Level of strictness for the search.\n",
+    "     - `top_n_documents`: Number of top documents to retrieve.\n",
+    "     - `authentication`: Authentication details (API key).\n",
+    "     - `embedding_dependency`: Embedding deployment details.\n",
+    "6. **Extract and Clean Response**:\n",
+    "   - Extract the response data from the completion result.\n",
+    "   - Clean up the AI response by removing unnecessary characters and formatting it properly.\n",
+    "   - Extract the citation URL from the response context.\n",
+    "7. **Append AI Response**:\n",
+    "   - Append the cleaned AI response to the chat messages list.\n",
+    "   - Print the final response."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Conclusion\n",
+    "\n",
+    "Congratulations! You have successfully created a Retrieval-Augmented Generation (RAG) application for documents stored in your Azure Blob Storage Account, using Azure OpenAI and Azure AI Search. At this point, you should have a solid understanding of how to build the logic for vectorizing documents from an Azure Blob Storage container and retrieving those documents in your Azure OpenAI application.\n",
+    "\n",
+    "### Key Accomplishments:\n",
+    "1. **Environment Setup**:\n",
+    "   - Initialized Azure OpenAI with the necessary API credentials and configurations.\n",
+    "   - Established a connection to Azure Blob Storage to access PDF documents.\n",
+    "2. **Vectorize**:\n",
+    "   - Implemented a function to split PDF text into manageable chunks with associated metadata.\n",
+    "   - Orchestrated the entire vectorization process:\n",
+    "     - Setup Azure OpenAI and connected to Azure Blob Storage.\n",
+    "     - Retrieved and chunked documents.\n",
+    "     - Generated embeddings for each chunk using Azure OpenAI.\n",
+    "     - Created a search index in Azure AI Search.\n",
+    "     - Uploaded the chunks and their embeddings to Azure AI Search.\n",
+    "3. 
**Retrieve**:\n",
+    "   - Implemented a function to perform retrieval queries over the documents indexed in Azure AI Search using Azure OpenAI.\n",
+    "   - Executed a user query and performed a search using Azure AI Search.\n",
+    "   - Generated a chat completion based on the search results and formatted it for display."
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/notebooks/GenAI/requirements.txt b/notebooks/GenAI/requirements.txt
index da48ce8..1f1f464 100644
--- a/notebooks/GenAI/requirements.txt
+++ b/notebooks/GenAI/requirements.txt
@@ -5,4 +5,6 @@ numpy
 streamlit
 azure-search-documents
 azure-identity
-azure-storage-blob
\ No newline at end of file
+azure-storage-blob
+pdfplumber
+tiktoken
\ No newline at end of file

From 5fe1b531f32e4e14bb11f506d69aeb794bd69ac2 Mon Sep 17 00:00:00 2001
From: cjackson202 
Date: Fri, 10 Jan 2025 12:39:00 -0500
Subject: [PATCH 5/7] resolve PR comments, rev 2

---
 notebooks/GenAI/azure_infra_setup/README.md   |  98 ++++-
 notebooks/GenAI/embedding_demos/Demo_Suite.py |   5 +-
 .../GenAI/embedding_demos/Demo_Suite_ngrok.py |  21 ++
 .../embedding_demos/pages/AI_Search_Query.py  | 126 +++++--
 .../embedding_demos/pages/AOAI_Embeddings.py  | 344 +++++++++++-------
 notebooks/GenAI/embedding_demos/readme.md     | 191 +++++++++-
 .../example_scripts/workshop_embedding.py     |  42 ++-
 .../GenAI/example_scripts/workshop_search.py  |   3 +-
 .../notebooks/AISearch_RAG_chatbot.ipynb      |  92 +++--
 notebooks/GenAI/requirements.txt              |   3 +-
 10 files changed, 684 insertions(+), 241 deletions(-)
 create mode 100644 notebooks/GenAI/embedding_demos/Demo_Suite_ngrok.py

diff --git a/notebooks/GenAI/azure_infra_setup/README.md b/notebooks/GenAI/azure_infra_setup/README.md
index cf0e16a..844a1ed 100644
--- a/notebooks/GenAI/azure_infra_setup/README.md
+++ b/notebooks/GenAI/azure_infra_setup/README.md
@@ -1,14 +1,77 @@
-# Setting Up Azure Environment for Azure GenAI Cloud Lab
-
-Welcome! This guide will help you set up your Azure environment to complete the activities in the [Azure GenAI](../) directory of the NIH Cloud Lab. We will walk you through the steps required to configure PowerShell, deploy necessary resources using an ARM template, upload local files to Azure Storage Account, and acquire keys and secrets for `.env` variables.
-
-## Prerequisites
+# Setting Up Azure Environment for Azure GenAI Cloud Lab
+
+This guide will help you set up your Azure environment to complete the activities in the [GenAI](../) directory of the NIH Cloud Lab.
+The purpose of this guide is to walk you through an automated deployment of the resources needed to carry out these activities.
+This automated approach utilizes a pre-built [ARM template](arm_resources.json) file, an alternative
+to manually deploying and configuring resources via the Azure portal.
+
+## Page Contents
++ [Learning Objectives](#learning-objectives)
++ [Prerequisites](#prerequisites)
++ [Resources and Pricing](#resources-and-pricing)
++ [Get Started](#get-started)
++ [Conclusion](#conclusion)
++ [Clean Up](#clean-up)
+
+## Learning Objectives
+
+1. 
Configure PowerShell or Azure CLI
+   - Step-by-step instructions to set up and configure PowerShell and Azure CLI for the necessary Azure resource deployments.
+2. Deploy Resources Using an ARM Template
+   - Detailed guidance on deploying the necessary resources in Azure using an ARM template for the [GenAI](../) directory.
+3. Upload Local Files to Azure Storage Account
+   - Instructions on how to upload files from the [search_documents](../search_documents/) directory to an Azure Storage Account Blob container.
+4. Acquire Keys and Secrets for .env Variables
+   - Steps to obtain keys and secrets from deployed resources and use them in your .env files for the tutorials in the [GenAI](../) directory.
+
+## Prerequisites
 
 - An active Azure subscription
 - PowerShell installed on your machine (option 1)
 - Azure CLI installed (option 2)
+
+### PowerShell (option 1) vs. Azure CLI (option 2)
+
+Choosing between Azure CLI and PowerShell comes down to personal preference and the working environment:
+
+- **Cloud Environments**: For users working in the cloud, such as with Azure Machine Learning or Azure VMs, Azure CLI may be a more suitable option.
+    - ***Note***: If users are utilizing any of these environments, please skip Step 1 and move directly to Step 2 using Azure CLI (option 2).
+- **Local Environments**: For users working on a local machine, both Azure CLI and PowerShell are viable options. The choice depends on personal preference.
+    - ***Note***: If users are utilizing Azure CLI, please skip Step 1 and move directly to Step 2.
+
+## Resources and Pricing
+
+Below is a list of the resources the provided ARM template deploys, along with an estimated cost breakdown for each.
+***An ARM template is a JSON file that defines the infrastructure and configuration for your Azure project***. It allows you to deploy, manage, and configure
+all the resources for your solution in a single, coordinated operation. Actual costs may vary depending on usage
+and the Azure pricing model for each resource.
+
+### Resources Deployed
+1. **Azure Storage Account**
+   - **Resource Type**: Storage Account (Standard_LRS)
+   - **Purpose**: This resource is used to store and manage files from [search_documents](../search_documents/) in a single container.
+   - **Estimated Cost**: $0.0184 per GB (about $18.40 per 1,000 GB) per month
+
+2. **Azure AI Search**
+   - **Resource Type**: Cognitive Search (Basic)
+   - **Purpose**: This resource provides AI search capabilities for the GenAI tutorials, including indexing and querying.
+   - **Estimated Cost**: about $0.10 per hour, or $73.73 per month
+
+3. **Azure OpenAI**
+   - **Resource Type**: Cognitive Services (Standard)
+   - **Purpose**: This resource provides access to OpenAI models, namely the GPT and embedding deployments listed below, for AI processing.
+   - **Models Deployed**:
+     - **Model**: gpt-4o-mini
+       - **Version**: 2024-07-18
+       - **Cost per 1M Tokens**: $0.15 input/$0.60 output
+     - **Model**: text-embedding-3-small
+       - **Version**: 1
+       - **Cost per 1K Tokens**: $0.00002
+   - **Estimated Cost**: Varies based on model usage and API calls. Please refer to [Azure OpenAI Service Pricing](https://azure.microsoft.com/en-us/pricing/details/cognitive-services/openai-service/?msockid=3df6a53ac4916aa73e41b1e3c5c36bd4) for more details. A quick way to confirm what was deployed, and is therefore accruing cost, is shown after this list. 
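The commands below are an optional, illustrative check with the Azure CLI. They assume you are logged in and that `$resourceGroupName` is set as in the Get Started steps; substitute your own search service name:

```bash
# List every resource in the lab's resource group, with its type and location
az resource list --resource-group $resourceGroupName --output table

# Show the Azure AI Search SKU (the per-hour line item above)
az search service show --resource-group $resourceGroupName \
    --name <your-search-service-name> --query "sku.name"
```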
-## Steps +Please refer to the [Azure Pricing Calculator](https://azure.microsoft.com/en-us/pricing/calculator/) for a more detailed and personalized estimate based on your specific usage patterns and region. + +## Get Started ### 1. Setting Up the Azure Module in PowerShell @@ -17,9 +80,6 @@ First, you need to install the Azure module in PowerShell to connect to your Azu ```powershell # Install the Az module (if using PowerShell) Install-Module -Name Az -AllowClobber -Force - -# Import the Az module (if using Azure CLI) -Import-Module Az ``` ### 2. Logging into Azure @@ -32,7 +92,7 @@ You can log into your Azure account either using PowerShell or Azure CLI. Connect-AzAccount ``` **Using Azure CLI** -```powershell +```bash # Log into your Azure account az login ``` @@ -83,7 +143,7 @@ az group create --name $resourceGroupName --location $location ### 5. Deploying the ARM Template -Deploy the [ARM template](/notebooks/GenAI/azure_infra_setup/arm_resources.json) to create the Azure Storage Account, Azure AI Search, and Azure OpenAI resources. +Deploy the [ARM template](arm_resources.json) to create the Azure Storage Account, Azure AI Search, and Azure OpenAI resources. ***Using PowerShell*** ```powershell @@ -123,7 +183,11 @@ done ### 7. Retrieving API Keys -Retrieve the API keys for each service created by the ARM template deployment. These secrets are confidential and should be handled appropriately. Once the output is received, the values will be added to your `.env` file, which should be created in the ./notebooks/GenAI directory. Note that this `.env` file is already added to the `.gitignore`. +Retrieve the API keys for each service created by the ARM template deployment. These secrets are confidential and should be handled appropriately. +Once the output is received, the values should be added to your `.env` file, which should be created in the [GenAI](../) directory. +Note that this `.env` file is already added to the `.gitignore` file, which tells Git which files or directories to ignore in a project, +preventing them from being tracked or included in version control. Adding `.env` to `.gitignore` is crucial because it prevents sensitive information +like API keys and passwords from being exposed in your version control system. **Azure Storage Account** @@ -221,9 +285,10 @@ AZURE_SEARCH_ADMIN_KEY = "Your Azure AI Search API key" BLOB_CONTAINER_NAME = "Your Azure Blob Container name hosting files from /search_documents" BLOB_CONNECTION_STRING = "Your Azure Blob connection string" ``` -## Conclusion +## Conclusion -Congratulations on completing the Azure setup! During this process, we established a new resource group dedicated to the NIH Cloud Lab environment and configured three Azure resources in your tenant using an ARM template file. The resources include: +Congratulations on completing the Azure setup! During this process, we established a new resource group dedicated to the NIH Cloud Lab environment and +configured three Azure resources in your tenant using an ARM template file. The resources include: - An Azure Storage Account with a deployed Blob container and files uploaded from `../search_documents` - Azure AI Search @@ -231,4 +296,7 @@ Congratulations on completing the Azure setup! During this process, we establish Additionally, we configured `.env` variables in your local `.env` file, which is added to `.gitignore` by default. -You are now ready to proceed with the GenAI activities in the NIH Cloud Lab. 
\ No newline at end of file
+You are now ready to proceed with the GenAI tutorials!
+
+## Clean Up
+No clean-up is necessary, as the created resources will be used for the tutorials found in [GenAI](../).
\ No newline at end of file
diff --git a/notebooks/GenAI/embedding_demos/Demo_Suite.py b/notebooks/GenAI/embedding_demos/Demo_Suite.py
index 937cbf1..af81d4c 100644
--- a/notebooks/GenAI/embedding_demos/Demo_Suite.py
+++ b/notebooks/GenAI/embedding_demos/Demo_Suite.py
@@ -15,7 +15,8 @@ def main():
     st.title("Azure OpenAI RAG Demo Suite")
     st.markdown("### Demo Overviews")
     st.write("""
-    Welcome to the Azure OpenAI RAG Demo Suite. On the left side-panel, you will find various demonstrations that showcase the capabilities of Azure OpenAI with a Streamlit frontend. Each demonstration is described in detail below, highlighting their unique features and functionalities.
+    Welcome to the Azure OpenAI RAG Demo Suite. On the left side-panel, you will find various demonstrations that showcase the capabilities
+    of Azure OpenAI with a Streamlit frontend. Each demonstration is described in detail below, highlighting their unique features and functionalities.
     """)
 
     # Horizontal divider
@@ -24,7 +25,7 @@ def main():
     # Chat with Your Data section
     st.markdown("### Generate & Search with Azure OpenAI & Azure AI Search (AI Search Query)")
     st.write("""
-This demo provides an interactive platform for users to manage documents stored in their Azure Blob Container.
+This demo provides an interactive platform for users to chat over documents stored in their Azure Blob Container.
 This is accomplished by indexing the documents in Azure AI Search and employing a combination of semantic and vector search techniques.
 In this demo, we concentrate on real documents that are housed in an Azure Blob Container.
 These documents undergo a process of chunking, after which embeddings from these chunks are stored in Azure AI Search, serving as our vector database.
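To make "serving as our vector database" concrete, the sketch below queries such an index directly with the `azure-search-documents` SDK. It is illustrative only: it assumes the `documents-index` (with `content`, `blob_name`, and `embedding` fields) created by this demo's vectorize step, plus the usual `.env` variables; the demo itself performs the equivalent search through the Azure OpenAI data-sources extension.

```python
# Illustrative sketch: hybrid keyword + vector query against the demo's index
import os

from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.models import VectorizedQuery
from dotenv import load_dotenv
from openai import AzureOpenAI

load_dotenv()

openai_client = AzureOpenAI(
    api_key=os.getenv("AZURE_OPENAI_KEY"),
    api_version=os.getenv("AZURE_OPENAI_VERSION"),
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
)
search_client = SearchClient(
    endpoint=os.getenv("AZURE_SEARCH_ENDPOINT"),
    index_name="documents-index",
    credential=AzureKeyCredential(os.getenv("AZURE_SEARCH_ADMIN_KEY")),
)

# Embed the question with the same deployment used at indexing time
question = "What year was New York State Route 373 built?"
q_emb = openai_client.embeddings.create(
    input=question, model=os.getenv("AZURE_EMBEDDINGS_DEPLOYMENT")
).data[0].embedding

# Keyword search plus a k-NN vector query over the "embedding" field
results = search_client.search(
    search_text=question,
    vector_queries=[VectorizedQuery(vector=q_emb, k_nearest_neighbors=3, fields="embedding")],
    select=["blob_name", "content"],
    top=3,
)
for doc in results:
    print(doc["blob_name"], "->", doc["content"][:80])
```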
diff --git a/notebooks/GenAI/embedding_demos/Demo_Suite_ngrok.py b/notebooks/GenAI/embedding_demos/Demo_Suite_ngrok.py new file mode 100644 index 0000000..c4f1e20 --- /dev/null +++ b/notebooks/GenAI/embedding_demos/Demo_Suite_ngrok.py @@ -0,0 +1,21 @@ +import os +from pyngrok import ngrok, conf +from dotenv import load_dotenv +import subprocess + +# Load environment variables from .env file +load_dotenv() + +# Set up Ngrok configuration +ngrok_config = conf.PyngrokConfig() +conf.set_default(ngrok_config) + +# Authenticate with Ngrok using the auth token +ngrok.set_auth_token(os.getenv("ngrok_key")) + +# Create a public URL for your local Streamlit app +public_url = ngrok.connect(8501) +print("Ngrok Tunnel URL:", public_url) + +# Run your Streamlit app +subprocess.run(["streamlit", "run", "Demo_Suite.py"]) \ No newline at end of file diff --git a/notebooks/GenAI/embedding_demos/pages/AI_Search_Query.py b/notebooks/GenAI/embedding_demos/pages/AI_Search_Query.py index 27c7410..3854703 100644 --- a/notebooks/GenAI/embedding_demos/pages/AI_Search_Query.py +++ b/notebooks/GenAI/embedding_demos/pages/AI_Search_Query.py @@ -21,26 +21,34 @@ from openai import AzureOpenAI import tiktoken from styling import global_page_style -import re +import re # Load environment variables load_dotenv() # Configure Azure AI Search parameters -search_endpoint = os.getenv('AZURE_SEARCH_ENDPOINT') -search_key = os.getenv('AZURE_SEARCH_ADMIN_KEY') +search_endpoint = os.getenv('AZURE_SEARCH_ENDPOINT') # The endpoint for your Azure AI Search instance +search_key = os.getenv('AZURE_SEARCH_ADMIN_KEY') # Your AI Search API key def chat_on_your_data(query, search_index, messages): """ Perform retrieval queries over documents from the Azure AI Search Index. + + Args: + query (str): The user's search query. + search_index (str): The name of the search index. + messages (list): List of messages to display in the chat. 
+ + Returns: + None """ # Configure Azure OpenAI parameters - azure_endpoint = os.getenv('AZURE_OPENAI_ENDPOINT') - azure_openai_api_key = os.getenv('AZURE_OPENAI_KEY') - azure_openai_api_version = os.getenv('AZURE_OPENAI_VERSION') - azure_ada_deployment = os.getenv('AZURE_EMBEDDINGS_DEPLOYMENT') - azure_gpt_deployment = os.getenv('AZURE_GPT_DEPLOYMENT') - + azure_endpoint = os.getenv('AZURE_OPENAI_ENDPOINT') # The endpoint for your Azure OpenAI instance + azure_openai_api_key = os.getenv('AZURE_OPENAI_KEY') # Your Azure OpenAI API key + azure_openai_api_version = os.getenv('AZURE_OPENAI_VERSION') # The version of the Azure OpenAI API you are using + azure_ada_deployment = os.getenv('AZURE_EMBEDDINGS_DEPLOYMENT') # The deployed ADA model for your Azure OpenAI instance + azure_gpt_deployment = os.getenv('AZURE_GPT_DEPLOYMENT') # The deployed GPT model for your Azure OpenAI instance + messages.append({"role": "user", "content": query}) with st.chat_message("user"): @@ -91,25 +99,35 @@ def chat_on_your_data(query, search_index, messages): }] } ) - - response_data = completion.to_dict() + + response_data = completion.to_dict() ai_response = response_data['choices'][0]['message']['content'] - ai_response_cleaned = re.sub(r'\s+\.$', '.', re.sub(r'\[doc\d+\]', '', ai_response)) + + # Clean the AI response + ai_response_cleaned = re.sub(r'\s+\.$', '.', re.sub(r'\[doc\d+\]', '', ai_response)) citation = response_data["choices"][0]["message"]["context"]["citations"][0]["url"] - ai_response_final = f"{ai_response_cleaned}\n\nCitation(s):\n{citation}" + ai_response_final = f"{ai_response_cleaned}\n\nCitation(s):\n{citation}" + messages.append({"role": "assistant", "content": ai_response_final}) + with st.chat_message("assistant"): st.markdown(ai_response_final) def setup_azure_openai(log_text): """ Sets up Azure OpenAI. + + Args: + log_text (streamlit DeltaGenerator): Streamlit log text object for logging setup progress. + + Returns: + AzureOpenAI: Configured AzureOpenAI client instance. """ log_text.write("Setting up Azure OpenAI...") azure_openai = AzureOpenAI( - api_key=os.getenv("Azure_OPENAI_KEY"), - api_version=os.getenv('AZURE_OPENAI_VERSION'), - azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT') + api_key=os.getenv("AZURE_OPENAI_KEY"), # Your Azure OpenAI API key + api_version=os.getenv('AZURE_OPENAI_VERSION'), # The version of the Azure OpenAI API you are using + azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT') # The endpoint for your Azure OpenAI instance ) log_text.write("Azure OpenAI setup complete.") return azure_openai @@ -117,6 +135,12 @@ def setup_azure_openai(log_text): def connect_to_blob_storage(log_text): """ Connects to Azure Blob Storage. + + Args: + log_text (streamlit DeltaGenerator): Streamlit log text object for logging connection progress. + + Returns: + ContainerClient: Azure Blob Storage container client. """ log_text.write("Connecting to Blob Storage...") blob_service_client = BlobServiceClient.from_connection_string(os.getenv("BLOB_CONNECTION_STRING")) @@ -127,13 +151,23 @@ def connect_to_blob_storage(log_text): def split_text_with_metadata(text, metadata, max_length=800, overlap=75, encoding_name='cl100k_base'): """ Splits the text into chunks with metadata. + + Args: + text (str): The text to split. + metadata (dict): Metadata to associate with each chunk. + max_length (int): Maximum length of each chunk in tokens. + overlap (int): Number of tokens to overlap between chunks. + encoding_name (str): Encoding name for tokenization. 
+ + Returns: + list: List of chunks with associated metadata. """ tokenizer = tiktoken.get_encoding(encoding_name) tokens = tokenizer.encode(text) chunks = [] start = 0 end = max_length - + while start < len(tokens): chunk = tokens[start:end] chunk_text = tokenizer.decode(chunk) @@ -150,12 +184,21 @@ def split_text_with_metadata(text, metadata, max_length=800, overlap=75, encodin }) start = end - overlap end = start + max_length - + return chunks def load_blob_content(blob_client): """ Loads and returns the content of the PDF blob. + + Args: + blob_client (BlobClient): Azure Blob Storage blob client. + + Returns: + str: Extracted text from the PDF blob. + + Raises: + ValueError: If the blob is not a PDF file. """ blob_name = blob_client.blob_name if not blob_name.lower().endswith('.pdf'): @@ -164,7 +207,7 @@ def load_blob_content(blob_client): blob_data = blob_client.download_blob().readall() pdf_stream = io.BytesIO(blob_data) document_text = "" - + with pdfplumber.open(pdf_stream) as pdf: for page in pdf.pages: document_text += page.extract_text() + "\n" @@ -174,6 +217,12 @@ def load_blob_content(blob_client): def vectorize(log_text): """ Main function that orchestrates the vector workflow. + + Args: + log_text (streamlit DeltaGenerator): Streamlit log text object for logging workflow progress. + + Returns: + None """ azure_openai = setup_azure_openai(log_text) container_client = connect_to_blob_storage(log_text) @@ -191,9 +240,8 @@ def vectorize(log_text): blob_client = container_client.get_blob_client(blob) try: document = load_blob_content(blob_client) - document_link = f'https://{os.getenv("BLOB_ACCOUNT_NAME")}.blob.core.windows.net/{os.getenv("BLOB_CONTAINER_NAME")}/{blob.name}' - metadata = {"blob_name": blob.name, - "document_link": document_link} + document_link = f'https://{os.getenv("BLOB_ACCOUNT_NAME")}.blob.core.windows.net/{os.getenv("BLOB_CONTAINER_NAME")}/{blob.name}' + metadata = {"blob_name": blob.name, "document_link": document_link} chunks = split_text_with_metadata(document, metadata) documents.extend(chunks) except Exception as e: @@ -230,7 +278,7 @@ def vectorize(log_text): SimpleField(name="id", type=SearchFieldDataType.String, key=True), SearchableField(name="content", type=SearchFieldDataType.String), SearchableField(name="blob_name", type=SearchFieldDataType.String), - SearchableField(name="document_link", type=SearchFieldDataType.String), + SearchableField(name="document_link", type=SearchFieldDataType.String), SearchField( name="embedding", type=SearchFieldDataType.Collection(SearchFieldDataType.Single), @@ -264,8 +312,8 @@ def vectorize(log_text): "id": str(i), "content": documents[i]["text"], "embedding": doc["embedding"], - "blob_name": doc["metadata"]["blob_name"], - "document_link": doc["metadata"]["document_link"] + "blob_name": doc["metadata"]["blob_name"], + "document_link": doc["metadata"]["document_link"] }) search_client.upload_documents(documents=documents_to_upload) log_text.success("Documents uploaded to search index.") @@ -273,9 +321,15 @@ def vectorize(log_text): def main(): """ Main program execution function. + + This function sets up the Streamlit app interface, allowing users to choose between + vectorizing documents or retrieving documents from the Azure AI Search Index. + + Returns: + None """ st.markdown( - f'
', + f'
',
     unsafe_allow_html=True
     )
     st.title("Demo - Azure OpenAI & AI Search")
@@ -287,12 +341,12 @@ def main():
 
     # Task for retrieving documents from Azure AI Search in Streamlit UI
     if task == 'Retrieve':
-        st.write('This demo allows users to chat over the data in the Azure AI Search Index by \
-                 leveraging both semantic and vector search techniques alongside the GPT model. Semantic search enhances the querying process by comprehending \
-                 the meaning and context of user queries, thereby providing more pertinent results. Vector search, on the other hand, employs \
-                 numerical representations of text to identify similar content using cosine similarity. ***For users to effectively \
-                 utilize this demo, it is essential that they have previously created their Azure AI Search Index, by executing the \
-                 "vectorize" task.***')
+        st.write('''This demo allows users to chat over the data in the Azure AI Search Index by
+        leveraging both semantic and vector search techniques alongside the GPT model. Semantic search enhances the querying process by comprehending
+        the meaning and context of user queries, thereby providing more pertinent results. Vector search, on the other hand, employs
+        numerical representations of text to identify similar content using cosine similarity. ***For users to effectively
+        utilize this demo, it is essential that they have previously created their Azure AI Search Index, by executing the
+        "vectorize" task.***''')
 
         if 'messages' not in st.session_state:
             st.session_state.messages = []
@@ -309,10 +363,10 @@ def main():
 
     # Task for embedding documents from Azure Blob to Azure AI Search index in Streamlit UI
     elif task == 'Vectorize':
-        st.write('This demo processes PDF files from Azure Blob Storage, generates embeddings, and uploads them to Azure AI Search for indexing. \
-                 **Please complete this process before performing retrieval.** \
-                 For users to effectively utilize this demo, it is essential that they upload PDF files from the \
-                 [/search_documents](https://github.com/STRIDES/NIHCloudLabAzure/tree/main/notebooks/GenAI/search_documents) directory to Azure Blob container.')
+        st.write('''This demo processes PDF files from Azure Blob Storage, generates embeddings, and uploads them to Azure AI Search for indexing.
+ **Please complete this process before performing retrieval.** + For users to effectively utilize this demo, it is essential that they upload PDF files from the + [/search_documents](https://github.com/STRIDES/NIHCloudLabAzure/tree/main/notebooks/GenAI/search_documents) directory to Azure Blob container.''') if st.button("Start Process"): log_text = st.empty() vectorize(log_text) diff --git a/notebooks/GenAI/embedding_demos/pages/AOAI_Embeddings.py b/notebooks/GenAI/embedding_demos/pages/AOAI_Embeddings.py index e2b807f..6a6c656 100644 --- a/notebooks/GenAI/embedding_demos/pages/AOAI_Embeddings.py +++ b/notebooks/GenAI/embedding_demos/pages/AOAI_Embeddings.py @@ -1,132 +1,224 @@ -from openai import AzureOpenAI -# from openai.embeddings_utils import get_embedding, cosine_similarity # must pip install openai[embeddings] -import pandas as pd -import numpy as np -import os -import streamlit as st -import time -from PIL import Image -from dotenv import load_dotenv -from styling import global_page_style - -# load in .env variables -load_dotenv() - -# configure azure openai keys -# openai.api_type = 'azure' -# openai.api_version = os.environ['AZURE_OPENAI_VERSION'] -# openai.api_base = os.environ['AZURE_OPENAI_ENDPOINT'] -# openai.api_key = os.environ['AZURE_OPENAI_KEY'] - -def get_embedding(text, engine): - client = AzureOpenAI( - api_key=os.getenv("Azure_OPENAI_KEY"), - api_version=os.getenv('AZURE_OPENAI_VERSION'), - azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT') - ) - - embeddings = client.embeddings.create(input = [text], model=engine).data[0].embedding - return embeddings - -def cosine_similarity(a, b): - return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)) - -def embedding_create(): - # acquire the filename to be embed - st.subheader("Vector Creation") - st.write('The process of vectorization involves creating embeddings from the [microsoft-earnings.csv](https://github.com/STRIDES/NIHCloudLabAzure/blob/main/notebooks/GenAI/microsoft-earnings.csv) \ - file located in the specified directory, utilizing the data in the "text" column. These embeddings are derived from pre-chunked text, \ - indicating that the text has already been divided and formatted for embedding generation. The resultant embeddings will be \ - compiled into a new CSV file, which will serve as a vector store for future reference and utilization.') - filename = st.text_input("Enter a file: ", key='filename', value="microsoft-earnings.csv") - - # start the embeddings process if filename provided - if filename: - file_path = os.path.join('..', filename) - # read the data file to be embed - df = pd.read_csv(file_path) - df_placeholder = st.empty() - df_placeholder.dataframe(df, width=2000, height=350) - button_placeholder = st.empty() - if button_placeholder.button("Generate Embeddings"): - # calculate word embeddings - df['embedding'] = df['text'].apply(lambda x:get_embedding(x, engine=os.getenv("AZURE_EMBEDDINGS_DEPLOYMENT"))) - df.to_csv('.\\microsoft-earnings_embeddings.csv') - time.sleep(3) - button_placeholder.success('Embeddings Created Sucessfully!!') - df_placeholder.dataframe(df) - - -def embeddings_search(): - - # Streamlit configuration - st.subheader("Vector Search") - st.write(''' -This process generates embeddings based on user queries, utilizing the compiled CSV that was created, to search for the most similar -documents within the vector store by employing cosine similarity. Example questions a user can ask about the microsoft-earnings.csv: -- What was said about the budget? 
-- How many people utilize GitHub to build software?
-- How many points did Microsoft Cloud gross margin percentage increase by?
-- What are the expectations for the Q2 cash flow?''')
-
-    if 'answer' not in st.session_state:
+from openai import AzureOpenAI
+import pandas as pd
+import numpy as np
+import os
+import streamlit as st
+import time
+from dotenv import load_dotenv
+from styling import global_page_style
+
+# Load environment variables from .env file
+load_dotenv()
+
+def get_embedding(text, engine):
+    """
+    Generate embeddings for a given text using Azure OpenAI.
+
+    This function utilizes the Azure OpenAI service to generate embeddings
+    for the provided text. It retrieves the necessary API key, version, and
+    endpoint from environment variables loaded using load_dotenv().
+
+    Environment Variables:
+        AZURE_OPENAI_KEY: Your Azure OpenAI API key.
+        AZURE_OPENAI_VERSION: The version of the Azure OpenAI API you are using.
+        AZURE_OPENAI_ENDPOINT: The endpoint for your Azure OpenAI instance.
+
+    Args:
+        text (str): The text for which to generate embeddings.
+        engine (str): The deployment model to use for generating embeddings.
+
+    Returns:
+        list: A list representing the generated embeddings.
+    """
+    client = AzureOpenAI(
+        api_key=os.getenv("AZURE_OPENAI_KEY"),  # Your Azure OpenAI API key
+        api_version=os.getenv('AZURE_OPENAI_VERSION'),  # The version of the Azure OpenAI API you are using
+        azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT')  # The endpoint for your Azure OpenAI instance
+    )
+    embeddings = client.embeddings.create(input=[text], model=engine).data[0].embedding
+    return embeddings
+
+def cosine_similarity(a, b):
+    """
+    Calculate the cosine similarity between two vectors.
+
+    Cosine similarity is a metric used to measure how similar two vectors are.
+    It is calculated as the dot product of the vectors divided by the product
+    of their magnitudes.
+
+    Args:
+        a (np.ndarray): The first vector.
+        b (np.ndarray): The second vector.
+
+    Returns:
+        float: The cosine similarity between the two vectors.
+    """
+    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
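+
+# Illustrative check with hypothetical vectors (not part of the demo data):
+# cosine_similarity(np.array([1.0, 0.0]), np.array([1.0, 0.0])) -> 1.0 (same direction)
+# cosine_similarity(np.array([1.0, 0.0]), np.array([0.0, 1.0])) -> 0.0 (orthogonal)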
+
+def embedding_create():
+    """
+    Create embeddings from a specified CSV file and save them.
+
+    This function reads a CSV file specified by the user, generates embeddings
+    for the text data in the "text" column, and saves the embeddings to a new
+    CSV file. The user is prompted to provide the filename, and the embeddings
+    are created using the Azure OpenAI service.
+
+    The function utilizes Streamlit for the user interface:
+    - Displays the input CSV file in a Streamlit dataframe.
+    - Prompts the user to generate embeddings by clicking a button.
+    - Displays the updated dataframe with embeddings.
+    - Saves the embeddings to a new CSV file.
+    """
+    st.subheader("Vector Creation")
+    st.write('''The process of vectorization involves creating embeddings from the
+    [microsoft-earnings.csv](https://github.com/STRIDES/NIHCloudLabAzure/blob/main/notebooks/GenAI/microsoft-earnings.csv) file located in
+    the specified directory, utilizing the data in the "text" column. These embeddings are derived from pre-chunked text,
+    indicating that the text has already been divided and formatted for embedding generation. The resultant embeddings will be
+    compiled into a new CSV file, which will serve as a vector store for future reference and utilization.''')
+
+    # Prompt the user to enter the filename
+    filename = st.text_input("Enter a file: ", key='filename', value="microsoft-earnings.csv")
+
+    # If a filename is provided, proceed to generate embeddings
+    if filename:
+        file_path = os.path.join('..', filename)
+
+        # Read the CSV file into a DataFrame
+        df = pd.read_csv(file_path)
+
+        # Display the DataFrame in the Streamlit app
+        df_placeholder = st.empty()
+        df_placeholder.dataframe(df, width=2000, height=350)
+
+        button_placeholder = st.empty()
+
+        # When the "Generate Embeddings" button is clicked
+        if button_placeholder.button("Generate Embeddings"):
+            with st.spinner("Generating Embeddings. Please hold..."):
+                # Calculate word embeddings for each text entry in the DataFrame
+                df['embedding'] = df['text'].apply(lambda x: get_embedding(x, engine=os.getenv("AZURE_EMBEDDINGS_DEPLOYMENT")))
+
+                # Save the embeddings to a new CSV file
+                df.to_csv('./microsoft-earnings_embeddings.csv')
+
+                # Display success message after a short delay
+                time.sleep(3)
+                button_placeholder.success('Embeddings Created Successfully!!')
+
+                # Update the displayed DataFrame to include the new embeddings
+                df_placeholder.dataframe(df, width=2000, height=350)
+                print(df)
+
+def embeddings_search():
+    """
+    Search for similar documents based on user query embeddings.
+
+    This function generates embeddings for a user-provided search query and
+    compares them with pre-generated embeddings from a CSV file. The most
+    similar documents are identified using cosine similarity, and the top
+    results are displayed to the user.
+
+    The function utilizes Streamlit for the user interface:
+    - Prompts the user to enter a search query.
+    - Generates embeddings for the search query.
+    - Calculates cosine similarity between query embeddings and document embeddings.
+    - Displays the search query, top match, and similarity score.
+    - Displays the top 5 most similar documents in a Streamlit dataframe.
+    """
+    st.subheader("Vector Search")
+    st.write('''This process generates embeddings based on user queries, utilizing the compiled CSV that was created, to search for the most similar
+    documents within the vector store by employing cosine similarity. Example questions a user can ask about the microsoft-earnings.csv:
+    - What was said about the budget?
+    - How many people utilize GitHub to build software?
+    - How many points did Microsoft Cloud gross margin percentage increase by?
+ - What are the expectations for the Q2 cash flow?''') + + # Initialize session state variables if they don't exist + if 'answer' not in st.session_state: st.session_state.answer = [] - if 'score' not in st.session_state: - st.session_state.score = [] - if 'past' not in st.session_state: + if 'score' not in st.session_state: + st.session_state.score = [] + if 'past' not in st.session_state: st.session_state.past = [] - - # read in the embeddings .csv - # convert elements in 'embedding' column back to numpy array - df = pd.read_csv('.\\microsoft-earnings_embeddings.csv') - df['embedding'] = df['embedding'].apply(eval).apply(np.array) - - # caluculate user query embedding - search_term = st.text_area("Enter a search query: ", key='search_term', placeholder="") - if search_term: - st.session_state.past.append(search_term) - search_term_vector = get_embedding(search_term, engine=os.getenv("AZURE_EMBEDDINGS_DEPLOYMENT")) - - # find similiarity between query and vectors - df['similarities'] = df['embedding'].apply(lambda x:cosine_similarity(x, search_term_vector)) - df1 = df.sort_values("similarities", ascending=False).head(5) - - # output the response - answer = df1['text'].loc[df1.index[0]] - score = df1['similarities'].loc[df1.index[0]] - st.session_state.answer.append(answer) - st.session_state.score.append(score) - with st.expander('Vector Search'): - for i in range(len(st.session_state.answer)-1, -1, -1): - st.info(st.session_state.past[i]) - st.write(st.session_state.answer[i]) - st.write('Score: ', st.session_state.score[i]) - with st.expander('Top 5 Results'): - df1 = df1.reset_index(drop=True) + + # Read the embeddings CSV and convert the 'embedding' column back to numpy arrays + df = pd.read_csv('./microsoft-earnings_embeddings.csv') + df['embedding'] = df['embedding'].apply(eval).apply(np.array) + + # Prompt the user to enter a search query + search_term = st.text_area("Enter a search query: ", key='search_term', placeholder="") + + if search_term: + # Store the user's search query + st.session_state.past.append(search_term) + + # Generate the embedding for the search query + search_term_vector = get_embedding(search_term, engine=os.getenv("AZURE_EMBEDDINGS_DEPLOYMENT")) + + # Calculate similarity between the query embedding and document embeddings + df['similarities'] = df['embedding'].apply(lambda x: cosine_similarity(x, search_term_vector)) + + # Get the top 5 most similar documents + df1 = df.sort_values("similarities", ascending=False).head(5) + + # Extract the most similar document and its similarity score + answer = df1['text'].iloc[0] + score = df1['similarities'].iloc[0] + + # Store the answer and score in session state + st.session_state.answer.append(answer) + st.session_state.score.append(score) + + # Display the search history and results + with st.expander('Vector Search'): + for i in range(len(st.session_state.answer) - 1, -1, -1): + st.info(st.session_state.past[i]) + st.write(st.session_state.answer[i]) + st.write('Score: ', st.session_state.score[i]) + + # Display the top 5 search results in a DataFrame + with st.expander('Top 5 Results'): + df1 = df1.reset_index(drop=True) df1.index = df1.index + 1 df1 = df1.rename(columns={'Unnamed: 0': 'Row Number'}) - print(df1) - st.dataframe(df1) - - -def main(): + st.dataframe(df1) + +def main(): + """ + Set up the Streamlit app and handle user interactions. + + This function sets up the main interface of the Streamlit app, including + the title, sidebar, and embedding function selection. 
Based on the user's + selection, it calls either the embedding_create() or embeddings_search() + function to perform the respective actions. + + The function also displays the Microsoft logo at the top of the app. + + The function utilizes Streamlit for the user interface: + - Provides a sidebar for the user to select the embedding function. + - Calls the appropriate function based on user selection. + """ st.markdown( - f'
', - unsafe_allow_html=True - ) - st.title("Demo-Azure OpenAI Embeddings") - # image = Image.open('image_logo2.png') - # st.image(image, caption = '') - st.sidebar.title('Embedding Function Selection') + f'
',
+        unsafe_allow_html=True
+    )
+    st.title("Demo - Azure OpenAI Embeddings")
+
+    # Sidebar for selecting the embedding function
+    st.sidebar.title('Embedding Function Selection')
     chat_style = st.sidebar.radio(
-        'Choose an Embedding function below:',
-        ['Vectorize', 'Retrieve']
-    )
-    if chat_style == 'Vectorize':
-        embedding_create()
-    elif chat_style == 'Retrieve':
-        embeddings_search()
-
-if __name__ == '__main__':
-    global_page_style()
-    main()
+        'Choose an Embedding function below:',
+        ['Vectorize', 'Retrieve']
+    )
+
+    # Call the appropriate function based on user selection
+    if chat_style == 'Vectorize':
+        embedding_create()
+    elif chat_style == 'Retrieve':
+        embeddings_search()
+
+# Entry point of the script
+if __name__ == '__main__':
+    global_page_style()
+    main()
\ No newline at end of file
diff --git a/notebooks/GenAI/embedding_demos/readme.md b/notebooks/GenAI/embedding_demos/readme.md
index b240484..90867af 100644
--- a/notebooks/GenAI/embedding_demos/readme.md
+++ b/notebooks/GenAI/embedding_demos/readme.md
@@ -4,28 +4,193 @@ The Azure OpenAI Demo w/ Streamlit Frontend is designed to host various demonstr
 - [Streamlit Documentation](https://docs.streamlit.io/get-started)
 - [Geeksforgeeks](https://www.geeksforgeeks.org/a-beginners-guide-to-streamlit/)
 
-## Environment Setup
+## Page Contents
++ [Learning Objectives](#learning-objectives)
++ [Prerequisites](#prerequisites)
++ [Overview of Streamlit Scripts](#overview-of-streamlit-scripts)
++ [Executing the Azure OpenAI Demo w/ Streamlit Frontend](#executing-the-azure-openai-demo-w-streamlit-frontend)
++ [Conclusion](#conclusion)
++ [Clean Up](#clean-up)
+
+## Learning Objectives
+1. **Integrate Azure OpenAI with Streamlit**:
+   - Use Azure OpenAI in a Streamlit frontend to interact with data indexes in Azure AI Search.
+2. **Understand Streamlit Scripts**:
+   - Learn the roles and functionalities of `Demo_Suite.py`, `AI_Search_Query.py`, and `AOAI_Embeddings.py`.
+3. **Generate and Query Embeddings**:
+   - Create and query text embeddings using the Azure OpenAI SDK.
+4. **Use Ngrok with Azure ML**:
+   - Set up Ngrok to securely run the demo on Azure ML or a VM.
+5. **Set Up the Development Environment**:
+   - Create a virtual environment, install dependencies, and configure environment variables.
+   - Execute the Streamlit demo locally and in cloud environments.
+
+## Prerequisites
+Before proceeding with this notebook, please ensure that you have the following Azure services deployed and configured. Resources can be deployed manually in the Azure portal or automated by following along with the [ARM Deployment tutorial](../azure_infra_setup/README.md):
+
+1. **Azure OpenAI Service**:
+   - Ensure that you have deployed both a GPT model and an Ada model within your Azure OpenAI instance.
+   - Estimated costs for this service vary based on the model usage and number of API calls.
+   - ***gpt-4o-mini(2024-07-18): $0.15 input/$0.60 output per 1M tokens***
+   - ***text-embedding-3-small(1): $0.00002 per 1K tokens***
+2. **Azure AI Search**:
+   - Your Azure AI Search service should be a minimum of the Basic tier to ensure compatibility with Azure OpenAI.
+   - ***Estimated cost for this service is $0.10 per hour.***
+3. **Azure Blob Storage Account**:
+   - You should have an Azure Blob Storage account with PDF files stored in a blob container. These files should be located in the `/search_documents` directory of the `GenAI` directory.
+   - ***Estimated cost for this service is $0.018 per GB.***
+
+## Overview of Streamlit Scripts
+
+### Demo_Suite.py
+
+**Purpose**:
+This script serves as the home page for the Azure OpenAI Demo site. It is the initial script that Streamlit runs to start the demo, providing users with an overview of the demo site and descriptions of each available demonstration.
+
+**Key Points**:
+- **Location**: This file is located in the root directory (/embedding_demos).
+- **Functionality**:
+  - Launches the Streamlit UI.
+  - Displays descriptive information about the demo site content.
+  - Provides links and descriptions for each demo page.
+- **Usage**: Run this script using the command `streamlit run Demo_Suite.py`.
+
+### AI_Search_Query.py
+
+**Purpose**:
+This script serves as the frontend for the embedding demo that utilizes Azure AI Search and Azure OpenAI SDKs to chat over documents stored in a Blob Storage account.
+
+**Key Points**:
+- **Location**: This file is located in the /pages subdirectory, identifying it as a subpage in the Streamlit interface.
+- **Functionality**:
+  - Allows users to interact with documents stored in the [/search_documents](../search_documents) directory.
+  - Consistent with the [AISearch_RAG_chatbot.ipynb](../notebooks/AISearch_RAG_chatbot.ipynb) tutorial.
+  - Provides a user interface for querying and chatting over the documents using Azure AI Search and OpenAI capabilities.
+- **Usage**: Accessible from the main page (Demo_Suite.py) via the left sidebar.
+
+### AOAI_Embeddings.py
+
+**Purpose**:
+This script serves as the frontend for the embedding demo that uses the Azure OpenAI SDK to generate embeddings for pre-chunked text in the [microsoft-earnings.csv](../microsoft-earnings.csv) file.
+
+**Key Points**:
+- **Location**: This file is located in the /pages subdirectory, identifying it as a subpage in the Streamlit interface.
+- **Functionality**:
+  - Demonstrates how to generate embeddings for text using the Azure OpenAI SDK.
+  - Consistent with the [AzureOpenAI_embeddings.ipynb](../notebooks/AzureOpenAI_embeddings.ipynb) tutorial.
+  - Provides a user interface for generating and viewing embeddings.
+- **Usage**: Accessible from the main page (Demo_Suite.py) via the left sidebar. The condensed sketch below shows the logic this page wraps.
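+
+For orientation, the core of this page reduces to a few lines of Python. The following condensed sketch is illustrative only; it assumes the same `.env` variables used throughout this demo and the pre-chunked CSV one directory up, and mirrors the script's vectorize-then-retrieve logic:
+
+```python
+# Condensed sketch of AOAI_Embeddings.py (illustrative; assumes a populated .env)
+import os
+import numpy as np
+import pandas as pd
+from dotenv import load_dotenv
+from openai import AzureOpenAI
+
+load_dotenv()
+client = AzureOpenAI(
+    api_key=os.getenv("AZURE_OPENAI_KEY"),
+    api_version=os.getenv("AZURE_OPENAI_VERSION"),
+    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
+)
+engine = os.getenv("AZURE_EMBEDDINGS_DEPLOYMENT")
+
+def get_embedding(text, engine):
+    # One embeddings API call per row of pre-chunked text
+    return client.embeddings.create(input=[text], model=engine).data[0].embedding
+
+# Vectorize: embed each pre-chunked row and persist the vector store
+df = pd.read_csv("../microsoft-earnings.csv")
+df["embedding"] = df["text"].apply(lambda x: get_embedding(x, engine))
+df.to_csv("./microsoft-earnings_embeddings.csv")
+
+# Retrieve: embed a query and rank rows by cosine similarity
+query_vector = np.array(get_embedding("What was said about the budget?", engine))
+df["similarities"] = df["embedding"].apply(
+    lambda e: np.dot(e, query_vector) / (np.linalg.norm(e) * np.linalg.norm(query_vector))
+)
+print(df.sort_values("similarities", ascending=False)["text"].iloc[0])
+```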
+
+## Executing the Azure OpenAI Demo w/ Streamlit Frontend
+
+In this phase, you will choose your preferred environment to execute the Azure OpenAI Demo with a Streamlit frontend. You can either use Azure ML or run the demo locally using VSCode. Please choose an execution environment below:
++ [Executing via Azure ML or VM](#executing-via-azure-ml-or-vm)
++ [Executing via VsCode](#executing-via-vscode)
+
+### Executing via Azure ML or VM
+Streamlit's native behavior expects to run applications locally on port 8501, which isn't possible when executing this demo from Azure ML or a VM. **Ngrok** is a tool that creates a secure tunnel to expose a local server to the internet. By using Ngrok, you can securely expose the Streamlit demo app running on Azure ML or a VM to the internet, bypassing the limitation of needing to run it on the local port 8501. This ensures that you can access and interact with the Streamlit application without needing to run it from your local machine.
+
+**Phase 1 - Obtain a free Ngrok Authtoken:**
+
+1. Visit the Ngrok website: https://ngrok.com.
+2. Click on the "Sign Up" button at the top right corner.
+3. Sign up using your email address and create a password, or use your GitHub account.
+4. Check your email for a verification message from Ngrok.
+5. Click the verification link in the email to activate your account.
+6. Log in to your Ngrok account.
+7. Once logged in, you will be taken to your dashboard.
+8. In your dashboard, find and copy your authentication (Authtoken) token. This token will be used for the `ngrok_key` variable in Phase 2, step 4.
+
+**Phase 2 - Create a .env file:**
+1. Open the terminal in your Azure ML Studio or Azure VM instance.
+2. Navigate to the ***/GenAI*** directory:
+   ```bash
+   cd ./notebooks/GenAI
+   ```
+3. Create a `.env` file by executing the following command:
+   ```bash
+   nano .env
+   ```
+4. In the text editor, add the following variables:
+   ```bash
+   AZURE_OPENAI_VERSION = "Your Azure OpenAI API version"
+   AZURE_OPENAI_ENDPOINT = "Your Azure OpenAI API endpoint"
+   AZURE_OPENAI_KEY = "Your Azure OpenAI API key"
+   AZURE_GPT_DEPLOYMENT = "Your Azure OpenAI deployed GPT model name"
+   AZURE_EMBEDDINGS_DEPLOYMENT = "Your Azure OpenAI deployed ADA model name"
+   AZURE_SEARCH_ENDPOINT = "Your Azure AI Search API endpoint"
+   AZURE_SEARCH_ADMIN_KEY = "Your Azure AI Search API key"
+   AZURE_SEARCH_INDEX = "documents-index" # The index name 'documents-index' is used as default in this demo
+   BLOB_ACCOUNT_NAME = "Your Azure Blob Storage account name" # Used to build document links during vectorization
+   BLOB_CONTAINER_NAME = "Your Azure Blob Container name hosting files from /search_documents"
+   BLOB_CONNECTION_STRING = "Your Azure Blob connection string"
+   ngrok_key = "Your Ngrok Authtoken from Phase 1"
+   ```
+5. Save the `.env` file and exit the text editor:
+   - Press `Ctrl + X` to exit the text editor.
+   - Press `Y` to confirm saving the changes to the file.
+   - Press `Enter` to confirm the file name to be saved (the filename will be `.env` since we used the `nano .env` command).
+6. Output the `.env` variables from terminal to ensure all variables are present:
+   ```bash
+   cat .env
+   ```
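+
+Optionally, you can sanity-check the file from Python before launching the demo. This illustrative helper (not part of the demo scripts) loads the `.env` with python-dotenv, exactly as the demo pages do, and reports any missing keys:
+
+```python
+# Illustrative .env check (assumes the variable names listed above)
+import os
+from dotenv import load_dotenv
+
+load_dotenv()
+required = [
+    "AZURE_OPENAI_VERSION", "AZURE_OPENAI_ENDPOINT", "AZURE_OPENAI_KEY",
+    "AZURE_GPT_DEPLOYMENT", "AZURE_EMBEDDINGS_DEPLOYMENT",
+    "AZURE_SEARCH_ENDPOINT", "AZURE_SEARCH_ADMIN_KEY", "AZURE_SEARCH_INDEX",
+    "BLOB_ACCOUNT_NAME", "BLOB_CONTAINER_NAME", "BLOB_CONNECTION_STRING",
+    "ngrok_key",
+]
+missing = [key for key in required if not os.getenv(key)]
+if missing:
+    print("Missing .env keys:", missing)
+else:
+    print("All expected .env keys are present.")
+```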
+
+**Phase 3 - Configure the virtual environment:**
+1. If not already in ***/GenAI***, navigate there by:
+   ```bash
+   cd ./notebooks/GenAI
+   ```
+2. Create a virtual environment in ***/GenAI*** by executing the following command:
+   ```bash
+   python3 -m venv --clear .venv
+   ```
+   ***Note: This command will create a virtual environment named .venv in /GenAI***
+
+3. Activate the virtual environment:
+   ```bash
+   source .venv/bin/activate
+   ```
+4. Install all required libraries from the provided [requirements.txt](../requirements.txt) file to the virtual environment.
+   ```bash
+   pip install -r requirements.txt
+   ```
+
+**Phase 4 - Execute the Streamlit demo:**
+1. Navigate to the /embedding_demos directory (location of the Streamlit demo):
+   ```bash
+   cd ./embedding_demos
+   ```
+2. Execute the Streamlit demo:
+   - Run the [Demo_Suite_ngrok.py](./Demo_Suite_ngrok.py) file. This file will generate the secure Ngrok tunnel to access the Streamlit app. You do not need to run the `Demo_Suite.py` file.
+   ```bash
+   python Demo_Suite_ngrok.py
+   ```
+   ***Note: Access the Streamlit site from the provided URL in the terminal.***
+
+   %Insert Image Here%
+
+
+
+### Executing via VsCode
 To execute this demo, be sure to complete the following steps:
-1. Create a virtual environment in the \GenAI directory.
+1. Create a virtual environment in the /GenAI directory.
    - Navigate to the /GenAI directory:
    ```sh
-   cd .\notebooks\GenAI
+   cd ./notebooks/GenAI
    ```
    - Create the virtual environment:
    ```
-   python -m venv venv
+   python -m venv .venv
    ```
-   - ***Note: This command will create a virtual environment named venv in \GenAI***
+   - ***Note: This command will create a virtual environment named .venv in /GenAI***
    - Activate the virtual environment:
      - On ***Windows***:
      ```sh
-     venv\Scripts\activate
+     .venv\Scripts\activate
      ```
      - On ***macOS/Linux***:
      ```sh
-     source venv/bin/activate
+     source .venv/bin/activate
      ```
 
 2. Install all required libraries from the provided requirements.txt file.
@@ -47,12 +212,18 @@ To execute this demo, be sure to complete the following steps:
        BLOB_CONNECTION_STRING = "Your Azure Blob connection string"
    ```
 
-4. Navigate to the /embeddings directory (location of the Streamlit demo)
+4. Navigate to the /embedding_demos directory (location of the Streamlit demo)
   ```sh
-  cd .\notebooks\GenAI\embedding_demos
+  cd ./embedding_demos
   ```
 5. Execute the Streamlit demo
   ```sh
   streamlit run Demo_Suite.py
-  ```
\ No newline at end of file
+  ```
+
+## Conclusion
+By completing the "Azure OpenAI Demo w/ Streamlit Frontend" tutorial, you have gained valuable hands-on experience in integrating Azure OpenAI services with a Streamlit frontend. You have learned how to set up and configure essential components, including Azure OpenAI, Azure AI Search, and Azure Blob Storage. Additionally, you have explored the functionalities of key scripts and understood how to generate and query embeddings for interactive applications. This tutorial also guided you through executing the demo both on Azure ML using Ngrok and locally using VSCode, ensuring you are equipped to handle different deployment scenarios. We hope this tutorial has been informative and empowers you to leverage Azure OpenAI and Streamlit for your future projects.
+
+## Clean Up
+Make sure to shut down your Azure ML compute and if desired you can delete your Azure AI Search service, Azure Blob Storage Account, and Azure OpenAI service. ***Note these services can be used in other tutorials in this notebook.***
\ No newline at end of file
diff --git a/notebooks/GenAI/example_scripts/workshop_embedding.py b/notebooks/GenAI/example_scripts/workshop_embedding.py
index dc4690f..94ec447 100644
--- a/notebooks/GenAI/example_scripts/workshop_embedding.py
+++ b/notebooks/GenAI/example_scripts/workshop_embedding.py
@@ -3,31 +3,51 @@ import os
 from dotenv import load_dotenv
 import time
+import threading
+import itertools
 
-
-# load in variables from .env
+# Load in variables from .env
 load_dotenv()
 
-#create embeddings functions to apply to a given column
+# Create embeddings function to apply to a given column
 def get_embedding(text, engine):
     client = AzureOpenAI(
         api_key=os.getenv("Azure_OPENAI_KEY"),
         api_version=os.getenv('AZURE_OPENAI_VERSION'),
         azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT')
     )
-    embeddings = client.embeddings.create(input = [text], model=engine).data[0].embedding
+    embeddings = client.embeddings.create(input=[text], model=engine).data[0].embedding
     return embeddings
 
-
-# read the data file to be embed
+# Function to display loading pattern
+def loading_pattern():
+    for c in itertools.cycle(['|', '/', '-', '\\']):
+        if done:
+            break
+        print('\rLoading ' + c, end='')
+        time.sleep(0.1)
+    print('\rDone!
') + +# Start the loading pattern in a separate thread +done = False +loading_thread = threading.Thread(target=loading_pattern) +loading_thread.start() + +# Read the data file to be embedded df = pd.read_csv(os.path.join('..', 'microsoft-earnings.csv')) print(df) -# calculate word embeddings -df['embedding'] = df['text'].apply(lambda x:get_embedding(x, engine=os.getenv("AZURE_EMBEDDINGS_DEPLOYMENT"))) -df.to_csv('.\\microsoft-earnings_embeddings.csv', index=False) -time.sleep(3) -print(df) +# Calculate word embeddings +df['embedding'] = df['text'].apply(lambda x: get_embedding(x, engine=os.getenv("AZURE_EMBEDDINGS_DEPLOYMENT"))) +# Save the DataFrame with embeddings to the specified path +file_path = os.path.join('.', 'microsoft-earnings_embeddings.csv') +df.to_csv(file_path, index=False) +# Stop the loading pattern +done = True +loading_thread.join() +# Print the DataFrame with embeddings +time.sleep(3) +print(df) diff --git a/notebooks/GenAI/example_scripts/workshop_search.py b/notebooks/GenAI/example_scripts/workshop_search.py index d9f16c2..3a825c5 100644 --- a/notebooks/GenAI/example_scripts/workshop_search.py +++ b/notebooks/GenAI/example_scripts/workshop_search.py @@ -24,7 +24,8 @@ def cosine_similarity(a, b): # read in the embeddings .csv # convert elements in 'embedding' column back to numpy array -df = pd.read_csv('.\\microsoft-earnings_embeddings.csv') +file_path = os.path.join('.', 'microsoft-earnings_embeddings.csv') +df = pd.read_csv(file_path) df['embedding'] = df['embedding'].apply(eval).apply(np.array) # caluculate user query embedding diff --git a/notebooks/GenAI/notebooks/AISearch_RAG_chatbot.ipynb b/notebooks/GenAI/notebooks/AISearch_RAG_chatbot.ipynb index 6bcd016..1f08a6e 100644 --- a/notebooks/GenAI/notebooks/AISearch_RAG_chatbot.ipynb +++ b/notebooks/GenAI/notebooks/AISearch_RAG_chatbot.ipynb @@ -6,32 +6,40 @@ "source": [ "# Document Embedding and Indexing with Azure OpenAI and AI Search\n", "\n", - "## Introduction \n", - "This notebook is designed to help developers build applications that utilize various Azure services to process and retrieve data. The main goal is to pull files from Azure Blob Storage, generate embeddings using Azure OpenAI, store these documents with custom metadata in an Azure AI Index, and then interact with the indexed data via Azure OpenAI. \n", - "### Objectives \n", - "1. **Vectorize**: \n", - " - Pull files from Azure Blob Containers. \n", - " - Generate embeddings using Azure OpenAI. \n", - " - Store documents with custom metadata in an Azure AI Index. \n", - "2. **Retrieve**: \n", - " - Chat over the data indexes with Azure OpenAI. \n", - "Each section of the notebook will focus on specific tasks and utilize the REST APIs provided by each Azure service to accomplish these tasks. By the end of this notebook, you will have a comprehensive understanding of how to integrate and use these Azure services to build a robust data processing and retrieval application. \n", + "## Overview \n", + "This tutorial provides a step-by-step guide on how to pull files from [Azure Blob Storage](https://learn.microsoft.com/en-us/azure/storage/blobs/storage-blobs-introduction), generate embeddings for these files, and store the embeddings in an [Azure AI Search](https://learn.microsoft.com/en-us/azure/search/search-what-is-azure-search) index. Embeddings are numerical representations of text that capture the semantic meaning of the content, facilitating advanced search and analysis. 
An index in AI Search is a data structure that organizes these embeddings to improve the speed and efficiency of search queries. Additionally, this tutorial demonstrates how to enable users to interact with these embedding indexes through Azure AI Search and [Azure OpenAI](https://learn.microsoft.com/en-us/azure/ai-services/openai/overview), effectively allowing them to chat over the original files from Azure Blob Storage.\n",
+    "### Learning Objectives \n",
+    "1. **Vectorization**:\n",
+    "   - Learn how to extract files from Azure Blob Storage.\n",
+    "   - Understand how to generate embeddings using Azure OpenAI.\n",
+    "   - Discover how to store documents with custom metadata in an Azure AI Index.\n",
+    "2. **Retrieval**:\n",
+    "   - Gain skills in interacting with Azure AI Search indexes using Azure OpenAI.\n",
+    "\n",
+    "Each section of this notebook will guide you through specific tasks and demonstrate how to utilize the REST APIs provided by each Azure service. By the end of this notebook, you will have a comprehensive understanding of how to integrate and utilize these Azure services to develop a robust data processing and retrieval application. \n",
    "### Prerequisites \n",
-    "Before proceeding with this notebook, please ensure that you have the following Azure services deployed and configured: \n",
+    "Before proceeding with this notebook, please ensure that you have the following Azure services deployed and configured. Resources can be deployed manually in the Azure portal or automated by following along with the [ARM Deployment tutorial](../azure_infra_setup/README.md): \n",
    " \n",
    "1. **Azure OpenAI Service**: \n",
-    " - Ensure that you have deployed both a GPT model and an Ada model within your Azure OpenAI instance. \n",
+    " - Ensure that you have deployed both a GPT model and an Ada model within your Azure OpenAI instance.\n",
+    " - Estimated costs for this service vary based on the model usage and number of API calls.\n",
+    " - ***gpt-4o-mini(2024-07-18): $0.15 input/$0.60 output per 1M tokens***\n",
+    " - ***text-embedding-3-small(1): $0.00002 per 1K tokens***\n",
    "2. **Azure AI Search**: \n",
    " - Your Azure AI Search service should be a minimum of the Basic tier to ensure compatibility with Azure OpenAI. \n",
+    " - ***Estimated cost for this service is $0.10 per hour.***\n",
    "3. **Azure Blob Storage Account**: \n",
-    " - You should have an Azure Blob Storage account with PDF files stored in a blob container. These files should be located in the `/search_documents` directory of the `GenAI` directory. "
+    " - You should have an Azure Blob Storage account with PDF files stored in a blob container. These files should be located in the `/search_documents` directory of the `GenAI` directory. \n",
+    " - ***Estimated cost for this service is $0.018 per GB.***"
   ]
  },
 {
  "cell_type": "markdown",
  "metadata": {},
  "source": [
-    "## 0. Environment Setup \n",
+    "## Get Started \n",
+    "\n",
+    "### 0. Environment Setup \n",
    "This section will guide you through setting up the environment for the notebook. We will import the necessary libraries, load environment variables, and configure Azure AI Search parameters. 
" ] }, @@ -50,7 +58,7 @@ "metadata": {}, "outputs": [], "source": [ - "%pip install -r ../requirements.txt # Will Install all packages from the requirements.txt file into your .venv" + "%pip install -r ../requirements.txt " ] }, { @@ -63,7 +71,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -124,9 +132,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Load environment variables \n", "load_dotenv() \n", @@ -165,7 +184,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -209,7 +228,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -237,7 +256,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -262,7 +281,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -324,7 +343,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -525,22 +544,9 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "User: What year was New York State Route 373 built?\n", - "Processing...\n", - "GPT Response: New York State Route 373 was designated in 1930.\n", - "\n", - "Citation(s):\n", - "https://syndatastg.blob.core.windows.net/testcloudlab/New_York_State_Route_373.pdf\n" - ] - } - ], + "outputs": [], "source": [ "def chat_on_your_data(): \n", " \"\"\" \n", @@ -635,7 +641,7 @@ "1. **Define Query and Search Index**:\n", " - Set up the user query and the name of the search index to be created.\n", " - Default values are provided:\n", - " - Example query: `\"What year did the hurricane Irene occur?\"`\n", + " - Example query: `\"What year was the New York State Route 373 built?\"`\n", " - Search index: `\"documents-index\"`\n", "2. **Configure Azure OpenAI Parameters**:\n", " - Retrieve necessary configurations and API keys from environment variables.\n", @@ -693,6 +699,14 @@ " - Executed a user query and performed a search using Azure AI Search.\n", " - Generated a chat completion based on the search results and formatted it for display." ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Clean Up\n", + "Make sure to shut down your Azure ML compute and if desired you can delete your Azure AI Search service, Azure Blob Storage Account, and Azure OpenAI service. 
***Note these services can be used in other tutorials in this notebook.***" + ] } ], "metadata": { diff --git a/notebooks/GenAI/requirements.txt b/notebooks/GenAI/requirements.txt index 1f1f464..4f82915 100644 --- a/notebooks/GenAI/requirements.txt +++ b/notebooks/GenAI/requirements.txt @@ -7,4 +7,5 @@ azure-search-documents azure-identity azure-storage-blob pdfplumber -tiktoken \ No newline at end of file +tiktoken +pyngrok \ No newline at end of file From 66c4431f589837f8bda0148c0646d9419b8e1734 Mon Sep 17 00:00:00 2001 From: cjackson202 <134412115+cjackson202@users.noreply.github.com> Date: Sat, 11 Jan 2025 08:40:15 -0500 Subject: [PATCH 6/7] Update readme.md --- notebooks/GenAI/embedding_demos/readme.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebooks/GenAI/embedding_demos/readme.md b/notebooks/GenAI/embedding_demos/readme.md index 90867af..6a20cc1 100644 --- a/notebooks/GenAI/embedding_demos/readme.md +++ b/notebooks/GenAI/embedding_demos/readme.md @@ -14,7 +14,7 @@ The Azure OpenAI Demo w/ Streamlit Frontend is designed to host various demonstr ## Learning Objectives 1. **Integrate Azure OpenAI with Streamlit**: - - Use Azure OpenAI in a Streamlit frontend to interact with data indexes in Azure AI Search. + - Use Azure OpenAI in a Streamlit frontend. 2. **Understand Streamlit Scripts**: - Learn the roles and functionalities of `Demo_Suite.py`, `AI_Search_Query.py`, and `AOAI_Embeddings.py`. 3. **Generate and Query Embeddings**: From e206509afbe0592409105a699112031bcc1d6e14 Mon Sep 17 00:00:00 2001 From: cjackson202 <134412115+cjackson202@users.noreply.github.com> Date: Tue, 21 Jan 2025 11:12:19 -0500 Subject: [PATCH 7/7] Update readme.md --- notebooks/GenAI/embedding_demos/readme.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/notebooks/GenAI/embedding_demos/readme.md b/notebooks/GenAI/embedding_demos/readme.md index 6a20cc1..1047eb2 100644 --- a/notebooks/GenAI/embedding_demos/readme.md +++ b/notebooks/GenAI/embedding_demos/readme.md @@ -165,7 +165,7 @@ Streamlit's native behavior expects to run applications locally on port 8501, wh ``` ***Note: Access the Streamlit site from the provided URL in the terminal.*** - %Insert Image Here% + ![Image](https://github.com/user-attachments/assets/663053ed-957f-4ec3-b8b1-340b820852cc) @@ -226,4 +226,4 @@ To excute this demo, be sure to complete the following steps: By completing the "Azure OpenAI Demo w/ Streamlit Frontend" tutorial, you have gained valuable hands-on experience in integrating Azure OpenAI services with a Streamlit frontend. You have learned how to set up and configure essential components, including Azure OpenAI, Azure AI Search, and Azure Blob Storage. Additionally, you have explored the functionalities of key scripts and understood how to generate and query embeddings for interactive applications. This tutorial also guided you through executing the demo both on Azure ML using Ngrok and locally using VSCode, ensuring you are equipped to handle different deployment scenarios. We hope this tutorial has been informative and empowers you to leverage Azure OpenAI and Streamlit for your future projects. ## Clean Up -Make sure to shut down your Azure ML compute and if desired you can delete your Azure AI Search service, Azure Blob Storage Account, and Azure OpenAI service. 
***Note these services can be used in other tutorials in this notebook.*** \ No newline at end of file +Make sure to shut down your Azure ML compute and if desired you can delete your Azure AI Search service, Azure Blob Storage Account, and Azure OpenAI service. ***Note these services can be used in other tutorials in this notebook.***