diff --git a/.gitignore b/.gitignore
index 89929d7e..f8fb36aa 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,4 +12,6 @@
 appsettings.json
 __pycache__
 
-.cache
\ No newline at end of file
+.cache
+
+.idea/
\ No newline at end of file
diff --git a/ai-services/multilingual-agent/.env b/ai-services/multilingual-agent/.env
new file mode 100644
index 00000000..83fcedfe
--- /dev/null
+++ b/ai-services/multilingual-agent/.env
@@ -0,0 +1,8 @@
+SPEECH_API_KEY=""
+SPEECH_REGION=""
+TRANSLATION_KEY=""
+TRANSLATION_REGION=""
+OPENAI_KEY=""
+OPENAI_ENDPOINT=""
+ASSISTANT_ID=""
+BING_RESOURCE_ID=""
diff --git a/ai-services/multilingual-agent/.gitignore b/ai-services/multilingual-agent/.gitignore
new file mode 100644
index 00000000..acc8a432
--- /dev/null
+++ b/ai-services/multilingual-agent/.gitignore
@@ -0,0 +1,5 @@
+*.log
+
+.idea/
+
+.env
diff --git a/ai-services/multilingual-agent/LICENSE b/ai-services/multilingual-agent/LICENSE
new file mode 100644
index 00000000..6401c462
--- /dev/null
+++ b/ai-services/multilingual-agent/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2024 Chris Ayers
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/ai-services/multilingual-agent/README.md b/ai-services/multilingual-agent/README.md
new file mode 100644
index 00000000..67e8f293
--- /dev/null
+++ b/ai-services/multilingual-agent/README.md
@@ -0,0 +1,105 @@
+# AI-in-a-Box Multilingual Agent
+
+
+|||
+|:---| ---:|
+|This solution is part of the AI-in-a-Box framework developed by the team of Microsoft Customer Engineers and Architects to accelerate the deployment of AI and ML solutions. Our goal is to simplify the adoption of AI technologies by providing ready-to-use accelerators that ensure quality, efficiency, and rapid deployment.| AI-in-a-box Logo: Description |
+
+## User Story
+![multilingual-agent](./media/multilingual-agent.jpg)
+
+
+This is the WHY
+
+Insert an image here that tells an interesting story about the solution being delivered
+
+Describe how this solution can help a user's organization, including examples of how this solution could help specific industries
+
+Describe what makes this solution stand out, along with other reasons someone would want to deploy it. Here are some ideas that you may wish to consider:
+
+- **Speed and Efficiency**: How does this solution accelerate the deployment of AI/ML solutions?
+- **Cost-Effectiveness**: In what ways does it help save on development costs?
+- **Quality and Reliability**: What measures are in place to ensure the high quality and reliability of your solution?
+- **Competitive Edge**: How does it give users a competitive advantage in their domain?
+
+## What's in the Box
+
+- CLI (command line interface) that allows users to talk with the assistant by voice
+- Orchestrator that is responsible for the conversation loop (sketched below):
+  1. Capturing the user's voice and converting it to text
+  2. Translating the text into the assistant's language (English)
+  3. Sending the text to the assistant
+  4. Translating the response into the user's language
+  5. Converting the text to voice and playing it back to the user
+- Deployment of all Azure resources needed:
+  - Azure AI Services (we will be using the Speech and Translator services)
+  - Azure OpenAI
+- Creation of the Azure OpenAI Assistant
+  - Integrated with the Browser tool (grounded with Bing Search) for up-to-date information
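+
+A condensed, illustrative sketch of that loop as implemented in `cli.py` (error handling, thread setup, and the skip-translation shortcut for English input are omitted; `cli` stands for the `Cli` instance wired up in `app.py`, and the function name `conversation_loop` is used here for illustration only):
+
+```python
+from azure.cognitiveservices.speech import PropertyId
+
+def conversation_loop(cli):
+    while True:
+        result = cli.speech_recognizer.recognize_once()   # 1. speech -> text
+        detected = result.properties[
+            PropertyId.SpeechServiceConnection_AutoDetectSourceLanguageResult]
+        text = cli.translate(result.text, "en")           # 2. user language -> English
+        reply = cli.assistant(text)                       # 3. ask the assistant
+        reply = cli.translate(reply, detected)            # 4. English -> user language
+        cli.speech_synthesizer.speak_text(reply)          # 5. text -> speech
+```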
+
+This is WHAT they get when they deploy the solution
+
+Describe any helpful technical benefits of this solution (for example, deploys key vault for storing keys securely, UAMI for easy and secure integration)
+
+Describe what Azure Resources are deployed
+
+Include Architecture Diagrams including inputs and outputs
+
+Provide links to any associated blogs about this solution (any FTA blogs you wrote that provide more details)
+
+## Thinking Outside of the Box
+
+This is a WHY and a WHAT
+
+Describe ways users can customize and enhance the solution for use inside their organization
+
+## Deploy the Solution
+
+Provide instructions on how to deploy the solution:
+
+1. **Prerequisites**: List any requirements for using this solution (e.g., software, accounts).
+2. **Installation**: Step-by-step instructions for deploying the solution to Azure.
+3. **Post Deployment**: Include any instructions that the user may need to do after the resources have been deployed; for example, upload files to blob storage, create an ML or an AI Services project
+
+## Run the Solution
+
+Include instructions on how they can run and test the solution
+
+To run locally: install the dependencies with `pip install -r requirements.txt`, fill in the keys, regions, and endpoint in `.env`, then start the CLI with `python app.py` and speak into the default microphone. Replies are played through the default speaker.
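+
+If `ASSISTANT_ID` is left empty, `app.py` creates a brand-new assistant on every start. To reuse a single assistant across runs, you can create it once and store its id in `.env`, for example with a one-off helper like the sketch below (illustrative; it reuses `create_assistant` from `assistant.py` and the same environment variables as `app.py`):
+
+```python
+# Bootstrap sketch: create the assistant once and print its id,
+# then copy the printed value into .env as ASSISTANT_ID.
+import os
+
+from dotenv import load_dotenv
+from openai import AzureOpenAI
+
+from assistant import create_assistant
+
+load_dotenv()
+
+client = AzureOpenAI(
+    api_key=os.getenv("OPENAI_KEY"),
+    api_version="2024-07-01-preview",
+    azure_endpoint=os.getenv("OPENAI_ENDPOINT"),
+    default_headers={"X-Ms-Enable-Preview": "true"},
+)
+
+print(create_assistant(client).id)
+```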
+
+## Customize the Solution
+
+Describe different ideas on how to enhance or customize the solution for their use cases
+
+## How to Contribute
+
+This project welcomes contributions and suggestions. Most contributions require you to agree to a Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com.
+
+When you submit a pull request, a CLA bot will automatically determine whether you need to provide a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions provided by the bot. You will only need to do this once across all repos using our CLA.
+
+This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq) or contact opencode@microsoft.com with any additional questions or comments.
+
+## Key Contacts & Contributors
+
+Highlight the main contacts for the project and acknowledge contributors. You can adapt the structure from AI-in-a-Box:
+
+| Contact | GitHub ID | Email |
+|---------|-----------|-------|
+| Your Name | @YourGitHub | your.email@example.com |
+
+## Acknowledgments
+
+If applicable, offer thanks to individuals, organizations, or projects that helped inspire or support your project.
+
+## License
+
+This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft trademarks or logos is subject to and must follow [Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general). Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship. Any use of third-party trademarks or logos is subject to those third parties' policies.
+
+---
+
+This project is part of the AI-in-a-Box series, aimed at providing the technical community with tools and accelerators to implement AI/ML solutions efficiently and effectively.
\ No newline at end of file
diff --git a/ai-services/multilingual-agent/app.py b/ai-services/multilingual-agent/app.py
new file mode 100644
index 00000000..ed516485
--- /dev/null
+++ b/ai-services/multilingual-agent/app.py
@@ -0,0 +1,61 @@
+import os
+import logging
+
+from dotenv import load_dotenv
+from openai import AzureOpenAI
+from azure.cognitiveservices.speech import SpeechConfig, SpeechRecognizer, AutoDetectSourceLanguageConfig, SpeechSynthesizer
+from azure.cognitiveservices.speech.audio import AudioOutputConfig
+from azure.ai.translation.text import TextTranslationClient, TranslatorCredential
+
+from cli import Cli
+from assistant import create_assistant
+
+load_dotenv()
+
+logger = logging.getLogger(__name__)
+
+
+if __name__ == "__main__":
+    try:
+        logging.basicConfig(filename='app.log', level=logging.INFO)
+
+        speech_key = os.getenv("SPEECH_API_KEY")
+        speech_region = os.getenv("SPEECH_REGION")
+        translation_key = os.getenv("TRANSLATION_KEY")
+        translation_region = os.getenv("TRANSLATION_REGION")
+
+        openai_client = AzureOpenAI(
+            api_key=os.getenv("OPENAI_KEY"),
+            api_version="2024-07-01-preview",
+            azure_endpoint=os.getenv("OPENAI_ENDPOINT"),
+            default_headers={"X-Ms-Enable-Preview": "true"}
+        )
+
+        assistant_id = os.getenv("ASSISTANT_ID")
+
+        if assistant_id is None or assistant_id == "":
+            assistant_id = create_assistant(openai_client).id
+            # Logged at INFO so it shows up at the configured logging level.
+            logger.info("created new assistant with id {}".format(assistant_id))
+
+        speech_config = SpeechConfig(subscription=speech_key, region=speech_region)
+
+        auto_detect_config = AutoDetectSourceLanguageConfig(languages=["en-US", "fr-FR", "pt-BR"])
+        speech_recognizer = SpeechRecognizer(speech_config=speech_config, auto_detect_source_language_config=auto_detect_config)
+
+        audio_config = AudioOutputConfig(use_default_speaker=True)
+        speech_synthesizer = SpeechSynthesizer(speech_config=speech_config, audio_config=audio_config)
+
+        translator_credential = TranslatorCredential(key=translation_key, region=translation_region)
+        text_translator = TextTranslationClient(credential=translator_credential)
+
+        runner = Cli(
+            openai_client=openai_client,
+            assistant_id=assistant_id,
+            speech_recognizer=speech_recognizer,
+            speech_synthesizer=speech_synthesizer,
+            text_translator=text_translator
+        )
+
+        runner.run()
+    except Exception:
+        # Log the failure with traceback, then re-raise without losing it.
+        logger.exception("application failed")
+        raise
diff --git a/ai-services/multilingual-agent/assistant.py b/ai-services/multilingual-agent/assistant.py
new file mode 100644
index 00000000..a035b3d0
--- /dev/null
+++ b/ai-services/multilingual-agent/assistant.py
@@ -0,0 +1,31 @@
+import os
+
+from openai import AzureOpenAI
+
+
+def create_assistant(client: AzureOpenAI):
+    return client.beta.assistants.create(
+        name="Travel planner copilot",
+        instructions='''
+You are a travel planner that helps people plan trips across the world.
+The user might give you constraints like:
+- destination
+- weather preference
+- attractions preference
+- date preference
+When asked for up-to-date information, you should use the browser tool.
+You should try to give a plan in the following format:
+- city
+- start and end date
+- cost breakdown
+- weather forecast
+- attractions and any useful information about tickets.
+        ''',
+        tools=[{
+            "type": "browser",
+            "browser": {
+                "bing_resource_id": os.getenv("BING_RESOURCE_ID")
+            }
+        }],
+        model="gpt-4-1106-preview",
+    )
diff --git a/ai-services/multilingual-agent/cli.py b/ai-services/multilingual-agent/cli.py
new file mode 100644
index 00000000..88e5dc81
--- /dev/null
+++ b/ai-services/multilingual-agent/cli.py
@@ -0,0 +1,109 @@
+import logging
+
+from openai import AzureOpenAI
+from azure.cognitiveservices.speech import SpeechRecognizer, SpeechSynthesizer, ResultReason, CancellationReason, PropertyId
+from azure.ai.translation.text import TextTranslationClient
+from azure.ai.translation.text.models import InputTextItem
+
+from event_handler import EventHandler
+
+
+logger = logging.getLogger(__name__)
+
+base_language = 'en'
+
+
+class Cli:
+    def __init__(self,
+                 openai_client: AzureOpenAI,
+                 assistant_id: str,
+                 speech_recognizer: SpeechRecognizer,
+                 speech_synthesizer: SpeechSynthesizer,
+                 text_translator: TextTranslationClient):
+        self.openai_client = openai_client
+        self.assistant_id = assistant_id
+        self.speech_recognizer = speech_recognizer
+        self.speech_synthesizer = speech_synthesizer
+        self.text_translator = text_translator
+        self.language = ''
+        self.thread_id = ''
+
+    def run(self):
+        thread = self.openai_client.beta.threads.create()
+        self.thread_id = thread.id
+
+        print("Say something...")
+
+        while True:
+            try:
+                user_input = self.recognize()
+
+                base_language_text = user_input
+                if not self.language.startswith(base_language):
+                    base_language_text = self.translate(text=user_input, language=base_language)
+
+                output_text = self.assistant(content=base_language_text)
+
+                if not self.language.startswith(base_language):
+                    output_text = self.translate(text=output_text, language=self.language)
+
+                self.synthesize(output_text)
+            except Exception as e:
+                logger.error("failure: {}".format(e))
+                continue
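+
+    # recognize_once() listens for a single utterance on the default
+    # microphone. The locale detected by the Speech service (e.g. "fr-FR")
+    # is read back from the AutoDetectSourceLanguageResult property and
+    # cached so replies can be translated back into the user's language.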
+    def recognize(self) -> str:
+        response = self.speech_recognizer.recognize_once()
+
+        reason = response.reason
+        if reason != ResultReason.RecognizedSpeech:
+            error = 'Failed to recognize speech.'
+            if reason == ResultReason.NoMatch:
+                error = "No speech could be recognized: {}".format(response.no_match_details)
+            elif reason == ResultReason.Canceled:
+                cancellation_details = response.cancellation_details
+                error = "Speech Recognition canceled: {}".format(cancellation_details.reason)
+                if cancellation_details.reason == CancellationReason.Error:
+                    error += " Error details: {}".format(cancellation_details.error_details)
+            raise Exception("Speech recognition failed with error: {}".format(error))
+
+        self.language = response.properties[PropertyId.SpeechServiceConnection_AutoDetectSourceLanguageResult]
+        logger.info("Recognized (language={}): {}".format(self.language, response.text))
+
+        return response.text
+
+    def synthesize(self, text: str) -> None:
+        response = self.speech_synthesizer.speak_text(text)
+
+        if response.reason != ResultReason.SynthesizingAudioCompleted:
+            cancellation_details = response.cancellation_details
+            error = "Speech synthesis canceled: {}".format(cancellation_details.reason)
+            if cancellation_details.reason == CancellationReason.Error:
+                if cancellation_details.error_details:
+                    error += " Error details: {}".format(cancellation_details.error_details)
+            raise Exception("Speech synthesis failed with error: {}".format(error))
+
+        logger.info("Speech synthesized for text [{}]".format(text))
+
+    def translate(self, text: str, language: str) -> str:
+        content = InputTextItem(text=text)
+        translation = self.text_translator.translate(content=[content], to=[language])
+        if len(translation) == 0 or len(translation[0].translations) == 0:
+            raise Exception("Failed to translate to {} text: {}".format(language, text))
+
+        logger.info("Translated [{}] to [{}]".format(text, translation[0].translations[0].text))
+        return translation[0].translations[0].text
+
+    def assistant(self, content: str) -> str:
+        self.openai_client.beta.threads.messages.create(
+            thread_id=self.thread_id,
+            role="user",
+            content=content
+        )
+
+        event_handler = EventHandler()
+        with self.openai_client.beta.threads.runs.stream(assistant_id=self.assistant_id, thread_id=self.thread_id,
+                                                         event_handler=event_handler) as stream:
+            stream.until_done()
+
+        return event_handler.get_result()
diff --git a/ai-services/multilingual-agent/event_handler.py b/ai-services/multilingual-agent/event_handler.py
new file mode 100644
index 00000000..ba2568f9
--- /dev/null
+++ b/ai-services/multilingual-agent/event_handler.py
@@ -0,0 +1,44 @@
+import logging
+
+from openai import AssistantEventHandler
+from openai.types.beta.threads.runs import ToolCall
+from openai.types.beta.threads import Text
+
+
+logger = logging.getLogger(__name__)
+
+
+class EventHandler(AssistantEventHandler):
+    def __init__(self):
+        super().__init__()
+        self.result = ''
+
+    def on_exception(self, exception: Exception) -> None:
+        logger.error("an exception occurred: {}. please try again".format(exception))
+
+    def on_tool_call_created(self, tool_call: ToolCall):
+        logger.info("started calling tool {}".format(tool_call.type))
+
+    def on_tool_call_done(self, tool_call: ToolCall) -> None:
+        logger.info("completed calling tool {}".format(tool_call.type))
+    def on_text_done(self, text: Text) -> None:
+        self.result = text.value
+
+        is_first_url_citation = True
+        for annotation in text.annotations:
+            if annotation.type == "url_citation":
+                if is_first_url_citation:
+                    self.result += "\nURL citations: \n"
+                    # Only add the heading once, before the first citation.
+                    is_first_url_citation = False
+                title = annotation.model_extra['url_citation']['title']
+                url = annotation.model_extra['url_citation']['url']
+                self.result += "* {} - [{}]({})\n".format(annotation.text, title, url)
+
+    def on_timeout(self) -> None:
+        logger.warning("timeout occurred. please try again")
+
+    def on_end(self) -> None:
+        logger.info("completed conversation with assistant")
+
+    def get_result(self) -> str:
+        return self.result
diff --git a/ai-services/multilingual-agent/media/multilingual-agent.jpg b/ai-services/multilingual-agent/media/multilingual-agent.jpg
new file mode 100644
index 00000000..5a43308e
Binary files /dev/null and b/ai-services/multilingual-agent/media/multilingual-agent.jpg differ
diff --git a/ai-services/multilingual-agent/requirements.txt b/ai-services/multilingual-agent/requirements.txt
new file mode 100644
index 00000000..62470e90
--- /dev/null
+++ b/ai-services/multilingual-agent/requirements.txt
@@ -0,0 +1,4 @@
+azure-cognitiveservices-speech==1.38.0
+azure-ai-translation-text==1.0.0b1
+openai==1.30.1
+python-dotenv