# Simli.py

import asyncio
from dotenv import load_dotenv
import os
import time
from fastapi import FastAPI, WebSocket, WebSocketDisconnect, HTTPException
from fastapi.responses import JSONResponse
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from langchain.prompts import (
    ChatPromptTemplate,
    MessagesPlaceholder,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain_groq import ChatGroq
from langchain.chains import LLMChain
from langchain.memory import ConversationBufferMemory
from deepgram import Deepgram
import requests

# Load variables from a local .env file (if present) into the process
# environment; this runs before any of the os.getenv() lookups below.
load_dotenv()

# ASGI application object; the CORS middleware and the WebSocket route are
# registered on it further down in this module.
app = FastAPI()

class LanguageModelProcessor:
    """Conversational wrapper around a Groq-hosted chat model.

    Builds an ``LLMChain`` whose prompt is made of the system prompt read from
    ``system_prompt.txt``, the running chat history and the latest user
    message. The chain owns a ``ConversationBufferMemory``, so every call to
    :meth:`process` extends the history seen by the next call on the same
    instance.
    """

    def __init__(self):
        """Create the LLM client, the conversation memory, the prompt and the chain.

        Raises:
            OSError: If ``system_prompt.txt`` cannot be opened (the path is
                relative to the current working directory).
        """
        self.llm = ChatGroq(
            temperature=0,
            model_name="llama-3.1-8b-instant",
            groq_api_key=os.getenv("GROQ_API_KEY"),
        )
        self.memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

        # Explicit encoding so the prompt is decoded the same way on every
        # platform instead of depending on the locale default.
        # NOTE(review): the content is parsed as a prompt template below, so
        # literal curly braces in the file would presumably be read as template
        # variables - confirm the prompt file contains none.
        with open('system_prompt.txt', 'r', encoding='utf-8') as file:
            system_prompt = file.read().strip()

        self.prompt = ChatPromptTemplate.from_messages([
            SystemMessagePromptTemplate.from_template(system_prompt),
            MessagesPlaceholder(variable_name="chat_history"),
            HumanMessagePromptTemplate.from_template("{text}"),
        ])
        self.conversation = LLMChain(
            llm=self.llm,
            prompt=self.prompt,
            memory=self.memory,
        )

    def process(self, text):
        """Send ``text`` to the model and return its reply.

        The chain is constructed with ``memory=self.memory``; it loads the
        history before the call and stores the new user/assistant turn after
        it. The memory is therefore deliberately NOT written to by hand here:
        doing so would put the user message into the prompt twice (once via
        the history, once via ``{text}``) and record every turn twice.

        Args:
            text: The user's message.

        Returns:
            The model's reply as a string.
        """
        # perf_counter is monotonic, so the measurement cannot be skewed by
        # wall-clock adjustments.
        start_time = time.perf_counter()
        response = self.conversation.invoke({"text": text})
        elapsed_time = int((time.perf_counter() - start_time) * 1000)

        reply = response['text']
        print(f"LLM ({elapsed_time}ms): {reply}")
        return reply

class AvatarGeneration:
    """Client for the Simli text-to-video endpoint, voiced through ElevenLabs."""

    def __init__(self):
        """Read the ElevenLabs and Simli API keys from the environment.

        Missing keys are not an error here; they are reported by
        :meth:`speak` when a request is actually attempted.
        """
        self.tts_api_key = os.getenv("ELEVENLABS_API_KEY")
        self.simli_api_key = os.getenv("SIMLI_API_KEY")
        self.url = "https://api.simli.ai/textToVideoStream"

    def speak(self, text: str, timeout: float = 60.0):
        """Request an avatar video that speaks ``text``.

        Args:
            text: The text to be spoken.
            timeout: Seconds to wait for the Simli API before giving up.
                Without a timeout a stalled upstream would block the caller
                indefinitely.

        Returns:
            The decoded JSON body of the API response.

        Raises:
            ValueError: If either API key is missing or empty.
            HTTPException: With the upstream status code when the API answers
                with anything other than 200, or with status 500 when the
                request itself fails (connection error, timeout, body that is
                not valid JSON).
        """
        if not self.tts_api_key or not self.simli_api_key:
            raise ValueError("API keys for TTS or Simli are missing in environment variables.")

        # Face, voice and voice settings are hard-coded selections.
        payload = {
            "ttsAPIKey": self.tts_api_key,
            "simliAPIKey": self.simli_api_key,
            "faceId": "30b2d7d5-dfb6-4f38-b207-a5552c9e9c55",
            "requestBody": {
                "audioProvider": "ElevenLabs",
                "text": text,
                "voiceName": "pMsXgVXv3BLzUgSXRplE",
                "model_id": "eleven_flash_v2",
                "voice_settings": {
                    "stability": 0.1,
                    "similarity_boost": 0.1,
                    "style": 0,
                },
            },
        }
        headers = {"Content-Type": "application/json"}

        try:
            response = requests.post(self.url, json=payload, headers=headers, timeout=timeout)
        except requests.RequestException as e:
            raise HTTPException(status_code=500, detail=f"HTTP request failed: {str(e)}") from e

        if response.status_code != 200:
            raise HTTPException(
                status_code=response.status_code,
                detail=f"AvatarGeneration API error: {response.text}"
            )

        try:
            return response.json()
        except ValueError as e:
            # JSON decoding errors are ValueError subclasses; report them the
            # same way as any other failed request instead of leaking a raw
            # decoding error to the caller.
            raise HTTPException(status_code=500, detail=f"HTTP request failed: {str(e)}") from e

# Accumulates transcript fragments and stitches them into a single string
class TranscriptCollector:
    """Ordered store of transcript fragments."""

    def __init__(self):
        """Start with an empty collection."""
        self.reset()

    def reset(self):
        """Discard every fragment collected so far."""
        self.transcript_parts = list()

    def add_part(self, part):
        """Append one fragment to the end of the collection."""
        self.transcript_parts += [part]

    def get_full_transcript(self):
        """Return all fragments, in insertion order, joined by single spaces."""
        separator = ' '
        return separator.join(self.transcript_parts)


# Single module-level collector instance
transcript_collector = TranscriptCollector()

# Deepgram transcription client setup for streaming
async def transcribe_audio_stream(ws: WebSocket):
    """Relay audio frames from a WebSocket to Deepgram and collect the transcripts.

    Every non-empty binary frame received on ``ws`` is forwarded to a Deepgram
    streaming session. Each non-empty transcript that comes back is added to
    the module-level ``transcript_collector`` and the accumulated transcript
    is echoed to the client as a text frame.

    The function returns when the client disconnects or when it sends a text
    frame. A text frame (its content is ignored) is treated as the
    end-of-audio marker: it lets a client finish the stream while keeping the
    socket open, so the caller is still able to send a reply. Without such a
    marker the only normal exit would be a disconnect, after which nothing can
    be delivered any more.

    Args:
        ws: An already accepted WebSocket connection delivering audio bytes.

    Raises:
        HTTPException: With status 500 on any failure other than a client
            disconnect.
    """
    try:
        # NOTE(review): the Deepgram calls (`transcription.stream`,
        # `send_audio`, `receive_json`) and the response layout are kept
        # exactly as written; they cannot be checked against the installed SDK
        # from this file - confirm. The loop also assumes one JSON result per
        # audio frame; if results arrive on the service's own schedule,
        # `receive_json` could stall - confirm.
        dg_client = Deepgram(os.getenv("DEEPGRAM_API_KEY"))
        async with dg_client.transcription.stream(
            {"language": "en-US", "punctuate": True}
        ) as stream:
            while True:
                # Raw ASGI message, so binary frames, text frames and the
                # disconnect event can be told apart.
                message = await ws.receive()
                if message["type"] == "websocket.disconnect":
                    print("Client disconnected")
                    return
                if message.get("text") is not None:
                    # End-of-audio marker: stop reading, leave the socket open.
                    return

                audio_data = message.get("bytes")
                if not audio_data:
                    continue

                await stream.send_audio(audio_data)
                transcript_response = await stream.receive_json()
                transcript = transcript_response['results']['channels'][0]['alternatives'][0]['transcript']
                if not transcript or not transcript.strip():
                    # Nothing recognised in this frame; storing an empty part
                    # would only add stray spaces to the joined transcript.
                    continue

                transcript_collector.add_part(transcript)
                # Send partial transcript back to the client (optional)
                await ws.send_text(transcript_collector.get_full_transcript())
    except WebSocketDisconnect:
        print("Client disconnected")
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Transcription error: {str(e)}") from e

# CORS configuration: any origin, any method and any header are accepted, and
# credentialed requests are enabled.
# NOTE(review): FastAPI's CORS documentation appears to advise against
# combining wildcard origins with allow_credentials=True - confirm whether
# explicit origins should be listed here.
origins = ["*"]

_cors_settings = {
    "allow_origins": origins,
    "allow_credentials": True,
    "allow_methods": ["*"],
    "allow_headers": ["*"],
}
app.add_middleware(CORSMiddleware, **_cors_settings)

# WebSocket route to handle real-time audio stream and return transcription, LLM response, and MP4 URL
@app.websocket("/ws/audio")
async def websocket_audio_stream(ws: WebSocket):
    """Run one voice session: transcribe audio, query the LLM, build the avatar.

    After the audio stream has been transcribed, the full transcript is sent
    to the language model and its reply to the avatar service. The result is
    sent to the client as one JSON text frame with the keys ``User``,
    ``llm_response`` and ``avatar_response`` (the ``mp4_url`` field of the
    avatar response, or ``null`` when it is absent).

    Args:
        ws: The incoming WebSocket connection.
    """
    await ws.accept()

    # The collector is module-level state: clear what an earlier session left
    # behind so this session's transcript starts empty.
    # NOTE: sessions that run concurrently still share this one collector.
    transcript_collector.reset()

    # Collect the full transcription
    await transcribe_audio_stream(ws)

    # Get the final transcript after streaming is finished
    transcription = transcript_collector.get_full_transcript()

    # The LLM call and the avatar request are blocking; run them in the
    # default executor so the event loop keeps serving other connections.
    loop = asyncio.get_running_loop()

    # Process the transcript with the LLM
    llm_response = await loop.run_in_executor(
        None, lambda: LanguageModelProcessor().process(transcription)
    )

    # Generate avatar's MP4 URL
    avatar_response = await loop.run_in_executor(
        None, lambda: AvatarGeneration().speak(llm_response)
    )
    mp4_url = avatar_response.get("mp4_url")

    # Return all responses to the client
    response = {
        "User": transcription,
        "llm_response": llm_response,
        "avatar_response": mp4_url
    }

    # Send the final response back to the client as real JSON; str(dict)
    # yields a Python repr (single quotes, None) that JSON parsers reject.
    try:
        await ws.send_json(response)
    except (WebSocketDisconnect, RuntimeError, OSError):
        # The client left before the result was ready; nothing to deliver.
        print("Client disconnected before the response could be delivered")

if __name__ == "__main__":
    # Imported here because it is only needed when the module is run as a script.
    import uvicorn

    # Serve the app on all network interfaces, port 8080.
    bind_options = {"host": "0.0.0.0", "port": 8080}
    uvicorn.run(app, **bind_options)