From ab581138f57a689b562bd6ec311d42680c0ed2c7 Mon Sep 17 00:00:00 2001 From: winlin Date: Mon, 29 Jan 2024 15:33:56 +0800 Subject: [PATCH] Room: AI-Talk allow disable ASR/TTS, enable text. v5.13.19 --- DEVELOPER.md | 1 + platform/ai-talk.go | 160 ++++++++++++++++++++-------- platform/live-room.go | 63 ++++++++--- ui/src/components/AITalk.js | 152 +++++++++++++++++++------- ui/src/components/OpenAISettings.js | 11 +- ui/src/pages/ScenarioLiveRoom.js | 151 ++++++++++++++++++-------- ui/src/pages/ScenarioTranscript.js | 13 ++- ui/src/resources/locale.json | 24 ++++- 8 files changed, 421 insertions(+), 154 deletions(-) diff --git a/DEVELOPER.md b/DEVELOPER.md index 57525853..9088401d 100644 --- a/DEVELOPER.md +++ b/DEVELOPER.md @@ -1127,6 +1127,7 @@ The following are the update records for the SRS Stack server. * Room: AI-Talk support popout AI assistant. v5.13.17 * Room: AI-Talk support multiple assistant in a room. v5.13.18 * Room: AI-Talk support user different languages. v5.13.18 + * Room: AI-Talk allow disable ASR/TTS, enable text. v5.13.19 * v5.12 * Refine local variable name conf to config. v5.12.1 * Add forced exit on timeout for program termination. v5.12.1 diff --git a/platform/ai-talk.go b/platform/ai-talk.go index bdc04dde..b1df702a 100644 --- a/platform/ai-talk.go +++ b/platform/ai-talk.go @@ -555,42 +555,42 @@ func (v *StageRequest) total() float64 { } func (v *StageRequest) upload() float64 { - if v.lastUploadAudio.After(v.lastSentence) { + if v.lastUploadAudio.After(v.lastSentence.Add(100 * time.Millisecond)) { return float64(v.lastUploadAudio.Sub(v.lastSentence)) / float64(time.Second) } return 0 } func (v *StageRequest) exta() float64 { - if v.lastExtractAudio.After(v.lastUploadAudio) { + if v.lastExtractAudio.After(v.lastUploadAudio.Add(100 * time.Millisecond)) { return float64(v.lastExtractAudio.Sub(v.lastUploadAudio)) / float64(time.Second) } return 0 } func (v *StageRequest) asr() float64 { - if v.lastRequestASR.After(v.lastExtractAudio) { + if v.lastRequestASR.After(v.lastExtractAudio.Add(100 * time.Millisecond)) { return float64(v.lastRequestASR.Sub(v.lastExtractAudio)) / float64(time.Second) } return 0 } func (v *StageRequest) chat() float64 { - if v.lastRequestChat.After(v.lastRequestASR) { + if v.lastRequestChat.After(v.lastRequestASR.Add(100 * time.Millisecond)) { return float64(v.lastRequestChat.Sub(v.lastRequestASR)) / float64(time.Second) } return 0 } func (v *StageRequest) tts() float64 { - if v.lastRequestTTS.After(v.lastRequestChat) { + if v.lastRequestTTS.After(v.lastRequestChat.Add(100 * time.Millisecond)) { return float64(v.lastRequestTTS.Sub(v.lastRequestChat)) / float64(time.Second) } return 0 } func (v *StageRequest) download() float64 { - if v.lastDownloadAudio.After(v.lastRequestTTS) { + if v.lastDownloadAudio.After(v.lastRequestTTS.Add(100 * time.Millisecond)) { return float64(v.lastDownloadAudio.Sub(v.lastRequestTTS)) / float64(time.Second) } return 0 @@ -614,6 +614,8 @@ type StageMessage struct { // For role audio. // The audio segment uuid. SegmentUUID string `json:"asid"` + // Whether has audio file. + HasAudioFile bool `json:"hasAudio"` // The audio tts file for audio message. audioFile string @@ -701,11 +703,18 @@ func (v *StageSubscriber) createRobotEmptyMessage() *StageMessage { func (v *StageSubscriber) completeRobotAudioMessage(ctx context.Context, sreq *StageRequest, segment *AnswerSegment, message *StageMessage) { // Build a new copy file of ttsFile. 
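  // Each subscriber gets its own copy, suffixed with the subscriber id
  // (spid), so removing one subscriber's message never deletes a file
  // that another subscriber is still downloading.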
- ttsExt := path.Ext(segment.ttsFile) - copyFile := fmt.Sprintf("%v-copy-%v%v", segment.ttsFile[:len(segment.ttsFile)-len(ttsExt)], v.spid, ttsExt) + var copyFile string + if !segment.noTTS && segment.ttsFile != "" { + ttsExt := path.Ext(segment.ttsFile) + copyFile = fmt.Sprintf("%v-copy-%v%v", segment.ttsFile[:len(segment.ttsFile)-len(ttsExt)], v.spid, ttsExt) + } // Copy the ttsFile to copyFile. if err := func() error { + if copyFile == "" { + return nil + } + // If segment is error, ignore. if segment.err != nil { return nil @@ -738,6 +747,9 @@ func (v *StageSubscriber) completeRobotAudioMessage(ctx context.Context, sreq *S message.RequestUUID, message.SegmentUUID = sreq.rid, segment.asid message.Role, message.Message, message.audioFile = "robot", segment.text, copyFile + // User may disable TTS, we only ship the text message to user. + message.HasAudioFile = !segment.noTTS + // Always close message if timeout. go func() { select { @@ -866,6 +878,11 @@ type Stage struct { // AI Chat message window. chatWindow int + // Whether enabled AI services. + aiASREnabled bool + aiChatEnabled bool + aiTtsEnabled bool + // The AI configuration. aiConfig openai.ClientConfig // The room it belongs to. Note that it's a caching object, update when updating the room. The room object @@ -924,6 +941,11 @@ func helloVoiceFromLanguage(language string) string { } func (v *Stage) UpdateFromRoom(room *SrsLiveRoom) { + // Whether enabled. + v.aiASREnabled = room.AIASREnabled + v.aiChatEnabled = room.AIChatEnabled + v.aiTtsEnabled = room.AITTSEnabled + // Create robot for the stage, which attach to a special room. v.voice = helloVoiceFromLanguage(room.AIASRLanguage) v.prompt = room.AIChatPrompt @@ -1028,6 +1050,8 @@ type AnswerSegment struct { text string // The TTS file path. ttsFile string + // Whether no tts file, as user disabled TTS for example. + noTTS bool // Whether TTS is done, ready to play. ready bool // Whether TTS is error, failed. @@ -1181,18 +1205,25 @@ func (v *TTSWorker) SubmitSegment(ctx context.Context, stage *Stage, sreq *Stage go func() { defer v.wg.Done() - ttsService := NewOpenAITTSService(stage.aiConfig) - if err := ttsService.RequestTTS(ctx, func(ext string) string { - segment.ttsFile = path.Join(aiTalkWorkDir, - fmt.Sprintf("assistant-%v-sentence-%v-tts.%v", sreq.rid, segment.asid, ext), - ) - return segment.ttsFile - }, segment.text); err != nil { - segment.err = err + if stage.aiTtsEnabled { + ttsService := NewOpenAITTSService(stage.aiConfig) + if err := ttsService.RequestTTS(ctx, func(ext string) string { + segment.ttsFile = path.Join(aiTalkWorkDir, + fmt.Sprintf("assistant-%v-sentence-%v-tts.%v", sreq.rid, segment.asid, ext), + ) + return segment.ttsFile + }, segment.text); err != nil { + segment.err = err + } else { + segment.ready, segment.noTTS = true, false + sreq.onSegmentReady(segment) + logger.Tf(ctx, "TTS: Complete rid=%v, asid=%v, file saved to %v, %v", + sreq.rid, segment.asid, segment.ttsFile, segment.text) + } } else { - segment.ready = true + segment.ready, segment.noTTS = true, true sreq.onSegmentReady(segment) - logger.Tf(ctx, "File saved to %v, %v", segment.ttsFile, segment.text) + logger.Tf(ctx, "TTS: Skip rid=%v, asid=%v, %v", sreq.rid, segment.asid, segment.text) } // Update all messages. @@ -1358,11 +1389,15 @@ func handleAITalkService(ctx context.Context, handler *http.ServeMux) error { StageID string `json:"sid"` RoomToken string `json:"roomToken"` UserID string `json:"userId"` + // AI Configurations. 
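+ // Currently only the ASR switch is returned: the web UI uses it to
+ // decide whether to offer push-to-talk or to fall back to text input.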
+ AIASREnabled bool `json:"aiAsrEnabled"` } r0 := &StageResult{ StageID: stage.sid, RoomToken: stage.room.RoomToken, UserID: user.UserID, + // AI Configurations. + AIASREnabled: room.AIASREnabled, } ohttp.WriteData(ctx, w, r, r0) @@ -1447,9 +1482,10 @@ func handleAITalkService(ctx context.Context, handler *http.ServeMux) error { handler.HandleFunc(ep, func(w http.ResponseWriter, r *http.Request) { if err := func() error { var token string - var sid, rid, userID, audioBase64Data string + var sid, rid, userID string var roomUUID, roomToken string var userMayInput float64 + var audioBase64Data, textMessage string if err := ParseBody(ctx, r.Body, &struct { Token *string `json:"token"` StageUUID *string `json:"sid"` @@ -1457,11 +1493,12 @@ func handleAITalkService(ctx context.Context, handler *http.ServeMux) error { RequestUUID *string `json:"rid"` UserMayInput *float64 `json:"umi"` AudioData *string `json:"audio"` + TextMessage *string `json:"text"` RoomUUID *string `json:"room"` RoomToken *string `json:"roomToken"` }{ Token: &token, StageUUID: &sid, UserID: &userID, RequestUUID: &rid, - UserMayInput: &userMayInput, AudioData: &audioBase64Data, + UserMayInput: &userMayInput, TextMessage: &textMessage, AudioData: &audioBase64Data, RoomUUID: &roomUUID, RoomToken: &roomToken, }); err != nil { return errors.Wrapf(err, "parse body") @@ -1484,6 +1521,9 @@ func handleAITalkService(ctx context.Context, handler *http.ServeMux) error { if userID == "" { return errors.Errorf("empty userId") } + if audioBase64Data == "" && textMessage == "" { + return errors.Errorf("empty audio and text") + } stage := talkServer.QueryStage(sid) if stage == nil { @@ -1522,18 +1562,33 @@ func handleAITalkService(ctx context.Context, handler *http.ServeMux) error { logger.Tf(ctx, "Stage: Got question sid=%v, rid=%v, user=%v, umi=%v, input=%v", sid, sreq.rid, userID, userMayInput, sreq.inputFile) - // Save audio input to file. - if err := sreq.receiveInputFile(ctx, audioBase64Data); err != nil { - return errors.Wrapf(err, "save %vB audio to file %v", len(audioBase64Data), sreq.inputFile) + // Whether user input audio. + if audioBase64Data != "" { + // Save audio input to file. + if err := sreq.receiveInputFile(ctx, audioBase64Data); err != nil { + return errors.Wrapf(err, "save %vB audio to file %v", len(audioBase64Data), sreq.inputFile) + } + + // Do ASR, convert to text. + asrLanguage := ChooseNotEmpty(user.Language, stage.asrLanguage) + if err := sreq.asrAudioToText(ctx, stage.aiConfig, asrLanguage, user.previousAsrText); err != nil { + return errors.Wrapf(err, "asr lang=%v, previous=%v", asrLanguage, user.previousAsrText) + } + logger.Tf(ctx, "ASR ok, sid=%v, rid=%v, user=%v, lang=%v, prompt=<%v>, resp is <%v>", + sid, sreq.rid, userID, asrLanguage, user.previousAsrText, sreq.asrText) + } else { + // Directly update the time for stat. + sreq.lastUploadAudio = time.Now() + sreq.lastExtractAudio = time.Now() + sreq.lastRequestASR = time.Now() } - // Do ASR, convert to text. - asrLanguage := ChooseNotEmpty(user.Language, stage.asrLanguage) - if err := sreq.asrAudioToText(ctx, stage.aiConfig, asrLanguage, user.previousAsrText); err != nil { - return errors.Wrapf(err, "asr lang=%v, previous=%v", asrLanguage, user.previousAsrText) + // Handle user input text. 
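+ // If the user typed a message instead of speaking, take it as the
+ // recognized text directly, so the chat and TTS stages below run
+ // unchanged for both input paths.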
+ if textMessage != "" { + sreq.asrText = textMessage + logger.Tf(ctx, "Text ok, sid=%v, rid=%v, user=%v, text=%v", + sid, sreq.rid, userID, sreq.asrText) } - logger.Tf(ctx, "ASR ok, sid=%v, rid=%v, user=%v, lang=%v, prompt=<%v>, resp is <%v>", - sid, sreq.rid, userID, asrLanguage, user.previousAsrText, sreq.asrText) // Important trace log. user.previousAsrText = sreq.asrText @@ -1638,7 +1693,7 @@ func handleAITalkService(ctx context.Context, handler *http.ServeMux) error { ohttp.WriteData(ctx, w, r, struct { Finished bool `json:"finished"` }{ - Finished: !sreq.finished, + Finished: sreq.finished, }) return nil @@ -1892,6 +1947,30 @@ func handleAITalkService(ctx context.Context, handler *http.ServeMux) error { } }) + finishAudioSegment := func(segment *AnswerSegment) { + if segment == nil || segment.logged { + return + } + + // Only log the first segment. + segment.logged = true + if !segment.first { + return + } + + // Time cost logging. + sreq := segment.request + sreq.lastDownloadAudio = time.Now() + speech := float64(sreq.lastAsrDuration) / float64(time.Second) + logger.Tf(ctx, "Elapsed cost total=%.1fs, steps=[upload=%.1fs,exta=%.1fs,asr=%.1fs,chat=%.1fs,tts=%.1fs,download=%.1fs], ask=%v, speech=%.1fs, answer=%v", + sreq.total(), sreq.upload(), sreq.exta(), sreq.asr(), sreq.chat(), sreq.tts(), sreq.download(), + sreq.lastRequestAsrText, speech, sreq.lastRobotFirstText) + + // Important trace log. Note that browser may request multiple times, so we only log for the first + // request to reduce logs. + logger.Tf(ctx, "Bot: %v", segment.text) + } + ep = "/terraform/v1/ai-talk/subscribe/tts" logger.Tf(ctx, "Handle %v", ep) handler.HandleFunc(ep, func(w http.ResponseWriter, r *http.Request) { @@ -1954,21 +2033,7 @@ func handleAITalkService(ctx context.Context, handler *http.ServeMux) error { logger.Tf(ctx, "Stage: Download sid=%v, spid=%v, asid=%v", sid, spid, asid) // When the first subscriber got the segment, we log the elapsed time. - if segment := answer.segment; !segment.logged { - sreq := segment.request - if segment.first { - sreq.lastDownloadAudio = time.Now() - speech := float64(sreq.lastAsrDuration) / float64(time.Second) - logger.Tf(ctx, "Elapsed cost total=%.1fs, steps=[upload=%.1fs,exta=%.1fs,asr=%.1fs,chat=%.1fs,tts=%.1fs,download=%.1fs], ask=%v, speech=%.1fs, answer=%v", - sreq.total(), sreq.upload(), sreq.exta(), sreq.asr(), sreq.chat(), sreq.tts(), sreq.download(), - sreq.lastRequestAsrText, speech, sreq.lastRobotFirstText) - } - - // Important trace log. Note that browser may request multiple times, so we only log for the first - // request to reduce logs. - segment.logged = true - logger.Tf(ctx, "Bot: %v", segment.text) - } + finishAudioSegment(answer.segment) // Read the ttsFile and response it as opus audio. if strings.HasSuffix(answer.audioFile, ".wav") { @@ -2041,6 +2106,11 @@ func handleAITalkService(ctx context.Context, handler *http.ServeMux) error { return errors.Errorf("invalid spid %v of sid %v", spid, sid) } + // If no audio file, we stat the time cost when remove the segment. + if answer := subscriber.queryAudioFile(asid); answer != nil { + finishAudioSegment(answer.segment) + } + // Keep alive the stage. 
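  // (This removal endpoint is also called by the page after each played
  // segment, so it doubles as a liveness signal for the stage and the
  // subscriber.)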
stage.KeepAlive() subscriber.KeepAlive() diff --git a/platform/live-room.go b/platform/live-room.go index e9570e28..60c8f8a0 100644 --- a/platform/live-room.go +++ b/platform/live-room.go @@ -37,19 +37,9 @@ func handleLiveRoomService(ctx context.Context, handler *http.ServeMux) error { return errors.Wrapf(err, "authenticate") } - room := &SrsLiveRoom{ - UUID: uuid.NewString(), - // The title of live room. - Title: title, - // The stream name of room. - StreamName: strings.ToLower(strings.ReplaceAll(uuid.NewString(), "-", ""))[:12], - // The secret of live room. - Secret: strings.ToUpper(strings.ReplaceAll(uuid.NewString(), "-", ""))[:16], - // Create time. - CreatedAt: time.Now().Format(time.RFC3339), - // The stage level token for popout. - RoomToken: uuid.NewString(), - } + room := NewLiveRoom(func(room *SrsLiveRoom) { + room.Title = title + }) if b, err := json.Marshal(room); err != nil { return errors.Wrapf(err, "marshal room") } else if err := rdb.HSet(ctx, SRS_LIVE_ROOM, room.UUID, string(b)).Err(); err != nil { @@ -239,7 +229,7 @@ type SrsLiveRoom struct { // Live room secret. Secret string `json:"secret"` // The AI assistant settings. - SrsAssistant + *SrsAssistant // The current AI assistant stage, might change to others. StageUUID string `json:"stage_uuid"` // The room level authentication token, for example, popout application with this token to verify @@ -249,6 +239,26 @@ type SrsLiveRoom struct { CreatedAt string `json:"created_at"` } +func NewLiveRoom(opts ...func(room *SrsLiveRoom)) *SrsLiveRoom { + v := &SrsLiveRoom{ + UUID: uuid.NewString(), + // The stream name of room. + StreamName: strings.ToLower(strings.ReplaceAll(uuid.NewString(), "-", ""))[:12], + // The secret of live room. + Secret: strings.ToUpper(strings.ReplaceAll(uuid.NewString(), "-", ""))[:16], + // Create time. + CreatedAt: time.Now().Format(time.RFC3339), + // The stage level token for popout. + RoomToken: uuid.NewString(), + // Create a default assistant. + SrsAssistant: NewAssistant(), + } + for _, opt := range opts { + opt(v) + } + return v +} + func (v *SrsLiveRoom) String() string { return fmt.Sprintf("uuid=%v, title=%v, stream=%v, secret=%vB, roomToken=%vB, stage=%v, assistant=<%v>", v.UUID, v.Title, v.StreamName, len(v.Secret), len(v.RoomToken), v.StageUUID, v.SrsAssistant.String()) @@ -273,8 +283,14 @@ type SrsAssistant struct { AISecretKey string `json:"aiSecretKey"` // The AI base URL. AIBaseURL string `json:"aiBaseURL"` + + // Whether enable the AI ASR. + AIASREnabled bool `json:"aiAsrEnabled"` // The AI asr language. AIASRLanguage string `json:"aiAsrLanguage"` + + // Whether enable the AI processing. + AIChatEnabled bool `json:"aiChatEnabled"` // The AI model name. AIChatModel string `json:"aiChatModel"` // The AI chat system prompt. @@ -283,9 +299,24 @@ type SrsAssistant struct { AIChatMaxWindow int `json:"aiChatMaxWindow"` // The AI chat max words. AIChatMaxWords int `json:"aiChatMaxWords"` + + // Whether enable the AI TTS. 
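+ // When TTS is disabled, answers are still produced by the chat model,
+ // but are delivered as text-only messages without audio segments.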
+ AITTSEnabled bool `json:"aiTtsEnabled"` +} + +func NewAssistant(opts ...func(*SrsAssistant)) *SrsAssistant { + v := &SrsAssistant{ + AIASREnabled: true, AIChatEnabled: true, AITTSEnabled: true, + } + for _, opt := range opts { + opt(v) + } + return v } func (v *SrsAssistant) String() string { - return fmt.Sprintf("assistant=%v, aiName=%v, aiProvider=%v, aiSecretKey=%v, aiBaseURL=%v, aiAsrLanguage=%v, aiChatModel=%v, aiChatPrompt=%v, aiChatMaxWindow=%v, aiChatMaxWords=%v", - v.Assistant, v.AIName, v.AIProvider, len(v.AISecretKey), v.AIBaseURL, v.AIASRLanguage, v.AIChatModel, v.AIChatPrompt, v.AIChatMaxWindow, v.AIChatMaxWords) + return fmt.Sprintf("assistant=%v, name=%v, provider=%v, secretKey=%vB, baseURL=%v, asr=, chat=, tts=<%v>", + v.Assistant, v.AIName, v.AIProvider, len(v.AISecretKey), v.AIBaseURL, v.AIASREnabled, + v.AIASRLanguage, v.AIChatEnabled, v.AIChatModel, v.AIChatPrompt, v.AIChatMaxWindow, + v.AIChatMaxWords, v.AITTSEnabled) } diff --git a/ui/src/components/AITalk.js b/ui/src/components/AITalk.js index 9708f335..080aa759 100644 --- a/ui/src/components/AITalk.js +++ b/ui/src/components/AITalk.js @@ -1,5 +1,5 @@ import React from "react"; -import {Alert, Button, Card, Col, Dropdown, Form, Row, Spinner} from "react-bootstrap"; +import {Alert, Button, Card, Col, Dropdown, Form, InputGroup, Row, Spinner} from "react-bootstrap"; import {useTranslation} from "react-i18next"; import {useErrorHandler} from "react-error-boundary"; import useIsMobile from "./IsMobile"; @@ -29,8 +29,11 @@ export function AITalkAssistantPanel({roomUUID, roomToken, fullscreen}) { // The uuid and robot in stage, which is unchanged after stage started. const [stageUUID, setStageUUID] = React.useState(null); const [userID, setUserID] = React.useState(null); + const [aiAsrEnabled, setAiAsrEnabled] = React.useState(); const [stageUser, setStageUser] = React.useState(null); const [stagePopoutUUID, setStagePopoutUUID] = React.useState(null); + // Last user input text, from ASR, set to input for user to update it. + const [userAsrText, setUserAsrText] = React.useState(null); const [booting, setBooting] = React.useState(true); const [errorLogs, setErrorLogs] = React.useState([]); @@ -176,8 +179,9 @@ export function AITalkAssistantPanel({roomUUID, roomToken, fullscreen}) { console.log(`Start: Create stage success: ${JSON.stringify(res.data.data)}`); setStageUUID(res.data.data.sid); setUserID(res.data.data.userId); + setAiAsrEnabled(res.data.data.aiAsrEnabled); }).catch(handleError); - }, [handleError, booting, roomUUID, roomToken, setStageUUID, setUserID]); + }, [handleError, booting, roomUUID, roomToken, setStageUUID, setUserID, setAiAsrEnabled]); // Start to chat, set the robot to ready. const startChatting = React.useCallback(async (user) => { @@ -319,6 +323,7 @@ export function AITalkAssistantPanel({roomUUID, roomToken, fullscreen}) { }).then(res => { console.log(`ASR: Upload success: ${res.data.data.rid} ${res.data.data.asr}`); resolve(res.data.data.rid); + setUserAsrText(res.data.data.asr); }).catch((error) => reject(error)); }); @@ -410,11 +415,45 @@ export function AITalkAssistantPanel({roomUUID, roomToken, fullscreen}) { ref.current.stopHandler = setTimeout(() => { stopRecordingImpl(); }, timeoutWaitForLastVoice); - }, [roomUUID, roomToken, stageUUID, userID, robotReady, ref, setProcessing, setMicWorking, refRequest]); + }, [roomUUID, roomToken, stageUUID, userID, robotReady, ref, setProcessing, setMicWorking, refRequest, setUserAsrText]); + + // User directly send text message to assistant. 
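+  // It reuses the same conversation/upload endpoints as the audio path,
+  // posting a "text" field instead of base64 "audio" data.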
+ const sendText = React.useCallback(async (text, onFinished) => { + if (!robotReady) return; + if (!text) return; + + try { + const requestUUID = await new Promise((resolve, reject) => { + axios.post('/terraform/v1/ai-talk/stage/conversation', { + room: roomUUID, roomToken, sid: stageUUID, + }, { + headers: Token.loadBearerHeader(), + }).then(res => { + console.log(`ASR: Start conversation success, rid=${res.data.data.rid}`); + resolve(res.data.data.rid); + }).catch(handleError); + }); + + await new Promise((resolve, reject) => { + axios.post('/terraform/v1/ai-talk/stage/upload', { + room: roomUUID, roomToken, sid: stageUUID, rid: requestUUID, userId: userID, + text: text, + }, { + headers: Token.loadBearerHeader(), + }).then(res => { + console.log(`ASR: Send text success: ${res.data.data.rid} ${res.data.data.asr}`); + resolve(res.data.data.rid); + }).catch(handleError); + }); + } finally { + onFinished && onFinished(); + } + }, [robotReady, handleError, roomUUID, roomToken, stageUUID, userID]); // Setup the keyboard event, for PC browser. React.useEffect(() => { if (!robotReady) return; + if (!aiAsrEnabled) return; const handleKeyDown = (e) => { if (processing) return; @@ -436,7 +475,7 @@ export function AITalkAssistantPanel({roomUUID, roomToken, fullscreen}) { window.removeEventListener('keydown', handleKeyDown); window.removeEventListener('keyup', handleKeyUp); }; - }, [robotReady, startRecording, stopRecording, processing]); + }, [robotReady, startRecording, stopRecording, processing, aiAsrEnabled]); // Request server to create a new popout for subscribing all events. React.useEffect(() => { @@ -491,24 +530,29 @@ export function AITalkAssistantPanel({roomUUID, roomToken, fullscreen}) { const audioSegmentUUID = msg.asid; traceLog('Bot', msg.msg, 'success'); - // Play the AI generated audio. - await new Promise(resolve => { - const url = `/terraform/v1/ai-talk/subscribe/tts?sid=${stageUUID}&spid=${stagePopoutUUID}&asid=${audioSegmentUUID}&room=${roomUUID}&roomToken=${roomToken}`; - console.log(`TTS: Playing ${url}`); - - const listener = () => { - playerRef.current.removeEventListener('ended', listener); - console.log(`TTS: Played ${url} done.`); - resolve(); - }; - playerRef.current.addEventListener('ended', listener); - - playerRef.current.src = url; - playerRef.current.play().catch(error => { - console.log(`TTS: Play ${url} failed: ${error}`); - resolve(); + // No audio file, skip it. + if (!msg.hasAudio) { + console.log(`TTS: Consume text message done, ${JSON.stringify(msg)}`); + } else { + // Play the AI generated audio. + await new Promise(resolve => { + const url = `/terraform/v1/ai-talk/subscribe/tts?sid=${stageUUID}&spid=${stagePopoutUUID}&asid=${audioSegmentUUID}&room=${roomUUID}&roomToken=${roomToken}`; + console.log(`TTS: Playing ${url}`); + + const listener = () => { + playerRef.current.removeEventListener('ended', listener); + console.log(`TTS: Played ${url} done.`); + resolve(); + }; + playerRef.current.addEventListener('ended', listener); + + playerRef.current.src = url; + playerRef.current.play().catch(error => { + console.log(`TTS: Play ${url} failed: ${error}`); + resolve(); + }); }); - }); + } // Remove the AI generated audio. await new Promise((resolve, reject) => { @@ -582,7 +626,10 @@ export function AITalkAssistantPanel({roomUUID, roomToken, fullscreen}) { {robotReady && !isMobile ? - + @@ -596,7 +643,10 @@ export function AITalkAssistantPanel({roomUUID, roomToken, fullscreen}) { - + : ''}
@@ -943,7 +993,7 @@ function AITalkUserConfigImpl({roomUUID, roomToken, stageUUID, user, disabled, l ; } -function AITalkAssistantImpl({processing, micWorking, startRecording, stopRecording, roomUUID, roomToken, stageUUID, stageUser}) { +function AITalkAssistantImpl({processing, micWorking, startRecording, stopRecording, userAsrText, sendText, roomUUID, roomToken, stageUUID, stageUser, aiAsrEnabled}) { const {t} = useTranslation(); const isMobile = useIsMobile(); const [showSettings, setShowSettings] = React.useState(false); @@ -951,6 +1001,7 @@ function AITalkAssistantImpl({processing, micWorking, startRecording, stopRecord const [showUserConfig, setShowUserConfig] = React.useState(false); const [user, setUser] = React.useState(stageUser); const [description, setDescription] = React.useState(); + const [userText, setUserText] = React.useState(''); React.useEffect(() => { if (!roomUUID) return; @@ -985,6 +1036,26 @@ function AITalkAssistantImpl({processing, micWorking, startRecording, stopRecord setDescription(` for ${user.username || 'You'} (${user.language})`); }, [user, setDescription]); + const onSendText = React.useCallback((e) => { + e.preventDefault(); + if (!userText) return; + + sendText && sendText(userText, () => { + setUserText(''); + }); + }, [userText, sendText, setUserText]); + + const onUserPressKey = React.useCallback((e) => { + if (e.key === 'Enter') { + onSendText(e); + } + }, [onSendText]); + + React.useEffect(() => { + if (!userAsrText) return; + setUserText(userAsrText); + }, [userAsrText]); + return (
@@ -1005,18 +1076,27 @@ function AITalkAssistantImpl({processing, micWorking, startRecording, stopRecord roomUUID, roomToken, stageUUID, userID: stageUser.userId, onSubmit: onFinishUserConfig, onCancel: () => setShowUserConfig(false), }} /> : ''} -
-          [stripped JSX: the original record-button block — the idle talk
-           trigger shown while !processing, and the busy spinner otherwise]
+          [stripped JSX: a new text-input row — an InputGroup with a
+           Form.Control bound to userText, onChange={(e) => setUserText(e.target.value)},
+           onKeyPress={onUserPressKey}, submitted through onSendText]
+          [stripped JSX: the record-button block, kept as before but now
+           rendered only when ASR is enabled: {aiAsrEnabled && <>
+           ...the !processing talk trigger and busy spinner... </>}]
diff --git a/ui/src/components/OpenAISettings.js b/ui/src/components/OpenAISettings.js index 90425e34..41eff27a 100644 --- a/ui/src/components/OpenAISettings.js +++ b/ui/src/components/OpenAISettings.js @@ -5,7 +5,7 @@ import {Button, Form, Spinner} from "react-bootstrap"; import {useTranslation} from "react-i18next"; import {useErrorHandler} from "react-error-boundary"; -export function OpenAIWhisperSettings({baseURL, setBaseURL, secretKey, setSecretKey, targetLanguage, setTargetLanguage}) { +export function OpenAISecretSettings({baseURL, setBaseURL, secretKey, setSecretKey}) { const {t} = useTranslation(); const handleError = useErrorHandler(); @@ -51,15 +51,6 @@ export function OpenAIWhisperSettings({baseURL, setBaseURL, secretKey, setSecret   {checking && } -

- - {t('transcript.lang')} - * {t('transcript.lang2')}.   - {t('helper.eg')} en, zh, fr, de, ja, ru , ...   - {t('helper.see')} ISO-639-1. - - setTargetLanguage(e.target.value)} /> - ); } diff --git a/ui/src/pages/ScenarioLiveRoom.js b/ui/src/pages/ScenarioLiveRoom.js index ea872745..6409820d 100644 --- a/ui/src/pages/ScenarioLiveRoom.js +++ b/ui/src/pages/ScenarioLiveRoom.js @@ -15,7 +15,7 @@ import {buildUrls} from "../components/UrlGenerator"; import {SrsEnvContext} from "../components/SrsEnvContext"; import * as Icon from "react-bootstrap-icons"; import PopoverConfirm from "../components/PopoverConfirm"; -import {OpenAIWhisperSettings} from "../components/OpenAISettings"; +import {OpenAISecretSettings} from "../components/OpenAISettings"; import {AITalkAssistantPanel} from "../components/AITalk"; export default function ScenarioLiveRoom() { @@ -351,11 +351,21 @@ function LiveRoomAssistantConfiguration({room, requesting, updateRoom}) { const [aiProvider, setAiProvider] = React.useState(room.aiProvider || 'openai'); const [aiSecretKey, setAiSecretKey] = React.useState(room.aiSecretKey); const [aiBaseURL, setAiBaseURL] = React.useState(room.aiBaseURL || (language === 'zh' ? '' : 'https://api.openai.com/v1')); + const [aiAsrEnabled, setAiAsrEnabled] = React.useState(room.aiAsrEnabled); + const [aiChatEnabled, setAiChatEnabled] = React.useState(room.aiChatEnabled); + const [aiTtsEnabled, setAiTtsEnabled] = React.useState(room.aiTtsEnabled); const [aiAsrLanguage, setAiAsrLanguage] = React.useState(room.aiAsrLanguage || language); - const [aiChatModel, setAiChatModel] = React.useState(room.aiChatModel || 'gpt-3.5-turbo'); + const [aiChatModel, setAiChatModel] = React.useState(room.aiChatModel || 'gpt-4-turbo-preview'); const [aiChatPrompt, setAiChatPrompt] = React.useState(room.aiChatPrompt || 'You are a helpful assistant.'); const [aiChatMaxWindow, setAiChatMaxWindow] = React.useState(room.aiChatMaxWindow || 5); - const [aiChatMaxWords, setAiChatMaxWords] = React.useState(room.aiChatMaxWords || 30); + const [aiChatMaxWords, setAiChatMaxWords] = React.useState(room.aiChatMaxWords || 300); + + const [configItem, setConfigItem] = React.useState('basic'); + + const changeConfigItem = React.useCallback((e, t) => { + e.preventDefault(); + setConfigItem(t); + }, [setConfigItem]); if (!room.assistant) { return ( @@ -367,49 +377,106 @@ function LiveRoomAssistantConfiguration({room, requesting, updateRoom}) { } return (
- - {t('lr.room.name')} - * {t('lr.room.name2')} - setAiName(e.target.value)} /> - - - {t('lr.room.provider')} - * {t('lr.room.provider2')} - setAiProvider(e.target.value)}> - - - - - - - {t('lr.room.model')} - * {t('lr.room.model2')} - setAiChatModel(e.target.value)} /> - - - {t('lr.room.prompt')} - * {t('lr.room.prompt2')} - setAiChatPrompt(e.target.value)} /> - - - {t('lr.room.window')} - * {t('lr.room.window2')} - setAiChatMaxWindow(e.target.value)} /> - - - {t('lr.room.words')} - * {t('lr.room.words2')} - setAiChatMaxWords(e.target.value)} /> - + + + + + {configItem === 'basic' && + + {t('lr.room.name')} + * {t('lr.room.name2')} + setAiName(e.target.value)} /> + + } + {configItem === 'provider' && + + {t('lr.room.provider')} + * {t('lr.room.provider2')} + setAiProvider(e.target.value)}> + + + + + + } + {configItem === 'asr' && + + + setAiAsrEnabled(!aiAsrEnabled)} /> + + + + {t('transcript.lang')} + * {t('transcript.lang2')}.   + {t('helper.eg')} en, zh, fr, de, ja, ru , ...   + {t('helper.see')} ISO-639-1. + + setAiAsrLanguage(e.target.value)} /> + + } + {configItem === 'chat' && + + + setAiChatEnabled(!aiChatEnabled)} /> + + + + {t('lr.room.model')} + * {t('lr.room.model2')} + setAiChatModel(e.target.value)} /> + + + {t('lr.room.prompt')} + * {t('lr.room.prompt2')} + setAiChatPrompt(e.target.value)} /> + + + {t('lr.room.window')} + * {t('lr.room.window2')} + setAiChatMaxWindow(e.target.value)} /> + + + {t('lr.room.words')} + * {t('lr.room.words2')} + setAiChatMaxWords(e.target.value)} /> + + } + {configItem === 'tts' && + + + setAiTtsEnabled(!aiTtsEnabled)} /> + + + } + +

diff --git a/ui/src/pages/ScenarioTranscript.js b/ui/src/pages/ScenarioTranscript.js
index fddd20fa..4f61d73b 100644
--- a/ui/src/pages/ScenarioTranscript.js
+++ b/ui/src/pages/ScenarioTranscript.js
@@ -6,7 +6,7 @@ import {Token} from "../utils";
 import axios from "axios";
 import {useErrorHandler} from "react-error-boundary";
 import PopoverConfirm from "../components/PopoverConfirm";
-import {OpenAIWhisperSettings} from "../components/OpenAISettings";
+import {OpenAISecretSettings} from "../components/OpenAISettings";
 
 export default function ScenarioTranscript(props) {
   const handleError = useErrorHandler();
@@ -280,7 +280,16 @@ function ScenarioTranscriptImpl({activeKey, defaultEnabled, defaultConf, default
         {t('transcript.service')}
-        [stripped JSX: <OpenAIWhisperSettings .../> taking baseURL, secretKey and targetLanguage props]
+        [stripped JSX: <OpenAISecretSettings .../> taking only baseURL and secretKey props]

+        [stripped JSX: the target-language Form group moved here from
+         OpenAISettings.js — label {t('transcript.lang')}, help text
+         "* {t('transcript.lang2')}. {t('helper.eg')} en, zh, fr, de, ja, ru, ...
+         {t('helper.see')} ISO-639-1", and an input bound to targetLanguage via
+         onChange={(e) => setTargetLanguage(e.target.value)}]
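
Note: the new text path can be exercised without the web UI. The sketch below is illustrative only and not part of this commit — the server address, room/stage IDs, and tokens are placeholders, and it assumes the room token in the request body is enough to authorize the call, as in the popout flow; the endpoints and field names follow the handler in platform/ai-talk.go.

    // ai_talk_text_client.go: illustrative sketch, not part of this commit.
    package main

    import (
        "bytes"
        "encoding/json"
        "fmt"
        "net/http"
    )

    // post sends a JSON body and decodes the {"data": ...} envelope that
    // ohttp.WriteData produces on the server side.
    func post(url string, body, data interface{}) error {
        b, err := json.Marshal(body)
        if err != nil {
            return err
        }
        resp, err := http.Post(url, "application/json", bytes.NewReader(b))
        if err != nil {
            return err
        }
        defer resp.Body.Close()
        var envelope struct {
            Data json.RawMessage `json:"data"`
        }
        if err := json.NewDecoder(resp.Body).Decode(&envelope); err != nil {
            return err
        }
        return json.Unmarshal(envelope.Data, data)
    }

    func main() {
        // Placeholders: in practice these come from the live room settings
        // and the /terraform/v1/ai-talk/stage/start response.
        server := "http://localhost:2022"
        room, roomToken := "room-uuid", "room-token"
        sid, userID := "stage-sid", "user-id"

        // 1. Open a conversation (a request) on the stage.
        var conv struct {
            RequestUUID string `json:"rid"`
        }
        if err := post(server+"/terraform/v1/ai-talk/stage/conversation", map[string]interface{}{
            "room": room, "roomToken": roomToken, "sid": sid,
        }, &conv); err != nil {
            panic(err)
        }

        // 2. Upload a text message instead of base64 audio. The handler
        // assigns it to sreq.asrText, so ASR is skipped and the chat and
        // TTS stages run as usual.
        var up struct {
            RequestUUID string `json:"rid"`
            ASR         string `json:"asr"`
        }
        if err := post(server+"/terraform/v1/ai-talk/stage/upload", map[string]interface{}{
            "room": room, "roomToken": roomToken, "sid": sid,
            "rid": conv.RequestUUID, "userId": userID, "text": "Hello!",
        }, &up); err != nil {
            panic(err)
        }
        fmt.Printf("rid=%v, text=%v\n", up.RequestUUID, up.ASR)
    }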