From 2cc35a43006ff72eae77b3db3ac89ae339ced660 Mon Sep 17 00:00:00 2001 From: Matt Gotteiner Date: Wed, 16 Apr 2025 16:08:10 -0700 Subject: [PATCH 01/10] checkpoint --- app/backend/app.py | 16 +- app/backend/approaches/approach.py | 23 +- app/backend/approaches/chatapproach.py | 181 +++++++----- .../approaches/chatreadretrieveread.py | 105 +++---- .../approaches/chatreadretrievereadvision.py | 86 +++--- app/backend/approaches/retrievethenread.py | 73 ++--- .../approaches/retrievethenreadvision.py | 76 ++--- app/backend/error.py | 1 + app/frontend/src/api/api.ts | 6 +- app/frontend/src/api/models.ts | 27 +- .../AnalysisPanel/AnalysisPanel.tsx | 13 +- app/frontend/src/components/Answer/Answer.tsx | 18 +- .../src/components/Answer/AnswerParser.tsx | 6 +- app/frontend/src/index.tsx | 4 +- app/frontend/src/pages/ask/Ask.tsx | 22 +- app/frontend/src/pages/chat/Chat.tsx | 260 ++++++++++-------- 16 files changed, 501 insertions(+), 416 deletions(-) diff --git a/app/backend/app.py b/app/backend/app.py index 263fcf06a6..3d6ce2dc25 100644 --- a/app/backend/app.py +++ b/app/backend/app.py @@ -8,6 +8,7 @@ from collections.abc import AsyncGenerator from pathlib import Path from typing import Any, Union, cast +import traceback from azure.cognitiveservices.speech import ( ResultReason, @@ -188,10 +189,11 @@ async def ask(auth_claims: dict[str, Any]): approach = cast(Approach, current_app.config[CONFIG_ASK_VISION_APPROACH]) else: approach = cast(Approach, current_app.config[CONFIG_ASK_APPROACH]) - r = await approach.run( + result = await approach.run( request_json["messages"], context=context, session_state=request_json.get("session_state") ) - return jsonify(r) + results = [r async for r in result] + return jsonify({"value": results}) except Exception as error: return error_response(error, "/ask") @@ -208,6 +210,7 @@ async def format_as_ndjson(r: AsyncGenerator[dict, None]) -> AsyncGenerator[str, async for event in r: yield json.dumps(event, ensure_ascii=False, cls=JSONEncoder) + "\n" except Exception as error: + traceback.print_exc() logging.exception("Exception while generating response stream: %s", error) yield json.dumps(error_dict(error)) @@ -241,7 +244,8 @@ async def chat(auth_claims: dict[str, Any]): context=context, session_state=session_state, ) - return jsonify(result) + results = [r async for r in result] + return jsonify({"value": results}) except Exception as error: return error_response(error, "/chat") @@ -790,12 +794,12 @@ def create_app(): # Log levels should be one of https://docs.python.org/3/library/logging.html#logging-levels # Set root level to WARNING to avoid seeing overly verbose logs from SDKS - logging.basicConfig(level=logging.WARNING) + logging.basicConfig(level=logging.INFO) # Set our own logger levels to INFO by default - app_level = os.getenv("APP_LOG_LEVEL", "INFO") + app_level = os.getenv("APP_LOG_LEVEL", "DEBUG") app.logger.setLevel(os.getenv("APP_LOG_LEVEL", app_level)) + app.logger.setLevel("DEBUG") logging.getLogger("scripts").setLevel(app_level) - if allowed_origin := os.getenv("ALLOWED_ORIGIN"): allowed_origins = allowed_origin.split(";") if len(allowed_origins) > 0: diff --git a/app/backend/approaches/approach.py b/app/backend/approaches/approach.py index 59f1909a54..3d3ed6edca 100644 --- a/app/backend/approaches/approach.py +++ b/app/backend/approaches/approach.py @@ -88,31 +88,23 @@ def trim_embedding(cls, embedding: Optional[list[float]]) -> Optional[str]: return None +@dataclass +class DataPoints: + text: Optional[list[str]] = None + images: Optional[list] = None 
@dataclass class ThoughtStep: title: str description: Optional[Any] props: Optional[dict[str, Any]] = None + data_points: Optional[DataPoints] = None def update_token_usage(self, usage: CompletionUsage) -> None: if self.props: self.props["token_usage"] = TokenUsageProps.from_completion_usage(usage) -@dataclass -class DataPoints: - text: Optional[list[str]] = None - images: Optional[list] = None - - -@dataclass -class ExtraInfo: - data_points: DataPoints - thoughts: Optional[list[ThoughtStep]] = None - followup_questions: Optional[list[Any]] = None - - @dataclass class TokenUsageProps: prompt_tokens: int @@ -403,6 +395,7 @@ def format_thought_step_for_chatcompletion( deployment: Optional[str], usage: Optional[CompletionUsage] = None, reasoning_effort: Optional[ChatCompletionReasoningEffort] = None, + data_points: Optional[DataPoints] = None, ) -> ThoughtStep: properties: dict[str, Any] = {"model": model} if deployment: @@ -414,14 +407,14 @@ def format_thought_step_for_chatcompletion( ) if usage: properties["token_usage"] = TokenUsageProps.from_completion_usage(usage) - return ThoughtStep(title, messages, properties) + return ThoughtStep(title, messages, properties, data_points) async def run( self, messages: list[ChatCompletionMessageParam], session_state: Any = None, context: dict[str, Any] = {}, - ) -> dict[str, Any]: + ) -> AsyncGenerator[dict[str, Any], None]: raise NotImplementedError async def run_stream( diff --git a/app/backend/approaches/chatapproach.py b/app/backend/approaches/chatapproach.py index 346c9f3b0a..1559c20a5a 100644 --- a/app/backend/approaches/chatapproach.py +++ b/app/backend/approaches/chatapproach.py @@ -2,7 +2,7 @@ import re from abc import ABC, abstractmethod from collections.abc import AsyncGenerator, Awaitable -from typing import Any, Optional, Union, cast +from typing import Any, Optional, List, Union from openai import AsyncStream from openai.types.chat import ( @@ -13,9 +13,57 @@ from approaches.approach import ( Approach, - ExtraInfo, + ThoughtStep ) +class StreamingThoughtStep: + def __init__(self, step: ThoughtStep, chat_completion: Optional[Union[Awaitable[ChatCompletion], Awaitable[AsyncStream[ChatCompletionChunk]]]] = None, role: Optional[str] = "assistant"): + self.step = step + self.chat_completion = chat_completion + self.role = role + self._stream = None + self._is_streaming = None + + def has_content(self) -> bool: + return self.chat_completion is not None + + def __aiter__(self): + return self + + async def start(self): + if self._stream is None and self.chat_completion is not None: + self._stream = await self.chat_completion + self._is_streaming = True + + async def __anext__(self) -> Union[ChatCompletion, ChatCompletionChunk, ThoughtStep]: + if self._is_streaming: + # Streaming Implementation: yield each chunk, then the step with token usage + if self._stream is None: + raise StopAsyncIteration + + try: + # Get the next chunk from the async stream + chunk = await self._stream.__anext__() + if len(chunk.choices) == 0 and chunk.usage: + self.step.update_token_usage(chunk.usage) + return chunk + except StopAsyncIteration: + # If the stream is exhausted, yield the step with token usage + self._stream = None + return self.step + + # Non-Streaming Implementation: return the entire response, then the step with token usage + if self._stream is None: + if self.step is None: + raise StopAsyncIteration + + result = self.step + self.step = None + return result + + result = self._stream + self._stream = None + return result class ChatApproach(Approach, 
ABC): @@ -24,7 +72,7 @@ class ChatApproach(Approach, ABC): @abstractmethod async def run_until_final_call( self, messages, overrides, auth_claims, should_stream - ) -> tuple[ExtraInfo, Union[Awaitable[ChatCompletion], Awaitable[AsyncStream[ChatCompletionChunk]]]]: + ) -> AsyncGenerator[StreamingThoughtStep, None]: pass def get_search_query(self, chat_completion: ChatCompletion, user_query: str): @@ -45,7 +93,7 @@ def get_search_query(self, chat_completion: ChatCompletion, user_query: str): return query_text return user_query - def extract_followup_questions(self, content: Optional[str]): + def extract_followup_questions(self, content: Optional[str]) -> Optional[List[str]]: if content is None: return content, [] return content.split("<<")[0], re.findall(r"<<([^>>]+)>>", content) @@ -56,25 +104,32 @@ async def run_without_streaming( overrides: dict[str, Any], auth_claims: dict[str, Any], session_state: Any = None, - ) -> dict[str, Any]: - extra_info, chat_coroutine = await self.run_until_final_call( + ) -> AsyncGenerator[dict[str, Any], None]: + thoughts = self.run_until_final_call( messages, overrides, auth_claims, should_stream=False ) - chat_completion_response: ChatCompletion = await cast(Awaitable[ChatCompletion], chat_coroutine) - content = chat_completion_response.choices[0].message.content - role = chat_completion_response.choices[0].message.role - if overrides.get("suggest_followup_questions"): - content, followup_questions = self.extract_followup_questions(content) - extra_info.followup_questions = followup_questions - # Assume last thought is for generating answer - if self.include_token_usage and extra_info.thoughts and chat_completion_response.usage: - extra_info.thoughts[-1].update_token_usage(chat_completion_response.usage) - chat_app_response = { - "message": {"content": content, "role": role}, - "context": extra_info, - "session_state": session_state, - } - return chat_app_response + async for thought in thoughts: + content = None + role = None + thought_step = None + followup_questions = None + await thought.start() + async for chunk in thought: + if isinstance(chunk, ChatCompletion): + content = chunk.choices[0].message.content + role = chunk.choices[0].message.role + elif isinstance(chunk, ThoughtStep): + thought_step = chunk + + if overrides.get("suggest_followup_questions"): + content, followup_questions = self.extract_followup_questions(content) + followup_questions = followup_questions + + yield { + "message": {"content": content, "role": role}, + "context": { "thought": thought_step, "followup_questions": followup_questions }, + "session_state": session_state, + } async def run_with_streaming( self, @@ -82,53 +137,47 @@ async def run_with_streaming( overrides: dict[str, Any], auth_claims: dict[str, Any], session_state: Any = None, - ) -> AsyncGenerator[dict, None]: - extra_info, chat_coroutine = await self.run_until_final_call( + ) -> AsyncGenerator[dict[str, Any], None]: + thoughts = self.run_until_final_call( messages, overrides, auth_claims, should_stream=True ) - chat_coroutine = cast(Awaitable[AsyncStream[ChatCompletionChunk]], chat_coroutine) - yield {"delta": {"role": "assistant"}, "context": extra_info, "session_state": session_state} - - followup_questions_started = False - followup_content = "" - async for event_chunk in await chat_coroutine: - # "2023-07-01-preview" API version has a bug where first response has empty choices - event = event_chunk.model_dump() # Convert pydantic model to dict - if event["choices"]: - # No usage during streaming - 
completion = { - "delta": { - "content": event["choices"][0]["delta"].get("content"), - "role": event["choices"][0]["delta"]["role"], - } - } - # if event contains << and not >>, it is start of follow-up question, truncate - content = completion["delta"].get("content") - content = content or "" # content may either not exist in delta, or explicitly be None - if overrides.get("suggest_followup_questions") and "<<" in content: - followup_questions_started = True - earlier_content = content[: content.index("<<")] - if earlier_content: - completion["delta"]["content"] = earlier_content - yield completion - followup_content += content[content.index("<<") :] - elif followup_questions_started: - followup_content += content - else: - yield completion - else: - # Final chunk at end of streaming should contain usage - # https://cookbook.openai.com/examples/how_to_stream_completions#4-how-to-get-token-usage-data-for-streamed-chat-completion-response - if event_chunk.usage and extra_info.thoughts and self.include_token_usage: - extra_info.thoughts[-1].update_token_usage(event_chunk.usage) - yield {"delta": {"role": "assistant"}, "context": extra_info, "session_state": session_state} - - if followup_content: - _, followup_questions = self.extract_followup_questions(followup_content) - yield { - "delta": {"role": "assistant"}, - "context": {"context": extra_info, "followup_questions": followup_questions}, - } + async for thought in thoughts: + yield { "delta": { "role": thought.role }, "has_content": thought.has_content() } + + followup_questions_started = False + followup_content = "" + thought_step = None + await thought.start() + async for event in thought: + if isinstance(event, ChatCompletionChunk): + if event.choices: + completion = { + "delta": { + "content": event.choices[0].delta.content, + "role": event.choices[0].delta.role + } + } + # if event contains << and not >>, it is start of follow-up question, truncate + content = completion["delta"].get("content") + content = content or "" # content may either not exist in delta, or explicitly be None + if overrides.get("suggest_followup_questions") and "<<" in content: + followup_questions_started = True + earlier_content = content[: content.index("<<")] + if earlier_content: + completion["delta"]["content"] = earlier_content + yield completion + followup_content += content[content.index("<<") :] + elif followup_questions_started: + followup_content += content + else: + yield completion + elif isinstance(event, ThoughtStep): + thought_step = event + + followup_questions = None + if followup_content: + _, followup_questions = self.extract_followup_questions(followup_content) + yield {"delta": {"role": thought.role, "finish_reason": "stop" }, "context": { "thought": thought_step, "followup_questions": followup_questions }, "session_state": session_state } async def run( self, diff --git a/app/backend/approaches/chatreadretrieveread.py b/app/backend/approaches/chatreadretrieveread.py index 249c7247b2..9f25c57a36 100644 --- a/app/backend/approaches/chatreadretrieveread.py +++ b/app/backend/approaches/chatreadretrieveread.py @@ -1,5 +1,5 @@ from collections.abc import Awaitable -from typing import Any, Optional, Union, cast +from typing import Any, Optional, Union, cast, AsyncGenerator from azure.search.documents.aio import SearchClient from azure.search.documents.models import VectorQuery @@ -11,8 +11,8 @@ ChatCompletionToolParam, ) -from approaches.approach import DataPoints, ExtraInfo, ThoughtStep -from approaches.chatapproach import ChatApproach 
+from approaches.approach import DataPoints, ThoughtStep +from approaches.chatapproach import ChatApproach, StreamingThoughtStep from approaches.promptmanager import PromptManager from core.authentication import AuthenticationHelper @@ -67,7 +67,7 @@ async def run_until_final_call( overrides: dict[str, Any], auth_claims: dict[str, Any], should_stream: bool = False, - ) -> tuple[ExtraInfo, Union[Awaitable[ChatCompletion], Awaitable[AsyncStream[ChatCompletionChunk]]]]: + ) -> AsyncGenerator[StreamingThoughtStep, None]: use_text_search = overrides.get("retrieval_mode") in ["text", "hybrid", None] use_vector_search = overrides.get("retrieval_mode") in ["vectors", "hybrid", None] use_semantic_ranker = True if overrides.get("semantic_ranker") else False @@ -88,13 +88,12 @@ async def run_until_final_call( f"{self.chatgpt_model} does not support streaming. Please use a different model or disable streaming." ) + # STEP 1: Generate an optimized keyword search query based on the chat history and the last question query_messages = self.prompt_manager.render_prompt( self.query_rewrite_prompt, {"user_query": original_user_query, "past_messages": messages[:-1]} ) tools: list[ChatCompletionToolParam] = self.query_rewrite_tools - # STEP 1: Generate an optimized keyword search query based on the chat history and the last question - chat_completion = cast( ChatCompletion, await self.create_chat_completion( @@ -111,9 +110,38 @@ async def run_until_final_call( ), ) + yield StreamingThoughtStep( + step=self.format_thought_step_for_chatcompletion( + title="Prompt to generate search query", + messages=query_messages, + overrides=overrides, + model=self.chatgpt_model, + deployment=self.chatgpt_deployment, + usage=chat_completion.usage, + reasoning_effort="low", + ), + role="tool" + ) + query_text = self.get_search_query(chat_completion, original_user_query) # STEP 2: Retrieve relevant documents from the search index with the GPT optimized query + yield StreamingThoughtStep( + step=ThoughtStep( + "Search using generated search query", + query_text, + { + "use_semantic_captions": use_semantic_captions, + "use_semantic_ranker": use_semantic_ranker, + "use_query_rewriting": use_query_rewriting, + "top": top, + "filter": filter, + "use_vector_search": use_vector_search, + "use_text_search": use_text_search, + }, + ), + role="tool" + ) # If retrieval mode includes vectors, compute an embedding for the query vectors: list[VectorQuery] = [] @@ -134,6 +162,14 @@ async def run_until_final_call( use_query_rewriting, ) + yield StreamingThoughtStep( + step=ThoughtStep( + "Search results", + [result.serialize_for_results() for result in results], + ), + role="tool" + ) + # STEP 3: Generate a contextual and content specific answer using the search results and chat history text_sources = self.get_sources_content(results, use_semantic_captions, use_image_citation=False) messages = self.prompt_manager.render_prompt( @@ -147,55 +183,22 @@ async def run_until_final_call( }, ) - extra_info = ExtraInfo( - DataPoints(text=text_sources), - thoughts=[ - self.format_thought_step_for_chatcompletion( - title="Prompt to generate search query", - messages=query_messages, - overrides=overrides, - model=self.chatgpt_model, - deployment=self.chatgpt_deployment, - usage=chat_completion.usage, - reasoning_effort="low", - ), - ThoughtStep( - "Search using generated search query", - query_text, - { - "use_semantic_captions": use_semantic_captions, - "use_semantic_ranker": use_semantic_ranker, - "use_query_rewriting": use_query_rewriting, - "top": 
top, - "filter": filter, - "use_vector_search": use_vector_search, - "use_text_search": use_text_search, - }, - ), - ThoughtStep( - "Search results", - [result.serialize_for_results() for result in results], - ), - self.format_thought_step_for_chatcompletion( - title="Prompt to generate answer", - messages=messages, - overrides=overrides, - model=self.chatgpt_model, - deployment=self.chatgpt_deployment, - usage=None, - ), - ], - ) - - chat_coroutine = cast( - Union[Awaitable[ChatCompletion], Awaitable[AsyncStream[ChatCompletionChunk]]], - self.create_chat_completion( + yield StreamingThoughtStep( + step=self.format_thought_step_for_chatcompletion( + title="Prompt to generate answer", + messages=messages, + overrides=overrides, + model=self.chatgpt_model, + deployment=self.chatgpt_deployment, + usage=None, + data_points=DataPoints(text=text_sources) + ), + chat_completion=self.create_chat_completion( self.chatgpt_deployment, self.chatgpt_model, messages, overrides, self.get_response_token_limit(self.chatgpt_model, 1024), should_stream, - ), + ) ) - return (extra_info, chat_coroutine) diff --git a/app/backend/approaches/chatreadretrievereadvision.py b/app/backend/approaches/chatreadretrievereadvision.py index b56d773a6f..1c6a360ed8 100644 --- a/app/backend/approaches/chatreadretrievereadvision.py +++ b/app/backend/approaches/chatreadretrievereadvision.py @@ -1,5 +1,5 @@ from collections.abc import Awaitable -from typing import Any, Callable, Optional, Union, cast +from typing import Any, Callable, Optional, Union, cast, AsyncGenerator from azure.search.documents.aio import SearchClient from azure.storage.blob.aio import ContainerClient @@ -11,8 +11,8 @@ ChatCompletionToolParam, ) -from approaches.approach import DataPoints, ExtraInfo, ThoughtStep -from approaches.chatapproach import ChatApproach +from approaches.approach import DataPoints, ThoughtStep +from approaches.chatapproach import ChatApproach, StreamingThoughtStep from approaches.promptmanager import PromptManager from core.authentication import AuthenticationHelper from core.imageshelper import fetch_image @@ -77,7 +77,7 @@ async def run_until_final_call( overrides: dict[str, Any], auth_claims: dict[str, Any], should_stream: bool = False, - ) -> tuple[ExtraInfo, Union[Awaitable[ChatCompletion], Awaitable[AsyncStream[ChatCompletionChunk]]]]: + ) -> AsyncGenerator[StreamingThoughtStep, None]: seed = overrides.get("seed", None) use_text_search = overrides.get("retrieval_mode") in ["text", "hybrid", None] use_vector_search = overrides.get("retrieval_mode") in ["vectors", "hybrid", None] @@ -104,6 +104,19 @@ async def run_until_final_call( tools: list[ChatCompletionToolParam] = self.query_rewrite_tools # STEP 1: Generate an optimized keyword search query based on the chat history and the last question + yield StreamingThoughtStep( + step=ThoughtStep( + "Prompt to generate search query", + query_messages, + ( + {"model": self.chatgpt_model, "deployment": self.chatgpt_deployment} + if self.chatgpt_deployment + else {"model": self.chatgpt_model} + ), + ), + role="tool" + ) + chat_completion: ChatCompletion = await self.openai_client.chat.completions.create( messages=query_messages, # Azure OpenAI takes the deployment name as the model name @@ -144,6 +157,14 @@ async def run_until_final_call( use_query_rewriting, ) + yield StreamingThoughtStep( + step=ThoughtStep( + "Search results", + [result.serialize_for_results() for result in results] + ), + role="tool" + ) + # STEP 3: Generate a contextual and content specific answer using the search 
results and chat history text_sources = [] image_sources = [] @@ -167,50 +188,18 @@ async def run_until_final_call( }, ) - extra_info = ExtraInfo( - DataPoints(text=text_sources, images=image_sources), - [ - ThoughtStep( - "Prompt to generate search query", - query_messages, - ( - {"model": self.chatgpt_model, "deployment": self.chatgpt_deployment} - if self.chatgpt_deployment - else {"model": self.chatgpt_model} - ), + yield StreamingThoughtStep( + step=ThoughtStep( + "Prompt to generate answer", + messages, + ( + {"model": self.gpt4v_model, "deployment": self.gpt4v_deployment} + if self.gpt4v_deployment + else {"model": self.gpt4v_model} ), - ThoughtStep( - "Search using generated search query", - query_text, - { - "use_semantic_captions": use_semantic_captions, - "use_semantic_ranker": use_semantic_ranker, - "use_query_rewriting": use_query_rewriting, - "top": top, - "filter": filter, - "vector_fields": vector_fields, - "use_text_search": use_text_search, - }, - ), - ThoughtStep( - "Search results", - [result.serialize_for_results() for result in results], - ), - ThoughtStep( - "Prompt to generate answer", - messages, - ( - {"model": self.gpt4v_model, "deployment": self.gpt4v_deployment} - if self.gpt4v_deployment - else {"model": self.gpt4v_model} - ), - ), - ], - ) - - chat_coroutine = cast( - Union[Awaitable[ChatCompletion], Awaitable[AsyncStream[ChatCompletionChunk]]], - self.openai_client.chat.completions.create( + data_points=DataPoints(text=text_sources, images=image_sources) + ), + chat_completion=self.openai_client.chat.completions.create( model=self.gpt4v_deployment if self.gpt4v_deployment else self.gpt4v_model, messages=messages, temperature=overrides.get("temperature", 0.3), @@ -218,6 +207,5 @@ async def run_until_final_call( n=1, stream=should_stream, seed=seed, - ), + ) ) - return (extra_info, chat_coroutine) diff --git a/app/backend/approaches/retrievethenread.py b/app/backend/approaches/retrievethenread.py index 8bdbb9785e..8eb74a69d4 100644 --- a/app/backend/approaches/retrievethenread.py +++ b/app/backend/approaches/retrievethenread.py @@ -1,11 +1,11 @@ -from typing import Any, Optional, cast +from typing import Any, Optional, cast, AsyncGenerator from azure.search.documents.aio import SearchClient from azure.search.documents.models import VectorQuery from openai import AsyncOpenAI from openai.types.chat import ChatCompletion, ChatCompletionMessageParam -from approaches.approach import Approach, DataPoints, ExtraInfo, ThoughtStep +from approaches.approach import Approach, DataPoints, ThoughtStep from approaches.promptmanager import PromptManager from core.authentication import AuthenticationHelper @@ -58,7 +58,7 @@ async def run( messages: list[ChatCompletionMessageParam], session_state: Any = None, context: dict[str, Any] = {}, - ) -> dict[str, Any]: + ) -> AsyncGenerator[dict[str, Any], None]: q = messages[-1]["content"] if not isinstance(q, str): raise ValueError("The most recent message content must be a string.") @@ -73,6 +73,26 @@ async def run( minimum_search_score = overrides.get("minimum_search_score", 0.0) minimum_reranker_score = overrides.get("minimum_reranker_score", 0.0) filter = self.build_filter(overrides, auth_claims) + + yield { + "context": { + "thought": ThoughtStep( + "Search using user query", + q, + { + "use_semantic_captions": use_semantic_captions, + "use_semantic_ranker": use_semantic_ranker, + "use_query_rewriting": use_query_rewriting, + "top": top, + "filter": filter, + "use_vector_search": use_vector_search, + "use_text_search": 
use_text_search, + }, + ) + }, + "session_state": session_state, + } + # If retrieval mode includes vectors, compute an embedding for the query vectors: list[VectorQuery] = [] @@ -101,6 +121,16 @@ async def run( | {"user_query": q, "text_sources": text_sources}, ) + yield { + "context": { + "thought": ThoughtStep( + "Search results", + [result.serialize_for_results() for result in results], + ) + }, + "session_state": session_state + } + chat_completion = cast( ChatCompletion, await self.create_chat_completion( @@ -112,42 +142,21 @@ async def run( ), ) - extra_info = ExtraInfo( - DataPoints(text=text_sources), - thoughts=[ - ThoughtStep( - "Search using user query", - q, - { - "use_semantic_captions": use_semantic_captions, - "use_semantic_ranker": use_semantic_ranker, - "use_query_rewriting": use_query_rewriting, - "top": top, - "filter": filter, - "use_vector_search": use_vector_search, - "use_text_search": use_text_search, - }, - ), - ThoughtStep( - "Search results", - [result.serialize_for_results() for result in results], - ), - self.format_thought_step_for_chatcompletion( + yield { + "message": { + "content": chat_completion.choices[0].message.content, + "role": chat_completion.choices[0].message.role, + }, + "context": { + "thought": self.format_thought_step_for_chatcompletion( title="Prompt to generate answer", messages=messages, overrides=overrides, model=self.chatgpt_model, deployment=self.chatgpt_deployment, usage=chat_completion.usage, + data_points=DataPoints(text=text_sources) ), - ], - ) - - return { - "message": { - "content": chat_completion.choices[0].message.content, - "role": chat_completion.choices[0].message.role, }, - "context": extra_info, "session_state": session_state, } diff --git a/app/backend/approaches/retrievethenreadvision.py b/app/backend/approaches/retrievethenreadvision.py index a556fd8b6c..af92a7a509 100644 --- a/app/backend/approaches/retrievethenreadvision.py +++ b/app/backend/approaches/retrievethenreadvision.py @@ -1,5 +1,5 @@ from collections.abc import Awaitable -from typing import Any, Callable, Optional +from typing import Any, Callable, Optional, AsyncGenerator from azure.search.documents.aio import SearchClient from azure.storage.blob.aio import ContainerClient @@ -8,7 +8,7 @@ ChatCompletionMessageParam, ) -from approaches.approach import Approach, DataPoints, ExtraInfo, ThoughtStep +from approaches.approach import Approach, DataPoints, ThoughtStep from approaches.promptmanager import PromptManager from core.authentication import AuthenticationHelper from core.imageshelper import fetch_image @@ -66,7 +66,7 @@ async def run( messages: list[ChatCompletionMessageParam], session_state: Any = None, context: dict[str, Any] = {}, - ) -> dict[str, Any]: + ) -> AsyncGenerator[dict[str, Any], None]: q = messages[-1]["content"] if not isinstance(q, str): raise ValueError("The most recent message content must be a string.") @@ -87,6 +87,26 @@ async def run( vector_fields = overrides.get("vector_fields", ["embedding"]) send_text_to_gptvision = overrides.get("gpt4v_input") in ["textAndImages", "texts", None] send_images_to_gptvision = overrides.get("gpt4v_input") in ["textAndImages", "images", None] + + yield { + "context": { + "thought": ThoughtStep( + "Search using user query", + q, + { + "use_semantic_captions": use_semantic_captions, + "use_semantic_ranker": use_semantic_ranker, + "use_query_rewriting": use_query_rewriting, + "top": top, + "filter": filter, + "vector_fields": vector_fields, + "use_vector_search": use_vector_search, + "use_text_search": 
use_text_search, + }, + ), + }, + "session_state": session_state, + } # If retrieval mode includes vectors, compute an embedding for the query vectors = [] @@ -113,6 +133,16 @@ async def run( use_query_rewriting, ) + yield { + "context": { + "thought": ThoughtStep( + "Search results", + [result.serialize_for_results() for result in results], + ) + }, + "session_state": session_state + } + # Process results text_sources = [] image_sources = [] @@ -139,28 +169,13 @@ async def run( seed=seed, ) - extra_info = ExtraInfo( - DataPoints(text=text_sources, images=image_sources), - [ - ThoughtStep( - "Search using user query", - q, - { - "use_semantic_captions": use_semantic_captions, - "use_semantic_ranker": use_semantic_ranker, - "use_query_rewriting": use_query_rewriting, - "top": top, - "filter": filter, - "vector_fields": vector_fields, - "use_vector_search": use_vector_search, - "use_text_search": use_text_search, - }, - ), - ThoughtStep( - "Search results", - [result.serialize_for_results() for result in results], - ), - ThoughtStep( + yield { + "message": { + "content": chat_completion.choices[0].message.content, + "role": chat_completion.choices[0].message.role, + }, + "context": { + "thought": ThoughtStep( "Prompt to generate answer", messages, ( @@ -168,15 +183,8 @@ async def run( if self.gpt4v_deployment else {"model": self.gpt4v_model} ), - ), - ], - ) - - return { - "message": { - "content": chat_completion.choices[0].message.content, - "role": chat_completion.choices[0].message.role, + data_points=DataPoints(text=text_sources, images=image_sources), + ) }, - "context": extra_info, "session_state": session_state, } diff --git a/app/backend/error.py b/app/backend/error.py index 0a21afe6b7..e761847e73 100644 --- a/app/backend/error.py +++ b/app/backend/error.py @@ -2,6 +2,7 @@ from openai import APIError from quart import jsonify +import traceback ERROR_MESSAGE = """The app encountered an error processing your request. If you are an administrator of the app, view the full error in the logs. See aka.ms/appservice-logs for more information. 
diff --git a/app/frontend/src/api/api.ts b/app/frontend/src/api/api.ts index df95f801b5..dc4c30ffc2 100644 --- a/app/frontend/src/api/api.ts +++ b/app/frontend/src/api/api.ts @@ -22,7 +22,7 @@ export async function configApi(): Promise { return (await response.json()) as Config; } -export async function askApi(request: ChatAppRequest, idToken: string | undefined): Promise { +export async function askApi(request: ChatAppRequest, idToken: string | undefined): Promise { const headers = await getHeaders(idToken); const response = await fetch(`${BACKEND_URI}/ask`, { method: "POST", @@ -34,11 +34,11 @@ export async function askApi(request: ChatAppRequest, idToken: string | undefine throw Error(`Request failed with status ${response.status}`); } const parsedResponse: ChatAppResponseOrError = await response.json(); - if (parsedResponse.error) { + if ("error" in parsedResponse) { throw Error(parsedResponse.error); } - return parsedResponse as ChatAppResponse; + return parsedResponse as ChatAppResponse[]; } export async function chatApi(request: ChatAppRequest, shouldStream: boolean, idToken: string | undefined): Promise { diff --git a/app/frontend/src/api/models.ts b/app/frontend/src/api/models.ts index c915a19ee5..bf0bd8bcf1 100644 --- a/app/frontend/src/api/models.ts +++ b/app/frontend/src/api/models.ts @@ -46,33 +46,38 @@ export type ResponseMessage = { role: string; }; -export type Thoughts = { +export type Thought = { title: string; description: any; // It can be any output from the api props?: { [key: string]: any }; + data_points: string[] | null; }; export type ResponseContext = { - data_points: string[]; followup_questions: string[] | null; - thoughts: Thoughts[]; + thought: Thought | null; }; -export type ChatAppResponseOrError = { +export type ChatAppResponseItem = { message: ResponseMessage; - delta: ResponseMessage; - context: ResponseContext; + delta: ResponseMessage | null; + context: ResponseContext | null; session_state: any; - error?: string; }; export type ChatAppResponse = { - message: ResponseMessage; - delta: ResponseMessage; - context: ResponseContext; - session_state: any; + value: ChatAppResponseItem[]; +}; + +export function getLastResponse(response: ChatAppResponse): ChatAppResponseItem | null { + return response.value.length > 0 ? 
response.value[response.value.length - 1] : null; +} +export type ChatAppError = { + error?: string; }; +export type ChatAppResponseOrError = ChatAppResponse | ChatAppError; + export type ChatAppRequestContext = { overrides?: ChatAppRequestOverrides; }; diff --git a/app/frontend/src/components/AnalysisPanel/AnalysisPanel.tsx b/app/frontend/src/components/AnalysisPanel/AnalysisPanel.tsx index 2cee00c761..1ec4a27da3 100644 --- a/app/frontend/src/components/AnalysisPanel/AnalysisPanel.tsx +++ b/app/frontend/src/components/AnalysisPanel/AnalysisPanel.tsx @@ -18,14 +18,14 @@ interface Props { onActiveTabChanged: (tab: AnalysisPanelTabs) => void; activeCitation: string | undefined; citationHeight: string; - answer: ChatAppResponse; + response: ChatAppResponse; } const pivotItemDisabledStyle = { disabled: true, style: { color: "grey" } }; -export const AnalysisPanel = ({ answer, activeTab, activeCitation, citationHeight, className, onActiveTabChanged }: Props) => { - const isDisabledThoughtProcessTab: boolean = !answer.context.thoughts; - const isDisabledSupportingContentTab: boolean = !answer.context.data_points; +export const AnalysisPanel = ({ response, activeTab, activeCitation, citationHeight, className, onActiveTabChanged }: Props) => { + const isDisabledThoughtProcessTab: boolean = !response.value.some(item => item.context?.thought); + const isDisabledSupportingContentTab: boolean = !response.value.some(item => item.context?.thought?.data_points); const isDisabledCitationTab: boolean = !activeCitation; const [citation, setCitation] = useState(""); @@ -82,14 +82,15 @@ export const AnalysisPanel = ({ answer, activeTab, activeCitation, citationHeigh headerText={t("headerTexts.thoughtProcess")} headerButtonProps={isDisabledThoughtProcessTab ? pivotItemDisabledStyle : undefined} > - + item.context?.thought ?? 
[])} /> - + {/* TODO: How do we handle citations?*/} + { const followupQuestions = answer.context?.followup_questions; - const parsedAnswer = useMemo(() => parseAnswerToHtml(answer, isStreaming, onCitationClicked), [answer]); + const parsedAnswer = useMemo(() => parseAnswerToHtml(answer, isStreaming, onCitationClicked), [answer.message.content]); const { t } = useTranslation(); const sanitizedAnswerHtml = DOMPurify.sanitize(parsedAnswer.answerHtml); const [copied, setCopied] = useState(false); @@ -80,7 +82,7 @@ export const Answer = ({ title={t("tooltips.showThoughtProcess")} ariaLabel={t("tooltips.showThoughtProcess")} onClick={() => onThoughtProcessClicked()} - disabled={!answer.context.thoughts?.length} + disabled={!response.value.some(thought => thought.context?.thought)} /> onSupportingContentClicked()} - disabled={!answer.context.data_points} + disabled={!answer.context?.thought?.data_points?.length} /> {showSpeechOutputAzure && ( - + )} {showSpeechOutputBrowser && } diff --git a/app/frontend/src/components/Answer/AnswerParser.tsx b/app/frontend/src/components/Answer/AnswerParser.tsx index 3807592f6d..c44690ea4e 100644 --- a/app/frontend/src/components/Answer/AnswerParser.tsx +++ b/app/frontend/src/components/Answer/AnswerParser.tsx @@ -1,5 +1,5 @@ import { renderToStaticMarkup } from "react-dom/server"; -import { ChatAppResponse, getCitationFilePath } from "../../api"; +import { ChatAppResponseItem, getCitationFilePath } from "../../api"; type HtmlParsedAnswer = { answerHtml: string; @@ -30,8 +30,8 @@ function isCitationValid(contextDataPoints: any, citationCandidate: string): boo return isValidCitation; } -export function parseAnswerToHtml(answer: ChatAppResponse, isStreaming: boolean, onCitationClicked: (citationFilePath: string) => void): HtmlParsedAnswer { - const contextDataPoints = answer.context.data_points; +export function parseAnswerToHtml(answer: ChatAppResponseItem, isStreaming: boolean, onCitationClicked: (citationFilePath: string) => void): HtmlParsedAnswer { + const contextDataPoints = answer.context?.thought?.data_points ?? []; const citations: string[] = []; // Trim any whitespace from the end of the answer after removing follow-up questions diff --git a/app/frontend/src/index.tsx b/app/frontend/src/index.tsx index a8821c8c45..706c28532e 100644 --- a/app/frontend/src/index.tsx +++ b/app/frontend/src/index.tsx @@ -23,8 +23,8 @@ const router = createHashRouter([ element: }, { - path: "qa", - lazy: () => import("./pages/ask/Ask") + path: "qa" + //lazy: () => import("./pages/ask/Ask") }, { path: "*", diff --git a/app/frontend/src/pages/ask/Ask.tsx b/app/frontend/src/pages/ask/Ask.tsx index 8e38076adb..b339d9c05a 100644 --- a/app/frontend/src/pages/ask/Ask.tsx +++ b/app/frontend/src/pages/ask/Ask.tsx @@ -146,12 +146,12 @@ export function Component(): JSX.Element { language: i18n.language, ...(seed !== null ? { seed: seed } : {}) } - }, + } // AI Chat Protocol: Client must pass on any session state received from the server - session_state: answer ? answer.session_state : null + //session_state: answer ? answer.session_state : null }; const result = await askApi(request, token); - setAnswer(result); + //setAnswer(result); setSpeechUrls([null]); } catch (e) { setError(e); @@ -287,21 +287,7 @@ export function Component(): JSX.Element { )} - {!isLoading && answer && !error && ( -
-                            onShowCitation(x)}
-                            onThoughtProcessClicked={() => onToggleTab(AnalysisPanelTabs.ThoughtProcessTab)}
-                            onSupportingContentClicked={() => onToggleTab(AnalysisPanelTabs.SupportingContentTab)}
-                            showSpeechOutputAzure={showSpeechOutputAzure}
-                            showSpeechOutputBrowser={showSpeechOutputBrowser}
-                        />
-
-                    )}
+                    {!isLoading && answer && !error && }

                    {error ? (
makeApiRequest(lastQuestionRef.current)} /> diff --git a/app/frontend/src/pages/chat/Chat.tsx b/app/frontend/src/pages/chat/Chat.tsx index 5d00c2c914..784444b351 100644 --- a/app/frontend/src/pages/chat/Chat.tsx +++ b/app/frontend/src/pages/chat/Chat.tsx @@ -11,13 +11,16 @@ import { chatApi, configApi, RetrievalMode, + ChatAppError, ChatAppResponse, + ChatAppResponseItem, ChatAppResponseOrError, ChatAppRequest, ResponseMessage, VectorFieldOptions, GPT4VInput, - SpeechConfig + SpeechConfig, + getLastResponse } from "../../api"; import { Answer, AnswerError, AnswerLoading } from "../../components/Answer"; import { QuestionInput } from "../../components/QuestionInput"; @@ -37,6 +40,12 @@ import { LoginContext } from "../../loginContext"; import { LanguagePicker } from "../../i18n/LanguagePicker"; import { Settings } from "../../components/Settings/Settings"; +const enum LoadingType { + None = "none", + Generating = "generating", + Thinking = "thinking" +} + const Chat = () => { const [isConfigPanelOpen, setIsConfigPanelOpen] = useState(false); const [isHistoryPanelOpen, setIsHistoryPanelOpen] = useState(false); @@ -65,16 +74,16 @@ const Chat = () => { const lastQuestionRef = useRef(""); const chatMessageStreamEnd = useRef(null); - const [isLoading, setIsLoading] = useState(false); + const [isLoading, setIsLoading] = useState(LoadingType.None); const [isStreaming, setIsStreaming] = useState(false); const [error, setError] = useState(); const [activeCitation, setActiveCitation] = useState(); const [activeAnalysisPanelTab, setActiveAnalysisPanelTab] = useState(undefined); - const [selectedAnswer, setSelectedAnswer] = useState(0); - const [answers, setAnswers] = useState<[user: string, response: ChatAppResponse][]>([]); - const [streamedAnswers, setStreamedAnswers] = useState<[user: string, response: ChatAppResponse][]>([]); + const [selectedResponse, setSelectedResponse] = useState(0); + const [responses, setResponses] = useState<[user: string, response: ChatAppResponse][]>([]); + const [streamedResponses, setStreamedResponses] = useState<[user: string, response: ChatAppResponse][]>([]); const [speechUrls, setSpeechUrls] = useState<(string | null)[]>([]); const [showGPT4VOptions, setShowGPT4VOptions] = useState(false); @@ -129,47 +138,62 @@ const Chat = () => { }); }; - const handleAsyncRequest = async (question: string, answers: [string, ChatAppResponse][], responseBody: ReadableStream) => { - let answer: string = ""; - let askResponse: ChatAppResponse = {} as ChatAppResponse; - - const updateState = (newContent: string) => { + const handleAsyncRequest = async (question: string, responses: [string, ChatAppResponse][], responseBody: ReadableStream) => { + var response: ChatAppResponse = { + value: [] + }; + const updateState = () => { return new Promise(resolve => { setTimeout(() => { - answer += newContent; - const latestResponse: ChatAppResponse = { - ...askResponse, - message: { content: answer, role: askResponse.message.role } - }; - setStreamedAnswers([...answers, [question, latestResponse]]); + setStreamedResponses([...responses, [question, response]]); resolve(null); }, 33); }); }; try { setIsStreaming(true); + for await (const event of readNDJSONStream(responseBody)) { - if (event["context"] && event["context"]["data_points"]) { - event["message"] = event["delta"]; - askResponse = event as ChatAppResponse; - } else if (event["delta"] && event["delta"]["content"]) { - setIsLoading(false); - await updateState(event["delta"]["content"]); - } else if (event["context"]) { - // Update context 
with new keys from latest event - askResponse.context = { ...askResponse.context, ...event["context"] }; - } else if (event["error"]) { - throw Error(event["error"]); + if (event["error"]) { + return { error: event["error"] } as ChatAppError; + } + + if (event["delta"]) { + if (event["delta"]["finish_reason"] == "stop") { + setIsLoading(LoadingType.None); + response.value[response.value.length - 1].context = event["context"]; + response.value[response.value.length - 1].session_state = event["session_state"]; + + await updateState(); + continue; + } + + if ("has_content" in event) { + setIsLoading(event["has_content"] ? LoadingType.Generating : LoadingType.Thinking); + var responseItem: ChatAppResponseItem = { + message: event["has_content"] ? { content: "", role: "" } : null + } as ChatAppResponseItem; + response.value.push(responseItem); + + continue; + } + + if (event["delta"]["content"]) { + setIsLoading(LoadingType.None); + response.value[response.value.length - 1].message.content += event["delta"]["content"]; + response.value[response.value.length - 1].message.role = event["delta"]["role"]; + + await updateState(); + continue; + } } } } finally { setIsStreaming(false); + setIsLoading(LoadingType.None); } - const fullResponse: ChatAppResponse = { - ...askResponse, - message: { content: answer, role: askResponse.message.role } - }; - return fullResponse; + + return response; }; const client = useLogin ? useMsal().instance : undefined; @@ -186,17 +210,20 @@ const Chat = () => { lastQuestionRef.current = question; error && setError(undefined); - setIsLoading(true); + setIsLoading(shouldStream ? LoadingType.Thinking : LoadingType.Generating); setActiveCitation(undefined); setActiveAnalysisPanelTab(undefined); const token = client ? await getToken(client) : undefined; try { - const messages: ResponseMessage[] = answers.flatMap(a => [ - { content: a[0], role: "user" }, - { content: a[1].message.content, role: "assistant" } - ]); + const messages: ResponseMessage[] = responses.flatMap(a => { + let lastMessage = getLastResponse(a[1])?.message; + return [ + { content: a[0], role: "user" }, + { content: lastMessage?.content ?? "", role: lastMessage?.role ?? "assistant" } + ]; + }); const request: ChatAppRequest = { messages: [...messages, { content: question, role: "user" }], @@ -225,7 +252,7 @@ const Chat = () => { } }, // AI Chat Protocol: Client must pass on any session state received from the server - session_state: answers.length ? answers[answers.length - 1][1].session_state : null + session_state: responses.length ? getLastResponse(responses[responses.length - 1][1])?.session_state : null }; const response = await chatApi(request, shouldStream, token); @@ -235,29 +262,22 @@ const Chat = () => { if (response.status > 299 || !response.ok) { throw Error(`Request failed with status ${response.status}`); } - if (shouldStream) { - const parsedResponse: ChatAppResponse = await handleAsyncRequest(question, answers, response.body); - setAnswers([...answers, [question, parsedResponse]]); - if (typeof parsedResponse.session_state === "string" && parsedResponse.session_state !== "") { - const token = client ? 
await getToken(client) : undefined; - historyManager.addItem(parsedResponse.session_state, [...answers, [question, parsedResponse]], token); - } - } else { - const parsedResponse: ChatAppResponseOrError = await response.json(); - if (parsedResponse.error) { - throw Error(parsedResponse.error); - } - setAnswers([...answers, [question, parsedResponse as ChatAppResponse]]); - if (typeof parsedResponse.session_state === "string" && parsedResponse.session_state !== "") { - const token = client ? await getToken(client) : undefined; - historyManager.addItem(parsedResponse.session_state, [...answers, [question, parsedResponse as ChatAppResponse]], token); - } + var parsedResponse: ChatAppResponseOrError = shouldStream ? await handleAsyncRequest(question, responses, response.body) : await response.json(); + + if ("error" in parsedResponse) { + throw Error(parsedResponse.error); + } + setResponses([...responses, [question, parsedResponse as ChatAppResponse]]); + let lastMessage = getLastResponse(parsedResponse as ChatAppResponse) ?? ({} as ChatAppResponseItem); + if (typeof lastMessage.session_state === "string" && lastMessage.session_state !== "") { + const token = client ? await getToken(client) : undefined; + historyManager.addItem(lastMessage.session_state, [...responses, [question, parsedResponse as ChatAppResponse]], token); } setSpeechUrls([...speechUrls, null]); } catch (e) { setError(e); } finally { - setIsLoading(false); + setIsLoading(LoadingType.None); } }; @@ -266,15 +286,15 @@ const Chat = () => { error && setError(undefined); setActiveCitation(undefined); setActiveAnalysisPanelTab(undefined); - setAnswers([]); + setResponses([]); setSpeechUrls([]); - setStreamedAnswers([]); - setIsLoading(false); + setStreamedResponses([]); + setIsLoading(LoadingType.None); setIsStreaming(false); }; useEffect(() => chatMessageStreamEnd.current?.scrollIntoView({ behavior: "smooth" }), [isLoading]); - useEffect(() => chatMessageStreamEnd.current?.scrollIntoView({ behavior: "auto" }), [streamedAnswers]); + useEffect(() => chatMessageStreamEnd.current?.scrollIntoView({ behavior: "auto" }), [streamedResponses]); useEffect(() => { getConfig(); }, []); @@ -349,24 +369,24 @@ const Chat = () => { }; const onShowCitation = (citation: string, index: number) => { - if (activeCitation === citation && activeAnalysisPanelTab === AnalysisPanelTabs.CitationTab && selectedAnswer === index) { + if (activeCitation === citation && activeAnalysisPanelTab === AnalysisPanelTabs.CitationTab && selectedResponse === index) { setActiveAnalysisPanelTab(undefined); } else { setActiveCitation(citation); setActiveAnalysisPanelTab(AnalysisPanelTabs.CitationTab); } - setSelectedAnswer(index); + setSelectedResponse(index); }; const onToggleTab = (tab: AnalysisPanelTabs, index: number) => { - if (activeAnalysisPanelTab === tab && selectedAnswer === index) { + if (activeAnalysisPanelTab === tab && selectedResponse === index) { setActiveAnalysisPanelTab(undefined); } else { setActiveAnalysisPanelTab(tab); } - setSelectedAnswer(index); + setSelectedResponse(index); }; const { t, i18n } = useTranslation(); @@ -384,7 +404,11 @@ const Chat = () => { )}
- + {showUserUpload && } setIsConfigPanelOpen(!isConfigPanelOpen)} />
@@ -404,52 +428,64 @@ const Chat = () => { ) : (
                    {isStreaming &&
-                        streamedAnswers.map((streamedAnswer, index) => (
- -
-                                    onShowCitation(c, index)}
-                                    onThoughtProcessClicked={() => onToggleTab(AnalysisPanelTabs.ThoughtProcessTab, index)}
-                                    onSupportingContentClicked={() => onToggleTab(AnalysisPanelTabs.SupportingContentTab, index)}
-                                    onFollowupQuestionClicked={q => makeApiRequest(q)}
-                                    showFollowupQuestions={useSuggestFollowupQuestions && answers.length - 1 === index}
-                                    showSpeechOutputAzure={showSpeechOutputAzure}
-                                    showSpeechOutputBrowser={showSpeechOutputBrowser}
-                                />
-
-
+ streamedResponses.map((streamedResponse, responseIndex) => ( + <> + + {streamedResponse[1].value.map( + (thought, thoughtIndex) => + thought.message && ( + onShowCitation(c, responseIndex)} + onThoughtProcessClicked={() => onToggleTab(AnalysisPanelTabs.ThoughtProcessTab, responseIndex)} + onSupportingContentClicked={() => onToggleTab(AnalysisPanelTabs.SupportingContentTab, responseIndex)} + onFollowupQuestionClicked={q => makeApiRequest(q)} + showFollowupQuestions={useSuggestFollowupQuestions && responses.length - 1 === responseIndex} + showSpeechOutputAzure={showSpeechOutputAzure} + showSpeechOutputBrowser={showSpeechOutputBrowser} + /> + ) + )} + ))} {!isStreaming && - answers.map((answer, index) => ( -
- -
-                                    onShowCitation(c, index)}
-                                    onThoughtProcessClicked={() => onToggleTab(AnalysisPanelTabs.ThoughtProcessTab, index)}
-                                    onSupportingContentClicked={() => onToggleTab(AnalysisPanelTabs.SupportingContentTab, index)}
-                                    onFollowupQuestionClicked={q => makeApiRequest(q)}
-                                    showFollowupQuestions={useSuggestFollowupQuestions && answers.length - 1 === index}
-                                    showSpeechOutputAzure={showSpeechOutputAzure}
-                                    showSpeechOutputBrowser={showSpeechOutputBrowser}
-                                />
-
+ responses.map((response, responseIndex) => ( +
+ + <> + {response[1].value.map( + (thought, thoughtIndex) => + thought.message && ( + onShowCitation(c, responseIndex)} + onThoughtProcessClicked={() => onToggleTab(AnalysisPanelTabs.ThoughtProcessTab, responseIndex)} + onSupportingContentClicked={() => + onToggleTab(AnalysisPanelTabs.SupportingContentTab, responseIndex) + } + onFollowupQuestionClicked={q => makeApiRequest(q)} + showFollowupQuestions={useSuggestFollowupQuestions && responses.length - 1 === responseIndex} + showSpeechOutputAzure={showSpeechOutputAzure} + showSpeechOutputBrowser={showSpeechOutputBrowser} + /> + ) + )} +
))} - {isLoading && ( + {isLoading == LoadingType.Generating && ( <>
@@ -473,20 +509,20 @@ const Chat = () => { makeApiRequest(question)} showSpeechInput={showSpeechInput} />
- {answers.length > 0 && activeAnalysisPanelTab && ( + {responses.length > 0 && activeAnalysisPanelTab && ( onToggleTab(x, selectedAnswer)} + onActiveTabChanged={x => onToggleTab(x, selectedResponse)} citationHeight="810px" - answer={answers[selectedAnswer][1]} + response={responses[selectedResponse][1]} activeTab={activeAnalysisPanelTab} /> )} @@ -499,7 +535,7 @@ const Chat = () => { onClose={() => setIsHistoryPanelOpen(false)} onChatSelected={answers => { if (answers.length === 0) return; - setAnswers(answers); + setResponses(answers); lastQuestionRef.current = answers[answers.length - 1][0]; }} /> From 7684fc7363dfa17302e2e7b6487384c676e8c23e Mon Sep 17 00:00:00 2001 From: Matt Gotteiner Date: Thu, 17 Apr 2025 21:45:38 -0700 Subject: [PATCH 02/10] revert --- app/frontend/src/api/api.ts | 6 +- app/frontend/src/api/models.ts | 27 +- .../AnalysisPanel/AnalysisPanel.tsx | 13 +- app/frontend/src/components/Answer/Answer.tsx | 18 +- .../src/components/Answer/AnswerParser.tsx | 6 +- app/frontend/src/index.tsx | 4 +- app/frontend/src/pages/ask/Ask.tsx | 22 +- app/frontend/src/pages/chat/Chat.tsx | 260 ++++++++---------- 8 files changed, 163 insertions(+), 193 deletions(-) diff --git a/app/frontend/src/api/api.ts b/app/frontend/src/api/api.ts index dc4c30ffc2..df95f801b5 100644 --- a/app/frontend/src/api/api.ts +++ b/app/frontend/src/api/api.ts @@ -22,7 +22,7 @@ export async function configApi(): Promise { return (await response.json()) as Config; } -export async function askApi(request: ChatAppRequest, idToken: string | undefined): Promise { +export async function askApi(request: ChatAppRequest, idToken: string | undefined): Promise { const headers = await getHeaders(idToken); const response = await fetch(`${BACKEND_URI}/ask`, { method: "POST", @@ -34,11 +34,11 @@ export async function askApi(request: ChatAppRequest, idToken: string | undefine throw Error(`Request failed with status ${response.status}`); } const parsedResponse: ChatAppResponseOrError = await response.json(); - if ("error" in parsedResponse) { + if (parsedResponse.error) { throw Error(parsedResponse.error); } - return parsedResponse as ChatAppResponse[]; + return parsedResponse as ChatAppResponse; } export async function chatApi(request: ChatAppRequest, shouldStream: boolean, idToken: string | undefined): Promise { diff --git a/app/frontend/src/api/models.ts b/app/frontend/src/api/models.ts index bf0bd8bcf1..c915a19ee5 100644 --- a/app/frontend/src/api/models.ts +++ b/app/frontend/src/api/models.ts @@ -46,38 +46,33 @@ export type ResponseMessage = { role: string; }; -export type Thought = { +export type Thoughts = { title: string; description: any; // It can be any output from the api props?: { [key: string]: any }; - data_points: string[] | null; }; export type ResponseContext = { + data_points: string[]; followup_questions: string[] | null; - thought: Thought | null; + thoughts: Thoughts[]; }; -export type ChatAppResponseItem = { +export type ChatAppResponseOrError = { message: ResponseMessage; - delta: ResponseMessage | null; - context: ResponseContext | null; + delta: ResponseMessage; + context: ResponseContext; session_state: any; + error?: string; }; export type ChatAppResponse = { - value: ChatAppResponseItem[]; -}; - -export function getLastResponse(response: ChatAppResponse): ChatAppResponseItem | null { - return response.value.length > 0 ? 
response.value[response.value.length - 1] : null; -} -export type ChatAppError = { - error?: string; + message: ResponseMessage; + delta: ResponseMessage; + context: ResponseContext; + session_state: any; }; -export type ChatAppResponseOrError = ChatAppResponse | ChatAppError; - export type ChatAppRequestContext = { overrides?: ChatAppRequestOverrides; }; diff --git a/app/frontend/src/components/AnalysisPanel/AnalysisPanel.tsx b/app/frontend/src/components/AnalysisPanel/AnalysisPanel.tsx index 1ec4a27da3..2cee00c761 100644 --- a/app/frontend/src/components/AnalysisPanel/AnalysisPanel.tsx +++ b/app/frontend/src/components/AnalysisPanel/AnalysisPanel.tsx @@ -18,14 +18,14 @@ interface Props { onActiveTabChanged: (tab: AnalysisPanelTabs) => void; activeCitation: string | undefined; citationHeight: string; - response: ChatAppResponse; + answer: ChatAppResponse; } const pivotItemDisabledStyle = { disabled: true, style: { color: "grey" } }; -export const AnalysisPanel = ({ response, activeTab, activeCitation, citationHeight, className, onActiveTabChanged }: Props) => { - const isDisabledThoughtProcessTab: boolean = !response.value.some(item => item.context?.thought); - const isDisabledSupportingContentTab: boolean = !response.value.some(item => item.context?.thought?.data_points); +export const AnalysisPanel = ({ answer, activeTab, activeCitation, citationHeight, className, onActiveTabChanged }: Props) => { + const isDisabledThoughtProcessTab: boolean = !answer.context.thoughts; + const isDisabledSupportingContentTab: boolean = !answer.context.data_points; const isDisabledCitationTab: boolean = !activeCitation; const [citation, setCitation] = useState(""); @@ -82,15 +82,14 @@ export const AnalysisPanel = ({ response, activeTab, activeCitation, citationHei headerText={t("headerTexts.thoughtProcess")} headerButtonProps={isDisabledThoughtProcessTab ? pivotItemDisabledStyle : undefined} > - item.context?.thought ?? [])} /> + - {/* TODO: How do we handle citations?*/} - + { const followupQuestions = answer.context?.followup_questions; - const parsedAnswer = useMemo(() => parseAnswerToHtml(answer, isStreaming, onCitationClicked), [answer.message.content]); + const parsedAnswer = useMemo(() => parseAnswerToHtml(answer, isStreaming, onCitationClicked), [answer]); const { t } = useTranslation(); const sanitizedAnswerHtml = DOMPurify.sanitize(parsedAnswer.answerHtml); const [copied, setCopied] = useState(false); @@ -82,7 +80,7 @@ export const Answer = ({ title={t("tooltips.showThoughtProcess")} ariaLabel={t("tooltips.showThoughtProcess")} onClick={() => onThoughtProcessClicked()} - disabled={!response.value.some(thought => thought.context?.thought)} + disabled={!answer.context.thoughts?.length} /> onSupportingContentClicked()} - disabled={!answer.context?.thought?.data_points?.length} + disabled={!answer.context.data_points} /> {showSpeechOutputAzure && ( - + )} {showSpeechOutputBrowser && }
diff --git a/app/frontend/src/components/Answer/AnswerParser.tsx b/app/frontend/src/components/Answer/AnswerParser.tsx index c44690ea4e..3807592f6d 100644 --- a/app/frontend/src/components/Answer/AnswerParser.tsx +++ b/app/frontend/src/components/Answer/AnswerParser.tsx @@ -1,5 +1,5 @@ import { renderToStaticMarkup } from "react-dom/server"; -import { ChatAppResponseItem, getCitationFilePath } from "../../api"; +import { ChatAppResponse, getCitationFilePath } from "../../api"; type HtmlParsedAnswer = { answerHtml: string; @@ -30,8 +30,8 @@ function isCitationValid(contextDataPoints: any, citationCandidate: string): boo return isValidCitation; } -export function parseAnswerToHtml(answer: ChatAppResponseItem, isStreaming: boolean, onCitationClicked: (citationFilePath: string) => void): HtmlParsedAnswer { - const contextDataPoints = answer.context?.thought?.data_points ?? []; +export function parseAnswerToHtml(answer: ChatAppResponse, isStreaming: boolean, onCitationClicked: (citationFilePath: string) => void): HtmlParsedAnswer { + const contextDataPoints = answer.context.data_points; const citations: string[] = []; // Trim any whitespace from the end of the answer after removing follow-up questions diff --git a/app/frontend/src/index.tsx b/app/frontend/src/index.tsx index 706c28532e..a8821c8c45 100644 --- a/app/frontend/src/index.tsx +++ b/app/frontend/src/index.tsx @@ -23,8 +23,8 @@ const router = createHashRouter([ element: }, { - path: "qa" - //lazy: () => import("./pages/ask/Ask") + path: "qa", + lazy: () => import("./pages/ask/Ask") }, { path: "*", diff --git a/app/frontend/src/pages/ask/Ask.tsx b/app/frontend/src/pages/ask/Ask.tsx index b339d9c05a..8e38076adb 100644 --- a/app/frontend/src/pages/ask/Ask.tsx +++ b/app/frontend/src/pages/ask/Ask.tsx @@ -146,12 +146,12 @@ export function Component(): JSX.Element { language: i18n.language, ...(seed !== null ? { seed: seed } : {}) } - } + }, // AI Chat Protocol: Client must pass on any session state received from the server - //session_state: answer ? answer.session_state : null + session_state: answer ? answer.session_state : null }; const result = await askApi(request, token); - //setAnswer(result); + setAnswer(result); setSpeechUrls([null]); } catch (e) { setError(e); @@ -287,7 +287,21 @@ export function Component(): JSX.Element { )} - {!isLoading && answer && !error &&
} + {!isLoading && answer && !error && ( +
+ onShowCitation(x)} + onThoughtProcessClicked={() => onToggleTab(AnalysisPanelTabs.ThoughtProcessTab)} + onSupportingContentClicked={() => onToggleTab(AnalysisPanelTabs.SupportingContentTab)} + showSpeechOutputAzure={showSpeechOutputAzure} + showSpeechOutputBrowser={showSpeechOutputBrowser} + /> +
+ )} {error ? (
makeApiRequest(lastQuestionRef.current)} /> diff --git a/app/frontend/src/pages/chat/Chat.tsx b/app/frontend/src/pages/chat/Chat.tsx index 784444b351..5d00c2c914 100644 --- a/app/frontend/src/pages/chat/Chat.tsx +++ b/app/frontend/src/pages/chat/Chat.tsx @@ -11,16 +11,13 @@ import { chatApi, configApi, RetrievalMode, - ChatAppError, ChatAppResponse, - ChatAppResponseItem, ChatAppResponseOrError, ChatAppRequest, ResponseMessage, VectorFieldOptions, GPT4VInput, - SpeechConfig, - getLastResponse + SpeechConfig } from "../../api"; import { Answer, AnswerError, AnswerLoading } from "../../components/Answer"; import { QuestionInput } from "../../components/QuestionInput"; @@ -40,12 +37,6 @@ import { LoginContext } from "../../loginContext"; import { LanguagePicker } from "../../i18n/LanguagePicker"; import { Settings } from "../../components/Settings/Settings"; -const enum LoadingType { - None = "none", - Generating = "generating", - Thinking = "thinking" -} - const Chat = () => { const [isConfigPanelOpen, setIsConfigPanelOpen] = useState(false); const [isHistoryPanelOpen, setIsHistoryPanelOpen] = useState(false); @@ -74,16 +65,16 @@ const Chat = () => { const lastQuestionRef = useRef(""); const chatMessageStreamEnd = useRef(null); - const [isLoading, setIsLoading] = useState(LoadingType.None); + const [isLoading, setIsLoading] = useState(false); const [isStreaming, setIsStreaming] = useState(false); const [error, setError] = useState(); const [activeCitation, setActiveCitation] = useState(); const [activeAnalysisPanelTab, setActiveAnalysisPanelTab] = useState(undefined); - const [selectedResponse, setSelectedResponse] = useState(0); - const [responses, setResponses] = useState<[user: string, response: ChatAppResponse][]>([]); - const [streamedResponses, setStreamedResponses] = useState<[user: string, response: ChatAppResponse][]>([]); + const [selectedAnswer, setSelectedAnswer] = useState(0); + const [answers, setAnswers] = useState<[user: string, response: ChatAppResponse][]>([]); + const [streamedAnswers, setStreamedAnswers] = useState<[user: string, response: ChatAppResponse][]>([]); const [speechUrls, setSpeechUrls] = useState<(string | null)[]>([]); const [showGPT4VOptions, setShowGPT4VOptions] = useState(false); @@ -138,62 +129,47 @@ const Chat = () => { }); }; - const handleAsyncRequest = async (question: string, responses: [string, ChatAppResponse][], responseBody: ReadableStream) => { - var response: ChatAppResponse = { - value: [] - }; - const updateState = () => { + const handleAsyncRequest = async (question: string, answers: [string, ChatAppResponse][], responseBody: ReadableStream) => { + let answer: string = ""; + let askResponse: ChatAppResponse = {} as ChatAppResponse; + + const updateState = (newContent: string) => { return new Promise(resolve => { setTimeout(() => { - setStreamedResponses([...responses, [question, response]]); + answer += newContent; + const latestResponse: ChatAppResponse = { + ...askResponse, + message: { content: answer, role: askResponse.message.role } + }; + setStreamedAnswers([...answers, [question, latestResponse]]); resolve(null); }, 33); }); }; try { setIsStreaming(true); - for await (const event of readNDJSONStream(responseBody)) { - if (event["error"]) { - return { error: event["error"] } as ChatAppError; - } - - if (event["delta"]) { - if (event["delta"]["finish_reason"] == "stop") { - setIsLoading(LoadingType.None); - response.value[response.value.length - 1].context = event["context"]; - response.value[response.value.length - 1].session_state = 
event["session_state"]; - - await updateState(); - continue; - } - - if ("has_content" in event) { - setIsLoading(event["has_content"] ? LoadingType.Generating : LoadingType.Thinking); - var responseItem: ChatAppResponseItem = { - message: event["has_content"] ? { content: "", role: "" } : null - } as ChatAppResponseItem; - response.value.push(responseItem); - - continue; - } - - if (event["delta"]["content"]) { - setIsLoading(LoadingType.None); - response.value[response.value.length - 1].message.content += event["delta"]["content"]; - response.value[response.value.length - 1].message.role = event["delta"]["role"]; - - await updateState(); - continue; - } + if (event["context"] && event["context"]["data_points"]) { + event["message"] = event["delta"]; + askResponse = event as ChatAppResponse; + } else if (event["delta"] && event["delta"]["content"]) { + setIsLoading(false); + await updateState(event["delta"]["content"]); + } else if (event["context"]) { + // Update context with new keys from latest event + askResponse.context = { ...askResponse.context, ...event["context"] }; + } else if (event["error"]) { + throw Error(event["error"]); } } } finally { setIsStreaming(false); - setIsLoading(LoadingType.None); } - - return response; + const fullResponse: ChatAppResponse = { + ...askResponse, + message: { content: answer, role: askResponse.message.role } + }; + return fullResponse; }; const client = useLogin ? useMsal().instance : undefined; @@ -210,20 +186,17 @@ const Chat = () => { lastQuestionRef.current = question; error && setError(undefined); - setIsLoading(shouldStream ? LoadingType.Thinking : LoadingType.Generating); + setIsLoading(true); setActiveCitation(undefined); setActiveAnalysisPanelTab(undefined); const token = client ? await getToken(client) : undefined; try { - const messages: ResponseMessage[] = responses.flatMap(a => { - let lastMessage = getLastResponse(a[1])?.message; - return [ - { content: a[0], role: "user" }, - { content: lastMessage?.content ?? "", role: lastMessage?.role ?? "assistant" } - ]; - }); + const messages: ResponseMessage[] = answers.flatMap(a => [ + { content: a[0], role: "user" }, + { content: a[1].message.content, role: "assistant" } + ]); const request: ChatAppRequest = { messages: [...messages, { content: question, role: "user" }], @@ -252,7 +225,7 @@ const Chat = () => { } }, // AI Chat Protocol: Client must pass on any session state received from the server - session_state: responses.length ? getLastResponse(responses[responses.length - 1][1])?.session_state : null + session_state: answers.length ? answers[answers.length - 1][1].session_state : null }; const response = await chatApi(request, shouldStream, token); @@ -262,22 +235,29 @@ const Chat = () => { if (response.status > 299 || !response.ok) { throw Error(`Request failed with status ${response.status}`); } - var parsedResponse: ChatAppResponseOrError = shouldStream ? await handleAsyncRequest(question, responses, response.body) : await response.json(); - - if ("error" in parsedResponse) { - throw Error(parsedResponse.error); - } - setResponses([...responses, [question, parsedResponse as ChatAppResponse]]); - let lastMessage = getLastResponse(parsedResponse as ChatAppResponse) ?? ({} as ChatAppResponseItem); - if (typeof lastMessage.session_state === "string" && lastMessage.session_state !== "") { - const token = client ? 
await getToken(client) : undefined; - historyManager.addItem(lastMessage.session_state, [...responses, [question, parsedResponse as ChatAppResponse]], token); + if (shouldStream) { + const parsedResponse: ChatAppResponse = await handleAsyncRequest(question, answers, response.body); + setAnswers([...answers, [question, parsedResponse]]); + if (typeof parsedResponse.session_state === "string" && parsedResponse.session_state !== "") { + const token = client ? await getToken(client) : undefined; + historyManager.addItem(parsedResponse.session_state, [...answers, [question, parsedResponse]], token); + } + } else { + const parsedResponse: ChatAppResponseOrError = await response.json(); + if (parsedResponse.error) { + throw Error(parsedResponse.error); + } + setAnswers([...answers, [question, parsedResponse as ChatAppResponse]]); + if (typeof parsedResponse.session_state === "string" && parsedResponse.session_state !== "") { + const token = client ? await getToken(client) : undefined; + historyManager.addItem(parsedResponse.session_state, [...answers, [question, parsedResponse as ChatAppResponse]], token); + } } setSpeechUrls([...speechUrls, null]); } catch (e) { setError(e); } finally { - setIsLoading(LoadingType.None); + setIsLoading(false); } }; @@ -286,15 +266,15 @@ const Chat = () => { error && setError(undefined); setActiveCitation(undefined); setActiveAnalysisPanelTab(undefined); - setResponses([]); + setAnswers([]); setSpeechUrls([]); - setStreamedResponses([]); - setIsLoading(LoadingType.None); + setStreamedAnswers([]); + setIsLoading(false); setIsStreaming(false); }; useEffect(() => chatMessageStreamEnd.current?.scrollIntoView({ behavior: "smooth" }), [isLoading]); - useEffect(() => chatMessageStreamEnd.current?.scrollIntoView({ behavior: "auto" }), [streamedResponses]); + useEffect(() => chatMessageStreamEnd.current?.scrollIntoView({ behavior: "auto" }), [streamedAnswers]); useEffect(() => { getConfig(); }, []); @@ -369,24 +349,24 @@ const Chat = () => { }; const onShowCitation = (citation: string, index: number) => { - if (activeCitation === citation && activeAnalysisPanelTab === AnalysisPanelTabs.CitationTab && selectedResponse === index) { + if (activeCitation === citation && activeAnalysisPanelTab === AnalysisPanelTabs.CitationTab && selectedAnswer === index) { setActiveAnalysisPanelTab(undefined); } else { setActiveCitation(citation); setActiveAnalysisPanelTab(AnalysisPanelTabs.CitationTab); } - setSelectedResponse(index); + setSelectedAnswer(index); }; const onToggleTab = (tab: AnalysisPanelTabs, index: number) => { - if (activeAnalysisPanelTab === tab && selectedResponse === index) { + if (activeAnalysisPanelTab === tab && selectedAnswer === index) { setActiveAnalysisPanelTab(undefined); } else { setActiveAnalysisPanelTab(tab); } - setSelectedResponse(index); + setSelectedAnswer(index); }; const { t, i18n } = useTranslation(); @@ -404,11 +384,7 @@ const Chat = () => { )}
- + {showUserUpload && } setIsConfigPanelOpen(!isConfigPanelOpen)} />
@@ -428,64 +404,52 @@ const Chat = () => { ) : (
{isStreaming && - streamedResponses.map((streamedResponse, responseIndex) => ( - <> - - {streamedResponse[1].value.map( - (thought, thoughtIndex) => - thought.message && ( - onShowCitation(c, responseIndex)} - onThoughtProcessClicked={() => onToggleTab(AnalysisPanelTabs.ThoughtProcessTab, responseIndex)} - onSupportingContentClicked={() => onToggleTab(AnalysisPanelTabs.SupportingContentTab, responseIndex)} - onFollowupQuestionClicked={q => makeApiRequest(q)} - showFollowupQuestions={useSuggestFollowupQuestions && responses.length - 1 === responseIndex} - showSpeechOutputAzure={showSpeechOutputAzure} - showSpeechOutputBrowser={showSpeechOutputBrowser} - /> - ) - )} - + streamedAnswers.map((streamedAnswer, index) => ( +
+ +
+ onShowCitation(c, index)} + onThoughtProcessClicked={() => onToggleTab(AnalysisPanelTabs.ThoughtProcessTab, index)} + onSupportingContentClicked={() => onToggleTab(AnalysisPanelTabs.SupportingContentTab, index)} + onFollowupQuestionClicked={q => makeApiRequest(q)} + showFollowupQuestions={useSuggestFollowupQuestions && answers.length - 1 === index} + showSpeechOutputAzure={showSpeechOutputAzure} + showSpeechOutputBrowser={showSpeechOutputBrowser} + /> +
+
))} {!isStreaming && - responses.map((response, responseIndex) => ( -
- - <> - {response[1].value.map( - (thought, thoughtIndex) => - thought.message && ( - onShowCitation(c, responseIndex)} - onThoughtProcessClicked={() => onToggleTab(AnalysisPanelTabs.ThoughtProcessTab, responseIndex)} - onSupportingContentClicked={() => - onToggleTab(AnalysisPanelTabs.SupportingContentTab, responseIndex) - } - onFollowupQuestionClicked={q => makeApiRequest(q)} - showFollowupQuestions={useSuggestFollowupQuestions && responses.length - 1 === responseIndex} - showSpeechOutputAzure={showSpeechOutputAzure} - showSpeechOutputBrowser={showSpeechOutputBrowser} - /> - ) - )} - + answers.map((answer, index) => ( +
+ +
+ onShowCitation(c, index)} + onThoughtProcessClicked={() => onToggleTab(AnalysisPanelTabs.ThoughtProcessTab, index)} + onSupportingContentClicked={() => onToggleTab(AnalysisPanelTabs.SupportingContentTab, index)} + onFollowupQuestionClicked={q => makeApiRequest(q)} + showFollowupQuestions={useSuggestFollowupQuestions && answers.length - 1 === index} + showSpeechOutputAzure={showSpeechOutputAzure} + showSpeechOutputBrowser={showSpeechOutputBrowser} + /> +
))} - {isLoading == LoadingType.Generating && ( + {isLoading && ( <>
@@ -509,20 +473,20 @@ const Chat = () => { makeApiRequest(question)} showSpeechInput={showSpeechInput} />
- {responses.length > 0 && activeAnalysisPanelTab && ( + {answers.length > 0 && activeAnalysisPanelTab && ( onToggleTab(x, selectedResponse)} + onActiveTabChanged={x => onToggleTab(x, selectedAnswer)} citationHeight="810px" - response={responses[selectedResponse][1]} + answer={answers[selectedAnswer][1]} activeTab={activeAnalysisPanelTab} /> )} @@ -535,7 +499,7 @@ const Chat = () => { onClose={() => setIsHistoryPanelOpen(false)} onChatSelected={answers => { if (answers.length === 0) return; - setResponses(answers); + setAnswers(answers); lastQuestionRef.current = answers[answers.length - 1][0]; }} /> From 9688143193be60fd4a490d9e4636e322d1f97906 Mon Sep 17 00:00:00 2001 From: Matt Gotteiner Date: Thu, 17 Apr 2025 22:26:51 -0700 Subject: [PATCH 03/10] update --- app/backend/approaches/approach.py | 18 +-- app/backend/approaches/chatapproach.py | 117 ++++++++++-------- .../approaches/chatreadretrieveread.py | 6 +- .../approaches/chatreadretrievereadvision.py | 6 +- app/backend/approaches/retrievethenread.py | 73 +++++------ .../approaches/retrievethenreadvision.py | 76 +++++------- app/frontend/src/pages/chat/Chat.tsx | 17 +-- 7 files changed, 154 insertions(+), 159 deletions(-) diff --git a/app/backend/approaches/approach.py b/app/backend/approaches/approach.py index 3d3ed6edca..050a9381c4 100644 --- a/app/backend/approaches/approach.py +++ b/app/backend/approaches/approach.py @@ -88,22 +88,26 @@ def trim_embedding(cls, embedding: Optional[list[float]]) -> Optional[str]: return None -@dataclass -class DataPoints: - text: Optional[list[str]] = None - images: Optional[list] = None - @dataclass class ThoughtStep: title: str description: Optional[Any] props: Optional[dict[str, Any]] = None - data_points: Optional[DataPoints] = None - def update_token_usage(self, usage: CompletionUsage) -> None: if self.props: self.props["token_usage"] = TokenUsageProps.from_completion_usage(usage) +@dataclass +class DataPoints: + text: Optional[list[str]] = None + images: Optional[list] = None + + +@dataclass +class ExtraInfo: + data_points: DataPoints + thoughts: Optional[list[ThoughtStep]] = None + followup_questions: Optional[list[Any]] = None @dataclass class TokenUsageProps: diff --git a/app/backend/approaches/chatapproach.py b/app/backend/approaches/chatapproach.py index 1559c20a5a..1b50964afd 100644 --- a/app/backend/approaches/chatapproach.py +++ b/app/backend/approaches/chatapproach.py @@ -13,14 +13,23 @@ from approaches.approach import ( Approach, + DataPoints, + ExtraInfo, ThoughtStep ) class StreamingThoughtStep: - def __init__(self, step: ThoughtStep, chat_completion: Optional[Union[Awaitable[ChatCompletion], Awaitable[AsyncStream[ChatCompletionChunk]]]] = None, role: Optional[str] = "assistant"): + def __init__( + self, + step: ThoughtStep, + chat_completion: Optional[Union[Awaitable[ChatCompletion], Awaitable[AsyncStream[ChatCompletionChunk]]]] = None, + role: Optional[str] = "assistant", + data_points: Optional[DataPoints] = None): + self.step = step self.chat_completion = chat_completion self.role = role + self.data_points = data_points self._stream = None self._is_streaming = None @@ -35,7 +44,7 @@ async def start(self): self._stream = await self.chat_completion self._is_streaming = True - async def __anext__(self) -> Union[ChatCompletion, ChatCompletionChunk, ThoughtStep]: + async def __anext__(self) -> Union[ChatCompletion, ChatCompletionChunk, DataPoints, ThoughtStep]: if self._is_streaming: # Streaming Implementation: yield each chunk, then the step with token usage if self._stream is 
None: @@ -54,9 +63,14 @@ async def __anext__(self) -> Union[ChatCompletion, ChatCompletionChunk, ThoughtS # Non-Streaming Implementation: return the entire response, then the step with token usage if self._stream is None: - if self.step is None: + if self.step is None and self.data_points is None: raise StopAsyncIteration + if self.data_points is not None: + result = self.data_points + self.data_points = None + return result + result = self.step self.step = None return result @@ -93,7 +107,7 @@ def get_search_query(self, chat_completion: ChatCompletion, user_query: str): return query_text return user_query - def extract_followup_questions(self, content: Optional[str]) -> Optional[List[str]]: + def extract_followup_questions(self, content: Optional[str]): if content is None: return content, [] return content.split("<<")[0], re.findall(r"<<([^>>]+)>>", content) @@ -104,32 +118,32 @@ async def run_without_streaming( overrides: dict[str, Any], auth_claims: dict[str, Any], session_state: Any = None, - ) -> AsyncGenerator[dict[str, Any], None]: + ) -> dict[str, Any]: thoughts = self.run_until_final_call( messages, overrides, auth_claims, should_stream=False ) + content = None + role = None + extra_info = ExtraInfo() async for thought in thoughts: - content = None - role = None - thought_step = None - followup_questions = None await thought.start() async for chunk in thought: if isinstance(chunk, ChatCompletion): content = chunk.choices[0].message.content role = chunk.choices[0].message.role elif isinstance(chunk, ThoughtStep): - thought_step = chunk - - if overrides.get("suggest_followup_questions"): - content, followup_questions = self.extract_followup_questions(content) - followup_questions = followup_questions - - yield { - "message": {"content": content, "role": role}, - "context": { "thought": thought_step, "followup_questions": followup_questions }, - "session_state": session_state, - } + extra_info.thoughts.append(chunk) + elif isinstance(chunk, DataPoints): + extra_info.data_points = chunk + + if overrides.get("suggest_followup_questions"): + content, followup_questions = self.extract_followup_questions(content) + followup_questions = followup_questions + return { + "message": {"content": content, "role": role}, + "context": extra_info, + "session_state": session_state, + } async def run_with_streaming( self, @@ -137,47 +151,48 @@ async def run_with_streaming( overrides: dict[str, Any], auth_claims: dict[str, Any], session_state: Any = None, - ) -> AsyncGenerator[dict[str, Any], None]: + ) -> AsyncGenerator[dict, None]: thoughts = self.run_until_final_call( messages, overrides, auth_claims, should_stream=True ) - async for thought in thoughts: - yield { "delta": { "role": thought.role }, "has_content": thought.has_content() } + extra_info = ExtraInfo() - followup_questions_started = False - followup_content = "" - thought_step = None + yield {"delta": {"role": "assistant"}, "context": extra_info, "session_state": session_state} + followup_questions_started = False + followup_content = "" + async for thought in thoughts: await thought.start() - async for event in thought: - if isinstance(event, ChatCompletionChunk): - if event.choices: - completion = { - "delta": { - "content": event.choices[0].delta.content, - "role": event.choices[0].delta.role - } - } + async for chunk in thought: + if isinstance(chunk, ChatCompletionChunk): + content = chunk.choices[0].delta.content + role = chunk.choices[0].delta.role + content = content or "" # content may either not exist in delta, or explicitly be 
None + completion = { "delta": {"content": content, "role": role} } + if overrides.get("suggest_followup_questions") and "<<" in content: # if event contains << and not >>, it is start of follow-up question, truncate - content = completion["delta"].get("content") - content = content or "" # content may either not exist in delta, or explicitly be None - if overrides.get("suggest_followup_questions") and "<<" in content: - followup_questions_started = True - earlier_content = content[: content.index("<<")] - if earlier_content: - completion["delta"]["content"] = earlier_content - yield completion - followup_content += content[content.index("<<") :] - elif followup_questions_started: - followup_content += content - else: + followup_questions_started = True + earlier_content = content[: content.index("<<")] + if earlier_content: + completion["delta"]["content"] = earlier_content yield completion - elif isinstance(event, ThoughtStep): - thought_step = event + followup_content += content[content.index("<<") :] + elif followup_questions_started: + followup_content += content + else: + yield completion + elif isinstance(chunk, ThoughtStep): + extra_info.thoughts.append(chunk) + yield {"delta": {"role": "assistant"}, "context": extra_info, "session_state": session_state} + elif isinstance(chunk, DataPoints): + extra_info.data_points = chunk + yield {"delta": {"role": "assistant"}, "context": extra_info, "session_state": session_state} - followup_questions = None if followup_content: _, followup_questions = self.extract_followup_questions(followup_content) - yield {"delta": {"role": thought.role, "finish_reason": "stop" }, "context": { "thought": thought_step, "followup_questions": followup_questions }, "session_state": session_state } + yield { + "delta": {"role": "assistant"}, + "context": {"context": extra_info, "followup_questions": followup_questions}, + } async def run( self, diff --git a/app/backend/approaches/chatreadretrieveread.py b/app/backend/approaches/chatreadretrieveread.py index 9f25c57a36..529eec120f 100644 --- a/app/backend/approaches/chatreadretrieveread.py +++ b/app/backend/approaches/chatreadretrieveread.py @@ -190,8 +190,7 @@ async def run_until_final_call( overrides=overrides, model=self.chatgpt_model, deployment=self.chatgpt_deployment, - usage=None, - data_points=DataPoints(text=text_sources) + usage=None ), chat_completion=self.create_chat_completion( self.chatgpt_deployment, @@ -200,5 +199,6 @@ async def run_until_final_call( overrides, self.get_response_token_limit(self.chatgpt_model, 1024), should_stream, - ) + ), + data_points=DataPoints(text=text_sources) ) diff --git a/app/backend/approaches/chatreadretrievereadvision.py b/app/backend/approaches/chatreadretrievereadvision.py index 1c6a360ed8..517b84c708 100644 --- a/app/backend/approaches/chatreadretrievereadvision.py +++ b/app/backend/approaches/chatreadretrievereadvision.py @@ -196,8 +196,7 @@ async def run_until_final_call( {"model": self.gpt4v_model, "deployment": self.gpt4v_deployment} if self.gpt4v_deployment else {"model": self.gpt4v_model} - ), - data_points=DataPoints(text=text_sources, images=image_sources) + ) ), chat_completion=self.openai_client.chat.completions.create( model=self.gpt4v_deployment if self.gpt4v_deployment else self.gpt4v_model, @@ -207,5 +206,6 @@ async def run_until_final_call( n=1, stream=should_stream, seed=seed, - ) + ), + data_points=DataPoints(text=text_sources, images=image_sources) ) diff --git a/app/backend/approaches/retrievethenread.py 
b/app/backend/approaches/retrievethenread.py index 8eb74a69d4..8bdbb9785e 100644 --- a/app/backend/approaches/retrievethenread.py +++ b/app/backend/approaches/retrievethenread.py @@ -1,11 +1,11 @@ -from typing import Any, Optional, cast, AsyncGenerator +from typing import Any, Optional, cast from azure.search.documents.aio import SearchClient from azure.search.documents.models import VectorQuery from openai import AsyncOpenAI from openai.types.chat import ChatCompletion, ChatCompletionMessageParam -from approaches.approach import Approach, DataPoints, ThoughtStep +from approaches.approach import Approach, DataPoints, ExtraInfo, ThoughtStep from approaches.promptmanager import PromptManager from core.authentication import AuthenticationHelper @@ -58,7 +58,7 @@ async def run( messages: list[ChatCompletionMessageParam], session_state: Any = None, context: dict[str, Any] = {}, - ) -> AsyncGenerator[dict[str, Any], None]: + ) -> dict[str, Any]: q = messages[-1]["content"] if not isinstance(q, str): raise ValueError("The most recent message content must be a string.") @@ -73,26 +73,6 @@ async def run( minimum_search_score = overrides.get("minimum_search_score", 0.0) minimum_reranker_score = overrides.get("minimum_reranker_score", 0.0) filter = self.build_filter(overrides, auth_claims) - - yield { - "context": { - "thought": ThoughtStep( - "Search using user query", - q, - { - "use_semantic_captions": use_semantic_captions, - "use_semantic_ranker": use_semantic_ranker, - "use_query_rewriting": use_query_rewriting, - "top": top, - "filter": filter, - "use_vector_search": use_vector_search, - "use_text_search": use_text_search, - }, - ) - }, - "session_state": session_state, - } - # If retrieval mode includes vectors, compute an embedding for the query vectors: list[VectorQuery] = [] @@ -121,16 +101,6 @@ async def run( | {"user_query": q, "text_sources": text_sources}, ) - yield { - "context": { - "thought": ThoughtStep( - "Search results", - [result.serialize_for_results() for result in results], - ) - }, - "session_state": session_state - } - chat_completion = cast( ChatCompletion, await self.create_chat_completion( @@ -142,21 +112,42 @@ async def run( ), ) - yield { - "message": { - "content": chat_completion.choices[0].message.content, - "role": chat_completion.choices[0].message.role, - }, - "context": { - "thought": self.format_thought_step_for_chatcompletion( + extra_info = ExtraInfo( + DataPoints(text=text_sources), + thoughts=[ + ThoughtStep( + "Search using user query", + q, + { + "use_semantic_captions": use_semantic_captions, + "use_semantic_ranker": use_semantic_ranker, + "use_query_rewriting": use_query_rewriting, + "top": top, + "filter": filter, + "use_vector_search": use_vector_search, + "use_text_search": use_text_search, + }, + ), + ThoughtStep( + "Search results", + [result.serialize_for_results() for result in results], + ), + self.format_thought_step_for_chatcompletion( title="Prompt to generate answer", messages=messages, overrides=overrides, model=self.chatgpt_model, deployment=self.chatgpt_deployment, usage=chat_completion.usage, - data_points=DataPoints(text=text_sources) ), + ], + ) + + return { + "message": { + "content": chat_completion.choices[0].message.content, + "role": chat_completion.choices[0].message.role, }, + "context": extra_info, "session_state": session_state, } diff --git a/app/backend/approaches/retrievethenreadvision.py b/app/backend/approaches/retrievethenreadvision.py index af92a7a509..a556fd8b6c 100644 --- 
a/app/backend/approaches/retrievethenreadvision.py +++ b/app/backend/approaches/retrievethenreadvision.py @@ -1,5 +1,5 @@ from collections.abc import Awaitable -from typing import Any, Callable, Optional, AsyncGenerator +from typing import Any, Callable, Optional from azure.search.documents.aio import SearchClient from azure.storage.blob.aio import ContainerClient @@ -8,7 +8,7 @@ ChatCompletionMessageParam, ) -from approaches.approach import Approach, DataPoints, ThoughtStep +from approaches.approach import Approach, DataPoints, ExtraInfo, ThoughtStep from approaches.promptmanager import PromptManager from core.authentication import AuthenticationHelper from core.imageshelper import fetch_image @@ -66,7 +66,7 @@ async def run( messages: list[ChatCompletionMessageParam], session_state: Any = None, context: dict[str, Any] = {}, - ) -> AsyncGenerator[dict[str, Any], None]: + ) -> dict[str, Any]: q = messages[-1]["content"] if not isinstance(q, str): raise ValueError("The most recent message content must be a string.") @@ -87,26 +87,6 @@ async def run( vector_fields = overrides.get("vector_fields", ["embedding"]) send_text_to_gptvision = overrides.get("gpt4v_input") in ["textAndImages", "texts", None] send_images_to_gptvision = overrides.get("gpt4v_input") in ["textAndImages", "images", None] - - yield { - "context": { - "thought": ThoughtStep( - "Search using user query", - q, - { - "use_semantic_captions": use_semantic_captions, - "use_semantic_ranker": use_semantic_ranker, - "use_query_rewriting": use_query_rewriting, - "top": top, - "filter": filter, - "vector_fields": vector_fields, - "use_vector_search": use_vector_search, - "use_text_search": use_text_search, - }, - ), - }, - "session_state": session_state, - } # If retrieval mode includes vectors, compute an embedding for the query vectors = [] @@ -133,16 +113,6 @@ async def run( use_query_rewriting, ) - yield { - "context": { - "thought": ThoughtStep( - "Search results", - [result.serialize_for_results() for result in results], - ) - }, - "session_state": session_state - } - # Process results text_sources = [] image_sources = [] @@ -169,13 +139,28 @@ async def run( seed=seed, ) - yield { - "message": { - "content": chat_completion.choices[0].message.content, - "role": chat_completion.choices[0].message.role, - }, - "context": { - "thought": ThoughtStep( + extra_info = ExtraInfo( + DataPoints(text=text_sources, images=image_sources), + [ + ThoughtStep( + "Search using user query", + q, + { + "use_semantic_captions": use_semantic_captions, + "use_semantic_ranker": use_semantic_ranker, + "use_query_rewriting": use_query_rewriting, + "top": top, + "filter": filter, + "vector_fields": vector_fields, + "use_vector_search": use_vector_search, + "use_text_search": use_text_search, + }, + ), + ThoughtStep( + "Search results", + [result.serialize_for_results() for result in results], + ), + ThoughtStep( "Prompt to generate answer", messages, ( @@ -183,8 +168,15 @@ async def run( if self.gpt4v_deployment else {"model": self.gpt4v_model} ), - data_points=DataPoints(text=text_sources, images=image_sources), - ) + ), + ], + ) + + return { + "message": { + "content": chat_completion.choices[0].message.content, + "role": chat_completion.choices[0].message.role, }, + "context": extra_info, "session_state": session_state, } diff --git a/app/frontend/src/pages/chat/Chat.tsx b/app/frontend/src/pages/chat/Chat.tsx index 5d00c2c914..6b8a710d29 100644 --- a/app/frontend/src/pages/chat/Chat.tsx +++ b/app/frontend/src/pages/chat/Chat.tsx @@ -133,13 +133,13 @@ 
const Chat = () => { let answer: string = ""; let askResponse: ChatAppResponse = {} as ChatAppResponse; - const updateState = (newContent: string) => { + const updateState = (newContent: string, role: string) => { return new Promise(resolve => { setTimeout(() => { answer += newContent; const latestResponse: ChatAppResponse = { ...askResponse, - message: { content: answer, role: askResponse.message.role } + message: { content: answer, role: role } }; setStreamedAnswers([...answers, [question, latestResponse]]); resolve(null); @@ -149,12 +149,9 @@ const Chat = () => { try { setIsStreaming(true); for await (const event of readNDJSONStream(responseBody)) { - if (event["context"] && event["context"]["data_points"]) { - event["message"] = event["delta"]; - askResponse = event as ChatAppResponse; - } else if (event["delta"] && event["delta"]["content"]) { + if (event["delta"] && event["delta"]["content"]) { setIsLoading(false); - await updateState(event["delta"]["content"]); + await updateState(event["delta"]["content"], event["delta"]["role"]); } else if (event["context"]) { // Update context with new keys from latest event askResponse.context = { ...askResponse.context, ...event["context"] }; @@ -165,11 +162,7 @@ const Chat = () => { } finally { setIsStreaming(false); } - const fullResponse: ChatAppResponse = { - ...askResponse, - message: { content: answer, role: askResponse.message.role } - }; - return fullResponse; + return askResponse; }; const client = useLogin ? useMsal().instance : undefined; From fcf959e30dc57d3578cd542cb3f08e3b1b63094a Mon Sep 17 00:00:00 2001 From: Matt Gotteiner Date: Thu, 17 Apr 2025 23:00:07 -0700 Subject: [PATCH 04/10] fix streaming --- app/backend/approaches/approach.py | 11 ++- app/backend/approaches/chatapproach.py | 92 +++++++++++++------------- app/frontend/src/pages/chat/Chat.tsx | 7 +- 3 files changed, 52 insertions(+), 58 deletions(-) diff --git a/app/backend/approaches/approach.py b/app/backend/approaches/approach.py index 050a9381c4..fdafc19e54 100644 --- a/app/backend/approaches/approach.py +++ b/app/backend/approaches/approach.py @@ -1,7 +1,7 @@ import os from abc import ABC from collections.abc import AsyncGenerator, Awaitable -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import ( Any, Callable, @@ -105,8 +105,8 @@ class DataPoints: @dataclass class ExtraInfo: - data_points: DataPoints - thoughts: Optional[list[ThoughtStep]] = None + data_points: DataPoints = None + thoughts: list[ThoughtStep] = field(default_factory=list) followup_questions: Optional[list[Any]] = None @dataclass @@ -398,8 +398,7 @@ def format_thought_step_for_chatcompletion( model: str, deployment: Optional[str], usage: Optional[CompletionUsage] = None, - reasoning_effort: Optional[ChatCompletionReasoningEffort] = None, - data_points: Optional[DataPoints] = None, + reasoning_effort: Optional[ChatCompletionReasoningEffort] = None ) -> ThoughtStep: properties: dict[str, Any] = {"model": model} if deployment: @@ -411,7 +410,7 @@ def format_thought_step_for_chatcompletion( ) if usage: properties["token_usage"] = TokenUsageProps.from_completion_usage(usage) - return ThoughtStep(title, messages, properties, data_points) + return ThoughtStep(title, messages, properties) async def run( self, diff --git a/app/backend/approaches/chatapproach.py b/app/backend/approaches/chatapproach.py index 1b50964afd..51261d6c77 100644 --- a/app/backend/approaches/chatapproach.py +++ b/app/backend/approaches/chatapproach.py @@ -32,9 +32,6 @@ def __init__( 
self.data_points = data_points self._stream = None self._is_streaming = None - - def has_content(self) -> bool: - return self.chat_completion is not None def __aiter__(self): return self @@ -45,39 +42,38 @@ async def start(self): self._is_streaming = True async def __anext__(self) -> Union[ChatCompletion, ChatCompletionChunk, DataPoints, ThoughtStep]: - if self._is_streaming: - # Streaming Implementation: yield each chunk, then the step with token usage - if self._stream is None: - raise StopAsyncIteration - - try: - # Get the next chunk from the async stream - chunk = await self._stream.__anext__() - if len(chunk.choices) == 0 and chunk.usage: - self.step.update_token_usage(chunk.usage) - return chunk - except StopAsyncIteration: - # If the stream is exhausted, yield the step with token usage - self._stream = None - return self.step + # If there are data points, return them first to render citations + if self.data_points is not None: + result = self.data_points + self.data_points = None + return result - # Non-Streaming Implementation: return the entire response, then the step with token usage - if self._stream is None: - if self.step is None and self.data_points is None: - raise StopAsyncIteration - - if self.data_points is not None: - result = self.data_points - self.data_points = None - return result - + if self._stream is not None: + if self._is_streaming: + try: + # Get the next chunk from the async stream + chunk = await self._stream.__anext__() + if len(chunk.choices) == 0 and chunk.usage: + self.step.update_token_usage(chunk.usage) + return chunk + except StopAsyncIteration: + # If the stream is exhausted, yield the step with token usage + self._stream = None + return self.step + + # Non-Streaming Implementation: return the entire response, then the step with token usage + result = self._stream + self._stream = None + return result + + if self.step is not None: result = self.step self.step = None return result + + # No more items to yield + raise StopAsyncIteration - result = self._stream - self._stream = None - return result class ChatApproach(Approach, ABC): @@ -164,22 +160,23 @@ async def run_with_streaming( await thought.start() async for chunk in thought: if isinstance(chunk, ChatCompletionChunk): - content = chunk.choices[0].delta.content - role = chunk.choices[0].delta.role - content = content or "" # content may either not exist in delta, or explicitly be None - completion = { "delta": {"content": content, "role": role} } - if overrides.get("suggest_followup_questions") and "<<" in content: - # if event contains << and not >>, it is start of follow-up question, truncate - followup_questions_started = True - earlier_content = content[: content.index("<<")] - if earlier_content: - completion["delta"]["content"] = earlier_content + if chunk.choices: + content = chunk.choices[0].delta.content + role = chunk.choices[0].delta.role + content = content or "" # content may either not exist in delta, or explicitly be None + completion = { "delta": {"content": content, "role": role} } + if overrides.get("suggest_followup_questions") and "<<" in content: + # if event contains << and not >>, it is start of follow-up question, truncate + followup_questions_started = True + earlier_content = content[: content.index("<<")] + if earlier_content: + completion["delta"]["content"] = earlier_content + yield completion + followup_content += content[content.index("<<") :] + elif followup_questions_started: + followup_content += content + else: yield completion - followup_content += 
content[content.index("<<") :] - elif followup_questions_started: - followup_content += content - else: - yield completion elif isinstance(chunk, ThoughtStep): extra_info.thoughts.append(chunk) yield {"delta": {"role": "assistant"}, "context": extra_info, "session_state": session_state} @@ -189,9 +186,10 @@ async def run_with_streaming( if followup_content: _, followup_questions = self.extract_followup_questions(followup_content) + extra_info.followup_questions = followup_questions yield { "delta": {"role": "assistant"}, - "context": {"context": extra_info, "followup_questions": followup_questions}, + "context": extra_info, } async def run( diff --git a/app/frontend/src/pages/chat/Chat.tsx b/app/frontend/src/pages/chat/Chat.tsx index 6b8a710d29..7ab010f337 100644 --- a/app/frontend/src/pages/chat/Chat.tsx +++ b/app/frontend/src/pages/chat/Chat.tsx @@ -137,11 +137,8 @@ const Chat = () => { return new Promise(resolve => { setTimeout(() => { answer += newContent; - const latestResponse: ChatAppResponse = { - ...askResponse, - message: { content: answer, role: role } - }; - setStreamedAnswers([...answers, [question, latestResponse]]); + askResponse.message = { content: answer, role: role }; + setStreamedAnswers([...answers, [question, { ...askResponse }]]); resolve(null); }, 33); }); From 1a5c7382aeaaa2c1610e2b515eb577552a9ef023 Mon Sep 17 00:00:00 2001 From: Matt Gotteiner Date: Thu, 17 Apr 2025 23:14:15 -0700 Subject: [PATCH 05/10] initial reflection prompt --- .../approaches/chatreadretrievereflectread.py | 206 ++++++++++++++++++ .../prompts/chat_reflect_answer.prompty | 65 ++++++ 2 files changed, 271 insertions(+) create mode 100644 app/backend/approaches/chatreadretrievereflectread.py create mode 100644 app/backend/approaches/prompts/chat_reflect_answer.prompty diff --git a/app/backend/approaches/chatreadretrievereflectread.py b/app/backend/approaches/chatreadretrievereflectread.py new file mode 100644 index 0000000000..647cf3cdd1 --- /dev/null +++ b/app/backend/approaches/chatreadretrievereflectread.py @@ -0,0 +1,206 @@ +from collections.abc import Awaitable +from typing import Any, Optional, Union, cast, AsyncGenerator + +from azure.search.documents.aio import SearchClient +from azure.search.documents.models import VectorQuery +from openai import AsyncOpenAI, AsyncStream +from openai.types.chat import ( + ChatCompletion, + ChatCompletionChunk, + ChatCompletionMessageParam, + ChatCompletionToolParam, +) + +from approaches.approach import DataPoints, ThoughtStep +from approaches.chatapproach import ChatApproach, StreamingThoughtStep +from approaches.promptmanager import PromptManager +from core.authentication import AuthenticationHelper + + +class ChatReadRetrieveReflectReadApproach(ChatApproach): + """ + A multi-step approach that first uses OpenAI to turn the user's question into a search query, + then uses Azure AI Search to retrieve relevant documents, and then sends the conversation history, + original user question, and search results to OpenAI to generate a response. 
+ """ + + def __init__( + self, + *, + search_client: SearchClient, + auth_helper: AuthenticationHelper, + openai_client: AsyncOpenAI, + chatgpt_model: str, + chatgpt_deployment: Optional[str], # Not needed for non-Azure OpenAI + embedding_deployment: Optional[str], # Not needed for non-Azure OpenAI or for retrieval_mode="text" + embedding_model: str, + embedding_dimensions: int, + sourcepage_field: str, + content_field: str, + query_language: str, + query_speller: str, + prompt_manager: PromptManager, + reasoning_effort: Optional[str] = None, + max_steps: Optional[int] = None, + ): + self.search_client = search_client + self.openai_client = openai_client + self.auth_helper = auth_helper + self.chatgpt_model = chatgpt_model + self.chatgpt_deployment = chatgpt_deployment + self.embedding_deployment = embedding_deployment + self.embedding_model = embedding_model + self.embedding_dimensions = embedding_dimensions + self.sourcepage_field = sourcepage_field + self.content_field = content_field + self.query_language = query_language + self.query_speller = query_speller + self.prompt_manager = prompt_manager + self.query_rewrite_prompt = self.prompt_manager.load_prompt("chat_query_rewrite.prompty") + self.query_rewrite_tools = self.prompt_manager.load_tools("chat_query_rewrite_tools.json") + self.answer_prompt = self.prompt_manager.load_prompt("chat_answer_question.prompty") + self.reasoning_effort = reasoning_effort + self.include_token_usage = True + self.max_steps = max_steps or 3 + + async def run_until_final_call( + self, + messages: list[ChatCompletionMessageParam], + overrides: dict[str, Any], + auth_claims: dict[str, Any], + should_stream: bool = False, + ) -> AsyncGenerator[StreamingThoughtStep, None]: + use_text_search = overrides.get("retrieval_mode") in ["text", "hybrid", None] + use_vector_search = overrides.get("retrieval_mode") in ["vectors", "hybrid", None] + use_semantic_ranker = True if overrides.get("semantic_ranker") else False + use_semantic_captions = True if overrides.get("semantic_captions") else False + use_query_rewriting = True if overrides.get("query_rewriting") else False + top = overrides.get("top", 3) + minimum_search_score = overrides.get("minimum_search_score", 0.0) + minimum_reranker_score = overrides.get("minimum_reranker_score", 0.0) + filter = self.build_filter(overrides, auth_claims) + + original_user_query = messages[-1]["content"] + if not isinstance(original_user_query, str): + raise ValueError("The most recent message content must be a string.") + + reasoning_model_support = self.GPT_REASONING_MODELS.get(self.chatgpt_model) + if reasoning_model_support and (not reasoning_model_support.streaming and should_stream): + raise Exception( + f"{self.chatgpt_model} does not support streaming. Please use a different model or disable streaming." 
+ ) + + # STEP 1: Generate an optimized keyword search query based on the chat history and the last question + query_messages = self.prompt_manager.render_prompt( + self.query_rewrite_prompt, {"user_query": original_user_query, "past_messages": messages[:-1]} + ) + tools: list[ChatCompletionToolParam] = self.query_rewrite_tools + + chat_completion = cast( + ChatCompletion, + await self.create_chat_completion( + self.chatgpt_deployment, + self.chatgpt_model, + messages=query_messages, + overrides=overrides, + response_token_limit=self.get_response_token_limit( + self.chatgpt_model, 100 + ), # Setting too low risks malformed JSON, setting too high may affect performance + temperature=0.0, # Minimize creativity for search query generation + tools=tools, + reasoning_effort="low", # Minimize reasoning for search query generation + ), + ) + + yield StreamingThoughtStep( + step=self.format_thought_step_for_chatcompletion( + title="Prompt to generate search query", + messages=query_messages, + overrides=overrides, + model=self.chatgpt_model, + deployment=self.chatgpt_deployment, + usage=chat_completion.usage, + reasoning_effort="low", + ), + role="tool" + ) + + query_text = self.get_search_query(chat_completion, original_user_query) + + # STEP 2: Retrieve relevant documents from the search index with the GPT optimized query + yield StreamingThoughtStep( + step=ThoughtStep( + "Search using generated search query", + query_text, + { + "use_semantic_captions": use_semantic_captions, + "use_semantic_ranker": use_semantic_ranker, + "use_query_rewriting": use_query_rewriting, + "top": top, + "filter": filter, + "use_vector_search": use_vector_search, + "use_text_search": use_text_search, + }, + ), + role="tool" + ) + + # If retrieval mode includes vectors, compute an embedding for the query + vectors: list[VectorQuery] = [] + if use_vector_search: + vectors.append(await self.compute_text_embedding(query_text)) + + results = await self.search( + top, + query_text, + filter, + vectors, + use_text_search, + use_vector_search, + use_semantic_ranker, + use_semantic_captions, + minimum_search_score, + minimum_reranker_score, + use_query_rewriting, + ) + + yield StreamingThoughtStep( + step=ThoughtStep( + "Search results", + [result.serialize_for_results() for result in results], + ), + role="tool" + ) + + # STEP 3: Generate a contextual and content specific answer using the search results and chat history + text_sources = self.get_sources_content(results, use_semantic_captions, use_image_citation=False) + messages = self.prompt_manager.render_prompt( + self.answer_prompt, + self.get_system_prompt_variables(overrides.get("prompt_template")) + | { + "include_follow_up_questions": bool(overrides.get("suggest_followup_questions")), + "past_messages": messages[:-1], + "user_query": original_user_query, + "text_sources": text_sources, + }, + ) + + yield StreamingThoughtStep( + step=self.format_thought_step_for_chatcompletion( + title="Prompt to generate answer", + messages=messages, + overrides=overrides, + model=self.chatgpt_model, + deployment=self.chatgpt_deployment, + usage=None + ), + chat_completion=self.create_chat_completion( + self.chatgpt_deployment, + self.chatgpt_model, + messages, + overrides, + self.get_response_token_limit(self.chatgpt_model, 1024), + should_stream, + ), + data_points=DataPoints(text=text_sources) + ) diff --git a/app/backend/approaches/prompts/chat_reflect_answer.prompty b/app/backend/approaches/prompts/chat_reflect_answer.prompty new file mode 100644 index 0000000000..d5149ceb2f 
--- /dev/null +++ b/app/backend/approaches/prompts/chat_reflect_answer.prompty @@ -0,0 +1,65 @@ +--- +name: Chat +description: Answer a question (with chat history) using solely text sources. +model: + api: chat +--- +system: +SYSTEM: +You are an expert in evaluating the quality of a RESPONSE from an intelligent system based on three communication traits: Relevance, Groundedness, and Correctness. Your job is to assign each trait a score from 1 to 5 using the definitions below. + +USER: +# Definitions + +## Relevance +1 - Irrelevant Response: Unrelated to the question. +2 - Incorrect Response: Attempts to answer but gives wrong info. +3 - Incomplete Response: Addresses the question but omits key details. +4 - Complete Response: Fully addresses the question with accurate, essential details. +5 - Comprehensive Response with Insights: Fully accurate and adds relevant insights or implications. + +## Groundedness +1 - Completely Unrelated Response: No relation to context or question. +2 - Related Topic but Does Not Respond: Mentions context topic but fails to answer. +3 - Attempts to Respond but Contains Incorrect Info: Tries to answer but misstates facts. +4 - Partially Correct Response: Correct but omits specific context details. +5 - Fully Correct and Complete Response: Thoroughly accurate and includes all context details. + +## Correctness +1 - Completely Incorrect: Contains no correct or relevant facts. +2 - Mostly Incorrect: Major factual or logical errors overshadow any correct parts. +3 - Partially Correct: Some facts are right but others are wrong or misleading. +4 - Mostly Correct: Largely accurate with only minor inaccuracies. +5 - Fully Correct: Entirely accurate, fact-based, and logically consistent. + +# Data +CONTEXT: {{context}} +QUERY: {{query}} +RESPONSE: {{response}} + +# Tasks +For each trait—Relevance, Groundedness, Correctness—produce: +• ThoughtChain: start with “Let's think step by step:” and give a concise chain of reasoning. +• Explanation: a very short justification. +• Score: an integer from 1 to 5. + +Wrap your outputs in these tags: + +Relevance: +…ThoughtChain… +…Explanation… +…Score… + +Groundedness: +…ThoughtChain… +…Explanation… +…Score… + +Correctness: +…ThoughtChain… +…Explanation… +…Score… + +Finally, based on your reflections, propose a new query to search a knowledge base for any potentially missing context. 
Wrap it in: + +…your proposed search query… From 885da6d8caaeddc058890480fd58e8eb6459b760 Mon Sep 17 00:00:00 2001 From: Matt Gotteiner Date: Fri, 18 Apr 2025 13:38:49 -0700 Subject: [PATCH 06/10] remove double yield on stream --- app/backend/approaches/chatapproach.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/app/backend/approaches/chatapproach.py b/app/backend/approaches/chatapproach.py index 51261d6c77..33b0c82855 100644 --- a/app/backend/approaches/chatapproach.py +++ b/app/backend/approaches/chatapproach.py @@ -57,14 +57,13 @@ async def __anext__(self) -> Union[ChatCompletion, ChatCompletionChunk, DataPoin self.step.update_token_usage(chunk.usage) return chunk except StopAsyncIteration: - # If the stream is exhausted, yield the step with token usage + # Stream is exhausted self._stream = None - return self.step - - # Non-Streaming Implementation: return the entire response, then the step with token usage - result = self._stream - self._stream = None - return result + else: + # Non-Streaming Implementation: return the entire response, then the step with token usage + result = self._stream + self._stream = None + return result if self.step is not None: result = self.step From d7fbd4295076ff66bd080d75e95bbeb59fdfafb5 Mon Sep 17 00:00:00 2001 From: Matt Gotteiner Date: Mon, 21 Apr 2025 23:20:49 -0700 Subject: [PATCH 07/10] WIP --- app/backend/approaches/approach.py | 5 +- app/backend/approaches/chatapproach.py | 89 +++++++- .../approaches/chatreadretrieveread.py | 69 +++++- .../approaches/chatreadretrievereadvision.py | 3 +- .../approaches/chatreadretrievereflectread.py | 206 ------------------ .../prompts/chat_reflect_answer.prompty | 51 ++--- .../prompts/chat_reflect_answer_tools.json | 111 ++++++++++ 7 files changed, 289 insertions(+), 245 deletions(-) delete mode 100644 app/backend/approaches/chatreadretrievereflectread.py create mode 100644 app/backend/approaches/prompts/chat_reflect_answer_tools.json diff --git a/app/backend/approaches/approach.py b/app/backend/approaches/approach.py index fdafc19e54..74cb711a56 100644 --- a/app/backend/approaches/approach.py +++ b/app/backend/approaches/approach.py @@ -398,7 +398,8 @@ def format_thought_step_for_chatcompletion( model: str, deployment: Optional[str], usage: Optional[CompletionUsage] = None, - reasoning_effort: Optional[ChatCompletionReasoningEffort] = None + reasoning_effort: Optional[ChatCompletionReasoningEffort] = None, + additional_properties: Optional[dict[str, Any]] = None, ) -> ThoughtStep: properties: dict[str, Any] = {"model": model} if deployment: @@ -410,6 +411,8 @@ def format_thought_step_for_chatcompletion( ) if usage: properties["token_usage"] = TokenUsageProps.from_completion_usage(usage) + if additional_properties: + properties.update(additional_properties) return ThoughtStep(title, messages, properties) async def run( diff --git a/app/backend/approaches/chatapproach.py b/app/backend/approaches/chatapproach.py index 33b0c82855..163a6758b7 100644 --- a/app/backend/approaches/chatapproach.py +++ b/app/backend/approaches/chatapproach.py @@ -3,6 +3,7 @@ from abc import ABC, abstractmethod from collections.abc import AsyncGenerator, Awaitable from typing import Any, Optional, List, Union +from dataclasses import dataclass from openai import AsyncStream from openai.types.chat import ( @@ -24,37 +25,60 @@ def __init__( step: ThoughtStep, chat_completion: Optional[Union[Awaitable[ChatCompletion], Awaitable[AsyncStream[ChatCompletionChunk]]]] = None, role: Optional[str] = "assistant", - 
data_points: Optional[DataPoints] = None): + data_points: Optional[DataPoints] = None, + should_stream: bool = True): self.step = step self.chat_completion = chat_completion self.role = role self.data_points = data_points self._stream = None - self._is_streaming = None + self.should_stream = should_stream + self._steps = [] + self._step_i = -1 + self._completion = "" def __aiter__(self): return self async def start(self): - if self._stream is None and self.chat_completion is not None: + if self._step_i < 0 and self._stream is None and self.chat_completion is not None: self._stream = await self.chat_completion - self._is_streaming = True + + def rewind(self): + if not self._steps: + raise ValueError("Cannot rewind: no steps recorded.") + self._step_i = 0 + + def get_completion(self) -> Optional[str]: + return self._completion async def __anext__(self) -> Union[ChatCompletion, ChatCompletionChunk, DataPoints, ThoughtStep]: + if self._step_i >= 0: + if self._step_i < len(self._steps): + # Return the next step in the recorded steps + self._step_i += 1 + return self._steps[self._step_i - 1] + + raise StopAsyncIteration() + # If there are data points, return them first to render citations if self.data_points is not None: result = self.data_points self.data_points = None + self._steps.append(result) return result if self._stream is not None: - if self._is_streaming: + if self.should_stream: try: # Get the next chunk from the async stream chunk = await self._stream.__anext__() if len(chunk.choices) == 0 and chunk.usage: self.step.update_token_usage(chunk.usage) + elif len(chunk.choices) > 0 and chunk.choices[0].delta.content: + self._completion += chunk.choices[0].delta.content + self._steps.append(chunk) return chunk except StopAsyncIteration: # Stream is exhausted @@ -63,16 +87,32 @@ async def __anext__(self) -> Union[ChatCompletion, ChatCompletionChunk, DataPoin # Non-Streaming Implementation: return the entire response, then the step with token usage result = self._stream self._stream = None + self._completion = result.choices[0].message.content if result.choices else "" + self._steps.append(result) return result if self.step is not None: result = self.step self.step = None + self._steps.append(result) return result # No more items to yield raise StopAsyncIteration +@dataclass +class Reflection: + score: Optional[int] = None + thought_chain: Optional[str] = None + explanation: Optional[str] = None + +@dataclass +class ReflectionResponse: + relevance: Optional[Reflection] = None + groundedness: Optional[Reflection] = None + correctness: Optional[Reflection] = None + next_query: Optional[str] = None + next_answer: Optional[str] = None class ChatApproach(Approach, ABC): @@ -102,6 +142,45 @@ def get_search_query(self, chat_completion: ChatCompletion, user_query: str): return query_text return user_query + def get_reflection(self, chat_completion: ChatCompletion) -> Optional[ReflectionResponse]: + response_message = chat_completion.choices[0].message + reflection_response = ReflectionResponse() + + print(response_message.model_dump()) + if response_message.tool_calls: + for tool in response_message.tool_calls: + if tool.type != "function": + continue + function = tool.function + if function.name == "reflect_answer": + arg = json.loads(function.arguments) + if relevance_reflection := arg.get("relevance"): + reflection_response.relevance = Reflection( + score=relevance_reflection.get("score"), + thought_chain=relevance_reflection.get("thoughtChain"), + 
explanation=relevance_reflection.get("explanation") + ) + if groundedness_reflection := arg.get("groundedness"): + reflection_response.groundedness = Reflection( + score=groundedness_reflection.get("score"), + thought_chain=groundedness_reflection.get("thoughtChain"), + explanation=groundedness_reflection.get("explanation") + ) + if correctness_reflection := arg.get("correctness"): + reflection_response.correctness = Reflection( + score=correctness_reflection.get("score"), + thought_chain=correctness_reflection.get("thoughtChain"), + explanation=correctness_reflection.get("explanation") + ) + if function.name == "search_index": + arg = json.loads(function.arguments) + reflection_response.next_query = arg.get("query") + if function.name == "rewrite_answer": + arg = json.loads(function.arguments) + reflection_response.next_answer = arg.get("answer") + + return reflection_response + def extract_followup_questions(self, content: Optional[str]): if content is None: return content, [] diff --git a/app/backend/approaches/chatreadretrieveread.py b/app/backend/approaches/chatreadretrieveread.py index 529eec120f..8dc64bf4d0 100644 --- a/app/backend/approaches/chatreadretrieveread.py +++ b/app/backend/approaches/chatreadretrieveread.py @@ -1,5 +1,6 @@ from collections.abc import Awaitable from typing import Any, Optional, Union, cast, AsyncGenerator +import asyncio from azure.search.documents.aio import SearchClient from azure.search.documents.models import VectorQuery @@ -15,6 +16,7 @@ from approaches.chatapproach import ChatApproach, StreamingThoughtStep from approaches.promptmanager import PromptManager from core.authentication import AuthenticationHelper +import dataclasses class ChatReadRetrieveReadApproach(ChatApproach): @@ -41,6 +43,7 @@ def __init__( query_speller: str, prompt_manager: PromptManager, reasoning_effort: Optional[str] = None, + reflection_max_steps: Optional[int] = None, ): self.search_client = search_client self.openai_client = openai_client @@ -58,8 +61,11 @@ def __init__( self.query_rewrite_prompt = self.prompt_manager.load_prompt("chat_query_rewrite.prompty") self.query_rewrite_tools = self.prompt_manager.load_tools("chat_query_rewrite_tools.json") self.answer_prompt = self.prompt_manager.load_prompt("chat_answer_question.prompty") + self.reflect_prompt = self.prompt_manager.load_prompt("chat_reflect_answer.prompty") + self.reflect_tools = self.prompt_manager.load_tools("chat_reflect_answer_tools.json") self.reasoning_effort = reasoning_effort self.include_token_usage = True + self.reflection_max_steps = reflection_max_steps or 3 async def run_until_final_call( self, @@ -73,6 +79,8 @@ async def run_until_final_call( use_semantic_ranker = True if overrides.get("semantic_ranker") else False use_semantic_captions = True if overrides.get("semantic_captions") else False use_query_rewriting = True if overrides.get("query_rewriting") else False + use_reflection = True if overrides.get("reflection") else True + reflection_max_steps = overrides.get("reflection_max_steps", self.reflection_max_steps) top = overrides.get("top", 3) minimum_search_score = overrides.get("minimum_search_score", 0.0) minimum_reranker_score = overrides.get("minimum_reranker_score", 0.0) @@ -183,14 +191,13 @@ async def run_until_final_call( }, ) - yield StreamingThoughtStep( + answer_step = StreamingThoughtStep( step=self.format_thought_step_for_chatcompletion( title="Prompt to generate answer", messages=messages, overrides=overrides, model=self.chatgpt_model, deployment=self.chatgpt_deployment, - usage=None 
), chat_completion=self.create_chat_completion( self.chatgpt_deployment, @@ -200,5 +207,61 @@ async def run_until_final_call( self.get_response_token_limit(self.chatgpt_model, 1024), should_stream, ), - data_points=DataPoints(text=text_sources) + data_points=DataPoints(text=text_sources), + should_stream=should_stream ) + if not use_reflection: + yield answer_step + return + + # Step 4: Reflection loop to improve the answer + for i in range(reflection_max_steps): + answer_chunks = [] + + # Read the answer + await answer_step.start() + async for chunk in answer_step: + if isinstance(chunk, ThoughtStep): + answer_thought = chunk + + # STEP 5: Determine the next action to take + reflect_messages = self.prompt_manager.render_prompt( + self.reflect_prompt, {"text_sources": text_sources, "query": original_user_query, "response": answer_step.get_completion(), "past_messages": messages[:-1]} + ) + tools: list[ChatCompletionToolParam] = self.reflect_tools + + chat_completion = cast( + ChatCompletion, + await self.create_chat_completion( + self.chatgpt_deployment, + self.chatgpt_model, + messages=reflect_messages, + overrides=overrides, + response_token_limit=self.get_response_token_limit(self.chatgpt_model, 1024), + temperature=0.0, # Minimize creativity for reflection + tools=tools + ) + ) + reflection = self.get_reflection(chat_completion) + + yield StreamingThoughtStep( + step=self.format_thought_step_for_chatcompletion( + title="Prompt to reflect on answer", + messages=reflect_messages, + overrides=overrides, + model=self.chatgpt_model, + deployment=self.chatgpt_deployment, + usage=chat_completion.usage, + additional_properties=dataclasses.asdict(reflection) + ), + role="tool" + ) + + answer_step.rewind() + yield answer_step + return + + + + + diff --git a/app/backend/approaches/chatreadretrievereadvision.py b/app/backend/approaches/chatreadretrievereadvision.py index 517b84c708..3472bc6d3c 100644 --- a/app/backend/approaches/chatreadretrievereadvision.py +++ b/app/backend/approaches/chatreadretrievereadvision.py @@ -207,5 +207,6 @@ async def run_until_final_call( stream=should_stream, seed=seed, ), - data_points=DataPoints(text=text_sources, images=image_sources) + data_points=DataPoints(text=text_sources, images=image_sources), + should_stream=should_stream ) diff --git a/app/backend/approaches/chatreadretrievereflectread.py b/app/backend/approaches/chatreadretrievereflectread.py deleted file mode 100644 index 647cf3cdd1..0000000000 --- a/app/backend/approaches/chatreadretrievereflectread.py +++ /dev/null @@ -1,206 +0,0 @@ -from collections.abc import Awaitable -from typing import Any, Optional, Union, cast, AsyncGenerator - -from azure.search.documents.aio import SearchClient -from azure.search.documents.models import VectorQuery -from openai import AsyncOpenAI, AsyncStream -from openai.types.chat import ( - ChatCompletion, - ChatCompletionChunk, - ChatCompletionMessageParam, - ChatCompletionToolParam, -) - -from approaches.approach import DataPoints, ThoughtStep -from approaches.chatapproach import ChatApproach, StreamingThoughtStep -from approaches.promptmanager import PromptManager -from core.authentication import AuthenticationHelper - - -class ChatReadRetrieveReflectReadApproach(ChatApproach): - """ - A multi-step approach that first uses OpenAI to turn the user's question into a search query, - then uses Azure AI Search to retrieve relevant documents, and then sends the conversation history, - original user question, and search results to OpenAI to generate a response. 
- """ - - def __init__( - self, - *, - search_client: SearchClient, - auth_helper: AuthenticationHelper, - openai_client: AsyncOpenAI, - chatgpt_model: str, - chatgpt_deployment: Optional[str], # Not needed for non-Azure OpenAI - embedding_deployment: Optional[str], # Not needed for non-Azure OpenAI or for retrieval_mode="text" - embedding_model: str, - embedding_dimensions: int, - sourcepage_field: str, - content_field: str, - query_language: str, - query_speller: str, - prompt_manager: PromptManager, - reasoning_effort: Optional[str] = None, - max_steps: Optional[int] = None, - ): - self.search_client = search_client - self.openai_client = openai_client - self.auth_helper = auth_helper - self.chatgpt_model = chatgpt_model - self.chatgpt_deployment = chatgpt_deployment - self.embedding_deployment = embedding_deployment - self.embedding_model = embedding_model - self.embedding_dimensions = embedding_dimensions - self.sourcepage_field = sourcepage_field - self.content_field = content_field - self.query_language = query_language - self.query_speller = query_speller - self.prompt_manager = prompt_manager - self.query_rewrite_prompt = self.prompt_manager.load_prompt("chat_query_rewrite.prompty") - self.query_rewrite_tools = self.prompt_manager.load_tools("chat_query_rewrite_tools.json") - self.answer_prompt = self.prompt_manager.load_prompt("chat_answer_question.prompty") - self.reasoning_effort = reasoning_effort - self.include_token_usage = True - self.max_steps = max_steps or 3 - - async def run_until_final_call( - self, - messages: list[ChatCompletionMessageParam], - overrides: dict[str, Any], - auth_claims: dict[str, Any], - should_stream: bool = False, - ) -> AsyncGenerator[StreamingThoughtStep, None]: - use_text_search = overrides.get("retrieval_mode") in ["text", "hybrid", None] - use_vector_search = overrides.get("retrieval_mode") in ["vectors", "hybrid", None] - use_semantic_ranker = True if overrides.get("semantic_ranker") else False - use_semantic_captions = True if overrides.get("semantic_captions") else False - use_query_rewriting = True if overrides.get("query_rewriting") else False - top = overrides.get("top", 3) - minimum_search_score = overrides.get("minimum_search_score", 0.0) - minimum_reranker_score = overrides.get("minimum_reranker_score", 0.0) - filter = self.build_filter(overrides, auth_claims) - - original_user_query = messages[-1]["content"] - if not isinstance(original_user_query, str): - raise ValueError("The most recent message content must be a string.") - - reasoning_model_support = self.GPT_REASONING_MODELS.get(self.chatgpt_model) - if reasoning_model_support and (not reasoning_model_support.streaming and should_stream): - raise Exception( - f"{self.chatgpt_model} does not support streaming. Please use a different model or disable streaming." 
- ) - - # STEP 1: Generate an optimized keyword search query based on the chat history and the last question - query_messages = self.prompt_manager.render_prompt( - self.query_rewrite_prompt, {"user_query": original_user_query, "past_messages": messages[:-1]} - ) - tools: list[ChatCompletionToolParam] = self.query_rewrite_tools - - chat_completion = cast( - ChatCompletion, - await self.create_chat_completion( - self.chatgpt_deployment, - self.chatgpt_model, - messages=query_messages, - overrides=overrides, - response_token_limit=self.get_response_token_limit( - self.chatgpt_model, 100 - ), # Setting too low risks malformed JSON, setting too high may affect performance - temperature=0.0, # Minimize creativity for search query generation - tools=tools, - reasoning_effort="low", # Minimize reasoning for search query generation - ), - ) - - yield StreamingThoughtStep( - step=self.format_thought_step_for_chatcompletion( - title="Prompt to generate search query", - messages=query_messages, - overrides=overrides, - model=self.chatgpt_model, - deployment=self.chatgpt_deployment, - usage=chat_completion.usage, - reasoning_effort="low", - ), - role="tool" - ) - - query_text = self.get_search_query(chat_completion, original_user_query) - - # STEP 2: Retrieve relevant documents from the search index with the GPT optimized query - yield StreamingThoughtStep( - step=ThoughtStep( - "Search using generated search query", - query_text, - { - "use_semantic_captions": use_semantic_captions, - "use_semantic_ranker": use_semantic_ranker, - "use_query_rewriting": use_query_rewriting, - "top": top, - "filter": filter, - "use_vector_search": use_vector_search, - "use_text_search": use_text_search, - }, - ), - role="tool" - ) - - # If retrieval mode includes vectors, compute an embedding for the query - vectors: list[VectorQuery] = [] - if use_vector_search: - vectors.append(await self.compute_text_embedding(query_text)) - - results = await self.search( - top, - query_text, - filter, - vectors, - use_text_search, - use_vector_search, - use_semantic_ranker, - use_semantic_captions, - minimum_search_score, - minimum_reranker_score, - use_query_rewriting, - ) - - yield StreamingThoughtStep( - step=ThoughtStep( - "Search results", - [result.serialize_for_results() for result in results], - ), - role="tool" - ) - - # STEP 3: Generate a contextual and content specific answer using the search results and chat history - text_sources = self.get_sources_content(results, use_semantic_captions, use_image_citation=False) - messages = self.prompt_manager.render_prompt( - self.answer_prompt, - self.get_system_prompt_variables(overrides.get("prompt_template")) - | { - "include_follow_up_questions": bool(overrides.get("suggest_followup_questions")), - "past_messages": messages[:-1], - "user_query": original_user_query, - "text_sources": text_sources, - }, - ) - - yield StreamingThoughtStep( - step=self.format_thought_step_for_chatcompletion( - title="Prompt to generate answer", - messages=messages, - overrides=overrides, - model=self.chatgpt_model, - deployment=self.chatgpt_deployment, - usage=None - ), - chat_completion=self.create_chat_completion( - self.chatgpt_deployment, - self.chatgpt_model, - messages, - overrides, - self.get_response_token_limit(self.chatgpt_model, 1024), - should_stream, - ), - data_points=DataPoints(text=text_sources) - ) diff --git a/app/backend/approaches/prompts/chat_reflect_answer.prompty b/app/backend/approaches/prompts/chat_reflect_answer.prompty index d5149ceb2f..f95175eb41 100644 --- 
a/app/backend/approaches/prompts/chat_reflect_answer.prompty +++ b/app/backend/approaches/prompts/chat_reflect_answer.prompty @@ -5,10 +5,8 @@ model: api: chat --- system: -SYSTEM: You are an expert in evaluating the quality of a RESPONSE from an intelligent system based on three communication traits: Relevance, Groundedness, and Correctness. Your job is to assign each trait a score from 1 to 5 using the definitions below. -USER: # Definitions ## Relevance @@ -32,34 +30,29 @@ USER: 4 - Mostly Correct: Largely accurate with only minor inaccuracies. 5 - Fully Correct: Entirely accurate, fact-based, and logically consistent. -# Data -CONTEXT: {{context}} -QUERY: {{query}} -RESPONSE: {{response}} - # Tasks For each trait—Relevance, Groundedness, Correctness—produce: • ThoughtChain: start with “Let's think step by step:” and give a concise chain of reasoning. • Explanation: a very short justification. -• Score: an integer from 1 to 5. - -Wrap your outputs in these tags: - -Relevance: -…ThoughtChain… -…Explanation… -…Score… - -Groundedness: -…ThoughtChain… -…Explanation… -…Score… - -Correctness: -…ThoughtChain… -…Explanation… -…Score… - -Finally, based on your reflections, propose a new query to search a knowledge base for any potentially missing context. Wrap it in: - -…your proposed search query… +• Score: an integer from 1 to 5. + +Based on your reflection, if it is necessary to search a knowledge base for any potentially missing context, propose it. + +Based on your reflection, if it is necessary to adjust the final answer to improve the quality of the response, propose the adjusted answer + +# Data +CONTEXT: +{% for text_source in text_sources %} +{{ text_source }} +{% endfor %} +QUERY: {{query}} +RESPONSE: {{response}} + +user: + +Conversation History: + +{% for message in past_messages %} +{{ message["role"] }}: +{{ message["content"] }} +{% endfor %} diff --git a/app/backend/approaches/prompts/chat_reflect_answer_tools.json b/app/backend/approaches/prompts/chat_reflect_answer_tools.json new file mode 100644 index 0000000000..92549c91c8 --- /dev/null +++ b/app/backend/approaches/prompts/chat_reflect_answer_tools.json @@ -0,0 +1,111 @@ +[ + { + "type": "function", + "function": { + "name": "reflect_answer", + "description": "Reflect on an answer to a question and decide on the next step based on the context of the conversation.", + "parameters": { + "type": "object", + "properties": { + "relevance": { + "type": "object", + "description": "Relevance of the answer to the question asked.", + "properties": { + "thoughtChain": { + "type": "string", + "description": "A chain of thoughts that led to the conclusion about the relevance of the answer." + }, + "explanation": { + "type": "string", + "description": "An explanation of why the answer is relevant or not relevant to the question asked." + }, + "score": { + "type": "number", + "description": "A score indicating how relevant the answer is to the question asked, on a scale from 1 to 5." + } + } + }, + "groundedness": { + "type": "object", + "description": "Groundedness of the answer based on the context for the question.", + "properties": { + "thoughtChain": { + "type": "string", + "description": "A chain of thoughts that led to the conclusion about the groundedness of the answer." + }, + "explanation": { + "type": "string", + "description": "An explanation of why the answer is grounded based on the context." 
+ }, + "score": { + "type": "number", + "description": "A score indicating how grounded the answer is to the context given, on a scale from 1 to 5." + } + } + }, + "correctness": { + "type": "object", + "description": "Correctness of the answer based on the context for the question.", + "properties": { + "thoughtChain": { + "type": "string", + "description": "A chain of thoughts that led to the conclusion about the correctness of the answer." + }, + "explanation": { + "type": "string", + "description": "An explanation of why the answer is correct or not correct based on the context." + }, + "score": { + "type": "number", + "description": "A score indicating how correct the answer is to the context given, on a scale from 1 to 5." + } + } + } + }, + "required": [ + "relevance", + "groundedness", + "correctness" + ] + } + } + }, + { + "type": "function", + "function": { + "name": "search_index", + "description": "Search the index to find relevant information based on the question asked.", + "parameters": { + "type": "object", + "properties": { + "query": { + "type": "string", + "description": "The query to search in the index." + } + }, + "required": [ + "query" + ] + } + } + }, + { + "type": "function", + "function": { + "name": "rewrite_answer", + "description": "Rewrite an answer to a question based on the context of the conversation and the relevance, groundedness, and correctness of the original answer.", + "parameters": { + "type": "object", + "properties": { + "answer": { + "type": "string", + "description": "The updated answer to the query" + } + }, + "required": [ + "answer" + ] + } + } + } +] \ No newline at end of file From bce7553cd215b8b82b135c25a41c3907a1f4b823 Mon Sep 17 00:00:00 2001 From: Matt Gotteiner Date: Tue, 22 Apr 2025 13:46:35 -0700 Subject: [PATCH 08/10] checkpoint --- app/backend/app.py | 6 + app/backend/approaches/approach.py | 11 +- app/backend/approaches/chatapproach.py | 29 ++- .../approaches/chatreadretrieveread.py | 166 +++++++++++++++--- .../prompts/chat_answer_question.prompty | 22 +++ .../prompts/chat_reflect_answer.prompty | 16 +- .../prompts/chat_reflect_answer_tools.json | 50 ++---- .../AnalysisPanel/AnalysisPanel.module.css | 27 +++ .../AnalysisPanel/CandidateAnswer.tsx | 19 ++ .../components/AnalysisPanel/Evaluation.tsx | 24 +++ .../components/AnalysisPanel/Reflection.tsx | 19 ++ .../AnalysisPanel/ThoughtProcess.tsx | 12 +- 12 files changed, 329 insertions(+), 72 deletions(-) create mode 100644 app/frontend/src/components/AnalysisPanel/CandidateAnswer.tsx create mode 100644 app/frontend/src/components/AnalysisPanel/Evaluation.tsx create mode 100644 app/frontend/src/components/AnalysisPanel/Reflection.tsx diff --git a/app/backend/app.py b/app/backend/app.py index 3d6ce2dc25..01c3bf23e7 100644 --- a/app/backend/app.py +++ b/app/backend/app.py @@ -432,6 +432,7 @@ async def setup_clients(): # Shared by all OpenAI deployments OPENAI_HOST = os.getenv("OPENAI_HOST", "azure") OPENAI_CHATGPT_MODEL = os.environ["AZURE_OPENAI_CHATGPT_MODEL"] + OPENAI_CHATGPT_REFLECTION_MODEL = os.environ.get("AZURE_OPENAI_CHATGPT_REFLECTION_MODEL") OPENAI_EMB_MODEL = os.getenv("AZURE_OPENAI_EMB_MODEL_NAME", "text-embedding-ada-002") OPENAI_EMB_DIMENSIONS = int(os.getenv("AZURE_OPENAI_EMB_DIMENSIONS") or 1536) OPENAI_REASONING_EFFORT = os.getenv("AZURE_OPENAI_REASONING_EFFORT") @@ -442,6 +443,9 @@ async def setup_clients(): AZURE_OPENAI_CHATGPT_DEPLOYMENT = ( os.getenv("AZURE_OPENAI_CHATGPT_DEPLOYMENT") if OPENAI_HOST.startswith("azure") else None ) + 
AZURE_OPENAI_CHATGPT_REFLECTION_DEPLOYMENT = ( + os.getenv("AZURE_OPENAI_CHATGPT_REFLECTION_DEPLOYMENT") if OPENAI_HOST.startswith("azure") else None + ) AZURE_OPENAI_EMB_DEPLOYMENT = os.getenv("AZURE_OPENAI_EMB_DEPLOYMENT") if OPENAI_HOST.startswith("azure") else None AZURE_OPENAI_CUSTOM_URL = os.getenv("AZURE_OPENAI_CUSTOM_URL") # https://learn.microsoft.com/azure/ai-services/openai/api-version-deprecation#latest-ga-api-release @@ -696,6 +700,8 @@ async def setup_clients(): auth_helper=auth_helper, chatgpt_model=OPENAI_CHATGPT_MODEL, chatgpt_deployment=AZURE_OPENAI_CHATGPT_DEPLOYMENT, + chatgpt_reflection_model=OPENAI_CHATGPT_REFLECTION_MODEL, + chatgpt_reflection_deployment=AZURE_OPENAI_CHATGPT_REFLECTION_DEPLOYMENT, embedding_model=OPENAI_EMB_MODEL, embedding_deployment=AZURE_OPENAI_EMB_DEPLOYMENT, embedding_dimensions=OPENAI_EMB_DIMENSIONS, diff --git a/app/backend/approaches/approach.py b/app/backend/approaches/approach.py index 74cb711a56..e66104ad37 100644 --- a/app/backend/approaches/approach.py +++ b/app/backend/approaches/approach.py @@ -28,6 +28,7 @@ ChatCompletionMessageParam, ChatCompletionReasoningEffort, ChatCompletionToolParam, + ChatCompletionNamedToolChoiceParam ) from approaches.promptmanager import PromptManager @@ -266,17 +267,21 @@ def nonewlines(s: str) -> str: return s.replace("\n", " ").replace("\r", " ") if use_semantic_captions: - return [ + results = [ (self.get_citation((doc.sourcepage or ""), use_image_citation)) + ": " + nonewlines(" . ".join([cast(str, c.text) for c in (doc.captions or [])])) for doc in results ] else: - return [ + results = [ (self.get_citation((doc.sourcepage or ""), use_image_citation)) + ": " + nonewlines(doc.content or "") for doc in results ] + + # Remove duplicates + results = list(set(results)) + return results def get_citation(self, sourcepage: str, use_image_citation: bool) -> str: if use_image_citation: @@ -352,6 +357,7 @@ def create_chat_completion( response_token_limit: int, should_stream: bool = False, tools: Optional[list[ChatCompletionToolParam]] = None, + tool_choice: Optional[ChatCompletionNamedToolChoiceParam] = None, temperature: Optional[float] = None, n: Optional[int] = None, reasoning_effort: Optional[ChatCompletionReasoningEffort] = None, @@ -380,6 +386,7 @@ def create_chat_completion( params["stream_options"] = {"include_usage": True} params["tools"] = tools + params["tool_choice"] = tool_choice # Azure OpenAI takes the deployment name as the model name return self.openai_client.chat.completions.create( diff --git a/app/backend/approaches/chatapproach.py b/app/backend/approaches/chatapproach.py index 163a6758b7..2c9e70967a 100644 --- a/app/backend/approaches/chatapproach.py +++ b/app/backend/approaches/chatapproach.py @@ -26,7 +26,8 @@ def __init__( chat_completion: Optional[Union[Awaitable[ChatCompletion], Awaitable[AsyncStream[ChatCompletionChunk]]]] = None, role: Optional[str] = "assistant", data_points: Optional[DataPoints] = None, - should_stream: bool = True): + should_stream: bool = True, + completion: Optional[str] = None): self.step = step self.chat_completion = chat_completion @@ -36,7 +37,8 @@ def __init__( self.should_stream = should_stream self._steps = [] self._step_i = -1 - self._completion = "" + self._completion = completion or "" + self._has_existing_completion = completion is not None def __aiter__(self): return self @@ -90,6 +92,10 @@ async def __anext__(self) -> Union[ChatCompletion, ChatCompletionChunk, DataPoin self._completion = result.choices[0].message.content if result.choices else "" 
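+                # Capture the final text so the reflection loop can read it via get_completion(),
+                # and record the result so rewind() can replay this step without re-calling the model.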
self._steps.append(result) return result + elif self._has_existing_completion: + # Stream is none - yield already done completion + self._has_existing_completion = False + return self._completion if self.step is not None: result = self.step @@ -146,7 +152,6 @@ def get_reflection(self, chat_completion: ChatCompletion) -> Optional[Reflection response_message = chat_completion.choices[0].message reflection_response = ReflectionResponse() - print(response_message.model_dump()) if response_message.tool_calls: for tool in response_message.tool_calls: if tool.type != "function": @@ -172,12 +177,10 @@ def get_reflection(self, chat_completion: ChatCompletion) -> Optional[Reflection thought_chain=correctness_reflection.get("thoughtChain"), explanation=correctness_reflection.get("explanation") ) - if function.name == "search_index": - arg = json.loads(function.arguments) - reflection_response.next_query = arg.get("query") - if function.name == "rewrite_answer": - arg = json.loads(function.arguments) - reflection_response.next_answer = arg.get("answer") + if next_answer := arg.get("next_answer"): + reflection_response.next_answer = next_answer + if next_query := arg.get("next_query"): + reflection_response.next_query = next_query return reflection_response @@ -205,6 +208,9 @@ async def run_without_streaming( if isinstance(chunk, ChatCompletion): content = chunk.choices[0].message.content role = chunk.choices[0].message.role + elif isinstance(chunk, str): + content = chunk + role = "assistant" elif isinstance(chunk, ThoughtStep): extra_info.thoughts.append(chunk) elif isinstance(chunk, DataPoints): @@ -255,6 +261,11 @@ async def run_with_streaming( followup_content += content else: yield completion + elif isinstance(chunk, str): + content = chunk + role = "assistant" + completion = { "delta": {"content": content, "role": role} } + yield completion elif isinstance(chunk, ThoughtStep): extra_info.thoughts.append(chunk) yield {"delta": {"role": "assistant"}, "context": extra_info, "session_state": session_state} diff --git a/app/backend/approaches/chatreadretrieveread.py b/app/backend/approaches/chatreadretrieveread.py index 8dc64bf4d0..7df341d5a9 100644 --- a/app/backend/approaches/chatreadretrieveread.py +++ b/app/backend/approaches/chatreadretrieveread.py @@ -1,15 +1,15 @@ from collections.abc import Awaitable from typing import Any, Optional, Union, cast, AsyncGenerator -import asyncio +from copy import deepcopy from azure.search.documents.aio import SearchClient from azure.search.documents.models import VectorQuery -from openai import AsyncOpenAI, AsyncStream +from openai import AsyncOpenAI from openai.types.chat import ( ChatCompletion, - ChatCompletionChunk, ChatCompletionMessageParam, ChatCompletionToolParam, + ChatCompletionNamedToolChoiceParam ) from approaches.approach import DataPoints, ThoughtStep @@ -34,6 +34,8 @@ def __init__( openai_client: AsyncOpenAI, chatgpt_model: str, chatgpt_deployment: Optional[str], # Not needed for non-Azure OpenAI + chatgpt_reflection_model: Optional[str], + chatgpt_reflection_deployment: Optional[str], # Not needed for non-Azure OpenAI embedding_deployment: Optional[str], # Not needed for non-Azure OpenAI or for retrieval_mode="text" embedding_model: str, embedding_dimensions: int, @@ -50,6 +52,8 @@ def __init__( self.auth_helper = auth_helper self.chatgpt_model = chatgpt_model self.chatgpt_deployment = chatgpt_deployment + self.chatgpt_reflection_model = chatgpt_reflection_model + self.chatgpt_reflection_deployment = chatgpt_reflection_deployment 
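+        # Evaluator model/deployment for the reflection calls, configured in app.py via
+        # AZURE_OPENAI_CHATGPT_REFLECTION_MODEL and AZURE_OPENAI_CHATGPT_REFLECTION_DEPLOYMENT.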
self.embedding_deployment = embedding_deployment self.embedding_model = embedding_model self.embedding_dimensions = embedding_dimensions @@ -180,12 +184,13 @@ async def run_until_final_call( # STEP 3: Generate a contextual and content specific answer using the search results and chat history text_sources = self.get_sources_content(results, use_semantic_captions, use_image_citation=False) - messages = self.prompt_manager.render_prompt( + answer_messages = deepcopy(messages) + answer_messages = self.prompt_manager.render_prompt( self.answer_prompt, self.get_system_prompt_variables(overrides.get("prompt_template")) | { "include_follow_up_questions": bool(overrides.get("suggest_followup_questions")), - "past_messages": messages[:-1], + "past_messages": answer_messages[:-1], "user_query": original_user_query, "text_sources": text_sources, }, @@ -194,7 +199,7 @@ async def run_until_final_call( answer_step = StreamingThoughtStep( step=self.format_thought_step_for_chatcompletion( title="Prompt to generate answer", - messages=messages, + messages=answer_messages, overrides=overrides, model=self.chatgpt_model, deployment=self.chatgpt_deployment, @@ -202,7 +207,7 @@ async def run_until_final_call( chat_completion=self.create_chat_completion( self.chatgpt_deployment, self.chatgpt_model, - messages, + answer_messages, overrides, self.get_response_token_limit(self.chatgpt_model, 1024), should_stream, @@ -214,16 +219,30 @@ async def run_until_final_call( yield answer_step return + answer_passed_eval = False + next_answer = "" # Step 4: Reflection loop to improve the answer for i in range(reflection_max_steps): - answer_chunks = [] - - # Read the answer + # Read the candidate answer step await answer_step.start() async for chunk in answer_step: - if isinstance(chunk, ThoughtStep): - answer_thought = chunk - + pass + + yield StreamingThoughtStep( + step=self.format_thought_step_for_chatcompletion( + title="Generate candidate answer", + messages=answer_messages, + overrides=overrides, + model=self.chatgpt_model, + deployment=self.chatgpt_deployment, + additional_properties={ + "candidate_answer": answer_step.get_completion() + } + ), + data_points=DataPoints(text=text_sources), + should_stream=False + ) + # STEP 5: Determine the next action to take reflect_messages = self.prompt_manager.render_prompt( self.reflect_prompt, {"text_sources": text_sources, "query": original_user_query, "response": answer_step.get_completion(), "past_messages": messages[:-1]} @@ -233,13 +252,14 @@ async def run_until_final_call( chat_completion = cast( ChatCompletion, await self.create_chat_completion( - self.chatgpt_deployment, - self.chatgpt_model, + self.chatgpt_reflection_deployment, + self.chatgpt_reflection_model, messages=reflect_messages, overrides=overrides, response_token_limit=self.get_response_token_limit(self.chatgpt_model, 1024), temperature=0.0, # Minimize creativity for reflection - tools=tools + tools=tools, + tool_choice=ChatCompletionNamedToolChoiceParam(function={"name": self.reflect_tools[0]["function"]["name"]}, type="function"), ) ) reflection = self.get_reflection(chat_completion) @@ -249,18 +269,124 @@ async def run_until_final_call( title="Prompt to reflect on answer", messages=reflect_messages, overrides=overrides, - model=self.chatgpt_model, - deployment=self.chatgpt_deployment, + model=self.chatgpt_reflection_model, + deployment=self.chatgpt_reflection_deployment, usage=chat_completion.usage, additional_properties=dataclasses.asdict(reflection) ), role="tool" ) + # If the reflection was good, stop 
generating + answer_passed_eval = reflection.groundedness.score >= 4 and reflection.correctness.score >= 4 and reflection.relevance.score >= 4 + if answer_passed_eval: + break + + if reflection.next_answer: + next_answer = reflection.next_answer + if reflection.next_query: + # Repeat STEP 2: Retrieve relevant documents from the search index with the GPT optimized query + yield StreamingThoughtStep( + step=ThoughtStep( + "Updated search using reflected search query", + reflection.next_query, + { + "use_semantic_captions": use_semantic_captions, + "use_semantic_ranker": use_semantic_ranker, + "use_query_rewriting": use_query_rewriting, + "top": top, + "filter": filter, + "use_vector_search": use_vector_search, + "use_text_search": use_text_search, + }, + ), + role="tool" + ) + + # If retrieval mode includes vectors, compute an embedding for the query + vectors: list[VectorQuery] = [] + if use_vector_search: + vectors.append(await self.compute_text_embedding(reflection.next_query)) + + reflection_results = await self.search( + top, + reflection.next_query, + filter, + vectors, + use_text_search, + use_vector_search, + use_semantic_ranker, + use_semantic_captions, + minimum_search_score, + minimum_reranker_score, + use_query_rewriting, + ) + + # Repeat STEP 3: Generate a contextual and content specific answer using the search results and chat history + results.extend(reflection_results) + text_sources = self.get_sources_content(results, use_semantic_captions, use_image_citation=False) + answer_messages = deepcopy(messages) + answer_messages = self.prompt_manager.render_prompt( + self.answer_prompt, + self.get_system_prompt_variables(overrides.get("prompt_template")) + | { + "include_follow_up_questions": bool(overrides.get("suggest_followup_questions")), + "past_messages": answer_messages[:-1], + "user_query": original_user_query, + "text_sources": text_sources, + "previous_answer": answer_step.get_completion(), + "previous_answer_evaluations": [ + { "label": "Groundedness", "score": reflection.groundedness.score, "explanation": reflection.groundedness.explanation }, + { "label": "Correctness", "score": reflection.correctness.score, "explanation": reflection.correctness.explanation }, + { "label": "Relevance", "score": reflection.relevance.score, "explanation": reflection.relevance.explanation }, + ], + "revised_answer": reflection.next_answer + }, + ) + + answer_step = StreamingThoughtStep( + step=self.format_thought_step_for_chatcompletion( + title="Prompt to generate updated reflected answer", + messages=answer_messages, + overrides=overrides, + model=self.chatgpt_model, + deployment=self.chatgpt_deployment, + ), + chat_completion=self.create_chat_completion( + self.chatgpt_deployment, + self.chatgpt_model, + answer_messages, + overrides, + self.get_response_token_limit(self.chatgpt_model, 1024), + should_stream, + ), + data_points=DataPoints(text=text_sources), + should_stream=should_stream + ) + else: + # No new query, yield revised answer + break + + if answer_passed_eval: answer_step.rewind() yield answer_step - return - + else: + next_answer = reflection.next_answer or next_answer + if next_answer: + yield StreamingThoughtStep( + step=self.format_thought_step_for_chatcompletion( + title="Using reflection revised answer", + messages=answer_messages, + overrides=overrides, + model=self.chatgpt_model, + deployment=self.chatgpt_deployment, + ), + completion=next_answer, + data_points=DataPoints(text=text_sources), + should_stream=False + ) + else: + yield answer_step diff --git 
a/app/backend/approaches/prompts/chat_answer_question.prompty b/app/backend/approaches/prompts/chat_answer_question.prompty index 3dcb05ae21..91a7733a6e 100644 --- a/app/backend/approaches/prompts/chat_answer_question.prompty +++ b/app/backend/approaches/prompts/chat_answer_question.prompty @@ -37,6 +37,28 @@ Do not repeat questions that have already been asked. Make sure the last question ends with ">>". {% endif %} +{% if previous_answer %} +You've previously attempted to answer this question and it has been evalauted that the previous answer was not sufficient +Previous answer: +{{ previous_answer }} + +Why was the previous answer insufficient? +{% for evaluation in previous_answer_evaluations %} +Evaluation: {{ evaluation["label"] }} +Score: {{ evaluation["score"] }} +Explanation: {{ evaluation["explanation"] }} +{% endfor %} + +{% if revised_answer %} +The previous answer was revised to improve the quality of the response. Use this to help generate a better answer: +Revised answer: +{{ revised_answer }} + +{% endif %} + +Use this information to improve the answer next time +{% endif %} + {% for message in past_messages %} {{ message["role"] }}: {{ message["content"] }} diff --git a/app/backend/approaches/prompts/chat_reflect_answer.prompty b/app/backend/approaches/prompts/chat_reflect_answer.prompty index f95175eb41..8f78f3d18e 100644 --- a/app/backend/approaches/prompts/chat_reflect_answer.prompty +++ b/app/backend/approaches/prompts/chat_reflect_answer.prompty @@ -34,11 +34,25 @@ You are an expert in evaluating the quality of a RESPONSE from an intelligent sy For each trait—Relevance, Groundedness, Correctness—produce: • ThoughtChain: start with “Let's think step by step:” and give a concise chain of reasoning. • Explanation: a very short justification. -• Score: an integer from 1 to 5. +• Score: an integer from 1 to 5. A response like "I don't know" can never achieve a high score Based on your reflection, if it is necessary to search a knowledge base for any potentially missing context, propose it. +Query generation guidelines: +You have access to Azure AI Search index with 100's of documents. +Generate a search query based on the conversation and the new question. +Do not include cited source filenames and document names e.g. info.txt or doc.pdf in the search query terms. +Do not include any text inside [] or <<>> in the search query terms. +Do not include any special characters like '+'. +If the question is not in English, translate the question to English before generating the search query. Based on your reflection, if it is necessary to adjust the final answer to improve the quality of the response, propose the adjusted answer +If the answer cannot be improved, return an empty string. +Answer generation guidelines: +Assistant helps the company employees with their healthcare plan questions, and questions about the employee handbook. Be brief in your answers. +Answer ONLY with the facts listed in the list of sources below. If there isn't enough information below, say you don't know. Do not generate answers that don't use the sources below. If asking a clarifying question to the user would help, ask the question. +If the question is not in English, answer in the language used in the question. +Each source has a name followed by colon and the actual information, always include the source name for each fact you use in the response. Use square brackets to reference the source, for example [info1.txt]. 
Don't combine sources, list each source separately, for example [info1.txt][info2.pdf]. + # Data CONTEXT: diff --git a/app/backend/approaches/prompts/chat_reflect_answer_tools.json b/app/backend/approaches/prompts/chat_reflect_answer_tools.json index 92549c91c8..8a405fae11 100644 --- a/app/backend/approaches/prompts/chat_reflect_answer_tools.json +++ b/app/backend/approaches/prompts/chat_reflect_answer_tools.json @@ -60,50 +60,22 @@ "description": "A score indicating how correct the answer is to the context given, on a scale from 1 to 5." } } - } - }, - "required": [ - "relevance", - "groundedness", - "correctness" - ] - } - } - }, - { - "type": "function", - "function": { - "name": "search_index", - "description": "Search the index to find relevant information based on the question asked.", - "parameters": { - "type": "object", - "properties": { - "query": { + }, + "next_query": { "type": "string", - "description": "The query to search in the index." - } - }, - "required": [ - "query" - ] - } - } - }, - { - "type": "function", - "function": { - "name": "rewrite_answer", - "description": "Rewrite an answer to a question based on the context of the conversation and the relevance, groundedness, and correctness of the original answer.", - "parameters": { - "type": "object", - "properties": { - "answer": { + "description": "The query to search in the index if necessary. Output an empty string if it's not necessary" + }, + "next_answer": { "type": "string", - "description": "The updated answer to the query" + "description": "The answer to the question asked, which may be empty if no correction is necessary or a new one based on the context." } }, "required": [ - "answer" + "relevance", + "groundedness", + "correctness", + "next_query", + "next_answer" ] } } diff --git a/app/frontend/src/components/AnalysisPanel/AnalysisPanel.module.css b/app/frontend/src/components/AnalysisPanel/AnalysisPanel.module.css index 84b9f110ea..17ad5e751e 100644 --- a/app/frontend/src/components/AnalysisPanel/AnalysisPanel.module.css +++ b/app/frontend/src/components/AnalysisPanel/AnalysisPanel.module.css @@ -134,3 +134,30 @@ background-color: #424242; color: #ffffff; } + +.evaluationContainer { + margin: 16px 0; + padding: 16px; + border: 1px solid #ddd; + border-radius: 8px; + background-color: #f9f9f9; +} + +.evaluationLabel { + font-size: 18px; + font-weight: bold; + margin-bottom: 8px; + color: #333; +} + +.evaluationList { + list-style-type: none; + padding: 0; + margin: 0; +} + +.evaluationList li { + margin: 4px 0; + font-size: 14px; + color: #555; +} diff --git a/app/frontend/src/components/AnalysisPanel/CandidateAnswer.tsx b/app/frontend/src/components/AnalysisPanel/CandidateAnswer.tsx new file mode 100644 index 0000000000..5d37fd649b --- /dev/null +++ b/app/frontend/src/components/AnalysisPanel/CandidateAnswer.tsx @@ -0,0 +1,19 @@ +import React from "react"; +import styles from "./AnalysisPanel.module.css"; + +interface CandidateAnswerProps { + candidate_answer: string | undefined; +} + +export const CandidateAnswer: React.FC = ({ candidate_answer }) => { + return ( +
+        <div className={styles.evaluationContainer}>
+            <div className={styles.evaluationLabel}>Candidate Answer</div>
+            {candidate_answer ? (
+                <div>{candidate_answer}</div>
+            ) : (
+                <div>No candidate answer available</div>
+            )}
+        </div>
+ ); +}; diff --git a/app/frontend/src/components/AnalysisPanel/Evaluation.tsx b/app/frontend/src/components/AnalysisPanel/Evaluation.tsx new file mode 100644 index 0000000000..6bc145f2c5 --- /dev/null +++ b/app/frontend/src/components/AnalysisPanel/Evaluation.tsx @@ -0,0 +1,24 @@ +import React from "react"; +import styles from "./AnalysisPanel.module.css"; + +interface EvaluationProps { + label: string; + value: { + thought_chain: string; + score: number; + explanation: string; + }; +} + +export const Evaluation: React.FC = ({ label, value }) => { + return ( +
+        <div className={styles.evaluationContainer}>
+            <div className={styles.evaluationLabel}>{label}</div>
+            <ul className={styles.evaluationList}>
+                <li>Thought Chain: {value.thought_chain}</li>
+                <li>Score: {value.score}</li>
+                <li>Explanation: {value.explanation}</li>
+            </ul>
+        </div>
+ ); +}; diff --git a/app/frontend/src/components/AnalysisPanel/Reflection.tsx b/app/frontend/src/components/AnalysisPanel/Reflection.tsx new file mode 100644 index 0000000000..3f057298d7 --- /dev/null +++ b/app/frontend/src/components/AnalysisPanel/Reflection.tsx @@ -0,0 +1,19 @@ +import React from "react"; +import styles from "./AnalysisPanel.module.css"; + +interface ReflectionProps { + next_answer: string | undefined; + next_query: string | undefined; +} + +export const Reflection: React.FC = ({ next_answer, next_query }) => { + return ( +
+        <div className={styles.evaluationContainer}>
+            <div className={styles.evaluationLabel}>Next Steps</div>
+            <ul className={styles.evaluationList}>
+                <li>Next Query: {next_query}</li>
+                <li>Revised Answer: {next_answer}</li>
+            </ul>
+        </div>
+ ); +}; diff --git a/app/frontend/src/components/AnalysisPanel/ThoughtProcess.tsx b/app/frontend/src/components/AnalysisPanel/ThoughtProcess.tsx index f666960da1..9b27c8a32c 100644 --- a/app/frontend/src/components/AnalysisPanel/ThoughtProcess.tsx +++ b/app/frontend/src/components/AnalysisPanel/ThoughtProcess.tsx @@ -7,6 +7,9 @@ import styles from "./AnalysisPanel.module.css"; import { Thoughts } from "../../api"; import { TokenUsageGraph } from "./TokenUsageGraph"; +import { Evaluation } from "./Evaluation"; +import { Reflection } from "./Reflection"; +import { CandidateAnswer } from "./CandidateAnswer"; SyntaxHighlighter.registerLanguage("json", json); @@ -14,6 +17,8 @@ interface Props { thoughts: Thoughts[]; } +const known_keys = ["token_usage", "reasoning_effort", "groundedness", "relevance", "correctness", "next_query", "next_answer", "candidate_answer"]; + export const ThoughtProcess = ({ thoughts }: Props) => { return (
    @@ -23,13 +28,18 @@ export const ThoughtProcess = ({ thoughts }: Props) => {
                        <div className={styles.tStep}>{t.title}</div>
    {t.props && - (Object.keys(t.props).filter(k => k !== "token_usage") || []).map((k: any) => ( + (Object.keys(t.props).filter(k => !known_keys.includes(k)) || []).map((k: any) => ( {k}: {JSON.stringify(t.props?.[k])} ))} {t.props?.token_usage && } + {t.props?.groundedness && } + {t.props?.relevance && } + {t.props?.correctness && } + {(t.props?.next_query || t.props?.next_answer) && } + {t.props?.candidate_answer && } {Array.isArray(t.description) ? ( {JSON.stringify(t.description, null, 2)} From 6380656185125f9aaf70665dd15f7d4d9b2d9323 Mon Sep 17 00:00:00 2001 From: Matt Gotteiner Date: Tue, 22 Apr 2025 14:12:52 -0700 Subject: [PATCH 09/10] checkpoint 2 --- app/backend/app.py | 4 ++++ .../approaches/chatreadretrieveread.py | 2 +- app/backend/config.py | 1 + app/frontend/src/api/models.ts | 2 ++ .../src/components/Settings/Settings.tsx | 19 +++++++++++++++++++ app/frontend/src/locales/en/translation.json | 3 +++ app/frontend/src/pages/ask/Ask.tsx | 9 +++++++++ app/frontend/src/pages/chat/Chat.tsx | 10 ++++++++++ 8 files changed, 49 insertions(+), 1 deletion(-) diff --git a/app/backend/app.py b/app/backend/app.py index 01c3bf23e7..194bcadeee 100644 --- a/app/backend/app.py +++ b/app/backend/app.py @@ -73,6 +73,7 @@ CONFIG_LANGUAGE_PICKER_ENABLED, CONFIG_OPENAI_CLIENT, CONFIG_QUERY_REWRITING_ENABLED, + CONFIG_REFLECTION_ENABLED, CONFIG_REASONING_EFFORT_ENABLED, CONFIG_SEARCH_CLIENT, CONFIG_SEMANTIC_RANKER_DEPLOYED, @@ -301,6 +302,7 @@ def config(): "showGPT4VOptions": current_app.config[CONFIG_GPT4V_DEPLOYED], "showSemanticRankerOption": current_app.config[CONFIG_SEMANTIC_RANKER_DEPLOYED], "showQueryRewritingOption": current_app.config[CONFIG_QUERY_REWRITING_ENABLED], + "showReflectionOption": current_app.config[CONFIG_REFLECTION_ENABLED], "showReasoningEffortOption": current_app.config[CONFIG_REASONING_EFFORT_ENABLED], "streamingEnabled": current_app.config[CONFIG_STREAMING_ENABLED], "defaultReasoningEffort": current_app.config[CONFIG_DEFAULT_REASONING_EFFORT], @@ -479,6 +481,7 @@ async def setup_clients(): USE_GPT4V = os.getenv("USE_GPT4V", "").lower() == "true" USE_USER_UPLOAD = os.getenv("USE_USER_UPLOAD", "").lower() == "true" + USE_REFLECTION = os.getenv("USE_REFLECTION", "").lower() == "true" ENABLE_LANGUAGE_PICKER = os.getenv("ENABLE_LANGUAGE_PICKER", "").lower() == "true" USE_SPEECH_INPUT_BROWSER = os.getenv("USE_SPEECH_INPUT_BROWSER", "").lower() == "true" USE_SPEECH_OUTPUT_BROWSER = os.getenv("USE_SPEECH_OUTPUT_BROWSER", "").lower() == "true" @@ -663,6 +666,7 @@ async def setup_clients(): or OPENAI_CHATGPT_MODEL not in Approach.GPT_REASONING_MODELS or Approach.GPT_REASONING_MODELS[OPENAI_CHATGPT_MODEL].streaming ) + current_app.config[CONFIG_REFLECTION_ENABLED] = USE_REFLECTION current_app.config[CONFIG_VECTOR_SEARCH_ENABLED] = os.getenv("USE_VECTORS", "").lower() != "false" current_app.config[CONFIG_USER_UPLOAD_ENABLED] = bool(USE_USER_UPLOAD) current_app.config[CONFIG_LANGUAGE_PICKER_ENABLED] = ENABLE_LANGUAGE_PICKER diff --git a/app/backend/approaches/chatreadretrieveread.py b/app/backend/approaches/chatreadretrieveread.py index 7df341d5a9..7b9cb58200 100644 --- a/app/backend/approaches/chatreadretrieveread.py +++ b/app/backend/approaches/chatreadretrieveread.py @@ -83,7 +83,7 @@ async def run_until_final_call( use_semantic_ranker = True if overrides.get("semantic_ranker") else False use_semantic_captions = True if overrides.get("semantic_captions") else False use_query_rewriting = True if overrides.get("query_rewriting") else False - use_reflection = True if 
overrides.get("reflection") else True + use_reflection = True if overrides.get("reflection") else False reflection_max_steps = overrides.get("reflection_max_steps", self.reflection_max_steps) top = overrides.get("top", 3) minimum_search_score = overrides.get("minimum_search_score", 0.0) diff --git a/app/backend/config.py b/app/backend/config.py index 5f3354f2da..b190e11615 100644 --- a/app/backend/config.py +++ b/app/backend/config.py @@ -11,6 +11,7 @@ CONFIG_GPT4V_DEPLOYED = "gpt4v_deployed" CONFIG_SEMANTIC_RANKER_DEPLOYED = "semantic_ranker_deployed" CONFIG_QUERY_REWRITING_ENABLED = "query_rewriting_enabled" +CONFIG_REFLECTION_ENABLED = "reflection_enabled" CONFIG_REASONING_EFFORT_ENABLED = "reasoning_effort_enabled" CONFIG_VISION_REASONING_EFFORT_ENABLED = "vision_reasoning_effort_enabled" CONFIG_DEFAULT_REASONING_EFFORT = "default_reasoning_effort" diff --git a/app/frontend/src/api/models.ts b/app/frontend/src/api/models.ts index c915a19ee5..ef7847ea71 100644 --- a/app/frontend/src/api/models.ts +++ b/app/frontend/src/api/models.ts @@ -21,6 +21,7 @@ export type ChatAppRequestOverrides = { semantic_ranker?: boolean; semantic_captions?: boolean; query_rewriting?: boolean; + reflection?: boolean; reasoning_effort?: string; include_category?: string; exclude_category?: string; @@ -89,6 +90,7 @@ export type Config = { showSemanticRankerOption: boolean; showQueryRewritingOption: boolean; showReasoningEffortOption: boolean; + showReflectionOption: boolean; streamingEnabled: boolean; showVectorOption: boolean; showUserUpload: boolean; diff --git a/app/frontend/src/components/Settings/Settings.tsx b/app/frontend/src/components/Settings/Settings.tsx index b16beb0246..a06469d985 100644 --- a/app/frontend/src/components/Settings/Settings.tsx +++ b/app/frontend/src/components/Settings/Settings.tsx @@ -20,6 +20,7 @@ export interface SettingsProps { useSemanticRanker: boolean; useSemanticCaptions: boolean; useQueryRewriting: boolean; + useReflection: boolean; reasoningEffort: string; excludeCategory: string; includeCategory: string; @@ -30,6 +31,7 @@ export interface SettingsProps { showSemanticRankerOption: boolean; showQueryRewritingOption: boolean; showReasoningEffortOption: boolean; + showReflectionOption: boolean; showGPT4VOptions: boolean; showVectorOption: boolean; useOidSecurityFilter: boolean; @@ -57,6 +59,7 @@ export const Settings = ({ useSemanticRanker, useSemanticCaptions, useQueryRewriting, + useReflection, reasoningEffort, excludeCategory, includeCategory, @@ -67,6 +70,7 @@ export const Settings = ({ showSemanticRankerOption, showQueryRewritingOption, showReasoningEffortOption, + showReflectionOption, showGPT4VOptions, showVectorOption, useOidSecurityFilter, @@ -106,6 +110,7 @@ export const Settings = ({ const semanticRankerFieldId = useId("semanticRankerField"); const queryRewritingFieldId = useId("queryRewritingField"); const reasoningEffortFieldId = useId("reasoningEffortField"); + const reflectionField = useId("reflectionField"); const semanticCaptionsId = useId("semanticCaptions"); const semanticCaptionsFieldId = useId("semanticCaptionsField"); const useOidSecurityFilterId = useId("useOidSecurityFilter"); @@ -266,6 +271,20 @@ export const Settings = ({ )} + {showReflectionOption && ( + <> + onChange("useReflection", !!checked)} + aria-labelledby={reasoningEffortFieldId} + onRenderLabel={props => renderLabel(props, reasoningEffortFieldId, reasoningEffortFieldId, t("helpTexts.useReflection"))} + /> + + )} + {showReasoningEffortOption && ( (true); const [useSemanticCaptions, 
setUseSemanticCaptions] = useState(false); const [useQueryRewriting, setUseQueryRewriting] = useState(false); + const [useReflection, setUseReflection] = useState(false); const [reasoningEffort, setReasoningEffort] = useState(""); const [useGPT4V, setUseGPT4V] = useState(false); const [gpt4vInput, setGPT4VInput] = useState(GPT4VInput.TextAndImages); @@ -45,6 +46,7 @@ export function Component(): JSX.Element { const [showGPT4VOptions, setShowGPT4VOptions] = useState(false); const [showSemanticRankerOption, setShowSemanticRankerOption] = useState(false); const [showQueryRewritingOption, setShowQueryRewritingOption] = useState(false); + const [showReflectionOption, setShowReflectionOption] = useState(false); const [showReasoningEffortOption, setShowReasoningEffortOption] = useState(false); const [showVectorOption, setShowVectorOption] = useState(false); const [showUserUpload, setShowUserUpload] = useState(false); @@ -84,6 +86,8 @@ export function Component(): JSX.Element { setShowSemanticRankerOption(config.showSemanticRankerOption); setUseQueryRewriting(config.showQueryRewritingOption); setShowQueryRewritingOption(config.showQueryRewritingOption); + setUseReflection(config.showReflectionOption); + setShowReflectionOption(config.showReflectionOption); setShowReasoningEffortOption(config.showReasoningEffortOption); if (config.showReasoningEffortOption) { setReasoningEffort(config.defaultReasoningEffort); @@ -195,6 +199,9 @@ export function Component(): JSX.Element { case "useQueryRewriting": setUseQueryRewriting(value); break; + case "useReflection": + setUseReflection(value); + break; case "reasoningEffort": setReasoningEffort(value); break; @@ -340,6 +347,7 @@ export function Component(): JSX.Element { useSemanticRanker={useSemanticRanker} useSemanticCaptions={useSemanticCaptions} useQueryRewriting={useQueryRewriting} + useReflection={useReflection} reasoningEffort={reasoningEffort} excludeCategory={excludeCategory} includeCategory={includeCategory} @@ -350,6 +358,7 @@ export function Component(): JSX.Element { showSemanticRankerOption={showSemanticRankerOption} showQueryRewritingOption={showQueryRewritingOption} showReasoningEffortOption={showReasoningEffortOption} + showReflectionOption={showReflectionOption} showGPT4VOptions={showGPT4VOptions} showVectorOption={showVectorOption} useOidSecurityFilter={useOidSecurityFilter} diff --git a/app/frontend/src/pages/chat/Chat.tsx b/app/frontend/src/pages/chat/Chat.tsx index 7ab010f337..09284a723a 100644 --- a/app/frontend/src/pages/chat/Chat.tsx +++ b/app/frontend/src/pages/chat/Chat.tsx @@ -49,6 +49,7 @@ const Chat = () => { const [retrievalMode, setRetrievalMode] = useState(RetrievalMode.Hybrid); const [useSemanticRanker, setUseSemanticRanker] = useState(true); const [useQueryRewriting, setUseQueryRewriting] = useState(false); + const [useReflection, setUseReflection] = useState(false); const [reasoningEffort, setReasoningEffort] = useState(""); const [streamingEnabled, setStreamingEnabled] = useState(true); const [shouldStream, setShouldStream] = useState(true); @@ -80,6 +81,7 @@ const Chat = () => { const [showGPT4VOptions, setShowGPT4VOptions] = useState(false); const [showSemanticRankerOption, setShowSemanticRankerOption] = useState(false); const [showQueryRewritingOption, setShowQueryRewritingOption] = useState(false); + const [showReflectionOption, setShowReflectionOption] = useState(false); const [showReasoningEffortOption, setShowReasoningEffortOption] = useState(false); const [showVectorOption, setShowVectorOption] = useState(false); 
const [showUserUpload, setShowUserUpload] = useState(false); @@ -107,6 +109,8 @@ const Chat = () => { setShowSemanticRankerOption(config.showSemanticRankerOption); setUseQueryRewriting(config.showQueryRewritingOption); setShowQueryRewritingOption(config.showQueryRewritingOption); + setUseReflection(config.showReflectionOption); + setShowReflectionOption(config.showReflectionOption); setShowReasoningEffortOption(config.showReasoningEffortOption); setStreamingEnabled(config.streamingEnabled); if (!config.streamingEnabled) { @@ -203,6 +207,7 @@ const Chat = () => { semantic_ranker: useSemanticRanker, semantic_captions: useSemanticCaptions, query_rewriting: useQueryRewriting, + reflection: useReflection, reasoning_effort: reasoningEffort, suggest_followup_questions: useSuggestFollowupQuestions, use_oid_security_filter: useOidSecurityFilter, @@ -298,6 +303,9 @@ const Chat = () => { case "reasoningEffort": setReasoningEffort(value); break; + case "useReflection": + setUseReflection(value); + break; case "useSemanticCaptions": setUseSemanticCaptions(value); break; @@ -514,6 +522,7 @@ const Chat = () => { useSemanticRanker={useSemanticRanker} useSemanticCaptions={useSemanticCaptions} useQueryRewriting={useQueryRewriting} + useReflection={useReflection} reasoningEffort={reasoningEffort} excludeCategory={excludeCategory} includeCategory={includeCategory} @@ -523,6 +532,7 @@ const Chat = () => { vectorFieldList={vectorFieldList} showSemanticRankerOption={showSemanticRankerOption} showQueryRewritingOption={showQueryRewritingOption} + showReflectionOption={showReflectionOption} showReasoningEffortOption={showReasoningEffortOption} showGPT4VOptions={showGPT4VOptions} showVectorOption={showVectorOption} From 1d599892de5435c80cac4bf85be189dc6ce50a8e Mon Sep 17 00:00:00 2001 From: Matt Gotteiner Date: Wed, 23 Apr 2025 11:01:00 -0700 Subject: [PATCH 10/10] fix order --- app/backend/approaches/chatreadretrieveread.py | 9 ++++++++- .../src/components/AnalysisPanel/ThoughtProcess.tsx | 12 +++++++----- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/app/backend/approaches/chatreadretrieveread.py b/app/backend/approaches/chatreadretrieveread.py index 7b9cb58200..521bc8c014 100644 --- a/app/backend/approaches/chatreadretrieveread.py +++ b/app/backend/approaches/chatreadretrieveread.py @@ -321,9 +321,16 @@ async def run_until_final_call( minimum_reranker_score, use_query_rewriting, ) + results.extend(reflection_results) + yield StreamingThoughtStep( + step=ThoughtStep( + "Search results", + [result.serialize_for_results() for result in results], + ), + role="tool" + ) # Repeat STEP 3: Generate a contextual and content specific answer using the search results and chat history - results.extend(reflection_results) text_sources = self.get_sources_content(results, use_semantic_captions, use_image_citation=False) answer_messages = deepcopy(messages) answer_messages = self.prompt_manager.render_prompt( diff --git a/app/frontend/src/components/AnalysisPanel/ThoughtProcess.tsx b/app/frontend/src/components/AnalysisPanel/ThoughtProcess.tsx index 9b27c8a32c..b0e7517839 100644 --- a/app/frontend/src/components/AnalysisPanel/ThoughtProcess.tsx +++ b/app/frontend/src/components/AnalysisPanel/ThoughtProcess.tsx @@ -35,11 +35,7 @@ export const ThoughtProcess = ({ thoughts }: Props) => { ))} {t.props?.token_usage && } - {t.props?.groundedness && } - {t.props?.relevance && } - {t.props?.correctness && } - {(t.props?.next_query || t.props?.next_answer) && } - {t.props?.candidate_answer && } + 
                        {Array.isArray(t.description) ? (
                            <SyntaxHighlighter language="json" wrapLongLines>
                                {JSON.stringify(t.description, null, 2)}
                            </SyntaxHighlighter>
                        ) : (
                            <div>{t.description}</div>
                        )}
+
+                        {t.props?.groundedness && <Evaluation label="Groundedness" value={t.props.groundedness} />}
+                        {t.props?.relevance && <Evaluation label="Relevance" value={t.props.relevance} />}
+                        {t.props?.correctness && <Evaluation label="Correctness" value={t.props.correctness} />}
+                        {(t.props?.next_query || t.props?.next_answer) && <Reflection next_query={t.props.next_query} next_answer={t.props.next_answer} />}
+                        {t.props?.candidate_answer && <CandidateAnswer candidate_answer={t.props.candidate_answer} />}
                    </li>
                );
            })}
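Note on the reflection loop introduced in patches 07 through 10: the sketch below is a minimal, standalone illustration of how the reflect_answer tool-call arguments (the shape defined in chat_reflect_answer_tools.json) map onto the accept, re-search, or rewrite decision made in run_until_final_call. It is not part of the patch series; the helper names parse_reflection and decide_next_action, the example argument values, and the passing threshold of 4 (mirroring the answer_passed_eval check) are illustrative assumptions.

import json
from dataclasses import dataclass
from typing import Optional


@dataclass
class Reflection:
    score: Optional[int] = None
    thought_chain: Optional[str] = None
    explanation: Optional[str] = None


def parse_reflection(arguments_json: str) -> dict:
    # Parse the reflect_answer tool-call arguments (shape from chat_reflect_answer_tools.json).
    args = json.loads(arguments_json)
    parsed = {}
    for trait in ("relevance", "groundedness", "correctness"):
        raw = args.get(trait) or {}
        parsed[trait] = Reflection(
            score=raw.get("score"),
            thought_chain=raw.get("thoughtChain"),
            explanation=raw.get("explanation"),
        )
    parsed["next_query"] = args.get("next_query") or None
    parsed["next_answer"] = args.get("next_answer") or None
    return parsed


def decide_next_action(parsed: dict, passing_score: int = 4) -> str:
    # Mirrors answer_passed_eval: all three traits must reach the passing score to accept.
    scores = [parsed[t].score or 0 for t in ("relevance", "groundedness", "correctness")]
    if all(s >= passing_score for s in scores):
        return "accept"        # rewind the recorded answer step and replay it to the client
    if parsed["next_query"]:
        return "search"        # run another retrieval round and regenerate the answer
    if parsed["next_answer"]:
        return "use_revision"  # emit the evaluator's rewritten answer as a non-streamed completion
    return "accept"            # nothing better available; fall back to the candidate answer


if __name__ == "__main__":
    example_arguments = json.dumps({
        "relevance": {"score": 5, "thoughtChain": "Let's think step by step: ...", "explanation": "On topic."},
        "groundedness": {"score": 3, "thoughtChain": "Let's think step by step: ...", "explanation": "One claim has no source."},
        "correctness": {"score": 4, "thoughtChain": "Let's think step by step: ...", "explanation": "Mostly accurate."},
        "next_query": "employee handbook dental coverage limits",
        "next_answer": "",
    })
    print(decide_next_action(parse_reflection(example_arguments)))  # prints "search"

In the patches themselves the same decision drives whether the recorded answer step is rewound and replayed, a new retrieval round is run with the proposed query, or the evaluator's revised answer is emitted as a non-streamed completion, with the failing candidate answer and its evaluations fed back into chat_answer_question.prompty for the next attempt.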