Merge commit '0cfb132ab7cf5befa08564b7bb8127321b5baa2f' into xw/vscode

All-Hands-AI · Nov 12, 2024 · 6e60203 · 6e60203
2 parents 0fddc96 + 0cfb132
commit 6e60203
Show file tree

Hide file tree

Showing 119 changed files with 1,441 additions and 1,653 deletions.
diff --git a/.github/workflows/ghcr-build.yml b/.github/workflows/ghcr-build.yml
@@ -286,7 +286,6 @@ jobs:
           image_name=ghcr.io/${{ github.repository_owner }}/runtime:${{ env.RELEVANT_SHA }}-${{ matrix.base_image }}
           image_name=$(echo $image_name | tr '[:upper:]' '[:lower:]')
 
-          SKIP_CONTAINER_LOGS=true \
           TEST_RUNTIME=eventstream \
           SANDBOX_USER_ID=$(id -u) \
           SANDBOX_RUNTIME_CONTAINER_IMAGE=$image_name \
@@ -364,7 +363,6 @@ jobs:
           image_name=ghcr.io/${{ github.repository_owner }}/runtime:${{ env.RELEVANT_SHA }}-${{ matrix.base_image }}
           image_name=$(echo $image_name | tr '[:upper:]' '[:lower:]')
 
-          SKIP_CONTAINER_LOGS=true \
           TEST_RUNTIME=eventstream \
           SANDBOX_USER_ID=$(id -u) \
           SANDBOX_RUNTIME_CONTAINER_IMAGE=$image_name \

diff --git a/README.md b/README.md
@@ -44,6 +44,7 @@ docker run -it --pull=always \
     -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.13-nikolaik \
     -v /var/run/docker.sock:/var/run/docker.sock \
     -p 3000:3000 \
+    -e LOG_ALL_EVENTS=true \
     --add-host host.docker.internal:host-gateway \
     --name openhands-app \
     docker.all-hands.dev/all-hands-ai/openhands:0.13

diff --git a/docs/modules/usage/how-to/headless-mode.md b/docs/modules/usage/how-to/headless-mode.md
@@ -49,6 +49,7 @@ docker run -it \
     -e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
     -e LLM_API_KEY=$LLM_API_KEY \
     -e LLM_MODEL=$LLM_MODEL \
+    -e LOG_ALL_EVENTS=true \
     -v $WORKSPACE_BASE:/opt/workspace_base \
     -v /var/run/docker.sock:/var/run/docker.sock \
     --add-host host.docker.internal:host-gateway \

diff --git a/docs/modules/usage/installation.mdx b/docs/modules/usage/installation.mdx
@@ -17,6 +17,7 @@ docker run -it --rm --pull=always \
     -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.13-nikolaik \
     -v /var/run/docker.sock:/var/run/docker.sock \
     -p 3000:3000 \
+    -e LOG_ALL_EVENTS=true \
     --add-host host.docker.internal:host-gateway \
     --name openhands-app \
     docker.all-hands.dev/all-hands-ai/openhands:0.13

diff --git a/docs/modules/usage/llms/llms.md b/docs/modules/usage/llms/llms.md
@@ -4,11 +4,11 @@ OpenHands can connect to any LLM supported by LiteLLM. However, it requires a po
 
 ## Model Recommendations
 
-Based on a recent evaluation of language models for coding tasks (using the SWE-bench dataset), we can provide some recommendations for model selection. The full analysis can be found in [this blog article](https://www.all-hands.dev/blog/evaluation-of-llms-as-coding-agents-on-swe-bench-at-30x-speed).
+Based on our evaluations of language models for coding tasks (using the SWE-bench dataset), we can provide some recommendations for model selection. Some analyses can be found in [this blog article comparing LLMs](https://www.all-hands.dev/blog/evaluation-of-llms-as-coding-agents-on-swe-bench-at-30x-speed) and [this blog article with some more recent results](https://www.all-hands.dev/blog/openhands-codeact-21-an-open-state-of-the-art-software-development-agent).
 
 When choosing a model, consider both the quality of outputs and the associated costs. Here's a summary of the findings:
 
-- Claude 3.5 Sonnet is the best by a fair amount, achieving a 27% resolve rate with the default agent in OpenHands.
+- Claude 3.5 Sonnet is the best by a fair amount, achieving a 53% resolve rate on SWE-Bench Verified with the default agent in OpenHands.
 - GPT-4o lags behind, and o1-mini actually performed somewhat worse than GPT-4o. We went in and analyzed the results a little, and briefly it seemed like o1 was sometimes "overthinking" things, performing extra environment configuration tasks when it could just go ahead and finish the task.
 - Finally, the strongest open models were Llama 3.1 405 B and deepseek-v2.5, and they performed reasonably, even besting some of the closed models.
 

diff --git a/docs/modules/usage/runtimes.md b/docs/modules/usage/runtimes.md
@@ -59,7 +59,7 @@ docker run # ...
     -e RUNTIME=remote \
     -e SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.app.all-hands.dev" \
     -e SANDBOX_API_KEY="your-all-hands-api-key" \
-    -e SANDBOX_KEEP_REMOTE_RUNTIME_ALIVE="true" \
+    -e SANDBOX_KEEP_RUNTIME_ALIVE="true" \
     # ...
 ```
 

diff --git a/evaluation/EDA/run_infer.py b/evaluation/EDA/run_infer.py
@@ -35,7 +35,8 @@ def codeact_user_response_eda(state: State) -> str:
 
     # retrieve the latest model message from history
     if state.history:
-        model_guess = state.get_last_agent_message()
+        last_agent_message = state.get_last_agent_message()
+        model_guess = last_agent_message.content if last_agent_message else ''
 
     assert game is not None, 'Game is not initialized.'
     msg = game.generate_user_response(model_guess)
@@ -140,7 +141,8 @@ def process_instance(
     if state is None:
         raise ValueError('State should not be None.')
 
-    final_message = state.get_last_agent_message()
+    last_agent_message = state.get_last_agent_message()
+    final_message = last_agent_message.content if last_agent_message else ''
 
     logger.info(f'Final message: {final_message} | Ground truth: {instance["text"]}')
     test_result = game.reward()

diff --git a/evaluation/gorilla/run_infer.py b/evaluation/gorilla/run_infer.py
@@ -102,7 +102,8 @@ def process_instance(
         raise ValueError('State should not be None.')
 
     # retrieve the last message from the agent
-    model_answer_raw = state.get_last_agent_message()
+    last_agent_message = state.get_last_agent_message()
+    model_answer_raw = last_agent_message.content if last_agent_message else ''
 
     # attempt to parse model_answer
     ast_eval_fn = instance['ast_eval']

diff --git a/evaluation/miniwob/run_infer.py b/evaluation/miniwob/run_infer.py
@@ -66,7 +66,7 @@ def get_config(
             browsergym_eval_env=env_id,
             api_key=os.environ.get('ALLHANDS_API_KEY', None),
             remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
-            keep_remote_runtime_alive=False,
+            keep_runtime_alive=False,
         ),
         # do not mount workspace
         workspace_base=None,

diff --git a/evaluation/scienceagentbench/run_infer.py b/evaluation/scienceagentbench/run_infer.py
@@ -72,7 +72,7 @@ def get_config(
             timeout=300,
             api_key=os.environ.get('ALLHANDS_API_KEY', None),
             remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
-            keep_remote_runtime_alive=False,
+            keep_runtime_alive=False,
         ),
         # do not mount workspace
         workspace_base=None,

diff --git a/evaluation/swe_bench/eval_infer.py b/evaluation/swe_bench/eval_infer.py
@@ -83,6 +83,7 @@ def get_config(instance: pd.Series) -> AppConfig:
             timeout=1800,
             api_key=os.environ.get('ALLHANDS_API_KEY', None),
             remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
+            remote_runtime_init_timeout=1800,
         ),
         # do not mount workspace
         workspace_base=None,

diff --git a/evaluation/swe_bench/run_infer.py b/evaluation/swe_bench/run_infer.py
@@ -145,7 +145,8 @@ def get_config(
             platform='linux/amd64',
             api_key=os.environ.get('ALLHANDS_API_KEY', None),
             remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
-            keep_remote_runtime_alive=False,
+            keep_runtime_alive=False,
+            remote_runtime_init_timeout=1800,
         ),
         # do not mount workspace
         workspace_base=None,

diff --git a/evaluation/toolqa/run_infer.py b/evaluation/toolqa/run_infer.py
@@ -127,7 +127,8 @@ def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool =
         raise ValueError('State should not be None.')
 
     # retrieve the last message from the agent
-    model_answer_raw = state.get_last_agent_message()
+    last_agent_message = state.get_last_agent_message()
+    model_answer_raw = last_agent_message.content if last_agent_message else ''
 
     # attempt to parse model_answer
     correct = eval_answer(str(model_answer_raw), str(answer))

diff --git a/frontend/__tests__/components/chat/chat-interface.test.tsx b/frontend/__tests__/components/chat/chat-interface.test.tsx
@@ -16,14 +16,14 @@ describe("Empty state", () => {
     send: vi.fn(),
   }));
 
-  const { useSocket: useSocketMock } = vi.hoisted(() => ({
-    useSocket: vi.fn(() => ({ send: sendMock, runtimeActive: true })),
+  const { useWsClient: useWsClientMock } = vi.hoisted(() => ({
+    useWsClient: vi.fn(() => ({ send: sendMock, runtimeActive: true })),
   }));
 
   beforeAll(() => {
-    vi.mock("#/context/socket", async (importActual) => ({
-      ...(await importActual<typeof import("#/context/socket")>()),
-      useSocket: useSocketMock,
+    vi.mock("#/context/ws-client-provider", async (importActual) => ({
+      ...(await importActual<typeof import("#/context/ws-client-provider")>()),
+      useWsClient: useWsClientMock,
     }));
   });
 
@@ -77,7 +77,7 @@ describe("Empty state", () => {
     "should load the a user message to the input when selecting",
     async () => {
       // this is to test that the message is in the UI before the socket is called
-      useSocketMock.mockImplementation(() => ({
+      useWsClientMock.mockImplementation(() => ({
         send: sendMock,
         runtimeActive: false, // mock an inactive runtime setup
       }));
@@ -106,7 +106,7 @@ describe("Empty state", () => {
   it.fails(
     "should send the message to the socket only if the runtime is active",
     async () => {
-      useSocketMock.mockImplementation(() => ({
+      useWsClientMock.mockImplementation(() => ({
         send: sendMock,
         runtimeActive: false, // mock an inactive runtime setup
       }));
@@ -123,7 +123,7 @@ describe("Empty state", () => {
       await user.click(displayedSuggestions[0]);
       expect(sendMock).not.toHaveBeenCalled();
 
-      useSocketMock.mockImplementation(() => ({
+      useWsClientMock.mockImplementation(() => ({
         send: sendMock,
         runtimeActive: true, // mock an active runtime setup
       }));

diff --git a/frontend/__tests__/hooks/use-terminal.test.tsx b/frontend/__tests__/hooks/use-terminal.test.tsx
@@ -2,8 +2,9 @@ import { beforeAll, describe, expect, it, vi } from "vitest";
 import { render } from "@testing-library/react";
 import { afterEach } from "node:test";
 import { useTerminal } from "#/hooks/useTerminal";
-import { SocketProvider } from "#/context/socket";
 import { Command } from "#/state/commandSlice";
+import { WsClientProvider } from "#/context/ws-client-provider";
+import { ReactNode } from "react";
 
 interface TestTerminalComponentProps {
   commands: Command[];
@@ -18,6 +19,17 @@ function TestTerminalComponent({
   return <div ref={ref} />;
 }
 
+interface WrapperProps {
+  children: ReactNode;
+}
+
+/** Test wrapper: renders children inside an enabled WsClientProvider with fixed token strings and null settings. */
+function Wrapper({ children }: WrapperProps) {
+  return (
+    <WsClientProvider enabled token="NO_JWT" ghToken="NO_GITHUB" settings={null}>{children}</WsClientProvider>
+  );
+}
+
 describe("useTerminal", () => {
   const mockTerminal = vi.hoisted(() => ({
     loadAddon: vi.fn(),
@@ -50,7 +62,7 @@ describe("useTerminal", () => {
 
   it("should render", () => {
     render(<TestTerminalComponent commands={[]} secrets={[]} />, {
-      wrapper: SocketProvider,
+      wrapper: Wrapper,
     });
   });
 
@@ -61,7 +73,7 @@ describe("useTerminal", () => {
     ];
 
     render(<TestTerminalComponent commands={commands} secrets={[]} />, {
-      wrapper: SocketProvider,
+      wrapper: Wrapper,
     });
 
     expect(mockTerminal.writeln).toHaveBeenNthCalledWith(1, "echo hello");
@@ -85,7 +97,7 @@ describe("useTerminal", () => {
         secrets={[secret, anotherSecret]}
       />,
       {
-        wrapper: SocketProvider,
+        wrapper: Wrapper,
       },
     );
 

diff --git a/frontend/__tests__/utils/extractModelAndProvider.test.ts b/frontend/__tests__/utils/extractModelAndProvider.test.ts
@@ -59,9 +59,9 @@ describe("extractModelAndProvider", () => {
       separator: "/",
     });
 
-    expect(extractModelAndProvider("claude-3-5-sonnet-20241022")).toEqual({
+    expect(extractModelAndProvider("claude-3-5-sonnet-20240620")).toEqual({
       provider: "anthropic",
-      model: "claude-3-5-sonnet-20241022",
+      model: "claude-3-5-sonnet-20240620",
       separator: "/",
     });
 

diff --git a/frontend/__tests__/utils/organizeModelsAndProviders.test.ts b/frontend/__tests__/utils/organizeModelsAndProviders.test.ts
@@ -15,7 +15,7 @@ test("organizeModelsAndProviders", () => {
     "gpt-4o",
     "together-ai-21.1b-41b",
     "gpt-4o-mini",
-    "claude-3-5-sonnet-20241022",
+    "anthropic/claude-3-5-sonnet-20241022",
     "claude-3-haiku-20240307",
     "claude-2",
     "claude-2.1",

diff --git a/frontend/src/components/AgentControlBar.tsx b/frontend/src/components/AgentControlBar.tsx
@@ -6,7 +6,7 @@ import PlayIcon from "#/assets/play";
 import { generateAgentStateChangeEvent } from "#/services/agentStateService";
 import { RootState } from "#/store";
 import AgentState from "#/types/AgentState";
-import { useSocket } from "#/context/socket";
+import { useWsClient } from "#/context/ws-client-provider";
 
 const IgnoreTaskStateMap: Record<string, AgentState[]> = {
   [AgentState.PAUSED]: [
@@ -72,7 +72,7 @@ function ActionButton({
 }
 
 function AgentControlBar() {
-  const { send } = useSocket();
+  const { send } = useWsClient();
   const { curAgentState } = useSelector((state: RootState) => state.agent);
 
   const handleAction = (action: AgentState) => {

diff --git a/frontend/src/components/attach-image-label.tsx b/frontend/src/components/attach-image-label.tsx
@@ -1,4 +1,4 @@
-import Clip from "#/assets/clip.svg?react";
+import Clip from "#/icons/clip.svg?react";
 
 export function AttachImageLabel() {
   return (

diff --git a/frontend/src/components/chat-input.tsx b/frontend/src/components/chat-input.tsx
@@ -1,6 +1,6 @@
 import React from "react";
 import TextareaAutosize from "react-textarea-autosize";
-import ArrowSendIcon from "#/assets/arrow-send.svg?react";
+import ArrowSendIcon from "#/icons/arrow-send.svg?react";
 import { cn } from "#/utils/utils";
 
 interface ChatInputProps {

diff --git a/frontend/src/components/chat-interface.tsx b/frontend/src/components/chat-interface.tsx
@@ -1,7 +1,6 @@
 import { useDispatch, useSelector } from "react-redux";
 import React from "react";
 import posthog from "posthog-js";
-import { useSocket } from "#/context/socket";
 import { convertImageToBase64 } from "#/utils/convert-image-to-base-64";
 import { ChatMessage } from "./chat-message";
 import { FeedbackActions } from "./feedback-actions";
@@ -21,14 +20,15 @@ import { ContinueButton } from "./continue-button";
 import { ScrollToBottomButton } from "./scroll-to-bottom-button";
 import { Suggestions } from "./suggestions";
 import { SUGGESTIONS } from "#/utils/suggestions";
-import BuildIt from "#/assets/build-it.svg?react";
+import BuildIt from "#/icons/build-it.svg?react";
+import { useWsClient } from "#/context/ws-client-provider";
 
 const isErrorMessage = (
   message: Message | ErrorMessage,
 ): message is ErrorMessage => "error" in message;
 
 export function ChatInterface() {
-  const { send } = useSocket();
+  const { send } = useWsClient();
   const dispatch = useDispatch();
   const scrollRef = React.useRef<HTMLDivElement>(null);
   const { scrollDomToBottom, onChatBodyScroll, hitBottom } =

diff --git a/frontend/src/components/chat/ConfirmationButtons.tsx b/frontend/src/components/chat/ConfirmationButtons.tsx
@@ -5,7 +5,7 @@ import RejectIcon from "#/assets/reject";
 import { I18nKey } from "#/i18n/declaration";
 import AgentState from "#/types/AgentState";
 import { generateAgentStateChangeEvent } from "#/services/agentStateService";
-import { useSocket } from "#/context/socket";
+import { useWsClient } from "#/context/ws-client-provider";
 
 interface ActionTooltipProps {
   type: "confirm" | "reject";
@@ -37,7 +37,7 @@ function ActionTooltip({ type, onClick }: ActionTooltipProps) {
 
 function ConfirmationButtons() {
   const { t } = useTranslation();
-  const { send } = useSocket();
+  const { send } = useWsClient();
 
   const handleStateChange = (state: AgentState) => {
     const event = generateAgentStateChangeEvent(state);