Skip to content

Commit

Permalink
Merge commit '0cfb132ab7cf5befa08564b7bb8127321b5baa2f' into xw/vscode
Browse files Browse the repository at this point in the history
  • Loading branch information
xingyaoww committed Nov 12, 2024
2 parents 0fddc96 + 0cfb132 commit 6e60203
Show file tree
Hide file tree
Showing 119 changed files with 1,441 additions and 1,653 deletions.
2 changes: 0 additions & 2 deletions .github/workflows/ghcr-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -286,7 +286,6 @@ jobs:
image_name=ghcr.io/${{ github.repository_owner }}/runtime:${{ env.RELEVANT_SHA }}-${{ matrix.base_image }}
image_name=$(echo $image_name | tr '[:upper:]' '[:lower:]')
SKIP_CONTAINER_LOGS=true \
TEST_RUNTIME=eventstream \
SANDBOX_USER_ID=$(id -u) \
SANDBOX_RUNTIME_CONTAINER_IMAGE=$image_name \
Expand Down Expand Up @@ -364,7 +363,6 @@ jobs:
image_name=ghcr.io/${{ github.repository_owner }}/runtime:${{ env.RELEVANT_SHA }}-${{ matrix.base_image }}
image_name=$(echo $image_name | tr '[:upper:]' '[:lower:]')
SKIP_CONTAINER_LOGS=true \
TEST_RUNTIME=eventstream \
SANDBOX_USER_ID=$(id -u) \
SANDBOX_RUNTIME_CONTAINER_IMAGE=$image_name \
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ docker run -it --pull=always \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.13-nikolaik \
-v /var/run/docker.sock:/var/run/docker.sock \
-p 3000:3000 \
-e LOG_ALL_EVENTS=true \
--add-host host.docker.internal:host-gateway \
--name openhands-app \
docker.all-hands.dev/all-hands-ai/openhands:0.13
Expand Down
1 change: 1 addition & 0 deletions docs/modules/usage/how-to/headless-mode.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ docker run -it \
-e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
-e LLM_API_KEY=$LLM_API_KEY \
-e LLM_MODEL=$LLM_MODEL \
-e LOG_ALL_EVENTS=true \
-v $WORKSPACE_BASE:/opt/workspace_base \
-v /var/run/docker.sock:/var/run/docker.sock \
--add-host host.docker.internal:host-gateway \
Expand Down
1 change: 1 addition & 0 deletions docs/modules/usage/installation.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ docker run -it --rm --pull=always \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.13-nikolaik \
-v /var/run/docker.sock:/var/run/docker.sock \
-p 3000:3000 \
-e LOG_ALL_EVENTS=true \
--add-host host.docker.internal:host-gateway \
--name openhands-app \
docker.all-hands.dev/all-hands-ai/openhands:0.13
Expand Down
4 changes: 2 additions & 2 deletions docs/modules/usage/llms/llms.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@ OpenHands can connect to any LLM supported by LiteLLM. However, it requires a po

## Model Recommendations

Based on a recent evaluation of language models for coding tasks (using the SWE-bench dataset), we can provide some recommendations for model selection. The full analysis can be found in [this blog article](https://www.all-hands.dev/blog/evaluation-of-llms-as-coding-agents-on-swe-bench-at-30x-speed).
Based on our evaluations of language models for coding tasks (using the SWE-bench dataset), we can provide some recommendations for model selection. Some analyses can be found in [this blog article comparing LLMs](https://www.all-hands.dev/blog/evaluation-of-llms-as-coding-agents-on-swe-bench-at-30x-speed) and [this blog article with some more recent results](https://www.all-hands.dev/blog/openhands-codeact-21-an-open-state-of-the-art-software-development-agent).

When choosing a model, consider both the quality of outputs and the associated costs. Here's a summary of the findings:

- Claude 3.5 Sonnet is the best by a fair amount, achieving a 27% resolve rate with the default agent in OpenHands.
- Claude 3.5 Sonnet is the best by a fair amount, achieving a 53% resolve rate on SWE-Bench Verified with the default agent in OpenHands.
- GPT-4o lags behind, and o1-mini actually performed somewhat worse than GPT-4o. We went in and analyzed the results a little, and briefly it seemed like o1 was sometimes "overthinking" things, performing extra environment configuration tasks when it could just go ahead and finish the task.
- Finally, the strongest open models were Llama 3.1 405 B and deepseek-v2.5, and they performed reasonably, even besting some of the closed models.

Expand Down
2 changes: 1 addition & 1 deletion docs/modules/usage/runtimes.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ docker run # ...
-e RUNTIME=remote \
-e SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.app.all-hands.dev" \
-e SANDBOX_API_KEY="your-all-hands-api-key" \
-e SANDBOX_KEEP_REMOTE_RUNTIME_ALIVE="true" \
-e SANDBOX_KEEP_RUNTIME_ALIVE="true" \
# ...
```

Expand Down
6 changes: 4 additions & 2 deletions evaluation/EDA/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,8 @@ def codeact_user_response_eda(state: State) -> str:

# retrieve the latest model message from history
if state.history:
model_guess = state.get_last_agent_message()
last_agent_message = state.get_last_agent_message()
model_guess = last_agent_message.content if last_agent_message else ''

assert game is not None, 'Game is not initialized.'
msg = game.generate_user_response(model_guess)
Expand Down Expand Up @@ -140,7 +141,8 @@ def process_instance(
if state is None:
raise ValueError('State should not be None.')

final_message = state.get_last_agent_message()
last_agent_message = state.get_last_agent_message()
final_message = last_agent_message.content if last_agent_message else ''

logger.info(f'Final message: {final_message} | Ground truth: {instance["text"]}')
test_result = game.reward()
Expand Down
3 changes: 2 additions & 1 deletion evaluation/gorilla/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,8 @@ def process_instance(
raise ValueError('State should not be None.')

# retrieve the last message from the agent
model_answer_raw = state.get_last_agent_message()
last_agent_message = state.get_last_agent_message()
model_answer_raw = last_agent_message.content if last_agent_message else ''

# attempt to parse model_answer
ast_eval_fn = instance['ast_eval']
Expand Down
2 changes: 1 addition & 1 deletion evaluation/miniwob/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def get_config(
browsergym_eval_env=env_id,
api_key=os.environ.get('ALLHANDS_API_KEY', None),
remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
keep_remote_runtime_alive=False,
keep_runtime_alive=False,
),
# do not mount workspace
workspace_base=None,
Expand Down
2 changes: 1 addition & 1 deletion evaluation/scienceagentbench/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ def get_config(
timeout=300,
api_key=os.environ.get('ALLHANDS_API_KEY', None),
remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
keep_remote_runtime_alive=False,
keep_runtime_alive=False,
),
# do not mount workspace
workspace_base=None,
Expand Down
1 change: 1 addition & 0 deletions evaluation/swe_bench/eval_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ def get_config(instance: pd.Series) -> AppConfig:
timeout=1800,
api_key=os.environ.get('ALLHANDS_API_KEY', None),
remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
remote_runtime_init_timeout=1800,
),
# do not mount workspace
workspace_base=None,
Expand Down
3 changes: 2 additions & 1 deletion evaluation/swe_bench/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,8 @@ def get_config(
platform='linux/amd64',
api_key=os.environ.get('ALLHANDS_API_KEY', None),
remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
keep_remote_runtime_alive=False,
keep_runtime_alive=False,
remote_runtime_init_timeout=1800,
),
# do not mount workspace
workspace_base=None,
Expand Down
3 changes: 2 additions & 1 deletion evaluation/toolqa/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,8 @@ def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool =
raise ValueError('State should not be None.')

# retrieve the last message from the agent
model_answer_raw = state.get_last_agent_message()
last_agent_message = state.get_last_agent_message()
model_answer_raw = last_agent_message.content if last_agent_message else ''

# attempt to parse model_answer
correct = eval_answer(str(model_answer_raw), str(answer))
Expand Down
14 changes: 7 additions & 7 deletions frontend/__tests__/components/chat/chat-interface.test.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,14 @@ describe("Empty state", () => {
send: vi.fn(),
}));

const { useSocket: useSocketMock } = vi.hoisted(() => ({
useSocket: vi.fn(() => ({ send: sendMock, runtimeActive: true })),
const { useWsClient: useWsClientMock } = vi.hoisted(() => ({
useWsClient: vi.fn(() => ({ send: sendMock, runtimeActive: true })),
}));

beforeAll(() => {
vi.mock("#/context/socket", async (importActual) => ({
...(await importActual<typeof import("#/context/socket")>()),
useSocket: useSocketMock,
...(await importActual<typeof import("#/context/ws-client-provider")>()),
useWsClient: useWsClientMock,
}));
});

Expand Down Expand Up @@ -77,7 +77,7 @@ describe("Empty state", () => {
"should load the a user message to the input when selecting",
async () => {
// this is to test that the message is in the UI before the socket is called
useSocketMock.mockImplementation(() => ({
useWsClientMock.mockImplementation(() => ({
send: sendMock,
runtimeActive: false, // mock an inactive runtime setup
}));
Expand Down Expand Up @@ -106,7 +106,7 @@ describe("Empty state", () => {
it.fails(
"should send the message to the socket only if the runtime is active",
async () => {
useSocketMock.mockImplementation(() => ({
useWsClientMock.mockImplementation(() => ({
send: sendMock,
runtimeActive: false, // mock an inactive runtime setup
}));
Expand All @@ -123,7 +123,7 @@ describe("Empty state", () => {
await user.click(displayedSuggestions[0]);
expect(sendMock).not.toHaveBeenCalled();

useSocketMock.mockImplementation(() => ({
useWsClientMock.mockImplementation(() => ({
send: sendMock,
runtimeActive: true, // mock an active runtime setup
}));
Expand Down
20 changes: 16 additions & 4 deletions frontend/__tests__/hooks/use-terminal.test.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@ import { beforeAll, describe, expect, it, vi } from "vitest";
import { render } from "@testing-library/react";
import { afterEach } from "node:test";
import { useTerminal } from "#/hooks/useTerminal";
import { SocketProvider } from "#/context/socket";
import { Command } from "#/state/commandSlice";
import { WsClientProvider } from "#/context/ws-client-provider";
import { ReactNode } from "react";

interface TestTerminalComponentProps {
commands: Command[];
Expand All @@ -18,6 +19,17 @@ function TestTerminalComponent({
return <div ref={ref} />;
}

interface WrapperProps {
children: ReactNode;
}


function Wrapper({children}: WrapperProps) {
return (
<WsClientProvider enabled={true} token="NO_JWT" ghToken="NO_GITHUB" settings={null}>{children}</WsClientProvider>
)
}

describe("useTerminal", () => {
const mockTerminal = vi.hoisted(() => ({
loadAddon: vi.fn(),
Expand Down Expand Up @@ -50,7 +62,7 @@ describe("useTerminal", () => {

it("should render", () => {
render(<TestTerminalComponent commands={[]} secrets={[]} />, {
wrapper: SocketProvider,
wrapper: Wrapper,
});
});

Expand All @@ -61,7 +73,7 @@ describe("useTerminal", () => {
];

render(<TestTerminalComponent commands={commands} secrets={[]} />, {
wrapper: SocketProvider,
wrapper: Wrapper,
});

expect(mockTerminal.writeln).toHaveBeenNthCalledWith(1, "echo hello");
Expand All @@ -85,7 +97,7 @@ describe("useTerminal", () => {
secrets={[secret, anotherSecret]}
/>,
{
wrapper: SocketProvider,
wrapper: Wrapper,
},
);

Expand Down
4 changes: 2 additions & 2 deletions frontend/__tests__/utils/extractModelAndProvider.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -59,9 +59,9 @@ describe("extractModelAndProvider", () => {
separator: "/",
});

expect(extractModelAndProvider("claude-3-5-sonnet-20241022")).toEqual({
expect(extractModelAndProvider("claude-3-5-sonnet-20240620")).toEqual({
provider: "anthropic",
model: "claude-3-5-sonnet-20241022",
model: "claude-3-5-sonnet-20240620",
separator: "/",
});

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ test("organizeModelsAndProviders", () => {
"gpt-4o",
"together-ai-21.1b-41b",
"gpt-4o-mini",
"claude-3-5-sonnet-20241022",
"anthropic/claude-3-5-sonnet-20241022",
"claude-3-haiku-20240307",
"claude-2",
"claude-2.1",
Expand Down
4 changes: 2 additions & 2 deletions frontend/src/components/AgentControlBar.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ import PlayIcon from "#/assets/play";
import { generateAgentStateChangeEvent } from "#/services/agentStateService";
import { RootState } from "#/store";
import AgentState from "#/types/AgentState";
import { useSocket } from "#/context/socket";
import { useWsClient } from "#/context/ws-client-provider";

const IgnoreTaskStateMap: Record<string, AgentState[]> = {
[AgentState.PAUSED]: [
Expand Down Expand Up @@ -72,7 +72,7 @@ function ActionButton({
}

function AgentControlBar() {
const { send } = useSocket();
const { send } = useWsClient();
const { curAgentState } = useSelector((state: RootState) => state.agent);

const handleAction = (action: AgentState) => {
Expand Down
2 changes: 1 addition & 1 deletion frontend/src/components/attach-image-label.tsx
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import Clip from "#/assets/clip.svg?react";
import Clip from "#/icons/clip.svg?react";

export function AttachImageLabel() {
return (
Expand Down
2 changes: 1 addition & 1 deletion frontend/src/components/chat-input.tsx
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import React from "react";
import TextareaAutosize from "react-textarea-autosize";
import ArrowSendIcon from "#/assets/arrow-send.svg?react";
import ArrowSendIcon from "#/icons/arrow-send.svg?react";
import { cn } from "#/utils/utils";

interface ChatInputProps {
Expand Down
6 changes: 3 additions & 3 deletions frontend/src/components/chat-interface.tsx
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import { useDispatch, useSelector } from "react-redux";
import React from "react";
import posthog from "posthog-js";
import { useSocket } from "#/context/socket";
import { convertImageToBase64 } from "#/utils/convert-image-to-base-64";
import { ChatMessage } from "./chat-message";
import { FeedbackActions } from "./feedback-actions";
Expand All @@ -21,14 +20,15 @@ import { ContinueButton } from "./continue-button";
import { ScrollToBottomButton } from "./scroll-to-bottom-button";
import { Suggestions } from "./suggestions";
import { SUGGESTIONS } from "#/utils/suggestions";
import BuildIt from "#/assets/build-it.svg?react";
import BuildIt from "#/icons/build-it.svg?react";
import { useWsClient } from "#/context/ws-client-provider";

const isErrorMessage = (
message: Message | ErrorMessage,
): message is ErrorMessage => "error" in message;

export function ChatInterface() {
const { send } = useSocket();
const { send } = useWsClient();
const dispatch = useDispatch();
const scrollRef = React.useRef<HTMLDivElement>(null);
const { scrollDomToBottom, onChatBodyScroll, hitBottom } =
Expand Down
4 changes: 2 additions & 2 deletions frontend/src/components/chat/ConfirmationButtons.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ import RejectIcon from "#/assets/reject";
import { I18nKey } from "#/i18n/declaration";
import AgentState from "#/types/AgentState";
import { generateAgentStateChangeEvent } from "#/services/agentStateService";
import { useSocket } from "#/context/socket";
import { useWsClient } from "#/context/ws-client-provider";

interface ActionTooltipProps {
type: "confirm" | "reject";
Expand Down Expand Up @@ -37,7 +37,7 @@ function ActionTooltip({ type, onClick }: ActionTooltipProps) {

function ConfirmationButtons() {
const { t } = useTranslation();
const { send } = useSocket();
const { send } = useWsClient();

const handleStateChange = (state: AgentState) => {
const event = generateAgentStateChangeEvent(state);
Expand Down
Loading

0 comments on commit 6e60203

Please sign in to comment.