From f82ae33c7625d80a344d02f5139f19300681a0ec Mon Sep 17 00:00:00 2001
From: openhands
Date: Sun, 5 Jan 2025 06:42:14 +0000
Subject: [PATCH] Fix issue #6048: Update documentation of recommended models
 and add deepseek

---
 docs/modules/usage/llms/llms.md        | 21 +++++++++++----------
 frontend/src/utils/verified-models.ts  | 10 ++++++++--
 2 files changed, 19 insertions(+), 12 deletions(-)

diff --git a/docs/modules/usage/llms/llms.md b/docs/modules/usage/llms/llms.md
index 709e86c3cf9a..3a4bd1a77b7e 100644
--- a/docs/modules/usage/llms/llms.md
+++ b/docs/modules/usage/llms/llms.md
@@ -5,25 +5,26 @@ OpenHands can connect to any LLM supported by LiteLLM. However, it requires a po
 ## Model Recommendations
 
 Based on our evaluations of language models for coding tasks (using the SWE-bench dataset), we can provide some
-recommendations for model selection. Some analyses can be found in [this blog article comparing LLMs](https://www.all-hands.dev/blog/evaluation-of-llms-as-coding-agents-on-swe-bench-at-30x-speed) and
-[this blog article with some more recent results](https://www.all-hands.dev/blog/openhands-codeact-21-an-open-state-of-the-art-software-development-agent).
+recommendations for model selection. Our latest benchmarking results can be found in [this spreadsheet](https://docs.google.com/spreadsheets/d/1wOUdFCMyY6Nt0AIqF705KN4JKOWgeI4wUGUP60krXXs/edit?gid=0).
 
 When choosing a model, consider both the quality of outputs and the associated costs. Here's a summary of the
 findings:
 
-- Claude 3.5 Sonnet is the best by a fair amount, achieving a 53% resolve rate on SWE-Bench Verified with the default agent in OpenHands.
-- GPT-4o lags behind, and o1-mini actually performed somewhat worse than GPT-4o. We went in and analyzed the results a little, and briefly it seemed like o1 was sometimes "overthinking" things, performing extra environment configuration tasks when it could just go ahead and finish the task.
-- Finally, the strongest open models were Llama 3.1 405 B and deepseek-v2.5, and they performed reasonably, even besting some of the closed models.
+Most recommended model:
+- anthropic/claude-3-5-sonnet-20241022 (41.67% success rate)
 
-Please refer to the [full article](https://www.all-hands.dev/blog/evaluation-of-llms-as-coding-agents-on-swe-bench-at-30x-speed) for more details.
+Models with acceptable performance:
+- anthropic/claude-3-5-haiku-20241022 (28.67%)
+- deepseek/deepseek-chat (23.00%)
+- gpt-4o (18.67%)
 
 Based on these findings and community feedback, the following models have been verified to work reasonably well
 with OpenHands:
 
-- claude-3-5-sonnet (recommended)
-- gpt-4 / gpt-4o
-- llama-3.1-405b
-- deepseek-v2.5
+- claude-3-5-sonnet-20241022 (recommended)
+- claude-3-5-haiku-20241022
+- deepseek-chat
+- gpt-4o
 
 :::warning
 OpenHands will issue many prompts to the LLM you configure. Most of these LLMs cost money, so be sure to set spending
diff --git a/frontend/src/utils/verified-models.ts b/frontend/src/utils/verified-models.ts
index 885bd7ac7e8f..128d70c88916 100644
--- a/frontend/src/utils/verified-models.ts
+++ b/frontend/src/utils/verified-models.ts
@@ -1,6 +1,11 @@
 // Here are the list of verified models and providers that we know work well with OpenHands.
-export const VERIFIED_PROVIDERS = ["openai", "azure", "anthropic"];
-export const VERIFIED_MODELS = ["gpt-4o", "claude-3-5-sonnet-20241022"];
+export const VERIFIED_PROVIDERS = ["openai", "azure", "anthropic", "deepseek"];
+export const VERIFIED_MODELS = [
+  "gpt-4o",
+  "claude-3-5-sonnet-20241022",
+  "claude-3-5-haiku-20241022",
+  "deepseek-chat"
+];
 
 // LiteLLM does not return OpenAI models with the provider, so we list them here to set them ourselves for consistency
 // (e.g., they return `gpt-4o` instead of `openai/gpt-4o`)
@@ -21,6 +26,7 @@ export const VERIFIED_ANTHROPIC_MODELS = [
   "claude-2.1",
   "claude-3-5-sonnet-20240620",
   "claude-3-5-sonnet-20241022",
+  "claude-3-5-haiku-20241022",
   "claude-3-haiku-20240307",
   "claude-3-opus-20240229",
   "claude-3-sonnet-20240229",
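
For illustration only, not part of the patch: a minimal TypeScript sketch of how
the updated exports might be consumed, e.g. to decide whether a model id gets a
"verified" badge in the UI. The helper name `isVerifiedModel` and the
normalization logic are assumptions, not existing OpenHands frontend code.

// verified-model-check.ts (hypothetical consumer of the lists above)
import { VERIFIED_MODELS, VERIFIED_PROVIDERS } from "./verified-models";

// LiteLLM ids may arrive with or without a provider prefix
// (e.g. "deepseek/deepseek-chat" vs. "gpt-4o"), so split on the first "/".
export function isVerifiedModel(id: string): boolean {
  const slash = id.indexOf("/");
  const provider = slash === -1 ? null : id.slice(0, slash);
  const model = slash === -1 ? id : id.slice(slash + 1);
  return (
    VERIFIED_MODELS.includes(model) &&
    (provider === null || VERIFIED_PROVIDERS.includes(provider))
  );
}

// With this patch applied:
//   isVerifiedModel("deepseek/deepseek-chat")      -> true
//   isVerifiedModel("claude-3-5-haiku-20241022")   -> true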