From f82ae33c7625d80a344d02f5139f19300681a0ec Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Sun, 5 Jan 2025 06:42:14 +0000
Subject: [PATCH 1/6] Fix issue #6048: Update documentation of recommended
 models and add deepseek

---
 docs/modules/usage/llms/llms.md       | 21 +++++++++++----------
 frontend/src/utils/verified-models.ts | 10 ++++++++--
 2 files changed, 19 insertions(+), 12 deletions(-)

diff --git a/docs/modules/usage/llms/llms.md b/docs/modules/usage/llms/llms.md
index 709e86c3cf9a..3a4bd1a77b7e 100644
--- a/docs/modules/usage/llms/llms.md
+++ b/docs/modules/usage/llms/llms.md
@@ -5,23 +5,24 @@ OpenHands can connect to any LLM supported by LiteLLM. However, it requires a po
 ## Model Recommendations
 
 Based on our evaluations of language models for coding tasks (using the SWE-bench dataset), we can provide some
-recommendations for model selection. Some analyses can be found in [this blog article comparing LLMs](https://www.all-hands.dev/blog/evaluation-of-llms-as-coding-agents-on-swe-bench-at-30x-speed) and
-[this blog article with some more recent results](https://www.all-hands.dev/blog/openhands-codeact-21-an-open-state-of-the-art-software-development-agent).
+recommendations for model selection. Our latest benchmarking results can be found in [this spreadsheet](https://docs.google.com/spreadsheets/d/1wOUdFCMyY6Nt0AIqF705KN4JKOWgeI4wUGUP60krXXs/edit?gid=0).
 
 When choosing a model, consider both the quality of outputs and the associated costs. Here's a summary of the findings:
 
-- Claude 3.5 Sonnet is the best by a fair amount, achieving a 53% resolve rate on SWE-Bench Verified with the default agent in OpenHands.
-- GPT-4o lags behind, and o1-mini actually performed somewhat worse than GPT-4o. We went in and analyzed the results a little, and briefly it seemed like o1 was sometimes "overthinking" things, performing extra environment configuration tasks when it could just go ahead and finish the task.
-- Finally, the strongest open models were Llama 3.1 405 B and deepseek-v2.5, and they performed reasonably, even besting some of the closed models.
+Most recommended model:
+- anthropic/claude-3-5-sonnet-20241022 (41.67% success rate)
 
-Please refer to the [full article](https://www.all-hands.dev/blog/evaluation-of-llms-as-coding-agents-on-swe-bench-at-30x-speed) for more details.
+Models with acceptable performance:
+- anthropic/claude-3-5-haiku-20241022 (28.67%)
+- deepseek/deepseek-chat (23.00%)
+- gpt-4o (18.67%)
 
 Based on these findings and community feedback, the following models have been verified to work reasonably well with OpenHands:
 
-- claude-3-5-sonnet (recommended)
-- gpt-4 / gpt-4o
-- llama-3.1-405b
-- deepseek-v2.5
+- claude-3-5-sonnet-20241022 (recommended)
+- claude-3-5-haiku-20241022
+- deepseek-chat
+- gpt-4o
 
 :::warning
 OpenHands will issue many prompts to the LLM you configure. Most of these LLMs cost money, so be sure to set spending
diff --git a/frontend/src/utils/verified-models.ts b/frontend/src/utils/verified-models.ts
index 885bd7ac7e8f..128d70c88916 100644
--- a/frontend/src/utils/verified-models.ts
+++ b/frontend/src/utils/verified-models.ts
@@ -1,6 +1,11 @@
 // Here are the list of verified models and providers that we know work well with OpenHands.
-export const VERIFIED_PROVIDERS = ["openai", "azure", "anthropic"];
-export const VERIFIED_MODELS = ["gpt-4o", "claude-3-5-sonnet-20241022"];
+export const VERIFIED_PROVIDERS = ["openai", "azure", "anthropic", "deepseek"];
+export const VERIFIED_MODELS = [
+  "gpt-4o",
+  "claude-3-5-sonnet-20241022",
+  "claude-3-5-haiku-20241022",
+  "deepseek-chat"
+];
 
 // LiteLLM does not return OpenAI models with the provider, so we list them here to set them ourselves for consistency
 // (e.g., they return `gpt-4o` instead of `openai/gpt-4o`)
@@ -21,6 +26,7 @@ export const VERIFIED_ANTHROPIC_MODELS = [
   "claude-2.1",
   "claude-3-5-sonnet-20240620",
   "claude-3-5-sonnet-20241022",
+  "claude-3-5-haiku-20241022",
   "claude-3-haiku-20240307",
   "claude-3-opus-20240229",
   "claude-3-sonnet-20240229",

From 3160ef431d996d18376206b690e94c79dba39a0b Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Sun, 5 Jan 2025 20:24:39 +0100
Subject: [PATCH 2/6] Add trailing comma to VERIFIED_MODELS list

---
 frontend/src/utils/verified-models.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/frontend/src/utils/verified-models.ts b/frontend/src/utils/verified-models.ts
index 128d70c88916..54f0031e681e 100644
--- a/frontend/src/utils/verified-models.ts
+++ b/frontend/src/utils/verified-models.ts
@@ -4,7 +4,7 @@ export const VERIFIED_MODELS = [
   "gpt-4o",
   "claude-3-5-sonnet-20241022",
   "claude-3-5-haiku-20241022",
-  "deepseek-chat"
+  "deepseek-chat",
 ];
 
 // LiteLLM does not return OpenAI models with the provider, so we list them here to set them ourselves for consistency

From 5110c4d59e29172b3e9fa57fc79263f93bddbb1b Mon Sep 17 00:00:00 2001
From: Graham Neubig <neubig@gmail.com>
Date: Mon, 6 Jan 2025 12:33:03 +0900
Subject: [PATCH 3/6] Update LLMs doc: drop per-model success-rate summary

---
 docs/modules/usage/llms/llms.md | 16 +++-------------
 1 file changed, 3 insertions(+), 13 deletions(-)

diff --git a/docs/modules/usage/llms/llms.md b/docs/modules/usage/llms/llms.md
index 3a4bd1a77b7e..fd154336f269 100644
--- a/docs/modules/usage/llms/llms.md
+++ b/docs/modules/usage/llms/llms.md
@@ -7,21 +7,11 @@ OpenHands can connect to any LLM supported by LiteLLM. However, it requires a po
 Based on our evaluations of language models for coding tasks (using the SWE-bench dataset), we can provide some
 recommendations for model selection. Our latest benchmarking results can be found in [this spreadsheet](https://docs.google.com/spreadsheets/d/1wOUdFCMyY6Nt0AIqF705KN4JKOWgeI4wUGUP60krXXs/edit?gid=0).
 
-When choosing a model, consider both the quality of outputs and the associated costs. Here's a summary of the findings:
-
-Most recommended model:
-- anthropic/claude-3-5-sonnet-20241022 (41.67% success rate)
-
-Models with acceptable performance:
-- anthropic/claude-3-5-haiku-20241022 (28.67%)
-- deepseek/deepseek-chat (23.00%)
-- gpt-4o (18.67%)
-
 Based on these findings and community feedback, the following models have been verified to work reasonably well with OpenHands:
 
-- claude-3-5-sonnet-20241022 (recommended)
-- claude-3-5-haiku-20241022
-- deepseek-chat
+- anthropic/claude-3-5-sonnet-20241022 (most recommended)
+- anthropic/claude-3-5-haiku-20241022
+- deepseek/deepseek-chat
 - gpt-4o
 
 :::warning

From 467fe2d66a32004d4cb26e82e2334af09af47145 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Mon, 6 Jan 2025 03:49:50 +0000
Subject: [PATCH 4/6] Fix issue #6063: [Bug]: Build error on `opencv-python`

---
 poetry.lock    | 20 +-------------------
 pyproject.toml |  2 --
 2 files changed, 1 insertion(+), 21 deletions(-)

diff --git a/poetry.lock b/poetry.lock
index bb3729628113..32b1649cb033 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -5422,24 +5422,6 @@ typing-extensions = ">=4.11,<5"
 datalib = ["numpy (>=1)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)"]
 realtime = ["websockets (>=13,<15)"]
 
-[[package]]
-name = "opencv-python"
-version = "4.10.0.84"
-description = "Wrapper package for OpenCV python bindings."
-optional = false
-python-versions = ">=3.6"
-files = [
-    {file = "opencv-python-4.10.0.84.tar.gz", hash = "sha256:72d234e4582e9658ffea8e9cae5b63d488ad06994ef12d81dc303b17472f3526"},
-    {file = "opencv_python-4.10.0.84-cp37-abi3-macosx_12_0_x86_64.whl", hash = "sha256:71e575744f1d23f79741450254660442785f45a0797212852ee5199ef12eed98"},
-    {file = "opencv_python-4.10.0.84-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:09a332b50488e2dda866a6c5573ee192fe3583239fb26ff2f7f9ceb0bc119ea6"},
-    {file = "opencv_python-4.10.0.84-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9ace140fc6d647fbe1c692bcb2abce768973491222c067c131d80957c595b71f"},
-    {file = "opencv_python-4.10.0.84-cp37-abi3-win32.whl", hash = "sha256:2db02bb7e50b703f0a2d50c50ced72e95c574e1e5a0bb35a8a86d0b35c98c236"},
-    {file = "opencv_python-4.10.0.84-cp37-abi3-win_amd64.whl", hash = "sha256:32dbbd94c26f611dc5cc6979e6b7aa1f55a64d6b463cc1dcd3c95505a63e48fe"},
-]
-
-[package.dependencies]
-numpy = {version = ">=1.26.0", markers = "python_version >= \"3.12\""}
-
 [[package]]
 name = "openhands-aci"
 version = "0.1.6"
@@ -10083,4 +10065,4 @@ testing = ["coverage[toml]", "zope.event", "zope.testing"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.12"
-content-hash = "6f8fd9ffcc411aed1c8f50aff98e36bf06932c27b82485e4f9fd05bbe7b195c4"
+content-hash = "3c4cae19fcbd9183bde1bd88cea55454921281e26447d9a2c64404a5defffb3e"
diff --git a/pyproject.toml b/pyproject.toml
index db70ae05e01b..d94d979253f0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -95,7 +95,6 @@ pytest-forked = "*"
 pytest-xdist = "*"
 flake8 = "*"
 openai = "*"
-opencv-python = "*"
 pandas = "*"
 reportlab = "*"
 
@@ -108,7 +107,6 @@ jupyterlab = "*"
 notebook = "*"
 jupyter_kernel_gateway = "*"
 flake8 = "*"
-opencv-python = "*"
 
 [build-system]
 build-backend = "poetry.core.masonry.api"

From a2ad120bc22f8d0029b189ceaef1df927a01f712 Mon Sep 17 00:00:00 2001
From: Graham Neubig <neubig@gmail.com>
Date: Mon, 6 Jan 2025 13:39:17 +0900
Subject: [PATCH 5/6] Remove claude-3-5-haiku from VERIFIED_MODELS

---
 frontend/src/utils/verified-models.ts | 1 -
 1 file changed, 1 deletion(-)

diff --git a/frontend/src/utils/verified-models.ts b/frontend/src/utils/verified-models.ts
index 54f0031e681e..da77b25e1c7a 100644
--- a/frontend/src/utils/verified-models.ts
+++ b/frontend/src/utils/verified-models.ts
@@ -3,7 +3,6 @@ export const VERIFIED_PROVIDERS = ["openai", "azure", "anthropic", "deepseek"];
 export const VERIFIED_MODELS = [
   "gpt-4o",
   "claude-3-5-sonnet-20241022",
-  "claude-3-5-haiku-20241022",
   "deepseek-chat",
 ];
 

From 1f472ce26206b15692b4da597d5b335297fcfa34 Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Mon, 6 Jan 2025 12:59:29 +0100
Subject: [PATCH 6/6] Docs: label claude-3-5-sonnet as "recommended"

---
 docs/modules/usage/llms/llms.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/modules/usage/llms/llms.md b/docs/modules/usage/llms/llms.md
index fd154336f269..5e6a472d0c0a 100644
--- a/docs/modules/usage/llms/llms.md
+++ b/docs/modules/usage/llms/llms.md
@@ -9,7 +9,7 @@ recommendations for model selection. Our latest benchmarking results can be foun
 
 Based on these findings and community feedback, the following models have been verified to work reasonably well with OpenHands:
 
-- anthropic/claude-3-5-sonnet-20241022 (most recommended)
+- anthropic/claude-3-5-sonnet-20241022 (recommended)
 - anthropic/claude-3-5-haiku-20241022
 - deepseek/deepseek-chat
 - gpt-4o