From cc0bf449a248f2f1c405f9c06a7008378ab713e2 Mon Sep 17 00:00:00 2001
From: Robert Brennan
Date: Thu, 30 Jan 2025 17:20:23 -0500
Subject: [PATCH 1/6] stop retrying on all exceptions

---
 openhands/llm/llm.py | 13 +------------
 1 file changed, 1 insertion(+), 12 deletions(-)

diff --git a/openhands/llm/llm.py b/openhands/llm/llm.py
index af25baded4c4..9aba1545529b 100644
--- a/openhands/llm/llm.py
+++ b/openhands/llm/llm.py
@@ -18,11 +18,8 @@
 from litellm import completion as litellm_completion
 from litellm import completion_cost as litellm_completion_cost
 from litellm.exceptions import (
-    APIConnectionError,
     APIError,
-    InternalServerError,
     RateLimitError,
-    ServiceUnavailableError,
 )
 from litellm.types.utils import CostPerToken, ModelResponse, Usage
 from litellm.utils import create_pretrained_tokenizer
@@ -42,15 +39,7 @@
 __all__ = ['LLM']
 
 # tuple of exceptions to retry on
-LLM_RETRY_EXCEPTIONS: tuple[type[Exception], ...] = (
-    APIConnectionError,
-    # FIXME: APIError is useful on 502 from a proxy for example,
-    # but it also retries on other errors that are permanent
-    APIError,
-    InternalServerError,
-    RateLimitError,
-    ServiceUnavailableError,
-)
+LLM_RETRY_EXCEPTIONS: tuple[type[Exception], ...] = (RateLimitError,)
 
 # cache prompt supporting models
 # remove this when we gemini and deepseek are supported

From 4019cfe54fe6544f8c459a47497afe95a54eace6 Mon Sep 17 00:00:00 2001
From: Robert Brennan
Date: Thu, 30 Jan 2025 17:34:58 -0500
Subject: [PATCH 2/6] fix retry behavior

---
 openhands/core/config/llm_config.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/openhands/core/config/llm_config.py b/openhands/core/config/llm_config.py
index 9beb6d6f5f09..a278d6049cf1 100644
--- a/openhands/core/config/llm_config.py
+++ b/openhands/core/config/llm_config.py
@@ -59,10 +59,11 @@ class LLMConfig(BaseModel):
     aws_region_name: str | None = Field(default=None)
     openrouter_site_url: str = Field(default='https://docs.all-hands.dev/')
     openrouter_app_name: str = Field(default='OpenHands')
-    num_retries: int = Field(default=8)
+    # total wait time: 5 + 10 + 20 + 30 = 65 seconds
+    num_retries: int = Field(default=4)
     retry_multiplier: float = Field(default=2)
-    retry_min_wait: int = Field(default=15)
-    retry_max_wait: int = Field(default=120)
+    retry_min_wait: int = Field(default=5)
+    retry_max_wait: int = Field(default=30)
     timeout: int | None = Field(default=None)
     max_message_chars: int = Field(
         default=30_000
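Taken together, patches 1 and 2 tighten the retry policy considerably: only `RateLimitError` is retried, at most 4 attempts, with a short exponential backoff. The sketch below approximates the resulting behavior, assuming the tenacity-style wiring OpenHands' retry mixin uses; `call_llm` is a placeholder rather than a real OpenHands function, and the exact wait sequence depends on tenacity's `wait_exponential` arithmetic.

```python
# Minimal sketch of the post-patch retry policy, not the actual OpenHands
# decorator. Mirrors LLM_RETRY_EXCEPTIONS and the new LLMConfig defaults.
from litellm.exceptions import RateLimitError
from tenacity import (
    retry,
    retry_if_exception_type,
    stop_after_attempt,
    wait_exponential,
)

LLM_RETRY_EXCEPTIONS: tuple[type[Exception], ...] = (RateLimitError,)


@retry(
    # Only rate limits (HTTP 429) are retried; everything else propagates.
    retry=retry_if_exception_type(LLM_RETRY_EXCEPTIONS),
    stop=stop_after_attempt(4),  # num_retries
    wait=wait_exponential(multiplier=2, min=5, max=30),  # retry_multiplier, retry_min_wait, retry_max_wait
    reraise=True,  # once attempts are exhausted, raise the RateLimitError itself
)
def call_llm() -> str:
    """Placeholder for the wrapped litellm completion call."""
    ...
```

Every other exception — authentication failures, connection errors, 5xx responses — now surfaces on the first call; patch 6 below turns those into user-facing status messages.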
From c46665cf961b23cb5fd961f3572b51a60d7caf44 Mon Sep 17 00:00:00 2001
From: Robert Brennan
Date: Thu, 30 Jan 2025 17:46:32 -0500
Subject: [PATCH 3/6] fix tests

---
 tests/unit/test_llm.py | 34 ----------------------------------
 1 file changed, 34 deletions(-)

diff --git a/tests/unit/test_llm.py b/tests/unit/test_llm.py
index 227b0006b020..e56b14654ac4 100644
--- a/tests/unit/test_llm.py
+++ b/tests/unit/test_llm.py
@@ -3,10 +3,7 @@
 
 import pytest
 from litellm.exceptions import (
-    APIConnectionError,
-    InternalServerError,
     RateLimitError,
-    ServiceUnavailableError,
 )
 
 from openhands.core.config import LLMConfig
@@ -187,21 +184,6 @@ def test_completion_with_mocked_logger(
 @pytest.mark.parametrize(
     'exception_class,extra_args,expected_retries',
     [
-        (
-            APIConnectionError,
-            {'llm_provider': 'test_provider', 'model': 'test_model'},
-            2,
-        ),
-        (
-            InternalServerError,
-            {'llm_provider': 'test_provider', 'model': 'test_model'},
-            2,
-        ),
-        (
-            ServiceUnavailableError,
-            {'llm_provider': 'test_provider', 'model': 'test_model'},
-            2,
-        ),
         (RateLimitError, {'llm_provider': 'test_provider', 'model': 'test_model'}, 2),
     ],
 )
@@ -254,22 +236,6 @@ def test_completion_rate_limit_wait_time(mock_litellm_completion, default_config
     ), f'Expected wait time between {default_config.retry_min_wait} and {default_config.retry_max_wait} seconds, but got {wait_time}'
 
 
-@patch('openhands.llm.llm.litellm_completion')
-def test_completion_exhausts_retries(mock_litellm_completion, default_config):
-    mock_litellm_completion.side_effect = APIConnectionError(
-        'Persistent error', llm_provider='test_provider', model='test_model'
-    )
-
-    llm = LLM(config=default_config)
-    with pytest.raises(APIConnectionError):
-        llm.completion(
-            messages=[{'role': 'user', 'content': 'Hello!'}],
-            stream=False,
-        )
-
-    assert mock_litellm_completion.call_count == llm.config.num_retries
-
-
 @patch('openhands.llm.llm.litellm_completion')
 def test_completion_operation_cancelled(mock_litellm_completion, default_config):
     mock_litellm_completion.side_effect = OperationCancelled('Operation cancelled')
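The deleted `test_completion_exhausts_retries` pinned down the old behavior: `APIConnectionError` was retried until `num_retries` ran out. The complementary test under the new policy would assert a single call. A hypothetical sketch in the style of the surrounding tests — the test name is invented, and the `LLM` import path is assumed from the `@patch` target:

```python
# Hypothetical companion test, not part of this patch series: a formerly
# retried exception now surfaces on the very first attempt.
from unittest.mock import patch

import pytest
from litellm.exceptions import APIConnectionError

from openhands.core.config import LLMConfig
from openhands.llm.llm import LLM


@patch('openhands.llm.llm.litellm_completion')
def test_completion_does_not_retry_connection_errors(mock_litellm_completion):
    mock_litellm_completion.side_effect = APIConnectionError(
        'Persistent error', llm_provider='test_provider', model='test_model'
    )

    llm = LLM(config=LLMConfig())
    with pytest.raises(APIConnectionError):
        llm.completion(
            messages=[{'role': 'user', 'content': 'Hello!'}],
            stream=False,
        )

    # Exactly one call: APIConnectionError is no longer in LLM_RETRY_EXCEPTIONS.
    assert mock_litellm_completion.call_count == 1
```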
From ee06747bd9597323e71bd2848d63eb0dc9140875 Mon Sep 17 00:00:00 2001
From: Robert Brennan
Date: Fri, 31 Jan 2025 11:46:45 -0500
Subject: [PATCH 4/6] fix test

---
 tests/unit/test_llm_config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/unit/test_llm_config.py b/tests/unit/test_llm_config.py
index 342112a44316..fd11deb98580 100644
--- a/tests/unit/test_llm_config.py
+++ b/tests/unit/test_llm_config.py
@@ -188,7 +188,7 @@ def test_load_from_toml_llm_missing_generic(
     assert custom_only.model == 'custom-only-model'
     assert custom_only.api_key.get_secret_value() == 'custom-only-api-key'
     assert custom_only.embedding_model == 'local'  # default value
-    assert custom_only.num_retries == 8  # default value
+    assert custom_only.num_retries == 4  # default value
 
 
 def test_load_from_toml_llm_invalid_config(

From e81b31283ade1cb1ae5ceaa03808e26e7af01870 Mon Sep 17 00:00:00 2001
From: Engel Nyst
Date: Tue, 4 Feb 2025 00:16:44 +0100
Subject: [PATCH 5/6] Update openhands/llm/llm.py

---
 openhands/llm/llm.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/openhands/llm/llm.py b/openhands/llm/llm.py
index 492c9d993859..9fab700e897c 100644
--- a/openhands/llm/llm.py
+++ b/openhands/llm/llm.py
@@ -18,7 +18,6 @@
 from litellm import completion as litellm_completion
 from litellm import completion_cost as litellm_completion_cost
 from litellm.exceptions import (
-    APIError,
     RateLimitError,
 )
 from litellm.types.utils import CostPerToken, ModelResponse, Usage

From 185288b5852ee742e5eba8f985d4dc2f5f190ee6 Mon Sep 17 00:00:00 2001
From: Engel Nyst
Date: Fri, 14 Feb 2025 16:13:31 +0100
Subject: [PATCH 6/6] [rbren no-retries] add user-friendly messages (#6576)

---
 docs/modules/usage/llms/llms.md           | 14 ++++----
 frontend/src/i18n/translation.json        | 44 +++++++++++++++++++++++-
 openhands/controller/agent_controller.py  | 11 ++++++
 3 files changed, 61 insertions(+), 8 deletions(-)

diff --git a/docs/modules/usage/llms/llms.md b/docs/modules/usage/llms/llms.md
index 5e6a472d0c0a..f4fa118dd02e 100644
--- a/docs/modules/usage/llms/llms.md
+++ b/docs/modules/usage/llms/llms.md
@@ -63,22 +63,22 @@ We have a few guides for running OpenHands with specific model providers:
 ### API retries and rate limits
 
 LLM providers typically have rate limits, sometimes very low, and may require retries. OpenHands will automatically
-retry requests if it receives a Rate Limit Error (429 error code), API connection error, or other transient errors.
+retry requests if it receives a Rate Limit Error (429 error code).
 
 You can customize these options as you need for the provider you're using. Check their documentation, and set the
 following environment variables to control the number of retries and the time between retries:
 
-- `LLM_NUM_RETRIES` (Default of 8)
-- `LLM_RETRY_MIN_WAIT` (Default of 15 seconds)
-- `LLM_RETRY_MAX_WAIT` (Default of 120 seconds)
+- `LLM_NUM_RETRIES` (Default of 4 times)
+- `LLM_RETRY_MIN_WAIT` (Default of 5 seconds)
+- `LLM_RETRY_MAX_WAIT` (Default of 30 seconds)
 - `LLM_RETRY_MULTIPLIER` (Default of 2)
 
 If you are running OpenHands in development mode, you can also set these options in the `config.toml` file:
 
 ```toml
 [llm]
-num_retries = 8
-retry_min_wait = 15
-retry_max_wait = 120
+num_retries = 4
+retry_min_wait = 5
+retry_max_wait = 30
 retry_multiplier = 2
 ```

diff --git a/frontend/src/i18n/translation.json b/frontend/src/i18n/translation.json
index eaa0ccf43b8e..2cec18014f58 100644
--- a/frontend/src/i18n/translation.json
+++ b/frontend/src/i18n/translation.json
@@ -3803,6 +3803,37 @@
     "pt": "Erro ao autenticar com o provedor LLM. Por favor, verifique sua chave API",
     "tr": "LLM sağlayıcısı ile kimlik doğrulama hatası. Lütfen API anahtarınızı kontrol edin"
   },
+  "STATUS$ERROR_LLM_SERVICE_UNAVAILABLE": {
+    "en": "The LLM provider is currently unavailable. Please try again later.",
+    "es": "El proveedor LLM no está actualmente disponible. Por favor, inténtelo de nuevo más tarde.",
+    "zh-CN": "LLM提供商当前不可用",
+    "zh-TW": "LLM提供商目前無法使用",
+    "ko-KR": "LLM 공급자가 현재 사용 불가능합니다",
+    "ja": "LLMプロバイダーが現在利用できません。後でもう一度試してください。",
+    "no": "LLM-leverandøren er nå ikke tilgjengelig. Vennligst prøv igjen senere.",
+    "ar": "المزود LLM غير متاح حالياً. يرجى المحاولة مرة أخرى لاحقًا.",
+    "de": "Der LLM-Anbieter ist derzeit nicht verfügbar. Bitte versuchen Sie es später erneut.",
+    "fr": "Le fournisseur LLM n'est actuellement pas disponible. Veuillez réessayer plus tard.",
+    "it": "Il provider LLM non è attualmente disponibile. Per favore, riprova più tardi.",
+    "pt": "O provedor LLM não está atualmente disponível. Por favor, tente novamente mais tarde.",
+    "tr": "LLM sağlayıcısı şu anda kullanılamıyor. Lütfen daha sonra tekrar deneyin."
+  },
+  "STATUS$ERROR_LLM_INTERNAL_SERVER_ERROR": {
+    "en": "The request failed with an internal server error.",
+    "es": "La solicitud falló con un error del servidor interno.",
+    "zh-CN": "请求失败,请稍后再试",
+    "zh-TW": "請求失敗,請稍後再試",
+    "ko-KR": "요청이 실패했습니다. 나중에 다시 시도해주세요.",
+    "ja": "リクエストが内部サーバーエラーで失敗しました。後でもう一度試してください。",
+    "no": "Det oppstod en feil ved tilkobling til kjøretidsmiljøet. Vennligst oppdater siden.",
+    "ar": "حدث خطأ أثناء الاتصال بوقت التشغيل. يرجى تحديث الصفحة.",
+    "de": "Beim Verbinden mit der Laufzeitumgebung ist ein Fehler aufgetreten. Bitte aktualisieren Sie die Seite.",
+    "fr": "Une erreur s'est produite lors de la connexion à l'environnement d'exécution. Veuillez rafraîchir la page.",
+    "it": "Si è verificato un errore durante la connessione al runtime. Aggiorna la pagina.",
+    "pt": "Ocorreu um erro ao conectar ao ambiente de execução. Por favor, atualize a página.",
+    "tr": "Çalışma zamanına bağlanırken bir hata oluştu. Lütfen sayfayı yenileyin."
+  },
+
   "STATUS$ERROR_RUNTIME_DISCONNECTED": {
     "en": "There was an error while connecting to the runtime. Please refresh the page.",
     "zh-CN": "运行时已断开连接",
@@ -3820,7 +3851,18 @@
   },
   "STATUS$LLM_RETRY": {
     "en": "Retrying LLM request",
-    "zh-TW": "重新嘗試 LLM 請求中"
+    "es": "Reintentando solicitud LLM",
+    "zh-CN": "重试LLM请求",
+    "zh-TW": "重試LLM請求",
+    "ko-KR": "LLM 요청 재시도",
+    "ja": "LLM リクエストを再試行中",
+    "no": "Gjenforsøker LLM-forespørsel",
+    "ar": "يتم إعادة تحميل الطلب LLM",
+    "de": "LLM-Anfrage erneut versuchen",
+    "fr": "Réessayer la requête LLM",
+    "it": "Ritenta la richiesta LLM",
+    "pt": "Reintentando a solicitação LLM",
+    "tr": "LLM isteğini yeniden deniyor"
   },
   "AGENT_ERROR$BAD_ACTION": {
     "en": "Agent tried to execute a malformed action.",

diff --git a/openhands/controller/agent_controller.py b/openhands/controller/agent_controller.py
index e5a0b24f9694..1e338810198a 100644
--- a/openhands/controller/agent_controller.py
+++ b/openhands/controller/agent_controller.py
@@ -214,6 +214,17 @@ async def _react_to_exception(
         err_id = ''
         if isinstance(e, litellm.AuthenticationError):
             err_id = 'STATUS$ERROR_LLM_AUTHENTICATION'
+        elif isinstance(
+            e,
+            (
+                litellm.ServiceUnavailableError,
+                litellm.APIConnectionError,
+                litellm.APIError,
+            ),
+        ):
+            err_id = 'STATUS$ERROR_LLM_SERVICE_UNAVAILABLE'
+        elif isinstance(e, litellm.InternalServerError):
+            err_id = 'STATUS$ERROR_LLM_INTERNAL_SERVER_ERROR'
         elif isinstance(e, RateLimitError):
             await self.set_agent_state_to(AgentState.RATE_LIMITED)
             return
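Condensed, the controller change is a precedence-ordered mapping from exception type to the i18n status ids added in `translation.json` above. The standalone restatement below is illustrative only — `status_id_for` is not a helper in the codebase:

```python
# Illustrative restatement of the new _react_to_exception branches; not the
# actual controller code.
import litellm
from litellm.exceptions import RateLimitError


def status_id_for(e: Exception) -> str | None:
    """Map an LLM exception to a user-facing i18n status id."""
    if isinstance(e, litellm.AuthenticationError):
        return 'STATUS$ERROR_LLM_AUTHENTICATION'
    if isinstance(
        e,
        (
            litellm.ServiceUnavailableError,
            litellm.APIConnectionError,
            litellm.APIError,
        ),
    ):
        return 'STATUS$ERROR_LLM_SERVICE_UNAVAILABLE'
    if isinstance(e, litellm.InternalServerError):
        return 'STATUS$ERROR_LLM_INTERNAL_SERVER_ERROR'
    if isinstance(e, RateLimitError):
        # The controller handles this case separately: it moves the agent
        # into AgentState.RATE_LIMITED and lets the retry policy take over.
        return None
    return None
```

With retries gone for everything except rate limits, these localized messages are what users see in place of a silent retry loop.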