From 3994c6c8af1169bfd988288fad7065e38cb09bc1 Mon Sep 17 00:00:00 2001
From: Engel Nyst
Date: Mon, 16 Dec 2024 21:09:57 +0100
Subject: [PATCH] Revert "Fix issue #5609: Use litellm's modify_params with
 default True (#5611)"

This reverts commit 09735c7869bdd34a9233f8d7bea184bd5c8451ba.
---
 config.template.toml | 5 -----
 evaluation/benchmarks/EDA/run_infer.py | 2 +-
 evaluation/benchmarks/agent_bench/run_infer.py | 2 +-
 evaluation/benchmarks/aider_bench/run_infer.py | 2 +-
 evaluation/benchmarks/biocoder/run_infer.py | 2 +-
 evaluation/benchmarks/bird/run_infer.py | 2 +-
 evaluation/benchmarks/browsing_delegation/run_infer.py | 2 +-
 evaluation/benchmarks/commit0_bench/run_infer.py | 2 +-
 evaluation/benchmarks/discoverybench/run_infer.py | 2 +-
 evaluation/benchmarks/gaia/run_infer.py | 2 +-
 evaluation/benchmarks/gorilla/run_infer.py | 2 +-
 evaluation/benchmarks/gpqa/run_infer.py | 2 +-
 evaluation/benchmarks/humanevalfix/run_infer.py | 2 +-
 evaluation/benchmarks/logic_reasoning/run_infer.py | 2 +-
 evaluation/benchmarks/miniwob/run_infer.py | 2 +-
 evaluation/benchmarks/mint/run_infer.py | 2 +-
 evaluation/benchmarks/ml_bench/run_infer.py | 2 +-
 evaluation/benchmarks/scienceagentbench/run_infer.py | 2 +-
 evaluation/benchmarks/swe_bench/run_infer.py | 5 +++--
 evaluation/benchmarks/toolqa/run_infer.py | 2 +-
 evaluation/benchmarks/webarena/run_infer.py | 2 +-
 openhands/core/config/llm_config.py | 2 --
 openhands/core/config/utils.py | 10 +++-------
 openhands/llm/llm.py | 1 -
 pyproject.toml | 2 --
 25 files changed, 25 insertions(+), 38 deletions(-)

diff --git a/config.template.toml b/config.template.toml
index 4e5b0d870a29..e88150cd0766 100644
--- a/config.template.toml
+++ b/config.template.toml
@@ -154,11 +154,6 @@ model = "gpt-4o"
 # Drop any unmapped (unsupported) params without causing an exception
 #drop_params = false
 
-# Allow litellm to modify parameters to make them compatible with providers
-# for example by inserting a default message (like 'continue') when a message is empty
-# and the provider's API would give an error otherwise
-#modify_params = true
-
 # Using the prompt caching feature if provided by the LLM and supported
 #caching_prompt = true
 
diff --git a/evaluation/benchmarks/EDA/run_infer.py b/evaluation/benchmarks/EDA/run_infer.py
index cf63f37e785c..cce795e954bf 100644
--- a/evaluation/benchmarks/EDA/run_infer.py
+++ b/evaluation/benchmarks/EDA/run_infer.py
@@ -201,7 +201,7 @@ def process_instance(
 
     llm_config = None
     if args.llm_config:
-        llm_config = get_llm_config_arg(args.llm_config, evaluation=True)
+        llm_config = get_llm_config_arg(args.llm_config)
 
     if llm_config is None:
         raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
diff --git a/evaluation/benchmarks/agent_bench/run_infer.py b/evaluation/benchmarks/agent_bench/run_infer.py
index 6833402741f6..2fb7213ce845 100644
--- a/evaluation/benchmarks/agent_bench/run_infer.py
+++ b/evaluation/benchmarks/agent_bench/run_infer.py
@@ -306,7 +306,7 @@ def process_instance(
 
     llm_config = None
     if args.llm_config:
-        llm_config = get_llm_config_arg(args.llm_config, evaluation=True)
+        llm_config = get_llm_config_arg(args.llm_config)
 
     if llm_config is None:
         raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
diff --git a/evaluation/benchmarks/aider_bench/run_infer.py b/evaluation/benchmarks/aider_bench/run_infer.py
index 23aab08dc60c..f7796c7696de 100644
--- a/evaluation/benchmarks/aider_bench/run_infer.py
+++ b/evaluation/benchmarks/aider_bench/run_infer.py
@@ -278,7 +278,7 @@ def process_instance(
 
     llm_config = None
     if args.llm_config:
-        llm_config = get_llm_config_arg(args.llm_config, evaluation=True)
+        llm_config = get_llm_config_arg(args.llm_config)
 
     if llm_config is None:
         raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
diff --git a/evaluation/benchmarks/biocoder/run_infer.py b/evaluation/benchmarks/biocoder/run_infer.py
index 9b973a9bae3b..f5cdd44471a8 100644
--- a/evaluation/benchmarks/biocoder/run_infer.py
+++ b/evaluation/benchmarks/biocoder/run_infer.py
@@ -327,7 +327,7 @@ def process_instance(
 
     llm_config = None
     if args.llm_config:
-        llm_config = get_llm_config_arg(args.llm_config, evaluation=True)
+        llm_config = get_llm_config_arg(args.llm_config)
 
     if llm_config is None:
         raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
diff --git a/evaluation/benchmarks/bird/run_infer.py b/evaluation/benchmarks/bird/run_infer.py
index b43bc5341635..839284148095 100644
--- a/evaluation/benchmarks/bird/run_infer.py
+++ b/evaluation/benchmarks/bird/run_infer.py
@@ -455,7 +455,7 @@ def execute_sql(db_path, sql):
 
     llm_config = None
     if args.llm_config:
-        llm_config = get_llm_config_arg(args.llm_config, evaluation=True)
+        llm_config = get_llm_config_arg(args.llm_config)
 
     if llm_config is None:
         raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
diff --git a/evaluation/benchmarks/browsing_delegation/run_infer.py b/evaluation/benchmarks/browsing_delegation/run_infer.py
index a52082db1e46..5c1ab8c062e3 100644
--- a/evaluation/benchmarks/browsing_delegation/run_infer.py
+++ b/evaluation/benchmarks/browsing_delegation/run_infer.py
@@ -141,7 +141,7 @@ def process_instance(
 
     llm_config = None
     if args.llm_config:
-        llm_config = get_llm_config_arg(args.llm_config, evaluation=True)
+        llm_config = get_llm_config_arg(args.llm_config)
 
     if llm_config is None:
         raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
diff --git a/evaluation/benchmarks/commit0_bench/run_infer.py b/evaluation/benchmarks/commit0_bench/run_infer.py
index 2f703356c2d1..ef2df020310c 100644
--- a/evaluation/benchmarks/commit0_bench/run_infer.py
+++ b/evaluation/benchmarks/commit0_bench/run_infer.py
@@ -570,7 +570,7 @@ def commit0_setup(dataset: pd.DataFrame, repo_split: str) -> pd.DataFrame:
 
     llm_config = None
     if args.llm_config:
-        llm_config = get_llm_config_arg(args.llm_config, evaluation=True)
+        llm_config = get_llm_config_arg(args.llm_config)
         llm_config.log_completions = True
 
     if llm_config is None:
diff --git a/evaluation/benchmarks/discoverybench/run_infer.py b/evaluation/benchmarks/discoverybench/run_infer.py
index 73f4f6590381..6d8dcbd89b3c 100644
--- a/evaluation/benchmarks/discoverybench/run_infer.py
+++ b/evaluation/benchmarks/discoverybench/run_infer.py
@@ -465,7 +465,7 @@ def create_dataset(repo_location: str, split: str = 'test'):
 
     llm_config = None
     if args.llm_config:
-        llm_config = get_llm_config_arg(args.llm_config, evaluation=True)
+        llm_config = get_llm_config_arg(args.llm_config)
 
     if llm_config is None:
         raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
diff --git a/evaluation/benchmarks/gaia/run_infer.py b/evaluation/benchmarks/gaia/run_infer.py
index 582f70dd04e6..fb6d4b3db050 100644
--- a/evaluation/benchmarks/gaia/run_infer.py
+++ b/evaluation/benchmarks/gaia/run_infer.py
@@ -237,7 +237,7 @@ def process_instance(
 
     llm_config = None
     if args.llm_config:
-        llm_config = get_llm_config_arg(args.llm_config, evaluation=True)
+        llm_config = get_llm_config_arg(args.llm_config)
 
     if llm_config is None:
         raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
diff --git a/evaluation/benchmarks/gorilla/run_infer.py b/evaluation/benchmarks/gorilla/run_infer.py
index 22db1e5a13dd..6f5b6c9d4388 100644
--- a/evaluation/benchmarks/gorilla/run_infer.py
+++ b/evaluation/benchmarks/gorilla/run_infer.py
@@ -145,7 +145,7 @@ def process_instance(
 
     llm_config = None
     if args.llm_config:
-        llm_config = get_llm_config_arg(args.llm_config, evaluation=True)
+        llm_config = get_llm_config_arg(args.llm_config)
 
     if llm_config is None:
         raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
diff --git a/evaluation/benchmarks/gpqa/run_infer.py b/evaluation/benchmarks/gpqa/run_infer.py
index 30c12245606f..de4124859991 100644
--- a/evaluation/benchmarks/gpqa/run_infer.py
+++ b/evaluation/benchmarks/gpqa/run_infer.py
@@ -325,7 +325,7 @@ def process_instance(
 
     llm_config = None
     if args.llm_config:
-        llm_config = get_llm_config_arg(args.llm_config, evaluation=True)
+        llm_config = get_llm_config_arg(args.llm_config)
 
     if llm_config is None:
         raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
diff --git a/evaluation/benchmarks/humanevalfix/run_infer.py b/evaluation/benchmarks/humanevalfix/run_infer.py
index f60c1696be94..fff2e23730d4 100644
--- a/evaluation/benchmarks/humanevalfix/run_infer.py
+++ b/evaluation/benchmarks/humanevalfix/run_infer.py
@@ -284,7 +284,7 @@ def process_instance(
 
     llm_config = None
     if args.llm_config:
-        llm_config = get_llm_config_arg(args.llm_config, evaluation=True)
+        llm_config = get_llm_config_arg(args.llm_config)
 
     if llm_config is None:
         raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
diff --git a/evaluation/benchmarks/logic_reasoning/run_infer.py b/evaluation/benchmarks/logic_reasoning/run_infer.py
index d7e5ad868476..116b438b3ee9 100644
--- a/evaluation/benchmarks/logic_reasoning/run_infer.py
+++ b/evaluation/benchmarks/logic_reasoning/run_infer.py
@@ -287,7 +287,7 @@ def process_instance(
 
     llm_config = None
     if args.llm_config:
-        llm_config = get_llm_config_arg(args.llm_config, evaluation=True)
+        llm_config = get_llm_config_arg(args.llm_config)
 
     if llm_config is None:
         raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
diff --git a/evaluation/benchmarks/miniwob/run_infer.py b/evaluation/benchmarks/miniwob/run_infer.py
index e85d0fd2abd0..95e4c935756b 100644
--- a/evaluation/benchmarks/miniwob/run_infer.py
+++ b/evaluation/benchmarks/miniwob/run_infer.py
@@ -230,7 +230,7 @@ def process_instance(
 
     llm_config = None
     if args.llm_config:
-        llm_config = get_llm_config_arg(args.llm_config, evaluation=True)
+        llm_config = get_llm_config_arg(args.llm_config)
 
     if llm_config is None:
         raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
diff --git a/evaluation/benchmarks/mint/run_infer.py b/evaluation/benchmarks/mint/run_infer.py
index e27aa679f801..4414e1c4625f 100644
--- a/evaluation/benchmarks/mint/run_infer.py
+++ b/evaluation/benchmarks/mint/run_infer.py
@@ -278,7 +278,7 @@ def process_instance(
 
     llm_config = None
     if args.llm_config:
-        llm_config = get_llm_config_arg(args.llm_config, evaluation=True)
+        llm_config = get_llm_config_arg(args.llm_config)
 
     if llm_config is None:
         raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
diff --git a/evaluation/benchmarks/ml_bench/run_infer.py b/evaluation/benchmarks/ml_bench/run_infer.py
index 39eca2d6705f..e97746bb7609 100644
--- a/evaluation/benchmarks/ml_bench/run_infer.py
+++ b/evaluation/benchmarks/ml_bench/run_infer.py
@@ -291,7 +291,7 @@ def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool =
 
     llm_config = None
     if args.llm_config:
-        llm_config = get_llm_config_arg(args.llm_config, evaluation=True)
+        llm_config = get_llm_config_arg(args.llm_config)
 
     if llm_config is None:
         raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
diff --git a/evaluation/benchmarks/scienceagentbench/run_infer.py b/evaluation/benchmarks/scienceagentbench/run_infer.py
index 7e7d5ee5564c..efa6c9e42c57 100644
--- a/evaluation/benchmarks/scienceagentbench/run_infer.py
+++ b/evaluation/benchmarks/scienceagentbench/run_infer.py
@@ -271,7 +271,7 @@ def process_instance(
 
     llm_config = None
     if args.llm_config:
-        llm_config = get_llm_config_arg(args.llm_config, evaluation=True)
+        llm_config = get_llm_config_arg(args.llm_config)
 
     if llm_config is None:
         raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
diff --git a/evaluation/benchmarks/swe_bench/run_infer.py b/evaluation/benchmarks/swe_bench/run_infer.py
index 8b1c36e32e13..01111f75d126 100644
--- a/evaluation/benchmarks/swe_bench/run_infer.py
+++ b/evaluation/benchmarks/swe_bench/run_infer.py
@@ -9,6 +9,7 @@
 from datasets import load_dataset
 
 import openhands.agenthub
+
 from evaluation.utils.shared import (
     EvalException,
     EvalMetadata,
@@ -75,7 +76,7 @@ def get_instruction(instance: pd.Series, metadata: EvalMetadata):
         '4. Rerun your reproduce script and confirm that the error is fixed!\n'
         '5. Think about edgecases and make sure your fix handles them as well\n'
         "Your thinking should be thorough and so it's fine if it's very long.\n"
-    )
+    )
 
     if RUN_WITH_BROWSING:
         instruction += (
@@ -488,7 +489,7 @@ def filter_dataset(dataset: pd.DataFrame, filter_column: str) -> pd.DataFrame:
 
     llm_config = None
     if args.llm_config:
-        llm_config = get_llm_config_arg(args.llm_config, evaluation=True)
+        llm_config = get_llm_config_arg(args.llm_config)
         llm_config.log_completions = True
 
     if llm_config is None:
diff --git a/evaluation/benchmarks/toolqa/run_infer.py b/evaluation/benchmarks/toolqa/run_infer.py
index c730966bb77a..c99f15a89ae9 100644
--- a/evaluation/benchmarks/toolqa/run_infer.py
+++ b/evaluation/benchmarks/toolqa/run_infer.py
@@ -180,7 +180,7 @@ def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool =
 
     llm_config = None
     if args.llm_config:
-        llm_config = get_llm_config_arg(args.llm_config, evaluation=True)
+        llm_config = get_llm_config_arg(args.llm_config)
 
     if llm_config is None:
         raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
diff --git a/evaluation/benchmarks/webarena/run_infer.py b/evaluation/benchmarks/webarena/run_infer.py
index 1d2eae37f7f3..531f134fd988 100644
--- a/evaluation/benchmarks/webarena/run_infer.py
+++ b/evaluation/benchmarks/webarena/run_infer.py
@@ -211,7 +211,7 @@ def process_instance(
 
     llm_config = None
     if args.llm_config:
-        llm_config = get_llm_config_arg(args.llm_config, evaluation=True)
+        llm_config = get_llm_config_arg(args.llm_config)
 
     if llm_config is None:
         raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
diff --git a/openhands/core/config/llm_config.py b/openhands/core/config/llm_config.py
index 8cd2b7171447..4e60d4a281ae 100644
--- a/openhands/core/config/llm_config.py
+++ b/openhands/core/config/llm_config.py
@@ -44,7 +44,6 @@ class LLMConfig:
         log_completions_folder: The folder to log LLM completions to. Required if log_completions is True.
         draft_editor: A more efficient LLM to use for file editing. Introduced in [PR 3985](https://github.com/All-Hands-AI/OpenHands/pull/3985).
         custom_tokenizer: A custom tokenizer to use for token counting.
-        modify_params: Allow litellm to modify parameters to make them compatible with the provider. For example, insert default messages when empty. Defaults to True.
     """
 
     model: str = 'claude-3-5-sonnet-20241022'
@@ -80,7 +79,6 @@ class LLMConfig:
     log_completions_folder: str = os.path.join(LOG_DIR, 'completions')
     draft_editor: Optional['LLMConfig'] = None
     custom_tokenizer: str | None = None
-    modify_params: bool = True
 
     def defaults_to_dict(self) -> dict:
         """Serialize fields to a dict for the frontend, including type hints, defaults, and whether it's optional."""
diff --git a/openhands/core/config/utils.py b/openhands/core/config/utils.py
index bbfe7d308458..3aedaf952353 100644
--- a/openhands/core/config/utils.py
+++ b/openhands/core/config/utils.py
@@ -243,9 +243,9 @@ def finalize_config(cfg: AppConfig):
         )
 
 
-# Utility function for command line -l (--llm-config) argument
+# Utility function for command line --group argument
 def get_llm_config_arg(
-    llm_config_arg: str, toml_file: str = 'config.toml', evaluation: bool = False
+    llm_config_arg: str, toml_file: str = 'config.toml'
 ) -> LLMConfig | None:
     """Get a group of llm settings from the config file.
 
@@ -268,7 +268,6 @@ def get_llm_config_arg(
     Args:
         llm_config_arg: The group of llm settings to get from the config.toml file.
         toml_file: Path to the configuration file to read from. Defaults to 'config.toml'.
-        evaluation: If True, sets modify_params=False for evaluation purposes. Defaults to False.
 
     Returns:
         LLMConfig: The LLMConfig object with the settings from the config file.
@@ -297,10 +296,7 @@ def get_llm_config_arg(
 
     # update the llm config with the specified section
     if 'llm' in toml_config and llm_config_arg in toml_config['llm']:
-        config = LLMConfig.from_dict(toml_config['llm'][llm_config_arg])
-        if evaluation:
-            config.modify_params = False
-        return config
+        return LLMConfig.from_dict(toml_config['llm'][llm_config_arg])
 
     logger.openhands_logger.debug(f'Loading from toml failed for {llm_config_arg}')
     return None
diff --git a/openhands/llm/llm.py b/openhands/llm/llm.py
index dfa16d977ab4..d7c7309eff3f 100644
--- a/openhands/llm/llm.py
+++ b/openhands/llm/llm.py
@@ -142,7 +142,6 @@ def __init__(
             temperature=self.config.temperature,
             top_p=self.config.top_p,
             drop_params=self.config.drop_params,
-            modify_params=self.config.modify_params,
         )
 
         self._completion_unwrapped = self._completion
diff --git a/pyproject.toml b/pyproject.toml
index 20c54d64cdfb..2b0d3ca1e8a0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -100,7 +100,6 @@ reportlab = "*"
 
 [tool.coverage.run]
 concurrency = ["gevent"]
-
 [tool.poetry.group.runtime.dependencies]
 jupyterlab = "*"
 notebook = "*"
@@ -131,7 +130,6 @@ ignore = ["D1"]
 
 [tool.ruff.lint.pydocstyle]
 convention = "google"
-
 [tool.poetry.group.evaluation.dependencies]
 streamlit = "*"
 whatthepatch = "*"