diff --git a/README.md b/README.md
index 0bbe231..e618a49 100644
--- a/README.md
+++ b/README.md
@@ -34,6 +34,7 @@ SciCode sources challenging and realistic research-level coding problems across
 | 🥇 OpenAI o1-preview | **7.7** | 28.5 |
 | 🥈 Claude3.5-Sonnet | **4.6** | 26.0 |
 | 🥉 Claude3.5-Sonnet (new) | **4.6** | 25.3 |
+| Deepseek-v3 | **3.1** | 23.7 |
 | Deepseek-Coder-v2 | **3.1** | 21.2 |
 | GPT-4o | **1.5** | 25.0 |
 | GPT-4-Turbo | **1.5** | 22.9 |
diff --git a/eval/inspect_ai/README.md b/eval/inspect_ai/README.md
index ab40cf8..2f02041 100644
--- a/eval/inspect_ai/README.md
+++ b/eval/inspect_ai/README.md
@@ -14,10 +14,11 @@ inspect eval scicode.py --model <your_model> --temperature 0
 
 However, there are some additional command line arguments that could be useful as well.
 
-- `--max_connections`: Maximum amount of API connections to the evaluated model.
+- `--max-connections`: Maximum number of API connections to the evaluated model.
 - `--limit`: Limit of the number of samples to evaluate in the SciCode dataset.
 - `-T input_path=<input_json_file>`: This is useful when user wants to change to another json dataset (e.g., the dev set).
 - `-T output_dir=<your_output_dir>`: This changes the default output directory (`./tmp`).
+- `-T h5py_file=<your_h5py_file>`: This is used if your h5py file is not downloaded to the recommended directory.
 - `-T with_background=True/False`: Whether to include problem background.
 - `-T mode=normal/gold/dummy`: This provides two additional modes for sanity checks.
   - `normal` mode is the standard mode to evaluate a model
@@ -37,6 +38,19 @@ inspect eval scicode.py \
     -T mode=gold
 ```
 
+Users can run the evaluation on `Deepseek-v3` via Together AI with the following command:
+
+```bash
+export TOGETHER_API_KEY=<your_api_key>
+inspect eval scicode.py \
+    --model together/deepseek-ai/DeepSeek-V3 \
+    --temperature 0 \
+    --max-connections 2 \
+    --max-tokens 32784 \
+    -T output_dir=./tmp/deepseek-v3 \
+    -T with_background=False
+```
+
 For more information regarding `inspect_ai`, we refer users to its [official documentation](https://inspect.ai-safety-institute.org.uk/).
 
 ### Extra: How SciCode are Evaluated Under the Hood?
diff --git a/eval/inspect_ai/scicode.py b/eval/inspect_ai/scicode.py
index dc4d744..3e7da05 100644
--- a/eval/inspect_ai/scicode.py
+++ b/eval/inspect_ai/scicode.py
@@ -336,12 +336,16 @@ async def solve(state: TaskState, generate: Generate) -> TaskState:
         elif params["mode"] == "gold":
             response_from_llm = generate_gold_response(state.metadata, idx+1)
         else:
-            # ===Model Generation===
-            state.user_prompt.text = prompt
-            state_copy = copy.deepcopy(state)
-            result = await generate(state=state_copy)
-            response_from_llm = result.output.completion
-            # ===Model Generation===
+            try:
+                # ===Model Generation===
+                state.user_prompt.text = prompt
+                state_copy = copy.deepcopy(state)
+                result = await generate(state=state_copy)
+                response_from_llm = result.output.completion
+                # ===Model Generation===
+            except Exception as e:
+                print(f"Failed to generate response for problem {prob_id} step {idx+1}: {e}")
+                response_from_llm = generate_dummy_response(prompt)
         prompt_assistant.register_previous_response(
             prob_data=state.metadata,
             response=response_from_llm,
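
The scicode.py hunk above wraps model generation in a try/except so that a single failed API call (for example, a timeout or rate-limit error from the provider) no longer aborts the whole multi-step problem; a dummy response is recorded for that step and evaluation continues. Below is a minimal, self-contained sketch of that fallback pattern only, under stated assumptions: `call_model` and `make_dummy_response` are hypothetical stand-ins for `inspect_ai`'s `generate()` and SciCode's `generate_dummy_response()`, not the project's actual APIs.

```python
import asyncio


async def call_model(prompt: str) -> str:
    """Hypothetical stand-in for an LLM call that can fail (timeout, rate limit, provider error)."""
    raise TimeoutError("provider timed out")


def make_dummy_response(prompt: str) -> str:
    """Hypothetical stand-in for a dummy response: a placeholder so later steps can still run."""
    return "```python\n# generation failed for this step\npass\n```"


async def generate_step(prompt: str, prob_id: str, step: int) -> str:
    """Try the model first; on any error, log it and fall back to a dummy response."""
    try:
        return await call_model(prompt)
    except Exception as exc:
        print(f"Failed to generate response for problem {prob_id} step {step}: {exc}")
        return make_dummy_response(prompt)


if __name__ == "__main__":
    # With the always-failing stand-in above, this prints the warning and the dummy response.
    print(asyncio.run(generate_step("Implement step 1", prob_id="13", step=1)))
```

The trade-off of this design is that a failed step surfaces only as a printed warning and a step that scores as incorrect, rather than stopping the evaluation run.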