error handling to API calls; and deepseek-v3 results #26

Merged 1 commit on Jan 31, 2025
Changes from all commits
1 change: 1 addition & 0 deletions README.md
@@ -34,6 +34,7 @@ SciCode sources challenging and realistic research-level coding problems across
| 🥇 OpenAI o1-preview | <div align="center">**7.7**</div> | <div align="center" style="color:grey">28.5</div> |
| 🥈 Claude3.5-Sonnet | <div align="center">**4.6**</div> | <div align="center" style="color:grey">26.0</div> |
| 🥉 Claude3.5-Sonnet (new) | <div align="center">**4.6**</div> | <div align="center" style="color:grey">25.3</div> |
| Deepseek-v3 | <div align="center">**3.1**</div> | <div align="center" style="color:grey">23.7</div> |
| Deepseek-Coder-v2 | <div align="center">**3.1**</div> | <div align="center" style="color:grey">21.2</div> |
| GPT-4o | <div align="center">**1.5**</div> | <div align="center" style="color:grey">25.0</div> |
| GPT-4-Turbo | <div align="center">**1.5**</div> | <div align="center" style="color:grey">22.9</div> |
16 changes: 15 additions & 1 deletion eval/inspect_ai/README.md
@@ -14,10 +14,11 @@ inspect eval scicode.py --model <your_model> --temperature 0

However, several additional command-line arguments may also be useful; a combined example is shown after the `gold`-mode command below.

- `--max-connections`: Maximum number of concurrent API connections to the evaluated model (renamed from `--max_connections`).
- `--limit`: Limits the number of SciCode samples to evaluate.
- `-T input_path=<another_input_json_file>`: Useful when the user wants to switch to a different JSON dataset (e.g., the dev set).
- `-T output_dir=<your_output_dir>`: This changes the default output directory (`./tmp`).
- `-T h5py_file=<your_h5py_file>`: Use this if your HDF5 file is not downloaded to the recommended directory.
- `-T with_background=True/False`: Whether to include problem background.
- `-T mode=normal/gold/dummy`: Provides two additional modes (`gold` and `dummy`) for sanity checks.
  - `normal` mode is the standard mode to evaluate a model
@@ -37,6 +38,19 @@ inspect eval scicode.py \
-T mode=gold
```
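For instance, a quick run over ten samples of another dataset might combine several of these arguments. This is a sketch; `<your_model>` and `<dev_set_json_file>` are placeholders, not files shipped with this repo:

```bash
inspect eval scicode.py \
    --model <your_model> \
    --temperature 0 \
    --max-connections 4 \
    --limit 10 \
    -T input_path=<dev_set_json_file> \
    -T output_dir=./tmp/dev \
    -T with_background=False
```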

Users can run the evaluation on `Deepseek-v3` through Together AI with the following command:

```bash
export TOGETHER_API_KEY=<YOUR_API_KEY>
inspect eval scicode.py \
--model together/deepseek-ai/DeepSeek-V3 \
--temperature 0 \
--max-connections 2 \
--max-tokens 32784 \
-T output_dir=./tmp/deepseek-v3 \
-T with_background=False
```

For more information regarding `inspect_ai`, we refer users to its [official documentation](https://inspect.ai-safety-institute.org.uk/).

### Extra: How Is SciCode Evaluated Under the Hood?
16 changes: 10 additions & 6 deletions eval/inspect_ai/scicode.py
@@ -336,12 +336,16 @@ async def solve(state: TaskState, generate: Generate) -> TaskState:
elif params["mode"] == "gold":
response_from_llm = generate_gold_response(state.metadata, idx+1)
else:
# ===Model Generation===
state.user_prompt.text = prompt
state_copy = copy.deepcopy(state)
result = await generate(state=state_copy)
response_from_llm = result.output.completion
# ===Model Generation===
try:
# ===Model Generation===
state.user_prompt.text = prompt
state_copy = copy.deepcopy(state)
result = await generate(state=state_copy)
response_from_llm = result.output.completion
# ===Model Generation===
except:
print(f"Failed to generate response for problem {prob_id} step {idx+1}.")
response_from_llm = generate_dummy_response(prompt)
prompt_assistant.register_previous_response(
prob_data=state.metadata,
response=response_from_llm,
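The new `try/except` means a failed API call for one step no longer aborts the entire run: the failure is logged and a dummy completion is substituted so later steps can still be evaluated. A minimal, self-contained sketch of the same fallback pattern, with `call_model` and `dummy_response` as hypothetical stand-ins for the repo's actual helpers:

```python
import asyncio

async def call_model(prompt: str) -> str:
    # Hypothetical stand-in for the real API call; may raise on network/API errors.
    raise TimeoutError("simulated API failure")

def dummy_response(prompt: str) -> str:
    # Hypothetical placeholder so downstream steps still receive an input.
    return "# generation failed; placeholder response"

async def generate_with_fallback(prompt: str, prob_id: str, step: int) -> str:
    try:
        # Attempt the real model generation.
        return await call_model(prompt)
    except Exception:
        # On failure, log the problem/step and fall back to a dummy response
        # so one bad API call does not abort the whole evaluation run.
        print(f"Failed to generate response for problem {prob_id} step {step}.")
        return dummy_response(prompt)

if __name__ == "__main__":
    print(asyncio.run(generate_with_fallback("prompt text", "13", 1)))
```

Note that the merged code uses a bare `except:`, which also catches `KeyboardInterrupt` and `SystemExit`; the sketch above uses the narrower, more conventional `except Exception`.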