Merge branch 'main' into enyst/usage
enyst authored Jan 21, 2025
2 parents caf1346 + ff3880c commit 5ccb369
Showing 333 changed files with 17,008 additions and 9,547 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/ghcr-build.yml
@@ -56,7 +56,7 @@ jobs:
docker-images: false
swap-storage: true
- name: Set up QEMU
uses: docker/setup-qemu-action@v3.2.0
uses: docker/setup-qemu-action@v3.3.0
with:
image: tonistiigi/binfmt:latest
- name: Login to GHCR
@@ -119,7 +119,7 @@ jobs:
docker-images: false
swap-storage: true
- name: Set up QEMU
uses: docker/setup-qemu-action@v3.2.0
uses: docker/setup-qemu-action@v3.3.0
with:
image: tonistiigi/binfmt:latest
- name: Login to GHCR
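The Set up QEMU step above registers binfmt handlers so the workflow can build multi-architecture images; the change itself only bumps the action version. To reproduce the equivalent setup on a local machine, something along these lines should work (it uses the same tonistiigi/binfmt image the workflow references):

```bash
# Register QEMU binfmt handlers so Docker can build/run non-native architectures.
docker run --privileged --rm tonistiigi/binfmt --install all
# The buildx builder should now list extra platforms such as linux/arm64.
docker buildx ls
```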
78 changes: 75 additions & 3 deletions .github/workflows/integration-runner.yml
@@ -56,6 +56,7 @@ jobs:
LLM_MODEL: "litellm_proxy/claude-3-5-haiku-20241022"
LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
MAX_ITERATIONS: 10
run: |
echo "[llm.eval]" > config.toml
echo "model = \"$LLM_MODEL\"" >> config.toml
@@ -70,7 +71,7 @@
env:
SANDBOX_FORCE_REBUILD_RUNTIME: True
run: |
poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' $N_PROCESSES '' 'haiku_run'
poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' 10 $N_PROCESSES '' 'haiku_run'
# get integration tests report
REPORT_FILE_HAIKU=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/*haiku*_maxiter_10_N* -name "report.md" -type f | head -n 1)
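The run_infer.sh invocation above gains a new positional argument, the max-iterations value 10, matching the new MAX_ITERATIONS: 10 environment entry and the _maxiter_10_ segment in the report path. As a sketch, the assumed argument order (names are inferred from this workflow, not taken from the script itself) is:

```bash
# Assumed positional arguments of run_infer.sh, inferred from this workflow only:
#   1: LLM config section   -> llm.eval
#   2: git ref to evaluate  -> HEAD
#   3: agent class          -> CodeActAgent
#   4: eval limit (assumed) -> ''  (empty)
#   5: max iterations       -> 10  (the newly inserted argument)
#   6: parallel workers     -> $N_PROCESSES
#   7: eval IDs filter      -> ''  (empty = run all tests)
#   8: run suffix           -> 'haiku_run'
poetry run ./evaluation/integration_tests/scripts/run_infer.sh \
  llm.eval HEAD CodeActAgent '' 10 "$N_PROCESSES" '' 'haiku_run'
```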
@@ -88,6 +89,7 @@
LLM_MODEL: "litellm_proxy/deepseek-chat"
LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
MAX_ITERATIONS: 10
run: |
echo "[llm.eval]" > config.toml
echo "model = \"$LLM_MODEL\"" >> config.toml
@@ -99,7 +101,7 @@
env:
SANDBOX_FORCE_REBUILD_RUNTIME: True
run: |
poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' $N_PROCESSES '' 'deepseek_run'
poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' 10 $N_PROCESSES '' 'deepseek_run'
# get integration tests report
REPORT_FILE_DEEPSEEK=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/deepseek*_maxiter_10_N* -name "report.md" -type f | head -n 1)
@@ -109,11 +111,75 @@
echo >> $GITHUB_ENV
echo "EOF" >> $GITHUB_ENV
# -------------------------------------------------------------
# Run DelegatorAgent tests for Haiku, limited to t01 and t02
- name: Wait a little bit (again)
run: sleep 5

- name: Configure config.toml for testing DelegatorAgent (Haiku)
env:
LLM_MODEL: "litellm_proxy/claude-3-5-haiku-20241022"
LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
MAX_ITERATIONS: 30
run: |
echo "[llm.eval]" > config.toml
echo "model = \"$LLM_MODEL\"" >> config.toml
echo "api_key = \"$LLM_API_KEY\"" >> config.toml
echo "base_url = \"$LLM_BASE_URL\"" >> config.toml
echo "temperature = 0.0" >> config.toml
- name: Run integration test evaluation for DelegatorAgent (Haiku)
env:
SANDBOX_FORCE_REBUILD_RUNTIME: True
run: |
poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD DelegatorAgent '' 30 $N_PROCESSES "t01_fix_simple_typo,t02_add_bash_hello" 'delegator_haiku_run'
# Find and export the delegator test results
REPORT_FILE_DELEGATOR_HAIKU=$(find evaluation/evaluation_outputs/outputs/integration_tests/DelegatorAgent/*haiku*_maxiter_30_N* -name "report.md" -type f | head -n 1)
echo "REPORT_FILE_DELEGATOR_HAIKU: $REPORT_FILE_DELEGATOR_HAIKU"
echo "INTEGRATION_TEST_REPORT_DELEGATOR_HAIKU<<EOF" >> $GITHUB_ENV
cat $REPORT_FILE_DELEGATOR_HAIKU >> $GITHUB_ENV
echo >> $GITHUB_ENV
echo "EOF" >> $GITHUB_ENV
# -------------------------------------------------------------
# Run DelegatorAgent tests for DeepSeek, limited to t01 and t02
- name: Wait a little bit (again)
run: sleep 5

- name: Configure config.toml for testing DelegatorAgent (DeepSeek)
env:
LLM_MODEL: "litellm_proxy/deepseek-chat"
LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
MAX_ITERATIONS: 30
run: |
echo "[llm.eval]" > config.toml
echo "model = \"$LLM_MODEL\"" >> config.toml
echo "api_key = \"$LLM_API_KEY\"" >> config.toml
echo "base_url = \"$LLM_BASE_URL\"" >> config.toml
echo "temperature = 0.0" >> config.toml
- name: Run integration test evaluation for DelegatorAgent (DeepSeek)
env:
SANDBOX_FORCE_REBUILD_RUNTIME: True
run: |
poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD DelegatorAgent '' 30 $N_PROCESSES "t01_fix_simple_typo,t02_add_bash_hello" 'delegator_deepseek_run'
# Find and export the delegator test results
REPORT_FILE_DELEGATOR_DEEPSEEK=$(find evaluation/evaluation_outputs/outputs/integration_tests/DelegatorAgent/deepseek*_maxiter_30_N* -name "report.md" -type f | head -n 1)
echo "REPORT_FILE_DELEGATOR_DEEPSEEK: $REPORT_FILE_DELEGATOR_DEEPSEEK"
echo "INTEGRATION_TEST_REPORT_DELEGATOR_DEEPSEEK<<EOF" >> $GITHUB_ENV
cat $REPORT_FILE_DELEGATOR_DEEPSEEK >> $GITHUB_ENV
echo >> $GITHUB_ENV
echo "EOF" >> $GITHUB_ENV
- name: Create archive of evaluation outputs
run: |
TIMESTAMP=$(date +'%y-%m-%d-%H-%M')
cd evaluation/evaluation_outputs/outputs # Change to the outputs directory
tar -czvf ../../../integration_tests_${TIMESTAMP}.tar.gz integration_tests/CodeActAgent/* # Only include the actual result directories
tar -czvf ../../../integration_tests_${TIMESTAMP}.tar.gz integration_tests/CodeActAgent/* integration_tests/DelegatorAgent/* # Only include the actual result directories
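After downloading the uploaded artifact, the archive can be unpacked locally to browse both result trees (the timestamped filename is illustrative):

```bash
# Unpack a downloaded evaluation archive and list the bundled result directories.
tar -xzvf integration_tests_25-01-21-14-30.tar.gz
ls integration_tests/CodeActAgent integration_tests/DelegatorAgent
```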
- name: Upload evaluation results as artifact
uses: actions/upload-artifact@v4
@@ -154,5 +220,11 @@ jobs:
**Integration Tests Report (DeepSeek)**
DeepSeek LLM Test Results:
${{ env.INTEGRATION_TEST_REPORT_DEEPSEEK }}
---
**Integration Tests Report Delegator (Haiku)**
${{ env.INTEGRATION_TEST_REPORT_DELEGATOR_HAIKU }}
---
**Integration Tests Report Delegator (DeepSeek)**
${{ env.INTEGRATION_TEST_REPORT_DELEGATOR_DEEPSEEK }}
---
Download testing outputs (includes both Haiku and DeepSeek results): [Download](${{ steps.upload_results_artifact.outputs.artifact-url }})
8 changes: 6 additions & 2 deletions .github/workflows/openhands-resolver.yml
@@ -184,6 +184,7 @@ jobs:
});
- name: Install OpenHands
id: install_openhands
uses: actions/github-script@v7
env:
COMMENT_BODY: ${{ github.event.comment.body || '' }}
@@ -196,7 +197,6 @@ jobs:
const reviewBody = process.env.REVIEW_BODY.trim();
const labelName = process.env.LABEL_NAME.trim();
const eventName = process.env.EVENT_NAME.trim();
// Check conditions
const isExperimentalLabel = labelName === "fix-me-experimental";
const isIssueCommentExperimental =
@@ -205,6 +205,9 @@ jobs:
const isReviewCommentExperimental =
eventName === "pull_request_review" && reviewBody.includes("@openhands-agent-exp");
// Set output variable
core.setOutput('isExperimental', isExperimentalLabel || isIssueCommentExperimental || isReviewCommentExperimental);
// Perform package installation
if (isExperimentalLabel || isIssueCommentExperimental || isReviewCommentExperimental) {
console.log("Installing experimental OpenHands...");
@@ -230,7 +233,8 @@ jobs:
--issue-number ${{ env.ISSUE_NUMBER }} \
--issue-type ${{ env.ISSUE_TYPE }} \
--max-iterations ${{ env.MAX_ITERATIONS }} \
--comment-id ${{ env.COMMENT_ID }}
--comment-id ${{ env.COMMENT_ID }} \
--is-experimental ${{ steps.install_openhands.outputs.isExperimental }}
- name: Check resolution result
id: check_result
36 changes: 25 additions & 11 deletions CODE_OF_CONDUCT.md
@@ -18,24 +18,24 @@ diverse, inclusive, and healthy community.
Examples of behavior that contributes to a positive environment for our
community include:

* Demonstrating empathy and kindness toward other people
* Being respectful of differing opinions, viewpoints, and experiences
* Giving and gracefully accepting constructive feedback
* Demonstrating empathy and kindness toward other people.
* Being respectful of differing opinions, viewpoints, and experiences.
* Giving and gracefully accepting constructive feedback.
* Accepting responsibility and apologizing to those affected by our mistakes,
and learning from the experience
and learning from the experience.
* Focusing on what is best not just for us as individuals, but for the overall
community
community.

Examples of unacceptable behavior include:

* The use of sexualized language or imagery, and sexual attention or advances of
any kind
* Trolling, insulting or derogatory comments, and personal or political attacks
* Public or private harassment
any kind.
* Trolling, insulting or derogatory comments, and personal or political attacks.
* Public or private harassment.
* Publishing others' private information, such as a physical or email address,
without their explicit permission
without their explicit permission.
* Other conduct which could reasonably be considered inappropriate in a
professional setting
professional setting.

## Enforcement Responsibilities

@@ -61,7 +61,7 @@ representative at an online or offline event.

Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported to the community leaders responsible for enforcement at
[email protected]
[email protected].
All complaints will be reviewed and investigated promptly and fairly.

All community leaders are obligated to respect the privacy and security of the
@@ -113,6 +113,20 @@ individual, or aggression toward or disparagement of classes of individuals.
**Consequence**: A permanent ban from any sort of public interaction within the
community.

### Slack and Discord Etiquette

These Slack and Discord etiquette guidelines are designed to foster an inclusive, respectful, and productive environment for all community members. By following these best practices, we ensure effective communication and collaboration while minimizing disruptions. Let’s work together to build a supportive and welcoming community!

- Communicate respectfully and professionally, avoiding sarcasm or harsh language, and remember that tone can be difficult to interpret in text.
- Use threads for specific discussions to keep channels organized and easier to follow.
- Tag others only when their input is critical or urgent, and use @here, @channel or @everyone sparingly to minimize disruptions.
- Be patient, as open-source contributors and maintainers often have other commitments and may need time to respond.
- Post questions or discussions in the most relevant channel (e.g., [slack - #general](https://app.slack.com/client/T06P212QSEA/C06P5NCGSFP) for general topics, [slack - #questions](https://openhands-ai.slack.com/archives/C06U8UTKSAD) for queries/questions, [discord - #general](https://discord.com/channels/1222935860639563850/1222935861386018885)).
- When asking for help or raising issues, include necessary details like links, screenshots, or clear explanations to provide context.
- Keep discussions in public channels whenever possible to allow others to benefit from the conversation, unless the matter is sensitive or private.
- Always adhere to [our standards](https://github.com/All-Hands-AI/OpenHands/blob/main/CODE_OF_CONDUCT.md#our-standards) to ensure a welcoming and collaborative environment.
- If you choose to mute a channel, consider setting up alerts for topics that still interest you to stay engaged. For Slack, go to Settings → Notifications → My Keywords to add specific keywords that will notify you when they are mentioned. For example, if you're here for discussions about LLMs, mute the channel if it’s too busy, but set notifications to alert you only when “LLMs” appears in messages. For Discord, go to the channel notifications and choose the option that best describes your need.

## Attribution

This Code of Conduct is adapted from the [Contributor Covenant][homepage],
12 changes: 6 additions & 6 deletions CONTRIBUTING.md
@@ -11,19 +11,19 @@ To understand the codebase, please refer to the README in each module:
- [agenthub](./openhands/agenthub/README.md)
- [server](./openhands/server/README.md)

## Setting up your development environment
## Setting up Your Development Environment

We have a separate doc [Development.md](https://github.com/All-Hands-AI/OpenHands/blob/main/Development.md) that tells you how to set up a development workflow.

## How can I contribute?
## How Can I Contribute?

There are many ways that you can contribute:

1. **Download and use** OpenHands, and send [issues](https://github.com/All-Hands-AI/OpenHands/issues) when you encounter something that isn't working or a feature that you'd like to see.
2. **Send feedback** after each session by [clicking the thumbs-up thumbs-down buttons](https://docs.all-hands.dev/modules/usage/feedback), so we can see where things are working and failing, and also build an open dataset for training code agents.
3. **Improve the Codebase** by sending [PRs](#sending-pull-requests-to-openhands) (see details below). In particular, we have some [good first issues](https://github.com/All-Hands-AI/OpenHands/labels/good%20first%20issue) that may be ones to start on.

## What can I build?
## What Can I Build?
Here are a few ways you can help improve the codebase.

#### UI/UX
@@ -35,7 +35,7 @@ of the application, please open an issue first, or better, join the #frontend ch
to gather consensus from our design team first.

#### Improving the agent
Our main agent is the CodeAct agent. You can [see its prompts here](https://github.com/All-Hands-AI/OpenHands/tree/main/openhands/agenthub/codeact_agent)
Our main agent is the CodeAct agent. You can [see its prompts here](https://github.com/All-Hands-AI/OpenHands/tree/main/openhands/agenthub/codeact_agent).

Changes to these prompts, and to the underlying behavior in Python, can have a huge impact on user experience.
You can try modifying the prompts to see how they change the behavior of the agent as you use the app
@@ -63,7 +63,7 @@ At the moment, we have two kinds of tests: [`unit`](./tests/unit) and [`integrat
## Sending Pull Requests to OpenHands

You'll need to fork our repository to send us a Pull Request. You can learn more
about how to fork a GitHub repo and open a PR with your changes in [this article](https://medium.com/swlh/forks-and-pull-requests-how-to-contribute-to-github-repos-8843fac34ce8)
about how to fork a GitHub repo and open a PR with your changes in [this article](https://medium.com/swlh/forks-and-pull-requests-how-to-contribute-to-github-repos-8843fac34ce8).

### Pull Request title
As described [here](https://github.com/commitizen/conventional-commit-types/blob/master/index.json), a valid PR title should begin with one of the following prefixes:
@@ -103,7 +103,7 @@ Further, if you see an issue you like, please leave a "thumbs-up" or a comment,

### Making Pull Requests

We're generally happy to consider all [PRs](https://github.com/All-Hands-AI/OpenHands/pulls), with the evaluation process varying based on the type of change:
We're generally happy to consider all pull requests, with the evaluation process varying based on the type of change:

#### For Small Improvements

20 changes: 10 additions & 10 deletions Development.md
@@ -3,9 +3,9 @@ This guide is for people working on OpenHands and editing the source code.
If you wish to contribute your changes, check out the [CONTRIBUTING.md](https://github.com/All-Hands-AI/OpenHands/blob/main/CONTRIBUTING.md) on how to clone and setup the project initially before moving on.
Otherwise, you can clone the OpenHands project directly.

## Start the server for development
## Start the Server for Development
### 1. Requirements
* Linux, Mac OS, or [WSL on Windows](https://learn.microsoft.com/en-us/windows/wsl/install) [Ubuntu <= 22.04]
* Linux, Mac OS, or [WSL on Windows](https://learn.microsoft.com/en-us/windows/wsl/install) [Ubuntu >= 22.04]
* [Docker](https://docs.docker.com/engine/install/) (For those on MacOS, make sure to allow the default Docker socket to be used from advanced settings!)
* [Python](https://www.python.org/downloads/) = 3.12
* [NodeJS](https://nodejs.org/en/download/package-manager) >= 20.x
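A quick way to sanity-check the toolchain requirements listed above (exact output varies by system):

```bash
# Verify the development prerequisites before building.
python3 --version   # expect 3.12.x
node --version      # expect v20.x or newer
docker --version    # any recent Docker Engine
```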
@@ -58,7 +58,7 @@ See [our documentation](https://docs.all-hands.dev/modules/usage/llms) for recom

### 4. Running the application
#### Option A: Run the Full Application
Once the setup is complete, launching OpenHands is as simple as running a single command. This command starts both the backend and frontend servers seamlessly, allowing you to interact with OpenHands:
Once the setup is complete, this command starts both the backend and frontend servers, allowing you to interact with OpenHands:
```bash
make run
```
@@ -75,11 +75,11 @@ make run
```

### 6. LLM Debugging
If you encounter any issues with the Language Model (LM) or you're simply curious, you can inspect the actual LLM prompts and responses. To do so, export DEBUG=1 in the environment and restart the backend.
OpenHands will then log the prompts and responses in the logs/llm/CURRENT_DATE directory, allowing you to identify the causes.
If you encounter any issues with the Language Model (LM) or you're simply curious, export DEBUG=1 in the environment and restart the backend.
OpenHands will log the prompts and responses in the logs/llm/CURRENT_DATE directory, allowing you to identify the causes.
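Concretely, assuming the backend is started via make as described earlier, enabling the debug logs looks like this:

```bash
# Turn on LLM prompt/response logging, then restart the backend.
export DEBUG=1
make run
# Prompts and responses are then written under logs/llm/<CURRENT_DATE>/ for inspection.
```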
### 7. Help
Need assistance or information on available targets and commands? The help command provides all the necessary guidance to ensure a smooth experience with OpenHands.
Need help or info on available targets and commands? Use the help command for all the guidance you need with OpenHands.
```bash
make help
```
@@ -93,14 +93,14 @@ poetry run pytest ./tests/unit/test_*.py
```
### 9. Add or update dependency
1. Add your dependency in `pyproject.toml` or use `poetry add xxx`
2. Update the poetry.lock file via `poetry lock --no-update`
1. Add your dependency in `pyproject.toml` or use `poetry add xxx`.
2. Update the poetry.lock file via `poetry lock --no-update`.
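For example, a typical flow might look like this (the package name is purely illustrative):

```bash
# Add a new dependency, then refresh the lock file without upgrading other pins.
poetry add requests          # or edit pyproject.toml by hand
poetry lock --no-update      # regenerate poetry.lock for the new entry only
```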
### 9. Use existing Docker image
To reduce build time (e.g., if no changes were made to the client-runtime component), you can use an existing Docker container image by
setting the SANDBOX_RUNTIME_CONTAINER_IMAGE environment variable to the desired Docker image.
Example: `export SANDBOX_RUNTIME_CONTAINER_IMAGE=ghcr.io/all-hands-ai/runtime:0.18-nikolaik`
Example: `export SANDBOX_RUNTIME_CONTAINER_IMAGE=ghcr.io/all-hands-ai/runtime:0.20-nikolaik`
## Develop inside Docker container
@@ -110,7 +110,7 @@ TL;DR
make docker-dev
```
See more details [here](./containers/dev/README.md)
See more details [here](./containers/dev/README.md).
If you are just interested in running `OpenHands` without installing all the required tools on your host.