Commit
(fix) Fix runtime (RT) tests and split tests in 2 actions (openhands/root) (All-Hands-AI#3791)

Co-authored-by: Engel Nyst <[email protected]>
tobitege and enyst authored Sep 14, 2024
1 parent 57390eb commit 554636c
Showing 21 changed files with 878 additions and 713 deletions.
89 changes: 81 additions & 8 deletions .github/workflows/ghcr_runtime.yml
@@ -1,5 +1,5 @@
# Workflow that builds, tests and then pushes the runtime docker images to the ghcr.io repository
name: Build, Test and Publish Runtime Image
name: Build, Test and Publish RT Image

# Only run one workflow of the same group at a time.
# There can be at most one running and one pending job in a concurrency group at any time.
@@ -104,9 +104,9 @@ jobs:
name: runtime-${{ matrix.base_image.tag }}
path: /tmp/runtime-${{ matrix.base_image.tag }}.tar

# Run unit tests with the EventStream runtime Docker images
test_runtime:
name: Test Runtime
# Run unit tests with the EventStream runtime Docker images as root
test_runtime_root:
name: RT Unit Tests (Root)
needs: [ghcr_build_runtime]
runs-on: ubuntu-latest
strategy:
@@ -164,19 +164,92 @@ jobs:
image_name=ghcr.io/${{ github.repository_owner }}/runtime:${{ github.sha }}-${{ matrix.base_image }}
image_name=$(echo $image_name | tr '[:upper:]' '[:lower:]')
SKIP_CONTAINER_LOGS=true \
TEST_RUNTIME=eventstream \
SANDBOX_USER_ID=$(id -u) \
SANDBOX_BASE_CONTAINER_IMAGE=$image_name \
TEST_IN_CI=true \
poetry run pytest -n 2 --reruns 2 --cov=agenthub --cov=openhands --cov-report=xml -s ./tests/runtime
RUN_AS_OPENHANDS=false \
poetry run pytest -n 3 --reruns 1 --reruns-delay 3 --cov=agenthub --cov=openhands --cov-report=xml -s ./tests/runtime
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v4
env:
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}

# Run unit tests with the EventStream runtime Docker images as openhands user
test_runtime_oh:
name: RT Unit Tests (openhands)
runs-on: ubuntu-latest
needs: [ghcr_build_runtime]
strategy:
matrix:
base_image: ['nikolaik']
steps:
- uses: actions/checkout@v4
- name: Free Disk Space (Ubuntu)
uses: jlumbroso/free-disk-space@main
with:
tool-cache: true
android: true
dotnet: true
haskell: true
large-packages: true
swap-storage: true
# Forked repos can't push to GHCR, so we need to download the image as an artifact
- name: Download runtime image for fork
if: github.event.pull_request.head.repo.fork
uses: actions/download-artifact@v4
with:
name: runtime-${{ matrix.base_image }}
path: /tmp
- name: Load runtime image for fork
if: github.event.pull_request.head.repo.fork
run: |
docker load --input /tmp/runtime-${{ matrix.base_image }}.tar
- name: Cache Poetry dependencies
uses: actions/cache@v4
with:
path: |
~/.cache/pypoetry
~/.virtualenvs
key: ${{ runner.os }}-poetry-${{ hashFiles('**/poetry.lock') }}
restore-keys: |
${{ runner.os }}-poetry-
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.11'
- name: Install poetry via pipx
run: pipx install poetry
- name: Install Python dependencies using Poetry
run: make install-python-dependencies
- name: Run runtime tests
run: |
# We install pytest-xdist in order to run tests across CPUs. However, tests start to fail when we run
# them across more than 2 CPUs for some reason
poetry run pip install pytest-xdist
# Install pytest-rerunfailures so flaky tests can be retried on failure
poetry run pip install pytest-rerunfailures
image_name=ghcr.io/${{ github.repository_owner }}/runtime:${{ github.sha }}-${{ matrix.base_image }}
image_name=$(echo $image_name | tr '[:upper:]' '[:lower:]')
SKIP_CONTAINER_LOGS=true \
TEST_RUNTIME=eventstream \
SANDBOX_USER_ID=$(id -u) \
SANDBOX_BASE_CONTAINER_IMAGE=$image_name \
TEST_IN_CI=true \
RUN_AS_OPENHANDS=true \
poetry run pytest -n 3 --reruns 1 --reruns-delay 3 --cov=agenthub --cov=openhands --cov-report=xml -s ./tests/runtime
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v4
env:
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
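The two unit-test jobs are otherwise identical; they differ only in the RUN_AS_OPENHANDS value exported before pytest (false in the root job, true here). A minimal sketch of how a test setup might translate these environment variables into a runtime configuration — the helper name and config keys below are hypothetical, not the repository's actual fixtures:

```python
import os

def sandbox_config_from_env() -> dict:
    """Hypothetical helper: derive sandbox test settings from the CI environment."""
    return {
        # Runtime image built earlier in the workflow (loaded from an artifact on forks)
        'base_container_image': os.environ['SANDBOX_BASE_CONTAINER_IMAGE'],
        # UID of the CI runner user, so files written in the sandbox stay writable on the host
        'user_id': int(os.environ.get('SANDBOX_USER_ID', '1000')),
        # 'false' -> commands run as root inside the sandbox; 'true' -> as the openhands user
        'run_as_openhands': os.environ.get('RUN_AS_OPENHANDS', 'true').lower() == 'true',
    }
```

Splitting the matrix this way keeps each pytest invocation shorter and lets the root and openhands runs be retried and report coverage independently.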

# Run integration tests with the eventstream runtime Docker image
runtime_integration_tests_on_linux:
name: Runtime Integration Tests on Linux
name: RT Integration Tests (Linux)
runs-on: ubuntu-latest
needs: [ghcr_build_runtime]
strategy:
@@ -237,7 +310,7 @@ jobs:
name: All Runtime Tests Passed
if: ${{ !cancelled() && !contains(needs.*.result, 'failure') && !contains(needs.*.result, 'cancelled') }}
runs-on: ubuntu-latest
needs: [test_runtime, runtime_integration_tests_on_linux]
needs: [test_runtime_root, test_runtime_oh, runtime_integration_tests_on_linux]
steps:
- name: All tests passed
run: echo "All runtime tests have passed successfully!"
@@ -246,7 +319,7 @@
name: All Runtime Tests Passed
if: ${{ cancelled() || contains(needs.*.result, 'failure') || contains(needs.*.result, 'cancelled') }}
runs-on: ubuntu-latest
needs: [test_runtime, runtime_integration_tests_on_linux]
needs: [test_runtime_root, test_runtime_oh, runtime_integration_tests_on_linux]
steps:
- name: Some tests failed
run: |
12 changes: 6 additions & 6 deletions openhands/core/config.py
@@ -507,7 +507,7 @@ def load_from_toml(cfg: AppConfig, toml_file: str = 'config.toml'):
if isinstance(value, dict):
try:
if key is not None and key.lower() == 'agent':
logger.openhands_logger.info(
logger.openhands_logger.debug(
'Attempt to load default agent config from config toml'
)
non_dict_fields = {
@@ -517,13 +517,13 @@ def load_from_toml(cfg: AppConfig, toml_file: str = 'config.toml'):
cfg.set_agent_config(agent_config, 'agent')
for nested_key, nested_value in value.items():
if isinstance(nested_value, dict):
logger.openhands_logger.info(
logger.openhands_logger.debug(
f'Attempt to load group {nested_key} from config toml as agent config'
)
agent_config = AgentConfig(**nested_value)
cfg.set_agent_config(agent_config, nested_key)
elif key is not None and key.lower() == 'llm':
logger.openhands_logger.info(
logger.openhands_logger.debug(
'Attempt to load default LLM config from config toml'
)
non_dict_fields = {
@@ -533,7 +533,7 @@ def load_from_toml(cfg: AppConfig, toml_file: str = 'config.toml'):
cfg.set_llm_config(llm_config, 'llm')
for nested_key, nested_value in value.items():
if isinstance(nested_value, dict):
logger.openhands_logger.info(
logger.openhands_logger.debug(
f'Attempt to load group {nested_key} from config toml as llm config'
)
llm_config = LLMConfig(**nested_value)
@@ -584,10 +584,10 @@ def load_from_toml(cfg: AppConfig, toml_file: str = 'config.toml'):

def finalize_config(cfg: AppConfig):
"""More tweaks to the config after it's been loaded."""
cfg.workspace_base = os.path.abspath(cfg.workspace_base)
# Set workspace_mount_path if not set by the user
if cfg.workspace_mount_path is UndefinedString.UNDEFINED:
cfg.workspace_mount_path = os.path.abspath(cfg.workspace_base)
cfg.workspace_base = os.path.abspath(cfg.workspace_base)
cfg.workspace_mount_path = cfg.workspace_base

if cfg.workspace_mount_rewrite: # and not config.workspace_mount_path:
# TODO why do we need to check if workspace_mount_path is None?
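The finalize_config change above moves the abspath resolution into the fallback branch, so workspace_base is normalised and reused for workspace_mount_path only when the user left the mount path undefined. A rough illustration of that intent, using a simplified stand-in for AppConfig rather than the real class:

```python
import os
from dataclasses import dataclass

UNDEFINED = object()  # stand-in for UndefinedString.UNDEFINED

@dataclass
class FakeConfig:
    workspace_base: str = './workspace'
    workspace_mount_path: object = UNDEFINED

def finalize(cfg: FakeConfig) -> None:
    # Only resolve the base and fill in the mount path when the user did not set one
    if cfg.workspace_mount_path is UNDEFINED:
        cfg.workspace_base = os.path.abspath(cfg.workspace_base)
        cfg.workspace_mount_path = cfg.workspace_base

cfg = FakeConfig()
finalize(cfg)
assert cfg.workspace_mount_path == os.path.abspath('./workspace')
```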
4 changes: 4 additions & 0 deletions openhands/runtime/builder/docker.py
@@ -68,6 +68,10 @@ def image_exists(self, image_name: str) -> bool:
Returns:
bool: Whether the Docker image exists in the registry or in the local store
"""
if not image_name:
logger.error(f'Invalid image name: `{image_name}`')
return False

try:
logger.info(f'Checking, if image exists locally:\n{image_name}')
self.docker_client.images.get(image_name)
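For context, the guard added above returns early on an empty or None image name instead of handing an invalid reference to the Docker SDK. A simplified sketch of the same pattern (local check only; the real method also falls back to checking the remote registry):

```python
import logging

import docker
from docker.errors import ImageNotFound

logger = logging.getLogger(__name__)

def image_exists(client: docker.DockerClient, image_name: str | None) -> bool:
    """Return True if the image is present locally; reject empty names up front."""
    if not image_name:
        logger.error('Invalid image name: `%s`', image_name)
        return False
    try:
        client.images.get(image_name)
        return True
    except ImageNotFound:
        return False
```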
113 changes: 83 additions & 30 deletions openhands/runtime/client/client.py
Expand Up @@ -84,7 +84,6 @@ def __init__(
self.lock = asyncio.Lock()
self.plugins: dict[str, Plugin] = {}
self.browser = BrowserEnv(browsergym_eval_env)
self._initial_pwd = work_dir

@property
def initial_pwd(self):
@@ -116,44 +115,96 @@ async def ainit(self):
logger.info('Runtime client initialized.')

def _init_user(self, username: str, user_id: int) -> None:
"""Create user if not exists."""
"""Create working directory and user if not exists.
It performs the following steps effectively:
* Creates the Working Directory:
- Uses mkdir -p to create the directory.
- Sets ownership to username:root.
- Adjusts permissions to be readable and writable by group and others.
* User Verification and Creation:
- Checks if the user exists using id -u.
- If the user exists with the correct UID, it skips creation.
- If the UID differs, it logs a warning and updates self.user_id.
- If the user doesn't exist, it proceeds to create the user.
* Sudo Configuration:
- Writes '%sudo ALL=(ALL) NOPASSWD:ALL' to /etc/sudoers.d/99_sudo to grant
passwordless sudo access to the sudo group.
- Adds the user to the sudo group with the useradd command, handling
UID conflicts by incrementing the UID if necessary.
"""

# First create the working directory, independent of the user
logger.info(f'Client working directory: {self.initial_pwd}')
command = f'umask 002; mkdir -p {self.initial_pwd}'
output = subprocess.run(command, shell=True, capture_output=True)
out_str = output.stdout.decode()

command = f'chown -R {username}:root {self.initial_pwd}'
output = subprocess.run(command, shell=True, capture_output=True)
out_str += output.stdout.decode()

command = f'chmod g+rw {self.initial_pwd}'
output = subprocess.run(command, shell=True, capture_output=True)
out_str += output.stdout.decode()
logger.debug(f'Created working directory. Output: [{out_str}]')

# Skip root since it is already created
if username == 'root':
return

# Check if the username already exists
existing_user_id = -1
try:
subprocess.run(
result = subprocess.run(
f'id -u {username}', shell=True, check=True, capture_output=True
)
logger.debug(f'User {username} already exists. Skipping creation.')
existing_user_id = int(result.stdout.decode().strip())

# The user ID already exists, skip setup
if existing_user_id == user_id:
logger.debug(
f'User `{username}` already has the provided UID {user_id}. Skipping user setup.'
)
else:
logger.warning(
f'User `{username}` already exists with UID {existing_user_id}. Skipping user setup.'
)
self.user_id = existing_user_id
return
except subprocess.CalledProcessError:
pass # User does not exist, continue with creation
except subprocess.CalledProcessError as e:
# Return code 1 indicates that the user does not exist yet
if e.returncode == 1:
logger.debug(
f'User `{username}` does not exist. Proceeding with user creation.'
)
else:
logger.error(
f'Error checking user `{username}`:\n{e}\n'
)
raise

# Add sudoer
sudoer_line = r"echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers"
output = subprocess.run(sudoer_line, shell=True, capture_output=True)
if output.returncode != 0:
raise RuntimeError(f'Failed to add sudoer: {output.stderr.decode()}')
logger.debug(f'Added sudoer successfully. Output: [{output.stdout.decode()}]')
sudoer_line = '%sudo ALL=(ALL) NOPASSWD:ALL\n'  # plain string so \n is a real newline
sudoers_path = '/etc/sudoers.d/99_sudo'
if not Path(sudoers_path).exists():
with open(sudoers_path, 'w') as f:
f.write(sudoer_line)
output = subprocess.run(['chmod', '0440', sudoers_path])
if output.returncode != 0:
logger.error('Failed to chmod 99_sudo file!')
else:
logger.debug('Added sudoer successfully.')

# Attempt to add the user, retrying with incremented user_id if necessary
while True:
command = (
f'useradd -rm -d /home/{username} -s /bin/bash '
f'-g root -G sudo -u {user_id} {username}'
)

if not os.path.exists(self.initial_pwd):
command += f' && mkdir -p {self.initial_pwd}'
command += f' && chown -R {username}:root {self.initial_pwd}'
command += f' && chmod g+s {self.initial_pwd}'

output = subprocess.run(command, shell=True, capture_output=True)
if output.returncode == 0:
logger.debug(
f'Added user {username} successfully with UID {user_id}. Output: [{output.stdout.decode()}]'
f'Added user `{username}` successfully with UID {user_id}. Output: [{output.stdout.decode()}]'
)
break
elif f'UID {user_id} is not unique' in output.stderr.decode():
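On the sudoers side, the commit replaces the old `echo ... >> /etc/sudoers` append with a drop-in file under /etc/sudoers.d, written once and chmod'ed to 0440. A hedged sketch of that approach with an extra `visudo -cf` syntax check layered on top — the check is a precaution this commit does not perform, and it assumes visudo is available in the image:

```python
import subprocess
from pathlib import Path

SUDOERS_DROPIN = Path('/etc/sudoers.d/99_sudo')

def write_sudoers_dropin() -> None:
    """Write a passwordless-sudo drop-in for the sudo group and sanity-check it."""
    if not SUDOERS_DROPIN.exists():
        # A real trailing newline matters; sudo is strict about sudoers syntax
        SUDOERS_DROPIN.write_text('%sudo ALL=(ALL) NOPASSWD:ALL\n')
        SUDOERS_DROPIN.chmod(0o440)  # sudo ignores drop-ins that are group/world writable
    # visudo -cf parses the file without installing it; non-zero means a syntax error
    check = subprocess.run(['visudo', '-cf', str(SUDOERS_DROPIN)], capture_output=True)
    if check.returncode != 0:
        raise RuntimeError(f'Invalid sudoers drop-in: {check.stderr.decode()}')
```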
@@ -163,7 +214,7 @@ def _init_user(self, username: str, user_id: int) -> None:
user_id += 1
else:
raise RuntimeError(
f'Failed to create user {username}: {output.stderr.decode()}'
f'Failed to create user `{username}`! Output: [{output.stderr.decode()}]'
)

def _init_bash_shell(self, work_dir: str, username: str) -> None:
@@ -181,17 +232,20 @@ def _init_bash_shell(self, work_dir: str, username: str) -> None:

# This should NOT match "PS1=\u@\h:\w [PEXPECT]$" when `env` is executed
self.__bash_expect_regex = r'\[PEXPECT_BEGIN\]\s*(.*?)\s*([a-z0-9_-]*)@([a-zA-Z0-9.-]*):(.+)\s*\[PEXPECT_END\]'

self.shell.sendline(f'export PS1="{self.__bash_PS1}"; export PS2=""')
# Set umask to allow group write permissions
self.shell.sendline(f'umask 002; export PS1="{self.__bash_PS1}"; export PS2=""')
self.shell.expect(self.__bash_expect_regex)

self.shell.sendline(
f'if [ ! -d "{work_dir}" ]; then mkdir -p "{work_dir}"; fi && cd "{work_dir}"'
)
self.shell.expect(self.__bash_expect_regex)
logger.debug(
f'Bash initialized. Working directory: {work_dir}. Output: {self.shell.before}'
f'Bash initialized. Working directory: {work_dir}. Output: [{self.shell.before}]'
)
# Ensure the group has write permissions on the working directory
self.shell.sendline(f'chmod g+rw "{work_dir}"')
self.shell.expect(self.__bash_expect_regex)

async def _init_bash_commands(self):
logger.info(f'Initializing by running {len(INIT_COMMANDS)} bash commands...')
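The `umask 002` added during shell initialisation, the `chmod g+rw` on the working directory, and the 0o664 mode applied to newly written files later in this diff all serve the same goal: files created by one user stay group-writable for the other. A quick illustration of how a 002 umask maps onto default creation modes:

```python
# Illustration only: how a 002 umask affects default permission bits.
# Files start from a 0666 base, directories from 0777; the umask clears bits.
UMASK = 0o002

default_file_mode = 0o666 & ~UMASK   # 0o664 -> rw-rw-r--
default_dir_mode = 0o777 & ~UMASK    # 0o775 -> rwxrwxr-x

assert oct(default_file_mode) == '0o664'
assert oct(default_dir_mode) == '0o775'
```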
@@ -295,14 +349,14 @@ def _continue_bash(
bash_prompt = self._get_bash_prompt_and_update_pwd()
if keep_prompt:
output += '\r\n' + bash_prompt
logger.debug(f'Command output: {output}')
# logger.debug(f'Command output:\n{output}')
return output, exit_code

async def run_action(self, action) -> Observation:
action_type = action.action
logger.debug(f'Running action: {action}')
logger.debug(f'Running action:\n{action}')
observation = await getattr(self, action_type)(action)
logger.debug(f'Action output: {observation}')
logger.debug(f'Action output:\n{observation}')
return observation

async def run(self, action: CmdRunAction) -> CmdOutputObservation:
@@ -355,10 +409,9 @@ async def run_ipython(self, action: IPythonRunCellAction) -> Observation:
_jupyter_plugin: JupyterPlugin = self.plugins['jupyter'] # type: ignore
# This is used to make AgentSkills in Jupyter aware of the
# current working directory in Bash
if self.pwd != getattr(self, '_jupyter_pwd', None):
logger.debug(
f"{self.pwd} != {getattr(self, '_jupyter_pwd', None)} -> reset Jupyter PWD"
)
jupyter_pwd = getattr(self, '_jupyter_pwd', None)
if self.pwd != jupyter_pwd:
logger.debug(f'{self.pwd} != {jupyter_pwd} -> reset Jupyter PWD')
reset_jupyter_pwd_code = f'import os; os.chdir("{self.pwd}")'
_aux_action = IPythonRunCellAction(code=reset_jupyter_pwd_code)
_reset_obs = await _jupyter_plugin.run(_aux_action)
@@ -450,7 +503,7 @@ async def write(self, action: FileWriteAction) -> Observation:
os.chown(filepath, file_stat.st_uid, file_stat.st_gid)
else:
# set the new file permissions if the file is new
os.chmod(filepath, 0o644)
os.chmod(filepath, 0o664)
os.chown(filepath, self.user_id, self.user_id)

except FileNotFoundError:
