diff --git a/.github/workflows/benchmarks.yaml b/.github/workflows/benchmarks.yaml index 3e853e43..a6f54d58 100644 --- a/.github/workflows/benchmarks.yaml +++ b/.github/workflows/benchmarks.yaml @@ -108,12 +108,19 @@ jobs: count = args[2]; } } - core.setOutput('path', path); + + core.setOutput( + 'path', + path + .split(',') + .map(x => `./autotx/tests/${x}`) + .join(',') + ); core.setOutput('count', count); - name: Run Benchmarks run: | - python benchmarks.py ./autotx/tests/${{ steps.parse_args.outputs.path }} ${{ steps.parse_args.outputs.count }} benchmark-results + python benchmarks.py ${{ steps.parse_args.outputs.path }} ${{ steps.parse_args.outputs.count }} benchmark-results - name: Comment on PR with benchmark results uses: actions/github-script@v6 @@ -131,7 +138,39 @@ jobs: }) - name: Upload artifact + id: upload-artifact uses: actions/upload-artifact@v2 with: name: benchmarks-debug path: ./benchmarks/benchmark-results + + - name: Fetch Artifact ID + id: fetch-artifact-id + uses: actions/github-script@v6 + with: + script: | + const run_id = ${{ github.run_id }}; + const artifacts = await github.rest.actions.listWorkflowRunArtifacts({ + owner: context.repo.owner, + repo: context.repo.repo, + run_id: run_id, + }); + const artifact = artifacts.data.artifacts.find(artifact => artifact.name === 'benchmarks-debug'); + return artifact.id; + + - name: Comment on PR with benchmark results and download link + uses: actions/github-script@v6 + with: + script: | + const fs = require('fs'); + const summaryPath = './benchmarks/benchmark-results/summary.md'; + const contents = fs.readFileSync(summaryPath, 'utf8'); + const artifactID = ${{ steps.fetch-artifact-id.outputs.result }}; + const downloadURL = `https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}/artifacts/${artifactID}`; + + github.rest.issues.updateComment({ + comment_id: ${{ env.BENCHMARKS_COMMENT_ID }}, + owner: context.repo.owner, + repo: context.repo.repo, + body: '[Finished 
benchmarks](https://github.com/${{github.repository}}/actions/runs/${{github.run_id}})\n\n' + contents + '\n\n[Download Artifacts](' + downloadURL + ')' + }) diff --git a/autotx/AutoTx.py b/autotx/AutoTx.py index c4eb30a1..c88c0e8c 100644 --- a/autotx/AutoTx.py +++ b/autotx/AutoTx.py @@ -96,6 +96,7 @@ def run(self, prompt: str, non_interactive: bool) -> None: Verifier is an expert in verifiying if user goals are met. Verifier analyzes chat and responds with TERMINATE if the goal is met. Verifier can consider the goal met if the other agents have prepared the necessary transactions. + Let the other agents complete all the necessary parts of the goal before calling TERMINATE. If some information needs to be returned to the user or if there are any errors encountered during the process, add this in your answer. Start any error messages with "ERROR:" to clearly indicate the issue. Then say the word TERMINATE. diff --git a/autotx/agents/ResearchTokensAgent.py b/autotx/agents/ResearchTokensAgent.py index 1a38b92e..2333f540 100644 --- a/autotx/agents/ResearchTokensAgent.py +++ b/autotx/agents/ResearchTokensAgent.py @@ -17,6 +17,7 @@ system_message = lambda autotx: dedent(f""" You are an AI assistant. Assist the user (address: {autotx.manager.address}) in their task of researching tokens. You are an expert in Ethereum tokens and can help users research tokens. + ONLY focus on the token research aspect of the user's goal and let other agents handle other tasks. You use the tools available to assist the user in their tasks. Retrieve token information, get token price, market cap, and price change percentage """ diff --git a/autotx/agents/SendTokensAgent.py b/autotx/agents/SendTokensAgent.py index 94a7d4be..dfb21937 100644 --- a/autotx/agents/SendTokensAgent.py +++ b/autotx/agents/SendTokensAgent.py @@ -20,11 +20,13 @@ name = "send-tokens" system_message = lambda autotx: dedent(f""" - You are an AI assistant. 
Assist the user (address: {autotx.manager.address}) in their tasks by fetching balances and preparing transactions to send tokens. - You are an expert in Ethereum tokens (native and erc20) and can help users send tokens and check their balances. + You are an expert in Ethereum tokens (native and erc20) and can assist the user (address: {autotx.manager.address}) in their tasks by fetching balances and preparing transactions to send tokens. + ONLY focus on the sending and balance aspect of the user's goal and let other agents handle other tasks. You use the tools available to assist the user in their tasks. - Your job is to only prepare the transactions and the user will take care of executing them. - NOTE: There is no reason to call get_balance after calling transfer as the transfers are only prepared and not executed. + Your job is to only prepare the transactions by calling the prepare_transfer_transaction tool and the user will take care of executing them. + NOTE: There is no reason to call get_token_balance after calling prepare_transfer_transaction as the transfers are only prepared and not executed. + Do not just respond with JSON, instead call the tools with the correct arguments. + Take extra care in the order of transactions to prepare. """ ) @@ -42,6 +44,8 @@ def run( receiver: Annotated[str, "The receiver's address or ENS domain"], token: Annotated[str, "Symbol of token to transfer"] ) -> str: + amount = float(amount) + web3 = load_w3() receiver_addr = ETHAddress(receiver, web3) token_address = ETHAddress(autotx.network.tokens[token.lower()], web3) diff --git a/autotx/agents/SwapTokensAgent.py b/autotx/agents/SwapTokensAgent.py index 0a19b8ff..6a888517 100644 --- a/autotx/agents/SwapTokensAgent.py +++ b/autotx/agents/SwapTokensAgent.py @@ -12,26 +12,62 @@ system_message = lambda autotx: dedent(f""" You are an expert at buying and selling tokens. Assist the user (address: {autotx.manager.address}) in their task of swapping tokens. 
+ ONLY focus on the buy and sell (swap) aspect of the user's goal and let other agents handle other tasks. You use the tools available to assist the user in their tasks. - Perform token swaps, manage liquidity, and query pool statistics on the Uniswap protocol - An autonomous agent skilled in Ethereum blockchain interactions, specifically tailored for the Uniswap V3 protocol. Note a balance of a token is not required to perform a swap, if there is an earlier prepared transaction that will provide the token. - Examples: + Below are examples, NOTE these are only examples and in practice you need to call the prepare_swap_transaction tool with the correct arguments. + Example 1: + User: Send 0.1 ETH to vitalik.eth and then swap ETH to 5 USDC + Advisor reworded: Send 0.1 ETH to vitalik.eth and then buy USDC with 0.1 ETH from address {autotx.manager.address} + ... + Other agent messages + ... + Call prepare_swap_transaction with args: {{ - "token_to_sell": "5 ETH", - "token_to_buy": "USDC" - }} // Prepares a swap transaction to sell 5 ETH and buy USDC + "token_to_sell": "ETH", + "token_to_buy": "5 USDC" + }} + Example 2: + User: Swap ETH to 5 USDC, then swap that USDC for 6 UNI + Advisor reworded: Swap ETH to 5 USDC, then swap 5 USDC for 6 UNI for user address {autotx.manager.address} + Call prepare_swap_transaction with args: {{ "token_to_sell": "ETH", "token_to_buy": "5 USDC" - }} // Prepares a swap transaction to sell ETH and buy 5 USDC - - Invalid Example: + }} + and then + Call prepare_swap_transaction with args: + {{ + "token_to_sell": "USDC", + "token_to_buy": "6 UNI" + }} + + Example 3 (Mistake): + User: Swap ETH for 5 USDC, then swap that USDC for 6 UNI + Advisor reworded: Swap ETH for 5 USDC, then swap 5 USDC for 6 UNI for user address {autotx.manager.address} + Call prepare_swap_transaction with args: {{ - "token_to_sell": "5 ETH", + "token_to_sell": "ETH", "token_to_buy": "5 USDC" - }} // Invalid input. Only one token amount should be provided, not both. 
+ }} + and then + Call prepare_swap_transaction with args: + {{ + "token_to_sell": "5 USDC", + "token_to_buy": "6 UNI" + }} + Invalid input. Only one token amount should be provided. IMPORTANT: Take another look at the user's goal, and try again. + To fix the error run: + Call prepare_swap_transaction with args: + {{ + "token_to_sell": "USDC", + "token_to_buy": "6 UNI" + }} + Above are examples, NOTE these are only examples and in practice you need to call the prepare_swap_transaction tool with the correct arguments. + Take extra care in ensuring you have the right amount next to the token symbol. + Listen to the user more than the advisor! + Only call tools, do not respond with JSON. """ ) diff --git a/autotx/tests/agents/token/test_swap.py b/autotx/tests/agents/token/test_swap.py index 9bf566e7..4dabf411 100644 --- a/autotx/tests/agents/token/test_swap.py +++ b/autotx/tests/agents/token/test_swap.py @@ -2,7 +2,6 @@ from autotx.utils.ethereum.networks import NetworkInfo from autotx.utils.ethereum.eth_address import ETHAddress - def test_auto_tx_swap_with_non_default_token(configuration, auto_tx): (_, _, _, manager) = configuration web3 = load_w3() @@ -48,3 +47,54 @@ def test_auto_tx_swap_multiple(configuration, auto_tx): assert usdc_balance + 500 == manager.balance_of(usdc_address) assert wbtc_balance < manager.balance_of(wbtc_address) + +def test_auto_tx_swap_triple(configuration, auto_tx): + (_, _, _, manager) = configuration + web3 = load_w3() + network_info = NetworkInfo(web3.eth.chain_id) + usdc_address = ETHAddress(network_info.tokens["usdc"], web3) + uni_address = ETHAddress(network_info.tokens["uni"], web3) + wbtc_address = ETHAddress(network_info.tokens["wbtc"], web3) + + prompt = "Buy 1 USDC, 0.5 UNI and 0.05 WBTC with ETH" + usdc_balance = manager.balance_of(usdc_address) + uni_balance = manager.balance_of(uni_address) + wbtc_balance = manager.balance_of(wbtc_address) + + auto_tx.run(prompt, non_interactive=True) + + assert usdc_balance + 1 == 
manager.balance_of(usdc_address) + assert uni_balance + 0.5 == manager.balance_of(uni_address) + assert wbtc_balance + 0.05 == manager.balance_of(wbtc_address) + +def test_auto_tx_swap_complex_1(configuration, auto_tx): # This one is complex because it confuses the LLM with WBTC amount + (_, _, _, manager) = configuration + web3 = load_w3() + network_info = NetworkInfo(web3.eth.chain_id) + usdc_address = ETHAddress(network_info.tokens["usdc"], web3) + wbtc_address = ETHAddress(network_info.tokens["wbtc"], web3) + + prompt = "Swap ETH to 0.05 WBTC, then, swap WBTC to 1000 USDC" + usdc_balance = manager.balance_of(usdc_address) + wbtc_balance = manager.balance_of(wbtc_address) + + auto_tx.run(prompt, non_interactive=True) + + assert usdc_balance + 1000 == manager.balance_of(usdc_address) + assert wbtc_balance < manager.balance_of(wbtc_address) + +def test_auto_tx_swap_complex_2(configuration, auto_tx): # This one is complex because it confuses the LLM with WBTC amount + (_, _, _, manager) = configuration + web3 = load_w3() + network_info = NetworkInfo(web3.eth.chain_id) + usdc_address = ETHAddress(network_info.tokens["usdc"], web3) + wbtc_address = ETHAddress(network_info.tokens["wbtc"], web3) + + prompt = "Buy 1000 USDC with ETH, then sell the USDC to buy 0.001 WBTC" + usdc_balance = manager.balance_of(usdc_address) + wbtc_balance = manager.balance_of(wbtc_address) + + auto_tx.run(prompt, non_interactive=True) + + assert usdc_balance < manager.balance_of(usdc_address) + assert wbtc_balance + 0.001 == manager.balance_of(wbtc_address) diff --git a/autotx/tests/agents/token/test_swap_and_send.py b/autotx/tests/agents/token/test_swap_and_send.py index bf8f18ce..bea7f214 100644 --- a/autotx/tests/agents/token/test_swap_and_send.py +++ b/autotx/tests/agents/token/test_swap_and_send.py @@ -1,15 +1,14 @@ -from autotx.utils.ethereum import get_erc20_balance, load_w3 +from autotx.utils.ethereum import get_erc20_balance, get_native_balance, load_w3 from 
autotx.utils.ethereum.networks import NetworkInfo from autotx.utils.ethereum.eth_address import ETHAddress - -def test_auto_tx_swap_and_send_simple(configuration, auto_tx): +def test_auto_tx_swap_and_send_simple(configuration, auto_tx, test_accounts): (_, _, client, manager) = configuration web3 = load_w3() network_info = NetworkInfo(web3.eth.chain_id) wbtc_address = ETHAddress(network_info.tokens["wbtc"], web3) - receiver = ETHAddress("0x10f8Bf6a479F320ead074411A4b0e7944eA8C9c1", client.w3) + receiver = test_accounts[0] prompt = f"Swap ETH to 0.05 WBTC, and send 0.01 WBTC to {receiver}" @@ -24,14 +23,14 @@ def test_auto_tx_swap_and_send_simple(configuration, auto_tx): assert new_wbtc_safe_address == wbtc_safe_address + 0.04 assert new_receiver_wbtc_balance == receiver_wbtc_balance + 0.01 -def test_auto_tx_swap_and_send_complex(configuration, auto_tx): +def test_auto_tx_swap_and_send_complex(configuration, auto_tx, test_accounts): (_, _, client, manager) = configuration web3 = load_w3() network_info = NetworkInfo(web3.eth.chain_id) usdc_address = ETHAddress(network_info.tokens["usdc"], web3) wbtc_address = ETHAddress(network_info.tokens["wbtc"], web3) - receiver = ETHAddress("0x10f8Bf6a479F320ead074411A4b0e7944eA8C9c1", client.w3) + receiver = test_accounts[0] prompt = f"Swap ETH to 0.05 WBTC, then, swap WBTC to 1000 USDC and send 50 USDC to {receiver}" @@ -47,4 +46,64 @@ def test_auto_tx_swap_and_send_complex(configuration, auto_tx): assert new_wbtc_safe_address > wbtc_safe_address assert new_usdc_safe_address == usdc_safe_address + 950 - assert new_receiver_usdc_balance == receiver_usdc_balance + 50 \ No newline at end of file + assert new_receiver_usdc_balance == receiver_usdc_balance + 50 + +def test_auto_tx_send_and_swap_simple(configuration, auto_tx, test_accounts): + (_, _, client, manager) = configuration + web3 = load_w3() + network_info = NetworkInfo(web3.eth.chain_id) + wbtc_address = ETHAddress(network_info.tokens["wbtc"], web3) + + receiver = 
test_accounts[0] + + prompt = f"Send 0.1 ETH to {receiver}, and then swap ETH to 0.05 WBTC" + + user_wbtc_balance = manager.balance_of(wbtc_address) + receiver_native_balance = get_native_balance(client.w3, receiver) + receiver_wbtc_balance = get_erc20_balance(client.w3, wbtc_address, receiver) + + auto_tx.run(prompt, non_interactive=True) + + new_user_wbtc_balance = manager.balance_of(wbtc_address) + new_receiver_native_balance = get_native_balance(client.w3, receiver) + new_receiver_wbtc_balance = get_erc20_balance(client.w3, wbtc_address, receiver) + + assert new_user_wbtc_balance == user_wbtc_balance + 0.05 + assert receiver_wbtc_balance == 0 + assert new_receiver_wbtc_balance == receiver_wbtc_balance + assert new_receiver_native_balance == receiver_native_balance + 0.1 + +def test_auto_tx_send_and_swap_complex(configuration, auto_tx, test_accounts): + (_, _, client, manager) = configuration + web3 = load_w3() + network_info = NetworkInfo(web3.eth.chain_id) + usdc_address = ETHAddress(network_info.tokens["usdc"], web3) + wbtc_address = ETHAddress(network_info.tokens["wbtc"], web3) + + receiver_1 = test_accounts[0] + receiver_2 = test_accounts[1] + + prompt = f"Send 0.1 ETH to {receiver_1}, then swap ETH to 0.05 WBTC, then, swap WBTC to 1000 USDC and send 50 USDC to {receiver_2}" + + wbtc_safe_address = manager.balance_of(wbtc_address) + usdc_safe_address = manager.balance_of(usdc_address) + receiver_1_native_balance = get_native_balance(client.w3, receiver_1) + receiver_2_usdc_balance = get_erc20_balance(client.w3, usdc_address, receiver_2) + + auto_tx.run(prompt, non_interactive=True) + + new_wbtc_safe_address = manager.balance_of(wbtc_address) + new_usdc_safe_address = manager.balance_of(usdc_address) + new_receiver_1_native_balance = get_native_balance(client.w3, receiver_1) + new_receiver_1_usdc_balance = get_erc20_balance(client.w3, usdc_address, receiver_1) + new_receiver_1_wbtc_balance = get_erc20_balance(client.w3, wbtc_address, receiver_1) + 
new_receiver_2_wbtc_balance = get_erc20_balance(client.w3, wbtc_address, receiver_2) + new_receiver_2_usdc_balance = get_erc20_balance(client.w3, usdc_address, receiver_2) + + assert new_wbtc_safe_address > wbtc_safe_address + assert new_usdc_safe_address == usdc_safe_address + 950 + assert new_receiver_1_native_balance == receiver_1_native_balance + 0.1 + assert new_receiver_1_usdc_balance == 0 + assert new_receiver_1_wbtc_balance == 0 + assert new_receiver_2_usdc_balance == receiver_2_usdc_balance + 50 + assert new_receiver_2_wbtc_balance == 0 \ No newline at end of file diff --git a/autotx/utils/agent/build_goal.py b/autotx/utils/agent/build_goal.py index f891bbf6..e15e7105 100644 --- a/autotx/utils/agent/build_goal.py +++ b/autotx/utils/agent/build_goal.py @@ -78,6 +78,7 @@ def analyze_user_prompt(chat_history: str, agents_information: str, smart_accoun If the prompt is not clear or missing information, you MUST ask for more information. If the prompt is invalid, unsupported or outside the scope of the agents, you MUST ask for a new prompt. Always ensure you have all the information needed to define the goal that can be executed without prior context. + DO NOT make any assumptions about the user's intent or context and ALWAYS take into account the available tools and their descriptions. 
The available agents and tools: {agents_information} diff --git a/benchmarks.py b/benchmarks.py index 29a2a913..c879b744 100644 --- a/benchmarks.py +++ b/benchmarks.py @@ -75,13 +75,20 @@ def print_summary_table(test_path: str, iterations: int, tests_results: dict, to # Calculate total success percentage total_success_percentage = (total_passes / total_attempts * 100) if total_attempts > 0 else 0 - prev_rates = sum(float(total_benchmarks["benchmarks"][result["name"]] if total_benchmarks["benchmarks"].get(result["name"]) else 0.0) for result in tests_results) - any_prev_rate = any(True if total_benchmarks["benchmarks"].get(result["name"]) else False for result in tests_results) - prev_total_success_percentage = (prev_rates / len(tests_results)) if len(tests_results) > 0 else 0 - total_success_color = "lightgreen" if total_success_percentage > prev_total_success_percentage and any_prev_rate else "none" if total_success_percentage == prev_total_success_percentage or prev_total_success_percentage == 0.0 else "red" + prev_rates = list(map(lambda x: float(total_benchmarks["benchmarks"][x["name"]]), filter(lambda result: total_benchmarks["benchmarks"].get(result["name"]), tests_results))) + any_prev_rate = any(prev_rates) + prev_total_success_percentage = sum(prev_rates) / len(prev_rates) if len(prev_rates) > 0 else 0 + + current_rates = list(filter(lambda x: total_benchmarks["benchmarks"].get(x["name"]), tests_results)) + current_total_success_percentage = sum(map(lambda x: x["passes"] / (x["passes"] + x["fails"]) * 100, current_rates)) / len(current_rates) if len(current_rates) > 0 else 0 + + total_success_color = "lightgreen" if current_total_success_percentage > prev_total_success_percentage and any_prev_rate \ + else "yellow" if not any_prev_rate \ + else "none" if current_total_success_percentage == prev_total_success_percentage \ + else "red" - total_sign = "+" if total_success_percentage >= prev_total_success_percentage else "" - total_diff = 
f"({total_sign}{total_success_percentage - prev_total_success_percentage:.0f})" if any_prev_rate and total_success_percentage != prev_total_success_percentage else "" + total_sign = "+" if current_total_success_percentage >= prev_total_success_percentage else "" + total_diff = f" ({current_total_success_percentage:.0f}/{total_sign}{current_total_success_percentage - prev_total_success_percentage:.0f})" if any_prev_rate and current_total_success_percentage != prev_total_success_percentage else "" # Constructing the markdown content md_content = [] @@ -95,22 +102,39 @@ def print_summary_table(test_path: str, iterations: int, tests_results: dict, to for test_result in tests_results: prev_success_rate = float(total_benchmarks["benchmarks"][test_result["name"]] if total_benchmarks["benchmarks"].get(test_result["name"]) else 0.0) + has_prev_rate = True if total_benchmarks["benchmarks"].get(test_result["name"]) else False success_rate = (test_result['passes'] / (test_result['passes'] + test_result['fails'])) * 100 - color = "lightgreen" if success_rate > prev_success_rate and prev_success_rate != 0.0 else "none" if success_rate == prev_success_rate or prev_success_rate == 0.0 else "red" + color = "lightgreen" if success_rate > prev_success_rate and has_prev_rate \ + else "yellow" if not has_prev_rate \ + else "none" if success_rate == prev_success_rate \ + else "red" sign = "+" if success_rate >= prev_success_rate else "" - diff = f"({sign}{success_rate - prev_success_rate:.0f})" if prev_success_rate != 0.0 and success_rate != prev_success_rate else "" + diff = f"({sign}{success_rate - prev_success_rate:.0f})" if has_prev_rate and success_rate != prev_success_rate else "" avg_time = f"{test_result['avg_time']:.0f}s" if test_result['avg_time'] < 60 else f"{test_result['avg_time']/60:.2f}m" md_content.append(f"| `{test_result['name']}` | ${{\\color{{{color}}} \\large \\texttt {{{success_rate:.0f}}} \\normalsize \\texttt {{{diff}}} }}$ | ${{\\color{{{color}}} \\large 
\\texttt {{{test_result['passes']}}}}}$ | ${{\\color{{{color}}} \\large \\texttt {{{test_result['fails']}}}}}$ | {avg_time} |") md_content.append(f"\n**Total run time:** {total_run_time/60:.2f} minutes\n") + print(f"### Test Run Summary\n") + print(f"- **Run from:** `{test_path}`") + print(f"- **Iterations:** {iterations}") + print(f"- **Total Success Rate (%):** {total_success_percentage:.2f}{total_diff}\n") + print(f"### Detailed Results\n") + print(f"| Test Name | Success Rate (%) | Passes | Fails | Avg Time |") + print(f"| --- | --- | --- | --- | --- |") + for test_result in tests_results: + prev_success_rate = float(total_benchmarks["benchmarks"][test_result["name"]] if total_benchmarks["benchmarks"].get(test_result["name"]) else 0.0) + has_prev_rate = True if total_benchmarks["benchmarks"].get(test_result["name"]) else False success_rate = (test_result['passes'] / (test_result['passes'] + test_result['fails'])) * 100 + sign = "+" if success_rate >= prev_success_rate else "" + diff = f"({sign}{success_rate - prev_success_rate:.0f})" if has_prev_rate and success_rate != prev_success_rate else "" + avg_time = f"{test_result['avg_time']:.0f}s" if test_result['avg_time'] < 60 else f"{test_result['avg_time']/60:.2f}m" - print(f"| `{test_result['name']}` | {success_rate:.0f} | {test_result['passes']} | {test_result['fails']} | {avg_time} |") + print(f"| `{test_result['name']}` | {success_rate:.0f}{diff} | {test_result['passes']} | {test_result['fails']} | {avg_time} |") print(f"\n**Total run time:** {total_run_time/60:.2f} minutes\n")