+
+
+
\ No newline at end of file
diff --git a/docs/data/0.json b/docs/data/0.json
new file mode 100644
index 0000000..eb4ab7b
--- /dev/null
+++ b/docs/data/0.json
@@ -0,0 +1,542 @@
+{
+ "0": {
+ "file_id": 0,
+ "content": "/README.md",
+ "type": "filepath"
+ },
+ "1": {
+ "file_id": 0,
+ "content": "AppAgent is an open-source project with MIT License offering overlay grid features for Android Studio emulators. It uses multimodal learning and human demonstrations to enable smartphone app operations, and provides instructions on running and improving its experience, suggesting open-sourcing Benchmark and config files, and citing a 2023 research paper with arXiv ID 2312.13771.",
+ "type": "summary"
+ },
+ "2": {
+ "file_id": 0,
+ "content": "# AppAgent\n
\n![](./assets/teaser.png)\nℹ️Should you encounter any issues⚠️ while using our project, please feel free to report them on [GitHub Issues](https://github.com/mnotgod96/AppAgent/issues) or reach out to [Dr. Chi Zhang](https://icoz69.github.io/) via email at dr.zhang.chi@outlook.com.\n## 📝 Changelog\n- __[2024.1.2]__: 🔥Added an optional method for the agent to bring up a grid overlay on the screen to **tap/swipe anywhere** on the screen.\n- __[2023.12.26]__: Added [Tips](#tips) section for better use experience; added instruction for using the **Android Studio emulator** for\n users who do not have Android devices.",
+ "type": "code",
+ "location": "/README.md:13-28"
+ },
+ "5": {
+ "file_id": 0,
+ "content": "This code appears to be a README file for a project called \"AppAgent\". It lists the contributors and provides instructions on how to use the project. The most recent updates include adding an optional method for bringing up a grid overlay on the screen and providing tips for better user experience, including instructions for using the Android Studio emulator for users without Android devices.",
+ "type": "comment"
+ },
+ "6": {
+ "file_id": 0,
+ "content": "- __[2023.12.21]__: 🔥🔥 Open-sourced the git repository, including the detailed configuration steps to implement our AppAgent!\n## 🔆 Introduction\nWe introduce a novel LLM-based multimodal agent framework designed to operate smartphone applications. \nOur framework enables the agent to operate smartphone applications through a simplified action space, mimicking human-like interactions such as tapping and swiping. This novel approach bypasses the need for system back-end access, thereby broadening its applicability across diverse apps.\nCentral to our agent's functionality is its innovative learning method. The agent learns to navigate and use new apps either through autonomous exploration or by observing human demonstrations. This process generates a knowledge base that the agent refers to for executing complex tasks across different applications.\n## ✨ Demo\nThe demo video shows the process of using AppAgent to follow a user on X (Twitter) in the deployment phase.\nhttps://github.com/mnotgod96/AppAgent/assets/40715314/db99d650-dec1-4531-b4b2-e085bfcadfb7",
+ "type": "code",
+ "location": "/README.md:29-45"
+ },
+ "7": {
+ "file_id": 0,
+ "content": "This code provides an introduction to AppAgent, a multimodal agent framework for operating smartphone applications through a simplified action space. The agent can learn by autonomous exploration or human demonstrations and has a demo video available.",
+ "type": "comment"
+ },
+ "8": {
+ "file_id": 0,
+ "content": "An interesting experiment showing AppAgent's ability to pass CAPTCHA.\nhttps://github.com/mnotgod96/AppAgent/assets/27103154/5cc7ba50-dbab-42a0-a411-a9a862482548\nAn example of using the grid overlay to locate a UI element that is not labeled with a numeric tag.\nhttps://github.com/mnotgod96/AppAgent/assets/27103154/71603333-274c-46ed-8381-2f9a34cdfc53\n## 🚀 Quick Start\nThis section will guide you on how to quickly use `gpt-4-vision-preview` as an agent to complete specific tasks for you on\nyour Android app.\n### ⚙️ Step 1. Prerequisites\n1. On your PC, download and install [Android Debug Bridge](https://developer.android.com/tools/adb) (adb) which is a\n command-line tool that lets you communicate with your Android device from the PC.\n2. Get an Android device and enable the USB debugging that can be found in Developer Options in Settings.\n3. Connect your device to your PC using a USB cable.\n4. (Optional) If you do not have an Android device but still want to try AppAgent. We recommend you download\n [",
+ "type": "code",
+ "location": "/README.md:47-70"
+ },
+ "9": {
+ "file_id": 0,
+ "content": "This code is providing quick start instructions for using the gpt-4-vision-preview as an agent to complete tasks on Android apps. It requires installing Android Debug Bridge, enabling USB debugging on the device, and connecting the device to a PC via USB. An optional method for those without an Android device is also suggested.",
+ "type": "comment"
+ },
+ "10": {
+ "file_id": 0,
+ "content": "Android Studio](https://developer.android.com/studio/run/emulator) and use the emulator that comes with it.\n The emulator can be found in the device manager of Android Studio. You can install apps on an emulator by\n downloading APK files from the internet and dragging them to the emulator.\n AppAgent can detect the emulated device and operate apps on it just like operating a real device.\n \n5. Clone this repo and install the dependencies. All scripts in this project are written in Python 3 so make sure you\n have installed it.\n```bash\ncd AppAgent\npip install -r requirements.txt\n```\n### 🤖 Step 2. Configure the Agent\nAppAgent needs to be powered by a multi-modal model which can receive both text and visual inputs. During our experiment\n, we used `gpt-4-vision-preview` as the model to make decisions on how to take actions to complete a task on the smartphone.",
+ "type": "code",
+ "location": "/README.md:70-88"
+ },
+ "11": {
+ "file_id": 0,
+ "content": "Android Studio is mentioned as a tool for running the code and using the emulator. The emulator can be found in Android Studio's device manager, and APK files from the internet can be installed on it. AppAgent can detect an emulated device and function like a real device.\n\nTo use this code, clone the repository and install Python 3 dependencies by running pip install -r requirements.txt in the project directory.",
+ "type": "comment"
+ },
+ "12": {
+ "file_id": 0,
+ "content": "To configure your requests to GPT-4V, you should modify `config.yaml` in the root directory.\nThere are two key parameters that must be configured to try AppAgent:\n1. OpenAI API key: you must purchase an eligible API key from OpenAI so that you can have access to GPT-4V.\n2. Request interval: this is the time interval in seconds between consecutive GPT-4V requests to control the frequency \nof your requests to GPT-4V. Adjust this value according to the status of your account.\nOther parameters in `config.yaml` are well commented. Modify them as you need.\n> Be aware that GPT-4V is not free. Each request/response pair involved in this project costs around $0.03. Use it wisely.\nIf you want to test AppAgent using your own models, you should modify the `ask_gpt_4v` function in `scripts/model.py` \naccordingly.\n### 🔍 Step 3. Exploration Phase\nOur paper proposed a novel solution that involves two phases, exploration, and deployment, to turn GPT-4V into a capable \nagent that can help users operate their Android phones when a task is given. The exploration phase starts with a task ",
+ "type": "code",
+ "location": "/README.md:90-106"
+ },
+ "13": {
+ "file_id": 0,
+ "content": "Configure requests to GPT-4V by modifying `config.yaml` in the root directory. Provide an eligible OpenAI API key and set request interval to control frequency of GPT-4V requests. Other parameters are well commented, adjust as needed. Be aware that GPT-4V is not free; each request costs around $0.03. Test AppAgent with custom models by modifying `ask_gpt_4v` in `scripts/model.py`.",
+ "type": "comment"
+ },
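A hedged sketch of the model swap mentioned above: replacing `ask_gpt_4v` in `scripts/model.py` with a call to a self-hosted multimodal endpoint. The endpoint URL, model name, and function name below are illustrative assumptions, not part of the repository.

```python
# Hypothetical drop-in replacement for ask_gpt_4v; it assumes an OpenAI-compatible
# endpoint so the rest of AppAgent can still read rsp["choices"][0]["message"]["content"].
import requests

def ask_custom_model(content, endpoint="http://localhost:8000/v1/chat/completions"):
    payload = {
        "model": "my-multimodal-model",  # assumed model name
        "messages": [{"role": "user", "content": content}],
        "temperature": 0.0,
        "max_tokens": 300,
    }
    response = requests.post(endpoint, json=payload, timeout=120)
    return response.json()
```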
+ "14": {
+ "file_id": 0,
+ "content": "given by you, and you can choose to let the agent either explore the app on its own or learn from your demonstration. \nIn both cases, the agent generates documentation for elements interacted during the exploration/demonstration and \nsaves them for use in the deployment phase.\n#### Option 1: Autonomous Exploration\nThis solution features a fully autonomous exploration which allows the agent to explore the use of the app by attempting\nthe given task without any intervention from humans.\nTo start, run `learn.py` in the root directory. Follow the prompted instructions to select `autonomous exploration` \nas the operating mode and provide the app name and task description. Then, your agent will do the job for you. Under \nthis mode, AppAgent will reflect on its previous action making sure its action adheres to the given task and generate \ndocumentation for the elements explored.\n```bash\npython learn.py\n```\n#### Option 2: Learning from Human Demonstrations\nThis solution requires users to demonstrate a similar task first. AppAgent will learn from the demo and generate ",
+ "type": "code",
+ "location": "/README.md:107-127"
+ },
+ "15": {
+ "file_id": 0,
+ "content": "The code describes two options for using the AppAgent. Option 1 is autonomous exploration, where the agent can explore and learn from the app without human intervention. Option 2 involves learning from a human demonstration. Both methods generate documentation for elements interacted during exploration/demonstration for use in deployment. To start with Option 1, run \"learn.py\" and follow prompts to select autonomous exploration, provide the app name and task description.",
+ "type": "comment"
+ },
+ "16": {
+ "file_id": 0,
+ "content": "documentations for UI elements seen during the demo.\nTo start human demonstration, you should run `learn.py` in the root directory. Follow the prompted instructions to select \n`human demonstration` as the operating mode and provide the app name and task description. A screenshot of your phone \nwill be captured and all interactive elements shown on the screen will be labeled with numeric tags. You need to follow \nthe prompts to determine your next action and the target of the action. When you believe the demonstration is finished, \ntype `stop` to end the demo.\n```bash\npython learn.py\n```\n![](./assets/demo.png)\n### 📱 Step 4. Deployment Phase\nAfter the exploration phase finishes, you can run `run.py` in the root directory. Follow the prompted instructions to enter \nthe name of the app, select the appropriate documentation base you want the agent to use and provide the task \ndescription. Then, your agent will do the job for you. The agent will automatically detect if there is documentation \nbase generat",
+ "type": "code",
+ "location": "/README.md:128-147"
+ },
+ "17": {
+ "file_id": 0,
+ "content": "This code is providing instructions on how to run the human demonstration and the agent for an app using `learn.py` and `run.py` scripts in the root directory. The user needs to follow prompts to provide the app name, task description, and documentation base for the agent to function properly.",
+ "type": "comment"
+ },
+ "18": {
+ "file_id": 0,
+ "content": "ed before for the app; if there is no documentation found, you can also choose to run the agent without any \ndocumentation (success rate not guaranteed).\n```bash\npython run.py\n```\n## 💡 Tips\n- For an improved experience, you might permit AppAgent to undertake a broader range of tasks through autonomous exploration, or you can directly demonstrate more app functions to enhance the app documentation. Generally, the more extensive the documentation provided to the agent, the higher the likelihood of successful task completion.\n- It is always a good practice to inspect the documentation generated by the agent. When you find some documentation not accurately\n describe the function of the element, manually revising the documentation is also an option.\n## 📖 To-Do List\n- [ ] Open source the Benchmark.\n- [x] Open source the configuration.\n## 😉 Citation\n```bib\n@misc{yang2023appagent,\n title={AppAgent: Multimodal Agents as Smartphone Users}, \n author={Chi Zhang and Zhao Yang and Jiaxuan Liu and Yucheng Han and Xin Chen and Zebiao Huang and Bin Fu and Gang Yu},",
+ "type": "code",
+ "location": "/README.md:147-168"
+ },
+ "19": {
+ "file_id": 0,
+ "content": "This code is providing instructions on how to run the AppAgent and its associated tasks. The code suggests that for a better experience, users can permit AppAgent to explore more tasks autonomously or demonstrate more app functions to improve documentation. It also recommends inspecting the generated documentation by the agent and manually revising it if needed. Additionally, the code mentions open-sourcing the Benchmark and configuration files. Lastly, it provides a citation for AppAgent in the form of BibTeX format.",
+ "type": "comment"
+ },
+ "20": {
+ "file_id": 0,
+ "content": " year={2023},\n eprint={2312.13771},\n archivePrefix={arXiv},\n primaryClass={cs.CV}\n}\n```\n## Star History\n[![Star History Chart](https://api.star-history.com/svg?repos=mnotgod96/AppAgent&type=Date)](https://star-history.com/#mnotgod96/AppAgent&Date)\n## License\nThe [MIT license](./assets/license.txt).",
+ "type": "code",
+ "location": "/README.md:169-182"
+ },
+ "21": {
+ "file_id": 0,
+ "content": "The code specifies the publication details for a research paper. It includes the year (2023), eprint ID (2312.13771), archive prefix (arXiv), and primary class (cs.CV).",
+ "type": "comment"
+ },
+ "22": {
+ "file_id": 1,
+ "content": "/config.yaml",
+ "type": "filepath"
+ },
+ "23": {
+ "file_id": 1,
+ "content": "The code configures OpenAI API settings, GPT-4V request interval, Android screenshot and XML directories for an app agent. It also sets a round limit, dark mode, and minimum distance between elements for element labeling in the configuration file.",
+ "type": "summary"
+ },
+ "24": {
+ "file_id": 1,
+ "content": "OPENAI_API_BASE: \"https://api.openai.com/v1/chat/completions\"\nOPENAI_API_KEY: \"sk-\" # Set the value to sk-xxx if you host the openai interface for open llm model\nOPENAI_API_MODEL: \"gpt-4-vision-preview\" # The only OpenAI model by now that accepts visual input\nMAX_TOKENS: 300 # The max token limit for the response completion\nTEMPERATURE: 0.0 # The temperature of the model: the lower the value, the more consistent the output of the model\nREQUEST_INTERVAL: 10 # Time in seconds between consecutive GPT-4V requests\nANDROID_SCREENSHOT_DIR: \"/sdcard/Pictures/Screenshots\" # Set the directory on your Android device to store the intermediate screenshots. Make sure the directory EXISTS on your phone!\nANDROID_XML_DIR: \"/sdcard\" # Set the directory on your Android device to store the intermediate XML files used for determining locations of UI elements on your screen. Make sure the directory EXISTS on your phone!\nDOC_REFINE: false # Set this to true will make the agent refine existing documentation b",
+ "type": "code",
+ "location": "/config.yaml:1-11"
+ },
+ "25": {
+ "file_id": 1,
+ "content": "This code is configuring OpenAI API settings, GPT-4V request interval, Android screenshot and XML directories for an app agent.",
+ "type": "comment"
+ },
+ "26": {
+ "file_id": 1,
+ "content": "ased on the latest demonstration; otherwise, the agent will not regenerate a new documentation for elements with the same resource ID.\nMAX_ROUNDS: 20 # Set the round limit for the agent to complete the task\nDARK_MODE: false # Set this to true if your app is in dark mode to enhance the element labeling\nMIN_DIST: 30 # The minimum distance between elements to prevent overlapping during the labeling process",
+ "type": "code",
+ "location": "/config.yaml:11-14"
+ },
+ "27": {
+ "file_id": 1,
+ "content": "The configuration file sets round limit, dark mode, and minimum distance between elements for agent's element labeling.",
+ "type": "comment"
+ },
+ "28": {
+ "file_id": 2,
+ "content": "/learn.py",
+ "type": "filepath"
+ },
+ "29": {
+ "file_id": 2,
+ "content": "The code is for an argument parser in the exploration phase of AppAgent, enabling users to select between autonomous or human demonstration mode and specifying required parameters. It also includes a document generation script for running specified apps and demos.",
+ "type": "summary"
+ },
+ "30": {
+ "file_id": 2,
+ "content": "import argparse\nimport datetime\nimport os\nimport time\nfrom scripts.utils import print_with_color\narg_desc = \"AppAgent - exploration phase\"\nparser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=arg_desc)\nparser.add_argument(\"--app\")\nparser.add_argument(\"--root_dir\", default=\"./\")\nargs = vars(parser.parse_args())\napp = args[\"app\"]\nroot_dir = args[\"root_dir\"]\nprint_with_color(\"Welcome to the exploration phase of AppAgent!\\nThe exploration phase aims at generating \"\n \"documentations for UI elements through either autonomous exploration or human demonstration. \"\n \"Both options are task-oriented, which means you need to give a task description. During \"\n \"autonomous exploration, the agent will try to complete the task by interacting with possible \"\n \"elements on the UI within limited rounds. Documentations will be generated during the process of \"\n \"interacting with the correct elements to proceed with the task. Human demonstration relies on \"",
+ "type": "code",
+ "location": "/learn.py:1-23"
+ },
+ "31": {
+ "file_id": 2,
+ "content": "This code is for an argument parser in the exploration phase of AppAgent. It allows users to input app and root directory, then provides a description of the phase's purpose: generating documentations for UI elements through autonomous exploration or human demonstration. The task-oriented approach requires giving task descriptions.",
+ "type": "comment"
+ },
+ "32": {
+ "file_id": 2,
+ "content": " \"the user to show the agent how to complete the given task, and the agent will generate \"\n \"documentations for the elements interacted during the human demo. To start, please enter the \"\n \"main interface of the app on your phone.\", \"yellow\")\nprint_with_color(\"Choose from the following modes:\\n1. autonomous exploration\\n2. human demonstration\\n\"\n \"Type 1 or 2.\", \"blue\")\nuser_input = \"\"\nwhile user_input != \"1\" and user_input != \"2\":\n user_input = input()\nif not app:\n print_with_color(\"What is the name of the target app?\", \"blue\")\n app = input()\n app = app.replace(\" \", \"\")\nif user_input == \"1\":\n os.system(f\"python scripts/self_explorer.py --app {app} --root_dir {root_dir}\")\nelse:\n demo_timestamp = int(time.time())\n demo_name = datetime.datetime.fromtimestamp(demo_timestamp).strftime(f\"demo_{app}_%Y-%m-%d_%H-%M-%S\")\n os.system(f\"python scripts/step_recorder.py --app {app} --demo {demo_name} --root_dir {root_dir}\")\n o",
+ "type": "code",
+ "location": "/learn.py:24-44"
+ },
+ "33": {
+ "file_id": 2,
+ "content": "This code asks the user to choose between autonomous exploration or human demonstration mode for the app agent. If \"1\" is entered, it starts autonomous exploration using specific script with app and root_dir parameters. If \"2\" is entered, it begins a human demonstration by creating a demo name and running another script for step recording with app, demo name, and root_dir parameters.",
+ "type": "comment"
+ },
+ "34": {
+ "file_id": 2,
+ "content": "s.system(f\"python scripts/document_generation.py --app {app} --demo {demo_name} --root_dir {root_dir}\")",
+ "type": "code",
+ "location": "/learn.py:44-44"
+ },
+ "35": {
+ "file_id": 2,
+ "content": "Running document generation script for specified app and demo.",
+ "type": "comment"
+ },
+ "36": {
+ "file_id": 3,
+ "content": "/requirements.txt",
+ "type": "filepath"
+ },
+ "37": {
+ "file_id": 3,
+ "content": "List of dependencies for the project: argparse, colorama, opencv-python, pyshtine, pyyaml, requests",
+ "type": "summary"
+ },
+ "38": {
+ "file_id": 3,
+ "content": "argparse\ncolorama\nopencv-python\npyshine\npyyaml\nrequests",
+ "type": "code",
+ "location": "/requirements.txt:1-6"
+ },
+ "39": {
+ "file_id": 3,
+ "content": "List of dependencies for the project: argparse, colorama, opencv-python, pyshtine, pyyaml, requests",
+ "type": "comment"
+ },
+ "40": {
+ "file_id": 4,
+ "content": "/run.py",
+ "type": "filepath"
+ },
+ "41": {
+ "file_id": 4,
+ "content": "Code imports necessary modules, sets up an argument parser, and retrieves app name from user input before executing a task.",
+ "type": "summary"
+ },
+ "42": {
+ "file_id": 4,
+ "content": "import argparse\nimport os\nfrom scripts.utils import print_with_color\narg_desc = \"AppAgent - deployment phase\"\nparser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=arg_desc)\nparser.add_argument(\"--app\")\nparser.add_argument(\"--root_dir\", default=\"./\")\nargs = vars(parser.parse_args())\napp = args[\"app\"]\nroot_dir = args[\"root_dir\"]\nprint_with_color(\"Welcome to the deployment phase of AppAgent!\\nBefore giving me the task, you should first tell me \"\n \"the name of the app you want me to operate and what documentation base you want me to use. I will \"\n \"try my best to complete the task without your intervention. First, please enter the main interface \"\n \"of the app on your phone and provide the following information.\", \"yellow\")\nif not app:\n print_with_color(\"What is the name of the target app?\", \"blue\")\n app = input()\n app = app.replace(\" \", \"\")\nos.system(f\"python scripts/task_executor.py --app {app} --root_dir {root_dir}\")",
+ "type": "code",
+ "location": "/run.py:1-25"
+ },
+ "43": {
+ "file_id": 4,
+ "content": "Code imports necessary modules, sets up an argument parser, and retrieves app name from user input before executing a task.",
+ "type": "comment"
+ },
+ "44": {
+ "file_id": 5,
+ "content": "/scripts/and_controller.py",
+ "type": "filepath"
+ },
+ "45": {
+ "file_id": 5,
+ "content": "The code involves Android class definitions, adb command execution, unique identifier generation from XML attributes, and swipe actions with precision and duration options.",
+ "type": "summary"
+ },
+ "46": {
+ "file_id": 5,
+ "content": "import os\nimport subprocess\nimport xml.etree.ElementTree as ET\nfrom config import load_config\nfrom utils import print_with_color\nconfigs = load_config()\nclass AndroidElement:\n def __init__(self, uid, bbox, attrib):\n self.uid = uid\n self.bbox = bbox\n self.attrib = attrib\ndef execute_adb(adb_command):\n # print(adb_command)\n result = subprocess.run(adb_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)\n if result.returncode == 0:\n return result.stdout.strip()\n print_with_color(f\"Command execution failed: {adb_command}\", \"red\")\n print_with_color(result.stderr, \"red\")\n return \"ERROR\"\ndef list_all_devices():\n adb_command = \"adb devices\"\n device_list = []\n result = execute_adb(adb_command)\n if result != \"ERROR\":\n devices = result.split(\"\\n\")[1:]\n for d in devices:\n device_list.append(d.split()[0])\n return device_list\ndef get_id_from_element(elem):\n bounds = elem.attrib[\"bounds\"][1:-1].split(\"][\")\n x1, y1 = map(int, bounds[0].split(\",\"))",
+ "type": "code",
+ "location": "/scripts/and_controller.py:1-43"
+ },
+ "47": {
+ "file_id": 5,
+ "content": "Imports required libraries and defines a class for Android elements, function to execute ADB commands, lists all connected devices, and extracts ID from an Android element's bounds.",
+ "type": "comment"
+ },
+ "48": {
+ "file_id": 5,
+ "content": " x2, y2 = map(int, bounds[1].split(\",\"))\n elem_w, elem_h = x2 - x1, y2 - y1\n if \"resource-id\" in elem.attrib and elem.attrib[\"resource-id\"]:\n elem_id = elem.attrib[\"resource-id\"].replace(\":\", \".\").replace(\"/\", \"_\")\n else:\n elem_id = f\"{elem.attrib['class']}_{elem_w}_{elem_h}\"\n if \"content-desc\" in elem.attrib and elem.attrib[\"content-desc\"] and len(elem.attrib[\"content-desc\"]) < 20:\n content_desc = elem.attrib['content-desc'].replace(\"/\", \"_\").replace(\" \", \"\").replace(\":\", \"_\")\n elem_id += f\"_{content_desc}\"\n return elem_id\ndef traverse_tree(xml_path, elem_list, attrib, add_index=False):\n path = []\n for event, elem in ET.iterparse(xml_path, ['start', 'end']):\n if event == 'start':\n path.append(elem)\n if attrib in elem.attrib and elem.attrib[attrib] == \"true\":\n parent_prefix = \"\"\n if len(path) > 1:\n parent_prefix = get_id_from_element(path[-2])\n bounds = elem.attrib[\"bounds\"][1:-1].split(\"][\")",
+ "type": "code",
+ "location": "/scripts/and_controller.py:44-65"
+ },
+ "49": {
+ "file_id": 5,
+ "content": "This code snippet is parsing an XML file and generating unique identifiers for elements within the file. It extracts attributes such as resource-id, class, content-desc, and dimensions of each element to form the identifier. The function \"get_id_from_element\" generates the identifier based on these attributes, and the \"traverse_tree\" function traverses the XML tree, applying certain conditions to generate identifiers for elements that meet those criteria.",
+ "type": "comment"
+ },
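As a rough, self-contained illustration of the identifier logic described above (a re-sketch of the repository code, not an addition to it), a bounds string such as "[0,96][1080,320]" is split into two corner coordinates, and the element ID is built from the resource-id when present, otherwise from the class name plus the element's width and height:

```python
# Illustrative reimplementation of the ID construction; attribute names follow the
# uiautomator XML dump ("bounds", "resource-id", "class", "content-desc").
def build_elem_id(attrib: dict) -> str:
    bounds = attrib["bounds"][1:-1].split("][")   # "[0,96][1080,320]" -> ["0,96", "1080,320"]
    x1, y1 = map(int, bounds[0].split(","))
    x2, y2 = map(int, bounds[1].split(","))
    if attrib.get("resource-id"):
        elem_id = attrib["resource-id"].replace(":", ".").replace("/", "_")
    else:
        elem_id = f"{attrib['class']}_{x2 - x1}_{y2 - y1}"
    desc = attrib.get("content-desc", "")
    if desc and len(desc) < 20:
        elem_id += "_" + desc.replace("/", "_").replace(" ", "").replace(":", "_")
    return elem_id

# Example:
# build_elem_id({"bounds": "[0,96][1080,320]",
#                "resource-id": "com.app:id/button",
#                "class": "android.widget.Button"})
# -> "com.app.id_button"
```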
+ "50": {
+ "file_id": 5,
+ "content": " x1, y1 = map(int, bounds[0].split(\",\"))\n x2, y2 = map(int, bounds[1].split(\",\"))\n center = (x1 + x2) // 2, (y1 + y2) // 2\n elem_id = get_id_from_element(elem)\n if parent_prefix:\n elem_id = parent_prefix + \"_\" + elem_id\n if add_index:\n elem_id += f\"_{elem.attrib['index']}\"\n close = False\n for e in elem_list:\n bbox = e.bbox\n center_ = (bbox[0][0] + bbox[1][0]) // 2, (bbox[0][1] + bbox[1][1]) // 2\n dist = (abs(center[0] - center_[0]) ** 2 + abs(center[1] - center_[1]) ** 2) ** 0.5\n if dist <= configs[\"MIN_DIST\"]:\n close = True\n break\n if not close:\n elem_list.append(AndroidElement(elem_id, ((x1, y1), (x2, y2)), attrib))\n if event == 'end':\n path.pop()\nclass AndroidController:\n def __init__(self, device):",
+ "type": "code",
+ "location": "/scripts/and_controller.py:66-90"
+ },
+ "51": {
+ "file_id": 5,
+ "content": "Functionality: This code creates Android elements based on their bounding box coordinates and appends them to a list.\n\nExplanation: The code checks if the given element is close enough to an existing element in the list by comparing their bounding box centers' distance. If it's not close, it creates a new AndroidElement object with the provided ID, bounding box coordinates, and attributes, and appends it to the list. If it's already close, it skips creating a new element. At the end of the event (presumably loop), if 'end' is reached, the code removes the top element from the path stack.",
+ "type": "comment"
+ },
+ "52": {
+ "file_id": 5,
+ "content": " self.device = device\n self.screenshot_dir = configs[\"ANDROID_SCREENSHOT_DIR\"]\n self.xml_dir = configs[\"ANDROID_XML_DIR\"]\n self.width, self.height = self.get_device_size()\n self.backslash = \"\\\\\"\n def get_device_size(self):\n adb_command = f\"adb -s {self.device} shell wm size\"\n result = execute_adb(adb_command)\n if result != \"ERROR\":\n return map(int, result.split(\": \")[1].split(\"x\"))\n return 0, 0\n def get_screenshot(self, prefix, save_dir):\n cap_command = f\"adb -s {self.device} shell screencap -p \" \\\n f\"{os.path.join(self.screenshot_dir, prefix + '.png').replace(self.backslash, '/')}\"\n pull_command = f\"adb -s {self.device} pull \" \\\n f\"{os.path.join(self.screenshot_dir, prefix + '.png').replace(self.backslash, '/')} \" \\\n f\"{os.path.join(save_dir, prefix + '.png')}\"\n result = execute_adb(cap_command)\n if result != \"ERROR\":\n result = execute_adb(pull_command)",
+ "type": "code",
+ "location": "/scripts/and_controller.py:91-112"
+ },
+ "53": {
+ "file_id": 5,
+ "content": "This code is part of an Android controller that handles device-related operations. It sets the device, screenshot directory, XML directory, and gets the device's width and height. The `get_device_size` function retrieves the screen size using the ADB command, and `get_screenshot` takes a prefix and save directory to capture and save a screenshot.",
+ "type": "comment"
+ },
+ "54": {
+ "file_id": 5,
+ "content": " if result != \"ERROR\":\n return os.path.join(save_dir, prefix + \".png\")\n return result\n return result\n def get_xml(self, prefix, save_dir):\n dump_command = f\"adb -s {self.device} shell uiautomator dump \" \\\n f\"{os.path.join(self.xml_dir, prefix + '.xml').replace(self.backslash, '/')}\"\n pull_command = f\"adb -s {self.device} pull \" \\\n f\"{os.path.join(self.xml_dir, prefix + '.xml').replace(self.backslash, '/')} \" \\\n f\"{os.path.join(save_dir, prefix + '.xml')}\"\n result = execute_adb(dump_command)\n if result != \"ERROR\":\n result = execute_adb(pull_command)\n if result != \"ERROR\":\n return os.path.join(save_dir, prefix + \".xml\")\n return result\n return result\n def back(self):\n adb_command = f\"adb -s {self.device} shell input keyevent KEYCODE_BACK\"\n ret = execute_adb(adb_command)\n return ret\n def tap(self, x, y):",
+ "type": "code",
+ "location": "/scripts/and_controller.py:113-137"
+ },
+ "55": {
+ "file_id": 5,
+ "content": "This code defines a class with three methods. The `and_controller` class allows executing commands on an Android device using adb (Android Debug Bridge).\n\nThe `back()` method sends the back key event to the device.\n\nThe `get_xml(prefix, save_dir)` method dumps and pulls an XML file from the device to the specified save directory, returning the saved file path if successful; otherwise, it returns any error message.\n\nThe `execute_adb(command)` function is used to execute adb commands but its implementation is not shown in this code block.",
+ "type": "comment"
+ },
+ "56": {
+ "file_id": 5,
+ "content": " adb_command = f\"adb -s {self.device} shell input tap {x} {y}\"\n ret = execute_adb(adb_command)\n return ret\n def text(self, input_str):\n input_str = input_str.replace(\" \", \"%s\")\n input_str = input_str.replace(\"'\", \"\")\n adb_command = f\"adb -s {self.device} shell input text {input_str}\"\n ret = execute_adb(adb_command)\n return ret\n def long_press(self, x, y, duration=1000):\n adb_command = f\"adb -s {self.device} shell input swipe {x} {y} {x} {y} {duration}\"\n ret = execute_adb(adb_command)\n return ret\n def swipe(self, x, y, direction, dist=\"medium\", quick=False):\n unit_dist = int(self.width / 10)\n if dist == \"long\":\n unit_dist *= 3\n elif dist == \"medium\":\n unit_dist *= 2\n if direction == \"up\":\n offset = 0, -2 * unit_dist\n elif direction == \"down\":\n offset = 0, 2 * unit_dist\n elif direction == \"left\":\n offset = -1 * unit_dist, 0\n elif direction == \"right\":",
+ "type": "code",
+ "location": "/scripts/and_controller.py:138-166"
+ },
+ "57": {
+ "file_id": 5,
+ "content": "The code above contains four methods: \"tap\", \"text\", \"long_press\", and \"swipe\". Each method takes specific arguments such as (x, y) coordinates for taps and swipes, input text for text input, and duration for long press. The methods execute adb commands on a connected device to perform the specified action.",
+ "type": "comment"
+ },
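For orientation, a hedged usage sketch of the controller methods described above; the device serial and coordinates are placeholders, and it assumes adb is installed, the repository root is the working directory (so `./config.yaml` resolves), and the `scripts` directory is on the import path, as when the project's own scripts run.

```python
# Hypothetical usage of AndroidController from scripts/and_controller.py.
from and_controller import AndroidController, list_all_devices

devices = list_all_devices()                        # e.g. ["emulator-5554"]
controller = AndroidController(devices[0])

controller.tap(540, 1200)                           # tap at absolute screen coordinates
controller.text("Hello world")                      # type into the focused input field
controller.long_press(540, 1200)                    # press and hold (~1 s default)
controller.swipe(540, 1200, "up", dist="medium")    # swipe up from that point
```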
+ "58": {
+ "file_id": 5,
+ "content": " offset = unit_dist, 0\n else:\n return \"ERROR\"\n duration = 100 if quick else 400\n adb_command = f\"adb -s {self.device} shell input swipe {x} {y} {x+offset[0]} {y+offset[1]} {duration}\"\n ret = execute_adb(adb_command)\n return ret\n def swipe_precise(self, start, end, duration=400):\n start_x, start_y = start\n end_x, end_y = end\n adb_command = f\"adb -s {self.device} shell input swipe {start_x} {start_x} {end_x} {end_y} {duration}\"\n ret = execute_adb(adb_command)\n return ret",
+ "type": "code",
+ "location": "/scripts/and_controller.py:167-180"
+ },
+ "59": {
+ "file_id": 5,
+ "content": "Code performs swipe actions on a device using ADB (Android Debug Bridge) commands. It allows for different swipe durations based on the \"quick\" parameter and has two functions: \"swipe\" and \"swipe_precise\".",
+ "type": "comment"
+ },
+ "60": {
+ "file_id": 6,
+ "content": "/scripts/config.py",
+ "type": "filepath"
+ },
+ "61": {
+ "file_id": 6,
+ "content": "Function to load configuration from a YAML file, merging it with environment variables.",
+ "type": "summary"
+ },
+ "62": {
+ "file_id": 6,
+ "content": "import os\nimport yaml\ndef load_config(config_path=\"./config.yaml\"):\n configs = dict(os.environ)\n with open(config_path, \"r\") as file:\n yaml_data = yaml.safe_load(file)\n configs.update(yaml_data)\n return configs",
+ "type": "code",
+ "location": "/scripts/config.py:1-10"
+ },
+ "63": {
+ "file_id": 6,
+ "content": "Function to load configuration from a YAML file, merging it with environment variables.",
+ "type": "comment"
+ },
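A small usage example of this helper, assuming the repository root is the working directory (so `./config.yaml` resolves) and the `scripts` directory is on the import path, as is the case when the project's own scripts are launched:

```python
from config import load_config

configs = load_config()              # environment variables merged with config.yaml values
print(configs["OPENAI_API_MODEL"])   # e.g. "gpt-4-vision-preview"
print(configs["REQUEST_INTERVAL"])   # e.g. 10 (seconds between GPT-4V requests)
```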
+ "64": {
+ "file_id": 7,
+ "content": "/scripts/document_generation.py",
+ "type": "filepath"
+ },
+ "65": {
+ "file_id": 7,
+ "content": "The code sets up arguments for the \"AppAgent - Human Demonstration\" program, creates directories, processes lines from a record file, encodes images and extracts action types and parameters. It handles user actions by generating prompts with regular expressions and includes an else block to check for existing documents and refine them if enabled. The code waits for GPT-4V to generate documentation, constructs content including prompts and images, updates `doc_content`, logs entries, handles errors, and sleeps between requests.",
+ "type": "summary"
+ },
+ "66": {
+ "file_id": 7,
+ "content": "import argparse\nimport ast\nimport json\nimport os\nimport re\nimport sys\nimport time\nimport prompts\nfrom config import load_config\nfrom model import ask_gpt4v\nfrom utils import print_with_color, encode_image\narg_desc = \"AppAgent - Human Demonstration\"\nparser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=arg_desc)\nparser.add_argument(\"--app\", required=True)\nparser.add_argument(\"--demo\", required=True)\nparser.add_argument(\"--root_dir\", default=\"./\")\nargs = vars(parser.parse_args())\nconfigs = load_config()\nroot_dir = args[\"root_dir\"]\nwork_dir = os.path.join(root_dir, \"apps\")\nif not os.path.exists(work_dir):\n os.mkdir(work_dir)\napp = args[\"app\"]\nwork_dir = os.path.join(work_dir, app)\ndemo_dir = os.path.join(work_dir, \"demos\")\ndemo_name = args[\"demo\"]\ntask_dir = os.path.join(demo_dir, demo_name)\nxml_dir = os.path.join(task_dir, \"xml\")\nlabeled_ss_dir = os.path.join(task_dir, \"labeled_screenshots\")\nrecord_path = os.path.join(task_dir, \"record.txt\")\ntask_desc_path = os.path.join(task_dir, \"task_desc.txt\")",
+ "type": "code",
+ "location": "/scripts/document_generation.py:1-35"
+ },
+ "67": {
+ "file_id": 7,
+ "content": "This code is setting up arguments for a program called \"AppAgent - Human Demonstration\". It specifies required parameters such as the app and demo to be used. The code also creates directories if they do not exist, and defines paths for various files and directories related to the task at hand.",
+ "type": "comment"
+ },
+ "68": {
+ "file_id": 7,
+ "content": "if not os.path.exists(task_dir) or not os.path.exists(xml_dir) or not os.path.exists(labeled_ss_dir) \\\n or not os.path.exists(record_path) or not os.path.exists(task_desc_path):\n sys.exit()\nlog_path = os.path.join(task_dir, f\"log_{app}_{demo_name}.txt\")\ndocs_dir = os.path.join(work_dir, \"demo_docs\")\nif not os.path.exists(docs_dir):\n os.mkdir(docs_dir)\nprint_with_color(f\"Starting to generate documentations for the app {app} based on the demo {demo_name}\", \"yellow\")\ndoc_count = 0\nwith open(record_path, \"r\") as infile:\n step = len(infile.readlines()) - 1\n infile.seek(0)\n for i in range(1, step + 1):\n img_before = encode_image(os.path.join(labeled_ss_dir, f\"{demo_name}_{i}.png\"))\n img_after = encode_image(os.path.join(labeled_ss_dir, f\"{demo_name}_{i + 1}.png\"))\n rec = infile.readline().strip()\n action, resource_id = rec.split(\":::\")\n action_type = action.split(\"(\")[0]\n action_param = re.findall(r\"\\((.*?)\\)\", action)[0]\n if action_type == \"tap\":",
+ "type": "code",
+ "location": "/scripts/document_generation.py:36-57"
+ },
+ "69": {
+ "file_id": 7,
+ "content": "Code is checking if certain directories exist and creating a directory for document generation. It then reads from a record file, processes each line, encoding images before_and_after the step, extracting action type and parameters.",
+ "type": "comment"
+ },
+ "70": {
+ "file_id": 7,
+ "content": " prompt_template = prompts.tap_doc_template\n prompt = re.sub(r\"\", action_param, prompt_template)\n elif action_type == \"text\":\n input_area, input_text = action_param.split(\":sep:\")\n prompt_template = prompts.text_doc_template\n prompt = re.sub(r\"\", input_area, prompt_template)\n elif action_type == \"long_press\":\n prompt_template = prompts.long_press_doc_template\n prompt = re.sub(r\"\", action_param, prompt_template)\n elif action_type == \"swipe\":\n swipe_area, swipe_dir = action_param.split(\":sep:\")\n if swipe_dir == \"up\" or swipe_dir == \"down\":\n action_type = \"v_swipe\"\n elif swipe_dir == \"left\" or swipe_dir == \"right\":\n action_type = \"h_swipe\"\n prompt_template = prompts.swipe_doc_template\n prompt = re.sub(r\"\", swipe_dir, prompt_template)\n prompt = re.sub(r\"\", swipe_area, prompt)",
+ "type": "code",
+ "location": "/scripts/document_generation.py:58-75"
+ },
+ "71": {
+ "file_id": 7,
+ "content": "This code handles different user actions and generates prompts based on the action type. It uses regular expressions to replace placeholders in prompt templates with specific action parameters.",
+ "type": "comment"
+ },
+ "72": {
+ "file_id": 7,
+ "content": " else:\n break\n task_desc = open(task_desc_path, \"r\").read()\n prompt = re.sub(r\"\", task_desc, prompt)\n doc_name = resource_id + \".txt\"\n doc_path = os.path.join(docs_dir, doc_name)\n if os.path.exists(doc_path):\n doc_content = ast.literal_eval(open(doc_path).read())\n if doc_content[action_type]:\n if configs[\"DOC_REFINE\"]:\n suffix = re.sub(r\"\", doc_content[action_type], prompts.refine_doc_suffix)\n prompt += suffix\n print_with_color(f\"Documentation for the element {resource_id} already exists. The doc will be \"\n f\"refined based on the latest demo.\", \"yellow\")\n else:\n print_with_color(f\"Documentation for the element {resource_id} already exists. Turn on DOC_REFINE \"\n f\"in the config file if needed.\", \"yellow\")\n continue\n else:",
+ "type": "code",
+ "location": "/scripts/document_generation.py:76-96"
+ },
+ "73": {
+ "file_id": 7,
+ "content": "Else block: checks if a document for the current task already exists and refines it if DOC_REFINE is enabled in the config file.",
+ "type": "comment"
+ },
+ "74": {
+ "file_id": 7,
+ "content": " doc_content = {\n \"tap\": \"\",\n \"text\": \"\",\n \"v_swipe\": \"\",\n \"h_swipe\": \"\",\n \"long_press\": \"\"\n }\n print_with_color(f\"Waiting for GPT-4V to generate documentation for the element {resource_id}\", \"yellow\")\n content = [\n {\n \"type\": \"text\",\n \"text\": prompt\n },\n {\n \"type\": \"image_url\",\n \"image_url\": {\n \"url\": f\"data:image/jpeg;base64,{img_before}\"\n }\n },\n {\n \"type\": \"image_url\",\n \"image_url\": {\n \"url\": f\"data:image/jpeg;base64,{img_after}\"\n }\n }\n ]\n rsp = ask_gpt4v(content)\n if \"error\" not in rsp:\n msg = rsp[\"choices\"][0][\"message\"][\"content\"]\n doc_content[action_type] = msg\n with open(log_path, \"a\") as logfile:\n log_item = {\"step\": i, \"prompt\": prompt, \"image_before\": f\"{demo_name}_{i}.png\",",
+ "type": "code",
+ "location": "/scripts/document_generation.py:97-130"
+ },
+ "75": {
+ "file_id": 7,
+ "content": "The code is waiting for GPT-4V to generate documentation for an element with the resource ID. It then constructs content, possibly a prompt and two images before and after an action on the element. If there are no errors in the response from GPT-4V, it updates the `doc_content` dictionary with the generated message, and writes a log entry with the step number, prompt, and image names.",
+ "type": "comment"
+ },
+ "76": {
+ "file_id": 7,
+ "content": " \"image_after\": f\"{demo_name}_{i + 1}.png\", \"response\": rsp}\n logfile.write(json.dumps(log_item) + \"\\n\")\n with open(doc_path, \"w\") as outfile:\n outfile.write(str(doc_content))\n doc_count += 1\n print_with_color(f\"Documentation generated and saved to {doc_path}\", \"yellow\")\n else:\n print_with_color(rsp[\"error\"][\"message\"], \"red\")\n time.sleep(configs[\"REQUEST_INTERVAL\"])\nprint_with_color(f\"Documentation generation phase completed. {doc_count} docs generated.\", \"yellow\")",
+ "type": "code",
+ "location": "/scripts/document_generation.py:131-141"
+ },
+ "77": {
+ "file_id": 7,
+ "content": "Generates and saves documents, writes log entries, handles errors with colorful output, sleeps for a specified interval between requests.",
+ "type": "comment"
+ },
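For reference, each generated doc file is a Python dict literal keyed by action type and is read back with `ast.literal_eval`, as in the chunks above. A sketch of reading one such file (the app name and file name below are made up):

```python
# Hypothetical reader for a generated doc file under apps/<app>/demo_docs/<resource_id>.txt.
import ast

with open("apps/MyApp/demo_docs/com.app.id_button.txt") as f:   # placeholder path
    doc_content = ast.literal_eval(f.read())

print(sorted(doc_content))       # ['h_swipe', 'long_press', 'tap', 'text', 'v_swipe']
print(doc_content["tap"])        # description generated by GPT-4V for tapping this element
```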
+ "78": {
+ "file_id": 8,
+ "content": "/scripts/model.py",
+ "type": "filepath"
+ },
+ "79": {
+ "file_id": 8,
+ "content": "The code imports modules, loads configuration, defines functions for requesting OpenAI API, parsing response JSON, extracting information, printing with color formatting, handling model responses and exceptions, and deciding/formatting actions based on the act name.",
+ "type": "summary"
+ },
+ "80": {
+ "file_id": 8,
+ "content": "import re\nimport requests\nfrom config import load_config\nfrom utils import print_with_color\nconfigs = load_config()\ndef ask_gpt4v(content):\n headers = {\n \"Content-Type\": \"application/json\",\n \"Authorization\": f\"Bearer {configs['OPENAI_API_KEY']}\"\n }\n payload = {\n \"model\": configs[\"OPENAI_API_MODEL\"],\n \"messages\": [\n {\n \"role\": \"user\",\n \"content\": content\n }\n ],\n \"temperature\": configs[\"TEMPERATURE\"],\n \"max_tokens\": configs[\"MAX_TOKENS\"]\n }\n response = requests.post(configs[\"OPENAI_API_BASE\"], headers=headers, json=payload)\n if \"error\" not in response.json():\n usage = response.json()[\"usage\"]\n prompt_tokens = usage[\"prompt_tokens\"]\n completion_tokens = usage[\"completion_tokens\"]\n print_with_color(f\"Request cost is \"\n f\"${'{0:.2f}'.format(prompt_tokens / 1000 * 0.01 + completion_tokens / 1000 * 0.03)}\",\n \"yellow\")\n return response.json()",
+ "type": "code",
+ "location": "/scripts/model.py:1-34"
+ },
+ "81": {
+ "file_id": 8,
+ "content": "This code imports necessary modules and loads configuration from a file. It then defines a function `ask_gpt4v` that sends a request to an OpenAI API using provided configuration, returns the response JSON, and prints the request cost if there is no error in the response.",
+ "type": "comment"
+ },
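A hedged sketch of how `ask_gpt4v` is invoked elsewhere in the project, with a text prompt plus a base64-encoded screenshot; the image path and prompt are placeholders, and it assumes the same working-directory and import setup as the project's own scripts.

```python
from model import ask_gpt4v
from utils import encode_image   # returns the image file as a base64 string

img_b64 = encode_image("screenshot_1.png")   # placeholder path
content = [
    {"type": "text", "text": "Describe the labeled UI elements on this screen."},
    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img_b64}"}},
]

rsp = ask_gpt4v(content)
if "error" not in rsp:
    print(rsp["choices"][0]["message"]["content"])
```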
+ "82": {
+ "file_id": 8,
+ "content": "def parse_explore_rsp(rsp):\n try:\n msg = rsp[\"choices\"][0][\"message\"][\"content\"]\n observation = re.findall(r\"Observation: (.*?)$\", msg, re.MULTILINE)[0]\n think = re.findall(r\"Thought: (.*?)$\", msg, re.MULTILINE)[0]\n act = re.findall(r\"Action: (.*?)$\", msg, re.MULTILINE)[0]\n last_act = re.findall(r\"Summary: (.*?)$\", msg, re.MULTILINE)[0]\n print_with_color(\"Observation:\", \"yellow\")\n print_with_color(observation, \"magenta\")\n print_with_color(\"Thought:\", \"yellow\")\n print_with_color(think, \"magenta\")\n print_with_color(\"Action:\", \"yellow\")\n print_with_color(act, \"magenta\")\n print_with_color(\"Summary:\", \"yellow\")\n print_with_color(last_act, \"magenta\")\n if \"FINISH\" in act:\n return [\"FINISH\"]\n act_name = act.split(\"(\")[0]\n if act_name == \"tap\":\n area = int(re.findall(r\"tap\\((.*?)\\)\", act)[0])\n return [act_name, area, last_act]\n elif act_name == \"text\":\n input_str = re.findall(r\"text\\((.*?)\\)\", act)[0][1:-1]",
+ "type": "code",
+ "location": "/scripts/model.py:37-59"
+ },
+ "83": {
+ "file_id": 8,
+ "content": "Function `parse_explore_rsp` parses a response and extracts observation, thought, action, and summary. It then prints observation, thought, action, and summary with colors, and returns the action name, area (if action is 'tap'), and last_act (if action is 'text') or finishes if \"FINISH\" found in action.",
+ "type": "comment"
+ },
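To illustrate the response format this parser expects (the message text below is invented for the example), a reply laid out as Observation/Thought/Action/Summary parses like this:

```python
# Hypothetical OpenAI-style response; only the fields read by parse_explore_rsp are filled in.
from model import parse_explore_rsp

rsp = {
    "choices": [{
        "message": {
            "content": (
                "Observation: The screen shows a list of contacts.\n"
                "Thought: I should open the first contact to proceed.\n"
                "Action: tap(3)\n"
                "Summary: Tapped the first contact in the list."
            )
        }
    }]
}

print(parse_explore_rsp(rsp))   # -> ["tap", 3, "Tapped the first contact in the list."]
```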
+ "84": {
+ "file_id": 8,
+ "content": " return [act_name, input_str, last_act]\n elif act_name == \"long_press\":\n area = int(re.findall(r\"long_press\\((.*?)\\)\", act)[0])\n return [act_name, area, last_act]\n elif act_name == \"swipe\":\n params = re.findall(r\"swipe\\((.*?)\\)\", act)[0]\n area, swipe_dir, dist = params.split(\",\")\n area = int(area)\n swipe_dir = swipe_dir.strip()[1:-1]\n dist = dist.strip()[1:-1]\n return [act_name, area, swipe_dir, dist, last_act]\n elif act_name == \"grid\":\n return [act_name]\n else:\n print_with_color(f\"ERROR: Undefined act {act_name}!\", \"red\")\n return [\"ERROR\"]\n except Exception as e:\n print_with_color(f\"ERROR: an exception occurs while parsing the model response: {e}\", \"red\")\n print_with_color(rsp, \"red\")\n return [\"ERROR\"]\ndef parse_grid_rsp(rsp):\n try:\n msg = rsp[\"choices\"][0][\"message\"][\"content\"]\n observation = re.findall(r\"Observation: (.*?)$\", msg, re.MULTILINE)[0]",
+ "type": "code",
+ "location": "/scripts/model.py:60-85"
+ },
+ "85": {
+ "file_id": 8,
+ "content": "This code is parsing a response from a model and returns different information based on the type of action specified in the response. If an undefined action or error occurs, it prints an error message. The \"parse_grid_rsp\" function specifically handles grid actions.",
+ "type": "comment"
+ },
+ "86": {
+ "file_id": 8,
+ "content": " think = re.findall(r\"Thought: (.*?)$\", msg, re.MULTILINE)[0]\n act = re.findall(r\"Action: (.*?)$\", msg, re.MULTILINE)[0]\n last_act = re.findall(r\"Summary: (.*?)$\", msg, re.MULTILINE)[0]\n print_with_color(\"Observation:\", \"yellow\")\n print_with_color(observation, \"magenta\")\n print_with_color(\"Thought:\", \"yellow\")\n print_with_color(think, \"magenta\")\n print_with_color(\"Action:\", \"yellow\")\n print_with_color(act, \"magenta\")\n print_with_color(\"Summary:\", \"yellow\")\n print_with_color(last_act, \"magenta\")\n if \"FINISH\" in act:\n return [\"FINISH\"]\n act_name = act.split(\"(\")[0]\n if act_name == \"tap\":\n params = re.findall(r\"tap\\((.*?)\\)\", act)[0].split(\",\")\n area = int(params[0].strip())\n subarea = params[1].strip()[1:-1]\n return [act_name + \"_grid\", area, subarea, last_act]\n elif act_name == \"long_press\":\n params = re.findall(r\"long_press\\((.*?)\\)\", act)[0].split(\",\")",
+ "type": "code",
+ "location": "/scripts/model.py:86-106"
+ },
+ "87": {
+ "file_id": 8,
+ "content": "Extracts observation, thought, action, and summary from the message string. Displays them with color formatting. If \"FINISH\" is found in the action, it returns [\"FINISH\"]. For actions \"tap\", extracts grid area and subarea parameters. If \"long_press\" found, extracts the parameters.",
+ "type": "comment"
+ },
+ "88": {
+ "file_id": 8,
+ "content": " area = int(params[0].strip())\n subarea = params[1].strip()[1:-1]\n return [act_name + \"_grid\", area, subarea, last_act]\n elif act_name == \"swipe\":\n params = re.findall(r\"swipe\\((.*?)\\)\", act)[0].split(\",\")\n start_area = int(params[0].strip())\n start_subarea = params[1].strip()[1:-1]\n end_area = int(params[2].strip())\n end_subarea = params[3].strip()[1:-1]\n return [act_name + \"_grid\", start_area, start_subarea, end_area, end_subarea, last_act]\n elif act_name == \"grid\":\n return [act_name]\n else:\n print_with_color(f\"ERROR: Undefined act {act_name}!\", \"red\")\n return [\"ERROR\"]\n except Exception as e:\n print_with_color(f\"ERROR: an exception occurs while parsing the model response: {e}\", \"red\")\n print_with_color(rsp, \"red\")\n return [\"ERROR\"]\ndef parse_reflect_rsp(rsp):\n try:\n msg = rsp[\"choices\"][0][\"message\"][\"content\"]\n decision = re.findall(r\"Decision: (.*?)$\", msg, re.MULTILINE)[0]",
+ "type": "code",
+ "location": "/scripts/model.py:107-131"
+ },
+ "89": {
+ "file_id": 8,
+ "content": "This code is parsing the response from a model and determines the appropriate action based on the act name. It returns a specific grid if the act is 'grid'. If the act name is undefined, it prints an error message in red color. If any exception occurs while parsing the response, it also prints an error message with details of the exception.",
+ "type": "comment"
+ },
+ "90": {
+ "file_id": 8,
+ "content": " think = re.findall(r\"Thought: (.*?)$\", msg, re.MULTILINE)[0]\n print_with_color(\"Decision:\", \"yellow\")\n print_with_color(decision, \"magenta\")\n print_with_color(\"Thought:\", \"yellow\")\n print_with_color(think, \"magenta\")\n if decision == \"INEFFECTIVE\":\n return [decision, think]\n elif decision == \"BACK\" or decision == \"CONTINUE\" or decision == \"SUCCESS\":\n doc = re.findall(r\"Documentation: (.*?)$\", msg, re.MULTILINE)[0]\n print_with_color(\"Documentation:\", \"yellow\")\n print_with_color(doc, \"magenta\")\n return [decision, think, doc]\n else:\n print_with_color(f\"ERROR: Undefined decision {decision}!\", \"red\")\n return [\"ERROR\"]\n except Exception as e:\n print_with_color(f\"ERROR: an exception occurs while parsing the model response: {e}\", \"red\")\n print_with_color(rsp, \"red\")\n return [\"ERROR\"]",
+ "type": "code",
+ "location": "/scripts/model.py:132-150"
+ },
+ "91": {
+ "file_id": 8,
+ "content": "This code extracts decision, thought, and documentation from a message using regular expressions. It then prints them with colored formatting and returns the information as a list. If an undefined decision or exception occurs, it returns an error message.",
+ "type": "comment"
+ },
+ "92": {
+ "file_id": 9,
+ "content": "/scripts/prompts.py",
+ "type": "filepath"
+ },
+ "93": {
+ "file_id": 9,
+ "content": "The code showcases mobile app UI templates for touch interactions, and instructs in describing UI elements task-oriented manner with pronouns, specifying output format as \"Decision: SUCCESS\" followed by an explanation of the action's impact on the task.",
+ "type": "summary"
+ },
+ "94": {
+ "file_id": 9,
+ "content": "tap_doc_template = \"\"\"I will give you the screenshot of a mobile app before and after tapping the UI element labeled \nwith the number on the screen. The numeric tag of each element is located at the center of the element. \nTapping this UI element is a necessary part of proceeding with a larger task, which is to . Your task is to \ndescribe the functionality of the UI element concisely in one or two sentences. Notice that your description of the UI \nelement should focus on the general function. For example, if the UI element is used to navigate to the chat window \nwith John, your description should not include the name of the specific person. Just say: \"Tapping this area will \nnavigate the user to the chat window\". Never include the numeric tag of the UI element in your description. You can use \npronouns such as \"the UI element\" to refer to the element.\"\"\"\ntext_doc_template = \"\"\"I will give you the screenshot of a mobile app before and after typing in the input area labeled",
+ "type": "code",
+ "location": "/scripts/prompts.py:1-10"
+ },
+ "95": {
+ "file_id": 9,
+ "content": "ap_doc_template: Describes a mobile app screenshot before and after tapping a UI element with a number, focusing on the general function without mentioning numeric tag or specific details.",
+ "type": "comment"
+ },
+ "96": {
+ "file_id": 9,
+ "content": "with the number on the screen. The numeric tag of each element is located at the center of the element. \nTyping in this UI element is a necessary part of proceeding with a larger task, which is to . Your task is \nto describe the functionality of the UI element concisely in one or two sentences. Notice that your description of the \nUI element should focus on the general function. For example, if the change of the screenshot shows that the user typed \n\"How are you?\" in the chat box, you do not need to mention the actual text. Just say: \"This input area is used for the \nuser to type a message to send to the chat window.\". Never include the numeric tag of the UI element in your \ndescription. You can use pronouns such as \"the UI element\" to refer to the element.\"\"\"\nlong_press_doc_template = \"\"\"I will give you the screenshot of a mobile app before and after long pressing the UI \nelement labeled with the number on the screen. The numeric tag of each element is located at the center of ",
+ "type": "code",
+ "location": "/scripts/prompts.py:11-20"
+ },
+ "97": {
+ "file_id": 9,
+ "content": "Long press documentation template for a mobile app UI element. Provides screenshot comparison before and after long pressing.",
+ "type": "comment"
+ },
+ "98": {
+ "file_id": 9,
+ "content": "the element. Long pressing this UI element is a necessary part of proceeding with a larger task, which is to \n. Your task is to describe the functionality of the UI element concisely in one or two sentences. Notice \nthat your description of the UI element should focus on the general function. For example, if long pressing the UI \nelement redirects the user to the chat window with John, your description should not include the name of the specific \nperson. Just say: \"Long pressing this area will redirect the user to the chat window\". Never include the numeric tag of \nthe UI element in your description. You can use pronouns such as \"the UI element\" to refer to the element.\"\"\"\nswipe_doc_template = \"\"\"I will give you the screenshot of a mobile app before and after swiping the UI \nelement labeled with the number on the screen. The numeric tag of each element is located at the center of \nthe element. Swiping this UI element is a necessary part of proceeding with a larger task, which is to . ",
+ "type": "code",
+ "location": "/scripts/prompts.py:21-30"
+ },
+ "99": {
+ "file_id": 9,
+ "content": "This code is generating a template for describing the functionality of swiping a specific UI element in a mobile app. The description should focus on the general function, without including the numeric tag or name of the person related to the task.",
+ "type": "comment"
+ }
+}
\ No newline at end of file
diff --git a/docs/data/1.json b/docs/data/1.json
new file mode 100644
index 0000000..5afe2d2
--- /dev/null
+++ b/docs/data/1.json
@@ -0,0 +1,548 @@
+{
+ "100": {
+ "file_id": 9,
+ "content": "Your task is to describe the functionality of the UI element concisely in one or two sentences. Notice that your \ndescription of the UI element should be as general as possible. For example, if swiping the UI element increases the \ncontrast ratio of an image of a building, your description should be just like this: \"Swiping this area enables the \nuser to tune a specific parameter of the image\". Never include the numeric tag of the UI element in your description. \nYou can use pronouns such as \"the UI element\" to refer to the element.\"\"\"\nrefine_doc_suffix = \"\"\"\\nA documentation of this UI element generated from previous demos is shown below. Your \ngenerated description should be based on this previous doc and optimize it. Notice that it is possible that your \nunderstanding of the function of the UI element derived from the given screenshots conflicts with the previous doc, \nbecause the function of a UI element can be flexible. In this case, your generated description should combine both.\nOld documentation of this UI element: \"\"\"",
+ "type": "code",
+ "location": "/scripts/prompts.py:31-41"
+ },
+ "101": {
+ "file_id": 9,
+ "content": "This code appears to be related to generating documentation for a UI element. It provides instructions on how to describe the functionality of the UI element concisely, using general terms and without including the numeric tag. The \"refine_doc_suffix\" variable suggests incorporating previous documentation if available, but also resolving any conflicts that might arise due to flexibility in the UI element's function.",
+ "type": "comment"
+ },
+ "102": {
+ "file_id": 9,
+ "content": "task_template = \"\"\"You are an agent that is trained to perform some basic tasks on a smartphone. You will be given a \nsmartphone screenshot. The interactive UI elements on the screenshot are labeled with numeric tags starting from 1. The \nnumeric tag of each interactive element is located in the center of the element.\nYou can call the following functions to control the smartphone:\n1. tap(element: int)\nThis function is used to tap an UI element shown on the smartphone screen.\n\"element\" is a numeric tag assigned to an UI element shown on the smartphone screen.\nA simple use case can be tap(5), which taps the UI element labeled with the number 5.\n2. text(text_input: str)\nThis function is used to insert text input in an input field/box. text_input is the string you want to insert and must \nbe wrapped with double quotation marks. A simple use case can be text(\"Hello, world!\"), which inserts the string \n\"Hello, world!\" into the input area on the smartphone screen. This function is usually callable when you see a keyboard ",
+ "type": "code",
+ "location": "/scripts/prompts.py:43-57"
+ },
+ "103": {
+ "file_id": 9,
+ "content": "The code provides a template for instructions on how to interact with a smartphone using two functions: tap() and text(). It explains that the user needs to provide an element number for tap(), and a string wrapped in double quotes for text(). These functions can be used to control the phone, such as tapping UI elements or inserting text input.",
+ "type": "comment"
+ },
+ "104": {
+ "file_id": 9,
+ "content": "showing in the lower half of the screen.\n3. long_press(element: int)\nThis function is used to long press an UI element shown on the smartphone screen.\n\"element\" is a numeric tag assigned to an UI element shown on the smartphone screen.\nA simple use case can be long_press(5), which long presses the UI element labeled with the number 5.\n4. swipe(element: int, direction: str, dist: str)\nThis function is used to swipe an UI element shown on the smartphone screen, usually a scroll view or a slide bar.\n\"element\" is a numeric tag assigned to an UI element shown on the smartphone screen. \"direction\" is a string that \nrepresents one of the four directions: up, down, left, right. \"direction\" must be wrapped with double quotation \nmarks. \"dist\" determines the distance of the swipe and can be one of the three options: short, medium, long. You should \nchoose the appropriate distance option according to your need.\nA simple use case can be swipe(21, \"up\", \"medium\"), which swipes up the UI element labeled with the number 21 for a ",
+ "type": "code",
+ "location": "/scripts/prompts.py:58-71"
+ },
+ "105": {
+ "file_id": 9,
+ "content": "Code snippet 57-70 describes functions for interacting with smartphone screen elements. \"long_press(element: int)\" is used to long press an element identified by its numeric tag. \"swipe(element: int, direction: str, dist: str)\" swipes an element in a specific direction and distance specified by the input parameters. The element is identified using its numeric tag, while direction and distance are string inputs enclosed in quotes.",
+ "type": "comment"
+ },
+ "106": {
+ "file_id": 9,
+ "content": "medium distance.\n5. grid()\nYou should call this function when you find the element you want to interact with is not labeled with a numeric tag and \nother elements with numeric tags cannot help with the task. The function will bring up a grid overlay to divide the \nsmartphone screen into small areas and this will give you more freedom to choose any part of the screen to tap, long \npress, or swipe.\n\nThe task you need to complete is to . Your past actions to proceed with this task are summarized as \nfollows: \nNow, given the documentation and the following labeled screenshot, you need to think and call the function needed to \nproceed with the task. Your output should include three parts in the given format:\nObservation: \nThought: \nAction: \nSummary: \nYou can only take one action at a time, so please directly call the function.\"\"\"\ntask_template_grid = \"\"\"You are an agent that is trained to perform some basic tasks on a smartphone. You will be given \na smartphone screenshot overlaid by a grid. The grid divides the screenshot into small square areas. Each area is \nlabeled with an integer in the top-left corner.\nYou can call the following functions to control the smartphone:\n1. tap(area: int, subarea: str)\nThis function is used to tap a grid area shown on the smartphone screen. \"area\" is the integer label assigned to a grid \narea shown on the smartphone screen. \"subarea\" is a string representing the exact location to tap within the grid area. \nIt can take one of the nine values: center, top-left, top, top-right, left, right, bottom-left, bottom, and ",
+ "type": "code",
+ "location": "/scripts/prompts.py:87-102"
+ },
+ "109": {
+ "file_id": 9,
+ "content": "Code snippet is providing a task template for an agent that will interact with a smartphone by tapping on grid areas. The agent can call the tap() function to perform this action.",
+ "type": "comment"
+ },
+ "110": {
+ "file_id": 9,
+ "content": "bottom-right.\nA simple use case can be tap(5, \"center\"), which taps the exact center of the grid area labeled with the number 5.\n2. long_press(area: int, subarea: str)\nThis function is used to long press a grid area shown on the smartphone screen. \"area\" is the integer label assigned to \na grid area shown on the smartphone screen. \"subarea\" is a string representing the exact location to long press within \nthe grid area. It can take one of the nine values: center, top-left, top, top-right, left, right, bottom-left, bottom, \nand bottom-right.\nA simple use case can be long_press(7, \"top-left\"), which long presses the top left part of the grid area labeled with \nthe number 7.\n3. swipe(start_area: int, start_subarea: str, end_area: int, end_subarea: str)\nThis function is used to perform a swipe action on the smartphone screen, especially when you want to interact with a \nscroll view or a slide bar. \"start_area\" is the integer label assigned to the grid area which marks the starting \nlocation of the swipe. \"start_subarea\" is a string representing the exact location to begin the swipe within the grid ",
+ "type": "code",
+ "location": "/scripts/prompts.py:103-117"
+ },
+ "111": {
+ "file_id": 9,
+ "content": "This code defines functions for interacting with a smartphone screen, including tap, long_press, and swipe actions. The functions allow specifying the grid area and subarea for precise touch interactions.",
+ "type": "comment"
+ },
+ "112": {
+ "file_id": 9,
+ "content": "area. \"end_area\" is the integer label assigned to the grid area which marks the ending location of the swipe. \n\"end_subarea\" is a string representing the exact location to end the swipe within the grid area.\nThe two subarea parameters can take one of the nine values: center, top-left, top, top-right, left, right, bottom-left, \nbottom, and bottom-right.\nA simple use case can be swipe(21, \"center\", 25, \"right\"), which performs a swipe starting from the center of grid area \n21 to the right part of grid area 25.\nThe task you need to complete is to . Your past actions to proceed with this task are summarized as \nfollows: \nNow, given the following labeled screenshot, you need to think and call the function needed to proceed with the task. \nYour output should include three parts in the given format:\nObservation: \nThought: \nAction: \nSummary: \nYou can only take one action at a time, so please directly call the function.\"\"\"\nself_explore_task_template = \"\"\"You are an agent that is trained to complete certain tasks on a smartphone. You will be \ngiven a screenshot of a smartphone app. The interactive UI elements on the screenshot are labeled with numeric tags \nstarting from 1. \nYou can call the following functions to interact with those labeled elements to control the smartphone:\n1. tap(element: int)\nThis function is used to tap an UI element shown on the smartphone screen.\n\"element\" is a numeric tag assigned to an UI element shown on the smartphone screen.\nA simple use case can be tap(5), which taps the UI element labeled with the number 5.\n2. text(text_input: str)",
+ "type": "code",
+ "location": "/scripts/prompts.py:132-149"
+ },
+ "115": {
+ "file_id": 9,
+ "content": "This code defines a template for self-exploration tasks. It explains that the agent is trained to complete tasks on a smartphone, given a screenshot with labeled UI elements, and provides information about the functions available (tap(element: int) and text(text_input: str)) to interact with those UI elements. The agent must call these functions one at a time and summarize past actions.",
+ "type": "comment"
+ },
+ "116": {
+ "file_id": 9,
+ "content": "This function is used to insert text input in an input field/box. text_input is the string you want to insert and must \nbe wrapped with double quotation marks. A simple use case can be text(\"Hello, world!\"), which inserts the string \n\"Hello, world!\" into the input area on the smartphone screen. This function is only callable when you see a keyboard \nshowing in the lower half of the screen.\n3. long_press(element: int)\nThis function is used to long press an UI element shown on the smartphone screen.\n\"element\" is a numeric tag assigned to an UI element shown on the smartphone screen.\nA simple use case can be long_press(5), which long presses the UI element labeled with the number 5.\n4. swipe(element: int, direction: str, dist: str)\nThis function is used to swipe an UI element shown on the smartphone screen, usually a scroll view or a slide bar.\n\"element\" is a numeric tag assigned to an UI element shown on the smartphone screen. \"direction\" is a string that \nrepresents one of the four directions: up, down, left, right. \"direction\" must be wrapped with double quotation ",
+ "type": "code",
+ "location": "/scripts/prompts.py:150-163"
+ },
+ "117": {
+ "file_id": 9,
+ "content": "This function inserts text into an input field, long presses a UI element, or swipes an element on the smartphone screen.\ntext: Inserts string into input area when keyboard shows.\nlong_press: Long presses UI element with assigned numeric tag.\nswipe: Swipes UI element in specified direction and distance.",
+ "type": "comment"
+ },
+ "118": {
+ "file_id": 9,
+ "content": "marks. \"dist\" determines the distance of the swipe and can be one of the three options: short, medium, long. You should \nchoose the appropriate distance option according to your need.\nA simple use case can be swipe(21, \"up\", \"medium\"), which swipes up the UI element labeled with the number 21 for a \nmedium distance.\nThe task you need to complete is to . Your past actions to proceed with this task are summarized as \nfollows: \nNow, given the following labeled screenshot, you need to think and call the function needed to proceed with the task. \nYour output should include three parts in the given format:\nObservation: \nThought: \nAction: ",
+ "type": "code",
+ "location": "/scripts/prompts.py:164-177"
+ },
+ "119": {
+ "file_id": 9,
+ "content": "Observation: The code explains the usage of a swipe function with options for distance (\"short\", \"medium\", or \"long\") and direction.\nThought: To complete the task, I need to call the appropriate function with the correct parameters.\nAction: FINISH",
+ "type": "comment"
+ },
+ "120": {
+ "file_id": 9,
+ "content": "Summary: \nYou can only take one action at a time, so please directly call the function.\"\"\"\nself_explore_reflect_template = \"\"\"I will give you screenshots of a mobile app before and after the UI \nelement labeled with the number '' on the first screenshot. The numeric tag of each element is located at \nthe center of the element. The action of this UI element was described as follows:\n\nThe action was also an attempt to proceed with a larger task, which is to . Your job is to carefully analyze \nthe difference between the two screenshots to determine if the action is in accord with the description above and at \nthe same time effectively moved the task forward. Your output should be determined based on the following situations:\n1. BACK\nIf you think the action navigated you to a page where you cannot proceed with the given task, you should go back to the ",
+ "type": "code",
+ "location": "/scripts/prompts.py:178-190"
+ },
+ "121": {
+ "file_id": 9,
+ "content": "The code is a prompt template for analyzing differences in mobile app screenshots before and after an action. The user needs to determine if the action was effective and helped progress the task.",
+ "type": "comment"
+ },
+ "122": {
+ "file_id": 9,
+ "content": "previous interface. At the same time, describe the functionality of the UI element concisely in one or two sentences by \nobserving the difference between the two screenshots. Notice that your description of the UI element should focus on \nthe general function. Never include the numeric tag of the UI element in your description. You can use pronouns such as \n\"the UI element\" to refer to the element. Your output should be in the following format:\nDecision: BACK\nThought: \nDocumentation: \n2. INEFFECTIVE\nIf you find the action changed nothing on the screen (screenshots before and after the action are identical), you \nshould continue to interact with other elements on the screen. Notice that if you find the location of the cursor \nchanged between the two screenshots, then they are not identical. Your output should be in the following format:\nDecision: INEFFECTIVE\nThought: ",
+ "type": "code",
+ "location": "/scripts/prompts.py:191-203"
+ },
+ "123": {
+ "file_id": 9,
+ "content": "Decision: BACK\nThought: Reverses the last action and returns to previous interface.\nDocumentation: Allows user to undo the previous action and go back to the previous screen.",
+ "type": "comment"
+ },
+ "124": {
+ "file_id": 9,
+ "content": "3. CONTINUE\nIf you find the action changed something on the screen but does not reflect the action description above and did not \nmove the given task forward, you should continue to interact with other elements on the screen. At the same time, \ndescribe the functionality of the UI element concisely in one or two sentences by observing the difference between the \ntwo screenshots. Notice that your description of the UI element should focus on the general function. Never include the \nnumeric tag of the UI element in your description. You can use pronouns such as \"the UI element\" to refer to the \nelement. Your output should be in the following format:\nDecision: CONTINUE\nThought: \nDocumentation: \n4. SUCCESS\nIf you think the action successfully moved the task forward (even though it did not completed the task), you should \ndescribe the functionality of the UI element concisely in one or two sentences. Notice that your description of the UI ",
+ "type": "code",
+ "location": "/scripts/prompts.py:204-217"
+ },
+ "125": {
+ "file_id": 9,
+ "content": "Code snippet discusses the process of interacting with a UI element and documenting its functionality when the action doesn't fully complete the task but still makes progress.",
+ "type": "comment"
+ },
+ "126": {
+ "file_id": 9,
+ "content": "element should focus on the general function. Never include the numeric tag of the UI element in your description. You \ncan use pronouns such as \"the UI element\" to refer to the element. Your output should be in the following format:\nDecision: SUCCESS\nThought: \nDocumentation: \n\"\"\"",
+ "type": "code",
+ "location": "/scripts/prompts.py:218-223"
+ },
+ "127": {
+ "file_id": 9,
+ "content": "The code provides instructions for describing UI elements in a task-oriented manner, using pronouns and avoiding numeric tags. It specifies the output format as \"Decision: SUCCESS\" followed by a thought explaining why the action moved the task forward, along with documentation describing the function of the UI element.",
+ "type": "comment"
+ },
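The reflection template above asks the model to reply in a fixed Decision/Thought/Documentation layout. The repository's own parser (parse_reflect_rsp in model.py) is not shown in this chunk; the snippet below is only a minimal, hypothetical sketch of how such a reply could be split into its fields, assuming the model follows the format exactly.

```python
import re

def parse_reflection(rsp: str):
    # Hypothetical illustration only -- not the repository's parse_reflect_rsp.
    # Assumes the reply contains the labeled lines exactly as instructed.
    decision = re.search(r"Decision:\s*(\w+)", rsp)
    thought = re.search(r"Thought:\s*(.+)", rsp)
    doc = re.search(r"Documentation:\s*(.+)", rsp)
    if not decision:
        return ["ERROR"]
    return [
        decision.group(1).strip(),
        thought.group(1).strip() if thought else "",
        doc.group(1).strip() if doc else "",
    ]

sample = ("Decision: SUCCESS\n"
          "Thought: The second screenshot shows the settings page, as described.\n"
          "Documentation: Tapping this area opens the settings page.")
print(parse_reflection(sample))
```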
+ "128": {
+ "file_id": 10,
+ "content": "/scripts/self_explorer.py",
+ "type": "filepath"
+ },
+ "129": {
+ "file_id": 10,
+ "content": "The code prepares the environment, generates tasks, and logs interactions with GPT-4. It handles various actions, checks for completion, queries GPT-4 upon task completion, logs relevant information, manages errors, and processes, logs, and updates actions while managing errors and documentation; autonomous exploration ends upon reaching max rounds or in case of unexpected events, displaying a yellow or red message with doc count and success status.",
+ "type": "summary"
+ },
+ "130": {
+ "file_id": 10,
+ "content": "import argparse\nimport ast\nimport datetime\nimport json\nimport os\nimport re\nimport sys\nimport time\nimport prompts\nfrom config import load_config\nfrom and_controller import list_all_devices, AndroidController, traverse_tree\nfrom model import ask_gpt4v, parse_explore_rsp, parse_reflect_rsp\nfrom utils import print_with_color, draw_bbox_multi, encode_image\narg_desc = \"AppAgent - Autonomous Exploration\"\nparser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=arg_desc)\nparser.add_argument(\"--app\")\nparser.add_argument(\"--root_dir\", default=\"./\")\nargs = vars(parser.parse_args())\nconfigs = load_config()\napp = args[\"app\"]\nroot_dir = args[\"root_dir\"]\nif not app:\n print_with_color(\"What is the name of the target app?\", \"blue\")\n app = input()\n app = app.replace(\" \", \"\")\nwork_dir = os.path.join(root_dir, \"apps\")\nif not os.path.exists(work_dir):\n os.mkdir(work_dir)\nwork_dir = os.path.join(work_dir, app)\nif not os.path.exists(work_dir):\n os.mkdir(work_dir)\ndemo_dir = os.path.join(work_dir, \"demos\")",
+ "type": "code",
+ "location": "/scripts/self_explorer.py:1-38"
+ },
+ "131": {
+ "file_id": 10,
+ "content": "The code imports necessary libraries and defines arguments for executing the autonomous exploration script. It then loads configuration files, retrieves input from the user for target app name, creates directories if they don't exist, and prepares the environment for running the script.",
+ "type": "comment"
+ },
+ "132": {
+ "file_id": 10,
+ "content": "if not os.path.exists(demo_dir):\n os.mkdir(demo_dir)\ndemo_timestamp = int(time.time())\ntask_name = datetime.datetime.fromtimestamp(demo_timestamp).strftime(\"self_explore_%Y-%m-%d_%H-%M-%S\")\ntask_dir = os.path.join(demo_dir, task_name)\nos.mkdir(task_dir)\ndocs_dir = os.path.join(work_dir, \"auto_docs\")\nif not os.path.exists(docs_dir):\n os.mkdir(docs_dir)\nexplore_log_path = os.path.join(task_dir, f\"log_explore_{task_name}.txt\")\nreflect_log_path = os.path.join(task_dir, f\"log_reflect_{task_name}.txt\")\ndevice_list = list_all_devices()\nif not device_list:\n print_with_color(\"ERROR: No device found!\", \"red\")\n sys.exit()\nprint_with_color(f\"List of devices attached:\\n{str(device_list)}\", \"yellow\")\nif len(device_list) == 1:\n device = device_list[0]\n print_with_color(f\"Device selected: {device}\", \"yellow\")\nelse:\n print_with_color(\"Please choose the Android device to start demo by entering its ID:\", \"blue\")\n device = input()\ncontroller = AndroidController(device)\nwidth, height = controller.get_device_size()",
+ "type": "code",
+ "location": "/scripts/self_explorer.py:39-63"
+ },
+ "133": {
+ "file_id": 10,
+ "content": "Checking if demo directory exists and creating it, then generating a task name and directory. Creating directories for auto_docs, log files, listing devices, and assigning one device based on the number of devices found.",
+ "type": "comment"
+ },
+ "134": {
+ "file_id": 10,
+ "content": "if not width and not height:\n print_with_color(\"ERROR: Invalid device size!\", \"red\")\n sys.exit()\nprint_with_color(f\"Screen resolution of {device}: {width}x{height}\", \"yellow\")\nprint_with_color(\"Please enter the description of the task you want me to complete in a few sentences:\", \"blue\")\ntask_desc = input()\nround_count = 0\ndoc_count = 0\nuseless_list = set()\nlast_act = \"None\"\ntask_complete = False\nwhile round_count < configs[\"MAX_ROUNDS\"]:\n round_count += 1\n print_with_color(f\"Round {round_count}\", \"yellow\")\n screenshot_before = controller.get_screenshot(f\"{round_count}_before\", task_dir)\n xml_path = controller.get_xml(f\"{round_count}\", task_dir)\n if screenshot_before == \"ERROR\" or xml_path == \"ERROR\":\n break\n clickable_list = []\n focusable_list = []\n traverse_tree(xml_path, clickable_list, \"clickable\", True)\n traverse_tree(xml_path, focusable_list, \"focusable\", True)\n elem_list = []\n for elem in clickable_list:\n if elem.uid in useless_list:\n continue",
+ "type": "code",
+ "location": "/scripts/self_explorer.py:64-91"
+ },
+ "135": {
+ "file_id": 10,
+ "content": "Checks if width and height are provided. If not, displays an error message and exits. Otherwise, prints device resolution and prompts for task description. Starts a loop to complete the task in multiple rounds until reaching MAX_ROUNDS.",
+ "type": "comment"
+ },
+ "136": {
+ "file_id": 10,
+ "content": " elem_list.append(elem)\n for elem in focusable_list:\n if elem.uid in useless_list:\n continue\n bbox = elem.bbox\n center = (bbox[0][0] + bbox[1][0]) // 2, (bbox[0][1] + bbox[1][1]) // 2\n close = False\n for e in clickable_list:\n bbox = e.bbox\n center_ = (bbox[0][0] + bbox[1][0]) // 2, (bbox[0][1] + bbox[1][1]) // 2\n dist = (abs(center[0] - center_[0]) ** 2 + abs(center[1] - center_[1]) ** 2) ** 0.5\n if dist <= configs[\"MIN_DIST\"]:\n close = True\n break\n if not close:\n elem_list.append(elem)\n draw_bbox_multi(screenshot_before, os.path.join(task_dir, f\"{round_count}_before_labeled.png\"), elem_list,\n dark_mode=configs[\"DARK_MODE\"])\n prompt = re.sub(r\"\", task_desc, prompts.self_explore_task_template)\n prompt = re.sub(r\"\", last_act, prompt)\n base64_img_before = encode_image(os.path.join(task_dir, f\"{round_count}_before_labeled.png\"))",
+ "type": "code",
+ "location": "/scripts/self_explorer.py:92-113"
+ },
+ "137": {
+ "file_id": 10,
+ "content": "This code finds focusable elements on the screen, checks if they are close to any clickable elements, and adds them to a list. It then draws bounding boxes around these elements in an image and generates a task prompt with the image encoded as base64.",
+ "type": "comment"
+ },
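As a worked illustration of the proximity filter described above, the sketch below applies the same distance test to made-up bounding boxes; MIN_DIST is an assumed value, while the real script reads it from the config file and applies the test to elements parsed from the UI XML.

```python
# Minimal sketch of the focusable-element de-duplication test (made-up data).
MIN_DIST = 30  # assumed threshold; the real value comes from the loaded config

clickable_bboxes = [((100, 200), (300, 260))]   # (top-left, bottom-right) corners
focusable_bbox = ((110, 205), (290, 255))       # overlaps the clickable element

def center(bbox):
    (x1, y1), (x2, y2) = bbox
    return (x1 + x2) // 2, (y1 + y2) // 2

cx, cy = center(focusable_bbox)
close = any(
    ((cx - center(b)[0]) ** 2 + (cy - center(b)[1]) ** 2) ** 0.5 <= MIN_DIST
    for b in clickable_bboxes
)
print(close)  # True -> the focusable element is skipped as a near-duplicate
```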
+ "138": {
+ "file_id": 10,
+ "content": " content = [\n {\n \"type\": \"text\",\n \"text\": prompt\n },\n {\n \"type\": \"image_url\",\n \"image_url\": {\n \"url\": f\"data:image/jpeg;base64,{base64_img_before}\"\n }\n }\n ]\n print_with_color(\"Thinking about what to do in the next step...\", \"yellow\")\n rsp = ask_gpt4v(content)\n if \"error\" not in rsp:\n with open(explore_log_path, \"a\") as logfile:\n log_item = {\"step\": round_count, \"prompt\": prompt, \"image\": f\"{round_count}_before_labeled.png\",\n \"response\": rsp}\n logfile.write(json.dumps(log_item) + \"\\n\")\n res = parse_explore_rsp(rsp)\n act_name = res[0]\n last_act = res[-1]\n res = res[:-1]\n if act_name == \"FINISH\":\n task_complete = True\n break\n if act_name == \"tap\":\n _, area = res\n tl, br = elem_list[area - 1].bbox\n x, y = (tl[0] + br[0]) // 2, (tl[1] + br[1]) // 2\n ret = controller.tap(x, y)",
+ "type": "code",
+ "location": "/scripts/self_explorer.py:114-145"
+ },
+ "139": {
+ "file_id": 10,
+ "content": "This code is sending a prompt to GPT-4 and receiving a response. It then logs the step, prompt, image, and response before parsing the response and checking if it's a \"FINISH\" command or a \"tap\" action.",
+ "type": "comment"
+ },
+ "140": {
+ "file_id": 10,
+ "content": " if ret == \"ERROR\":\n print_with_color(\"ERROR: tap execution failed\", \"red\")\n break\n elif act_name == \"text\":\n _, input_str = res\n ret = controller.text(input_str)\n if ret == \"ERROR\":\n print_with_color(\"ERROR: text execution failed\", \"red\")\n break\n elif act_name == \"long_press\":\n _, area = res\n tl, br = elem_list[area - 1].bbox\n x, y = (tl[0] + br[0]) // 2, (tl[1] + br[1]) // 2\n ret = controller.long_press(x, y)\n if ret == \"ERROR\":\n print_with_color(\"ERROR: long press execution failed\", \"red\")\n break\n elif act_name == \"swipe\":\n _, area, swipe_dir, dist = res\n tl, br = elem_list[area - 1].bbox\n x, y = (tl[0] + br[0]) // 2, (tl[1] + br[1]) // 2\n ret = controller.swipe(x, y, swipe_dir, dist)\n if ret == \"ERROR\":\n print_with_color(\"ERROR: swipe execution failed\", \"red\")",
+ "type": "code",
+ "location": "/scripts/self_explorer.py:146-169"
+ },
+ "141": {
+ "file_id": 10,
+ "content": "This code handles different actions (tap, text, long_press, swipe) performed by the script. It checks if the execution of each action fails and prints an error message with color formatting in case of failure.",
+ "type": "comment"
+ },
+ "142": {
+ "file_id": 10,
+ "content": " break\n else:\n break\n time.sleep(configs[\"REQUEST_INTERVAL\"])\n else:\n print_with_color(rsp[\"error\"][\"message\"], \"red\")\n break\n screenshot_after = controller.get_screenshot(f\"{round_count}_after\", task_dir)\n if screenshot_after == \"ERROR\":\n break\n draw_bbox_multi(screenshot_after, os.path.join(task_dir, f\"{round_count}_after_labeled.png\"), elem_list,\n dark_mode=configs[\"DARK_MODE\"])\n base64_img_after = encode_image(os.path.join(task_dir, f\"{round_count}_after_labeled.png\"))\n if act_name == \"tap\":\n prompt = re.sub(r\"\", \"tapping\", prompts.self_explore_reflect_template)\n elif act_name == \"text\":\n continue\n elif act_name == \"long_press\":\n prompt = re.sub(r\"\", \"long pressing\", prompts.self_explore_reflect_template)\n elif act_name == \"swipe\":\n swipe_dir = res[2]\n if swipe_dir == \"up\" or swipe_dir == \"down\":\n act_name = \"v_swipe\"\n elif swipe_dir == \"left\" or swipe_dir == \"right\":",
+ "type": "code",
+ "location": "/scripts/self_explorer.py:170-195"
+ },
+ "143": {
+ "file_id": 10,
+ "content": "Checking if task is complete and breaks loop",
+ "type": "comment"
+ },
+ "144": {
+ "file_id": 10,
+ "content": " act_name = \"h_swipe\"\n prompt = re.sub(r\"\", \"swiping\", prompts.self_explore_reflect_template)\n else:\n print_with_color(\"ERROR: Undefined act!\", \"red\")\n break\n prompt = re.sub(r\"\", str(area), prompt)\n prompt = re.sub(r\"\", task_desc, prompt)\n prompt = re.sub(r\"\", last_act, prompt)\n content = [\n {\n \"type\": \"text\",\n \"text\": prompt\n },\n {\n \"type\": \"image_url\",\n \"image_url\": {\n \"url\": f\"data:image/jpeg;base64,{base64_img_before}\"\n }\n },\n {\n \"type\": \"image_url\",\n \"image_url\": {\n \"url\": f\"data:image/jpeg;base64,{base64_img_after}\"\n }\n }\n ]\n print_with_color(\"Reflecting on my previous action...\", \"yellow\")\n rsp = ask_gpt4v(content)\n if \"error\" not in rsp:\n resource_id = elem_list[int(area) - 1].uid\n with open(reflect_log_path, \"a\") as logfile:\n ",
+ "type": "code",
+ "location": "/scripts/self_explorer.py:196-228"
+ },
+ "145": {
+ "file_id": 10,
+ "content": "Code is preparing a message to ask GPT-4 about a previous action. It replaces placeholders in the prompt with appropriate values and sends it to GPT-4 for response. If there's no error in the response, it logs relevant information into reflect_log_path.",
+ "type": "comment"
+ },
+ "146": {
+ "file_id": 10,
+ "content": "log_item = {\"step\": round_count, \"prompt\": prompt, \"image_before\": f\"{round_count}_before_labeled.png\",\n \"image_after\": f\"{round_count}_after.png\", \"response\": rsp}\n logfile.write(json.dumps(log_item) + \"\\n\")\n res = parse_reflect_rsp(rsp)\n decision = res[0]\n if decision == \"ERROR\":\n break\n if decision == \"INEFFECTIVE\":\n useless_list.add(resource_id)\n last_act = \"None\"\n elif decision == \"BACK\" or decision == \"CONTINUE\" or decision == \"SUCCESS\":\n if decision == \"BACK\" or decision == \"CONTINUE\":\n useless_list.add(resource_id)\n last_act = \"None\"\n if decision == \"BACK\":\n ret = controller.back()\n if ret == \"ERROR\":\n print_with_color(\"ERROR: back execution failed\", \"red\")\n break\n doc = res[-1]\n doc_name = resource_id + \".txt\"\n doc_path = os.path.join(docs_dir, doc_name)",
+ "type": "code",
+ "location": "/scripts/self_explorer.py:228-249"
+ },
+ "147": {
+ "file_id": 10,
+ "content": "Writing log item to file\nParses response and makes decision\nHandles \"ERROR\", \"INEFFECTIVE\", and other decisions\nIf \"BACK\" or \"CONTINUE\", adds resource_id to useless list, sets last_act to \"None\", and executes back action if necessary\nDocs processing begins",
+ "type": "comment"
+ },
+ "148": {
+ "file_id": 10,
+ "content": " if os.path.exists(doc_path):\n doc_content = ast.literal_eval(open(doc_path).read())\n if doc_content[act_name]:\n print_with_color(f\"Documentation for the element {resource_id} already exists.\", \"yellow\")\n continue\n else:\n doc_content = {\n \"tap\": \"\",\n \"text\": \"\",\n \"v_swipe\": \"\",\n \"h_swipe\": \"\",\n \"long_press\": \"\"\n }\n doc_content[act_name] = doc\n with open(doc_path, \"w\") as outfile:\n outfile.write(str(doc_content))\n doc_count += 1\n print_with_color(f\"Documentation generated and saved to {doc_path}\", \"yellow\")\n else:\n print_with_color(f\"ERROR: Undefined decision! {decision}\", \"red\")\n break\n else:\n print_with_color(rsp[\"error\"][\"message\"], \"red\")\n break\n time.sleep(configs[\"REQUEST_INTERVAL\"])\nif task_complete:",
+ "type": "code",
+ "location": "/scripts/self_explorer.py:250-276"
+ },
+ "149": {
+ "file_id": 10,
+ "content": "If file exists, read its content, and if the action's documentation already exists, print a message and continue. Otherwise, create an empty dictionary for the document content, add the current action's documentation, save it to file, increment documentation count, and print a success message. If there is an undefined decision or error in response, print an error message and break the loop. After each task, sleep for the specified request interval.",
+ "type": "comment"
+ },
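The documentation written here is stored as a plain-text Python dict literal keyed by action name, which is why the scripts read it back with ast.literal_eval rather than json. A minimal sketch of that round trip, using a made-up element ID and description:

```python
import ast
import os

# Hypothetical element ID and description, illustrating the on-disk format only.
doc_path = "com.example.app_id_send_button.txt"
doc_content = {"tap": "", "text": "", "v_swipe": "", "h_swipe": "", "long_press": ""}
doc_content["tap"] = "Tapping this area sends the composed message."

with open(doc_path, "w") as f:
    f.write(str(doc_content))              # stored as a dict literal, not JSON

loaded = ast.literal_eval(open(doc_path).read())
print(loaded["tap"])                       # the saved description is recovered
os.remove(doc_path)                        # remove the illustration file
```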
+ "150": {
+ "file_id": 10,
+ "content": " print_with_color(f\"Autonomous exploration completed successfully. {doc_count} docs generated.\", \"yellow\")\nelif round_count == configs[\"MAX_ROUNDS\"]:\n print_with_color(f\"Autonomous exploration finished due to reaching max rounds. {doc_count} docs generated.\",\n \"yellow\")\nelse:\n print_with_color(f\"Autonomous exploration finished unexpectedly. {doc_count} docs generated.\", \"red\")",
+ "type": "code",
+ "location": "/scripts/self_explorer.py:277-282"
+ },
+ "151": {
+ "file_id": 10,
+ "content": "Autonomous exploration ended. Yellow message if max rounds reached, red if unexpected, displays doc count and success status.",
+ "type": "comment"
+ },
+ "152": {
+ "file_id": 11,
+ "content": "/scripts/step_recorder.py",
+ "type": "filepath"
+ },
+ "153": {
+ "file_id": 11,
+ "content": "The code sets up command line arguments for \"AppAgent,\" enables user actions selection or input gestures, validates inputs, performs corresponding actions with a controller object, logs data, handles errors, and displays recorded steps.",
+ "type": "summary"
+ },
+ "154": {
+ "file_id": 11,
+ "content": "import argparse\nimport datetime\nimport cv2\nimport os\nimport shutil\nimport sys\nimport time\nfrom and_controller import list_all_devices, AndroidController, traverse_tree\nfrom config import load_config\nfrom utils import print_with_color, draw_bbox_multi\narg_desc = \"AppAgent - Human Demonstration\"\nparser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=arg_desc)\nparser.add_argument(\"--app\")\nparser.add_argument(\"--demo\")\nparser.add_argument(\"--root_dir\", default=\"./\")\nargs = vars(parser.parse_args())\napp = args[\"app\"]\ndemo_name = args[\"demo\"]\nroot_dir = args[\"root_dir\"]\nconfigs = load_config()\nif not app:\n print_with_color(\"What is the name of the app you are going to demo?\", \"blue\")\n app = input()\n app = app.replace(\" \", \"\")\nif not demo_name:\n demo_timestamp = int(time.time())\n demo_name = datetime.datetime.fromtimestamp(demo_timestamp).strftime(f\"demo_{app}_%Y-%m-%d_%H-%M-%S\")\nwork_dir = os.path.join(root_dir, \"apps\")\nif not os.path.exists(work_dir):\n os.mkdir(work_dir)",
+ "type": "code",
+ "location": "/scripts/step_recorder.py:1-37"
+ },
+ "155": {
+ "file_id": 11,
+ "content": "This code is setting up the command line arguments for an application called \"AppAgent\" which records human demonstrations of mobile app interactions. It checks if the app and demo names are provided, creates a directory to store the recorded data, and sets default values if any arguments are missing.",
+ "type": "comment"
+ },
+ "156": {
+ "file_id": 11,
+ "content": "work_dir = os.path.join(work_dir, app)\nif not os.path.exists(work_dir):\n os.mkdir(work_dir)\ndemo_dir = os.path.join(work_dir, \"demos\")\nif not os.path.exists(demo_dir):\n os.mkdir(demo_dir)\ntask_dir = os.path.join(demo_dir, demo_name)\nif os.path.exists(task_dir):\n shutil.rmtree(task_dir)\nos.mkdir(task_dir)\nraw_ss_dir = os.path.join(task_dir, \"raw_screenshots\")\nos.mkdir(raw_ss_dir)\nxml_dir = os.path.join(task_dir, \"xml\")\nos.mkdir(xml_dir)\nlabeled_ss_dir = os.path.join(task_dir, \"labeled_screenshots\")\nos.mkdir(labeled_ss_dir)\nrecord_path = os.path.join(task_dir, \"record.txt\")\nrecord_file = open(record_path, \"w\")\ntask_desc_path = os.path.join(task_dir, \"task_desc.txt\")\ndevice_list = list_all_devices()\nif not device_list:\n print_with_color(\"ERROR: No device found!\", \"red\")\n sys.exit()\nprint_with_color(\"List of devices attached:\\n\" + str(device_list), \"yellow\")\nif len(device_list) == 1:\n device = device_list[0]\n print_with_color(f\"Device selected: {device}\", \"yellow\")\nelse:\n print_with_color(\"Please choose the Android device to start demo by entering its ID:\", \"blue\")",
+ "type": "code",
+ "location": "/scripts/step_recorder.py:38-67"
+ },
+ "157": {
+ "file_id": 11,
+ "content": "Creating directories for storing demo and task files, checking if devices are attached.",
+ "type": "comment"
+ },
+ "158": {
+ "file_id": 11,
+ "content": " device = input()\ncontroller = AndroidController(device)\nwidth, height = controller.get_device_size()\nif not width and not height:\n print_with_color(\"ERROR: Invalid device size!\", \"red\")\n sys.exit()\nprint_with_color(f\"Screen resolution of {device}: {width}x{height}\", \"yellow\")\nprint_with_color(\"Please state the goal of your following demo actions clearly, e.g. send a message to John\", \"blue\")\ntask_desc = input()\nwith open(task_desc_path, \"w\") as f:\n f.write(task_desc)\nprint_with_color(\"All interactive elements on the screen are labeled with red and blue numeric tags. Elements \"\n \"labeled with red tags are clickable elements; elements labeled with blue tags are scrollable \"\n \"elements.\", \"blue\")\nstep = 0\nwhile True:\n step += 1\n screenshot_path = controller.get_screenshot(f\"{demo_name}_{step}\", raw_ss_dir)\n xml_path = controller.get_xml(f\"{demo_name}_{step}\", xml_dir)\n if screenshot_path == \"ERROR\" or xml_path == \"ERROR\":\n break\n clickable_list = []",
+ "type": "code",
+ "location": "/scripts/step_recorder.py:68-92"
+ },
+ "159": {
+ "file_id": 11,
+ "content": "Device input and check for valid device size.\nGet device resolution and print it, request user to specify demo goal.\nSave goal description in file.\nLabel interactive elements with red and blue numeric tags, clickable with red, scrollable with blue.\nLoop to capture screenshots and XML until errors occur.",
+ "type": "comment"
+ },
+ "160": {
+ "file_id": 11,
+ "content": " focusable_list = []\n traverse_tree(xml_path, clickable_list, \"clickable\", True)\n traverse_tree(xml_path, focusable_list, \"focusable\", True)\n elem_list = clickable_list.copy()\n for elem in focusable_list:\n bbox = elem.bbox\n center = (bbox[0][0] + bbox[1][0]) // 2, (bbox[0][1] + bbox[1][1]) // 2\n close = False\n for e in clickable_list:\n bbox = e.bbox\n center_ = (bbox[0][0] + bbox[1][0]) // 2, (bbox[0][1] + bbox[1][1]) // 2\n dist = (abs(center[0] - center_[0]) ** 2 + abs(center[1] - center_[1]) ** 2) ** 0.5\n if dist <= configs[\"MIN_DIST\"]:\n close = True\n break\n if not close:\n elem_list.append(elem)\n labeled_img = draw_bbox_multi(screenshot_path, os.path.join(labeled_ss_dir, f\"{demo_name}_{step}.png\"), elem_list,\n True)\n cv2.imshow(\"image\", labeled_img)\n cv2.waitKey(0)\n cv2.destroyAllWindows()\n user_input = \"xxx\"\n print_with_col",
+ "type": "code",
+ "location": "/scripts/step_recorder.py:93-116"
+ },
+ "161": {
+ "file_id": 11,
+ "content": "Code comments:\n\n1. Traverse the XML tree to find clickable and focusable elements (lines 92-104).\n2. Copy the clickable list to a new list called elem_list.\n3. Check if any focusable element is close to any clickable element based on distance threshold (configs[\"MIN_DIST\"]). If not, add it to elem_list (lines 107-114).\n4. Display the labeled image with bounding boxes for elements in elem_list using draw_bbox_multi function.\n5. Show the image and wait for user input (cv2 functions).\n6. Set user_input variable to \"xxx\" and print with colored output.",
+ "type": "comment"
+ },
+ "162": {
+ "file_id": 11,
+ "content": "or(\"Choose one of the following actions you want to perform on the current screen:\\ntap, text, long \"\n \"press, swipe, stop\", \"blue\")\n while user_input.lower() != \"tap\" and user_input.lower() != \"text\" and user_input.lower() != \"long press\" \\\n and user_input.lower() != \"swipe\" and user_input.lower() != \"stop\":\n user_input = input()\n if user_input.lower() == \"tap\":\n print_with_color(f\"Which element do you want to tap? Choose a numeric tag from 1 to {len(elem_list)}:\", \"blue\")\n user_input = \"xxx\"\n while not user_input.isnumeric() or int(user_input) > len(elem_list) or int(user_input) < 1:\n user_input = input()\n tl, br = elem_list[int(user_input) - 1].bbox\n x, y = (tl[0] + br[0]) // 2, (tl[1] + br[1]) // 2\n ret = controller.tap(x, y)\n if ret == \"ERROR\":\n print_with_color(\"ERROR: tap execution failed\", \"red\")\n break\n record_file.write(f\"tap({int(user_input)}):::{elem_list[int(user_input) - 1].uid}\\n\")",
+ "type": "code",
+ "location": "/scripts/step_recorder.py:116-132"
+ },
+ "163": {
+ "file_id": 11,
+ "content": "This code asks the user to choose an action from a list of options. It continues to ask for input until the correct option is chosen. If \"tap\" is chosen, it prompts for the element to tap by its numeric tag, then taps the element on screen and writes a record if successful.",
+ "type": "comment"
+ },
+ "164": {
+ "file_id": 11,
+ "content": " elif user_input.lower() == \"text\":\n print_with_color(f\"Which element do you want to input the text string? Choose a numeric tag from 1 to \"\n f\"{len(elem_list)}:\", \"blue\")\n input_area = \"xxx\"\n while not input_area.isnumeric() or int(input_area) > len(elem_list) or int(input_area) < 1:\n input_area = input()\n print_with_color(\"Enter your input text below:\", \"blue\")\n user_input = \"\"\n while not user_input:\n user_input = input()\n controller.text(user_input)\n record_file.write(f\"text({input_area}:sep:\\\"{user_input}\\\"):::{elem_list[int(input_area) - 1].uid}\\n\")\n elif user_input.lower() == \"long press\":\n print_with_color(f\"Which element do you want to long press? Choose a numeric tag from 1 to {len(elem_list)}:\",\n \"blue\")\n user_input = \"xxx\"\n while not user_input.isnumeric() or int(user_input) > len(elem_list) or int(user_input) < 1:\n user_input = input()",
+ "type": "code",
+ "location": "/scripts/step_recorder.py:133-150"
+ },
+ "165": {
+ "file_id": 11,
+ "content": "This code segment allows the user to input text or simulate a long press on an element by choosing a numeric tag from 1 to the total number of elements in the list. It prompts for the input, validates the input, and performs the corresponding action using the controller object. The data is then recorded in a file with appropriate formatting.",
+ "type": "comment"
+ },
+ "166": {
+ "file_id": 11,
+ "content": " tl, br = elem_list[int(user_input) - 1].bbox\n x, y = (tl[0] + br[0]) // 2, (tl[1] + br[1]) // 2\n ret = controller.long_press(x, y)\n if ret == \"ERROR\":\n print_with_color(\"ERROR: long press execution failed\", \"red\")\n break\n record_file.write(f\"long_press({int(user_input)}):::{elem_list[int(user_input) - 1].uid}\\n\")\n elif user_input.lower() == \"swipe\":\n print_with_color(f\"What is the direction of your swipe? Choose one from the following options:\\nup, down, left,\"\n f\" right\", \"blue\")\n user_input = \"\"\n while user_input != \"up\" and user_input != \"down\" and user_input != \"left\" and user_input != \"right\":\n user_input = input()\n swipe_dir = user_input\n print_with_color(f\"Which element do you want to swipe? Choose a numeric tag from 1 to {len(elem_list)}:\")\n while not user_input.isnumeric() or int(user_input) > len(elem_list) or int(user_input) < 1:\n user_input = input()",
+ "type": "code",
+ "location": "/scripts/step_recorder.py:151-167"
+ },
+ "167": {
+ "file_id": 11,
+ "content": "The code is prompting the user for input to perform a long press or swipe action on an element from a list. It retrieves the bounding box coordinates, calculates the center point, performs the requested action, and logs the information if it was successful.",
+ "type": "comment"
+ },
+ "168": {
+ "file_id": 11,
+ "content": " tl, br = elem_list[int(user_input) - 1].bbox\n x, y = (tl[0] + br[0]) // 2, (tl[1] + br[1]) // 2\n ret = controller.swipe(x, y, swipe_dir)\n if ret == \"ERROR\":\n print_with_color(\"ERROR: swipe execution failed\", \"red\")\n break\n record_file.write(f\"swipe({int(user_input)}:sep:{swipe_dir}):::{elem_list[int(user_input) - 1].uid}\\n\")\n elif user_input.lower() == \"stop\":\n record_file.write(\"stop\\n\")\n record_file.close()\n break\n else:\n break\n time.sleep(3)\nprint_with_color(f\"Demonstration phase completed. {step} steps were recorded.\", \"yellow\")",
+ "type": "code",
+ "location": "/scripts/step_recorder.py:168-183"
+ },
+ "169": {
+ "file_id": 11,
+ "content": "This code takes user input to record a swipe action, writes it into a file along with the element's unique ID, and handles stopping the recording. If swipe execution fails, it prints an error message and breaks the loop. After completion, it displays the number of steps recorded in yellow color.",
+ "type": "comment"
+ },
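For reference, the record file produced by a demonstration contains one line per step in the formats written above. The sketch below just prints what a short hypothetical session could look like; the element indices and resource IDs are invented for illustration.

```python
# Hypothetical record.txt contents; indices and resource IDs are illustrative only.
example_record = "\n".join([
    'tap(3):::com.example.app:id/compose_button',
    'text(1:sep:"Hello, world!"):::com.example.app:id/message_field',
    'long_press(2):::com.example.app:id/list_item',
    'swipe(5:sep:up):::com.example.app:id/feed_list',
    'stop',
])
print(example_record)
```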
+ "170": {
+ "file_id": 12,
+ "content": "/scripts/task_executor.py",
+ "type": "filepath"
+ },
+ "171": {
+ "file_id": 12,
+ "content": "The code imports libraries, handles user inputs and directory creation, verifies grid images, populates lists from XML tree, manages UI elements, replaces placeholders with data, handles actions like tap/text/long press, logs and parses responses based on grid setting, checks for errors, and prints success or error messages.",
+ "type": "summary"
+ },
+ "172": {
+ "file_id": 12,
+ "content": "import argparse\nimport ast\nimport datetime\nimport json\nimport os\nimport re\nimport sys\nimport time\nimport prompts\nfrom config import load_config\nfrom and_controller import list_all_devices, AndroidController, traverse_tree\nfrom model import ask_gpt4v, parse_explore_rsp, parse_grid_rsp\nfrom utils import print_with_color, draw_bbox_multi, encode_image, draw_grid\narg_desc = \"AppAgent Executor\"\nparser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=arg_desc)\nparser.add_argument(\"--app\")\nparser.add_argument(\"--root_dir\", default=\"./\")\nargs = vars(parser.parse_args())\nconfigs = load_config()\napp = args[\"app\"]\nroot_dir = args[\"root_dir\"]\nif not app:\n print_with_color(\"What is the name of the app you want me to operate?\", \"blue\")\n app = input()\n app = app.replace(\" \", \"\")\napp_dir = os.path.join(os.path.join(root_dir, \"apps\"), app)\nwork_dir = os.path.join(root_dir, \"tasks\")\nif not os.path.exists(work_dir):\n os.mkdir(work_dir)\nauto_docs_dir = os.path.join(app_dir, \"auto_docs\")",
+ "type": "code",
+ "location": "/scripts/task_executor.py:1-36"
+ },
+ "173": {
+ "file_id": 12,
+ "content": "Code imports necessary libraries, sets up argument parsing for executing the AppAgent tasks. It defines the description of the executor, loads configuration from config file and gets the name of app to be operated. If no app name is given, it prompts user for input and proceeds with executing tasks for specified app. It also creates work directory if it does not exist and defines auto_docs_dir in the app directory.",
+ "type": "comment"
+ },
+ "174": {
+ "file_id": 12,
+ "content": "demo_docs_dir = os.path.join(app_dir, \"demo_docs\")\ntask_timestamp = int(time.time())\ndir_name = datetime.datetime.fromtimestamp(task_timestamp).strftime(f\"task_{app}_%Y-%m-%d_%H-%M-%S\")\ntask_dir = os.path.join(work_dir, dir_name)\nos.mkdir(task_dir)\nlog_path = os.path.join(task_dir, f\"log_{app}_{dir_name}.txt\")\nno_doc = False\nif not os.path.exists(auto_docs_dir) and not os.path.exists(demo_docs_dir):\n print_with_color(f\"No documentations found for the app {app}. Do you want to proceed with no docs? Enter y or n\",\n \"red\")\n user_input = \"\"\n while user_input != \"y\" and user_input != \"n\":\n user_input = input().lower()\n if user_input == \"y\":\n no_doc = True\n else:\n sys.exit()\nelif os.path.exists(auto_docs_dir) and os.path.exists(demo_docs_dir):\n print_with_color(f\"The app {app} has documentations generated from both autonomous exploration and human \"\n f\"demonstration. Which one do you want to use? Type 1 or 2.\\n1. Autonomous exploration\\n2. Human \"",
+ "type": "code",
+ "location": "/scripts/task_executor.py:37-57"
+ },
+ "175": {
+ "file_id": 12,
+ "content": "Creating a new task directory with a timestamped name, checking for documentation directories, and asking user to choose which documentation to use or if no documents are found, prompting the user whether to proceed without documents.",
+ "type": "comment"
+ },
+ "176": {
+ "file_id": 12,
+ "content": " f\"Demonstration\",\n \"blue\")\n user_input = \"\"\n while user_input != \"1\" and user_input != \"2\":\n user_input = input()\n if user_input == \"1\":\n docs_dir = auto_docs_dir\n else:\n docs_dir = demo_docs_dir\nelif os.path.exists(auto_docs_dir):\n print_with_color(f\"Documentations generated from autonomous exploration were found for the app {app}. The doc base \"\n f\"is selected automatically.\", \"yellow\")\n docs_dir = auto_docs_dir\nelse:\n print_with_color(f\"Documentations generated from human demonstration were found for the app {app}. The doc base is \"\n f\"selected automatically.\", \"yellow\")\n docs_dir = demo_docs_dir\ndevice_list = list_all_devices()\nif not device_list:\n print_with_color(\"ERROR: No device found!\", \"red\")\n sys.exit()\nprint_with_color(f\"List of devices attached:\\n{str(device_list)}\", \"yellow\")\nif len(device_list) == 1:\n device = device_list[0]\n print_with_color(f\"Device selected: {device}\", \"yellow\")",
+ "type": "code",
+ "location": "/scripts/task_executor.py:58-83"
+ },
+ "177": {
+ "file_id": 12,
+ "content": "This code snippet prompts the user to select a document base from either automatically generated ones or demonstration ones. It then checks if any devices are attached and prints relevant messages based on the conditions met.",
+ "type": "comment"
+ },
+ "178": {
+ "file_id": 12,
+ "content": "else:\n print_with_color(\"Please choose the Android device to start demo by entering its ID:\", \"blue\")\n device = input()\ncontroller = AndroidController(device)\nwidth, height = controller.get_device_size()\nif not width and not height:\n print_with_color(\"ERROR: Invalid device size!\", \"red\")\n sys.exit()\nprint_with_color(f\"Screen resolution of {device}: {width}x{height}\", \"yellow\")\nprint_with_color(\"Please enter the description of the task you want me to complete in a few sentences:\", \"blue\")\ntask_desc = input()\nround_count = 0\nlast_act = \"None\"\ntask_complete = False\ngrid_on = False\nrows, cols = 0, 0\ndef area_to_xy(area, subarea):\n area -= 1\n row, col = area // cols, area % cols\n x_0, y_0 = col * (width // cols), row * (height // rows)\n if subarea == \"top-left\":\n x, y = x_0 + (width // cols) // 4, y_0 + (height // rows) // 4\n elif subarea == \"top\":\n x, y = x_0 + (width // cols) // 2, y_0 + (height // rows) // 4\n elif subarea == \"top-right\":\n x, y = x_0 + (width // cols) * 3 // 4, y_0 + (height // rows) // 4",
+ "type": "code",
+ "location": "/scripts/task_executor.py:84-113"
+ },
+ "179": {
+ "file_id": 12,
+ "content": "User is prompted to choose an Android device for the demo by entering its ID. The device size is checked and if it's invalid, an error is displayed and the program exits. Otherwise, user is asked to provide a task description in a few sentences. A function `area_to_xy` is defined to convert area number to x-y coordinates on the screen. \n\nStorage location: \"AppAgent/scripts/task_executor.py\":115-142\nCode:\n```\ndef get_subarea(area):\n while True:\n try:\n subarea = input(\"Please enter the subarea (top, top-left, top-right) for area \" + str(area+1) + \":\").lower()\n if subarea in (\"top\", \"top-left\", \"top-right\"):\n return subarea\n except Exception as e:\n print_with_color(\"ERROR:\", \"red\")\n print(e)\n```\nComment for code:\n\nThe function `get_subarea` prompts the user to enter the sub-areas for each area in a loop until valid input is provided. Valid inputs are 'top', 'top-left' or 'top-right'. If invalid input is entered, an error message is displayed along with the exception and the program continues to prompt until valid input is given.",
+ "type": "comment"
+ },
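As a worked example of the grid-to-coordinate mapping defined above (and continued in the next snippet), the sketch below reimplements only the default center case for an assumed 1080x1920 screen split into 4 columns and 8 rows; the real function also handles the eight edge and corner subareas and takes its grid shape from draw_grid.

```python
# Worked example: map a grid area label to screen coordinates (center subarea only).
width, height = 1080, 1920   # assumed screen resolution for illustration
rows, cols = 8, 4            # assumed grid shape; the real values come from draw_grid

def area_center(area: int):
    area -= 1                                   # grid labels start at 1
    row, col = area // cols, area % cols
    x_0, y_0 = col * (width // cols), row * (height // rows)
    return x_0 + (width // cols) // 2, y_0 + (height // rows) // 2

print(area_center(1))   # (135, 120): center of the top-left cell
print(area_center(6))   # (405, 360): second row, second column
```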
+ "180": {
+ "file_id": 12,
+ "content": " elif subarea == \"left\":\n x, y = x_0 + (width // cols) // 4, y_0 + (height // rows) // 2\n elif subarea == \"right\":\n x, y = x_0 + (width // cols) * 3 // 4, y_0 + (height // rows) // 2\n elif subarea == \"bottom-left\":\n x, y = x_0 + (width // cols) // 4, y_0 + (height // rows) * 3 // 4\n elif subarea == \"bottom\":\n x, y = x_0 + (width // cols) // 2, y_0 + (height // rows) * 3 // 4\n elif subarea == \"bottom-right\":\n x, y = x_0 + (width // cols) * 3 // 4, y_0 + (height // rows) * 3 // 4\n else:\n x, y = x_0 + (width // cols) // 2, y_0 + (height // rows) // 2\n return x, y\nwhile round_count < configs[\"MAX_ROUNDS\"]:\n round_count += 1\n print_with_color(f\"Round {round_count}\", \"yellow\")\n screenshot_path = controller.get_screenshot(f\"{dir_name}_{round_count}\", task_dir)\n xml_path = controller.get_xml(f\"{dir_name}_{round_count}\", task_dir)\n if screenshot_path == \"ERROR\" or xml_path == \"ERROR\":\n break\n if grid_on:\n rows, cols = draw_grid(screenshot_path, os.path.join(task_dir, f\"{dir_name}_{round_count}_grid.png\"))",
+ "type": "code",
+ "location": "/scripts/task_executor.py:114-137"
+ },
+ "181": {
+ "file_id": 12,
+ "content": "Code calculates the coordinates for subareas of a screenshot and continues with round processing. It checks if screenshots or XML paths are errors, then breaks if so. If grid is on, it draws the grid.",
+ "type": "comment"
+ },
+ "182": {
+ "file_id": 12,
+ "content": " base64_img = encode_image(os.path.join(task_dir, f\"{dir_name}_{round_count}_grid.png\"))\n prompt = prompts.task_template_grid\n else:\n clickable_list = []\n focusable_list = []\n traverse_tree(xml_path, clickable_list, \"clickable\", True)\n traverse_tree(xml_path, focusable_list, \"focusable\", True)\n elem_list = clickable_list.copy()\n for elem in focusable_list:\n bbox = elem.bbox\n center = (bbox[0][0] + bbox[1][0]) // 2, (bbox[0][1] + bbox[1][1]) // 2\n close = False\n for e in clickable_list:\n bbox = e.bbox\n center_ = (bbox[0][0] + bbox[1][0]) // 2, (bbox[0][1] + bbox[1][1]) // 2\n dist = (abs(center[0] - center_[0]) ** 2 + abs(center[1] - center_[1]) ** 2) ** 0.5\n if dist <= configs[\"MIN_DIST\"]:\n close = True\n break\n if not close:\n elem_list.append(elem)\n draw_bbox_multi(screenshot_path, os.path.join(task_dir, f\"{dir_name}_{round_count}_labeled.png\"), elem_list,",
+ "type": "code",
+ "location": "/scripts/task_executor.py:138-159"
+ },
+ "183": {
+ "file_id": 12,
+ "content": "If `os.path.join(task_dir, f\"{dir_name}_{round_count}_grid.png\")` exists:\n- Encode the image using `encode_image()`.\n- Set prompt to `prompts.task_template_grid`.\nElse:\n- Create empty `clickable_list` and `focusable_list`.\n- Traverse XML tree to populate `clickable_list` and `focusable_list`.\n- Combine `clickable_list` and `focusable_list` into `elem_list`, excluding duplicates.\n- Draw bounding boxes for elements in `elem_list` on `screenshot_path` and save as `os.path.join(task_dir, f\"{dir_name}_{round_count}_labeled.png\")`.",
+ "type": "comment"
+ },
+ "184": {
+ "file_id": 12,
+ "content": " dark_mode=configs[\"DARK_MODE\"])\n base64_img = encode_image(os.path.join(task_dir, f\"{dir_name}_{round_count}_labeled.png\"))\n if no_doc:\n prompt = re.sub(r\"\", \"\", prompts.task_template)\n else:\n ui_doc = \"\"\"\n You also have access to the following documentations that describes the functionalities of UI \n elements you can interact on the screen. These docs are crucial for you to determine the target of your \n next action. You should always prioritize these documented elements for interaction:\"\"\"\n for i, elem in enumerate(elem_list):\n doc_path = os.path.join(docs_dir, f\"{elem.uid}.txt\")\n if not os.path.exists(doc_path):\n continue\n ui_doc += f\"Documentation of UI element labeled with the numeric tag '{i + 1}':\\n\"\n doc_content = ast.literal_eval(open(doc_path, \"r\").read())\n if doc_content[\"tap\"]:",
+ "type": "code",
+ "location": "/scripts/task_executor.py:160-175"
+ },
+ "185": {
+ "file_id": 12,
+ "content": "This code is checking if there are any documentation files for UI elements and constructing the prompt accordingly. If there are no documentation files, it removes the \"\" placeholder from the task template. Otherwise, it adds a formatted documentation section to the prompt, listing each element's documentation file path and content.",
+ "type": "comment"
+ },
+ "186": {
+ "file_id": 12,
+ "content": " ui_doc += f\"This UI element is clickable. {doc_content['tap']}\\n\\n\"\n if doc_content[\"text\"]:\n ui_doc += f\"This UI element can receive text input. The text input is used for the following \" \\\n f\"purposes: {doc_content['text']}\\n\\n\"\n if doc_content[\"long_press\"]:\n ui_doc += f\"This UI element is long clickable. {doc_content['long_press']}\\n\\n\"\n if doc_content[\"v_swipe\"]:\n ui_doc += f\"This element can be swiped directly without tapping. You can swipe vertically on \" \\\n f\"this UI element. {doc_content['v_swipe']}\\n\\n\"\n if doc_content[\"h_swipe\"]:\n ui_doc += f\"This element can be swiped directly without tapping. You can swipe horizontally on \" \\\n f\"this UI element. {doc_content['h_swipe']}\\n\\n\"\n print_with_color(f\"Documentations retrieved for the current interface:\\n{ui_doc}\", \"magenta\")",
+ "type": "code",
+ "location": "/scripts/task_executor.py:176-188"
+ },
+ "187": {
+ "file_id": 12,
+ "content": "This code retrieves UI documentation for an interface and prints it in color. It includes clickability, text input, long press, vertical swipe, and horizontal swipe information.",
+ "type": "comment"
+ },
+ "188": {
+ "file_id": 12,
+ "content": " prompt = re.sub(r\"\", ui_doc, prompts.task_template)\n prompt = re.sub(r\"\", task_desc, prompt)\n prompt = re.sub(r\"\", last_act, prompt)\n content = [\n {\n \"type\": \"text\",\n \"text\": prompt\n },\n {\n \"type\": \"image_url\",\n \"image_url\": {\n \"url\": f\"data:image/jpeg;base64,{base64_img}\"\n }\n }\n ]\n print_with_color(\"Thinking about what to do in the next step...\", \"yellow\")\n rsp = ask_gpt4v(content)\n if \"error\" not in rsp:\n with open(log_path, \"a\") as logfile:\n log_item = {\"step\": round_count, \"prompt\": prompt, \"image\": f\"{dir_name}_{round_count}_labeled.png\",\n \"response\": rsp}\n logfile.write(json.dumps(log_item) + \"\\n\")\n if grid_on:\n res = parse_grid_rsp(rsp)\n else:\n res = parse_explore_rsp(rsp)\n act_name = res[0]\n if act_name == \"FINISH\":\n task_complete = True",
+ "type": "code",
+ "location": "/scripts/task_executor.py:189-218"
+ },
+ "189": {
+ "file_id": 12,
+ "content": "This code is creating a prompt by replacing placeholders with relevant information, then sending it to an AI model for response. If there's no error in the response, log the prompt, image, and response, and parse the response based on the grid setting. If the action name is \"FINISH\", set task_complete to True.",
+ "type": "comment"
+ },
+ "190": {
+ "file_id": 12,
+ "content": " break\n if act_name == \"ERROR\":\n break\n last_act = res[-1]\n res = res[:-1]\n if act_name == \"tap\":\n _, area = res\n tl, br = elem_list[area - 1].bbox\n x, y = (tl[0] + br[0]) // 2, (tl[1] + br[1]) // 2\n ret = controller.tap(x, y)\n if ret == \"ERROR\":\n print_with_color(\"ERROR: tap execution failed\", \"red\")\n break\n elif act_name == \"text\":\n _, input_str = res\n ret = controller.text(input_str)\n if ret == \"ERROR\":\n print_with_color(\"ERROR: text execution failed\", \"red\")\n break\n elif act_name == \"long_press\":\n _, area = res\n tl, br = elem_list[area - 1].bbox\n x, y = (tl[0] + br[0]) // 2, (tl[1] + br[1]) // 2\n ret = controller.long_press(x, y)\n if ret == \"ERROR\":\n print_with_color(\"ERROR: long press execution failed\", \"red\")\n break",
+ "type": "code",
+ "location": "/scripts/task_executor.py:219-245"
+ },
+ "191": {
+ "file_id": 12,
+ "content": "Code handles various actions such as tap, text, and long press based on the given action name. It also checks for errors during execution and breaks if an error occurs.",
+ "type": "comment"
+ },
+ "192": {
+ "file_id": 12,
+ "content": " elif act_name == \"swipe\":\n _, area, swipe_dir, dist = res\n tl, br = elem_list[area - 1].bbox\n x, y = (tl[0] + br[0]) // 2, (tl[1] + br[1]) // 2\n ret = controller.swipe(x, y, swipe_dir, dist)\n if ret == \"ERROR\":\n print_with_color(\"ERROR: swipe execution failed\", \"red\")\n break\n elif act_name == \"grid\":\n grid_on = True\n elif act_name == \"tap_grid\" or act_name == \"long_press_grid\":\n _, area, subarea = res\n x, y = area_to_xy(area, subarea)\n if act_name == \"tap_grid\":\n ret = controller.tap(x, y)\n if ret == \"ERROR\":\n print_with_color(\"ERROR: tap execution failed\", \"red\")\n break\n else:\n ret = controller.long_press(x, y)\n if ret == \"ERROR\":\n print_with_color(\"ERROR: tap execution failed\", \"red\")\n break\n elif act_name == \"swipe_grid\":",
+ "type": "code",
+ "location": "/scripts/task_executor.py:246-269"
+ },
+ "193": {
+ "file_id": 12,
+ "content": "This code handles different types of actions such as \"swipe\", \"grid\", \"tap_grid\", and \"long_press_grid\". If the action is \"swipe\", it executes a swipe on the screen with specified direction and distance. If it fails, it prints an error message. For grid-related actions, it maps the area and subarea to coordinates and performs either a tap or long press accordingly. Again, if there's an error, it prints an error message.",
+ "type": "comment"
+ },
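A minimal sketch of the area-and-subarea mapping described in this entry, mirroring the arithmetic of the `area_to_xy` helper shown later in `task_executor.py` (the cell size is the screen size divided by the number of grid columns/rows; the subarea offsets the point within the cell):

```python
def area_to_xy(area, subarea, width, height, rows, cols):
    """Map a 1-based grid area number and a subarea name to screen coordinates."""
    area -= 1
    row, col = area // cols, area % cols
    cell_w, cell_h = width // cols, height // rows
    x_0, y_0 = col * cell_w, row * cell_h
    # Offsets within the cell; the cell center is the default.
    offsets = {
        "top-left": (cell_w // 4, cell_h // 4),
        "top": (cell_w // 2, cell_h // 4),
        "top-right": (cell_w * 3 // 4, cell_h // 4),
        "left": (cell_w // 4, cell_h // 2),
        "right": (cell_w * 3 // 4, cell_h // 2),
        "bottom-left": (cell_w // 4, cell_h * 3 // 4),
        "bottom": (cell_w // 2, cell_h * 3 // 4),
        "bottom-right": (cell_w * 3 // 4, cell_h * 3 // 4),
    }
    dx, dy = offsets.get(subarea, (cell_w // 2, cell_h // 2))
    return x_0 + dx, y_0 + dy

# Example: a 1080x2340 screen split into 18 rows x 9 columns;
# the center of area 1 is at (60, 65).
print(area_to_xy(1, "center", 1080, 2340, 18, 9))
```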
+ "194": {
+ "file_id": 12,
+ "content": " _, start_area, start_subarea, end_area, end_subarea = res\n start_x, start_y = area_to_xy(start_area, start_subarea)\n end_x, end_y = area_to_xy(end_area, end_subarea)\n ret = controller.swipe_precise((start_x, start_y), (end_x, end_y))\n if ret == \"ERROR\":\n print_with_color(\"ERROR: tap execution failed\", \"red\")\n break\n if act_name != \"grid\":\n grid_on = False\n time.sleep(configs[\"REQUEST_INTERVAL\"])\n else:\n print_with_color(rsp[\"error\"][\"message\"], \"red\")\n break\nif task_complete:\n print_with_color(\"Task completed successfully\", \"yellow\")\nelif round_count == configs[\"MAX_ROUNDS\"]:\n print_with_color(\"Task finished due to reaching max rounds\", \"yellow\")\nelse:\n print_with_color(\"Task finished unexpectedly\", \"red\")",
+ "type": "code",
+ "location": "/scripts/task_executor.py:270-289"
+ },
+ "195": {
+ "file_id": 12,
+ "content": "This code executes a swipe action with precise coordinates, checks for errors, and prints error or success messages based on task completion status.",
+ "type": "comment"
+ },
+ "196": {
+ "file_id": 13,
+ "content": "/scripts/utils.py",
+ "type": "filepath"
+ },
+ "197": {
+ "file_id": 13,
+ "content": "The code defines `print_with_color` and `draw_bbox_multi` functions, used to print text with different colors, draw bounding boxes on images, read images, draw grid lines and labels, save changes using OpenCV's putText, and encode in base64 format.",
+ "type": "summary"
+ },
+ "198": {
+ "file_id": 13,
+ "content": "import base64\nimport cv2\nimport pyshine as ps\nfrom colorama import Fore, Style\ndef print_with_color(text: str, color=\"\"):\n if color == \"red\":\n print(Fore.RED + text)\n elif color == \"green\":\n print(Fore.GREEN + text)\n elif color == \"yellow\":\n print(Fore.YELLOW + text)\n elif color == \"blue\":\n print(Fore.BLUE + text)\n elif color == \"magenta\":\n print(Fore.MAGENTA + text)\n elif color == \"cyan\":\n print(Fore.CYAN + text)\n elif color == \"white\":\n print(Fore.WHITE + text)\n elif color == \"black\":\n print(Fore.BLACK + text)\n else:\n print(text)\n print(Style.RESET_ALL)\ndef draw_bbox_multi(img_path, output_path, elem_list, record_mode=False, dark_mode=False):\n imgcv = cv2.imread(img_path)\n count = 1\n for elem in elem_list:\n try:\n top_left = elem.bbox[0]\n bottom_right = elem.bbox[1]\n left, top = top_left[0], top_left[1]\n right, bottom = bottom_right[0], bottom_right[1]\n label = str(count)",
+ "type": "code",
+ "location": "/scripts/utils.py:1-39"
+ },
+ "199": {
+ "file_id": 13,
+ "content": "The code defines a function `print_with_color` that allows printing text with different colors and a function `draw_bbox_multi` that draws bounding boxes on an image. The `print_with_color` function takes in a string to print and an optional color argument, which can be \"red\", \"green\", \"yellow\", \"blue\", \"magenta\", \"cyan\", \"white\" or \"black\". It then prints the text with the specified color. The `draw_bbox_multi` function reads an image from a file, loops through a list of elements (each having bounding box coordinates), and draws rectangles around each element on the image. Optionally, it can also keep track of the elements order in the record mode and use dark colors in dark mode.",
+ "type": "comment"
+ }
+}
\ No newline at end of file
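A short usage sketch of the utilities summarized above, assuming a screenshot named `screen.png` exists and the script runs alongside `scripts/utils.py`; the stand-in element is hypothetical:

```python
from types import SimpleNamespace

from utils import draw_bbox_multi, encode_image, print_with_color

print_with_color("Labeling screenshot...", "yellow")

# A stand-in element with the bbox layout draw_bbox_multi expects:
# ((left, top), (right, bottom)).
elem = SimpleNamespace(uid="demo", attrib="clickable", bbox=((100, 200), (300, 400)))
draw_bbox_multi("screen.png", "screen_labeled.png", [elem])

# The labeled image can then be base64-encoded for a GPT-4V request.
base64_img = encode_image("screen_labeled.png")
```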
diff --git a/docs/data/2.json b/docs/data/2.json
new file mode 100644
index 0000000..cfdce16
--- /dev/null
+++ b/docs/data/2.json
@@ -0,0 +1,35 @@
+{
+ "200": {
+ "file_id": 13,
+ "content": " if record_mode:\n if elem.attrib == \"clickable\":\n color = (250, 0, 0)\n elif elem.attrib == \"focusable\":\n color = (0, 0, 250)\n else:\n color = (0, 250, 0)\n imgcv = ps.putBText(imgcv, label, text_offset_x=(left + right) // 2 + 10, text_offset_y=(top + bottom) // 2 + 10,\n vspace=10, hspace=10, font_scale=1, thickness=2, background_RGB=color,\n text_RGB=(255, 250, 250), alpha=0.5)\n else:\n text_color = (10, 10, 10) if dark_mode else (255, 250, 250)\n bg_color = (255, 250, 250) if dark_mode else (10, 10, 10)\n imgcv = ps.putBText(imgcv, label, text_offset_x=(left + right) // 2 + 10, text_offset_y=(top + bottom) // 2 + 10,\n vspace=10, hspace=10, font_scale=1, thickness=2, background_RGB=bg_color,\n text_RGB=text_color, alpha=0.5)",
+ "type": "code",
+ "location": "/scripts/utils.py:40-55"
+ },
+ "201": {
+ "file_id": 13,
+ "content": "Code checks if record mode is active and assigns a color based on whether the element is clickable or focusable. If not in record mode, it sets colors depending on dark/light mode. Then, puts bounding text on the image with specified colors and positions.",
+ "type": "comment"
+ },
+ "202": {
+ "file_id": 13,
+ "content": " except Exception as e:\n print_with_color(f\"ERROR: An exception occurs while labeling the image\\n{e}\", \"red\")\n count += 1\n cv2.imwrite(output_path, imgcv)\n return imgcv\ndef draw_grid(img_path, output_path):\n def get_unit_len(n):\n for i in range(1, n + 1):\n if n % i == 0 and 120 <= i <= 180:\n return i\n return -1\n image = cv2.imread(img_path)\n height, width, _ = image.shape\n color = (255, 116, 113)\n unit_height = get_unit_len(height)\n if unit_height < 0:\n unit_height = 120\n unit_width = get_unit_len(width)\n if unit_width < 0:\n unit_width = 120\n thick = int(unit_width // 50)\n rows = height // unit_height\n cols = width // unit_width\n for i in range(rows):\n for j in range(cols):\n label = i * cols + j + 1\n left = int(j * unit_width)\n top = int(i * unit_height)\n right = int((j + 1) * unit_width)\n bottom = int((i + 1) * unit_height)\n cv2.rectangle(image, (left, top), (right, bottom), color, thick // 2)",
+ "type": "code",
+ "location": "/scripts/utils.py:56-89"
+ },
+ "203": {
+ "file_id": 13,
+ "content": "This code reads an image and draws grid lines on it, dividing the image into equally sized squares. The size of the squares can be customized based on the image's width and height. If any exceptions occur while labeling the image, it will print an error message in red color.",
+ "type": "comment"
+ },
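To make the cell-size selection concrete: `get_unit_len` returns the smallest divisor of the image dimension that lies between 120 and 180 pixels, and the caller falls back to 120 when no such divisor exists. A short worked example for a hypothetical 1080x2340 screenshot:

```python
def get_unit_len(n):
    # Smallest divisor of n in the 120-180 pixel range, or -1 if none exists.
    for i in range(1, n + 1):
        if n % i == 0 and 120 <= i <= 180:
            return i
    return -1

width, height = 1080, 2340
unit_width = get_unit_len(width)     # 120, since 1080 = 120 * 9
unit_height = get_unit_len(height)   # 130, since 2340 = 130 * 18
print(width // unit_width, height // unit_height)  # 9 columns, 18 rows
```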
+ "204": {
+ "file_id": 13,
+ "content": " cv2.putText(image, str(label), (left + int(unit_width * 0.05) + 3, top + int(unit_height * 0.3) + 3), 0,\n int(0.01 * unit_width), (0, 0, 0), thick)\n cv2.putText(image, str(label), (left + int(unit_width * 0.05), top + int(unit_height * 0.3)), 0,\n int(0.01 * unit_width), color, thick)\n cv2.imwrite(output_path, image)\n return rows, cols\ndef encode_image(image_path):\n with open(image_path, \"rb\") as image_file:\n return base64.b64encode(image_file.read()).decode('utf-8')",
+ "type": "code",
+ "location": "/scripts/utils.py:90-100"
+ },
+ "205": {
+ "file_id": 13,
+ "content": "This code is used to draw text on an image and then save the modified image using OpenCV's putText function. The image is then encoded in base64 format.",
+ "type": "comment"
+ }
+}
\ No newline at end of file
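For reference, the base64 string produced by `encode_image` is what the other scripts embed in the `image_url` field of a GPT-4V request. A minimal sketch, assuming a labeled screenshot file exists:

```python
import base64

def encode_image(image_path):
    # Same behavior as scripts/utils.py: read the file and base64-encode it.
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")

base64_img = encode_image("screen_labeled.png")
content = [
    {"type": "text", "text": "Prompt text goes here."},
    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_img}"}},
]
```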
diff --git a/docs/data/titles/0.json b/docs/data/titles/0.json
new file mode 100644
index 0000000..32e6dd1
--- /dev/null
+++ b/docs/data/titles/0.json
@@ -0,0 +1,102 @@
+{
+ "/README.md": "Open-Source AppAgent for Android Studio",
+ "/README.md:1-13": "Open-Source AppAgent: Powering AI-Driven Mobile Apps",
+ "/README.md:107-127": "Autonomous Exploration and Human Demonstration",
+ "/README.md:128-147": "Running and Demonstrating App with `learn.py` and `run.py`",
+ "/README.md:13-28": "AppAgent README Updates",
+ "/README.md:147-168": "Guide to Running AppAgent and Documentation Improvement",
+ "/README.md:169-182": "Publication Details Format",
+ "/README.md:29-45": "Introducing AppAgent: Multimodal Smartphone App Framework",
+ "/README.md:47-70": "Quick Start GPT-4 Vision Android",
+ "/README.md:70-88": "Run Android Apps on Emulator with AppAgent",
+ "/README.md:90-106": "Customizing GPT-4V Requests",
+ "/config.yaml": "App Config File Setup",
+ "/config.yaml:1-11": "Configuring Settings",
+ "/config.yaml:11-14": "Config File Customization",
+ "/learn.py": "AppAgent Exploration Interface & Document Generator",
+ "/learn.py:1-23": "AppAgent's Exploration Phase Argument Parser",
+ "/learn.py:24-44": "Agent Exploration Mode Selector",
+ "/learn.py:44-44": "Document Generator Script",
+ "/requirements.txt": "Required Dependencies",
+ "/run.py": "Command Line Interface Setup",
+ "/scripts/and_controller.py": "Android Controller Script",
+ "/scripts/and_controller.py:1-43": "Android Device Controller",
+ "/scripts/and_controller.py:113-137": "Android Debug Bridge Commands Execution Class",
+ "/scripts/and_controller.py:138-166": "Touch Actions Script",
+ "/scripts/and_controller.py:167-180": "ADB Swipe Control Script",
+ "/scripts/and_controller.py:44-65": "Unique XML Element Identifier Generator",
+ "/scripts/and_controller.py:66-90": "Bounding Box Android Element Creator",
+ "/scripts/and_controller.py:91-112": "Android Device Controller: Device Size and Screenshot Functionality",
+ "/scripts/config.py": "YAML Config Loader",
+ "/scripts/document_generation.py": "Document Generation Script",
+ "/scripts/document_generation.py:1-35": "Setting Up AppAgent: Arguments and Directories",
+ "/scripts/document_generation.py:131-141": "Error Handling Script",
+ "/scripts/document_generation.py:36-57": "Directory Creation and Data Processing",
+ "/scripts/document_generation.py:58-75": "User Action Prompt Generator",
+ "/scripts/document_generation.py:76-96": "Else Block: Checks and Refines Document",
+ "/scripts/document_generation.py:97-130": "Document Generation Process for Elements in Python",
+ "/scripts/model.py": "OpenAI API Script",
+ "/scripts/model.py:1-34": "OpenAI API Request Script",
+ "/scripts/model.py:107-131": "Act-Based Response Parser",
+ "/scripts/model.py:132-150": "Extracting Information from Messages",
+ "/scripts/model.py:37-59": "Parse Explore Response Function",
+ "/scripts/model.py:60-85": "Grid Action Parser",
+ "/scripts/model.py:86-106": "Parse Message String and Display Results",
+ "/scripts/prompts.py": "Task-Oriented Mobile App UI Templates",
+ "/scripts/prompts.py:1-10": "Screenshot Comparison: Tap Action",
+ "/scripts/prompts.py:103-117": "Smartphone Touch Interaction Functions",
+ "/scripts/prompts.py:11-20": "Long Press Guide for Mobile App UI",
+ "/scripts/prompts.py:118-131": "Swipe Function for Grid Areas and Subareas",
+ "/scripts/prompts.py:132-149": "Smartphone Self-Exploration Task Template",
+ "/scripts/prompts.py:150-163": "Universal UI Interaction Script",
+ "/scripts/prompts.py:164-177": "Swipe Function with Distance and Direction",
+ "/scripts/prompts.py:178-190": "Analyzing Mobile App Screenshot Differences",
+ "/scripts/prompts.py:191-203": "Undo: Revert to Previous Screen",
+ "/scripts/prompts.py:204-217": "Partial UI Progress",
+ "/scripts/prompts.py:21-30": "Swipe UI Element Template",
+ "/scripts/prompts.py:218-223": "Task-Oriented UI Prompts",
+ "/scripts/prompts.py:31-41": "Generate UI Documentation Prompt",
+ "/scripts/prompts.py:43-57": "Control Smartphone with Python Functions",
+ "/scripts/prompts.py:58-71": "Smartphone Element Interaction Functions",
+ "/scripts/prompts.py:72-87": "Grid Overlay Prompt Function",
+ "/scripts/prompts.py:87-102": "Smartphone Grid Tapping Agent",
+ "/scripts/self_explorer.py": "GPT-4 Self Explorer Script",
+ "/scripts/self_explorer.py:1-38": "Autonomous Exploration Script Setup",
+ "/scripts/self_explorer.py:114-145": "GPT-4 Prompt Processing & Response Logging",
+ "/scripts/self_explorer.py:146-169": "Error-Handling Actions",
+ "/scripts/self_explorer.py:170-195": "Breaking Loop with Task Completion",
+ "/scripts/self_explorer.py:196-228": "Prompt Replacement and Logging",
+ "/scripts/self_explorer.py:228-249": "Self-Explorer: Decision Handling and Back Action",
+ "/scripts/self_explorer.py:250-276": "Documentation Updater",
+ "/scripts/self_explorer.py:277-282": "Max Rounds Reached: Yellow Warning, Unexpected Errors: Red Alert",
+ "/scripts/self_explorer.py:39-63": "Initialize Directories and Devices",
+ "/scripts/self_explorer.py:64-91": "Check & Set Resolution, Task Loop",
+ "/scripts/self_explorer.py:92-113": "Focusable Element Identifier",
+ "/scripts/step_recorder.py": "Command-line Step Recorder",
+ "/scripts/step_recorder.py:1-37": "Setting up AppAgent Command Line Args",
+ "/scripts/step_recorder.py:116-132": "Interactive Element Recording Script",
+ "/scripts/step_recorder.py:133-150": "Long Press Simulator",
+ "/scripts/step_recorder.py:151-167": "Long Press/Swipe Recorder Script",
+ "/scripts/step_recorder.py:168-183": "Swipe Recorder & Executor",
+ "/scripts/step_recorder.py:38-67": "Setting Up File Directories and Device Checks",
+ "/scripts/step_recorder.py:68-92": "Device Size and Resolution Capture",
+ "/scripts/step_recorder.py:93-116": "Clickable and Focusable Elements Detection",
+ "/scripts/task_executor.py": "Grid Image Task Executor",
+ "/scripts/task_executor.py:1-36": "AppAgent Task Executor",
+ "/scripts/task_executor.py:114-137": "Grid-Enabled Screenshot Processing",
+ "/scripts/task_executor.py:138-159": "Bounding Box Generation",
+ "/scripts/task_executor.py:160-175": "Documentation Check for UI Elements",
+ "/scripts/task_executor.py:176-188": "Colorful UI Documentation Printer",
+ "/scripts/task_executor.py:189-218": "AI Prompt Generator and Responder",
+ "/scripts/task_executor.py:219-245": "Error-Resilient Action Executor",
+ "/scripts/task_executor.py:246-269": "Swipe and Grid Action Script",
+ "/scripts/task_executor.py:270-289": "Swipe and Check Task Executor",
+ "/scripts/task_executor.py:37-57": "Task Directory and Document Selection",
+ "/scripts/task_executor.py:58-83": "Select Document Base and Check Devices",
+ "/scripts/task_executor.py:84-113": "Asking for Sub-Areas with Validation",
+ "/scripts/utils.py": "Multicolor Text & Bbox Drawing Utils",
+ "/scripts/utils.py:1-39": "Multicolored Text & Image Bounding Boxes",
+ "/scripts/utils.py:40-55": "Record Mode and Element Coloring",
+ "/scripts/utils.py:56-89": "Grid Image Labeler",
+ "/scripts/utils.py:90-100": "Draw and Save Text on Image with OpenCV"
+}
\ No newline at end of file
diff --git a/docs/doc/0d62b719-0c98-4dcf-97e5-94693b1b4140.json b/docs/doc/0d62b719-0c98-4dcf-97e5-94693b1b4140.json
new file mode 100644
index 0000000..4c8c7a8
--- /dev/null
+++ b/docs/doc/0d62b719-0c98-4dcf-97e5-94693b1b4140.json
@@ -0,0 +1,55 @@
+{
+ "summary": "AppAgent is an open-source project with MIT License offering overlay grid features for Android Studio emulators. It uses multimodal learning and human demonstrations to enable smartphone app operations, and provides instructions on running and improving its experience, suggesting open-sourcing Benchmark and config files, and citing a 2023 research paper with arXiv ID 2312.13771.",
+ "details": [
+ {
+ "comment": "# AppAgent\n- ArXiv link (2312.13771)\n- Project page link\n- MIT License\n- Twitter handle for Dr. Chizhang\n- (Incomplete: Model and Dataset badges missing)",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/README.md\":0-12",
+ "content": "# AppAgent\n
\n \n \n \n \n
\n \n[**Ch"
+ },
+ {
+ "comment": "This code appears to be a README file for a project called \"AppAgent\". It lists the contributors and provides instructions on how to use the project. The most recent updates include adding an optional method for bringing up a grid overlay on the screen and providing tips for better user experience, including instructions for using the Android Studio emulator for users without Android devices.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/README.md\":12-27",
+ "content": "i Zhang***\u2020](https://icoz69.github.io/), [**Zhao Yang***](https://github.com/yz93), [**Jiaxuan Liu***](https://www.linkedin.com/in/jiaxuan-liu-9051b7105/), [Yucheng Han](http://tingxueronghua.github.io), [Xin Chen](https://chenxin.tech/), [Zebiao Huang](),\n \n[Bin Fu](https://openreview.net/profile?id=~BIN_FU2), [Gang Yu\u2726](https://www.skicyyu.org/)\n \n(* equal contributions, \u2020 Project Leader, \u2726 Corresponding Author )\n
\n![](./assets/teaser.png)\n\u2139\ufe0fShould you encounter any issues\u26a0\ufe0f while using our project, please feel free to report them on [GitHub Issues](https://github.com/mnotgod96/AppAgent/issues) or reach out to [Dr. Chi Zhang](https://icoz69.github.io/) via email at dr.zhang.chi@outlook.com.\n## \ud83d\udcdd Changelog\n- __[2024.1.2]__: \ud83d\udd25Added an optional method for the agent to bring up a grid overlay on the screen to **tap/swipe anywhere** on the screen.\n- __[2023.12.26]__: Added [Tips](#tips) section for better use experience; added instruction for using the **Android Studio emulator** for\n users who do not have Android devices."
+ },
+ {
+ "comment": "This code provides an introduction to AppAgent, a multimodal agent framework for operating smartphone applications through a simplified action space. The agent can learn by autonomous exploration or human demonstrations and has a demo video available.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/README.md\":28-44",
+ "content": "- __[2023.12.21]__: \ud83d\udd25\ud83d\udd25 Open-sourced the git repository, including the detailed configuration steps to implement our AppAgent!\n## \ud83d\udd06 Introduction\nWe introduce a novel LLM-based multimodal agent framework designed to operate smartphone applications. \nOur framework enables the agent to operate smartphone applications through a simplified action space, mimicking human-like interactions such as tapping and swiping. This novel approach bypasses the need for system back-end access, thereby broadening its applicability across diverse apps.\nCentral to our agent's functionality is its innovative learning method. The agent learns to navigate and use new apps either through autonomous exploration or by observing human demonstrations. This process generates a knowledge base that the agent refers to for executing complex tasks across different applications.\n## \u2728 Demo\nThe demo video shows the process of using AppAgent to follow a user on X (Twitter) in the deployment phase.\nhttps://github.com/mnotgod96/AppAgent/assets/40715314/db99d650-dec1-4531-b4b2-e085bfcadfb7"
+ },
+ {
+ "comment": "This code is providing quick start instructions for using the gpt-4-vision-preview as an agent to complete tasks on Android apps. It requires installing Android Debug Bridge, enabling USB debugging on the device, and connecting the device to a PC via USB. An optional method for those without an Android device is also suggested.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/README.md\":46-69",
+ "content": "An interesting experiment showing AppAgent's ability to pass CAPTCHA.\nhttps://github.com/mnotgod96/AppAgent/assets/27103154/5cc7ba50-dbab-42a0-a411-a9a862482548\nAn example of using the grid overlay to locate a UI element that is not labeled with a numeric tag.\nhttps://github.com/mnotgod96/AppAgent/assets/27103154/71603333-274c-46ed-8381-2f9a34cdfc53\n## \ud83d\ude80 Quick Start\nThis section will guide you on how to quickly use `gpt-4-vision-preview` as an agent to complete specific tasks for you on\nyour Android app.\n### \u2699\ufe0f Step 1. Prerequisites\n1. On your PC, download and install [Android Debug Bridge](https://developer.android.com/tools/adb) (adb) which is a\n command-line tool that lets you communicate with your Android device from the PC.\n2. Get an Android device and enable the USB debugging that can be found in Developer Options in Settings.\n3. Connect your device to your PC using a USB cable.\n4. (Optional) If you do not have an Android device but still want to try AppAgent. We recommend you download\n ["
+ },
+ {
+ "comment": "Android Studio is mentioned as a tool for running the code and using the emulator. The emulator can be found in Android Studio's device manager, and APK files from the internet can be installed on it. AppAgent can detect an emulated device and function like a real device.\n\nTo use this code, clone the repository and install Python 3 dependencies by running pip install -r requirements.txt in the project directory.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/README.md\":69-87",
+ "content": "Android Studio](https://developer.android.com/studio/run/emulator) and use the emulator that comes with it.\n The emulator can be found in the device manager of Android Studio. You can install apps on an emulator by\n downloading APK files from the internet and dragging them to the emulator.\n AppAgent can detect the emulated device and operate apps on it just like operating a real device.\n \n5. Clone this repo and install the dependencies. All scripts in this project are written in Python 3 so make sure you\n have installed it.\n```bash\ncd AppAgent\npip install -r requirements.txt\n```\n### \ud83e\udd16 Step 2. Configure the Agent\nAppAgent needs to be powered by a multi-modal model which can receive both text and visual inputs. During our experiment\n, we used `gpt-4-vision-preview` as the model to make decisions on how to take actions to complete a task on the smartphone."
+ },
+ {
+ "comment": "Configure requests to GPT-4V by modifying `config.yaml` in the root directory. Provide an eligible OpenAI API key and set request interval to control frequency of GPT-4V requests. Other parameters are well commented, adjust as needed. Be aware that GPT-4V is not free; each request costs around $0.03. Test AppAgent with custom models by modifying `ask_gpt_4v` in `scripts/model.py`.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/README.md\":89-105",
+ "content": "To configure your requests to GPT-4V, you should modify `config.yaml` in the root directory.\nThere are two key parameters that must be configured to try AppAgent:\n1. OpenAI API key: you must purchase an eligible API key from OpenAI so that you can have access to GPT-4V.\n2. Request interval: this is the time interval in seconds between consecutive GPT-4V requests to control the frequency \nof your requests to GPT-4V. Adjust this value according to the status of your account.\nOther parameters in `config.yaml` are well commented. Modify them as you need.\n> Be aware that GPT-4V is not free. Each request/response pair involved in this project costs around $0.03. Use it wisely.\nIf you want to test AppAgent using your own models, you should modify the `ask_gpt_4v` function in `scripts/model.py` \naccordingly.\n### \ud83d\udd0d Step 3. Exploration Phase\nOur paper proposed a novel solution that involves two phases, exploration, and deployment, to turn GPT-4V into a capable \nagent that can help users operate their Android phones when a task is given. The exploration phase starts with a task "
+ },
+ {
+ "comment": "The code describes two options for using the AppAgent. Option 1 is autonomous exploration, where the agent can explore and learn from the app without human intervention. Option 2 involves learning from a human demonstration. Both methods generate documentation for elements interacted during exploration/demonstration for use in deployment. To start with Option 1, run \"learn.py\" and follow prompts to select autonomous exploration, provide the app name and task description.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/README.md\":106-126",
+ "content": "given by you, and you can choose to let the agent either explore the app on its own or learn from your demonstration. \nIn both cases, the agent generates documentation for elements interacted during the exploration/demonstration and \nsaves them for use in the deployment phase.\n#### Option 1: Autonomous Exploration\nThis solution features a fully autonomous exploration which allows the agent to explore the use of the app by attempting\nthe given task without any intervention from humans.\nTo start, run `learn.py` in the root directory. Follow the prompted instructions to select `autonomous exploration` \nas the operating mode and provide the app name and task description. Then, your agent will do the job for you. Under \nthis mode, AppAgent will reflect on its previous action making sure its action adheres to the given task and generate \ndocumentation for the elements explored.\n```bash\npython learn.py\n```\n#### Option 2: Learning from Human Demonstrations\nThis solution requires users to demonstrate a similar task first. AppAgent will learn from the demo and generate "
+ },
+ {
+ "comment": "This code is providing instructions on how to run the human demonstration and the agent for an app using `learn.py` and `run.py` scripts in the root directory. The user needs to follow prompts to provide the app name, task description, and documentation base for the agent to function properly.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/README.md\":127-146",
+ "content": "documentations for UI elements seen during the demo.\nTo start human demonstration, you should run `learn.py` in the root directory. Follow the prompted instructions to select \n`human demonstration` as the operating mode and provide the app name and task description. A screenshot of your phone \nwill be captured and all interactive elements shown on the screen will be labeled with numeric tags. You need to follow \nthe prompts to determine your next action and the target of the action. When you believe the demonstration is finished, \ntype `stop` to end the demo.\n```bash\npython learn.py\n```\n![](./assets/demo.png)\n### \ud83d\udcf1 Step 4. Deployment Phase\nAfter the exploration phase finishes, you can run `run.py` in the root directory. Follow the prompted instructions to enter \nthe name of the app, select the appropriate documentation base you want the agent to use and provide the task \ndescription. Then, your agent will do the job for you. The agent will automatically detect if there is documentation \nbase generat"
+ },
+ {
+ "comment": "This code is providing instructions on how to run the AppAgent and its associated tasks. The code suggests that for a better experience, users can permit AppAgent to explore more tasks autonomously or demonstrate more app functions to improve documentation. It also recommends inspecting the generated documentation by the agent and manually revising it if needed. Additionally, the code mentions open-sourcing the Benchmark and configuration files. Lastly, it provides a citation for AppAgent in the form of BibTeX format.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/README.md\":146-167",
+ "content": "ed before for the app; if there is no documentation found, you can also choose to run the agent without any \ndocumentation (success rate not guaranteed).\n```bash\npython run.py\n```\n## \ud83d\udca1 Tips\n- For an improved experience, you might permit AppAgent to undertake a broader range of tasks through autonomous exploration, or you can directly demonstrate more app functions to enhance the app documentation. Generally, the more extensive the documentation provided to the agent, the higher the likelihood of successful task completion.\n- It is always a good practice to inspect the documentation generated by the agent. When you find some documentation not accurately\n describe the function of the element, manually revising the documentation is also an option.\n## \ud83d\udcd6 To-Do List\n- [ ] Open source the Benchmark.\n- [x] Open source the configuration.\n## \ud83d\ude09 Citation\n```bib\n@misc{yang2023appagent,\n title={AppAgent: Multimodal Agents as Smartphone Users}, \n author={Chi Zhang and Zhao Yang and Jiaxuan Liu and Yucheng Han and Xin Chen and Zebiao Huang and Bin Fu and Gang Yu},"
+ },
+ {
+ "comment": "The code specifies the publication details for a research paper. It includes the year (2023), eprint ID (2312.13771), archive prefix (arXiv), and primary class (cs.CV).",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/README.md\":168-181",
+ "content": " year={2023},\n eprint={2312.13771},\n archivePrefix={arXiv},\n primaryClass={cs.CV}\n}\n```\n## Star History\n[![Star History Chart](https://api.star-history.com/svg?repos=mnotgod96/AppAgent&type=Date)](https://star-history.com/#mnotgod96/AppAgent&Date)\n## License\nThe [MIT license](./assets/license.txt)."
+ }
+ ]
+}
\ No newline at end of file
diff --git a/docs/doc/10a8ae8d-0a1d-43c8-85e9-89dd554013d3.json b/docs/doc/10a8ae8d-0a1d-43c8-85e9-89dd554013d3.json
new file mode 100644
index 0000000..b699d03
--- /dev/null
+++ b/docs/doc/10a8ae8d-0a1d-43c8-85e9-89dd554013d3.json
@@ -0,0 +1,10 @@
+{
+ "summary": "Function to load configuration from a YAML file, merging it with environment variables.",
+ "details": [
+ {
+ "comment": "Function to load configuration from a YAML file, merging it with environment variables.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/scripts/config.py\":0-9",
+ "content": "import os\nimport yaml\ndef load_config(config_path=\"./config.yaml\"):\n configs = dict(os.environ)\n with open(config_path, \"r\") as file:\n yaml_data = yaml.safe_load(file)\n configs.update(yaml_data)\n return configs"
+ }
+ ]
+}
\ No newline at end of file
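A small usage sketch of `load_config`: because `configs.update(yaml_data)` runs after copying `os.environ`, a key defined in `config.yaml` overrides an environment variable of the same name. The import path assumes the sketch runs next to `scripts/config.py`.

```python
import os

from config import load_config  # scripts/config.py

os.environ["MAX_TOKENS"] = "999"        # overridden if config.yaml also defines MAX_TOKENS
configs = load_config("./config.yaml")  # adjust the path to where config.yaml lives

print(configs["OPENAI_API_MODEL"])      # e.g. "gpt-4-vision-preview"
print(configs["REQUEST_INTERVAL"])      # interval used to throttle GPT-4V requests
```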
diff --git a/docs/doc/201e196b-9d55-4640-969a-00e06c7a328a.json b/docs/doc/201e196b-9d55-4640-969a-00e06c7a328a.json
new file mode 100644
index 0000000..a2bcce1
--- /dev/null
+++ b/docs/doc/201e196b-9d55-4640-969a-00e06c7a328a.json
@@ -0,0 +1,45 @@
+{
+ "summary": "The code sets up command line arguments for \"AppAgent,\" enables user actions selection or input gestures, validates inputs, performs corresponding actions with a controller object, logs data, handles errors, and displays recorded steps.",
+ "details": [
+ {
+ "comment": "This code is setting up the command line arguments for an application called \"AppAgent\" which records human demonstrations of mobile app interactions. It checks if the app and demo names are provided, creates a directory to store the recorded data, and sets default values if any arguments are missing.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/scripts/step_recorder.py\":0-36",
+ "content": "import argparse\nimport datetime\nimport cv2\nimport os\nimport shutil\nimport sys\nimport time\nfrom and_controller import list_all_devices, AndroidController, traverse_tree\nfrom config import load_config\nfrom utils import print_with_color, draw_bbox_multi\narg_desc = \"AppAgent - Human Demonstration\"\nparser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=arg_desc)\nparser.add_argument(\"--app\")\nparser.add_argument(\"--demo\")\nparser.add_argument(\"--root_dir\", default=\"./\")\nargs = vars(parser.parse_args())\napp = args[\"app\"]\ndemo_name = args[\"demo\"]\nroot_dir = args[\"root_dir\"]\nconfigs = load_config()\nif not app:\n print_with_color(\"What is the name of the app you are going to demo?\", \"blue\")\n app = input()\n app = app.replace(\" \", \"\")\nif not demo_name:\n demo_timestamp = int(time.time())\n demo_name = datetime.datetime.fromtimestamp(demo_timestamp).strftime(f\"demo_{app}_%Y-%m-%d_%H-%M-%S\")\nwork_dir = os.path.join(root_dir, \"apps\")\nif not os.path.exists(work_dir):\n os.mkdir(work_dir)"
+ },
+ {
+ "comment": "Creating directories for storing demo and task files, checking if devices are attached.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/scripts/step_recorder.py\":37-66",
+ "content": "work_dir = os.path.join(work_dir, app)\nif not os.path.exists(work_dir):\n os.mkdir(work_dir)\ndemo_dir = os.path.join(work_dir, \"demos\")\nif not os.path.exists(demo_dir):\n os.mkdir(demo_dir)\ntask_dir = os.path.join(demo_dir, demo_name)\nif os.path.exists(task_dir):\n shutil.rmtree(task_dir)\nos.mkdir(task_dir)\nraw_ss_dir = os.path.join(task_dir, \"raw_screenshots\")\nos.mkdir(raw_ss_dir)\nxml_dir = os.path.join(task_dir, \"xml\")\nos.mkdir(xml_dir)\nlabeled_ss_dir = os.path.join(task_dir, \"labeled_screenshots\")\nos.mkdir(labeled_ss_dir)\nrecord_path = os.path.join(task_dir, \"record.txt\")\nrecord_file = open(record_path, \"w\")\ntask_desc_path = os.path.join(task_dir, \"task_desc.txt\")\ndevice_list = list_all_devices()\nif not device_list:\n print_with_color(\"ERROR: No device found!\", \"red\")\n sys.exit()\nprint_with_color(\"List of devices attached:\\n\" + str(device_list), \"yellow\")\nif len(device_list) == 1:\n device = device_list[0]\n print_with_color(f\"Device selected: {device}\", \"yellow\")\nelse:\n print_with_color(\"Please choose the Android device to start demo by entering its ID:\", \"blue\")"
+ },
+ {
+ "comment": "Device input and check for valid device size.\nGet device resolution and print it, request user to specify demo goal.\nSave goal description in file.\nLabel interactive elements with red and blue numeric tags, clickable with red, scrollable with blue.\nLoop to capture screenshots and XML until errors occur.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/scripts/step_recorder.py\":67-91",
+ "content": " device = input()\ncontroller = AndroidController(device)\nwidth, height = controller.get_device_size()\nif not width and not height:\n print_with_color(\"ERROR: Invalid device size!\", \"red\")\n sys.exit()\nprint_with_color(f\"Screen resolution of {device}: {width}x{height}\", \"yellow\")\nprint_with_color(\"Please state the goal of your following demo actions clearly, e.g. send a message to John\", \"blue\")\ntask_desc = input()\nwith open(task_desc_path, \"w\") as f:\n f.write(task_desc)\nprint_with_color(\"All interactive elements on the screen are labeled with red and blue numeric tags. Elements \"\n \"labeled with red tags are clickable elements; elements labeled with blue tags are scrollable \"\n \"elements.\", \"blue\")\nstep = 0\nwhile True:\n step += 1\n screenshot_path = controller.get_screenshot(f\"{demo_name}_{step}\", raw_ss_dir)\n xml_path = controller.get_xml(f\"{demo_name}_{step}\", xml_dir)\n if screenshot_path == \"ERROR\" or xml_path == \"ERROR\":\n break\n clickable_list = []"
+ },
+ {
+ "comment": "Code comments:\n\n1. Traverse the XML tree to find clickable and focusable elements (lines 92-104).\n2. Copy the clickable list to a new list called elem_list.\n3. Check if any focusable element is close to any clickable element based on distance threshold (configs[\"MIN_DIST\"]). If not, add it to elem_list (lines 107-114).\n4. Display the labeled image with bounding boxes for elements in elem_list using draw_bbox_multi function.\n5. Show the image and wait for user input (cv2 functions).\n6. Set user_input variable to \"xxx\" and print with colored output.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/scripts/step_recorder.py\":92-115",
+ "content": " focusable_list = []\n traverse_tree(xml_path, clickable_list, \"clickable\", True)\n traverse_tree(xml_path, focusable_list, \"focusable\", True)\n elem_list = clickable_list.copy()\n for elem in focusable_list:\n bbox = elem.bbox\n center = (bbox[0][0] + bbox[1][0]) // 2, (bbox[0][1] + bbox[1][1]) // 2\n close = False\n for e in clickable_list:\n bbox = e.bbox\n center_ = (bbox[0][0] + bbox[1][0]) // 2, (bbox[0][1] + bbox[1][1]) // 2\n dist = (abs(center[0] - center_[0]) ** 2 + abs(center[1] - center_[1]) ** 2) ** 0.5\n if dist <= configs[\"MIN_DIST\"]:\n close = True\n break\n if not close:\n elem_list.append(elem)\n labeled_img = draw_bbox_multi(screenshot_path, os.path.join(labeled_ss_dir, f\"{demo_name}_{step}.png\"), elem_list,\n True)\n cv2.imshow(\"image\", labeled_img)\n cv2.waitKey(0)\n cv2.destroyAllWindows()\n user_input = \"xxx\"\n print_with_col"
+ },
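A minimal sketch of the proximity filter described above, restructured into a helper for readability (the script itself inlines this loop): a focusable element is added to `elem_list` only if its center is farther than `MIN_DIST` pixels from the center of every clickable element.

```python
def center(bbox):
    (left, top), (right, bottom) = bbox
    return (left + right) // 2, (top + bottom) // 2

def merge_elements(clickable_list, focusable_list, min_dist):
    elem_list = clickable_list.copy()
    for elem in focusable_list:
        cx, cy = center(elem.bbox)
        close = any(
            ((cx - x) ** 2 + (cy - y) ** 2) ** 0.5 <= min_dist
            for x, y in (center(e.bbox) for e in clickable_list)
        )
        if not close:
            elem_list.append(elem)
    return elem_list
```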
+ {
+ "comment": "This code asks the user to choose an action from a list of options. It continues to ask for input until the correct option is chosen. If \"tap\" is chosen, it prompts for the element to tap by its numeric tag, then taps the element on screen and writes a record if successful.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/scripts/step_recorder.py\":115-131",
+ "content": "or(\"Choose one of the following actions you want to perform on the current screen:\\ntap, text, long \"\n \"press, swipe, stop\", \"blue\")\n while user_input.lower() != \"tap\" and user_input.lower() != \"text\" and user_input.lower() != \"long press\" \\\n and user_input.lower() != \"swipe\" and user_input.lower() != \"stop\":\n user_input = input()\n if user_input.lower() == \"tap\":\n print_with_color(f\"Which element do you want to tap? Choose a numeric tag from 1 to {len(elem_list)}:\", \"blue\")\n user_input = \"xxx\"\n while not user_input.isnumeric() or int(user_input) > len(elem_list) or int(user_input) < 1:\n user_input = input()\n tl, br = elem_list[int(user_input) - 1].bbox\n x, y = (tl[0] + br[0]) // 2, (tl[1] + br[1]) // 2\n ret = controller.tap(x, y)\n if ret == \"ERROR\":\n print_with_color(\"ERROR: tap execution failed\", \"red\")\n break\n record_file.write(f\"tap({int(user_input)}):::{elem_list[int(user_input) - 1].uid}\\n\")"
+ },
+ {
+ "comment": "This code segment allows the user to input text or simulate a long press on an element by choosing a numeric tag from 1 to the total number of elements in the list. It prompts for the input, validates the input, and performs the corresponding action using the controller object. The data is then recorded in a file with appropriate formatting.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/scripts/step_recorder.py\":132-149",
+ "content": " elif user_input.lower() == \"text\":\n print_with_color(f\"Which element do you want to input the text string? Choose a numeric tag from 1 to \"\n f\"{len(elem_list)}:\", \"blue\")\n input_area = \"xxx\"\n while not input_area.isnumeric() or int(input_area) > len(elem_list) or int(input_area) < 1:\n input_area = input()\n print_with_color(\"Enter your input text below:\", \"blue\")\n user_input = \"\"\n while not user_input:\n user_input = input()\n controller.text(user_input)\n record_file.write(f\"text({input_area}:sep:\\\"{user_input}\\\"):::{elem_list[int(input_area) - 1].uid}\\n\")\n elif user_input.lower() == \"long press\":\n print_with_color(f\"Which element do you want to long press? Choose a numeric tag from 1 to {len(elem_list)}:\",\n \"blue\")\n user_input = \"xxx\"\n while not user_input.isnumeric() or int(user_input) > len(elem_list) or int(user_input) < 1:\n user_input = input()"
+ },
+ {
+ "comment": "The code is prompting the user for input to perform a long press or swipe action on an element from a list. It retrieves the bounding box coordinates, calculates the center point, performs the requested action, and logs the information if it was successful.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/scripts/step_recorder.py\":150-166",
+ "content": " tl, br = elem_list[int(user_input) - 1].bbox\n x, y = (tl[0] + br[0]) // 2, (tl[1] + br[1]) // 2\n ret = controller.long_press(x, y)\n if ret == \"ERROR\":\n print_with_color(\"ERROR: long press execution failed\", \"red\")\n break\n record_file.write(f\"long_press({int(user_input)}):::{elem_list[int(user_input) - 1].uid}\\n\")\n elif user_input.lower() == \"swipe\":\n print_with_color(f\"What is the direction of your swipe? Choose one from the following options:\\nup, down, left,\"\n f\" right\", \"blue\")\n user_input = \"\"\n while user_input != \"up\" and user_input != \"down\" and user_input != \"left\" and user_input != \"right\":\n user_input = input()\n swipe_dir = user_input\n print_with_color(f\"Which element do you want to swipe? Choose a numeric tag from 1 to {len(elem_list)}:\")\n while not user_input.isnumeric() or int(user_input) > len(elem_list) or int(user_input) < 1:\n user_input = input()"
+ },
+ {
+ "comment": "This code takes user input to record a swipe action, writes it into a file along with the element's unique ID, and handles stopping the recording. If swipe execution fails, it prints an error message and breaks the loop. After completion, it displays the number of steps recorded in yellow color.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/scripts/step_recorder.py\":167-182",
+ "content": " tl, br = elem_list[int(user_input) - 1].bbox\n x, y = (tl[0] + br[0]) // 2, (tl[1] + br[1]) // 2\n ret = controller.swipe(x, y, swipe_dir)\n if ret == \"ERROR\":\n print_with_color(\"ERROR: swipe execution failed\", \"red\")\n break\n record_file.write(f\"swipe({int(user_input)}:sep:{swipe_dir}):::{elem_list[int(user_input) - 1].uid}\\n\")\n elif user_input.lower() == \"stop\":\n record_file.write(\"stop\\n\")\n record_file.close()\n break\n else:\n break\n time.sleep(3)\nprint_with_color(f\"Demonstration phase completed. {step} steps were recorded.\", \"yellow\")"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/docs/doc/256c3f26-4d85-4bcf-965f-430e6575913e.json b/docs/doc/256c3f26-4d85-4bcf-965f-430e6575913e.json
new file mode 100644
index 0000000..e9ca44a
--- /dev/null
+++ b/docs/doc/256c3f26-4d85-4bcf-965f-430e6575913e.json
@@ -0,0 +1,15 @@
+{
+ "summary": "The code configures OpenAI API settings, GPT-4V request interval, Android screenshot and XML directories for an app agent. It also sets a round limit, dark mode, and minimum distance between elements for element labeling in the configuration file.",
+ "details": [
+ {
+ "comment": "This code is configuring OpenAI API settings, GPT-4V request interval, Android screenshot and XML directories for an app agent.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/config.yaml\":0-10",
+ "content": "OPENAI_API_BASE: \"https://api.openai.com/v1/chat/completions\"\nOPENAI_API_KEY: \"sk-\" # Set the value to sk-xxx if you host the openai interface for open llm model\nOPENAI_API_MODEL: \"gpt-4-vision-preview\" # The only OpenAI model by now that accepts visual input\nMAX_TOKENS: 300 # The max token limit for the response completion\nTEMPERATURE: 0.0 # The temperature of the model: the lower the value, the more consistent the output of the model\nREQUEST_INTERVAL: 10 # Time in seconds between consecutive GPT-4V requests\nANDROID_SCREENSHOT_DIR: \"/sdcard/Pictures/Screenshots\" # Set the directory on your Android device to store the intermediate screenshots. Make sure the directory EXISTS on your phone!\nANDROID_XML_DIR: \"/sdcard\" # Set the directory on your Android device to store the intermediate XML files used for determining locations of UI elements on your screen. Make sure the directory EXISTS on your phone!\nDOC_REFINE: false # Set this to true will make the agent refine existing documentation b"
+ },
+ {
+ "comment": "The configuration file sets round limit, dark mode, and minimum distance between elements for agent's element labeling.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/config.yaml\":10-13",
+ "content": "ased on the latest demonstration; otherwise, the agent will not regenerate a new documentation for elements with the same resource ID.\nMAX_ROUNDS: 20 # Set the round limit for the agent to complete the task\nDARK_MODE: false # Set this to true if your app is in dark mode to enhance the element labeling\nMIN_DIST: 30 # The minimum distance between elements to prevent overlapping during the labeling process"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/docs/doc/2dc11f86-624a-429c-bdcc-60102c7e683d.json b/docs/doc/2dc11f86-624a-429c-bdcc-60102c7e683d.json
new file mode 100644
index 0000000..94ac71c
--- /dev/null
+++ b/docs/doc/2dc11f86-624a-429c-bdcc-60102c7e683d.json
@@ -0,0 +1,35 @@
+{
+ "summary": "The code sets up arguments for the \"AppAgent - Human Demonstration\" program, creates directories, processes lines from a record file, encodes images and extracts action types and parameters. It handles user actions by generating prompts with regular expressions and includes an else block to check for existing documents and refine them if enabled. The code waits for GPT-4V to generate documentation, constructs content including prompts and images, updates `doc_content`, logs entries, handles errors, and sleeps between requests.",
+ "details": [
+ {
+ "comment": "This code is setting up arguments for a program called \"AppAgent - Human Demonstration\". It specifies required parameters such as the app and demo to be used. The code also creates directories if they do not exist, and defines paths for various files and directories related to the task at hand.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/scripts/document_generation.py\":0-34",
+ "content": "import argparse\nimport ast\nimport json\nimport os\nimport re\nimport sys\nimport time\nimport prompts\nfrom config import load_config\nfrom model import ask_gpt4v\nfrom utils import print_with_color, encode_image\narg_desc = \"AppAgent - Human Demonstration\"\nparser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=arg_desc)\nparser.add_argument(\"--app\", required=True)\nparser.add_argument(\"--demo\", required=True)\nparser.add_argument(\"--root_dir\", default=\"./\")\nargs = vars(parser.parse_args())\nconfigs = load_config()\nroot_dir = args[\"root_dir\"]\nwork_dir = os.path.join(root_dir, \"apps\")\nif not os.path.exists(work_dir):\n os.mkdir(work_dir)\napp = args[\"app\"]\nwork_dir = os.path.join(work_dir, app)\ndemo_dir = os.path.join(work_dir, \"demos\")\ndemo_name = args[\"demo\"]\ntask_dir = os.path.join(demo_dir, demo_name)\nxml_dir = os.path.join(task_dir, \"xml\")\nlabeled_ss_dir = os.path.join(task_dir, \"labeled_screenshots\")\nrecord_path = os.path.join(task_dir, \"record.txt\")\ntask_desc_path = os.path.join(task_dir, \"task_desc.txt\")"
+ },
+ {
+ "comment": "Code is checking if certain directories exist and creating a directory for document generation. It then reads from a record file, processes each line, encoding images before_and_after the step, extracting action type and parameters.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/scripts/document_generation.py\":35-56",
+ "content": "if not os.path.exists(task_dir) or not os.path.exists(xml_dir) or not os.path.exists(labeled_ss_dir) \\\n or not os.path.exists(record_path) or not os.path.exists(task_desc_path):\n sys.exit()\nlog_path = os.path.join(task_dir, f\"log_{app}_{demo_name}.txt\")\ndocs_dir = os.path.join(work_dir, \"demo_docs\")\nif not os.path.exists(docs_dir):\n os.mkdir(docs_dir)\nprint_with_color(f\"Starting to generate documentations for the app {app} based on the demo {demo_name}\", \"yellow\")\ndoc_count = 0\nwith open(record_path, \"r\") as infile:\n step = len(infile.readlines()) - 1\n infile.seek(0)\n for i in range(1, step + 1):\n img_before = encode_image(os.path.join(labeled_ss_dir, f\"{demo_name}_{i}.png\"))\n img_after = encode_image(os.path.join(labeled_ss_dir, f\"{demo_name}_{i + 1}.png\"))\n rec = infile.readline().strip()\n action, resource_id = rec.split(\":::\")\n action_type = action.split(\"(\")[0]\n action_param = re.findall(r\"\\((.*?)\\)\", action)[0]\n if action_type == \"tap\":"
+ },
+ {
+ "comment": "This code handles different user actions and generates prompts based on the action type. It uses regular expressions to replace placeholders in prompt templates with specific action parameters.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/scripts/document_generation.py\":57-74",
+ "content": " prompt_template = prompts.tap_doc_template\n prompt = re.sub(r\"\", action_param, prompt_template)\n elif action_type == \"text\":\n input_area, input_text = action_param.split(\":sep:\")\n prompt_template = prompts.text_doc_template\n prompt = re.sub(r\"\", input_area, prompt_template)\n elif action_type == \"long_press\":\n prompt_template = prompts.long_press_doc_template\n prompt = re.sub(r\"\", action_param, prompt_template)\n elif action_type == \"swipe\":\n swipe_area, swipe_dir = action_param.split(\":sep:\")\n if swipe_dir == \"up\" or swipe_dir == \"down\":\n action_type = \"v_swipe\"\n elif swipe_dir == \"left\" or swipe_dir == \"right\":\n action_type = \"h_swipe\"\n prompt_template = prompts.swipe_doc_template\n prompt = re.sub(r\"\", swipe_dir, prompt_template)\n prompt = re.sub(r\"\", swipe_area, prompt)"
+ },
+ {
+ "comment": "Else block: checks if a document for the current task already exists and refines it if DOC_REFINE is enabled in the config file.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/scripts/document_generation.py\":75-95",
+ "content": " else:\n break\n task_desc = open(task_desc_path, \"r\").read()\n prompt = re.sub(r\"\", task_desc, prompt)\n doc_name = resource_id + \".txt\"\n doc_path = os.path.join(docs_dir, doc_name)\n if os.path.exists(doc_path):\n doc_content = ast.literal_eval(open(doc_path).read())\n if doc_content[action_type]:\n if configs[\"DOC_REFINE\"]:\n suffix = re.sub(r\"\", doc_content[action_type], prompts.refine_doc_suffix)\n prompt += suffix\n print_with_color(f\"Documentation for the element {resource_id} already exists. The doc will be \"\n f\"refined based on the latest demo.\", \"yellow\")\n else:\n print_with_color(f\"Documentation for the element {resource_id} already exists. Turn on DOC_REFINE \"\n f\"in the config file if needed.\", \"yellow\")\n continue\n else:"
+ },
+ {
+ "comment": "The code is waiting for GPT-4V to generate documentation for an element with the resource ID. It then constructs content, possibly a prompt and two images before and after an action on the element. If there are no errors in the response from GPT-4V, it updates the `doc_content` dictionary with the generated message, and writes a log entry with the step number, prompt, and image names.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/scripts/document_generation.py\":96-129",
+ "content": " doc_content = {\n \"tap\": \"\",\n \"text\": \"\",\n \"v_swipe\": \"\",\n \"h_swipe\": \"\",\n \"long_press\": \"\"\n }\n print_with_color(f\"Waiting for GPT-4V to generate documentation for the element {resource_id}\", \"yellow\")\n content = [\n {\n \"type\": \"text\",\n \"text\": prompt\n },\n {\n \"type\": \"image_url\",\n \"image_url\": {\n \"url\": f\"data:image/jpeg;base64,{img_before}\"\n }\n },\n {\n \"type\": \"image_url\",\n \"image_url\": {\n \"url\": f\"data:image/jpeg;base64,{img_after}\"\n }\n }\n ]\n rsp = ask_gpt4v(content)\n if \"error\" not in rsp:\n msg = rsp[\"choices\"][0][\"message\"][\"content\"]\n doc_content[action_type] = msg\n with open(log_path, \"a\") as logfile:\n log_item = {\"step\": i, \"prompt\": prompt, \"image_before\": f\"{demo_name}_{i}.png\","
+ },
+ {
+ "comment": "Generates and saves documents, writes log entries, handles errors with colorful output, sleeps for a specified interval between requests.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/scripts/document_generation.py\":130-140",
+ "content": " \"image_after\": f\"{demo_name}_{i + 1}.png\", \"response\": rsp}\n logfile.write(json.dumps(log_item) + \"\\n\")\n with open(doc_path, \"w\") as outfile:\n outfile.write(str(doc_content))\n doc_count += 1\n print_with_color(f\"Documentation generated and saved to {doc_path}\", \"yellow\")\n else:\n print_with_color(rsp[\"error\"][\"message\"], \"red\")\n time.sleep(configs[\"REQUEST_INTERVAL\"])\nprint_with_color(f\"Documentation generation phase completed. {doc_count} docs generated.\", \"yellow\")"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/docs/doc/42b10750-9d61-49c1-964c-0dcd8e24dbbc.json b/docs/doc/42b10750-9d61-49c1-964c-0dcd8e24dbbc.json
new file mode 100644
index 0000000..b4f25c2
--- /dev/null
+++ b/docs/doc/42b10750-9d61-49c1-964c-0dcd8e24dbbc.json
@@ -0,0 +1,65 @@
+{
+ "summary": "The code imports libraries, handles user inputs and directory creation, verifies grid images, populates lists from XML tree, manages UI elements, replaces placeholders with data, handles actions like tap/text/long press, logs and parses responses based on grid setting, checks for errors, and prints success or error messages.",
+ "details": [
+ {
+ "comment": "Code imports necessary libraries, sets up argument parsing for executing the AppAgent tasks. It defines the description of the executor, loads configuration from config file and gets the name of app to be operated. If no app name is given, it prompts user for input and proceeds with executing tasks for specified app. It also creates work directory if it does not exist and defines auto_docs_dir in the app directory.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/scripts/task_executor.py\":0-35",
+ "content": "import argparse\nimport ast\nimport datetime\nimport json\nimport os\nimport re\nimport sys\nimport time\nimport prompts\nfrom config import load_config\nfrom and_controller import list_all_devices, AndroidController, traverse_tree\nfrom model import ask_gpt4v, parse_explore_rsp, parse_grid_rsp\nfrom utils import print_with_color, draw_bbox_multi, encode_image, draw_grid\narg_desc = \"AppAgent Executor\"\nparser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=arg_desc)\nparser.add_argument(\"--app\")\nparser.add_argument(\"--root_dir\", default=\"./\")\nargs = vars(parser.parse_args())\nconfigs = load_config()\napp = args[\"app\"]\nroot_dir = args[\"root_dir\"]\nif not app:\n print_with_color(\"What is the name of the app you want me to operate?\", \"blue\")\n app = input()\n app = app.replace(\" \", \"\")\napp_dir = os.path.join(os.path.join(root_dir, \"apps\"), app)\nwork_dir = os.path.join(root_dir, \"tasks\")\nif not os.path.exists(work_dir):\n os.mkdir(work_dir)\nauto_docs_dir = os.path.join(app_dir, \"auto_docs\")"
+ },
+ {
+ "comment": "Creating a new task directory with a timestamped name, checking for documentation directories, and asking user to choose which documentation to use or if no documents are found, prompting the user whether to proceed without documents.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/scripts/task_executor.py\":36-56",
+ "content": "demo_docs_dir = os.path.join(app_dir, \"demo_docs\")\ntask_timestamp = int(time.time())\ndir_name = datetime.datetime.fromtimestamp(task_timestamp).strftime(f\"task_{app}_%Y-%m-%d_%H-%M-%S\")\ntask_dir = os.path.join(work_dir, dir_name)\nos.mkdir(task_dir)\nlog_path = os.path.join(task_dir, f\"log_{app}_{dir_name}.txt\")\nno_doc = False\nif not os.path.exists(auto_docs_dir) and not os.path.exists(demo_docs_dir):\n print_with_color(f\"No documentations found for the app {app}. Do you want to proceed with no docs? Enter y or n\",\n \"red\")\n user_input = \"\"\n while user_input != \"y\" and user_input != \"n\":\n user_input = input().lower()\n if user_input == \"y\":\n no_doc = True\n else:\n sys.exit()\nelif os.path.exists(auto_docs_dir) and os.path.exists(demo_docs_dir):\n print_with_color(f\"The app {app} has documentations generated from both autonomous exploration and human \"\n f\"demonstration. Which one do you want to use? Type 1 or 2.\\n1. Autonomous exploration\\n2. Human \""
+ },
+ {
+ "comment": "This code snippet prompts the user to select a document base from either automatically generated ones or demonstration ones. It then checks if any devices are attached and prints relevant messages based on the conditions met.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/scripts/task_executor.py\":57-82",
+ "content": " f\"Demonstration\",\n \"blue\")\n user_input = \"\"\n while user_input != \"1\" and user_input != \"2\":\n user_input = input()\n if user_input == \"1\":\n docs_dir = auto_docs_dir\n else:\n docs_dir = demo_docs_dir\nelif os.path.exists(auto_docs_dir):\n print_with_color(f\"Documentations generated from autonomous exploration were found for the app {app}. The doc base \"\n f\"is selected automatically.\", \"yellow\")\n docs_dir = auto_docs_dir\nelse:\n print_with_color(f\"Documentations generated from human demonstration were found for the app {app}. The doc base is \"\n f\"selected automatically.\", \"yellow\")\n docs_dir = demo_docs_dir\ndevice_list = list_all_devices()\nif not device_list:\n print_with_color(\"ERROR: No device found!\", \"red\")\n sys.exit()\nprint_with_color(f\"List of devices attached:\\n{str(device_list)}\", \"yellow\")\nif len(device_list) == 1:\n device = device_list[0]\n print_with_color(f\"Device selected: {device}\", \"yellow\")"
+ },
+ {
+ "comment": "User is prompted to choose an Android device for the demo by entering its ID. The device size is checked and if it's invalid, an error is displayed and the program exits. Otherwise, user is asked to provide a task description in a few sentences. A function `area_to_xy` is defined to convert area number to x-y coordinates on the screen. \n\nStorage location: \"AppAgent/scripts/task_executor.py\":115-142\nCode:\n```\ndef get_subarea(area):\n while True:\n try:\n subarea = input(\"Please enter the subarea (top, top-left, top-right) for area \" + str(area+1) + \":\").lower()\n if subarea in (\"top\", \"top-left\", \"top-right\"):\n return subarea\n except Exception as e:\n print_with_color(\"ERROR:\", \"red\")\n print(e)\n```\nComment for code:\n\nThe function `get_subarea` prompts the user to enter the sub-areas for each area in a loop until valid input is provided. Valid inputs are 'top', 'top-left' or 'top-right'. If invalid input is entered, an error message is displayed along with the exception and the program continues to prompt until valid input is given.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/scripts/task_executor.py\":83-112",
+ "content": "else:\n print_with_color(\"Please choose the Android device to start demo by entering its ID:\", \"blue\")\n device = input()\ncontroller = AndroidController(device)\nwidth, height = controller.get_device_size()\nif not width and not height:\n print_with_color(\"ERROR: Invalid device size!\", \"red\")\n sys.exit()\nprint_with_color(f\"Screen resolution of {device}: {width}x{height}\", \"yellow\")\nprint_with_color(\"Please enter the description of the task you want me to complete in a few sentences:\", \"blue\")\ntask_desc = input()\nround_count = 0\nlast_act = \"None\"\ntask_complete = False\ngrid_on = False\nrows, cols = 0, 0\ndef area_to_xy(area, subarea):\n area -= 1\n row, col = area // cols, area % cols\n x_0, y_0 = col * (width // cols), row * (height // rows)\n if subarea == \"top-left\":\n x, y = x_0 + (width // cols) // 4, y_0 + (height // rows) // 4\n elif subarea == \"top\":\n x, y = x_0 + (width // cols) // 2, y_0 + (height // rows) // 4\n elif subarea == \"top-right\":\n x, y = x_0 + (width // cols) * 3 // 4, y_0 + (height // rows) // 4"
+ },
+ {
+ "comment": "Code calculates the coordinates for subareas of a screenshot and continues with round processing. It checks if screenshots or XML paths are errors, then breaks if so. If grid is on, it draws the grid.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/scripts/task_executor.py\":113-136",
+ "content": " elif subarea == \"left\":\n x, y = x_0 + (width // cols) // 4, y_0 + (height // rows) // 2\n elif subarea == \"right\":\n x, y = x_0 + (width // cols) * 3 // 4, y_0 + (height // rows) // 2\n elif subarea == \"bottom-left\":\n x, y = x_0 + (width // cols) // 4, y_0 + (height // rows) * 3 // 4\n elif subarea == \"bottom\":\n x, y = x_0 + (width // cols) // 2, y_0 + (height // rows) * 3 // 4\n elif subarea == \"bottom-right\":\n x, y = x_0 + (width // cols) * 3 // 4, y_0 + (height // rows) * 3 // 4\n else:\n x, y = x_0 + (width // cols) // 2, y_0 + (height // rows) // 2\n return x, y\nwhile round_count < configs[\"MAX_ROUNDS\"]:\n round_count += 1\n print_with_color(f\"Round {round_count}\", \"yellow\")\n screenshot_path = controller.get_screenshot(f\"{dir_name}_{round_count}\", task_dir)\n xml_path = controller.get_xml(f\"{dir_name}_{round_count}\", task_dir)\n if screenshot_path == \"ERROR\" or xml_path == \"ERROR\":\n break\n if grid_on:\n rows, cols = draw_grid(screenshot_path, os.path.join(task_dir, f\"{dir_name}_{round_count}_grid.png\"))"
+ },
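The subarea arithmetic above is easier to follow as code than as prose. Below is a condensed, self-contained restatement of the area-to-coordinates mapping; the screen size, grid shape, and example call are assumed values (the quoted script reads them from the device and its config instead).

```python
# Minimal sketch of the grid area -> (x, y) mapping described above.
def area_to_xy(area, subarea, width, height, rows, cols):
    area -= 1                              # grid labels start at 1
    row, col = area // cols, area % cols
    cell_w, cell_h = width // cols, height // rows
    x0, y0 = col * cell_w, row * cell_h    # top-left corner of the grid cell
    # horizontal position: quarter, half, or three quarters of the cell width
    if "left" in subarea:
        x = x0 + cell_w // 4
    elif "right" in subarea:
        x = x0 + cell_w * 3 // 4
    else:
        x = x0 + cell_w // 2
    # vertical position works the same way on the cell height
    if "top" in subarea:
        y = y0 + cell_h // 4
    elif "bottom" in subarea:
        y = y0 + cell_h * 3 // 4
    else:
        y = y0 + cell_h // 2
    return x, y

# Example: a 1080x2400 screen divided into 16 rows and 9 columns (assumed values).
print(area_to_xy(5, "top-left", 1080, 2400, rows=16, cols=9))   # -> (510, 37)
```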
+ {
+ "comment": "If `os.path.join(task_dir, f\"{dir_name}_{round_count}_grid.png\")` exists:\n- Encode the image using `encode_image()`.\n- Set prompt to `prompts.task_template_grid`.\nElse:\n- Create empty `clickable_list` and `focusable_list`.\n- Traverse XML tree to populate `clickable_list` and `focusable_list`.\n- Combine `clickable_list` and `focusable_list` into `elem_list`, excluding duplicates.\n- Draw bounding boxes for elements in `elem_list` on `screenshot_path` and save as `os.path.join(task_dir, f\"{dir_name}_{round_count}_labeled.png\")`.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/scripts/task_executor.py\":137-158",
+ "content": " base64_img = encode_image(os.path.join(task_dir, f\"{dir_name}_{round_count}_grid.png\"))\n prompt = prompts.task_template_grid\n else:\n clickable_list = []\n focusable_list = []\n traverse_tree(xml_path, clickable_list, \"clickable\", True)\n traverse_tree(xml_path, focusable_list, \"focusable\", True)\n elem_list = clickable_list.copy()\n for elem in focusable_list:\n bbox = elem.bbox\n center = (bbox[0][0] + bbox[1][0]) // 2, (bbox[0][1] + bbox[1][1]) // 2\n close = False\n for e in clickable_list:\n bbox = e.bbox\n center_ = (bbox[0][0] + bbox[1][0]) // 2, (bbox[0][1] + bbox[1][1]) // 2\n dist = (abs(center[0] - center_[0]) ** 2 + abs(center[1] - center_[1]) ** 2) ** 0.5\n if dist <= configs[\"MIN_DIST\"]:\n close = True\n break\n if not close:\n elem_list.append(elem)\n draw_bbox_multi(screenshot_path, os.path.join(task_dir, f\"{dir_name}_{round_count}_labeled.png\"), elem_list,"
+ },
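To make the de-duplication rule concrete, here is a small self-contained sketch of the centre-distance filter described above. The helper names center and merge_elements, the bounding boxes, and the MIN_DIST value are invented for illustration.

```python
MIN_DIST = 30   # assumed value; the script reads this from its config

def center(bbox):
    (x1, y1), (x2, y2) = bbox
    return (x1 + x2) // 2, (y1 + y2) // 2

def merge_elements(clickable, focusable, min_dist=MIN_DIST):
    """Keep all clickable boxes; add a focusable box only if its centre is not
    within min_dist of any clickable centre."""
    merged = list(clickable)
    for f in focusable:
        fx, fy = center(f)
        close = any(((fx - cx) ** 2 + (fy - cy) ** 2) ** 0.5 <= min_dist
                    for cx, cy in map(center, clickable))
        if not close:
            merged.append(f)
    return merged

clickable = [((0, 0), (100, 100))]
focusable = [((10, 10), (90, 90)), ((200, 0), (300, 100))]
print(merge_elements(clickable, focusable))   # the first focusable box is dropped
```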
+ {
+ "comment": "This code is checking if there are any documentation files for UI elements and constructing the prompt accordingly. If there are no documentation files, it removes the \"\" placeholder from the task template. Otherwise, it adds a formatted documentation section to the prompt, listing each element's documentation file path and content.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/scripts/task_executor.py\":159-174",
+ "content": " dark_mode=configs[\"DARK_MODE\"])\n base64_img = encode_image(os.path.join(task_dir, f\"{dir_name}_{round_count}_labeled.png\"))\n if no_doc:\n prompt = re.sub(r\"\", \"\", prompts.task_template)\n else:\n ui_doc = \"\"\"\n You also have access to the following documentations that describes the functionalities of UI \n elements you can interact on the screen. These docs are crucial for you to determine the target of your \n next action. You should always prioritize these documented elements for interaction:\"\"\"\n for i, elem in enumerate(elem_list):\n doc_path = os.path.join(docs_dir, f\"{elem.uid}.txt\")\n if not os.path.exists(doc_path):\n continue\n ui_doc += f\"Documentation of UI element labeled with the numeric tag '{i + 1}':\\n\"\n doc_content = ast.literal_eval(open(doc_path, \"r\").read())\n if doc_content[\"tap\"]:"
+ },
+ {
+ "comment": "This code retrieves UI documentation for an interface and prints it in color. It includes clickability, text input, long press, vertical swipe, and horizontal swipe information.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/scripts/task_executor.py\":175-187",
+ "content": " ui_doc += f\"This UI element is clickable. {doc_content['tap']}\\n\\n\"\n if doc_content[\"text\"]:\n ui_doc += f\"This UI element can receive text input. The text input is used for the following \" \\\n f\"purposes: {doc_content['text']}\\n\\n\"\n if doc_content[\"long_press\"]:\n ui_doc += f\"This UI element is long clickable. {doc_content['long_press']}\\n\\n\"\n if doc_content[\"v_swipe\"]:\n ui_doc += f\"This element can be swiped directly without tapping. You can swipe vertically on \" \\\n f\"this UI element. {doc_content['v_swipe']}\\n\\n\"\n if doc_content[\"h_swipe\"]:\n ui_doc += f\"This element can be swiped directly without tapping. You can swipe horizontally on \" \\\n f\"this UI element. {doc_content['h_swipe']}\\n\\n\"\n print_with_color(f\"Documentations retrieved for the current interface:\\n{ui_doc}\", \"magenta\")"
+ },
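For reference, a hedged sketch of the per-element documentation lookup described above: each doc file is a Python dict literal keyed by action type, and non-empty entries become sentences in the prompt. The helper names load_elem_doc and describe, and the example doc content, are invented for this sketch.

```python
import ast
import os

def load_elem_doc(docs_dir, uid):
    """Return the doc dict for an element, or None if no doc file exists."""
    path = os.path.join(docs_dir, f"{uid}.txt")
    if not os.path.exists(path):
        return None
    with open(path, "r") as f:
        return ast.literal_eval(f.read())

def describe(doc, tag):
    """Build the documentation paragraph for one labeled element."""
    lines = [f"Documentation of UI element labeled with the numeric tag '{tag}':"]
    if doc.get("tap"):
        lines.append(f"This UI element is clickable. {doc['tap']}")
    if doc.get("text"):
        lines.append(f"This UI element can receive text input. {doc['text']}")
    if doc.get("long_press"):
        lines.append(f"This UI element is long clickable. {doc['long_press']}")
    return "\n".join(lines)

# Example with an in-memory doc (invented content) instead of a real file:
print(describe({"tap": "Tapping this area opens the settings page.",
                "text": "", "long_press": ""}, 3))
```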
+ {
+ "comment": "This code is creating a prompt by replacing placeholders with relevant information, then sending it to an AI model for response. If there's no error in the response, log the prompt, image, and response, and parse the response based on the grid setting. If the action name is \"FINISH\", set task_complete to True.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/scripts/task_executor.py\":188-217",
+ "content": " prompt = re.sub(r\"\", ui_doc, prompts.task_template)\n prompt = re.sub(r\"\", task_desc, prompt)\n prompt = re.sub(r\"\", last_act, prompt)\n content = [\n {\n \"type\": \"text\",\n \"text\": prompt\n },\n {\n \"type\": \"image_url\",\n \"image_url\": {\n \"url\": f\"data:image/jpeg;base64,{base64_img}\"\n }\n }\n ]\n print_with_color(\"Thinking about what to do in the next step...\", \"yellow\")\n rsp = ask_gpt4v(content)\n if \"error\" not in rsp:\n with open(log_path, \"a\") as logfile:\n log_item = {\"step\": round_count, \"prompt\": prompt, \"image\": f\"{dir_name}_{round_count}_labeled.png\",\n \"response\": rsp}\n logfile.write(json.dumps(log_item) + \"\\n\")\n if grid_on:\n res = parse_grid_rsp(rsp)\n else:\n res = parse_explore_rsp(rsp)\n act_name = res[0]\n if act_name == \"FINISH\":\n task_complete = True"
+ },
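A minimal sketch of the request body assembled above: one text part carrying the filled-in prompt and one image part carrying the labeled screenshot as a base64 data URL. The prompt text and image bytes below are stand-ins, not a real prompt or screenshot.

```python
import base64

prompt = "...filled-in task prompt..."                 # placeholder text
fake_screenshot = b"\x89PNG\r\n\x1a\n"                 # stand-in bytes for the labeled PNG
b64 = base64.b64encode(fake_screenshot).decode("utf-8")

content = [
    {"type": "text", "text": prompt},
    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{b64}"}},
]
# `content` is what the script hands to ask_gpt4v(); the reply is then parsed by
# parse_grid_rsp() or parse_explore_rsp() depending on whether grid mode is on.
print(content[0]["type"], content[1]["type"])          # text image_url
```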
+ {
+ "comment": "Code handles various actions such as tap, text, and long press based on the given action name. It also checks for errors during execution and breaks if an error occurs.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/scripts/task_executor.py\":218-244",
+ "content": " break\n if act_name == \"ERROR\":\n break\n last_act = res[-1]\n res = res[:-1]\n if act_name == \"tap\":\n _, area = res\n tl, br = elem_list[area - 1].bbox\n x, y = (tl[0] + br[0]) // 2, (tl[1] + br[1]) // 2\n ret = controller.tap(x, y)\n if ret == \"ERROR\":\n print_with_color(\"ERROR: tap execution failed\", \"red\")\n break\n elif act_name == \"text\":\n _, input_str = res\n ret = controller.text(input_str)\n if ret == \"ERROR\":\n print_with_color(\"ERROR: text execution failed\", \"red\")\n break\n elif act_name == \"long_press\":\n _, area = res\n tl, br = elem_list[area - 1].bbox\n x, y = (tl[0] + br[0]) // 2, (tl[1] + br[1]) // 2\n ret = controller.long_press(x, y)\n if ret == \"ERROR\":\n print_with_color(\"ERROR: long press execution failed\", \"red\")\n break"
+ },
+ {
+ "comment": "This code handles different types of actions such as \"swipe\", \"grid\", \"tap_grid\", and \"long_press_grid\". If the action is \"swipe\", it executes a swipe on the screen with specified direction and distance. If it fails, it prints an error message. For grid-related actions, it maps the area and subarea to coordinates and performs either a tap or long press accordingly. Again, if there's an error, it prints an error message.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/scripts/task_executor.py\":245-268",
+ "content": " elif act_name == \"swipe\":\n _, area, swipe_dir, dist = res\n tl, br = elem_list[area - 1].bbox\n x, y = (tl[0] + br[0]) // 2, (tl[1] + br[1]) // 2\n ret = controller.swipe(x, y, swipe_dir, dist)\n if ret == \"ERROR\":\n print_with_color(\"ERROR: swipe execution failed\", \"red\")\n break\n elif act_name == \"grid\":\n grid_on = True\n elif act_name == \"tap_grid\" or act_name == \"long_press_grid\":\n _, area, subarea = res\n x, y = area_to_xy(area, subarea)\n if act_name == \"tap_grid\":\n ret = controller.tap(x, y)\n if ret == \"ERROR\":\n print_with_color(\"ERROR: tap execution failed\", \"red\")\n break\n else:\n ret = controller.long_press(x, y)\n if ret == \"ERROR\":\n print_with_color(\"ERROR: tap execution failed\", \"red\")\n break\n elif act_name == \"swipe_grid\":"
+ },
+ {
+ "comment": "This code executes a swipe action with precise coordinates, checks for errors, and prints error or success messages based on task completion status.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/scripts/task_executor.py\":269-288",
+ "content": " _, start_area, start_subarea, end_area, end_subarea = res\n start_x, start_y = area_to_xy(start_area, start_subarea)\n end_x, end_y = area_to_xy(end_area, end_subarea)\n ret = controller.swipe_precise((start_x, start_y), (end_x, end_y))\n if ret == \"ERROR\":\n print_with_color(\"ERROR: tap execution failed\", \"red\")\n break\n if act_name != \"grid\":\n grid_on = False\n time.sleep(configs[\"REQUEST_INTERVAL\"])\n else:\n print_with_color(rsp[\"error\"][\"message\"], \"red\")\n break\nif task_complete:\n print_with_color(\"Task completed successfully\", \"yellow\")\nelif round_count == configs[\"MAX_ROUNDS\"]:\n print_with_color(\"Task finished due to reaching max rounds\", \"yellow\")\nelse:\n print_with_color(\"Task finished unexpectedly\", \"red\")"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/docs/doc/5722f6de-746e-44f7-8752-4c28a4a512c2.json b/docs/doc/5722f6de-746e-44f7-8752-4c28a4a512c2.json
new file mode 100644
index 0000000..85da10a
--- /dev/null
+++ b/docs/doc/5722f6de-746e-44f7-8752-4c28a4a512c2.json
@@ -0,0 +1,20 @@
+{
+ "summary": "The code is for an argument parser in the exploration phase of AppAgent, enabling users to select between autonomous or human demonstration mode and specifying required parameters. It also includes a document generation script for running specified apps and demos.",
+ "details": [
+ {
+ "comment": "This code is for an argument parser in the exploration phase of AppAgent. It allows users to input app and root directory, then provides a description of the phase's purpose: generating documentations for UI elements through autonomous exploration or human demonstration. The task-oriented approach requires giving task descriptions.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/learn.py\":0-22",
+ "content": "import argparse\nimport datetime\nimport os\nimport time\nfrom scripts.utils import print_with_color\narg_desc = \"AppAgent - exploration phase\"\nparser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=arg_desc)\nparser.add_argument(\"--app\")\nparser.add_argument(\"--root_dir\", default=\"./\")\nargs = vars(parser.parse_args())\napp = args[\"app\"]\nroot_dir = args[\"root_dir\"]\nprint_with_color(\"Welcome to the exploration phase of AppAgent!\\nThe exploration phase aims at generating \"\n \"documentations for UI elements through either autonomous exploration or human demonstration. \"\n \"Both options are task-oriented, which means you need to give a task description. During \"\n \"autonomous exploration, the agent will try to complete the task by interacting with possible \"\n \"elements on the UI within limited rounds. Documentations will be generated during the process of \"\n \"interacting with the correct elements to proceed with the task. Human demonstration relies on \""
+ },
+ {
+ "comment": "This code asks the user to choose between autonomous exploration or human demonstration mode for the app agent. If \"1\" is entered, it starts autonomous exploration using specific script with app and root_dir parameters. If \"2\" is entered, it begins a human demonstration by creating a demo name and running another script for step recording with app, demo name, and root_dir parameters.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/learn.py\":23-43",
+ "content": " \"the user to show the agent how to complete the given task, and the agent will generate \"\n \"documentations for the elements interacted during the human demo. To start, please enter the \"\n \"main interface of the app on your phone.\", \"yellow\")\nprint_with_color(\"Choose from the following modes:\\n1. autonomous exploration\\n2. human demonstration\\n\"\n \"Type 1 or 2.\", \"blue\")\nuser_input = \"\"\nwhile user_input != \"1\" and user_input != \"2\":\n user_input = input()\nif not app:\n print_with_color(\"What is the name of the target app?\", \"blue\")\n app = input()\n app = app.replace(\" \", \"\")\nif user_input == \"1\":\n os.system(f\"python scripts/self_explorer.py --app {app} --root_dir {root_dir}\")\nelse:\n demo_timestamp = int(time.time())\n demo_name = datetime.datetime.fromtimestamp(demo_timestamp).strftime(f\"demo_{app}_%Y-%m-%d_%H-%M-%S\")\n os.system(f\"python scripts/step_recorder.py --app {app} --demo {demo_name} --root_dir {root_dir}\")\n o"
+ },
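As a small aside, the timestamped demo name above comes from formatting the current time with strftime; a sketch with an assumed app name:

```python
import datetime
import time

app = "Gmail"   # assumed example app name
demo_name = datetime.datetime.fromtimestamp(int(time.time())).strftime(
    f"demo_{app}_%Y-%m-%d_%H-%M-%S")
print(demo_name)   # e.g. demo_Gmail_2024-01-02_10-30-00
```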
+ {
+ "comment": "Running document generation script for specified app and demo.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/learn.py\":43-43",
+ "content": "s.system(f\"python scripts/document_generation.py --app {app} --demo {demo_name} --root_dir {root_dir}\")"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/docs/doc/5be5c92a-4ba5-45e0-910c-cf1f595f1660.json b/docs/doc/5be5c92a-4ba5-45e0-910c-cf1f595f1660.json
new file mode 100644
index 0000000..7d3227b
--- /dev/null
+++ b/docs/doc/5be5c92a-4ba5-45e0-910c-cf1f595f1660.json
@@ -0,0 +1,40 @@
+{
+ "summary": "The code involves Android class definitions, adb command execution, unique identifier generation from XML attributes, and swipe actions with precision and duration options.",
+ "details": [
+ {
+ "comment": "Imports required libraries and defines a class for Android elements, function to execute ADB commands, lists all connected devices, and extracts ID from an Android element's bounds.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/scripts/and_controller.py\":0-42",
+ "content": "import os\nimport subprocess\nimport xml.etree.ElementTree as ET\nfrom config import load_config\nfrom utils import print_with_color\nconfigs = load_config()\nclass AndroidElement:\n def __init__(self, uid, bbox, attrib):\n self.uid = uid\n self.bbox = bbox\n self.attrib = attrib\ndef execute_adb(adb_command):\n # print(adb_command)\n result = subprocess.run(adb_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)\n if result.returncode == 0:\n return result.stdout.strip()\n print_with_color(f\"Command execution failed: {adb_command}\", \"red\")\n print_with_color(result.stderr, \"red\")\n return \"ERROR\"\ndef list_all_devices():\n adb_command = \"adb devices\"\n device_list = []\n result = execute_adb(adb_command)\n if result != \"ERROR\":\n devices = result.split(\"\\n\")[1:]\n for d in devices:\n device_list.append(d.split()[0])\n return device_list\ndef get_id_from_element(elem):\n bounds = elem.attrib[\"bounds\"][1:-1].split(\"][\")\n x1, y1 = map(int, bounds[0].split(\",\"))"
+ },
+ {
+ "comment": "This code snippet is parsing an XML file and generating unique identifiers for elements within the file. It extracts attributes such as resource-id, class, content-desc, and dimensions of each element to form the identifier. The function \"get_id_from_element\" generates the identifier based on these attributes, and the \"traverse_tree\" function traverses the XML tree, applying certain conditions to generate identifiers for elements that meet those criteria.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/scripts/and_controller.py\":43-64",
+ "content": " x2, y2 = map(int, bounds[1].split(\",\"))\n elem_w, elem_h = x2 - x1, y2 - y1\n if \"resource-id\" in elem.attrib and elem.attrib[\"resource-id\"]:\n elem_id = elem.attrib[\"resource-id\"].replace(\":\", \".\").replace(\"/\", \"_\")\n else:\n elem_id = f\"{elem.attrib['class']}_{elem_w}_{elem_h}\"\n if \"content-desc\" in elem.attrib and elem.attrib[\"content-desc\"] and len(elem.attrib[\"content-desc\"]) < 20:\n content_desc = elem.attrib['content-desc'].replace(\"/\", \"_\").replace(\" \", \"\").replace(\":\", \"_\")\n elem_id += f\"_{content_desc}\"\n return elem_id\ndef traverse_tree(xml_path, elem_list, attrib, add_index=False):\n path = []\n for event, elem in ET.iterparse(xml_path, ['start', 'end']):\n if event == 'start':\n path.append(elem)\n if attrib in elem.attrib and elem.attrib[attrib] == \"true\":\n parent_prefix = \"\"\n if len(path) > 1:\n parent_prefix = get_id_from_element(path[-2])\n bounds = elem.attrib[\"bounds\"][1:-1].split(\"][\")"
+ },
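To illustrate the ID scheme described above, here is a self-contained sketch of parsing a uiautomator bounds attribute and deriving an element ID. The helper names parse_bounds and make_elem_id and the example attribute values are invented for this sketch.

```python
def parse_bounds(bounds):
    """Parse a uiautomator bounds string such as '[0,96][1080,288]'."""
    first, second = bounds[1:-1].split("][")
    x1, y1 = map(int, first.split(","))
    x2, y2 = map(int, second.split(","))
    return (x1, y1), (x2, y2)

def make_elem_id(attrib):
    """Prefer the resource-id; otherwise fall back to class name plus element size."""
    (x1, y1), (x2, y2) = parse_bounds(attrib["bounds"])
    if attrib.get("resource-id"):
        return attrib["resource-id"].replace(":", ".").replace("/", "_")
    return f"{attrib['class']}_{x2 - x1}_{y2 - y1}"

print(make_elem_id({"bounds": "[0,96][1080,288]",
                    "resource-id": "com.example:id/title",
                    "class": "android.widget.TextView"}))   # com.example.id_title
```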
+ {
+ "comment": "Functionality: This code creates Android elements based on their bounding box coordinates and appends them to a list.\n\nExplanation: The code checks if the given element is close enough to an existing element in the list by comparing their bounding box centers' distance. If it's not close, it creates a new AndroidElement object with the provided ID, bounding box coordinates, and attributes, and appends it to the list. If it's already close, it skips creating a new element. At the end of the event (presumably loop), if 'end' is reached, the code removes the top element from the path stack.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/scripts/and_controller.py\":65-89",
+ "content": " x1, y1 = map(int, bounds[0].split(\",\"))\n x2, y2 = map(int, bounds[1].split(\",\"))\n center = (x1 + x2) // 2, (y1 + y2) // 2\n elem_id = get_id_from_element(elem)\n if parent_prefix:\n elem_id = parent_prefix + \"_\" + elem_id\n if add_index:\n elem_id += f\"_{elem.attrib['index']}\"\n close = False\n for e in elem_list:\n bbox = e.bbox\n center_ = (bbox[0][0] + bbox[1][0]) // 2, (bbox[0][1] + bbox[1][1]) // 2\n dist = (abs(center[0] - center_[0]) ** 2 + abs(center[1] - center_[1]) ** 2) ** 0.5\n if dist <= configs[\"MIN_DIST\"]:\n close = True\n break\n if not close:\n elem_list.append(AndroidElement(elem_id, ((x1, y1), (x2, y2)), attrib))\n if event == 'end':\n path.pop()\nclass AndroidController:\n def __init__(self, device):"
+ },
+ {
+ "comment": "This code is part of an Android controller that handles device-related operations. It sets the device, screenshot directory, XML directory, and gets the device's width and height. The `get_device_size` function retrieves the screen size using the ADB command, and `get_screenshot` takes a prefix and save directory to capture and save a screenshot.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/scripts/and_controller.py\":90-111",
+ "content": " self.device = device\n self.screenshot_dir = configs[\"ANDROID_SCREENSHOT_DIR\"]\n self.xml_dir = configs[\"ANDROID_XML_DIR\"]\n self.width, self.height = self.get_device_size()\n self.backslash = \"\\\\\"\n def get_device_size(self):\n adb_command = f\"adb -s {self.device} shell wm size\"\n result = execute_adb(adb_command)\n if result != \"ERROR\":\n return map(int, result.split(\": \")[1].split(\"x\"))\n return 0, 0\n def get_screenshot(self, prefix, save_dir):\n cap_command = f\"adb -s {self.device} shell screencap -p \" \\\n f\"{os.path.join(self.screenshot_dir, prefix + '.png').replace(self.backslash, '/')}\"\n pull_command = f\"adb -s {self.device} pull \" \\\n f\"{os.path.join(self.screenshot_dir, prefix + '.png').replace(self.backslash, '/')} \" \\\n f\"{os.path.join(save_dir, prefix + '.png')}\"\n result = execute_adb(cap_command)\n if result != \"ERROR\":\n result = execute_adb(pull_command)"
+ },
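For clarity, a one-liner sketch of how the `wm size` output is parsed into a width/height pair; the sample string below is hard-coded rather than read from adb.

```python
result = "Physical size: 1080x2400"     # example `adb shell wm size` output
width, height = map(int, result.split(": ")[1].split("x"))
print(width, height)                    # 1080 2400
```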
+ {
+ "comment": "This code defines a class with three methods. The `and_controller` class allows executing commands on an Android device using adb (Android Debug Bridge).\n\nThe `back()` method sends the back key event to the device.\n\nThe `get_xml(prefix, save_dir)` method dumps and pulls an XML file from the device to the specified save directory, returning the saved file path if successful; otherwise, it returns any error message.\n\nThe `execute_adb(command)` function is used to execute adb commands but its implementation is not shown in this code block.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/scripts/and_controller.py\":112-136",
+ "content": " if result != \"ERROR\":\n return os.path.join(save_dir, prefix + \".png\")\n return result\n return result\n def get_xml(self, prefix, save_dir):\n dump_command = f\"adb -s {self.device} shell uiautomator dump \" \\\n f\"{os.path.join(self.xml_dir, prefix + '.xml').replace(self.backslash, '/')}\"\n pull_command = f\"adb -s {self.device} pull \" \\\n f\"{os.path.join(self.xml_dir, prefix + '.xml').replace(self.backslash, '/')} \" \\\n f\"{os.path.join(save_dir, prefix + '.xml')}\"\n result = execute_adb(dump_command)\n if result != \"ERROR\":\n result = execute_adb(pull_command)\n if result != \"ERROR\":\n return os.path.join(save_dir, prefix + \".xml\")\n return result\n return result\n def back(self):\n adb_command = f\"adb -s {self.device} shell input keyevent KEYCODE_BACK\"\n ret = execute_adb(adb_command)\n return ret\n def tap(self, x, y):"
+ },
+ {
+ "comment": "The code above contains four methods: \"tap\", \"text\", \"long_press\", and \"swipe\". Each method takes specific arguments such as (x, y) coordinates for taps and swipes, input text for text input, and duration for long press. The methods execute adb commands on a connected device to perform the specified action.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/scripts/and_controller.py\":137-165",
+ "content": " adb_command = f\"adb -s {self.device} shell input tap {x} {y}\"\n ret = execute_adb(adb_command)\n return ret\n def text(self, input_str):\n input_str = input_str.replace(\" \", \"%s\")\n input_str = input_str.replace(\"'\", \"\")\n adb_command = f\"adb -s {self.device} shell input text {input_str}\"\n ret = execute_adb(adb_command)\n return ret\n def long_press(self, x, y, duration=1000):\n adb_command = f\"adb -s {self.device} shell input swipe {x} {y} {x} {y} {duration}\"\n ret = execute_adb(adb_command)\n return ret\n def swipe(self, x, y, direction, dist=\"medium\", quick=False):\n unit_dist = int(self.width / 10)\n if dist == \"long\":\n unit_dist *= 3\n elif dist == \"medium\":\n unit_dist *= 2\n if direction == \"up\":\n offset = 0, -2 * unit_dist\n elif direction == \"down\":\n offset = 0, 2 * unit_dist\n elif direction == \"left\":\n offset = -1 * unit_dist, 0\n elif direction == \"right\":"
+ },
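A tiny sketch of the text escaping mentioned above (spaces become %s and single quotes are dropped) before the string is handed to `adb shell input text`; the device id here is a placeholder.

```python
input_str = "What's the weather today"
escaped = input_str.replace(" ", "%s").replace("'", "")
adb_command = f"adb -s emulator-5554 shell input text {escaped}"   # hypothetical device id
print(adb_command)
```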
+ {
+ "comment": "Code performs swipe actions on a device using ADB (Android Debug Bridge) commands. It allows for different swipe durations based on the \"quick\" parameter and has two functions: \"swipe\" and \"swipe_precise\".",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/scripts/and_controller.py\":166-179",
+ "content": " offset = unit_dist, 0\n else:\n return \"ERROR\"\n duration = 100 if quick else 400\n adb_command = f\"adb -s {self.device} shell input swipe {x} {y} {x+offset[0]} {y+offset[1]} {duration}\"\n ret = execute_adb(adb_command)\n return ret\n def swipe_precise(self, start, end, duration=400):\n start_x, start_y = start\n end_x, end_y = end\n adb_command = f\"adb -s {self.device} shell input swipe {start_x} {start_x} {end_x} {end_y} {duration}\"\n ret = execute_adb(adb_command)\n return ret"
+ }
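To spell out the swipe arithmetic, here is a sketch of the offset computation under assumed inputs: the step is a tenth of the screen width, scaled by the requested distance, and the direction picks the axis and sign. The function name swipe_offset is invented for this sketch.

```python
def swipe_offset(width, direction, dist="medium"):
    """Return the (dx, dy) offset for a swipe, following the rules quoted above."""
    unit = width // 10
    if dist == "long":
        unit *= 3
    elif dist == "medium":
        unit *= 2
    return {
        "up": (0, -2 * unit),
        "down": (0, 2 * unit),
        "left": (-unit, 0),
        "right": (unit, 0),
    }.get(direction, "ERROR")

print(swipe_offset(1080, "up", "medium"))   # (0, -432)
```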
+ ]
+}
\ No newline at end of file
diff --git a/docs/doc/7c00fefd-e68c-41f5-82e6-1e85e7bb9fb1.json b/docs/doc/7c00fefd-e68c-41f5-82e6-1e85e7bb9fb1.json
new file mode 100644
index 0000000..0d300a8
--- /dev/null
+++ b/docs/doc/7c00fefd-e68c-41f5-82e6-1e85e7bb9fb1.json
@@ -0,0 +1,10 @@
+{
+ "summary": "Code imports necessary modules, sets up an argument parser, and retrieves app name from user input before executing a task.",
+ "details": [
+ {
+ "comment": "Code imports necessary modules, sets up an argument parser, and retrieves app name from user input before executing a task.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/run.py\":0-24",
+ "content": "import argparse\nimport os\nfrom scripts.utils import print_with_color\narg_desc = \"AppAgent - deployment phase\"\nparser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=arg_desc)\nparser.add_argument(\"--app\")\nparser.add_argument(\"--root_dir\", default=\"./\")\nargs = vars(parser.parse_args())\napp = args[\"app\"]\nroot_dir = args[\"root_dir\"]\nprint_with_color(\"Welcome to the deployment phase of AppAgent!\\nBefore giving me the task, you should first tell me \"\n \"the name of the app you want me to operate and what documentation base you want me to use. I will \"\n \"try my best to complete the task without your intervention. First, please enter the main interface \"\n \"of the app on your phone and provide the following information.\", \"yellow\")\nif not app:\n print_with_color(\"What is the name of the target app?\", \"blue\")\n app = input()\n app = app.replace(\" \", \"\")\nos.system(f\"python scripts/task_executor.py --app {app} --root_dir {root_dir}\")"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/docs/doc/9fcf2d6b-1fc1-4f41-b278-d272ae653e5e.json b/docs/doc/9fcf2d6b-1fc1-4f41-b278-d272ae653e5e.json
new file mode 100644
index 0000000..ad1d98d
--- /dev/null
+++ b/docs/doc/9fcf2d6b-1fc1-4f41-b278-d272ae653e5e.json
@@ -0,0 +1,60 @@
+{
+ "summary": "The code prepares the environment, generates tasks, and logs interactions with GPT-4. It handles various actions, checks for completion, queries GPT-4 upon task completion, logs relevant information, manages errors, and processes, logs, and updates actions while managing errors and documentation; autonomous exploration ends upon reaching max rounds or in case of unexpected events, displaying a yellow or red message with doc count and success status.",
+ "details": [
+ {
+ "comment": "The code imports necessary libraries and defines arguments for executing the autonomous exploration script. It then loads configuration files, retrieves input from the user for target app name, creates directories if they don't exist, and prepares the environment for running the script.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/scripts/self_explorer.py\":0-37",
+ "content": "import argparse\nimport ast\nimport datetime\nimport json\nimport os\nimport re\nimport sys\nimport time\nimport prompts\nfrom config import load_config\nfrom and_controller import list_all_devices, AndroidController, traverse_tree\nfrom model import ask_gpt4v, parse_explore_rsp, parse_reflect_rsp\nfrom utils import print_with_color, draw_bbox_multi, encode_image\narg_desc = \"AppAgent - Autonomous Exploration\"\nparser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=arg_desc)\nparser.add_argument(\"--app\")\nparser.add_argument(\"--root_dir\", default=\"./\")\nargs = vars(parser.parse_args())\nconfigs = load_config()\napp = args[\"app\"]\nroot_dir = args[\"root_dir\"]\nif not app:\n print_with_color(\"What is the name of the target app?\", \"blue\")\n app = input()\n app = app.replace(\" \", \"\")\nwork_dir = os.path.join(root_dir, \"apps\")\nif not os.path.exists(work_dir):\n os.mkdir(work_dir)\nwork_dir = os.path.join(work_dir, app)\nif not os.path.exists(work_dir):\n os.mkdir(work_dir)\ndemo_dir = os.path.join(work_dir, \"demos\")"
+ },
+ {
+ "comment": "Checking if demo directory exists and creating it, then generating a task name and directory. Creating directories for auto_docs, log files, listing devices, and assigning one device based on the number of devices found.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/scripts/self_explorer.py\":38-62",
+ "content": "if not os.path.exists(demo_dir):\n os.mkdir(demo_dir)\ndemo_timestamp = int(time.time())\ntask_name = datetime.datetime.fromtimestamp(demo_timestamp).strftime(\"self_explore_%Y-%m-%d_%H-%M-%S\")\ntask_dir = os.path.join(demo_dir, task_name)\nos.mkdir(task_dir)\ndocs_dir = os.path.join(work_dir, \"auto_docs\")\nif not os.path.exists(docs_dir):\n os.mkdir(docs_dir)\nexplore_log_path = os.path.join(task_dir, f\"log_explore_{task_name}.txt\")\nreflect_log_path = os.path.join(task_dir, f\"log_reflect_{task_name}.txt\")\ndevice_list = list_all_devices()\nif not device_list:\n print_with_color(\"ERROR: No device found!\", \"red\")\n sys.exit()\nprint_with_color(f\"List of devices attached:\\n{str(device_list)}\", \"yellow\")\nif len(device_list) == 1:\n device = device_list[0]\n print_with_color(f\"Device selected: {device}\", \"yellow\")\nelse:\n print_with_color(\"Please choose the Android device to start demo by entering its ID:\", \"blue\")\n device = input()\ncontroller = AndroidController(device)\nwidth, height = controller.get_device_size()"
+ },
+ {
+ "comment": "Checks if width and height are provided. If not, displays an error message and exits. Otherwise, prints device resolution and prompts for task description. Starts a loop to complete the task in multiple rounds until reaching MAX_ROUNDS.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/scripts/self_explorer.py\":63-90",
+ "content": "if not width and not height:\n print_with_color(\"ERROR: Invalid device size!\", \"red\")\n sys.exit()\nprint_with_color(f\"Screen resolution of {device}: {width}x{height}\", \"yellow\")\nprint_with_color(\"Please enter the description of the task you want me to complete in a few sentences:\", \"blue\")\ntask_desc = input()\nround_count = 0\ndoc_count = 0\nuseless_list = set()\nlast_act = \"None\"\ntask_complete = False\nwhile round_count < configs[\"MAX_ROUNDS\"]:\n round_count += 1\n print_with_color(f\"Round {round_count}\", \"yellow\")\n screenshot_before = controller.get_screenshot(f\"{round_count}_before\", task_dir)\n xml_path = controller.get_xml(f\"{round_count}\", task_dir)\n if screenshot_before == \"ERROR\" or xml_path == \"ERROR\":\n break\n clickable_list = []\n focusable_list = []\n traverse_tree(xml_path, clickable_list, \"clickable\", True)\n traverse_tree(xml_path, focusable_list, \"focusable\", True)\n elem_list = []\n for elem in clickable_list:\n if elem.uid in useless_list:\n continue"
+ },
+ {
+ "comment": "This code finds focusable elements on the screen, checks if they are close to any clickable elements, and adds them to a list. It then draws bounding boxes around these elements in an image and generates a task prompt with the image encoded as base64.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/scripts/self_explorer.py\":91-112",
+ "content": " elem_list.append(elem)\n for elem in focusable_list:\n if elem.uid in useless_list:\n continue\n bbox = elem.bbox\n center = (bbox[0][0] + bbox[1][0]) // 2, (bbox[0][1] + bbox[1][1]) // 2\n close = False\n for e in clickable_list:\n bbox = e.bbox\n center_ = (bbox[0][0] + bbox[1][0]) // 2, (bbox[0][1] + bbox[1][1]) // 2\n dist = (abs(center[0] - center_[0]) ** 2 + abs(center[1] - center_[1]) ** 2) ** 0.5\n if dist <= configs[\"MIN_DIST\"]:\n close = True\n break\n if not close:\n elem_list.append(elem)\n draw_bbox_multi(screenshot_before, os.path.join(task_dir, f\"{round_count}_before_labeled.png\"), elem_list,\n dark_mode=configs[\"DARK_MODE\"])\n prompt = re.sub(r\"\", task_desc, prompts.self_explore_task_template)\n prompt = re.sub(r\"\", last_act, prompt)\n base64_img_before = encode_image(os.path.join(task_dir, f\"{round_count}_before_labeled.png\"))"
+ },
+ {
+ "comment": "This code is sending a prompt to GPT-4 and receiving a response. It then logs the step, prompt, image, and response before parsing the response and checking if it's a \"FINISH\" command or a \"tap\" action.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/scripts/self_explorer.py\":113-144",
+ "content": " content = [\n {\n \"type\": \"text\",\n \"text\": prompt\n },\n {\n \"type\": \"image_url\",\n \"image_url\": {\n \"url\": f\"data:image/jpeg;base64,{base64_img_before}\"\n }\n }\n ]\n print_with_color(\"Thinking about what to do in the next step...\", \"yellow\")\n rsp = ask_gpt4v(content)\n if \"error\" not in rsp:\n with open(explore_log_path, \"a\") as logfile:\n log_item = {\"step\": round_count, \"prompt\": prompt, \"image\": f\"{round_count}_before_labeled.png\",\n \"response\": rsp}\n logfile.write(json.dumps(log_item) + \"\\n\")\n res = parse_explore_rsp(rsp)\n act_name = res[0]\n last_act = res[-1]\n res = res[:-1]\n if act_name == \"FINISH\":\n task_complete = True\n break\n if act_name == \"tap\":\n _, area = res\n tl, br = elem_list[area - 1].bbox\n x, y = (tl[0] + br[0]) // 2, (tl[1] + br[1]) // 2\n ret = controller.tap(x, y)"
+ },
+ {
+ "comment": "This code handles different actions (tap, text, long_press, swipe) performed by the script. It checks if the execution of each action fails and prints an error message with color formatting in case of failure.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/scripts/self_explorer.py\":145-168",
+ "content": " if ret == \"ERROR\":\n print_with_color(\"ERROR: tap execution failed\", \"red\")\n break\n elif act_name == \"text\":\n _, input_str = res\n ret = controller.text(input_str)\n if ret == \"ERROR\":\n print_with_color(\"ERROR: text execution failed\", \"red\")\n break\n elif act_name == \"long_press\":\n _, area = res\n tl, br = elem_list[area - 1].bbox\n x, y = (tl[0] + br[0]) // 2, (tl[1] + br[1]) // 2\n ret = controller.long_press(x, y)\n if ret == \"ERROR\":\n print_with_color(\"ERROR: long press execution failed\", \"red\")\n break\n elif act_name == \"swipe\":\n _, area, swipe_dir, dist = res\n tl, br = elem_list[area - 1].bbox\n x, y = (tl[0] + br[0]) // 2, (tl[1] + br[1]) // 2\n ret = controller.swipe(x, y, swipe_dir, dist)\n if ret == \"ERROR\":\n print_with_color(\"ERROR: swipe execution failed\", \"red\")"
+ },
+ {
+ "comment": "Checking if task is complete and breaks loop",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/scripts/self_explorer.py\":169-194",
+ "content": " break\n else:\n break\n time.sleep(configs[\"REQUEST_INTERVAL\"])\n else:\n print_with_color(rsp[\"error\"][\"message\"], \"red\")\n break\n screenshot_after = controller.get_screenshot(f\"{round_count}_after\", task_dir)\n if screenshot_after == \"ERROR\":\n break\n draw_bbox_multi(screenshot_after, os.path.join(task_dir, f\"{round_count}_after_labeled.png\"), elem_list,\n dark_mode=configs[\"DARK_MODE\"])\n base64_img_after = encode_image(os.path.join(task_dir, f\"{round_count}_after_labeled.png\"))\n if act_name == \"tap\":\n prompt = re.sub(r\"\", \"tapping\", prompts.self_explore_reflect_template)\n elif act_name == \"text\":\n continue\n elif act_name == \"long_press\":\n prompt = re.sub(r\"\", \"long pressing\", prompts.self_explore_reflect_template)\n elif act_name == \"swipe\":\n swipe_dir = res[2]\n if swipe_dir == \"up\" or swipe_dir == \"down\":\n act_name = \"v_swipe\"\n elif swipe_dir == \"left\" or swipe_dir == \"right\":"
+ },
+ {
+ "comment": "Code is preparing a message to ask GPT-4 about a previous action. It replaces placeholders in the prompt with appropriate values and sends it to GPT-4 for response. If there's no error in the response, it logs relevant information into reflect_log_path.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/scripts/self_explorer.py\":195-227",
+ "content": " act_name = \"h_swipe\"\n prompt = re.sub(r\"\", \"swiping\", prompts.self_explore_reflect_template)\n else:\n print_with_color(\"ERROR: Undefined act!\", \"red\")\n break\n prompt = re.sub(r\"\", str(area), prompt)\n prompt = re.sub(r\"\", task_desc, prompt)\n prompt = re.sub(r\"\", last_act, prompt)\n content = [\n {\n \"type\": \"text\",\n \"text\": prompt\n },\n {\n \"type\": \"image_url\",\n \"image_url\": {\n \"url\": f\"data:image/jpeg;base64,{base64_img_before}\"\n }\n },\n {\n \"type\": \"image_url\",\n \"image_url\": {\n \"url\": f\"data:image/jpeg;base64,{base64_img_after}\"\n }\n }\n ]\n print_with_color(\"Reflecting on my previous action...\", \"yellow\")\n rsp = ask_gpt4v(content)\n if \"error\" not in rsp:\n resource_id = elem_list[int(area) - 1].uid\n with open(reflect_log_path, \"a\") as logfile:\n "
+ },
+ {
+ "comment": "Writing log item to file\nParses response and makes decision\nHandles \"ERROR\", \"INEFFECTIVE\", and other decisions\nIf \"BACK\" or \"CONTINUE\", adds resource_id to useless list, sets last_act to \"None\", and executes back action if necessary\nDocs processing begins",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/scripts/self_explorer.py\":227-248",
+ "content": "log_item = {\"step\": round_count, \"prompt\": prompt, \"image_before\": f\"{round_count}_before_labeled.png\",\n \"image_after\": f\"{round_count}_after.png\", \"response\": rsp}\n logfile.write(json.dumps(log_item) + \"\\n\")\n res = parse_reflect_rsp(rsp)\n decision = res[0]\n if decision == \"ERROR\":\n break\n if decision == \"INEFFECTIVE\":\n useless_list.add(resource_id)\n last_act = \"None\"\n elif decision == \"BACK\" or decision == \"CONTINUE\" or decision == \"SUCCESS\":\n if decision == \"BACK\" or decision == \"CONTINUE\":\n useless_list.add(resource_id)\n last_act = \"None\"\n if decision == \"BACK\":\n ret = controller.back()\n if ret == \"ERROR\":\n print_with_color(\"ERROR: back execution failed\", \"red\")\n break\n doc = res[-1]\n doc_name = resource_id + \".txt\"\n doc_path = os.path.join(docs_dir, doc_name)"
+ },
+ {
+ "comment": "If file exists, read its content, and if the action's documentation already exists, print a message and continue. Otherwise, create an empty dictionary for the document content, add the current action's documentation, save it to file, increment documentation count, and print a success message. If there is an undefined decision or error in response, print an error message and break the loop. After each task, sleep for the specified request interval.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/scripts/self_explorer.py\":249-275",
+ "content": " if os.path.exists(doc_path):\n doc_content = ast.literal_eval(open(doc_path).read())\n if doc_content[act_name]:\n print_with_color(f\"Documentation for the element {resource_id} already exists.\", \"yellow\")\n continue\n else:\n doc_content = {\n \"tap\": \"\",\n \"text\": \"\",\n \"v_swipe\": \"\",\n \"h_swipe\": \"\",\n \"long_press\": \"\"\n }\n doc_content[act_name] = doc\n with open(doc_path, \"w\") as outfile:\n outfile.write(str(doc_content))\n doc_count += 1\n print_with_color(f\"Documentation generated and saved to {doc_path}\", \"yellow\")\n else:\n print_with_color(f\"ERROR: Undefined decision! {decision}\", \"red\")\n break\n else:\n print_with_color(rsp[\"error\"][\"message\"], \"red\")\n break\n time.sleep(configs[\"REQUEST_INTERVAL\"])\nif task_complete:"
+ },
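A sketch of the per-element doc file handled above: a plain Python dict literal with one slot per action type, written with str() and read back with ast.literal_eval. The uid and description are invented, and the file is written to a temp directory here rather than auto_docs.

```python
import ast
import os
import tempfile

# One slot per supported action; empty strings mean "not documented yet".
doc_content = {"tap": "", "text": "", "v_swipe": "", "h_swipe": "", "long_press": ""}
doc_content["tap"] = "Tapping this area opens the compose screen."   # invented description

doc_path = os.path.join(tempfile.gettempdir(), "com.example.id_compose.txt")  # hypothetical uid
with open(doc_path, "w") as f:
    f.write(str(doc_content))

# A later round reads the literal back and only fills slots that are still empty.
loaded = ast.literal_eval(open(doc_path).read())
print(bool(loaded["tap"]), bool(loaded["text"]))   # True False
```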
+ {
+ "comment": "Autonomous exploration ended. Yellow message if max rounds reached, red if unexpected, displays doc count and success status.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/scripts/self_explorer.py\":276-281",
+ "content": " print_with_color(f\"Autonomous exploration completed successfully. {doc_count} docs generated.\", \"yellow\")\nelif round_count == configs[\"MAX_ROUNDS\"]:\n print_with_color(f\"Autonomous exploration finished due to reaching max rounds. {doc_count} docs generated.\",\n \"yellow\")\nelse:\n print_with_color(f\"Autonomous exploration finished unexpectedly. {doc_count} docs generated.\", \"red\")"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/docs/doc/a2307bd8-618f-4b93-9d2c-0c67b787f421.json b/docs/doc/a2307bd8-618f-4b93-9d2c-0c67b787f421.json
new file mode 100644
index 0000000..c21c5c7
--- /dev/null
+++ b/docs/doc/a2307bd8-618f-4b93-9d2c-0c67b787f421.json
@@ -0,0 +1,25 @@
+{
+ "summary": "The code defines `print_with_color` and `draw_bbox_multi` functions, used to print text with different colors, draw bounding boxes on images, read images, draw grid lines and labels, save changes using OpenCV's putText, and encode in base64 format.",
+ "details": [
+ {
+ "comment": "The code defines a function `print_with_color` that allows printing text with different colors and a function `draw_bbox_multi` that draws bounding boxes on an image. The `print_with_color` function takes in a string to print and an optional color argument, which can be \"red\", \"green\", \"yellow\", \"blue\", \"magenta\", \"cyan\", \"white\" or \"black\". It then prints the text with the specified color. The `draw_bbox_multi` function reads an image from a file, loops through a list of elements (each having bounding box coordinates), and draws rectangles around each element on the image. Optionally, it can also keep track of the elements order in the record mode and use dark colors in dark mode.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/scripts/utils.py\":0-38",
+ "content": "import base64\nimport cv2\nimport pyshine as ps\nfrom colorama import Fore, Style\ndef print_with_color(text: str, color=\"\"):\n if color == \"red\":\n print(Fore.RED + text)\n elif color == \"green\":\n print(Fore.GREEN + text)\n elif color == \"yellow\":\n print(Fore.YELLOW + text)\n elif color == \"blue\":\n print(Fore.BLUE + text)\n elif color == \"magenta\":\n print(Fore.MAGENTA + text)\n elif color == \"cyan\":\n print(Fore.CYAN + text)\n elif color == \"white\":\n print(Fore.WHITE + text)\n elif color == \"black\":\n print(Fore.BLACK + text)\n else:\n print(text)\n print(Style.RESET_ALL)\ndef draw_bbox_multi(img_path, output_path, elem_list, record_mode=False, dark_mode=False):\n imgcv = cv2.imread(img_path)\n count = 1\n for elem in elem_list:\n try:\n top_left = elem.bbox[0]\n bottom_right = elem.bbox[1]\n left, top = top_left[0], top_left[1]\n right, bottom = bottom_right[0], bottom_right[1]\n label = str(count)"
+ },
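As a design note, the colour dispatch in print_with_color could equally be expressed as a lookup table. The following is an alternative sketch, not the project's code, using the same colorama constants.

```python
from colorama import Fore, Style

COLORS = {
    "red": Fore.RED, "green": Fore.GREEN, "yellow": Fore.YELLOW, "blue": Fore.BLUE,
    "magenta": Fore.MAGENTA, "cyan": Fore.CYAN, "white": Fore.WHITE, "black": Fore.BLACK,
}

def print_with_color(text: str, color: str = "") -> None:
    # Unknown or empty colour names fall back to plain text.
    print(COLORS.get(color, "") + text)
    print(Style.RESET_ALL)

print_with_color("Device selected: emulator-5554", "yellow")   # example usage
```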
+ {
+ "comment": "Code checks if record mode is active and assigns a color based on whether the element is clickable or focusable. If not in record mode, it sets colors depending on dark/light mode. Then, puts bounding text on the image with specified colors and positions.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/scripts/utils.py\":39-54",
+ "content": " if record_mode:\n if elem.attrib == \"clickable\":\n color = (250, 0, 0)\n elif elem.attrib == \"focusable\":\n color = (0, 0, 250)\n else:\n color = (0, 250, 0)\n imgcv = ps.putBText(imgcv, label, text_offset_x=(left + right) // 2 + 10, text_offset_y=(top + bottom) // 2 + 10,\n vspace=10, hspace=10, font_scale=1, thickness=2, background_RGB=color,\n text_RGB=(255, 250, 250), alpha=0.5)\n else:\n text_color = (10, 10, 10) if dark_mode else (255, 250, 250)\n bg_color = (255, 250, 250) if dark_mode else (10, 10, 10)\n imgcv = ps.putBText(imgcv, label, text_offset_x=(left + right) // 2 + 10, text_offset_y=(top + bottom) // 2 + 10,\n vspace=10, hspace=10, font_scale=1, thickness=2, background_RGB=bg_color,\n text_RGB=text_color, alpha=0.5)"
+ },
+ {
+ "comment": "This code reads an image and draws grid lines on it, dividing the image into equally sized squares. The size of the squares can be customized based on the image's width and height. If any exceptions occur while labeling the image, it will print an error message in red color.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/scripts/utils.py\":55-88",
+ "content": " except Exception as e:\n print_with_color(f\"ERROR: An exception occurs while labeling the image\\n{e}\", \"red\")\n count += 1\n cv2.imwrite(output_path, imgcv)\n return imgcv\ndef draw_grid(img_path, output_path):\n def get_unit_len(n):\n for i in range(1, n + 1):\n if n % i == 0 and 120 <= i <= 180:\n return i\n return -1\n image = cv2.imread(img_path)\n height, width, _ = image.shape\n color = (255, 116, 113)\n unit_height = get_unit_len(height)\n if unit_height < 0:\n unit_height = 120\n unit_width = get_unit_len(width)\n if unit_width < 0:\n unit_width = 120\n thick = int(unit_width // 50)\n rows = height // unit_height\n cols = width // unit_width\n for i in range(rows):\n for j in range(cols):\n label = i * cols + j + 1\n left = int(j * unit_width)\n top = int(i * unit_height)\n right = int((j + 1) * unit_width)\n bottom = int((i + 1) * unit_height)\n cv2.rectangle(image, (left, top), (right, bottom), color, thick // 2)"
+ },
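To make the cell sizing concrete, here is the divisor-selection rule from draw_grid as a standalone sketch; the screen dimensions are assumed values.

```python
def get_unit_len(n):
    """Return a divisor of n between 120 and 180 pixels, or -1 if none exists."""
    for i in range(1, n + 1):
        if n % i == 0 and 120 <= i <= 180:
            return i
    return -1

height, width = 2400, 1080                  # assumed screen size
unit_h = get_unit_len(height)
unit_w = get_unit_len(width)
unit_h = unit_h if unit_h > 0 else 120      # fall back to 120 px cells
unit_w = unit_w if unit_w > 0 else 120
print(height // unit_h, width // unit_w)    # grid rows and columns -> 20 9
```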
+ {
+ "comment": "This code is used to draw text on an image and then save the modified image using OpenCV's putText function. The image is then encoded in base64 format.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/scripts/utils.py\":89-99",
+ "content": " cv2.putText(image, str(label), (left + int(unit_width * 0.05) + 3, top + int(unit_height * 0.3) + 3), 0,\n int(0.01 * unit_width), (0, 0, 0), thick)\n cv2.putText(image, str(label), (left + int(unit_width * 0.05), top + int(unit_height * 0.3)), 0,\n int(0.01 * unit_width), color, thick)\n cv2.imwrite(output_path, image)\n return rows, cols\ndef encode_image(image_path):\n with open(image_path, \"rb\") as image_file:\n return base64.b64encode(image_file.read()).decode('utf-8')"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/docs/doc/acddb05b-7112-40f0-8bef-fe2a0bd1a5f7.json b/docs/doc/acddb05b-7112-40f0-8bef-fe2a0bd1a5f7.json
new file mode 100644
index 0000000..8a5a6b6
--- /dev/null
+++ b/docs/doc/acddb05b-7112-40f0-8bef-fe2a0bd1a5f7.json
@@ -0,0 +1,90 @@
+{
+ "summary": "The code showcases mobile app UI templates for touch interactions, and instructs in describing UI elements task-oriented manner with pronouns, specifying output format as \"Decision: SUCCESS\" followed by an explanation of the action's impact on the task.",
+ "details": [
+ {
+ "comment": "ap_doc_template: Describes a mobile app screenshot before and after tapping a UI element with a number, focusing on the general function without mentioning numeric tag or specific details.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/scripts/prompts.py\":0-9",
+ "content": "tap_doc_template = \"\"\"I will give you the screenshot of a mobile app before and after tapping the UI element labeled \nwith the number on the screen. The numeric tag of each element is located at the center of the element. \nTapping this UI element is a necessary part of proceeding with a larger task, which is to . Your task is to \ndescribe the functionality of the UI element concisely in one or two sentences. Notice that your description of the UI \nelement should focus on the general function. For example, if the UI element is used to navigate to the chat window \nwith John, your description should not include the name of the specific person. Just say: \"Tapping this area will \nnavigate the user to the chat window\". Never include the numeric tag of the UI element in your description. You can use \npronouns such as \"the UI element\" to refer to the element.\"\"\"\ntext_doc_template = \"\"\"I will give you the screenshot of a mobile app before and after typing in the input area labeled"
+ },
+ {
+ "comment": "Long press documentation template for a mobile app UI element. Provides screenshot comparison before and after long pressing.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/scripts/prompts.py\":10-19",
+ "content": "with the number on the screen. The numeric tag of each element is located at the center of the element. \nTyping in this UI element is a necessary part of proceeding with a larger task, which is to . Your task is \nto describe the functionality of the UI element concisely in one or two sentences. Notice that your description of the \nUI element should focus on the general function. For example, if the change of the screenshot shows that the user typed \n\"How are you?\" in the chat box, you do not need to mention the actual text. Just say: \"This input area is used for the \nuser to type a message to send to the chat window.\". Never include the numeric tag of the UI element in your \ndescription. You can use pronouns such as \"the UI element\" to refer to the element.\"\"\"\nlong_press_doc_template = \"\"\"I will give you the screenshot of a mobile app before and after long pressing the UI \nelement labeled with the number on the screen. The numeric tag of each element is located at the center of "
+ },
+ {
+ "comment": "This code is generating a template for describing the functionality of swiping a specific UI element in a mobile app. The description should focus on the general function, without including the numeric tag or name of the person related to the task.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/scripts/prompts.py\":20-29",
+ "content": "the element. Long pressing this UI element is a necessary part of proceeding with a larger task, which is to \n. Your task is to describe the functionality of the UI element concisely in one or two sentences. Notice \nthat your description of the UI element should focus on the general function. For example, if long pressing the UI \nelement redirects the user to the chat window with John, your description should not include the name of the specific \nperson. Just say: \"Long pressing this area will redirect the user to the chat window\". Never include the numeric tag of \nthe UI element in your description. You can use pronouns such as \"the UI element\" to refer to the element.\"\"\"\nswipe_doc_template = \"\"\"I will give you the screenshot of a mobile app before and after swiping the UI \nelement labeled with the number on the screen. The numeric tag of each element is located at the center of \nthe element. Swiping this UI element is a necessary part of proceeding with a larger task, which is to . "
+ },
+ {
+ "comment": "This code appears to be related to generating documentation for a UI element. It provides instructions on how to describe the functionality of the UI element concisely, using general terms and without including the numeric tag. The \"refine_doc_suffix\" variable suggests incorporating previous documentation if available, but also resolving any conflicts that might arise due to flexibility in the UI element's function.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/scripts/prompts.py\":30-40",
+ "content": "Your task is to describe the functionality of the UI element concisely in one or two sentences. Notice that your \ndescription of the UI element should be as general as possible. For example, if swiping the UI element increases the \ncontrast ratio of an image of a building, your description should be just like this: \"Swiping this area enables the \nuser to tune a specific parameter of the image\". Never include the numeric tag of the UI element in your description. \nYou can use pronouns such as \"the UI element\" to refer to the element.\"\"\"\nrefine_doc_suffix = \"\"\"\\nA documentation of this UI element generated from previous demos is shown below. Your \ngenerated description should be based on this previous doc and optimize it. Notice that it is possible that your \nunderstanding of the function of the UI element derived from the given screenshots conflicts with the previous doc, \nbecause the function of a UI element can be flexible. In this case, your generated description should combine both.\nOld documentation of this UI element: \"\"\""
+ },
+ {
+ "comment": "The code provides a template for instructions on how to interact with a smartphone using two functions: tap() and text(). It explains that the user needs to provide an element number for tap(), and a string wrapped in double quotes for text(). These functions can be used to control the phone, such as tapping UI elements or inserting text input.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/scripts/prompts.py\":42-56",
+ "content": "task_template = \"\"\"You are an agent that is trained to perform some basic tasks on a smartphone. You will be given a \nsmartphone screenshot. The interactive UI elements on the screenshot are labeled with numeric tags starting from 1. The \nnumeric tag of each interactive element is located in the center of the element.\nYou can call the following functions to control the smartphone:\n1. tap(element: int)\nThis function is used to tap an UI element shown on the smartphone screen.\n\"element\" is a numeric tag assigned to an UI element shown on the smartphone screen.\nA simple use case can be tap(5), which taps the UI element labeled with the number 5.\n2. text(text_input: str)\nThis function is used to insert text input in an input field/box. text_input is the string you want to insert and must \nbe wrapped with double quotation marks. A simple use case can be text(\"Hello, world!\"), which inserts the string \n\"Hello, world!\" into the input area on the smartphone screen. This function is usually callable when you see a keyboard "
+ },
+ {
+ "comment": "Code snippet 57-70 describes functions for interacting with smartphone screen elements. \"long_press(element: int)\" is used to long press an element identified by its numeric tag. \"swipe(element: int, direction: str, dist: str)\" swipes an element in a specific direction and distance specified by the input parameters. The element is identified using its numeric tag, while direction and distance are string inputs enclosed in quotes.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/scripts/prompts.py\":57-70",
+ "content": "showing in the lower half of the screen.\n3. long_press(element: int)\nThis function is used to long press an UI element shown on the smartphone screen.\n\"element\" is a numeric tag assigned to an UI element shown on the smartphone screen.\nA simple use case can be long_press(5), which long presses the UI element labeled with the number 5.\n4. swipe(element: int, direction: str, dist: str)\nThis function is used to swipe an UI element shown on the smartphone screen, usually a scroll view or a slide bar.\n\"element\" is a numeric tag assigned to an UI element shown on the smartphone screen. \"direction\" is a string that \nrepresents one of the four directions: up, down, left, right. \"direction\" must be wrapped with double quotation \nmarks. \"dist\" determines the distance of the swipe and can be one of the three options: short, medium, long. You should \nchoose the appropriate distance option according to your need.\nA simple use case can be swipe(21, \"up\", \"medium\"), which swipes up the UI element labeled with the number 21 for a "
+ },
+ {
+ "comment": "The code is providing instructions for a function grid() to be used when an element isn't labeled with a numeric tag and other elements can't help. It brings up a grid overlay to allow selecting any part of the screen for tapping, long pressing or swiping. The user needs to observe the image, think about the next step and call the correct function with parameters.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/scripts/prompts.py\":71-86",
+ "content": "medium distance.\n5. grid()\nYou should call this function when you find the element you want to interact with is not labeled with a numeric tag and \nother elements with numeric tags cannot help with the task. The function will bring up a grid overlay to divide the \nsmartphone screen into small areas and this will give you more freedom to choose any part of the screen to tap, long \npress, or swipe.\n\nThe task you need to complete is to . Your past actions to proceed with this task are summarized as \nfollows: \nNow, given the documentation and the following labeled screenshot, you need to think and call the function needed to \nproceed with the task. Your output should include three parts in the given format:\nObservation: \nThought: \nAction: \nSummary: \nYou can only take one action at a time, so please directly call the function.\"\"\"\ntask_template_grid = \"\"\"You are an agent that is trained to perform some basic tasks on a smartphone. You will be given \na smartphone screenshot overlaid by a grid. The grid divides the screenshot into small square areas. Each area is \nlabeled with an integer in the top-left corner.\nYou can call the following functions to control the smartphone:\n1. tap(area: int, subarea: str)\nThis function is used to tap a grid area shown on the smartphone screen. \"area\" is the integer label assigned to a grid \narea shown on the smartphone screen. \"subarea\" is a string representing the exact location to tap within the grid area. \nIt can take one of the nine values: center, top-left, top, top-right, left, right, bottom-left, bottom, and "
+ },
+ {
+ "comment": "This code defines functions for interacting with a smartphone screen, including tap, long_press, and swipe actions. The functions allow specifying the grid area and subarea for precise touch interactions.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/scripts/prompts.py\":102-116",
+ "content": "bottom-right.\nA simple use case can be tap(5, \"center\"), which taps the exact center of the grid area labeled with the number 5.\n2. long_press(area: int, subarea: str)\nThis function is used to long press a grid area shown on the smartphone screen. \"area\" is the integer label assigned to \na grid area shown on the smartphone screen. \"subarea\" is a string representing the exact location to long press within \nthe grid area. It can take one of the nine values: center, top-left, top, top-right, left, right, bottom-left, bottom, \nand bottom-right.\nA simple use case can be long_press(7, \"top-left\"), which long presses the top left part of the grid area labeled with \nthe number 7.\n3. swipe(start_area: int, start_subarea: str, end_area: int, end_subarea: str)\nThis function is used to perform a swipe action on the smartphone screen, especially when you want to interact with a \nscroll view or a slide bar. \"start_area\" is the integer label assigned to the grid area which marks the starting \nlocation of the swipe. \"start_subarea\" is a string representing the exact location to begin the swipe within the grid "
+ },
+ {
+ "comment": "This code is defining a swipe function that allows users to perform a swipe action between two grid areas with specified start and end subareas. It takes four parameters: the starting area, start subarea, ending area, and end subarea. The possible subarea values are center, top-left, top, top-right, left, right, bottom-left, bottom, and bottom-right.\n\nObservation: I see a screenshot with labeled grid areas and subareas.\nThought: To proceed with the task, I need to call the appropriate function with the correct parameters based on the information in the image.\nAction: swipe(21, \"center\", 25, \"right\")",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/scripts/prompts.py\":117-130",
+ "content": "area. \"end_area\" is the integer label assigned to the grid area which marks the ending location of the swipe. \n\"end_subarea\" is a string representing the exact location to end the swipe within the grid area.\nThe two subarea parameters can take one of the nine values: center, top-left, top, top-right, left, right, bottom-left, \nbottom, and bottom-right.\nA simple use case can be swipe(21, \"center\", 25, \"right\"), which performs a swipe starting from the center of grid area \n21 to the right part of grid area 25.\nThe task you need to complete is to . Your past actions to proceed with this task are summarized as \nfollows: \nNow, given the following labeled screenshot, you need to think and call the function needed to proceed with the task. \nYour output should include three parts in the given format:\nObservation: \nThought: \nAction: \nSummary: \nYou can only take one action at a time, so please directly call the function.\"\"\"\nself_explore_task_template = \"\"\"You are an agent that is trained to complete certain tasks on a smartphone. You will be \ngiven a screenshot of a smartphone app. The interactive UI elements on the screenshot are labeled with numeric tags \nstarting from 1. \nYou can call the following functions to interact with those labeled elements to control the smartphone:\n1. tap(element: int)\nThis function is used to tap an UI element shown on the smartphone screen.\n\"element\" is a numeric tag assigned to an UI element shown on the smartphone screen.\nA simple use case can be tap(5), which taps the UI element labeled with the number 5.\n2. text(text_input: str)"
+ },
+ {
+ "comment": "This function inserts text into an input field, long presses a UI element, or swipes an element on the smartphone screen.\ntext: Inserts string into input area when keyboard shows.\nlong_press: Long presses UI element with assigned numeric tag.\nswipe: Swipes UI element in specified direction and distance.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/scripts/prompts.py\":149-162",
+ "content": "This function is used to insert text input in an input field/box. text_input is the string you want to insert and must \nbe wrapped with double quotation marks. A simple use case can be text(\"Hello, world!\"), which inserts the string \n\"Hello, world!\" into the input area on the smartphone screen. This function is only callable when you see a keyboard \nshowing in the lower half of the screen.\n3. long_press(element: int)\nThis function is used to long press an UI element shown on the smartphone screen.\n\"element\" is a numeric tag assigned to an UI element shown on the smartphone screen.\nA simple use case can be long_press(5), which long presses the UI element labeled with the number 5.\n4. swipe(element: int, direction: str, dist: str)\nThis function is used to swipe an UI element shown on the smartphone screen, usually a scroll view or a slide bar.\n\"element\" is a numeric tag assigned to an UI element shown on the smartphone screen. \"direction\" is a string that \nrepresents one of the four directions: up, down, left, right. \"direction\" must be wrapped with double quotation "
+ },
+ {
+ "comment": "Observation: The code explains the usage of a swipe function with options for distance (\"short\", \"medium\", or \"long\") and direction.\nThought: To complete the task, I need to call the appropriate function with the correct parameters.\nAction: FINISH",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/scripts/prompts.py\":163-176",
+ "content": "marks. \"dist\" determines the distance of the swipe and can be one of the three options: short, medium, long. You should \nchoose the appropriate distance option according to your need.\nA simple use case can be swipe(21, \"up\", \"medium\"), which swipes up the UI element labeled with the number 21 for a \nmedium distance.\nThe task you need to complete is to . Your past actions to proceed with this task are summarized as \nfollows: \nNow, given the following labeled screenshot, you need to think and call the function needed to proceed with the task. \nYour output should include three parts in the given format:\nObservation: \nThought: \nAction: "
+ },
+ {
+ "comment": "The code is a prompt template for analyzing differences in mobile app screenshots before and after an action. The user needs to determine if the action was effective and helped progress the task.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/scripts/prompts.py\":177-189",
+ "content": "Summary: \nYou can only take one action at a time, so please directly call the function.\"\"\"\nself_explore_reflect_template = \"\"\"I will give you screenshots of a mobile app before and after the UI \nelement labeled with the number '' on the first screenshot. The numeric tag of each element is located at \nthe center of the element. The action of this UI element was described as follows:\n\nThe action was also an attempt to proceed with a larger task, which is to . Your job is to carefully analyze \nthe difference between the two screenshots to determine if the action is in accord with the description above and at \nthe same time effectively moved the task forward. Your output should be determined based on the following situations:\n1. BACK\nIf you think the action navigated you to a page where you cannot proceed with the given task, you should go back to the "
+ },
+ {
+ "comment": "Decision: BACK\nThought: Reverses the last action and returns to previous interface.\nDocumentation: Allows user to undo the previous action and go back to the previous screen.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/scripts/prompts.py\":190-202",
+ "content": "previous interface. At the same time, describe the functionality of the UI element concisely in one or two sentences by \nobserving the difference between the two screenshots. Notice that your description of the UI element should focus on \nthe general function. Never include the numeric tag of the UI element in your description. You can use pronouns such as \n\"the UI element\" to refer to the element. Your output should be in the following format:\nDecision: BACK\nThought: \nDocumentation: \n2. INEFFECTIVE\nIf you find the action changed nothing on the screen (screenshots before and after the action are identical), you \nshould continue to interact with other elements on the screen. Notice that if you find the location of the cursor \nchanged between the two screenshots, then they are not identical. Your output should be in the following format:\nDecision: INEFFECTIVE\nThought: "
+ },
+ {
+ "comment": "Code snippet discusses the process of interacting with a UI element and documenting its functionality when the action doesn't fully complete the task but still makes progress.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/scripts/prompts.py\":203-216",
+ "content": "3. CONTINUE\nIf you find the action changed something on the screen but does not reflect the action description above and did not \nmove the given task forward, you should continue to interact with other elements on the screen. At the same time, \ndescribe the functionality of the UI element concisely in one or two sentences by observing the difference between the \ntwo screenshots. Notice that your description of the UI element should focus on the general function. Never include the \nnumeric tag of the UI element in your description. You can use pronouns such as \"the UI element\" to refer to the \nelement. Your output should be in the following format:\nDecision: CONTINUE\nThought: \nDocumentation: \n4. SUCCESS\nIf you think the action successfully moved the task forward (even though it did not completed the task), you should \ndescribe the functionality of the UI element concisely in one or two sentences. Notice that your description of the UI "
+ },
+ {
+ "comment": "The code provides instructions for describing UI elements in a task-oriented manner, using pronouns and avoiding numeric tags. It specifies the output format as \"Decision: SUCCESS\" followed by a thought explaining why the action moved the task forward, along with documentation describing the function of the UI element.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/scripts/prompts.py\":217-222",
+ "content": "element should focus on the general function. Never include the numeric tag of the UI element in your description. You \ncan use pronouns such as \"the UI element\" to refer to the element. Your output should be in the following format:\nDecision: SUCCESS\nThought: \nDocumentation: \n\"\"\""
+ }
+ ]
+}
\ No newline at end of file
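
The prompt templates documented above expose a small, named action space (tap, text, long_press, swipe, grid) that the model is asked to call. As a rough, hedged illustration of how such actions could be driven on a connected device, the Python sketch below maps them onto `adb shell input` commands; the helper names, coordinate handling, and use of `subprocess` are assumptions for illustration and are not taken from the files in this diff.

```python
# Illustrative sketch only: mapping the agent's simplified action space onto
# adb's "input" commands. Resolving a numeric element tag to (x, y) screen
# coordinates is assumed to happen elsewhere (e.g., from the parsed UI XML).
import subprocess


def adb_shell(command: str) -> None:
    # Run a single shell command on the connected device via adb.
    subprocess.run(["adb", "shell", command], check=True)


def tap(x: int, y: int) -> None:
    adb_shell(f"input tap {x} {y}")


def text(text_input: str) -> None:
    # adb's "input text" does not accept literal spaces; %s stands in for them.
    adb_shell(f"input text {text_input.replace(' ', '%s')}")


def long_press(x: int, y: int, duration_ms: int = 1000) -> None:
    # A swipe whose start and end points coincide behaves like a long press.
    adb_shell(f"input swipe {x} {y} {x} {y} {duration_ms}")


def swipe(x: int, y: int, direction: str, dist: int = 300, duration_ms: int = 400) -> None:
    offsets = {"up": (0, -dist), "down": (0, dist), "left": (-dist, 0), "right": (dist, 0)}
    dx, dy = offsets[direction]
    adb_shell(f"input swipe {x} {y} {x + dx} {y + dy} {duration_ms}")


if __name__ == "__main__":
    tap(540, 960)            # tap the middle of a 1080x1920 screen
    swipe(540, 1500, "up")   # scroll a list upward by a medium distance
```
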
diff --git a/docs/doc/f04d3d09-26da-486d-bb25-9d88db98050a.json b/docs/doc/f04d3d09-26da-486d-bb25-9d88db98050a.json
new file mode 100644
index 0000000..ae1b954
--- /dev/null
+++ b/docs/doc/f04d3d09-26da-486d-bb25-9d88db98050a.json
@@ -0,0 +1,10 @@
+{
+ "summary": "List of dependencies for the project: argparse, colorama, opencv-python, pyshtine, pyyaml, requests",
+ "details": [
+ {
+ "comment": "List of dependencies for the project: argparse, colorama, opencv-python, pyshtine, pyyaml, requests",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/requirements.txt\":0-5",
+ "content": "argparse\ncolorama\nopencv-python\npyshine\npyyaml\nrequests"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/docs/doc/f3068f71-4b0f-445b-af7b-1d8d1417b73b.json b/docs/doc/f3068f71-4b0f-445b-af7b-1d8d1417b73b.json
new file mode 100644
index 0000000..79deecc
--- /dev/null
+++ b/docs/doc/f3068f71-4b0f-445b-af7b-1d8d1417b73b.json
@@ -0,0 +1,35 @@
+{
+ "summary": "The code imports modules, loads configuration, defines functions for requesting OpenAI API, parsing response JSON, extracting information, printing with color formatting, handling model responses and exceptions, and deciding/formatting actions based on the act name.",
+ "details": [
+ {
+ "comment": "This code imports necessary modules and loads configuration from a file. It then defines a function `ask_gpt4v` that sends a request to an OpenAI API using provided configuration, returns the response JSON, and prints the request cost if there is no error in the response.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/scripts/model.py\":0-33",
+ "content": "import re\nimport requests\nfrom config import load_config\nfrom utils import print_with_color\nconfigs = load_config()\ndef ask_gpt4v(content):\n headers = {\n \"Content-Type\": \"application/json\",\n \"Authorization\": f\"Bearer {configs['OPENAI_API_KEY']}\"\n }\n payload = {\n \"model\": configs[\"OPENAI_API_MODEL\"],\n \"messages\": [\n {\n \"role\": \"user\",\n \"content\": content\n }\n ],\n \"temperature\": configs[\"TEMPERATURE\"],\n \"max_tokens\": configs[\"MAX_TOKENS\"]\n }\n response = requests.post(configs[\"OPENAI_API_BASE\"], headers=headers, json=payload)\n if \"error\" not in response.json():\n usage = response.json()[\"usage\"]\n prompt_tokens = usage[\"prompt_tokens\"]\n completion_tokens = usage[\"completion_tokens\"]\n print_with_color(f\"Request cost is \"\n f\"${'{0:.2f}'.format(prompt_tokens / 1000 * 0.01 + completion_tokens / 1000 * 0.03)}\",\n \"yellow\")\n return response.json()"
+ },
+ {
+ "comment": "Function `parse_explore_rsp` parses a response and extracts observation, thought, action, and summary. It then prints observation, thought, action, and summary with colors, and returns the action name, area (if action is 'tap'), and last_act (if action is 'text') or finishes if \"FINISH\" found in action.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/scripts/model.py\":36-58",
+ "content": "def parse_explore_rsp(rsp):\n try:\n msg = rsp[\"choices\"][0][\"message\"][\"content\"]\n observation = re.findall(r\"Observation: (.*?)$\", msg, re.MULTILINE)[0]\n think = re.findall(r\"Thought: (.*?)$\", msg, re.MULTILINE)[0]\n act = re.findall(r\"Action: (.*?)$\", msg, re.MULTILINE)[0]\n last_act = re.findall(r\"Summary: (.*?)$\", msg, re.MULTILINE)[0]\n print_with_color(\"Observation:\", \"yellow\")\n print_with_color(observation, \"magenta\")\n print_with_color(\"Thought:\", \"yellow\")\n print_with_color(think, \"magenta\")\n print_with_color(\"Action:\", \"yellow\")\n print_with_color(act, \"magenta\")\n print_with_color(\"Summary:\", \"yellow\")\n print_with_color(last_act, \"magenta\")\n if \"FINISH\" in act:\n return [\"FINISH\"]\n act_name = act.split(\"(\")[0]\n if act_name == \"tap\":\n area = int(re.findall(r\"tap\\((.*?)\\)\", act)[0])\n return [act_name, area, last_act]\n elif act_name == \"text\":\n input_str = re.findall(r\"text\\((.*?)\\)\", act)[0][1:-1]"
+ },
+ {
+ "comment": "This code is parsing a response from a model and returns different information based on the type of action specified in the response. If an undefined action or error occurs, it prints an error message. The \"parse_grid_rsp\" function specifically handles grid actions.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/scripts/model.py\":59-84",
+ "content": " return [act_name, input_str, last_act]\n elif act_name == \"long_press\":\n area = int(re.findall(r\"long_press\\((.*?)\\)\", act)[0])\n return [act_name, area, last_act]\n elif act_name == \"swipe\":\n params = re.findall(r\"swipe\\((.*?)\\)\", act)[0]\n area, swipe_dir, dist = params.split(\",\")\n area = int(area)\n swipe_dir = swipe_dir.strip()[1:-1]\n dist = dist.strip()[1:-1]\n return [act_name, area, swipe_dir, dist, last_act]\n elif act_name == \"grid\":\n return [act_name]\n else:\n print_with_color(f\"ERROR: Undefined act {act_name}!\", \"red\")\n return [\"ERROR\"]\n except Exception as e:\n print_with_color(f\"ERROR: an exception occurs while parsing the model response: {e}\", \"red\")\n print_with_color(rsp, \"red\")\n return [\"ERROR\"]\ndef parse_grid_rsp(rsp):\n try:\n msg = rsp[\"choices\"][0][\"message\"][\"content\"]\n observation = re.findall(r\"Observation: (.*?)$\", msg, re.MULTILINE)[0]"
+ },
+ {
+ "comment": "Extracts observation, thought, action, and summary from the message string. Displays them with color formatting. If \"FINISH\" is found in the action, it returns [\"FINISH\"]. For actions \"tap\", extracts grid area and subarea parameters. If \"long_press\" found, extracts the parameters.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/scripts/model.py\":85-105",
+ "content": " think = re.findall(r\"Thought: (.*?)$\", msg, re.MULTILINE)[0]\n act = re.findall(r\"Action: (.*?)$\", msg, re.MULTILINE)[0]\n last_act = re.findall(r\"Summary: (.*?)$\", msg, re.MULTILINE)[0]\n print_with_color(\"Observation:\", \"yellow\")\n print_with_color(observation, \"magenta\")\n print_with_color(\"Thought:\", \"yellow\")\n print_with_color(think, \"magenta\")\n print_with_color(\"Action:\", \"yellow\")\n print_with_color(act, \"magenta\")\n print_with_color(\"Summary:\", \"yellow\")\n print_with_color(last_act, \"magenta\")\n if \"FINISH\" in act:\n return [\"FINISH\"]\n act_name = act.split(\"(\")[0]\n if act_name == \"tap\":\n params = re.findall(r\"tap\\((.*?)\\)\", act)[0].split(\",\")\n area = int(params[0].strip())\n subarea = params[1].strip()[1:-1]\n return [act_name + \"_grid\", area, subarea, last_act]\n elif act_name == \"long_press\":\n params = re.findall(r\"long_press\\((.*?)\\)\", act)[0].split(\",\")"
+ },
+ {
+ "comment": "This code is parsing the response from a model and determines the appropriate action based on the act name. It returns a specific grid if the act is 'grid'. If the act name is undefined, it prints an error message in red color. If any exception occurs while parsing the response, it also prints an error message with details of the exception.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/scripts/model.py\":106-130",
+ "content": " area = int(params[0].strip())\n subarea = params[1].strip()[1:-1]\n return [act_name + \"_grid\", area, subarea, last_act]\n elif act_name == \"swipe\":\n params = re.findall(r\"swipe\\((.*?)\\)\", act)[0].split(\",\")\n start_area = int(params[0].strip())\n start_subarea = params[1].strip()[1:-1]\n end_area = int(params[2].strip())\n end_subarea = params[3].strip()[1:-1]\n return [act_name + \"_grid\", start_area, start_subarea, end_area, end_subarea, last_act]\n elif act_name == \"grid\":\n return [act_name]\n else:\n print_with_color(f\"ERROR: Undefined act {act_name}!\", \"red\")\n return [\"ERROR\"]\n except Exception as e:\n print_with_color(f\"ERROR: an exception occurs while parsing the model response: {e}\", \"red\")\n print_with_color(rsp, \"red\")\n return [\"ERROR\"]\ndef parse_reflect_rsp(rsp):\n try:\n msg = rsp[\"choices\"][0][\"message\"][\"content\"]\n decision = re.findall(r\"Decision: (.*?)$\", msg, re.MULTILINE)[0]"
+ },
+ {
+ "comment": "This code extracts decision, thought, and documentation from a message using regular expressions. It then prints them with colored formatting and returns the information as a list. If an undefined decision or exception occurs, it returns an error message.",
+ "location": "\"/media/root/Toshiba XG3/works/AppAgent/docs/src/scripts/model.py\":131-149",
+ "content": " think = re.findall(r\"Thought: (.*?)$\", msg, re.MULTILINE)[0]\n print_with_color(\"Decision:\", \"yellow\")\n print_with_color(decision, \"magenta\")\n print_with_color(\"Thought:\", \"yellow\")\n print_with_color(think, \"magenta\")\n if decision == \"INEFFECTIVE\":\n return [decision, think]\n elif decision == \"BACK\" or decision == \"CONTINUE\" or decision == \"SUCCESS\":\n doc = re.findall(r\"Documentation: (.*?)$\", msg, re.MULTILINE)[0]\n print_with_color(\"Documentation:\", \"yellow\")\n print_with_color(doc, \"magenta\")\n return [decision, think, doc]\n else:\n print_with_color(f\"ERROR: Undefined decision {decision}!\", \"red\")\n return [\"ERROR\"]\n except Exception as e:\n print_with_color(f\"ERROR: an exception occurs while parsing the model response: {e}\", \"red\")\n print_with_color(rsp, \"red\")\n return [\"ERROR\"]"
+ }
+ ]
+}
\ No newline at end of file
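
To show how the pieces documented in `model.py` fit together, here is a hedged sketch of a single decision step: build a multimodal prompt, send it through `ask_gpt4v`, and parse the reply with `parse_explore_rsp`. The text-plus-`image_url` content structure follows the public GPT-4V chat format and is an assumption here; the file names and the way coordinates are resolved are likewise illustrative, not taken from this diff.

```python
# Sketch of one decision step, assumed to run alongside scripts/model.py so
# the imports resolve. The content layout (text part + base64 screenshot in an
# image_url entry) follows the public GPT-4V chat format.
import base64

from model import ask_gpt4v, parse_explore_rsp


def encode_image(path: str) -> str:
    # Base64-encode a screenshot so it can be embedded in the request payload.
    with open(path, "rb") as f:
        return base64.b64encode(f.read()).decode("utf-8")


prompt = "..."  # a task_template from prompts.py with the task and past actions filled in
content = [
    {"type": "text", "text": prompt},
    {
        "type": "image_url",
        "image_url": {"url": f"data:image/png;base64,{encode_image('screenshot_labeled.png')}"},
    },
]

rsp = ask_gpt4v(content)
parsed = parse_explore_rsp(rsp)  # e.g. ["tap", 5, "Tapped the search bar"]
if parsed and parsed[0] == "tap":
    element = parsed[1]
    # ...resolve the numeric tag to screen coordinates and perform the tap on the device...
```
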
diff --git a/docs/github-markdown.css b/docs/github-markdown.css
new file mode 100644
index 0000000..96a4f29
--- /dev/null
+++ b/docs/github-markdown.css
@@ -0,0 +1,1197 @@
+@media (prefers-color-scheme: dark) {
+
+ .markdown-body,
+ [data-theme="dark"] {
+ /*dark*/
+ color-scheme: dark;
+ --color-prettylights-syntax-comment: #8b949e;
+ --color-prettylights-syntax-constant: #79c0ff;
+ --color-prettylights-syntax-entity: #d2a8ff;
+ --color-prettylights-syntax-storage-modifier-import: #c9d1d9;
+ --color-prettylights-syntax-entity-tag: #7ee787;
+ --color-prettylights-syntax-keyword: #ff7b72;
+ --color-prettylights-syntax-string: #a5d6ff;
+ --color-prettylights-syntax-variable: #ffa657;
+ --color-prettylights-syntax-brackethighlighter-unmatched: #f85149;
+ --color-prettylights-syntax-invalid-illegal-text: #f0f6fc;
+ --color-prettylights-syntax-invalid-illegal-bg: #8e1519;
+ --color-prettylights-syntax-carriage-return-text: #f0f6fc;
+ --color-prettylights-syntax-carriage-return-bg: #b62324;
+ --color-prettylights-syntax-string-regexp: #7ee787;
+ --color-prettylights-syntax-markup-list: #f2cc60;
+ --color-prettylights-syntax-markup-heading: #1f6feb;
+ --color-prettylights-syntax-markup-italic: #c9d1d9;
+ --color-prettylights-syntax-markup-bold: #c9d1d9;
+ --color-prettylights-syntax-markup-deleted-text: #ffdcd7;
+ --color-prettylights-syntax-markup-deleted-bg: #67060c;
+ --color-prettylights-syntax-markup-inserted-text: #aff5b4;
+ --color-prettylights-syntax-markup-inserted-bg: #033a16;
+ --color-prettylights-syntax-markup-changed-text: #ffdfb6;
+ --color-prettylights-syntax-markup-changed-bg: #5a1e02;
+ --color-prettylights-syntax-markup-ignored-text: #c9d1d9;
+ --color-prettylights-syntax-markup-ignored-bg: #1158c7;
+ --color-prettylights-syntax-meta-diff-range: #d2a8ff;
+ --color-prettylights-syntax-brackethighlighter-angle: #8b949e;
+ --color-prettylights-syntax-sublimelinter-gutter-mark: #484f58;
+ --color-prettylights-syntax-constant-other-reference-link: #a5d6ff;
+ --color-fg-default: #e6edf3;
+ --color-fg-muted: #848d97;
+ --color-fg-subtle: #6e7681;
+ --color-canvas-default: #0d1117;
+ --color-canvas-subtle: #161b22;
+ --color-border-default: #30363d;
+ --color-border-muted: #21262d;
+ --color-neutral-muted: rgba(110, 118, 129, 0.4);
+ --color-accent-fg: #2f81f7;
+ --color-accent-emphasis: #1f6feb;
+ --color-success-fg: #3fb950;
+ --color-success-emphasis: #238636;
+ --color-attention-fg: #d29922;
+ --color-attention-emphasis: #9e6a03;
+ --color-attention-subtle: rgba(187, 128, 9, 0.15);
+ --color-danger-fg: #f85149;
+ --color-danger-emphasis: #da3633;
+ --color-done-fg: #a371f7;
+ --color-done-emphasis: #8957e5;
+ }
+}
+
+@media (prefers-color-scheme: light) {
+
+ .markdown-body,
+ [data-theme="light"] {
+ /*light*/
+ color-scheme: light;
+ --color-prettylights-syntax-comment: #57606a;
+ --color-prettylights-syntax-constant: #0550ae;
+ --color-prettylights-syntax-entity: #6639ba;
+ --color-prettylights-syntax-storage-modifier-import: #24292f;
+ --color-prettylights-syntax-entity-tag: #116329;
+ --color-prettylights-syntax-keyword: #cf222e;
+ --color-prettylights-syntax-string: #0a3069;
+ --color-prettylights-syntax-variable: #953800;
+ --color-prettylights-syntax-brackethighlighter-unmatched: #82071e;
+ --color-prettylights-syntax-invalid-illegal-text: #f6f8fa;
+ --color-prettylights-syntax-invalid-illegal-bg: #82071e;
+ --color-prettylights-syntax-carriage-return-text: #f6f8fa;
+ --color-prettylights-syntax-carriage-return-bg: #cf222e;
+ --color-prettylights-syntax-string-regexp: #116329;
+ --color-prettylights-syntax-markup-list: #3b2300;
+ --color-prettylights-syntax-markup-heading: #0550ae;
+ --color-prettylights-syntax-markup-italic: #24292f;
+ --color-prettylights-syntax-markup-bold: #24292f;
+ --color-prettylights-syntax-markup-deleted-text: #82071e;
+ --color-prettylights-syntax-markup-deleted-bg: #ffebe9;
+ --color-prettylights-syntax-markup-inserted-text: #116329;
+ --color-prettylights-syntax-markup-inserted-bg: #dafbe1;
+ --color-prettylights-syntax-markup-changed-text: #953800;
+ --color-prettylights-syntax-markup-changed-bg: #ffd8b5;
+ --color-prettylights-syntax-markup-ignored-text: #eaeef2;
+ --color-prettylights-syntax-markup-ignored-bg: #0550ae;
+ --color-prettylights-syntax-meta-diff-range: #8250df;
+ --color-prettylights-syntax-brackethighlighter-angle: #57606a;
+ --color-prettylights-syntax-sublimelinter-gutter-mark: #8c959f;
+ --color-prettylights-syntax-constant-other-reference-link: #0a3069;
+ --color-fg-default: #1F2328;
+ --color-fg-muted: #656d76;
+ --color-fg-subtle: #6e7781;
+ --color-canvas-default: #ffffff;
+ --color-canvas-subtle: #f6f8fa;
+ --color-border-default: #d0d7de;
+ --color-border-muted: hsla(210, 18%, 87%, 1);
+ --color-neutral-muted: rgba(175, 184, 193, 0.2);
+ --color-accent-fg: #0969da;
+ --color-accent-emphasis: #0969da;
+ --color-success-fg: #1a7f37;
+ --color-success-emphasis: #1f883d;
+ --color-attention-fg: #9a6700;
+ --color-attention-emphasis: #9a6700;
+ --color-attention-subtle: #fff8c5;
+ --color-danger-fg: #d1242f;
+ --color-danger-emphasis: #cf222e;
+ --color-done-fg: #8250df;
+ --color-done-emphasis: #8250df;
+ }
+}
+
+.markdown-body {
+ -ms-text-size-adjust: 100%;
+ -webkit-text-size-adjust: 100%;
+ margin: 0;
+ color: var(--color-fg-default);
+ background-color: var(--color-canvas-default);
+ font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", "Noto Sans", Helvetica, Arial, sans-serif, "Apple Color Emoji", "Segoe UI Emoji";
+ font-size: 16px;
+ line-height: 1.5;
+ word-wrap: break-word;
+}
+
+.markdown-body .octicon {
+ display: inline-block;
+ fill: currentColor;
+ vertical-align: text-bottom;
+}
+
+.markdown-body h1:hover .anchor .octicon-link:before,
+.markdown-body h2:hover .anchor .octicon-link:before,
+.markdown-body h3:hover .anchor .octicon-link:before,
+.markdown-body h4:hover .anchor .octicon-link:before,
+.markdown-body h5:hover .anchor .octicon-link:before,
+.markdown-body h6:hover .anchor .octicon-link:before {
+ width: 16px;
+ height: 16px;
+ content: ' ';
+ display: inline-block;
+ background-color: currentColor;
+ -webkit-mask-image: url("data:image/svg+xml,");
+ mask-image: url("data:image/svg+xml,");
+}
+
+.markdown-body details,
+.markdown-body figcaption,
+.markdown-body figure {
+ display: block;
+}
+
+.markdown-body summary {
+ display: list-item;
+}
+
+.markdown-body [hidden] {
+ display: none !important;
+}
+
+.markdown-body a {
+ background-color: transparent;
+ color: var(--color-accent-fg);
+ text-decoration: none;
+}
+
+.markdown-body abbr[title] {
+ border-bottom: none;
+ -webkit-text-decoration: underline dotted;
+ text-decoration: underline dotted;
+}
+
+.markdown-body b,
+.markdown-body strong {
+ font-weight: var(--base-text-weight-semibold, 600);
+}
+
+.markdown-body dfn {
+ font-style: italic;
+}
+
+.markdown-body h1 {
+ margin: .67em 0;
+ font-weight: var(--base-text-weight-semibold, 600);
+ padding-bottom: .3em;
+ font-size: 2em;
+ border-bottom: 1px solid var(--color-border-muted);
+}
+
+.markdown-body mark {
+ background-color: var(--color-attention-subtle);
+ color: var(--color-fg-default);
+}
+
+.markdown-body small {
+ font-size: 90%;
+}
+
+.markdown-body sub,
+.markdown-body sup {
+ font-size: 75%;
+ line-height: 0;
+ position: relative;
+ vertical-align: baseline;
+}
+
+.markdown-body sub {
+ bottom: -0.25em;
+}
+
+.markdown-body sup {
+ top: -0.5em;
+}
+
+.markdown-body img {
+ border-style: none;
+ max-width: 100%;
+ box-sizing: content-box;
+ background-color: var(--color-canvas-default);
+}
+
+.markdown-body code,
+.markdown-body kbd,
+.markdown-body pre,
+.markdown-body samp {
+ font-family: monospace;
+ font-size: 1em;
+}
+
+.markdown-body figure {
+ margin: 1em 40px;
+}
+
+.markdown-body hr {
+ box-sizing: content-box;
+ overflow: hidden;
+ background: transparent;
+ border-bottom: 1px solid var(--color-border-muted);
+ height: .25em;
+ padding: 0;
+ margin: 24px 0;
+ background-color: var(--color-border-default);
+ border: 0;
+}
+
+.markdown-body input {
+ font: inherit;
+ margin: 0;
+ overflow: visible;
+ font-family: inherit;
+ font-size: inherit;
+ line-height: inherit;
+}
+
+.markdown-body [type=button],
+.markdown-body [type=reset],
+.markdown-body [type=submit] {
+ -webkit-appearance: button;
+ appearance: button;
+}
+
+.markdown-body [type=checkbox],
+.markdown-body [type=radio] {
+ box-sizing: border-box;
+ padding: 0;
+}
+
+.markdown-body [type=number]::-webkit-inner-spin-button,
+.markdown-body [type=number]::-webkit-outer-spin-button {
+ height: auto;
+}
+
+.markdown-body [type=search]::-webkit-search-cancel-button,
+.markdown-body [type=search]::-webkit-search-decoration {
+ -webkit-appearance: none;
+ appearance: none;
+}
+
+.markdown-body ::-webkit-input-placeholder {
+ color: inherit;
+ opacity: .54;
+}
+
+.markdown-body ::-webkit-file-upload-button {
+ -webkit-appearance: button;
+ appearance: button;
+ font: inherit;
+}
+
+.markdown-body a:hover {
+ text-decoration: underline;
+}
+
+.markdown-body ::placeholder {
+ color: var(--color-fg-subtle);
+ opacity: 1;
+}
+
+.markdown-body hr::before {
+ display: table;
+ content: "";
+}
+
+.markdown-body hr::after {
+ display: table;
+ clear: both;
+ content: "";
+}
+
+.markdown-body table {
+ border-spacing: 0;
+ border-collapse: collapse;
+ display: block;
+ width: max-content;
+ max-width: 100%;
+ overflow: auto;
+}
+
+.markdown-body td,
+.markdown-body th {
+ padding: 0;
+}
+
+.markdown-body details summary {
+ cursor: pointer;
+}
+
+.markdown-body details:not([open])>*:not(summary) {
+ display: none !important;
+}
+
+.markdown-body a:focus,
+.markdown-body [role=button]:focus,
+.markdown-body input[type=radio]:focus,
+.markdown-body input[type=checkbox]:focus {
+ outline: 2px solid var(--color-accent-fg);
+ outline-offset: -2px;
+ box-shadow: none;
+}
+
+.markdown-body a:focus:not(:focus-visible),
+.markdown-body [role=button]:focus:not(:focus-visible),
+.markdown-body input[type=radio]:focus:not(:focus-visible),
+.markdown-body input[type=checkbox]:focus:not(:focus-visible) {
+ outline: solid 1px transparent;
+}
+
+.markdown-body a:focus-visible,
+.markdown-body [role=button]:focus-visible,
+.markdown-body input[type=radio]:focus-visible,
+.markdown-body input[type=checkbox]:focus-visible {
+ outline: 2px solid var(--color-accent-fg);
+ outline-offset: -2px;
+ box-shadow: none;
+}
+
+.markdown-body a:not([class]):focus,
+.markdown-body a:not([class]):focus-visible,
+.markdown-body input[type=radio]:focus,
+.markdown-body input[type=radio]:focus-visible,
+.markdown-body input[type=checkbox]:focus,
+.markdown-body input[type=checkbox]:focus-visible {
+ outline-offset: 0;
+}
+
+.markdown-body kbd {
+ display: inline-block;
+ padding: 3px 5px;
+ font: 11px ui-monospace, SFMono-Regular, SF Mono, Menlo, Consolas, Liberation Mono, monospace;
+ line-height: 10px;
+ color: var(--color-fg-default);
+ vertical-align: middle;
+ background-color: var(--color-canvas-subtle);
+ border: solid 1px var(--color-neutral-muted);
+ border-bottom-color: var(--color-neutral-muted);
+ border-radius: 6px;
+ box-shadow: inset 0 -1px 0 var(--color-neutral-muted);
+}
+
+.markdown-body h1,
+.markdown-body h2,
+.markdown-body h3,
+.markdown-body h4,
+.markdown-body h5,
+.markdown-body h6 {
+ margin-top: 24px;
+ margin-bottom: 16px;
+ font-weight: var(--base-text-weight-semibold, 600);
+ line-height: 1.25;
+}
+
+.markdown-body h2 {
+ font-weight: var(--base-text-weight-semibold, 600);
+ padding-bottom: .3em;
+ font-size: 1.5em;
+ border-bottom: 1px solid var(--color-border-muted);
+}
+
+.markdown-body h3 {
+ font-weight: var(--base-text-weight-semibold, 600);
+ font-size: 1.25em;
+}
+
+.markdown-body h4 {
+ font-weight: var(--base-text-weight-semibold, 600);
+ font-size: 1em;
+}
+
+.markdown-body h5 {
+ font-weight: var(--base-text-weight-semibold, 600);
+ font-size: .875em;
+}
+
+.markdown-body h6 {
+ font-weight: var(--base-text-weight-semibold, 600);
+ font-size: .85em;
+ color: var(--color-fg-muted);
+}
+
+.markdown-body p {
+ margin-top: 0;
+ margin-bottom: 10px;
+}
+
+.markdown-body blockquote {
+ margin: 0;
+ padding: 0 1em;
+ color: var(--color-fg-muted);
+ border-left: .25em solid var(--color-border-default);
+}
+
+.markdown-body ul,
+.markdown-body ol {
+ margin-top: 0;
+ margin-bottom: 0;
+ padding-left: 2em;
+}
+
+.markdown-body ol ol,
+.markdown-body ul ol {
+ list-style-type: lower-roman;
+}
+
+.markdown-body ul ul ol,
+.markdown-body ul ol ol,
+.markdown-body ol ul ol,
+.markdown-body ol ol ol {
+ list-style-type: lower-alpha;
+}
+
+.markdown-body dd {
+ margin-left: 0;
+}
+
+.markdown-body tt,
+.markdown-body code,
+.markdown-body samp {
+ font-family: ui-monospace, SFMono-Regular, SF Mono, Menlo, Consolas, Liberation Mono, monospace;
+ font-size: 12px;
+}
+
+.markdown-body pre {
+ margin-top: 0;
+ margin-bottom: 0;
+ font-family: ui-monospace, SFMono-Regular, SF Mono, Menlo, Consolas, Liberation Mono, monospace;
+ font-size: 12px;
+ word-wrap: normal;
+}
+
+.markdown-body .octicon {
+ display: inline-block;
+ overflow: visible !important;
+ vertical-align: text-bottom;
+ fill: currentColor;
+}
+
+.markdown-body input::-webkit-outer-spin-button,
+.markdown-body input::-webkit-inner-spin-button {
+ margin: 0;
+ -webkit-appearance: none;
+ appearance: none;
+}
+
+.markdown-body .mr-2 {
+ margin-right: var(--base-size-8, 8px) !important;
+}
+
+.markdown-body::before {
+ display: table;
+ content: "";
+}
+
+.markdown-body::after {
+ display: table;
+ clear: both;
+ content: "";
+}
+
+.markdown-body>*:first-child {
+ margin-top: 0 !important;
+}
+
+.markdown-body>*:last-child {
+ margin-bottom: 0 !important;
+}
+
+.markdown-body a:not([href]) {
+ color: inherit;
+ text-decoration: none;
+}
+
+.markdown-body .absent {
+ color: var(--color-danger-fg);
+}
+
+.markdown-body .anchor {
+ float: left;
+ padding-right: 4px;
+ margin-left: -20px;
+ line-height: 1;
+}
+
+.markdown-body .anchor:focus {
+ outline: none;
+}
+
+.markdown-body p,
+.markdown-body blockquote,
+.markdown-body ul,
+.markdown-body ol,
+.markdown-body dl,
+.markdown-body table,
+.markdown-body pre,
+.markdown-body details {
+ margin-top: 0;
+ margin-bottom: 16px;
+}
+
+.markdown-body blockquote>:first-child {
+ margin-top: 0;
+}
+
+.markdown-body blockquote>:last-child {
+ margin-bottom: 0;
+}
+
+.markdown-body h1 .octicon-link,
+.markdown-body h2 .octicon-link,
+.markdown-body h3 .octicon-link,
+.markdown-body h4 .octicon-link,
+.markdown-body h5 .octicon-link,
+.markdown-body h6 .octicon-link {
+ color: var(--color-fg-default);
+ vertical-align: middle;
+ visibility: hidden;
+}
+
+.markdown-body h1:hover .anchor,
+.markdown-body h2:hover .anchor,
+.markdown-body h3:hover .anchor,
+.markdown-body h4:hover .anchor,
+.markdown-body h5:hover .anchor,
+.markdown-body h6:hover .anchor {
+ text-decoration: none;
+}
+
+.markdown-body h1:hover .anchor .octicon-link,
+.markdown-body h2:hover .anchor .octicon-link,
+.markdown-body h3:hover .anchor .octicon-link,
+.markdown-body h4:hover .anchor .octicon-link,
+.markdown-body h5:hover .anchor .octicon-link,
+.markdown-body h6:hover .anchor .octicon-link {
+ visibility: visible;
+}
+
+.markdown-body h1 tt,
+.markdown-body h1 code,
+.markdown-body h2 tt,
+.markdown-body h2 code,
+.markdown-body h3 tt,
+.markdown-body h3 code,
+.markdown-body h4 tt,
+.markdown-body h4 code,
+.markdown-body h5 tt,
+.markdown-body h5 code,
+.markdown-body h6 tt,
+.markdown-body h6 code {
+ padding: 0 .2em;
+ font-size: inherit;
+}
+
+.markdown-body summary h1,
+.markdown-body summary h2,
+.markdown-body summary h3,
+.markdown-body summary h4,
+.markdown-body summary h5,
+.markdown-body summary h6 {
+ display: inline-block;
+}
+
+.markdown-body summary h1 .anchor,
+.markdown-body summary h2 .anchor,
+.markdown-body summary h3 .anchor,
+.markdown-body summary h4 .anchor,
+.markdown-body summary h5 .anchor,
+.markdown-body summary h6 .anchor {
+ margin-left: -40px;
+}
+
+.markdown-body summary h1,
+.markdown-body summary h2 {
+ padding-bottom: 0;
+ border-bottom: 0;
+}
+
+.markdown-body ul.no-list,
+.markdown-body ol.no-list {
+ padding: 0;
+ list-style-type: none;
+}
+
+.markdown-body ol[type="a s"] {
+ list-style-type: lower-alpha;
+}
+
+.markdown-body ol[type="A s"] {
+ list-style-type: upper-alpha;
+}
+
+.markdown-body ol[type="i s"] {
+ list-style-type: lower-roman;
+}
+
+.markdown-body ol[type="I s"] {
+ list-style-type: upper-roman;
+}
+
+.markdown-body ol[type="1"] {
+ list-style-type: decimal;
+}
+
+.markdown-body div>ol:not([type]) {
+ list-style-type: decimal;
+}
+
+.markdown-body ul ul,
+.markdown-body ul ol,
+.markdown-body ol ol,
+.markdown-body ol ul {
+ margin-top: 0;
+ margin-bottom: 0;
+}
+
+.markdown-body li>p {
+ margin-top: 16px;
+}
+
+.markdown-body li+li {
+ margin-top: .25em;
+}
+
+.markdown-body dl {
+ padding: 0;
+}
+
+.markdown-body dl dt {
+ padding: 0;
+ margin-top: 16px;
+ font-size: 1em;
+ font-style: italic;
+ font-weight: var(--base-text-weight-semibold, 600);
+}
+
+.markdown-body dl dd {
+ padding: 0 16px;
+ margin-bottom: 16px;
+}
+
+.markdown-body table th {
+ font-weight: var(--base-text-weight-semibold, 600);
+}
+
+.markdown-body table th,
+.markdown-body table td {
+ padding: 6px 13px;
+ border: 1px solid var(--color-border-default);
+}
+
+.markdown-body table td>:last-child {
+ margin-bottom: 0;
+}
+
+.markdown-body table tr {
+ background-color: var(--color-canvas-default);
+ border-top: 1px solid var(--color-border-muted);
+}
+
+.markdown-body table tr:nth-child(2n) {
+ background-color: var(--color-canvas-subtle);
+}
+
+.markdown-body table img {
+ background-color: transparent;
+}
+
+.markdown-body img[align=right] {
+ padding-left: 20px;
+}
+
+.markdown-body img[align=left] {
+ padding-right: 20px;
+}
+
+.markdown-body .emoji {
+ max-width: none;
+ vertical-align: text-top;
+ background-color: transparent;
+}
+
+.markdown-body span.frame {
+ display: block;
+ overflow: hidden;
+}
+
+.markdown-body span.frame>span {
+ display: block;
+ float: left;
+ width: auto;
+ padding: 7px;
+ margin: 13px 0 0;
+ overflow: hidden;
+ border: 1px solid var(--color-border-default);
+}
+
+.markdown-body span.frame span img {
+ display: block;
+ float: left;
+}
+
+.markdown-body span.frame span span {
+ display: block;
+ padding: 5px 0 0;
+ clear: both;
+ color: var(--color-fg-default);
+}
+
+.markdown-body span.align-center {
+ display: block;
+ overflow: hidden;
+ clear: both;
+}
+
+.markdown-body span.align-center>span {
+ display: block;
+ margin: 13px auto 0;
+ overflow: hidden;
+ text-align: center;
+}
+
+.markdown-body span.align-center span img {
+ margin: 0 auto;
+ text-align: center;
+}
+
+.markdown-body span.align-right {
+ display: block;
+ overflow: hidden;
+ clear: both;
+}
+
+.markdown-body span.align-right>span {
+ display: block;
+ margin: 13px 0 0;
+ overflow: hidden;
+ text-align: right;
+}
+
+.markdown-body span.align-right span img {
+ margin: 0;
+ text-align: right;
+}
+
+.markdown-body span.float-left {
+ display: block;
+ float: left;
+ margin-right: 13px;
+ overflow: hidden;
+}
+
+.markdown-body span.float-left span {
+ margin: 13px 0 0;
+}
+
+.markdown-body span.float-right {
+ display: block;
+ float: right;
+ margin-left: 13px;
+ overflow: hidden;
+}
+
+.markdown-body span.float-right>span {
+ display: block;
+ margin: 13px auto 0;
+ overflow: hidden;
+ text-align: right;
+}
+
+.markdown-body code,
+.markdown-body tt {
+ padding: .2em .4em;
+ margin: 0;
+ font-size: 85%;
+ white-space: break-spaces;
+ background-color: var(--color-neutral-muted);
+ border-radius: 6px;
+}
+
+.markdown-body code br,
+.markdown-body tt br {
+ display: none;
+}
+
+.markdown-body del code {
+ text-decoration: inherit;
+}
+
+.markdown-body samp {
+ font-size: 85%;
+}
+
+.markdown-body pre code {
+ font-size: 100%;
+}
+
+.markdown-body pre>code {
+ padding: 0;
+ margin: 0;
+ word-break: normal;
+ white-space: pre;
+ background: transparent;
+ border: 0;
+}
+
+.markdown-body .highlight {
+ margin-bottom: 16px;
+}
+
+.markdown-body .highlight pre {
+ margin-bottom: 0;
+ word-break: normal;
+}
+
+.markdown-body .highlight pre,
+.markdown-body pre {
+ padding: 16px;
+ overflow: auto;
+ font-size: 85%;
+ line-height: 1.45;
+ color: var(--color-fg-default);
+ background-color: var(--color-canvas-subtle);
+ border-radius: 6px;
+}
+
+.markdown-body pre code,
+.markdown-body pre tt {
+ display: inline;
+ max-width: auto;
+ padding: 0;
+ margin: 0;
+ overflow: visible;
+ line-height: inherit;
+ word-wrap: normal;
+ background-color: transparent;
+ border: 0;
+}
+
+.markdown-body .csv-data td,
+.markdown-body .csv-data th {
+ padding: 5px;
+ overflow: hidden;
+ font-size: 12px;
+ line-height: 1;
+ text-align: left;
+ white-space: nowrap;
+}
+
+.markdown-body .csv-data .blob-num {
+ padding: 10px 8px 9px;
+ text-align: right;
+ background: var(--color-canvas-default);
+ border: 0;
+}
+
+.markdown-body .csv-data tr {
+ border-top: 0;
+}
+
+.markdown-body .csv-data th {
+ font-weight: var(--base-text-weight-semibold, 600);
+ background: var(--color-canvas-subtle);
+ border-top: 0;
+}
+
+.markdown-body [data-footnote-ref]::before {
+ content: "[";
+}
+
+.markdown-body [data-footnote-ref]::after {
+ content: "]";
+}
+
+.markdown-body .footnotes {
+ font-size: 12px;
+ color: var(--color-fg-muted);
+ border-top: 1px solid var(--color-border-default);
+}
+
+.markdown-body .footnotes ol {
+ padding-left: 16px;
+}
+
+.markdown-body .footnotes ol ul {
+ display: inline-block;
+ padding-left: 16px;
+ margin-top: 16px;
+}
+
+.markdown-body .footnotes li {
+ position: relative;
+}
+
+.markdown-body .footnotes li:target::before {
+ position: absolute;
+ top: -8px;
+ right: -8px;
+ bottom: -8px;
+ left: -24px;
+ pointer-events: none;
+ content: "";
+ border: 2px solid var(--color-accent-emphasis);
+ border-radius: 6px;
+}
+
+.markdown-body .footnotes li:target {
+ color: var(--color-fg-default);
+}
+
+.markdown-body .footnotes .data-footnote-backref g-emoji {
+ font-family: monospace;
+}
+
+.markdown-body .pl-c {
+ color: var(--color-prettylights-syntax-comment);
+}
+
+.markdown-body .pl-c1,
+.markdown-body .pl-s .pl-v {
+ color: var(--color-prettylights-syntax-constant);
+}
+
+.markdown-body .pl-e,
+.markdown-body .pl-en {
+ color: var(--color-prettylights-syntax-entity);
+}
+
+.markdown-body .pl-smi,
+.markdown-body .pl-s .pl-s1 {
+ color: var(--color-prettylights-syntax-storage-modifier-import);
+}
+
+.markdown-body .pl-ent {
+ color: var(--color-prettylights-syntax-entity-tag);
+}
+
+.markdown-body .pl-k {
+ color: var(--color-prettylights-syntax-keyword);
+}
+
+.markdown-body .pl-s,
+.markdown-body .pl-pds,
+.markdown-body .pl-s .pl-pse .pl-s1,
+.markdown-body .pl-sr,
+.markdown-body .pl-sr .pl-cce,
+.markdown-body .pl-sr .pl-sre,
+.markdown-body .pl-sr .pl-sra {
+ color: var(--color-prettylights-syntax-string);
+}
+
+.markdown-body .pl-v,
+.markdown-body .pl-smw {
+ color: var(--color-prettylights-syntax-variable);
+}
+
+.markdown-body .pl-bu {
+ color: var(--color-prettylights-syntax-brackethighlighter-unmatched);
+}
+
+.markdown-body .pl-ii {
+ color: var(--color-prettylights-syntax-invalid-illegal-text);
+ background-color: var(--color-prettylights-syntax-invalid-illegal-bg);
+}
+
+.markdown-body .pl-c2 {
+ color: var(--color-prettylights-syntax-carriage-return-text);
+ background-color: var(--color-prettylights-syntax-carriage-return-bg);
+}
+
+.markdown-body .pl-sr .pl-cce {
+ font-weight: bold;
+ color: var(--color-prettylights-syntax-string-regexp);
+}
+
+.markdown-body .pl-ml {
+ color: var(--color-prettylights-syntax-markup-list);
+}
+
+.markdown-body .pl-mh,
+.markdown-body .pl-mh .pl-en,
+.markdown-body .pl-ms {
+ font-weight: bold;
+ color: var(--color-prettylights-syntax-markup-heading);
+}
+
+.markdown-body .pl-mi {
+ font-style: italic;
+ color: var(--color-prettylights-syntax-markup-italic);
+}
+
+.markdown-body .pl-mb {
+ font-weight: bold;
+ color: var(--color-prettylights-syntax-markup-bold);
+}
+
+.markdown-body .pl-md {
+ color: var(--color-prettylights-syntax-markup-deleted-text);
+ background-color: var(--color-prettylights-syntax-markup-deleted-bg);
+}
+
+.markdown-body .pl-mi1 {
+ color: var(--color-prettylights-syntax-markup-inserted-text);
+ background-color: var(--color-prettylights-syntax-markup-inserted-bg);
+}
+
+.markdown-body .pl-mc {
+ color: var(--color-prettylights-syntax-markup-changed-text);
+ background-color: var(--color-prettylights-syntax-markup-changed-bg);
+}
+
+.markdown-body .pl-mi2 {
+ color: var(--color-prettylights-syntax-markup-ignored-text);
+ background-color: var(--color-prettylights-syntax-markup-ignored-bg);
+}
+
+.markdown-body .pl-mdr {
+ font-weight: bold;
+ color: var(--color-prettylights-syntax-meta-diff-range);
+}
+
+.markdown-body .pl-ba {
+ color: var(--color-prettylights-syntax-brackethighlighter-angle);
+}
+
+.markdown-body .pl-sg {
+ color: var(--color-prettylights-syntax-sublimelinter-gutter-mark);
+}
+
+.markdown-body .pl-corl {
+ text-decoration: underline;
+ color: var(--color-prettylights-syntax-constant-other-reference-link);
+}
+
+.markdown-body g-emoji {
+ display: inline-block;
+ min-width: 1ch;
+ font-family: "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol";
+ font-size: 1em;
+ font-style: normal !important;
+ font-weight: var(--base-text-weight-normal, 400);
+ line-height: 1;
+ vertical-align: -0.075em;
+}
+
+.markdown-body g-emoji img {
+ width: 1em;
+ height: 1em;
+}
+
+.markdown-body .task-list-item {
+ list-style-type: none;
+}
+
+.markdown-body .task-list-item label {
+ font-weight: var(--base-text-weight-normal, 400);
+}
+
+.markdown-body .task-list-item.enabled label {
+ cursor: pointer;
+}
+
+.markdown-body .task-list-item+.task-list-item {
+ margin-top: 4px;
+}
+
+.markdown-body .task-list-item .handle {
+ display: none;
+}
+
+.markdown-body .task-list-item-checkbox {
+ margin: 0 .2em .25em -1.4em;
+ vertical-align: middle;
+}
+
+.markdown-body .contains-task-list:dir(rtl) .task-list-item-checkbox {
+ margin: 0 -1.6em .25em .2em;
+}
+
+.markdown-body .contains-task-list {
+ position: relative;
+}
+
+.markdown-body .contains-task-list:hover .task-list-item-convert-container,
+.markdown-body .contains-task-list:focus-within .task-list-item-convert-container {
+ display: block;
+ width: auto;
+ height: 24px;
+ overflow: visible;
+ clip: auto;
+}
+
+.markdown-body ::-webkit-calendar-picker-indicator {
+ filter: invert(50%);
+}
+
+.markdown-body .markdown-alert {
+ padding: var(--base-size-8) var(--base-size-16);
+ margin-bottom: 16px;
+ color: inherit;
+ border-left: .25em solid var(--color-border-default);
+}
+
+.markdown-body .markdown-alert>:first-child {
+ margin-top: 0;
+}
+
+.markdown-body .markdown-alert>:last-child {
+ margin-bottom: 0;
+}
+
+.markdown-body .markdown-alert .markdown-alert-title {
+ display: flex;
+ font-weight: var(--base-text-weight-medium, 500);
+ align-items: center;
+ line-height: 1;
+}
+
+.markdown-body .markdown-alert.markdown-alert-note {
+ border-left-color: var(--color-accent-emphasis);
+}
+
+.markdown-body .markdown-alert.markdown-alert-note .markdown-alert-title {
+ color: var(--color-accent-fg);
+}
+
+.markdown-body .markdown-alert.markdown-alert-important {
+ border-left-color: var(--color-done-emphasis);
+}
+
+.markdown-body .markdown-alert.markdown-alert-important .markdown-alert-title {
+ color: var(--color-done-fg);
+}
+
+.markdown-body .markdown-alert.markdown-alert-warning {
+ border-left-color: var(--color-attention-emphasis);
+}
+
+.markdown-body .markdown-alert.markdown-alert-warning .markdown-alert-title {
+ color: var(--color-attention-fg);
+}
+
+.markdown-body .markdown-alert.markdown-alert-tip {
+ border-left-color: var(--color-success-emphasis);
+}
+
+.markdown-body .markdown-alert.markdown-alert-tip .markdown-alert-title {
+ color: var(--color-success-fg);
+}
+
+.markdown-body .markdown-alert.markdown-alert-caution {
+ border-left-color: var(--color-danger-emphasis);
+}
+
+.markdown-body .markdown-alert.markdown-alert-caution .markdown-alert-title {
+ color: var(--color-danger-fg);
+}
\ No newline at end of file
diff --git a/docs/index.html b/docs/index.html
new file mode 100644
index 0000000..d1154b4
--- /dev/null
+++ b/docs/index.html
@@ -0,0 +1,1250 @@
+<!-- HTML page scaffolding (head, styles, and scripts) omitted in this extraction; page title: "Search Code By Comment" -->
+![](./assets/teaser.png)
+
+ℹ️Should you encounter any issues⚠️ while using our project, please feel free to report them on [GitHub Issues](https://github.com/mnotgod96/AppAgent/issues) or reach out to [Dr. Chi Zhang](https://icoz69.github.io/) via email at dr.zhang.chi@outlook.com.
+
+## 📝 Changelog
+- __[2024.1.2]__: 🔥Added an optional method for the agent to bring up a grid overlay on the screen to **tap/swipe anywhere** on the screen.
+- __[2023.12.26]__: Added [Tips](#tips) section for better use experience; added instruction for using the **Android Studio emulator** for
+ users who do not have Android devices.
+- __[2023.12.21]__: 🔥🔥 Open-sourced the git repository, including the detailed configuration steps to implement our AppAgent!
+
+
+## 🔆 Introduction
+
+We introduce a novel LLM-based multimodal agent framework designed to operate smartphone applications.
+
+Our framework enables the agent to operate smartphone applications through a simplified action space, mimicking human-like interactions such as tapping and swiping. This novel approach bypasses the need for system back-end access, thereby broadening its applicability across diverse apps.
+
+Central to our agent's functionality is its innovative learning method. The agent learns to navigate and use new apps either through autonomous exploration or by observing human demonstrations. This process generates a knowledge base that the agent refers to for executing complex tasks across different applications.
+
+
+## ✨ Demo
+
+The demo video shows the process of using AppAgent to follow a user on X (Twitter) in the deployment phase.
+
+https://github.com/mnotgod96/AppAgent/assets/40715314/db99d650-dec1-4531-b4b2-e085bfcadfb7
+
+An interesting experiment showing AppAgent's ability to pass CAPTCHA.
+
+https://github.com/mnotgod96/AppAgent/assets/27103154/5cc7ba50-dbab-42a0-a411-a9a862482548
+
+An example of using the grid overlay to locate a UI element that is not labeled with a numeric tag.
+
+https://github.com/mnotgod96/AppAgent/assets/27103154/71603333-274c-46ed-8381-2f9a34cdfc53
+
+## 🚀 Quick Start
+
+This section will guide you on how to quickly use `gpt-4-vision-preview` as an agent to complete specific tasks for you on
+your Android app.
+
+### ⚙️ Step 1. Prerequisites
+
+1. On your PC, download and install [Android Debug Bridge](https://developer.android.com/tools/adb) (adb) which is a
+ command-line tool that lets you communicate with your Android device from the PC.
+
+2. Get an Android device and enable the USB debugging that can be found in Developer Options in Settings.
+
+3. Connect your device to your PC using a USB cable.
+
+4. (Optional) If you do not have an Android device but still want to try AppAgent, we recommend you download
+ [Android Studio](https://developer.android.com/studio/run/emulator) and use the emulator that comes with it.
+ The emulator can be found in the device manager of Android Studio. You can install apps on an emulator by
+ downloading APK files from the internet and dragging them to the emulator.
+ AppAgent can detect the emulated device and operate apps on it just like operating a real device.
+
+
+
+5. Clone this repo and install the dependencies. All scripts in this project are written in Python 3 so make sure you
+ have installed it.
+
+```bash
+cd AppAgent
+pip install -r requirements.txt
+```
+
+### 🤖 Step 2. Configure the Agent
+
+AppAgent needs to be powered by a multi-modal model which can receive both text and visual inputs. During our experiment,
+we used `gpt-4-vision-preview` as the model to make decisions on how to take actions to complete a task on the smartphone.
+
+To configure your requests to GPT-4V, you should modify `config.yaml` in the root directory.
+There are two key parameters that must be configured to try AppAgent:
+1. OpenAI API key: you must purchase an eligible API key from OpenAI so that you can have access to GPT-4V.
+2. Request interval: this is the time interval in seconds between consecutive GPT-4V requests to control the frequency
+of your requests to GPT-4V. Adjust this value according to the status of your account.
+
+Other parameters in `config.yaml` are well commented. Modify them as you need.
+
+> Be aware that GPT-4V is not free. Each request/response pair involved in this project costs around $0.03. Use it wisely.
+
+If you want to test AppAgent using your own models, you should modify the `ask_gpt4v` function in `scripts/model.py`
+accordingly.
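+
+As a rough sketch of what such a modification could look like, the snippet below assumes your model is served behind an OpenAI-compatible chat completions endpoint; the endpoint URL and model name are placeholders, and the only requirement is that the returned JSON keeps the `choices[0]["message"]["content"]` shape that the parsing functions in `scripts/model.py` expect.
+
+```python
+# Hypothetical drop-in replacement for ask_gpt4v; not part of this repository.
+# It assumes an OpenAI-compatible /v1/chat/completions endpoint; the URL and
+# model name below are placeholders that you would adjust for your own setup.
+import requests
+
+
+def ask_custom_model(content):
+    payload = {
+        "model": "my-local-vlm",  # placeholder model name
+        "messages": [{"role": "user", "content": content}],
+        "temperature": 0.0,
+        "max_tokens": 300,
+    }
+    response = requests.post("http://localhost:8000/v1/chat/completions", json=payload)
+    # The parse_* helpers only rely on the OpenAI-style response shape:
+    # rsp["choices"][0]["message"]["content"]
+    return response.json()
+```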
+
+### 🔍 Step 3. Exploration Phase
+
+Our paper proposed a novel solution that involves two phases, exploration and deployment, to turn GPT-4V into a capable
+agent that can help users operate their Android phones when a task is given. The exploration phase starts with a task
+given by you, and you can choose to let the agent either explore the app on its own or learn from your demonstration.
+In both cases, the agent generates documentation for the elements it interacts with during the exploration/demonstration
+and saves it for use in the deployment phase.
+
+#### Option 1: Autonomous Exploration
+
+This solution features a fully autonomous exploration which allows the agent to explore the use of the app by attempting
+the given task without any intervention from humans.
+
+To start, run `learn.py` in the root directory. Follow the prompted instructions to select `autonomous exploration`
+as the operating mode and provide the app name and task description. Then, your agent will do the job for you. Under
+this mode, AppAgent will reflect on its previous action, making sure that it adheres to the given task, and will generate
+documentation for the elements it explores.
+
+```bash
+python learn.py
+```
+
+#### Option 2: Learning from Human Demonstrations
+
+This solution requires users to demonstrate a similar task first. AppAgent will learn from the demo and generate
+documentation for the UI elements seen during the demo.
+
+To start human demonstration, you should run `learn.py` in the root directory. Follow the prompted instructions to select
+`human demonstration` as the operating mode and provide the app name and task description. A screenshot of your phone
+will be captured and all interactive elements shown on the screen will be labeled with numeric tags. You need to follow
+the prompts to determine your next action and the target of the action. When you believe the demonstration is finished,
+type `stop` to end the demo.
+
+```bash
+python learn.py
+```
+
+![](./assets/demo.png)
+
+### 📱 Step 4. Deployment Phase
+
+After the exploration phase finishes, you can run `run.py` in the root directory. Follow the prompted instructions to enter
+the name of the app, select the appropriate documentation base you want the agent to use and provide the task
+description. Then, your agent will do the job for you. The agent automatically detects whether a documentation base has
+been generated for the app before; if no documentation is found, you can also choose to run the agent without any
+documentation (success rate not guaranteed).
+
+```bash
+python run.py
+```
+
+## 💡 Tips
+- For an improved experience, you might permit AppAgent to undertake a broader range of tasks through autonomous exploration, or you can directly demonstrate more app functions to enhance the app documentation. Generally, the more extensive the documentation provided to the agent, the higher the likelihood of successful task completion.
+- It is always good practice to inspect the documentation generated by the agent. If you find that a piece of
+  documentation does not accurately describe the function of its element, manually revising it is also an option
+  (see the example file layout below).
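+
+For reference, each documentation file is stored as `<resource_id>.txt` under `apps/<app>/auto_docs` (autonomous
+exploration) or `apps/<app>/demo_docs` (human demonstration), and contains a plain Python dict literal with one entry
+per action type, so it is easy to read and edit by hand. An illustrative example (the description text here is made up):
+
+```python
+# Example content of apps/<app>/auto_docs/<resource_id>.txt.
+# The scripts read these files with ast.literal_eval, so keep them valid dict literals.
+{
+    "tap": "Tapping this area opens the search page of the app.",
+    "text": "",
+    "v_swipe": "",
+    "h_swipe": "",
+    "long_press": ""
+}
+```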
+
+
+## 📖 To-Do List
+- [ ] Open source the Benchmark.
+- [x] Open source the configuration.
+
+## 😉 Citation
+```bib
+@misc{yang2023appagent,
+ title={AppAgent: Multimodal Agents as Smartphone Users},
+ author={Chi Zhang and Zhao Yang and Jiaxuan Liu and Yucheng Han and Xin Chen and Zebiao Huang and Bin Fu and Gang Yu},
+ year={2023},
+ eprint={2312.13771},
+ archivePrefix={arXiv},
+ primaryClass={cs.CV}
+}
+```
+
+## Star History
+
+[![Star History Chart](https://api.star-history.com/svg?repos=mnotgod96/AppAgent&type=Date)](https://star-history.com/#mnotgod96/AppAgent&Date)
+
+
+## License
+The [MIT license](./assets/license.txt).
diff --git a/docs/src/config.yaml b/docs/src/config.yaml
new file mode 100644
index 0000000..014884d
--- /dev/null
+++ b/docs/src/config.yaml
@@ -0,0 +1,14 @@
+OPENAI_API_BASE: "https://api.openai.com/v1/chat/completions"
+OPENAI_API_KEY: "sk-" # Set this to your OpenAI API key (sk-xxx), or to the key expected by your own endpoint if you host an OpenAI-compatible interface for an open LLM
+OPENAI_API_MODEL: "gpt-4-vision-preview" # Currently the only OpenAI model that accepts visual input
+MAX_TOKENS: 300 # The max token limit for the response completion
+TEMPERATURE: 0.0 # The temperature of the model: the lower the value, the more consistent the output of the model
+REQUEST_INTERVAL: 10 # Time in seconds between consecutive GPT-4V requests
+
+ANDROID_SCREENSHOT_DIR: "/sdcard/Pictures/Screenshots" # Set the directory on your Android device to store the intermediate screenshots. Make sure the directory EXISTS on your phone!
+ANDROID_XML_DIR: "/sdcard" # Set the directory on your Android device to store the intermediate XML files used for determining locations of UI elements on your screen. Make sure the directory EXISTS on your phone!
+
+DOC_REFINE: false # Setting this to true makes the agent refine existing documentation based on the latest demonstration; otherwise, the agent will not regenerate documentation for elements with the same resource ID.
+MAX_ROUNDS: 20 # Set the round limit for the agent to complete the task
+DARK_MODE: false # Set this to true if your app is in dark mode to enhance the element labeling
+MIN_DIST: 30 # The minimum distance between elements to prevent overlapping during the labeling process
\ No newline at end of file
diff --git a/docs/src/learn.py b/docs/src/learn.py
new file mode 100644
index 0000000..c922200
--- /dev/null
+++ b/docs/src/learn.py
@@ -0,0 +1,44 @@
+import argparse
+import datetime
+import os
+import time
+
+from scripts.utils import print_with_color
+
+arg_desc = "AppAgent - exploration phase"
+parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=arg_desc)
+parser.add_argument("--app")
+parser.add_argument("--root_dir", default="./")
+args = vars(parser.parse_args())
+
+app = args["app"]
+root_dir = args["root_dir"]
+
+
+print_with_color("Welcome to the exploration phase of AppAgent!\nThe exploration phase aims at generating "
+ "documentations for UI elements through either autonomous exploration or human demonstration. "
+ "Both options are task-oriented, which means you need to give a task description. During "
+ "autonomous exploration, the agent will try to complete the task by interacting with possible "
+ "elements on the UI within limited rounds. Documentations will be generated during the process of "
+ "interacting with the correct elements to proceed with the task. Human demonstration relies on "
+ "the user to show the agent how to complete the given task, and the agent will generate "
+ "documentations for the elements interacted during the human demo. To start, please enter the "
+ "main interface of the app on your phone.", "yellow")
+print_with_color("Choose from the following modes:\n1. autonomous exploration\n2. human demonstration\n"
+ "Type 1 or 2.", "blue")
+user_input = ""
+while user_input != "1" and user_input != "2":
+ user_input = input()
+
+if not app:
+ print_with_color("What is the name of the target app?", "blue")
+ app = input()
+ app = app.replace(" ", "")
+
+if user_input == "1":
+ os.system(f"python scripts/self_explorer.py --app {app} --root_dir {root_dir}")
+else:
+ demo_timestamp = int(time.time())
+ demo_name = datetime.datetime.fromtimestamp(demo_timestamp).strftime(f"demo_{app}_%Y-%m-%d_%H-%M-%S")
+ os.system(f"python scripts/step_recorder.py --app {app} --demo {demo_name} --root_dir {root_dir}")
+ os.system(f"python scripts/document_generation.py --app {app} --demo {demo_name} --root_dir {root_dir}")
diff --git a/docs/src/requirements.txt b/docs/src/requirements.txt
new file mode 100644
index 0000000..45072ca
--- /dev/null
+++ b/docs/src/requirements.txt
@@ -0,0 +1,6 @@
+argparse
+colorama
+opencv-python
+pyshine
+pyyaml
+requests
\ No newline at end of file
diff --git a/docs/src/run.py b/docs/src/run.py
new file mode 100644
index 0000000..96f9967
--- /dev/null
+++ b/docs/src/run.py
@@ -0,0 +1,25 @@
+import argparse
+import os
+
+from scripts.utils import print_with_color
+
+arg_desc = "AppAgent - deployment phase"
+parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=arg_desc)
+parser.add_argument("--app")
+parser.add_argument("--root_dir", default="./")
+args = vars(parser.parse_args())
+
+app = args["app"]
+root_dir = args["root_dir"]
+
+print_with_color("Welcome to the deployment phase of AppAgent!\nBefore giving me the task, you should first tell me "
+ "the name of the app you want me to operate and what documentation base you want me to use. I will "
+ "try my best to complete the task without your intervention. First, please enter the main interface "
+ "of the app on your phone and provide the following information.", "yellow")
+
+if not app:
+ print_with_color("What is the name of the target app?", "blue")
+ app = input()
+ app = app.replace(" ", "")
+
+os.system(f"python scripts/task_executor.py --app {app} --root_dir {root_dir}")
diff --git a/docs/src/scripts/and_controller.py b/docs/src/scripts/and_controller.py
new file mode 100644
index 0000000..769f5b9
--- /dev/null
+++ b/docs/src/scripts/and_controller.py
@@ -0,0 +1,180 @@
+import os
+import subprocess
+import xml.etree.ElementTree as ET
+
+from config import load_config
+from utils import print_with_color
+
+
+configs = load_config()
+
+
+class AndroidElement:
+ def __init__(self, uid, bbox, attrib):
+ self.uid = uid
+ self.bbox = bbox
+ self.attrib = attrib
+
+
+def execute_adb(adb_command):
+ # print(adb_command)
+ result = subprocess.run(adb_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+ if result.returncode == 0:
+ return result.stdout.strip()
+ print_with_color(f"Command execution failed: {adb_command}", "red")
+ print_with_color(result.stderr, "red")
+ return "ERROR"
+
+
+def list_all_devices():
+ adb_command = "adb devices"
+ device_list = []
+ result = execute_adb(adb_command)
+ if result != "ERROR":
+ devices = result.split("\n")[1:]
+ for d in devices:
+ device_list.append(d.split()[0])
+
+ return device_list
+
+
+def get_id_from_element(elem):
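+    # Build a readable identifier for the element: prefer its resource-id;
+    # otherwise fall back to the class name plus the element's width and height,
+    # and append a short content-desc when one is available.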
+ bounds = elem.attrib["bounds"][1:-1].split("][")
+ x1, y1 = map(int, bounds[0].split(","))
+ x2, y2 = map(int, bounds[1].split(","))
+ elem_w, elem_h = x2 - x1, y2 - y1
+ if "resource-id" in elem.attrib and elem.attrib["resource-id"]:
+ elem_id = elem.attrib["resource-id"].replace(":", ".").replace("/", "_")
+ else:
+ elem_id = f"{elem.attrib['class']}_{elem_w}_{elem_h}"
+ if "content-desc" in elem.attrib and elem.attrib["content-desc"] and len(elem.attrib["content-desc"]) < 20:
+ content_desc = elem.attrib['content-desc'].replace("/", "_").replace(" ", "").replace(":", "_")
+ elem_id += f"_{content_desc}"
+ return elem_id
+
+
+def traverse_tree(xml_path, elem_list, attrib, add_index=False):
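+    # Stream-parse the uiautomator XML dump and collect every element whose given
+    # attribute (e.g. "clickable") is "true". Each element id is prefixed with its
+    # parent's id, and elements whose centers lie within MIN_DIST of one already
+    # collected are skipped to avoid overlapping labels.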
+ path = []
+ for event, elem in ET.iterparse(xml_path, ['start', 'end']):
+ if event == 'start':
+ path.append(elem)
+ if attrib in elem.attrib and elem.attrib[attrib] == "true":
+ parent_prefix = ""
+ if len(path) > 1:
+ parent_prefix = get_id_from_element(path[-2])
+ bounds = elem.attrib["bounds"][1:-1].split("][")
+ x1, y1 = map(int, bounds[0].split(","))
+ x2, y2 = map(int, bounds[1].split(","))
+ center = (x1 + x2) // 2, (y1 + y2) // 2
+ elem_id = get_id_from_element(elem)
+ if parent_prefix:
+ elem_id = parent_prefix + "_" + elem_id
+ if add_index:
+ elem_id += f"_{elem.attrib['index']}"
+ close = False
+ for e in elem_list:
+ bbox = e.bbox
+ center_ = (bbox[0][0] + bbox[1][0]) // 2, (bbox[0][1] + bbox[1][1]) // 2
+ dist = (abs(center[0] - center_[0]) ** 2 + abs(center[1] - center_[1]) ** 2) ** 0.5
+ if dist <= configs["MIN_DIST"]:
+ close = True
+ break
+ if not close:
+ elem_list.append(AndroidElement(elem_id, ((x1, y1), (x2, y2)), attrib))
+
+ if event == 'end':
+ path.pop()
+
+
+class AndroidController:
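+    # Thin wrapper around adb for a single connected device: screenshots, UI XML
+    # dumps, taps, text input, long presses, and swipes.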
+ def __init__(self, device):
+ self.device = device
+ self.screenshot_dir = configs["ANDROID_SCREENSHOT_DIR"]
+ self.xml_dir = configs["ANDROID_XML_DIR"]
+ self.width, self.height = self.get_device_size()
+ self.backslash = "\\"
+
+ def get_device_size(self):
+ adb_command = f"adb -s {self.device} shell wm size"
+ result = execute_adb(adb_command)
+ if result != "ERROR":
+ return map(int, result.split(": ")[1].split("x"))
+ return 0, 0
+
+ def get_screenshot(self, prefix, save_dir):
+ cap_command = f"adb -s {self.device} shell screencap -p " \
+ f"{os.path.join(self.screenshot_dir, prefix + '.png').replace(self.backslash, '/')}"
+ pull_command = f"adb -s {self.device} pull " \
+ f"{os.path.join(self.screenshot_dir, prefix + '.png').replace(self.backslash, '/')} " \
+ f"{os.path.join(save_dir, prefix + '.png')}"
+ result = execute_adb(cap_command)
+ if result != "ERROR":
+ result = execute_adb(pull_command)
+ if result != "ERROR":
+ return os.path.join(save_dir, prefix + ".png")
+ return result
+ return result
+
+ def get_xml(self, prefix, save_dir):
+ dump_command = f"adb -s {self.device} shell uiautomator dump " \
+ f"{os.path.join(self.xml_dir, prefix + '.xml').replace(self.backslash, '/')}"
+ pull_command = f"adb -s {self.device} pull " \
+ f"{os.path.join(self.xml_dir, prefix + '.xml').replace(self.backslash, '/')} " \
+ f"{os.path.join(save_dir, prefix + '.xml')}"
+ result = execute_adb(dump_command)
+ if result != "ERROR":
+ result = execute_adb(pull_command)
+ if result != "ERROR":
+ return os.path.join(save_dir, prefix + ".xml")
+ return result
+ return result
+
+ def back(self):
+ adb_command = f"adb -s {self.device} shell input keyevent KEYCODE_BACK"
+ ret = execute_adb(adb_command)
+ return ret
+
+ def tap(self, x, y):
+ adb_command = f"adb -s {self.device} shell input tap {x} {y}"
+ ret = execute_adb(adb_command)
+ return ret
+
+ def text(self, input_str):
+ input_str = input_str.replace(" ", "%s")
+ input_str = input_str.replace("'", "")
+ adb_command = f"adb -s {self.device} shell input text {input_str}"
+ ret = execute_adb(adb_command)
+ return ret
+
+ def long_press(self, x, y, duration=1000):
+ adb_command = f"adb -s {self.device} shell input swipe {x} {y} {x} {y} {duration}"
+ ret = execute_adb(adb_command)
+ return ret
+
+ def swipe(self, x, y, direction, dist="medium", quick=False):
+ unit_dist = int(self.width / 10)
+ if dist == "long":
+ unit_dist *= 3
+ elif dist == "medium":
+ unit_dist *= 2
+ if direction == "up":
+ offset = 0, -2 * unit_dist
+ elif direction == "down":
+ offset = 0, 2 * unit_dist
+ elif direction == "left":
+ offset = -1 * unit_dist, 0
+ elif direction == "right":
+ offset = unit_dist, 0
+ else:
+ return "ERROR"
+ duration = 100 if quick else 400
+ adb_command = f"adb -s {self.device} shell input swipe {x} {y} {x+offset[0]} {y+offset[1]} {duration}"
+ ret = execute_adb(adb_command)
+ return ret
+
+ def swipe_precise(self, start, end, duration=400):
+ start_x, start_y = start
+ end_x, end_y = end
+        adb_command = f"adb -s {self.device} shell input swipe {start_x} {start_y} {end_x} {end_y} {duration}"
+ ret = execute_adb(adb_command)
+ return ret
diff --git a/docs/src/scripts/config.py b/docs/src/scripts/config.py
new file mode 100644
index 0000000..37afb99
--- /dev/null
+++ b/docs/src/scripts/config.py
@@ -0,0 +1,10 @@
+import os
+import yaml
+
+
+def load_config(config_path="./config.yaml"):
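+    # Start from the current environment variables and overlay the values read
+    # from config.yaml; keys defined in the YAML file take precedence.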
+ configs = dict(os.environ)
+ with open(config_path, "r") as file:
+ yaml_data = yaml.safe_load(file)
+ configs.update(yaml_data)
+ return configs
diff --git a/docs/src/scripts/document_generation.py b/docs/src/scripts/document_generation.py
new file mode 100644
index 0000000..b9a14fb
--- /dev/null
+++ b/docs/src/scripts/document_generation.py
@@ -0,0 +1,141 @@
+import argparse
+import ast
+import json
+import os
+import re
+import sys
+import time
+
+import prompts
+from config import load_config
+from model import ask_gpt4v
+from utils import print_with_color, encode_image
+
+arg_desc = "AppAgent - Human Demonstration"
+parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=arg_desc)
+parser.add_argument("--app", required=True)
+parser.add_argument("--demo", required=True)
+parser.add_argument("--root_dir", default="./")
+args = vars(parser.parse_args())
+
+configs = load_config()
+
+root_dir = args["root_dir"]
+work_dir = os.path.join(root_dir, "apps")
+if not os.path.exists(work_dir):
+ os.mkdir(work_dir)
+app = args["app"]
+work_dir = os.path.join(work_dir, app)
+demo_dir = os.path.join(work_dir, "demos")
+demo_name = args["demo"]
+task_dir = os.path.join(demo_dir, demo_name)
+xml_dir = os.path.join(task_dir, "xml")
+labeled_ss_dir = os.path.join(task_dir, "labeled_screenshots")
+record_path = os.path.join(task_dir, "record.txt")
+task_desc_path = os.path.join(task_dir, "task_desc.txt")
+if not os.path.exists(task_dir) or not os.path.exists(xml_dir) or not os.path.exists(labeled_ss_dir) \
+ or not os.path.exists(record_path) or not os.path.exists(task_desc_path):
+ sys.exit()
+log_path = os.path.join(task_dir, f"log_{app}_{demo_name}.txt")
+
+docs_dir = os.path.join(work_dir, "demo_docs")
+if not os.path.exists(docs_dir):
+ os.mkdir(docs_dir)
+
+print_with_color(f"Starting to generate documentations for the app {app} based on the demo {demo_name}", "yellow")
+doc_count = 0
+with open(record_path, "r") as infile:
+ step = len(infile.readlines()) - 1
+ infile.seek(0)
+ for i in range(1, step + 1):
+ img_before = encode_image(os.path.join(labeled_ss_dir, f"{demo_name}_{i}.png"))
+ img_after = encode_image(os.path.join(labeled_ss_dir, f"{demo_name}_{i + 1}.png"))
+ rec = infile.readline().strip()
+ action, resource_id = rec.split(":::")
+ action_type = action.split("(")[0]
+ action_param = re.findall(r"\((.*?)\)", action)[0]
+        if action_type == "tap":
+            prompt_template = prompts.tap_doc_template
+            prompt = re.sub(r"<ui_element>", action_param, prompt_template)
+        elif action_type == "text":
+            input_area, input_text = action_param.split(":sep:")
+            prompt_template = prompts.text_doc_template
+            prompt = re.sub(r"<ui_element>", input_area, prompt_template)
+        elif action_type == "long_press":
+            prompt_template = prompts.long_press_doc_template
+            prompt = re.sub(r"<ui_element>", action_param, prompt_template)
+        elif action_type == "swipe":
+            swipe_area, swipe_dir = action_param.split(":sep:")
+            if swipe_dir == "up" or swipe_dir == "down":
+                action_type = "v_swipe"
+            elif swipe_dir == "left" or swipe_dir == "right":
+                action_type = "h_swipe"
+            prompt_template = prompts.swipe_doc_template
+            prompt = re.sub(r"<swipe_dir>", swipe_dir, prompt_template)
+            prompt = re.sub(r"<ui_element>", swipe_area, prompt)
+        else:
+            break
+        task_desc = open(task_desc_path, "r").read()
+        prompt = re.sub(r"<task_desc>", task_desc, prompt)
+
+ doc_name = resource_id + ".txt"
+ doc_path = os.path.join(docs_dir, doc_name)
+
+ if os.path.exists(doc_path):
+ doc_content = ast.literal_eval(open(doc_path).read())
+ if doc_content[action_type]:
+ if configs["DOC_REFINE"]:
+                    suffix = re.sub(r"<old_doc>", doc_content[action_type], prompts.refine_doc_suffix)
+ prompt += suffix
+ print_with_color(f"Documentation for the element {resource_id} already exists. The doc will be "
+ f"refined based on the latest demo.", "yellow")
+ else:
+ print_with_color(f"Documentation for the element {resource_id} already exists. Turn on DOC_REFINE "
+ f"in the config file if needed.", "yellow")
+ continue
+ else:
+ doc_content = {
+ "tap": "",
+ "text": "",
+ "v_swipe": "",
+ "h_swipe": "",
+ "long_press": ""
+ }
+
+ print_with_color(f"Waiting for GPT-4V to generate documentation for the element {resource_id}", "yellow")
+ content = [
+ {
+ "type": "text",
+ "text": prompt
+ },
+ {
+ "type": "image_url",
+ "image_url": {
+ "url": f"data:image/jpeg;base64,{img_before}"
+ }
+ },
+ {
+ "type": "image_url",
+ "image_url": {
+ "url": f"data:image/jpeg;base64,{img_after}"
+ }
+ }
+ ]
+
+ rsp = ask_gpt4v(content)
+ if "error" not in rsp:
+ msg = rsp["choices"][0]["message"]["content"]
+ doc_content[action_type] = msg
+ with open(log_path, "a") as logfile:
+ log_item = {"step": i, "prompt": prompt, "image_before": f"{demo_name}_{i}.png",
+ "image_after": f"{demo_name}_{i + 1}.png", "response": rsp}
+ logfile.write(json.dumps(log_item) + "\n")
+ with open(doc_path, "w") as outfile:
+ outfile.write(str(doc_content))
+ doc_count += 1
+ print_with_color(f"Documentation generated and saved to {doc_path}", "yellow")
+ else:
+ print_with_color(rsp["error"]["message"], "red")
+ time.sleep(configs["REQUEST_INTERVAL"])
+
+print_with_color(f"Documentation generation phase completed. {doc_count} docs generated.", "yellow")
diff --git a/docs/src/scripts/model.py b/docs/src/scripts/model.py
new file mode 100644
index 0000000..62db761
--- /dev/null
+++ b/docs/src/scripts/model.py
@@ -0,0 +1,150 @@
+import re
+import requests
+
+from config import load_config
+from utils import print_with_color
+
+configs = load_config()
+
+
+def ask_gpt4v(content):
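+    # Send the multimodal content to the configured chat-completions endpoint,
+    # print an estimated cost based on the reported token usage, and return the
+    # parsed JSON response.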
+ headers = {
+ "Content-Type": "application/json",
+ "Authorization": f"Bearer {configs['OPENAI_API_KEY']}"
+ }
+ payload = {
+ "model": configs["OPENAI_API_MODEL"],
+ "messages": [
+ {
+ "role": "user",
+ "content": content
+ }
+ ],
+ "temperature": configs["TEMPERATURE"],
+ "max_tokens": configs["MAX_TOKENS"]
+ }
+ response = requests.post(configs["OPENAI_API_BASE"], headers=headers, json=payload)
+ if "error" not in response.json():
+ usage = response.json()["usage"]
+ prompt_tokens = usage["prompt_tokens"]
+ completion_tokens = usage["completion_tokens"]
+ print_with_color(f"Request cost is "
+ f"${'{0:.2f}'.format(prompt_tokens / 1000 * 0.01 + completion_tokens / 1000 * 0.03)}",
+ "yellow")
+ return response.json()
+
+
+def parse_explore_rsp(rsp):
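+    # Extract the Observation / Thought / Action / Summary sections from the model
+    # reply and convert the Action line into a normalized list such as
+    # ["tap", area, summary]; returns ["ERROR"] if parsing fails.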
+ try:
+ msg = rsp["choices"][0]["message"]["content"]
+ observation = re.findall(r"Observation: (.*?)$", msg, re.MULTILINE)[0]
+ think = re.findall(r"Thought: (.*?)$", msg, re.MULTILINE)[0]
+ act = re.findall(r"Action: (.*?)$", msg, re.MULTILINE)[0]
+ last_act = re.findall(r"Summary: (.*?)$", msg, re.MULTILINE)[0]
+ print_with_color("Observation:", "yellow")
+ print_with_color(observation, "magenta")
+ print_with_color("Thought:", "yellow")
+ print_with_color(think, "magenta")
+ print_with_color("Action:", "yellow")
+ print_with_color(act, "magenta")
+ print_with_color("Summary:", "yellow")
+ print_with_color(last_act, "magenta")
+ if "FINISH" in act:
+ return ["FINISH"]
+ act_name = act.split("(")[0]
+ if act_name == "tap":
+ area = int(re.findall(r"tap\((.*?)\)", act)[0])
+ return [act_name, area, last_act]
+ elif act_name == "text":
+ input_str = re.findall(r"text\((.*?)\)", act)[0][1:-1]
+ return [act_name, input_str, last_act]
+ elif act_name == "long_press":
+ area = int(re.findall(r"long_press\((.*?)\)", act)[0])
+ return [act_name, area, last_act]
+ elif act_name == "swipe":
+ params = re.findall(r"swipe\((.*?)\)", act)[0]
+ area, swipe_dir, dist = params.split(",")
+ area = int(area)
+ swipe_dir = swipe_dir.strip()[1:-1]
+ dist = dist.strip()[1:-1]
+ return [act_name, area, swipe_dir, dist, last_act]
+ elif act_name == "grid":
+ return [act_name]
+ else:
+ print_with_color(f"ERROR: Undefined act {act_name}!", "red")
+ return ["ERROR"]
+ except Exception as e:
+ print_with_color(f"ERROR: an exception occurs while parsing the model response: {e}", "red")
+ print_with_color(rsp, "red")
+ return ["ERROR"]
+
+
+def parse_grid_rsp(rsp):
+ try:
+ msg = rsp["choices"][0]["message"]["content"]
+ observation = re.findall(r"Observation: (.*?)$", msg, re.MULTILINE)[0]
+ think = re.findall(r"Thought: (.*?)$", msg, re.MULTILINE)[0]
+ act = re.findall(r"Action: (.*?)$", msg, re.MULTILINE)[0]
+ last_act = re.findall(r"Summary: (.*?)$", msg, re.MULTILINE)[0]
+ print_with_color("Observation:", "yellow")
+ print_with_color(observation, "magenta")
+ print_with_color("Thought:", "yellow")
+ print_with_color(think, "magenta")
+ print_with_color("Action:", "yellow")
+ print_with_color(act, "magenta")
+ print_with_color("Summary:", "yellow")
+ print_with_color(last_act, "magenta")
+ if "FINISH" in act:
+ return ["FINISH"]
+ act_name = act.split("(")[0]
+ if act_name == "tap":
+ params = re.findall(r"tap\((.*?)\)", act)[0].split(",")
+ area = int(params[0].strip())
+ subarea = params[1].strip()[1:-1]
+ return [act_name + "_grid", area, subarea, last_act]
+ elif act_name == "long_press":
+ params = re.findall(r"long_press\((.*?)\)", act)[0].split(",")
+ area = int(params[0].strip())
+ subarea = params[1].strip()[1:-1]
+ return [act_name + "_grid", area, subarea, last_act]
+ elif act_name == "swipe":
+ params = re.findall(r"swipe\((.*?)\)", act)[0].split(",")
+ start_area = int(params[0].strip())
+ start_subarea = params[1].strip()[1:-1]
+ end_area = int(params[2].strip())
+ end_subarea = params[3].strip()[1:-1]
+ return [act_name + "_grid", start_area, start_subarea, end_area, end_subarea, last_act]
+ elif act_name == "grid":
+ return [act_name]
+ else:
+ print_with_color(f"ERROR: Undefined act {act_name}!", "red")
+ return ["ERROR"]
+ except Exception as e:
+ print_with_color(f"ERROR: an exception occurs while parsing the model response: {e}", "red")
+ print_with_color(rsp, "red")
+ return ["ERROR"]
+
+
+def parse_reflect_rsp(rsp):
+ try:
+ msg = rsp["choices"][0]["message"]["content"]
+ decision = re.findall(r"Decision: (.*?)$", msg, re.MULTILINE)[0]
+ think = re.findall(r"Thought: (.*?)$", msg, re.MULTILINE)[0]
+ print_with_color("Decision:", "yellow")
+ print_with_color(decision, "magenta")
+ print_with_color("Thought:", "yellow")
+ print_with_color(think, "magenta")
+ if decision == "INEFFECTIVE":
+ return [decision, think]
+ elif decision == "BACK" or decision == "CONTINUE" or decision == "SUCCESS":
+ doc = re.findall(r"Documentation: (.*?)$", msg, re.MULTILINE)[0]
+ print_with_color("Documentation:", "yellow")
+ print_with_color(doc, "magenta")
+ return [decision, think, doc]
+ else:
+ print_with_color(f"ERROR: Undefined decision {decision}!", "red")
+ return ["ERROR"]
+ except Exception as e:
+ print_with_color(f"ERROR: an exception occurs while parsing the model response: {e}", "red")
+ print_with_color(rsp, "red")
+ return ["ERROR"]
diff --git a/docs/src/scripts/prompts.py b/docs/src/scripts/prompts.py
new file mode 100644
index 0000000..3fe83f8
--- /dev/null
+++ b/docs/src/scripts/prompts.py
@@ -0,0 +1,223 @@
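+# Prompt templates used by AppAgent's scripts. The angle-bracket tokens below
+# (e.g. <ui_element>, <task_desc>, <task_description>, <last_act>, <swipe_dir>,
+# <old_doc>, <action>) are placeholders that the calling scripts replace via
+# re.sub before sending a request to the model.
+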
+tap_doc_template = """I will give you the screenshot of a mobile app before and after tapping the UI element labeled
+with the number <ui_element> on the screen. The numeric tag of each element is located at the center of the element.
+Tapping this UI element is a necessary part of proceeding with a larger task, which is to <task_desc>. Your task is to
+describe the functionality of the UI element concisely in one or two sentences. Notice that your description of the UI
+element should focus on the general function. For example, if the UI element is used to navigate to the chat window
+with John, your description should not include the name of the specific person. Just say: "Tapping this area will
+navigate the user to the chat window". Never include the numeric tag of the UI element in your description. You can use
+pronouns such as "the UI element" to refer to the element."""
+
+text_doc_template = """I will give you the screenshot of a mobile app before and after typing in the input area labeled
+with the number <ui_element> on the screen. The numeric tag of each element is located at the center of the element.
+Typing in this UI element is a necessary part of proceeding with a larger task, which is to <task_desc>. Your task is
+to describe the functionality of the UI element concisely in one or two sentences. Notice that your description of the
+UI element should focus on the general function. For example, if the change of the screenshot shows that the user typed
+"How are you?" in the chat box, you do not need to mention the actual text. Just say: "This input area is used for the
+user to type a message to send to the chat window.". Never include the numeric tag of the UI element in your
+description. You can use pronouns such as "the UI element" to refer to the element."""
+
+long_press_doc_template = """I will give you the screenshot of a mobile app before and after long pressing the UI
+element labeled with the number <ui_element> on the screen. The numeric tag of each element is located at the center of
+the element. Long pressing this UI element is a necessary part of proceeding with a larger task, which is to
+<task_desc>. Your task is to describe the functionality of the UI element concisely in one or two sentences. Notice
+that your description of the UI element should focus on the general function. For example, if long pressing the UI
+element redirects the user to the chat window with John, your description should not include the name of the specific
+person. Just say: "Long pressing this area will redirect the user to the chat window". Never include the numeric tag of
+the UI element in your description. You can use pronouns such as "the UI element" to refer to the element."""
+
+swipe_doc_template = """I will give you the screenshot of a mobile app before and after swiping <swipe_dir> the UI
+element labeled with the number <ui_element> on the screen. The numeric tag of each element is located at the center of
+the element. Swiping this UI element is a necessary part of proceeding with a larger task, which is to <task_desc>.
+Your task is to describe the functionality of the UI element concisely in one or two sentences. Notice that your
+description of the UI element should be as general as possible. For example, if swiping the UI element increases the
+contrast ratio of an image of a building, your description should be just like this: "Swiping this area enables the
+user to tune a specific parameter of the image". Never include the numeric tag of the UI element in your description.
+You can use pronouns such as "the UI element" to refer to the element."""
+
+refine_doc_suffix = """\nA documentation of this UI element generated from previous demos is shown below. Your
+generated description should be based on this previous doc and optimize it. Notice that it is possible that your
+understanding of the function of the UI element derived from the given screenshots conflicts with the previous doc,
+because the function of a UI element can be flexible. In this case, your generated description should combine both.
+Old documentation of this UI element: <old_doc>"""
+
+task_template = """You are an agent that is trained to perform some basic tasks on a smartphone. You will be given a
+smartphone screenshot. The interactive UI elements on the screenshot are labeled with numeric tags starting from 1. The
+numeric tag of each interactive element is located in the center of the element.
+
+You can call the following functions to control the smartphone:
+
+1. tap(element: int)
+This function is used to tap an UI element shown on the smartphone screen.
+"element" is a numeric tag assigned to an UI element shown on the smartphone screen.
+A simple use case can be tap(5), which taps the UI element labeled with the number 5.
+
+2. text(text_input: str)
+This function is used to insert text input in an input field/box. text_input is the string you want to insert and must
+be wrapped with double quotation marks. A simple use case can be text("Hello, world!"), which inserts the string
+"Hello, world!" into the input area on the smartphone screen. This function is usually callable when you see a keyboard
+showing in the lower half of the screen.
+
+3. long_press(element: int)
+This function is used to long press an UI element shown on the smartphone screen.
+"element" is a numeric tag assigned to an UI element shown on the smartphone screen.
+A simple use case can be long_press(5), which long presses the UI element labeled with the number 5.
+
+4. swipe(element: int, direction: str, dist: str)
+This function is used to swipe an UI element shown on the smartphone screen, usually a scroll view or a slide bar.
+"element" is a numeric tag assigned to an UI element shown on the smartphone screen. "direction" is a string that
+represents one of the four directions: up, down, left, right. "direction" must be wrapped with double quotation
+marks. "dist" determines the distance of the swipe and can be one of the three options: short, medium, long. You should
+choose the appropriate distance option according to your need.
+A simple use case can be swipe(21, "up", "medium"), which swipes up the UI element labeled with the number 21 for a
+medium distance.
+
+5. grid()
+You should call this function when you find the element you want to interact with is not labeled with a numeric tag and
+other elements with numeric tags cannot help with the task. The function will bring up a grid overlay to divide the
+smartphone screen into small areas and this will give you more freedom to choose any part of the screen to tap, long
+press, or swipe.
+
+The task you need to complete is to <task_description>. Your past actions to proceed with this task are summarized as
+follows: <last_act>
+Now, given the documentation and the following labeled screenshot, you need to think and call the function needed to
+proceed with the task. Your output should include three parts in the given format:
+Observation:
+Thought:
+Action:
+Summary:
+You can only take one action at a time, so please directly call the function."""
+
+task_template_grid = """You are an agent that is trained to perform some basic tasks on a smartphone. You will be given
+a smartphone screenshot overlaid by a grid. The grid divides the screenshot into small square areas. Each area is
+labeled with an integer in the top-left corner.
+
+You can call the following functions to control the smartphone:
+
+1. tap(area: int, subarea: str)
+This function is used to tap a grid area shown on the smartphone screen. "area" is the integer label assigned to a grid
+area shown on the smartphone screen. "subarea" is a string representing the exact location to tap within the grid area.
+It can take one of the nine values: center, top-left, top, top-right, left, right, bottom-left, bottom, and
+bottom-right.
+A simple use case can be tap(5, "center"), which taps the exact center of the grid area labeled with the number 5.
+
+2. long_press(area: int, subarea: str)
+This function is used to long press a grid area shown on the smartphone screen. "area" is the integer label assigned to
+a grid area shown on the smartphone screen. "subarea" is a string representing the exact location to long press within
+the grid area. It can take one of the nine values: center, top-left, top, top-right, left, right, bottom-left, bottom,
+and bottom-right.
+A simple use case can be long_press(7, "top-left"), which long presses the top left part of the grid area labeled with
+the number 7.
+
+3. swipe(start_area: int, start_subarea: str, end_area: int, end_subarea: str)
+This function is used to perform a swipe action on the smartphone screen, especially when you want to interact with a
+scroll view or a slide bar. "start_area" is the integer label assigned to the grid area which marks the starting
+location of the swipe. "start_subarea" is a string representing the exact location to begin the swipe within the grid
+area. "end_area" is the integer label assigned to the grid area which marks the ending location of the swipe.
+"end_subarea" is a string representing the exact location to end the swipe within the grid area.
+The two subarea parameters can take one of the nine values: center, top-left, top, top-right, left, right, bottom-left,
+bottom, and bottom-right.
+A simple use case can be swipe(21, "center", 25, "right"), which performs a swipe starting from the center of grid area
+21 to the right part of grid area 25.
+
+The task you need to complete is to <task_description>. Your past actions to proceed with this task are summarized as
+follows: <last_act>
+Now, given the following labeled screenshot, you need to think and call the function needed to proceed with the task.
+Your output should include three parts in the given format:
+Observation:
+Thought:
+Action:
+Summary:
+You can only take one action at a time, so please directly call the function."""
+
+self_explore_task_template = """You are an agent that is trained to complete certain tasks on a smartphone. You will be
+given a screenshot of a smartphone app. The interactive UI elements on the screenshot are labeled with numeric tags
+starting from 1.
+
+You can call the following functions to interact with those labeled elements to control the smartphone:
+
+1. tap(element: int)
+This function is used to tap an UI element shown on the smartphone screen.
+"element" is a numeric tag assigned to an UI element shown on the smartphone screen.
+A simple use case can be tap(5), which taps the UI element labeled with the number 5.
+
+2. text(text_input: str)
+This function is used to insert text input in an input field/box. text_input is the string you want to insert and must
+be wrapped with double quotation marks. A simple use case can be text("Hello, world!"), which inserts the string
+"Hello, world!" into the input area on the smartphone screen. This function is only callable when you see a keyboard
+showing in the lower half of the screen.
+
+3. long_press(element: int)
+This function is used to long press an UI element shown on the smartphone screen.
+"element" is a numeric tag assigned to an UI element shown on the smartphone screen.
+A simple use case can be long_press(5), which long presses the UI element labeled with the number 5.
+
+4. swipe(element: int, direction: str, dist: str)
+This function is used to swipe an UI element shown on the smartphone screen, usually a scroll view or a slide bar.
+"element" is a numeric tag assigned to an UI element shown on the smartphone screen. "direction" is a string that
+represents one of the four directions: up, down, left, right. "direction" must be wrapped with double quotation
+marks. "dist" determines the distance of the swipe and can be one of the three options: short, medium, long. You should
+choose the appropriate distance option according to your need.
+A simple use case can be swipe(21, "up", "medium"), which swipes up the UI element labeled with the number 21 for a
+medium distance.
+
+The task you need to complete is to <task_description>. Your past actions to proceed with this task are summarized as
+follows: <last_act>
+Now, given the following labeled screenshot, you need to think and call the function needed to proceed with the task.
+Your output should include three parts in the given format:
+Observation:
+Thought:
+Action:
+Summary:
+You can only take one action at a time, so please directly call the function."""
+
+self_explore_reflect_template = """I will give you screenshots of a mobile app before and after <action> the UI
+element labeled with the number '<ui_element>' on the first screenshot. The numeric tag of each element is located at
+the center of the element. The action of this UI element was described as follows:
+<last_act>
+The action was also an attempt to proceed with a larger task, which is to <task_desc>. Your job is to carefully analyze
+the difference between the two screenshots to determine if the action is in accord with the description above and at
+the same time effectively moved the task forward. Your output should be determined based on the following situations:
+1. BACK
+If you think the action navigated you to a page where you cannot proceed with the given task, you should go back to the
+previous interface. At the same time, describe the functionality of the UI element concisely in one or two sentences by
+observing the difference between the two screenshots. Notice that your description of the UI element should focus on
+the general function. Never include the numeric tag of the UI element in your description. You can use pronouns such as
+"the UI element" to refer to the element. Your output should be in the following format:
+Decision: BACK
+Thought:
+Documentation:
+2. INEFFECTIVE
+If you find the action changed nothing on the screen (screenshots before and after the action are identical), you
+should continue to interact with other elements on the screen. Notice that if you find the location of the cursor
+changed between the two screenshots, then they are not identical. Your output should be in the following format:
+Decision: INEFFECTIVE
+Thought:
+3. CONTINUE
+If you find the action changed something on the screen but does not reflect the action description above and did not
+move the given task forward, you should continue to interact with other elements on the screen. At the same time,
+describe the functionality of the UI element concisely in one or two sentences by observing the difference between the
+two screenshots. Notice that your description of the UI element should focus on the general function. Never include the
+numeric tag of the UI element in your description. You can use pronouns such as "the UI element" to refer to the
+element. Your output should be in the following format:
+Decision: CONTINUE
+Thought:
+Documentation:
+4. SUCCESS
+If you think the action successfully moved the task forward (even though it did not complete the task), you should
+describe the functionality of the UI element concisely in one or two sentences. Notice that your description of the UI
+element should focus on the general function. Never include the numeric tag of the UI element in your description. You
+can use pronouns such as "the UI element" to refer to the element. Your output should be in the following format:
+Decision: SUCCESS
+Thought:
+Documentation:
+"""
diff --git a/docs/src/scripts/self_explorer.py b/docs/src/scripts/self_explorer.py
new file mode 100644
index 0000000..4b7553d
--- /dev/null
+++ b/docs/src/scripts/self_explorer.py
@@ -0,0 +1,282 @@
+import argparse
+import ast
+import datetime
+import json
+import os
+import re
+import sys
+import time
+
+import prompts
+from config import load_config
+from and_controller import list_all_devices, AndroidController, traverse_tree
+from model import ask_gpt4v, parse_explore_rsp, parse_reflect_rsp
+from utils import print_with_color, draw_bbox_multi, encode_image
+
+arg_desc = "AppAgent - Autonomous Exploration"
+parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=arg_desc)
+parser.add_argument("--app")
+parser.add_argument("--root_dir", default="./")
+args = vars(parser.parse_args())
+
+configs = load_config()
+
+app = args["app"]
+root_dir = args["root_dir"]
+
+if not app:
+ print_with_color("What is the name of the target app?", "blue")
+ app = input()
+ app = app.replace(" ", "")
+
+work_dir = os.path.join(root_dir, "apps")
+if not os.path.exists(work_dir):
+ os.mkdir(work_dir)
+work_dir = os.path.join(work_dir, app)
+if not os.path.exists(work_dir):
+ os.mkdir(work_dir)
+demo_dir = os.path.join(work_dir, "demos")
+if not os.path.exists(demo_dir):
+ os.mkdir(demo_dir)
+demo_timestamp = int(time.time())
+task_name = datetime.datetime.fromtimestamp(demo_timestamp).strftime("self_explore_%Y-%m-%d_%H-%M-%S")
+task_dir = os.path.join(demo_dir, task_name)
+os.mkdir(task_dir)
+docs_dir = os.path.join(work_dir, "auto_docs")
+if not os.path.exists(docs_dir):
+ os.mkdir(docs_dir)
+explore_log_path = os.path.join(task_dir, f"log_explore_{task_name}.txt")
+reflect_log_path = os.path.join(task_dir, f"log_reflect_{task_name}.txt")
+
+device_list = list_all_devices()
+if not device_list:
+ print_with_color("ERROR: No device found!", "red")
+ sys.exit()
+print_with_color(f"List of devices attached:\n{str(device_list)}", "yellow")
+if len(device_list) == 1:
+ device = device_list[0]
+ print_with_color(f"Device selected: {device}", "yellow")
+else:
+ print_with_color("Please choose the Android device to start demo by entering its ID:", "blue")
+ device = input()
+controller = AndroidController(device)
+width, height = controller.get_device_size()
+if not width and not height:
+ print_with_color("ERROR: Invalid device size!", "red")
+ sys.exit()
+print_with_color(f"Screen resolution of {device}: {width}x{height}", "yellow")
+
+print_with_color("Please enter the description of the task you want me to complete in a few sentences:", "blue")
+task_desc = input()
+
+round_count = 0
+doc_count = 0
+useless_list = set()
+last_act = "None"
+task_complete = False
+while round_count < configs["MAX_ROUNDS"]:
+ round_count += 1
+ print_with_color(f"Round {round_count}", "yellow")
+ screenshot_before = controller.get_screenshot(f"{round_count}_before", task_dir)
+ xml_path = controller.get_xml(f"{round_count}", task_dir)
+ if screenshot_before == "ERROR" or xml_path == "ERROR":
+ break
+ clickable_list = []
+ focusable_list = []
+ traverse_tree(xml_path, clickable_list, "clickable", True)
+ traverse_tree(xml_path, focusable_list, "focusable", True)
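+    # Merge clickable and focusable elements: skip elements previously judged
+    # useless, and drop focusable elements whose centers are within MIN_DIST of an
+    # already-collected clickable element.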
+ elem_list = []
+ for elem in clickable_list:
+ if elem.uid in useless_list:
+ continue
+ elem_list.append(elem)
+ for elem in focusable_list:
+ if elem.uid in useless_list:
+ continue
+ bbox = elem.bbox
+ center = (bbox[0][0] + bbox[1][0]) // 2, (bbox[0][1] + bbox[1][1]) // 2
+ close = False
+ for e in clickable_list:
+ bbox = e.bbox
+ center_ = (bbox[0][0] + bbox[1][0]) // 2, (bbox[0][1] + bbox[1][1]) // 2
+ dist = (abs(center[0] - center_[0]) ** 2 + abs(center[1] - center_[1]) ** 2) ** 0.5
+ if dist <= configs["MIN_DIST"]:
+ close = True
+ break
+ if not close:
+ elem_list.append(elem)
+ draw_bbox_multi(screenshot_before, os.path.join(task_dir, f"{round_count}_before_labeled.png"), elem_list,
+ dark_mode=configs["DARK_MODE"])
+
+    prompt = re.sub(r"<task_description>", task_desc, prompts.self_explore_task_template)
+    prompt = re.sub(r"<last_act>", last_act, prompt)
+ base64_img_before = encode_image(os.path.join(task_dir, f"{round_count}_before_labeled.png"))
+ content = [
+ {
+ "type": "text",
+ "text": prompt
+ },
+ {
+ "type": "image_url",
+ "image_url": {
+ "url": f"data:image/jpeg;base64,{base64_img_before}"
+ }
+ }
+ ]
+ print_with_color("Thinking about what to do in the next step...", "yellow")
+ rsp = ask_gpt4v(content)
+
+ if "error" not in rsp:
+ with open(explore_log_path, "a") as logfile:
+ log_item = {"step": round_count, "prompt": prompt, "image": f"{round_count}_before_labeled.png",
+ "response": rsp}
+ logfile.write(json.dumps(log_item) + "\n")
+ res = parse_explore_rsp(rsp)
+ act_name = res[0]
+ last_act = res[-1]
+ res = res[:-1]
+ if act_name == "FINISH":
+ task_complete = True
+ break
+ if act_name == "tap":
+ _, area = res
+ tl, br = elem_list[area - 1].bbox
+ x, y = (tl[0] + br[0]) // 2, (tl[1] + br[1]) // 2
+ ret = controller.tap(x, y)
+ if ret == "ERROR":
+ print_with_color("ERROR: tap execution failed", "red")
+ break
+ elif act_name == "text":
+ _, input_str = res
+ ret = controller.text(input_str)
+ if ret == "ERROR":
+ print_with_color("ERROR: text execution failed", "red")
+ break
+ elif act_name == "long_press":
+ _, area = res
+ tl, br = elem_list[area - 1].bbox
+ x, y = (tl[0] + br[0]) // 2, (tl[1] + br[1]) // 2
+ ret = controller.long_press(x, y)
+ if ret == "ERROR":
+ print_with_color("ERROR: long press execution failed", "red")
+ break
+ elif act_name == "swipe":
+ _, area, swipe_dir, dist = res
+ tl, br = elem_list[area - 1].bbox
+ x, y = (tl[0] + br[0]) // 2, (tl[1] + br[1]) // 2
+ ret = controller.swipe(x, y, swipe_dir, dist)
+ if ret == "ERROR":
+ print_with_color("ERROR: swipe execution failed", "red")
+ break
+ else:
+ break
+ time.sleep(configs["REQUEST_INTERVAL"])
+ else:
+ print_with_color(rsp["error"]["message"], "red")
+ break
+
+ screenshot_after = controller.get_screenshot(f"{round_count}_after", task_dir)
+ if screenshot_after == "ERROR":
+ break
+ draw_bbox_multi(screenshot_after, os.path.join(task_dir, f"{round_count}_after_labeled.png"), elem_list,
+ dark_mode=configs["DARK_MODE"])
+ base64_img_after = encode_image(os.path.join(task_dir, f"{round_count}_after_labeled.png"))
+
+ if act_name == "tap":
+        prompt = re.sub(r"<action>", "tapping", prompts.self_explore_reflect_template)
+ elif act_name == "text":
+ continue
+ elif act_name == "long_press":
+        prompt = re.sub(r"<action>", "long pressing", prompts.self_explore_reflect_template)
+ elif act_name == "swipe":
+ swipe_dir = res[2]
+ if swipe_dir == "up" or swipe_dir == "down":
+ act_name = "v_swipe"
+ elif swipe_dir == "left" or swipe_dir == "right":
+ act_name = "h_swipe"
+        prompt = re.sub(r"<action>", "swiping", prompts.self_explore_reflect_template)
+ else:
+ print_with_color("ERROR: Undefined act!", "red")
+ break
+    prompt = re.sub(r"<ui_element>", str(area), prompt)
+    prompt = re.sub(r"<task_desc>", task_desc, prompt)
+    prompt = re.sub(r"<last_act>", last_act, prompt)
+
+ content = [
+ {
+ "type": "text",
+ "text": prompt
+ },
+ {
+ "type": "image_url",
+ "image_url": {
+ "url": f"data:image/jpeg;base64,{base64_img_before}"
+ }
+ },
+ {
+ "type": "image_url",
+ "image_url": {
+ "url": f"data:image/jpeg;base64,{base64_img_after}"
+ }
+ }
+ ]
+ print_with_color("Reflecting on my previous action...", "yellow")
+ rsp = ask_gpt4v(content)
+ if "error" not in rsp:
+ resource_id = elem_list[int(area) - 1].uid
+ with open(reflect_log_path, "a") as logfile:
+ log_item = {"step": round_count, "prompt": prompt, "image_before": f"{round_count}_before_labeled.png",
+ "image_after": f"{round_count}_after.png", "response": rsp}
+ logfile.write(json.dumps(log_item) + "\n")
+ res = parse_reflect_rsp(rsp)
+ decision = res[0]
+ if decision == "ERROR":
+ break
+ if decision == "INEFFECTIVE":
+ useless_list.add(resource_id)
+ last_act = "None"
+ elif decision == "BACK" or decision == "CONTINUE" or decision == "SUCCESS":
+ if decision == "BACK" or decision == "CONTINUE":
+ useless_list.add(resource_id)
+ last_act = "None"
+ if decision == "BACK":
+ ret = controller.back()
+ if ret == "ERROR":
+ print_with_color("ERROR: back execution failed", "red")
+ break
+ doc = res[-1]
+ doc_name = resource_id + ".txt"
+ doc_path = os.path.join(docs_dir, doc_name)
+ if os.path.exists(doc_path):
+ doc_content = ast.literal_eval(open(doc_path).read())
+ if doc_content[act_name]:
+ print_with_color(f"Documentation for the element {resource_id} already exists.", "yellow")
+ continue
+ else:
+ doc_content = {
+ "tap": "",
+ "text": "",
+ "v_swipe": "",
+ "h_swipe": "",
+ "long_press": ""
+ }
+ doc_content[act_name] = doc
+ with open(doc_path, "w") as outfile:
+ outfile.write(str(doc_content))
+ doc_count += 1
+ print_with_color(f"Documentation generated and saved to {doc_path}", "yellow")
+ else:
+ print_with_color(f"ERROR: Undefined decision! {decision}", "red")
+ break
+ else:
+ print_with_color(rsp["error"]["message"], "red")
+ break
+ time.sleep(configs["REQUEST_INTERVAL"])
+
+if task_complete:
+ print_with_color(f"Autonomous exploration completed successfully. {doc_count} docs generated.", "yellow")
+elif round_count == configs["MAX_ROUNDS"]:
+ print_with_color(f"Autonomous exploration finished due to reaching max rounds. {doc_count} docs generated.",
+ "yellow")
+else:
+ print_with_color(f"Autonomous exploration finished unexpectedly. {doc_count} docs generated.", "red")
diff --git a/docs/src/scripts/step_recorder.py b/docs/src/scripts/step_recorder.py
new file mode 100644
index 0000000..14c65d6
--- /dev/null
+++ b/docs/src/scripts/step_recorder.py
@@ -0,0 +1,183 @@
+import argparse
+import datetime
+
+import cv2
+import os
+import shutil
+import sys
+import time
+
+from and_controller import list_all_devices, AndroidController, traverse_tree
+from config import load_config
+from utils import print_with_color, draw_bbox_multi
+
+arg_desc = "AppAgent - Human Demonstration"
+parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=arg_desc)
+parser.add_argument("--app")
+parser.add_argument("--demo")
+parser.add_argument("--root_dir", default="./")
+args = vars(parser.parse_args())
+
+app = args["app"]
+demo_name = args["demo"]
+root_dir = args["root_dir"]
+
+configs = load_config()
+
+if not app:
+ print_with_color("What is the name of the app you are going to demo?", "blue")
+ app = input()
+ app = app.replace(" ", "")
+if not demo_name:
+ demo_timestamp = int(time.time())
+ demo_name = datetime.datetime.fromtimestamp(demo_timestamp).strftime(f"demo_{app}_%Y-%m-%d_%H-%M-%S")
+
+work_dir = os.path.join(root_dir, "apps")
+if not os.path.exists(work_dir):
+ os.mkdir(work_dir)
+work_dir = os.path.join(work_dir, app)
+if not os.path.exists(work_dir):
+ os.mkdir(work_dir)
+demo_dir = os.path.join(work_dir, "demos")
+if not os.path.exists(demo_dir):
+ os.mkdir(demo_dir)
+task_dir = os.path.join(demo_dir, demo_name)
+if os.path.exists(task_dir):
+ shutil.rmtree(task_dir)
+os.mkdir(task_dir)
+raw_ss_dir = os.path.join(task_dir, "raw_screenshots")
+os.mkdir(raw_ss_dir)
+xml_dir = os.path.join(task_dir, "xml")
+os.mkdir(xml_dir)
+labeled_ss_dir = os.path.join(task_dir, "labeled_screenshots")
+os.mkdir(labeled_ss_dir)
+record_path = os.path.join(task_dir, "record.txt")
+record_file = open(record_path, "w")
+task_desc_path = os.path.join(task_dir, "task_desc.txt")
+
+device_list = list_all_devices()
+if not device_list:
+ print_with_color("ERROR: No device found!", "red")
+ sys.exit()
+print_with_color("List of devices attached:\n" + str(device_list), "yellow")
+if len(device_list) == 1:
+ device = device_list[0]
+ print_with_color(f"Device selected: {device}", "yellow")
+else:
+ print_with_color("Please choose the Android device to start demo by entering its ID:", "blue")
+ device = input()
+controller = AndroidController(device)
+width, height = controller.get_device_size()
+if not width and not height:
+ print_with_color("ERROR: Invalid device size!", "red")
+ sys.exit()
+print_with_color(f"Screen resolution of {device}: {width}x{height}", "yellow")
+
+print_with_color("Please state the goal of your following demo actions clearly, e.g. send a message to John", "blue")
+task_desc = input()
+with open(task_desc_path, "w") as f:
+ f.write(task_desc)
+
+print_with_color("All interactive elements on the screen are labeled with red and blue numeric tags. Elements "
+ "labeled with red tags are clickable elements; elements labeled with blue tags are scrollable "
+ "elements.", "blue")
+
+step = 0
+while True:
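+    # One recorded step per iteration: capture a screenshot and an XML dump, label
+    # the interactive elements, ask the user for an action, execute it on the
+    # device, and append it to record.txt until the user types "stop".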
+ step += 1
+ screenshot_path = controller.get_screenshot(f"{demo_name}_{step}", raw_ss_dir)
+ xml_path = controller.get_xml(f"{demo_name}_{step}", xml_dir)
+ if screenshot_path == "ERROR" or xml_path == "ERROR":
+ break
+ clickable_list = []
+ focusable_list = []
+ traverse_tree(xml_path, clickable_list, "clickable", True)
+ traverse_tree(xml_path, focusable_list, "focusable", True)
+ elem_list = clickable_list.copy()
+ for elem in focusable_list:
+ bbox = elem.bbox
+ center = (bbox[0][0] + bbox[1][0]) // 2, (bbox[0][1] + bbox[1][1]) // 2
+ close = False
+ for e in clickable_list:
+ bbox = e.bbox
+ center_ = (bbox[0][0] + bbox[1][0]) // 2, (bbox[0][1] + bbox[1][1]) // 2
+ dist = (abs(center[0] - center_[0]) ** 2 + abs(center[1] - center_[1]) ** 2) ** 0.5
+ if dist <= configs["MIN_DIST"]:
+ close = True
+ break
+ if not close:
+ elem_list.append(elem)
+ labeled_img = draw_bbox_multi(screenshot_path, os.path.join(labeled_ss_dir, f"{demo_name}_{step}.png"), elem_list,
+ True)
+ cv2.imshow("image", labeled_img)
+ cv2.waitKey(0)
+ cv2.destroyAllWindows()
+ user_input = "xxx"
+ print_with_color("Choose one of the following actions you want to perform on the current screen:\ntap, text, long "
+ "press, swipe, stop", "blue")
+ while user_input.lower() != "tap" and user_input.lower() != "text" and user_input.lower() != "long press" \
+ and user_input.lower() != "swipe" and user_input.lower() != "stop":
+ user_input = input()
+ if user_input.lower() == "tap":
+ print_with_color(f"Which element do you want to tap? Choose a numeric tag from 1 to {len(elem_list)}:", "blue")
+ user_input = "xxx"
+ while not user_input.isnumeric() or int(user_input) > len(elem_list) or int(user_input) < 1:
+ user_input = input()
+ tl, br = elem_list[int(user_input) - 1].bbox
+ x, y = (tl[0] + br[0]) // 2, (tl[1] + br[1]) // 2
+ ret = controller.tap(x, y)
+ if ret == "ERROR":
+ print_with_color("ERROR: tap execution failed", "red")
+ break
+ record_file.write(f"tap({int(user_input)}):::{elem_list[int(user_input) - 1].uid}\n")
+ elif user_input.lower() == "text":
+ print_with_color(f"Which element do you want to input the text string? Choose a numeric tag from 1 to "
+ f"{len(elem_list)}:", "blue")
+ input_area = "xxx"
+ while not input_area.isnumeric() or int(input_area) > len(elem_list) or int(input_area) < 1:
+ input_area = input()
+ print_with_color("Enter your input text below:", "blue")
+ user_input = ""
+ while not user_input:
+ user_input = input()
+ controller.text(user_input)
+ record_file.write(f"text({input_area}:sep:\"{user_input}\"):::{elem_list[int(input_area) - 1].uid}\n")
+ elif user_input.lower() == "long press":
+ print_with_color(f"Which element do you want to long press? Choose a numeric tag from 1 to {len(elem_list)}:",
+ "blue")
+ user_input = "xxx"
+ while not user_input.isnumeric() or int(user_input) > len(elem_list) or int(user_input) < 1:
+ user_input = input()
+ tl, br = elem_list[int(user_input) - 1].bbox
+ x, y = (tl[0] + br[0]) // 2, (tl[1] + br[1]) // 2
+ ret = controller.long_press(x, y)
+ if ret == "ERROR":
+ print_with_color("ERROR: long press execution failed", "red")
+ break
+ record_file.write(f"long_press({int(user_input)}):::{elem_list[int(user_input) - 1].uid}\n")
+ elif user_input.lower() == "swipe":
+ print_with_color(f"What is the direction of your swipe? Choose one from the following options:\nup, down, left,"
+ f" right", "blue")
+ user_input = ""
+ while user_input != "up" and user_input != "down" and user_input != "left" and user_input != "right":
+ user_input = input()
+ swipe_dir = user_input
+ print_with_color(f"Which element do you want to swipe? Choose a numeric tag from 1 to {len(elem_list)}:")
+ while not user_input.isnumeric() or int(user_input) > len(elem_list) or int(user_input) < 1:
+ user_input = input()
+ tl, br = elem_list[int(user_input) - 1].bbox
+ x, y = (tl[0] + br[0]) // 2, (tl[1] + br[1]) // 2
+ ret = controller.swipe(x, y, swipe_dir)
+ if ret == "ERROR":
+ print_with_color("ERROR: swipe execution failed", "red")
+ break
+ record_file.write(f"swipe({int(user_input)}:sep:{swipe_dir}):::{elem_list[int(user_input) - 1].uid}\n")
+ elif user_input.lower() == "stop":
+ record_file.write("stop\n")
+ record_file.close()
+ break
+ else:
+ break
+ time.sleep(3)
+
+print_with_color(f"Demonstration phase completed. {step} steps were recorded.", "yellow")
diff --git a/docs/src/scripts/task_executor.py b/docs/src/scripts/task_executor.py
new file mode 100644
index 0000000..0edea7d
--- /dev/null
+++ b/docs/src/scripts/task_executor.py
@@ -0,0 +1,289 @@
+import argparse
+import ast
+import datetime
+import json
+import os
+import re
+import sys
+import time
+
+import prompts
+from config import load_config
+from and_controller import list_all_devices, AndroidController, traverse_tree
+from model import ask_gpt4v, parse_explore_rsp, parse_grid_rsp
+from utils import print_with_color, draw_bbox_multi, encode_image, draw_grid
+
+arg_desc = "AppAgent Executor"
+parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=arg_desc)
+parser.add_argument("--app")
+parser.add_argument("--root_dir", default="./")
+args = vars(parser.parse_args())
+
+configs = load_config()
+
+app = args["app"]
+root_dir = args["root_dir"]
+
+if not app:
+ print_with_color("What is the name of the app you want me to operate?", "blue")
+ app = input()
+ app = app.replace(" ", "")
+
+app_dir = os.path.join(os.path.join(root_dir, "apps"), app)
+work_dir = os.path.join(root_dir, "tasks")
+if not os.path.exists(work_dir):
+ os.mkdir(work_dir)
+auto_docs_dir = os.path.join(app_dir, "auto_docs")
+demo_docs_dir = os.path.join(app_dir, "demo_docs")
+task_timestamp = int(time.time())
+dir_name = datetime.datetime.fromtimestamp(task_timestamp).strftime(f"task_{app}_%Y-%m-%d_%H-%M-%S")
+task_dir = os.path.join(work_dir, dir_name)
+os.mkdir(task_dir)
+log_path = os.path.join(task_dir, f"log_{app}_{dir_name}.txt")
+
+no_doc = False
+if not os.path.exists(auto_docs_dir) and not os.path.exists(demo_docs_dir):
+ print_with_color(f"No documentations found for the app {app}. Do you want to proceed with no docs? Enter y or n",
+ "red")
+ user_input = ""
+ while user_input != "y" and user_input != "n":
+ user_input = input().lower()
+ if user_input == "y":
+ no_doc = True
+ else:
+ sys.exit()
+elif os.path.exists(auto_docs_dir) and os.path.exists(demo_docs_dir):
+ print_with_color(f"The app {app} has documentations generated from both autonomous exploration and human "
+ f"demonstration. Which one do you want to use? Type 1 or 2.\n1. Autonomous exploration\n2. Human "
+ f"Demonstration",
+ "blue")
+ user_input = ""
+ while user_input != "1" and user_input != "2":
+ user_input = input()
+ if user_input == "1":
+ docs_dir = auto_docs_dir
+ else:
+ docs_dir = demo_docs_dir
+elif os.path.exists(auto_docs_dir):
+ print_with_color(f"Documentations generated from autonomous exploration were found for the app {app}. The doc base "
+ f"is selected automatically.", "yellow")
+ docs_dir = auto_docs_dir
+else:
+ print_with_color(f"Documentations generated from human demonstration were found for the app {app}. The doc base is "
+ f"selected automatically.", "yellow")
+ docs_dir = demo_docs_dir
+
+device_list = list_all_devices()
+if not device_list:
+ print_with_color("ERROR: No device found!", "red")
+ sys.exit()
+print_with_color(f"List of devices attached:\n{str(device_list)}", "yellow")
+if len(device_list) == 1:
+ device = device_list[0]
+ print_with_color(f"Device selected: {device}", "yellow")
+else:
+ print_with_color("Please choose the Android device to start demo by entering its ID:", "blue")
+ device = input()
+controller = AndroidController(device)
+width, height = controller.get_device_size()
+if not width and not height:
+ print_with_color("ERROR: Invalid device size!", "red")
+ sys.exit()
+print_with_color(f"Screen resolution of {device}: {width}x{height}", "yellow")
+
+print_with_color("Please enter the description of the task you want me to complete in a few sentences:", "blue")
+task_desc = input()
+
+round_count = 0
+last_act = "None"
+task_complete = False
+grid_on = False
+rows, cols = 0, 0
+
+
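+# Map a 1-based grid cell index (row-major, `cols` cells per row) and a named subarea
+# ("top-left", "top", ..., "bottom-right"; anything else means the center) to pixel
+# coordinates inside that cell, at quarter/half offsets of the cell width and height.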
+def area_to_xy(area, subarea):
+ area -= 1
+ row, col = area // cols, area % cols
+ x_0, y_0 = col * (width // cols), row * (height // rows)
+ if subarea == "top-left":
+ x, y = x_0 + (width // cols) // 4, y_0 + (height // rows) // 4
+ elif subarea == "top":
+ x, y = x_0 + (width // cols) // 2, y_0 + (height // rows) // 4
+ elif subarea == "top-right":
+ x, y = x_0 + (width // cols) * 3 // 4, y_0 + (height // rows) // 4
+ elif subarea == "left":
+ x, y = x_0 + (width // cols) // 4, y_0 + (height // rows) // 2
+ elif subarea == "right":
+ x, y = x_0 + (width // cols) * 3 // 4, y_0 + (height // rows) // 2
+ elif subarea == "bottom-left":
+ x, y = x_0 + (width // cols) // 4, y_0 + (height // rows) * 3 // 4
+ elif subarea == "bottom":
+ x, y = x_0 + (width // cols) // 2, y_0 + (height // rows) * 3 // 4
+ elif subarea == "bottom-right":
+ x, y = x_0 + (width // cols) * 3 // 4, y_0 + (height // rows) * 3 // 4
+ else:
+ x, y = x_0 + (width // cols) // 2, y_0 + (height // rows) // 2
+ return x, y
+
+
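+# Main task loop: each round captures a screenshot and UI-hierarchy XML, builds either a
+# labeled-element prompt or a grid-overlay prompt, sends the text plus the encoded image to
+# GPT-4V, then parses the response and executes the chosen action on the device.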
+while round_count < configs["MAX_ROUNDS"]:
+ round_count += 1
+ print_with_color(f"Round {round_count}", "yellow")
+ screenshot_path = controller.get_screenshot(f"{dir_name}_{round_count}", task_dir)
+ xml_path = controller.get_xml(f"{dir_name}_{round_count}", task_dir)
+ if screenshot_path == "ERROR" or xml_path == "ERROR":
+ break
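+ # Grid mode: overlay a numbered grid on the raw screenshot so the model can address
+ # arbitrary screen locations by cell and subarea instead of by labeled UI elements.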
+ if grid_on:
+ rows, cols = draw_grid(screenshot_path, os.path.join(task_dir, f"{dir_name}_{round_count}_grid.png"))
+ base64_img = encode_image(os.path.join(task_dir, f"{dir_name}_{round_count}_grid.png"))
+ prompt = prompts.task_template_grid
+ else:
+ clickable_list = []
+ focusable_list = []
+ traverse_tree(xml_path, clickable_list, "clickable", True)
+ traverse_tree(xml_path, focusable_list, "focusable", True)
+ elem_list = clickable_list.copy()
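+ # Merge focusable elements into the list, skipping any whose center lies within
+ # MIN_DIST of an already-collected clickable element, so the same widget is not labeled twice.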
+ for elem in focusable_list:
+ bbox = elem.bbox
+ center = (bbox[0][0] + bbox[1][0]) // 2, (bbox[0][1] + bbox[1][1]) // 2
+ close = False
+ for e in clickable_list:
+ bbox = e.bbox
+ center_ = (bbox[0][0] + bbox[1][0]) // 2, (bbox[0][1] + bbox[1][1]) // 2
+ dist = (abs(center[0] - center_[0]) ** 2 + abs(center[1] - center_[1]) ** 2) ** 0.5
+ if dist <= configs["MIN_DIST"]:
+ close = True
+ break
+ if not close:
+ elem_list.append(elem)
+ draw_bbox_multi(screenshot_path, os.path.join(task_dir, f"{dir_name}_{round_count}_labeled.png"), elem_list,
+ dark_mode=configs["DARK_MODE"])
+ base64_img = encode_image(os.path.join(task_dir, f"{dir_name}_{round_count}_labeled.png"))
+ if no_doc:
+ prompt = re.sub(r"", "", prompts.task_template)
+ else:
+ ui_doc = """
+ You also have access to the following documentations that describes the functionalities of UI
+ elements you can interact on the screen. These docs are crucial for you to determine the target of your
+ next action. You should always prioritize these documented elements for interaction:"""
+ for i, elem in enumerate(elem_list):
+ doc_path = os.path.join(docs_dir, f"{elem.uid}.txt")
+ if not os.path.exists(doc_path):
+ continue
+ ui_doc += f"Documentation of UI element labeled with the numeric tag '{i + 1}':\n"
+ doc_content = ast.literal_eval(open(doc_path, "r").read())
+ if doc_content["tap"]:
+ ui_doc += f"This UI element is clickable. {doc_content['tap']}\n\n"
+ if doc_content["text"]:
+ ui_doc += f"This UI element can receive text input. The text input is used for the following " \
+ f"purposes: {doc_content['text']}\n\n"
+ if doc_content["long_press"]:
+ ui_doc += f"This UI element is long clickable. {doc_content['long_press']}\n\n"
+ if doc_content["v_swipe"]:
+ ui_doc += f"This element can be swiped directly without tapping. You can swipe vertically on " \
+ f"this UI element. {doc_content['v_swipe']}\n\n"
+ if doc_content["h_swipe"]:
+ ui_doc += f"This element can be swiped directly without tapping. You can swipe horizontally on " \
+ f"this UI element. {doc_content['h_swipe']}\n\n"
+ print_with_color(f"Documentations retrieved for the current interface:\n{ui_doc}", "magenta")
+ prompt = re.sub(r"", ui_doc, prompts.task_template)
+ prompt = re.sub(r"", task_desc, prompt)
+ prompt = re.sub(r"", last_act, prompt)
+ content = [
+ {
+ "type": "text",
+ "text": prompt
+ },
+ {
+ "type": "image_url",
+ "image_url": {
+ "url": f"data:image/jpeg;base64,{base64_img}"
+ }
+ }
+ ]
+ print_with_color("Thinking about what to do in the next step...", "yellow")
+ rsp = ask_gpt4v(content)
+
+ if "error" not in rsp:
+ with open(log_path, "a") as logfile:
+ log_item = {"step": round_count, "prompt": prompt, "image": f"{dir_name}_{round_count}_labeled.png",
+ "response": rsp}
+ logfile.write(json.dumps(log_item) + "\n")
+ if grid_on:
+ res = parse_grid_rsp(rsp)
+ else:
+ res = parse_explore_rsp(rsp)
+ act_name = res[0]
+ if act_name == "FINISH":
+ task_complete = True
+ break
+ if act_name == "ERROR":
+ break
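+ # The last element of the parsed response is kept as last_act for the next round's prompt;
+ # the remaining elements are the action name and its arguments.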
+ last_act = res[-1]
+ res = res[:-1]
+ if act_name == "tap":
+ _, area = res
+ tl, br = elem_list[area - 1].bbox
+ x, y = (tl[0] + br[0]) // 2, (tl[1] + br[1]) // 2
+ ret = controller.tap(x, y)
+ if ret == "ERROR":
+ print_with_color("ERROR: tap execution failed", "red")
+ break
+ elif act_name == "text":
+ _, input_str = res
+ ret = controller.text(input_str)
+ if ret == "ERROR":
+ print_with_color("ERROR: text execution failed", "red")
+ break
+ elif act_name == "long_press":
+ _, area = res
+ tl, br = elem_list[area - 1].bbox
+ x, y = (tl[0] + br[0]) // 2, (tl[1] + br[1]) // 2
+ ret = controller.long_press(x, y)
+ if ret == "ERROR":
+ print_with_color("ERROR: long press execution failed", "red")
+ break
+ elif act_name == "swipe":
+ _, area, swipe_dir, dist = res
+ tl, br = elem_list[area - 1].bbox
+ x, y = (tl[0] + br[0]) // 2, (tl[1] + br[1]) // 2
+ ret = controller.swipe(x, y, swipe_dir, dist)
+ if ret == "ERROR":
+ print_with_color("ERROR: swipe execution failed", "red")
+ break
+ elif act_name == "grid":
+ grid_on = True
+ elif act_name == "tap_grid" or act_name == "long_press_grid":
+ _, area, subarea = res
+ x, y = area_to_xy(area, subarea)
+ if act_name == "tap_grid":
+ ret = controller.tap(x, y)
+ if ret == "ERROR":
+ print_with_color("ERROR: tap execution failed", "red")
+ break
+ else:
+ ret = controller.long_press(x, y)
+ if ret == "ERROR":
+ print_with_color("ERROR: tap execution failed", "red")
+ break
+ elif act_name == "swipe_grid":
+ _, start_area, start_subarea, end_area, end_subarea = res
+ start_x, start_y = area_to_xy(start_area, start_subarea)
+ end_x, end_y = area_to_xy(end_area, end_subarea)
+ ret = controller.swipe_precise((start_x, start_y), (end_x, end_y))
+ if ret == "ERROR":
+ print_with_color("ERROR: tap execution failed", "red")
+ break
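+ # The grid overlay stays active only for the round in which the model requests it;
+ # any other action switches the next round back to labeled-element mode.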
+ if act_name != "grid":
+ grid_on = False
+ time.sleep(configs["REQUEST_INTERVAL"])
+ else:
+ print_with_color(rsp["error"]["message"], "red")
+ break
+
+if task_complete:
+ print_with_color("Task completed successfully", "yellow")
+elif round_count == configs["MAX_ROUNDS"]:
+ print_with_color("Task finished due to reaching max rounds", "yellow")
+else:
+ print_with_color("Task finished unexpectedly", "red")
diff --git a/docs/src/scripts/utils.py b/docs/src/scripts/utils.py
new file mode 100644
index 0000000..19d8b80
--- /dev/null
+++ b/docs/src/scripts/utils.py
@@ -0,0 +1,100 @@
+import base64
+import cv2
+import pyshine as ps
+
+from colorama import Fore, Style
+
+
+def print_with_color(text: str, color=""):
+ if color == "red":
+ print(Fore.RED + text)
+ elif color == "green":
+ print(Fore.GREEN + text)
+ elif color == "yellow":
+ print(Fore.YELLOW + text)
+ elif color == "blue":
+ print(Fore.BLUE + text)
+ elif color == "magenta":
+ print(Fore.MAGENTA + text)
+ elif color == "cyan":
+ print(Fore.CYAN + text)
+ elif color == "white":
+ print(Fore.WHITE + text)
+ elif color == "black":
+ print(Fore.BLACK + text)
+ else:
+ print(text)
+ print(Style.RESET_ALL)
+
+
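+# Annotate the screenshot with a numeric tag near the center of each element's bounding box.
+# In record mode the tag background is color-coded by the element's attribute (clickable,
+# focusable, or other); otherwise a light or dark scheme is chosen based on dark_mode.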
+def draw_bbox_multi(img_path, output_path, elem_list, record_mode=False, dark_mode=False):
+ imgcv = cv2.imread(img_path)
+ count = 1
+ for elem in elem_list:
+ try:
+ top_left = elem.bbox[0]
+ bottom_right = elem.bbox[1]
+ left, top = top_left[0], top_left[1]
+ right, bottom = bottom_right[0], bottom_right[1]
+ label = str(count)
+ if record_mode:
+ if elem.attrib == "clickable":
+ color = (250, 0, 0)
+ elif elem.attrib == "focusable":
+ color = (0, 0, 250)
+ else:
+ color = (0, 250, 0)
+ imgcv = ps.putBText(imgcv, label, text_offset_x=(left + right) // 2 + 10, text_offset_y=(top + bottom) // 2 + 10,
+ vspace=10, hspace=10, font_scale=1, thickness=2, background_RGB=color,
+ text_RGB=(255, 250, 250), alpha=0.5)
+ else:
+ text_color = (10, 10, 10) if dark_mode else (255, 250, 250)
+ bg_color = (255, 250, 250) if dark_mode else (10, 10, 10)
+ imgcv = ps.putBText(imgcv, label, text_offset_x=(left + right) // 2 + 10, text_offset_y=(top + bottom) // 2 + 10,
+ vspace=10, hspace=10, font_scale=1, thickness=2, background_RGB=bg_color,
+ text_RGB=text_color, alpha=0.5)
+ except Exception as e:
+ print_with_color(f"ERROR: An exception occurs while labeling the image\n{e}", "red")
+ count += 1
+ cv2.imwrite(output_path, imgcv)
+ return imgcv
+
+
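+# Overlay a numbered grid on the screenshot and return its (rows, cols). get_unit_len looks
+# for a cell size between 120 and 180 px that divides the screen dimension exactly; if none
+# exists, 120 px cells are used and any remainder strip at the right/bottom is left unlabeled.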
+def draw_grid(img_path, output_path):
+ def get_unit_len(n):
+ for i in range(1, n + 1):
+ if n % i == 0 and 120 <= i <= 180:
+ return i
+ return -1
+
+ image = cv2.imread(img_path)
+ height, width, _ = image.shape
+ color = (255, 116, 113)
+ unit_height = get_unit_len(height)
+ if unit_height < 0:
+ unit_height = 120
+ unit_width = get_unit_len(width)
+ if unit_width < 0:
+ unit_width = 120
+ thick = int(unit_width // 50)
+ rows = height // unit_height
+ cols = width // unit_width
+ for i in range(rows):
+ for j in range(cols):
+ label = i * cols + j + 1
+ left = int(j * unit_width)
+ top = int(i * unit_height)
+ right = int((j + 1) * unit_width)
+ bottom = int((i + 1) * unit_height)
+ cv2.rectangle(image, (left, top), (right, bottom), color, thick // 2)
+ cv2.putText(image, str(label), (left + int(unit_width * 0.05) + 3, top + int(unit_height * 0.3) + 3), 0,
+ int(0.01 * unit_width), (0, 0, 0), thick)
+ cv2.putText(image, str(label), (left + int(unit_width * 0.05), top + int(unit_height * 0.3)), 0,
+ int(0.01 * unit_width), color, thick)
+ cv2.imwrite(output_path, image)
+ return rows, cols
+
+
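+# Read an image file and return its contents as a base64 string for the GPT-4V image_url payload.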
+def encode_image(image_path):
+ with open(image_path, "rb") as image_file:
+ return base64.b64encode(image_file.read()).decode('utf-8')
diff --git a/docs/tree.html b/docs/tree.html
new file mode 100644
index 0000000..c261563
--- /dev/null
+++ b/docs/tree.html
@@ -0,0 +1,141 @@
+<!-- docs/tree.html: interactive project-structure page for mnotgod96/AppAgent. -->
+<!-- Page title: "Project structure of: mnotgod96/AppAgent" -->
+<!-- Root node: "AppAgent - Autonomous Android emulator tool with OpenAI integration" -->
+<!-- (remaining markup of the page is not reproduced in this extract) -->