Merge pull request #2 from kturung/multimodal-support

Feature - Multimodal vision support
kturung · Jul 7, 2024 · d9af3fa · d9af3fa
2 parents 8078cc8 + be064c5
commit d9af3fa
Show file tree

Hide file tree

Showing 5 changed files with 153 additions and 31 deletions.
diff --git a/.env b/.env
@@ -1,2 +1,3 @@
 ANTHROPIC_API_KEY=your_api_key
-E2B_API_KEY=your_api_key
+E2B_API_KEY=your_api_key
+BROWSER=none
diff --git a/.gitignore b/.gitignore
@@ -5,6 +5,8 @@ build/
 # Production build output
 /dist
 
+/uploaded_files
+
 # Debug logs from npm
 npm-debug.log*
 yarn-debug.log*
@@ -15,6 +17,10 @@ yarn-error.log*
 .vscode/
 *.swp
 *.swo
+.env
+
+src/App.js
+sandboxid.txt
 
 # Operating System Files
 .DS_Store
@@ -27,4 +33,8 @@ Thumbs.db
 .eslintcache
 
 # Optional tsc command output
-*.tsbuildinfo
+*.tsbuildinfo
+
+package-lock.json
+
+application.flag
diff --git a/README.md b/README.md
@@ -11,7 +11,9 @@ https://github.com/kturung/langgraph_streamlit_codeassistant/assets/37293938/bdc
 https://github.com/kturung/langgraph_streamlit_codeassistant/assets/37293938/cc64a6cd-ab31-4ad0-a490-48e4df08fba6
 
 
-
+| **New Feature** | Feature Description                           | Notes                                                                                                                                                                                                                   |
+|-----------------------|-----------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| **03.07.2024**        | **Multimodal Support (Vision Capability)**    | - **Enables the application** to process and analyze images alongside text and code.<br>- **Allows the AI to generate content from referenced images** for a wider range of tasks.<br>- **Integration**: Seamlessly added to the existing framework. |
 
 
 ## Key Features and Functionalities

diff --git a/main.py b/main.py
@@ -4,13 +4,20 @@
 from langchain_core.tools import tool
 from langgraph.prebuilt import ToolNode, tools_condition
 from langgraph.graph import MessageGraph, END
-from langchain_core.messages import AIMessage
+from langchain_core.messages import AIMessage, HumanMessage
 from e2b_code_interpreter import CodeInterpreter
 import base64
 import streamlit.components.v1 as components
 import subprocess
 from langchain.pydantic_v1 import BaseModel, Field
 import shutil
+import platform
+import time
+import threading
+import queue
+import re
+
+
 
 
 #get .env variables
@@ -21,21 +28,6 @@
 
 col1, col2, col3, col4 = st.columns([0.05, 0.45, 0.05, 0.45])
 
-# This is to start the react visualization component
-@st.cache_resource
-def run_async_command():
-    if not st.session_state.get('command_run', False):
-        process = subprocess.Popen(
-            "npm start",
-            shell=True,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
-            text=True
-        )
-        st.session_state.process = process
-        st.session_state.command_run = True
-
-run_async_command()
 
 @tool
 def execute_python(code: str):
@@ -96,22 +88,121 @@ def send_file_to_user(filepath: str):
         f.write(file_in_bytes) 
     return "File sent to the user successfully."
 
+class NpmDepdencySchema(BaseModel):
+    package_names: str = Field(description="Name of the npm packages to install. Should be space-separated.")
+
+@tool("install_npm_dependencies", args_schema=NpmDepdencySchema ,return_direct=True)
+def install_npm_dependencies(package_names: str):
+    """Installs the given npm dependencies and returns the result of the installation."""
+    try:
+        # Split the package_names string into a list of individual package names
+        package_list = package_names.split()
+        npm_cmd = "npm.cmd" if platform.system() == "Windows" else "npm"
+        # Construct the command with each package name as a separate argument
+        command = [npm_cmd, "install"] + package_list
+        result = subprocess.run(
+            command,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            text=True,
+            check=True
+        )
+    except subprocess.CalledProcessError as e:
+        return f"Failed to install npm packages '{package_names}': {e.stderr}"
+
+    return f"Successfully installed npm packages '{package_names}'"
+
 class ReactInputSchema(BaseModel):
-    code: str = Field(description="src/App.js code to render a react component.")
+    code: str = Field(description="src/App.js code to render a react component. Should not contain local file import statements.")
 
 # This is for agent to render react component on the fly with the given code
 @tool("render_react", args_schema=ReactInputSchema, return_direct=True)
 def render_react(code: str):
     """Render a react component with the given code and return the render result."""
     cwd = os.getcwd()
-    with open(f"{cwd}\\src\\App.js", "w") as f:
+    file_path = os.path.join(cwd, "src", "App.js")
+    with open(file_path, "w", encoding="utf-8") as f:
         f.write(code)
-    filename = f"application.flag"
-    with open(filename, "w") as f:
+    # Determine the appropriate command based on the operating system
+    npm_cmd = "npm.cmd" if platform.system() == "Windows" else "npm"
+
+    # Stop any running node processes before starting the React application
+    try:
+        if platform.system() == "Windows":
+            subprocess.run(["taskkill", "/F", "/IM", "node.exe"], check=True)
+        else:
+            subprocess.run(["pkill", "node"], check=True)
+    except subprocess.CalledProcessError:
+        pass
+
+    output_queue = queue.Queue()
+    error_messages = []
+    success_pattern = re.compile(r'Compiled successfully|webpack compiled successfully')
+    error_pattern = re.compile(r'Failed to compile|Error:|ERROR in')
+    start_time = time.time()
+
+    def handle_output(stream, prefix):
+        for line in iter(stream.readline, ''):
+            output_queue.put(f"{prefix}: {line.strip()}")
+        stream.close()
+
+    try:
+        process = subprocess.Popen(
+            [npm_cmd, "start"],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            text=True,
+            bufsize=1
+        )
+
+        stdout_thread = threading.Thread(target=handle_output, args=(process.stdout, "stdout"))
+        stderr_thread = threading.Thread(target=handle_output, args=(process.stderr, "stderr"))
+
+        stdout_thread.start()
+        stderr_thread.start()
+
+        compilation_failed = False
+
+        while True:
+            try:
+                line = output_queue.get(timeout=5)  # Wait for 5 seconds for new output
+                print(line)  # Print the output for debugging
+
+                if success_pattern.search(line):
+                    with open("application.flag", "w") as f:
+                        f.write("flag")
+                    return "npm start completed successfully"
+
+                if error_pattern.search(line):
+                    compilation_failed = True
+                    error_messages.append(line)
+
+                if compilation_failed and "webpack compiled with" in line:
+                    return "npm start failed with errors:\n" + "\n".join(error_messages)
+
+            except queue.Empty:
+                # Check if we've exceeded the timeout
+                if time.time() - start_time > 30:
+                    return f"npm start process timed out after 30 seconds"
+
+            if not stdout_thread.is_alive() and not stderr_thread.is_alive():
+                # Both output streams have closed
+                break
+
+    except Exception as e:
+        return f"An error occurred: {str(e)}"
+
+    if error_messages:
+        return "npm start failed with errors:\n" + "\n".join(error_messages)
+
+    with open("application.flag", "w") as f:
         f.write("flag")
-    return "React component rendered successfully."
+    return "npm start completed without obvious errors or success messages"
+
+
+
 
-tools = [execute_python, render_react, send_file_to_user]
+tools = [execute_python, render_react, send_file_to_user, install_npm_dependencies]
 
 # LangGraph to orchestrate the workflow of the chatbot
 @st.cache_resource
@@ -152,6 +243,7 @@ def initialize_session_state():
 """}]
         st.session_state["filesuploaded"] = False
         st.session_state["tool_text_list"] = []
+        st.session_state["image_data"] = ""
         sandboxmain = CodeInterpreter.create()
         sandboxid = sandboxmain.id
         sandboxmain.keep_alive(300)
@@ -183,12 +275,18 @@ def initialize_session_state():
         if not os.path.exists(save_path):
             os.makedirs(save_path)
         for uploaded_file in uploaded_files:
+            _, file_extension = os.path.splitext(uploaded_file.name)
+            file_extension = file_extension.lower()
             file_path = os.path.join(save_path, uploaded_file.name)
             with open(file_path, "wb") as f:
                 f.write(uploaded_file.getbuffer())
             with open(file_path, "rb") as f:
                 remote_path = sandbox.upload_file(f)  
                 print(f"Uploaded file to {remote_path}")
+            if file_extension in ['.jpeg', '.jpg', '.png']:
+                file_path = os.path.join(save_path, uploaded_file.name)
+                with open(file_path, "rb") as f:
+                    st.session_state.image_data = base64.b64encode(f.read()).decode("utf-8")
         uploaded_file_names = [uploaded_file.name for uploaded_file in uploaded_files]
         uploaded_files_prompt = f"\n\nThese files are saved to disk. User may ask questions about them. {', '.join(uploaded_file_names)}"
         st.session_state["messages"][0]["content"] += uploaded_files_prompt
@@ -215,7 +313,19 @@ def initialize_session_state():
 
     if user_prompt:
         messages.chat_message("user").write(user_prompt)
-        st.session_state.messages.append({"role": "user", "content": user_prompt})
+        if st.session_state.image_data:
+            st.session_state.messages.append(HumanMessage(
+            content=[
+                {"type": "text", "text": user_prompt},
+                {
+                    "type": "image_url",
+                    "image_url": {"url": f"data:image/jpeg;base64,{st.session_state.image_data}"},
+                },
+            ],
+        ))
+            st.session_state.image_data = ""
+        else:
+            st.session_state.messages.append({"role": "user", "content": user_prompt})
         st.session_state.chat_history.append({"role": "user", "content": {"type": "text", "text": user_prompt}})
 
         thread = {"configurable": {"thread_id": "4"}}
@@ -256,7 +366,7 @@ def initialize_session_state():
 if os.path.exists("application.flag"):
     with col4:
         st.header('Application Preview')
-        react_app_url = "http://localhost:3000"
+        react_app_url = f"http://localhost:3000?t={int(time.time())}"
         components.iframe(src=react_app_url, height=700)
 
 if os.path.exists("downloads") and os.listdir("downloads"):

diff --git a/requirements.txt b/requirements.txt
@@ -1,7 +1,6 @@
 e2b_code_interpreter==0.0.10
 langchain==0.2.6
-langchain_anthropic==0.1.17
-langchain_core==0.2.10
-langgraph==0.1.4
-python-dotenv==1.0.1
+langchain_anthropic==0.1.19
+langchain_core==0.2.11
+langgraph==0.1.5
 streamlit==1.36.0