Merge pull request #152 from michaelhhogue/ollama-llava
Add support for LLaVA through Ollama
joshbickett authored Feb 9, 2024
2 parents 10bb8bf + ce2d42e commit 33af25c
Showing 3 changed files with 164 additions and 7 deletions.
25 changes: 25 additions & 0 deletions README.md
@@ -108,6 +108,31 @@ Start `operate` with the SoM model
```
operate -m gpt-4-with-som
```

### Locally Hosted LLaVA Through Ollama
If you wish to experiment with the Self-Operating Computer Framework using LLaVA on your own machine, you can do so with Ollama!
*Note: Ollama currently only supports macOS and Linux.*

First, install Ollama on your machine from https://ollama.ai/download.

Once Ollama is installed, pull the LLaVA model:
```
ollama pull llava
```
This downloads the model to your machine; it requires approximately 5 GB of storage.

When Ollama has finished pulling LLaVA, start the server:
```
ollama serve
```
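
Optionally, you can confirm the server is reachable before launching `operate`. Here is a minimal sketch using the `ollama` Python client (the same client the framework uses internally); it assumes the default server address of `localhost:11434` and the response shape of the `ollama` package version pinned in `requirements.txt`:
```
import ollama

# ollama.list() queries the local Ollama server and raises if it is unreachable.
models = ollama.list()["models"]
print("llava pulled:", any(m["name"].startswith("llava") for m in models))
```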

That's it! Now start `operate` and select the LLaVA model:
```
operate -m llava
```
**Important:** Error rates when using LLaVA are very high. This mode is simply intended as a foundation to build on as local multimodal models improve over time.
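
Under the hood, `operate` sends each screenshot to LLaVA through the `ollama` Python client (see `operate/models/apis.py` below). A minimal sketch of that interaction, assuming the server from the previous step is running and a screenshot exists at the given path:
```
import ollama

response = ollama.chat(
    model="llava",
    messages=[
        {
            "role": "user",
            "content": "Describe what you see in this screenshot.",
            "images": ["screenshots/screenshot.png"],  # path to a local image file
        }
    ],
)
print(response["message"]["content"])
```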

Learn more about Ollama at its [GitHub Repository](https://www.github.com/ollama/ollama).

### Voice Mode `--voice`
The framework supports voice input for the objective. Try voice mode by following the instructions below.
**Clone the repo** to a directory on your computer:
85 changes: 84 additions & 1 deletion operate/models/apis.py
@@ -5,7 +5,7 @@
import traceback
import io
import easyocr

import ollama

from PIL import Image
from ultralytics import YOLO
@@ -53,6 +53,9 @@ async def get_next_action(model, messages, objective, session_id):
return "coming soon"
elif model == "gemini-pro-vision":
return call_gemini_pro_vision(messages, objective), None
elif model == "llava":
operation = call_ollama_llava(messages), None
return operation

raise ModelNotRecognizedException(model)

@@ -464,6 +467,86 @@ async def call_gpt_4_vision_preview_labeled(messages, objective):
        return call_gpt_4_vision_preview(messages)


def call_ollama_llava(messages):
    if VERBOSE:
        print("[call_ollama_llava]")
    time.sleep(1)
    # Initialized up front so the generic exception handler below can always
    # print it, even when the failure happens before the model responds.
    content = ""
    try:
        screenshots_dir = "screenshots"
        if not os.path.exists(screenshots_dir):
            os.makedirs(screenshots_dir)

        screenshot_filename = os.path.join(screenshots_dir, "screenshot.png")
        # Call the function to capture the screen with the cursor
        capture_screen_with_cursor(screenshot_filename)

        if len(messages) == 1:
            user_prompt = get_user_first_message_prompt()
        else:
            user_prompt = get_user_prompt()

        if VERBOSE:
            print(
                "[call_ollama_llava] user_prompt",
                user_prompt,
            )

        vision_message = {
            "role": "user",
            "content": user_prompt,
            "images": [screenshot_filename],
        }
        messages.append(vision_message)

        response = ollama.chat(
            model="llava",
            messages=messages,
        )

        # Important: Remove the image path from the message history.
        # Ollama will attempt to load each image reference and will
        # eventually timeout.
        messages[-1]["images"] = None

        content = response["message"]["content"].strip()

        if content.startswith("```json"):
            content = content[len("```json") :]  # Remove starting ```json
        if content.endswith("```"):
            content = content[: -len("```")]  # Remove ending ```

        assistant_message = {"role": "assistant", "content": content}
        if VERBOSE:
            print(
                "[call_ollama_llava] content",
                content,
            )
        content = json.loads(content)

        messages.append(assistant_message)

        return content

    except ollama.ResponseError as e:
        print(
            f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Operate] Couldn't connect to Ollama. With Ollama installed, run `ollama pull llava` then `ollama serve`{ANSI_RESET}",
            e,
        )

    except Exception as e:
        print(
            f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[Operate] That did not work. Trying again {ANSI_RESET}",
            e,
        )
        print(
            f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] AI response was {ANSI_RESET}",
            content,
        )
        if VERBOSE:
            traceback.print_exc()
        return call_ollama_llava(messages)
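
# A hedged illustration of the JSON-fence stripping above, using a
# hypothetical LLaVA reply: the slicing removes only the Markdown fence
# markers, and json.loads tolerates the leftover whitespace.
def _demo_fence_stripping():
    raw = '```json\n[{"thought": "the task is complete", "operation": "done"}]\n```'
    content = raw.strip()
    if content.startswith("```json"):
        content = content[len("```json") :]
    if content.endswith("```"):
        content = content[: -len("```")]
    return json.loads(content)  # -> [{'thought': ..., 'operation': 'done'}]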


def get_last_assistant_message(messages):
"""
Retrieve the last message from the assistant in the messages array.
61 changes: 55 additions & 6 deletions requirements.txt
@@ -1,53 +1,102 @@
aiohttp==3.9.1
aiosignal==1.3.1
annotated-types==0.6.0
anyio==3.7.1
attrs==23.2.0
cachetools==5.3.2
certifi==2023.7.22
charset-normalizer==3.3.2
colorama==0.4.6
contourpy==1.2.0
cycler==0.12.1
distro==1.8.0
easyocr==1.7.1
EasyProcess==1.1
entrypoint2==1.1
exceptiongroup==1.1.3
filelock==3.13.1
fonttools==4.44.0
frozenlist==1.4.1
fsspec==2024.2.0
google-ai-generativelanguage==0.4.0
google-api-core==2.16.2
google-auth==2.27.0
google-generativeai==0.3.0
googleapis-common-protos==1.62.0
grpcio==1.60.1
grpcio-status==1.60.1
h11==0.14.0
httpcore==1.0.2
httpx==0.25.2
idna==3.4
imageio==2.33.1
importlib-resources==6.1.1
Jinja2==3.1.3
kiwisolver==1.4.5
lazy_loader==0.3
MarkupSafe==2.1.5
matplotlib==3.8.1
MouseInfo==0.1.3
mpmath==1.3.0
mss==9.0.1
multidict==6.0.5
networkx==3.2.1
ninja==1.11.1.1
numpy==1.26.1
ollama==0.1.6
openai==1.2.3
opencv-python==4.9.0.80
opencv-python-headless==4.9.0.80
packaging==23.2
pandas==2.2.0
Pillow==10.1.0
prompt-toolkit==3.0.39
proto-plus==1.23.0
protobuf==4.25.2
psutil==5.9.8
py-cpuinfo==9.0.0
pyasn1==0.5.1
pyasn1-modules==0.3.0
PyAutoGUI==0.9.54
pyclipper==1.3.0.post5
pydantic==2.4.2
pydantic_core==2.10.1
PyGetWindow==0.0.9
PyMsgBox==1.0.9
pyobjc-core==10.1
pyobjc-framework-Cocoa==10.1
pyobjc-framework-Quartz==10.1
pyparsing==3.1.1
pyperclip==1.8.2
PyRect==0.2.0
pyscreenshot==3.1
PyScreeze==0.1.29
python-bidi==0.4.2
python-dateutil==2.8.2
python-dotenv==1.0.0
python3-xlib==0.15
pytweening==1.0.7
pytz==2024.1
PyYAML==6.0.1
requests==2.31.0
rsa==4.9
rubicon-objc==0.4.7
scikit-image==0.22.0
scipy==1.12.0
seaborn==0.13.2
shapely==2.0.2
six==1.16.0
sniffio==1.3.0
sympy==1.12
thop==0.1.1.post2209072238
tifffile==2024.1.30
torch==2.2.0
torchvision==0.17.0
tqdm==4.66.1
typing_extensions==4.8.0
tzdata==2023.4
ultralytics==8.0.227
urllib3==2.0.7
wcwidth==0.2.9
yarl==1.9.4
zipp==3.17.0
