Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 26 additions & 2 deletions omnitool/gradio/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,11 @@
import requests
from requests.exceptions import RequestException
import base64
import glob

CONFIG_DIR = Path("~/.anthropic").expanduser()
API_KEY_FILE = CONFIG_DIR / "api_key"
IMG_DIR = "./tmp/outputs/"

INTRO_TEXT = '''
OmniParser lets you turn any vision-langauge model into an AI agent. We currently support **OpenAI (4o/o1/o3-mini), DeepSeek (R1), Qwen (2.5VL) or Anthropic Computer Use (Sonnet).**
Expand Down Expand Up @@ -192,8 +194,20 @@ def valid_params(user_input, state):

for server_name, url in [('Windows Host', 'localhost:5000'), ('OmniParser Server', args.omniparser_server_url)]:
try:
url = f'http://{url}/probe'
response = requests.get(url, timeout=3)
if server_name == "OmniParser Server":
try:
url_test = f'http://{url}/probe'
response = requests.get(url_test, timeout=3)
if response.status_code != 200:
errors.append(f"{server_name} is not responding")
except RequestException:
url_test = f'https://{url}/probe/'
response = requests.get(url_test, timeout=3)
if response.status_code != 200:
errors.append(f"{server_name} is not responding")
else:
url = f'http://{url}/probe'
response = requests.get(url, timeout=3)
if response.status_code != 200:
errors.append(f"{server_name} is not responding")
except RequestException as e:
Expand All @@ -202,6 +216,9 @@ def valid_params(user_input, state):
if not state["api_key"].strip():
errors.append("LLM API Key is not set")

if state["provider"] == "azure" and not state.get("azure_resource_name", "").strip():
errors.append("Azure Resource Name is required when using Azure OpenAI")

if not user_input:
errors.append("no computer use request provided")

Expand Down Expand Up @@ -405,7 +422,14 @@ def update_api_key(api_key_value, state):
state["api_key"] = api_key_value
state[f'{state["provider"]}_api_key'] = api_key_value

def clear_img_cache():
    """Delete cached screenshot images from ``IMG_DIR``.

    Removes every file in ``IMG_DIR`` whose name matches
    ``screenshot_*.png``. Other files in the directory are left alone.
    """
    # os.path.join keeps the pattern valid whether or not IMG_DIR ends
    # with a path separator.
    pattern = os.path.join(IMG_DIR, 'screenshot_*.png')
    for screenshot_path in glob.glob(pattern):
        try:
            os.remove(screenshot_path)
        except FileNotFoundError:
            # The file vanished between the glob and the removal; the
            # cache entry is already gone, so there is nothing left to do.
            continue

def clear_chat(state):
# Clear images in the cache
clear_img_cache()
# Reset message-related state
state["messages"] = []
state["responses"] = {}
Expand Down
16 changes: 15 additions & 1 deletion omnitool/gradio/loop.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
from agent.vlm_agent import VLMAgent
from agent.vlm_agent_with_orchestrator import VLMOrchestratedAgent
from executor.anthropic_executor import AnthropicExecutor
import requests
from requests.exceptions import RequestException

BETA_FLAG = "computer-use-2024-10-22"

Expand Down Expand Up @@ -55,7 +57,19 @@ def sampling_loop_sync(
Synchronous agentic sampling loop for the assistant/tool interaction of computer use.
"""
print('in sampling_loop_sync, model:', model)
omniparser_client = OmniParserClient(url=f"http://{omniparser_url}/parse/")

# Try https with a trailing slash if http fails
url = omniparser_url
try:
url_test1 = f'http://{url}/parse'
requests.get(url_test1, timeout=3)
url = url_test1
except RequestException:
url_test2 = f'https://{url}/parse/'
requests.get(url_test2, timeout=3)
url = url_test2

omniparser_client = OmniParserClient(url=url)
if model == "claude-3-5-sonnet-20241022":
# Register Actor and Executor
actor = AnthropicActor(
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ easyocr
torchvision
supervision==0.18.0
openai==1.3.5
transformers
transformers==4.45.0
ultralytics==8.3.70
azure-identity
numpy==1.26.4
Expand Down