diff --git a/omnitool/gradio/app.py b/omnitool/gradio/app.py index 54cca8a0..4b85b4cc 100644 --- a/omnitool/gradio/app.py +++ b/omnitool/gradio/app.py @@ -22,9 +22,11 @@ import requests from requests.exceptions import RequestException import base64 +import glob CONFIG_DIR = Path("~/.anthropic").expanduser() API_KEY_FILE = CONFIG_DIR / "api_key" +IMG_DIR = "./tmp/outputs/" INTRO_TEXT = ''' OmniParser lets you turn any vision-langauge model into an AI agent. We currently support **OpenAI (4o/o1/o3-mini), DeepSeek (R1), Qwen (2.5VL) or Anthropic Computer Use (Sonnet).** @@ -192,8 +194,20 @@ def valid_params(user_input, state): for server_name, url in [('Windows Host', 'localhost:5000'), ('OmniParser Server', args.omniparser_server_url)]: try: - url = f'http://{url}/probe' - response = requests.get(url, timeout=3) + if server_name == "OmniParser Server": + try: + url_test = f'http://{url}/probe' + response = requests.get(url_test, timeout=3) + if response.status_code != 200: + errors.append(f"{server_name} is not responding") + except RequestException: + url_test = f'https://{url}/probe/' + response = requests.get(url_test, timeout=3) + if response.status_code != 200: + errors.append(f"{server_name} is not responding") + else: + url = f'http://{url}/probe' + response = requests.get(url, timeout=3) if response.status_code != 200: errors.append(f"{server_name} is not responding") except RequestException as e: @@ -202,6 +216,9 @@ def valid_params(user_input, state): if not state["api_key"].strip(): errors.append("LLM API Key is not set") + if state["provider"] == "azure" and not state.get("azure_resource_name", "").strip(): + errors.append("Azure Resource Name is required when using Azure OpenAI") + if not user_input: errors.append("no computer use request provided") @@ -405,7 +422,14 @@ def update_api_key(api_key_value, state): state["api_key"] = api_key_value state[f'{state["provider"]}_api_key'] = api_key_value + def clear_img_cache(): + files = glob.glob(IMG_DIR+'screenshot_*.png') + for f in files: + os.remove(f) + def clear_chat(state): + # Clear images in the cache + clear_img_cache() # Reset message-related state state["messages"] = [] state["responses"] = {} diff --git a/omnitool/gradio/loop.py b/omnitool/gradio/loop.py index 9ce63169..4edafea4 100644 --- a/omnitool/gradio/loop.py +++ b/omnitool/gradio/loop.py @@ -20,6 +20,8 @@ from agent.vlm_agent import VLMAgent from agent.vlm_agent_with_orchestrator import VLMOrchestratedAgent from executor.anthropic_executor import AnthropicExecutor +import requests +from requests.exceptions import RequestException BETA_FLAG = "computer-use-2024-10-22" @@ -55,7 +57,19 @@ def sampling_loop_sync( Synchronous agentic sampling loop for the assistant/tool interaction of computer use. """ print('in sampling_loop_sync, model:', model) - omniparser_client = OmniParserClient(url=f"http://{omniparser_url}/parse/") + + # Try https with a trailing slash if http fails + url = omniparser_url + try: + url_test1 = f'http://{url}/parse' + requests.get(url_test1, timeout=3) + url = url_test1 + except RequestException: + url_test2 = f'https://{url}/parse/' + requests.get(url_test2, timeout=3) + url = url_test2 + + omniparser_client = OmniParserClient(url=url) if model == "claude-3-5-sonnet-20241022": # Register Actor and Executor actor = AnthropicActor( diff --git a/requirements.txt b/requirements.txt index 901a27fa..95cfb08f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,7 @@ easyocr torchvision supervision==0.18.0 openai==1.3.5 -transformers +transformers==4.45.0 ultralytics==8.3.70 azure-identity numpy==1.26.4