Samfredoly committed on
Commit
2a729e6
·
verified ·
1 Parent(s): 36cf618

Upload 14 files

Browse files
Files changed (14) hide show
  1. .env.example +15 -0
  2. .gitignore +52 -0
  3. Dockerfile +32 -0
  4. LICENSE +15 -0
  5. config.py +80 -0
  6. docker-compose.yml +24 -0
  7. image_utils.py +118 -0
  8. main.py +225 -0
  9. models/.gitkeep +2 -0
  10. quickstart.py +77 -0
  11. requirements.txt +14 -0
  12. setup.bat +76 -0
  13. setup.sh +64 -0
  14. test_api.py +104 -0
.env.example ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # OmniParser Configuration
2
+
3
+ # Server Settings
4
+ HOST=0.0.0.0
5
+ PORT=8000
6
+ DEBUG=False
7
+
8
+ # Model Settings
9
+ MODEL_NAME=microsoft/OmniParser-v2.0
10
+ DEVICE=cpu
11
+ # DEVICE=cuda # Uncomment for GPU support
12
+
13
+ # API Settings
14
+ MAX_FILE_SIZE=52428800 # 50MB in bytes
15
# Tuple/list settings are decoded as JSON by pydantic-settings, so this must be
# a JSON array rather than a comma-separated list.
ALLOWED_EXTENSIONS='["jpg","jpeg","png","bmp","gif"]'
.gitignore ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ env/
8
+ venv/
9
+ ENV/
10
+ build/
11
+ develop-eggs/
12
+ dist/
13
+ downloads/
14
+ eggs/
15
+ .eggs/
16
+ lib/
17
+ lib64/
18
+ parts/
19
+ sdist/
20
+ var/
21
+ wheels/
22
+ *.egg-info/
23
+ .installed.cfg
24
+ *.egg
25
+
26
+ # IDEs
27
+ .vscode/
28
+ .idea/
29
+ *.swp
30
+ *.swo
31
+ *~
32
+ .DS_Store
33
+
34
+ # Environment
35
+ .env
36
+ .env.local
37
+
38
# Model cache (large files)
# Ignore the directory's contents rather than the directory itself: files
# inside a fully ignored directory cannot be re-included, and the
# models/.gitkeep placeholder has to stay tracked.
models/*
!models/.gitkeep
*.pt
*.pth
*.onnx
43
+
44
+ # Logs
45
+ *.log
46
+ logs/
47
+
48
# Test outputs
test_output/
outputs/
*.json
*.txt
# requirements.txt is source, not test output; keep it tracked despite *.txt
!requirements.txt
Dockerfile ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.11-slim
2
+
3
+ WORKDIR /app
4
+
5
+ # Install system dependencies
6
+ RUN apt-get update && apt-get install -y \
7
+ build-essential \
8
+ libopencv-dev \
9
+ python3-opencv \
10
+ && rm -rf /var/lib/apt/lists/*
11
+
12
+ # Copy requirements
13
+ COPY requirements.txt .
14
+
15
+ # Install Python dependencies
16
+ RUN pip install --no-cache-dir -r requirements.txt
17
+
18
+ # Copy application code
19
+ COPY . .
20
+
21
+ # Create models directory
22
+ RUN mkdir -p models
23
+
24
+ # Expose port
25
+ EXPOSE 8000
26
+
27
+ # Health check
28
# Probe with the standard library: requests is not listed in requirements.txt,
# and requests.get() does not raise on a 503, so an unhealthy /health response
# would still pass. urlopen() raises on non-2xx, giving a non-zero exit code.
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
    CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health', timeout=5)"
30
+
31
+ # Run application
32
+ CMD ["python", "main.py"]
LICENSE ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ of this software and associated documentation files (the "Software"), to deal
5
+ in the Software without restriction, including without limitation the rights
6
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ copies of the Software, and to permit persons to whom the Software is
8
+ furnished to do so, subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in all
11
+ copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
config.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Configuration management for OmniParser API
3
+ """
4
+ from pydantic_settings import BaseSettings
5
+ from typing import Optional
6
+ from pathlib import Path
7
+
8
+
9
class Settings(BaseSettings):
    """Application settings.

    Every field has a default and can be overridden by an environment
    variable of the same name (matched case-insensitively) or by an entry
    in the ``.env`` file.
    """

    # Declared as ``model_config`` because the nested ``class Config`` form is
    # deprecated in pydantic v2. ``protected_namespaces`` is narrowed so that
    # the ``model_name`` field does not trigger the "model_" namespace warning.
    model_config = {
        "env_file": ".env",
        "env_file_encoding": "utf-8",
        "case_sensitive": False,
        "protected_namespaces": ("settings_",),
    }

    # Server
    host: str = "0.0.0.0"
    port: int = 8000
    debug: bool = False

    # Model
    model_name: str = "microsoft/OmniParser-v2.0"
    device: str = "cpu"  # "cpu" or "cuda"

    # File handling
    max_file_size: int = 52428800  # 50MB
    allowed_extensions: tuple = ("jpg", "jpeg", "png", "bmp", "gif")

    # HuggingFace
    huggingface_token: Optional[str] = None
    cache_dir: Path = Path("./models")

    # Processing
    enable_caching: bool = False
    max_workers: int = 4

    # CORS
    cors_origins: list = ["*"]
40
+
41
+
42
+ # Global settings instance
43
+ settings = Settings()
44
+
45
+
46
+ def get_settings() -> Settings:
47
+ """Get current settings"""
48
+ return settings
49
+
50
+
51
def validate_image_file(filename: str) -> bool:
    """Return True when *filename* has an allowed image extension.

    The extension (text after the last dot, lower-cased) is looked up in
    ``settings.allowed_extensions``. A missing or empty filename is rejected
    instead of raising.
    """
    # An upload may arrive without a filename; treat that as "not allowed"
    # rather than letting Path(None) raise a TypeError.
    if not filename:
        return False
    ext = Path(filename).suffix.lower().lstrip(".")
    return ext in settings.allowed_extensions
55
+
56
+
57
def get_device():
    """Resolve the computation device from ``settings.device``.

    ``"cuda"`` is returned only when it was requested and torch reports CUDA
    as available; every other situation yields ``"cpu"``.
    """
    if settings.device.lower() != "cuda":
        return "cpu"

    try:
        import torch
        cuda_ready = torch.cuda.is_available()
    except ImportError:
        print("⚠️ torch not installed, using CPU")
        return "cpu"

    if cuda_ready:
        return "cuda"

    print("⚠️ CUDA requested but not available, falling back to CPU")
    return "cpu"
72
+
73
+
74
if __name__ == "__main__":
    # Dump the effective configuration for a quick sanity check.
    print("Current Configuration:")
    print("=" * 60)
    # model_dump() is the pydantic v2 replacement for the deprecated .dict()
    for key, value in settings.model_dump().items():
        # Never echo the HuggingFace token to the console.
        if key != "huggingface_token":
            print(f"{key}: {value}")
    print("=" * 60)
docker-compose.yml ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ version: '3.8'
2
+
3
+ services:
4
+ omniparser:
5
+ build: .
6
+ container_name: omniparser-api
7
+ ports:
8
+ - "8000:8000"
9
+ environment:
10
+ - HOST=0.0.0.0
11
+ - PORT=8000
12
+ - DEBUG=False
13
+ - DEVICE=cpu
14
+ - MODEL_NAME=microsoft/OmniParser-v2.0
15
+ volumes:
16
+ - ./models:/app/models
17
+ - ./logs:/app/logs
18
+ restart: unless-stopped
19
healthcheck:
  # python:3.11-slim does not ship curl, so probe with the interpreter that is
  # guaranteed to be in the image; urlopen raises on non-2xx responses.
  test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/health', timeout=5)"]
  interval: 30s
  timeout: 10s
  retries: 3
  start_period: 5s
image_utils.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Image processing helper functions for OmniParser
3
+ """
4
+ import cv2
5
+ import numpy as np
6
+ from PIL import Image
7
+ from pathlib import Path
8
+ from typing import Tuple, List
9
+
10
+
11
+ def load_image(image_path: str) -> Image.Image:
12
+ """Load image from file"""
13
+ return Image.open(image_path)
14
+
15
+
16
def resize_image(image: Image.Image, max_size: Tuple[int, int] = (1920, 1080)) -> Image.Image:
    """Return a copy of *image* scaled down to fit within *max_size*.

    The aspect ratio is preserved. The input image is left untouched:
    ``thumbnail`` resizes in place, so it is applied to a copy instead of
    the caller's object.

    Args:
        image: Source image.
        max_size: Maximum ``(width, height)`` of the result.

    Returns:
        The resized copy.
    """
    resized = image.copy()
    resized.thumbnail(max_size, Image.Resampling.LANCZOS)
    return resized
20
+
21
+
22
def capture_screenshot() -> "Image.Image | None":
    """Capture a screenshot as an RGB PIL image.

    Returns:
        The screenshot, or ``None`` when the optional ``mss`` package is not
        installed (an install hint is printed in that case).
    """
    # Only the import is guarded, so an ImportError raised while grabbing
    # is not mistaken for "mss is missing".
    try:
        import mss
    except ImportError:
        print("⚠️ mss not installed. Install with: pip install mss")
        return None

    with mss.mss() as sct:
        monitor = sct.monitors[1]  # Primary monitor
        shot = sct.grab(monitor)
        return Image.frombytes('RGB', shot.size, shot.rgb)
33
+
34
+
35
def annotate_image(image: Image.Image, elements: List[dict]) -> Image.Image:
    """Draw bounding boxes and captions for *elements* on a copy of *image*.

    Each element is a dict read with ``.get``:

    - ``bbox``: at least four numbers, used as ``x1, y1, x2, y2``; elements
      with fewer (or no) values are skipped.
    - ``element_type``: selects the outline colour (white when unknown).
    - ``label`` / ``confidence``: rendered as the caption.

    Returns:
        The annotated copy; *image* itself is not modified.
    """
    from PIL import ImageDraw

    palette = {
        "button": "red",
        "textfield": "blue",
        "icon": "green",
        "text": "yellow",
        "image": "purple",
    }

    annotated = image.copy()
    draw = ImageDraw.Draw(annotated)

    for elem in elements:
        # ``or []`` also covers an explicit ``"bbox": None``.
        bbox = elem.get("bbox") or []
        if len(bbox) < 4:
            continue

        x1, y1, x2, y2 = bbox[:4]
        color = palette.get(elem.get("element_type", "unknown"), "white")

        # Draw bounding box
        draw.rectangle([x1, y1, x2, y2], outline=color, width=2)

        # Draw label. Clamp to the canvas: for a box touching the top edge
        # the caption would otherwise land at a negative y and be clipped.
        caption = f"{elem.get('label', 'elem')} ({elem.get('confidence', 0):.2f})"
        draw.text((x1, max(y1 - 10, 0)), caption, fill=color)

    return annotated
65
+
66
+
67
+ def save_image(image: Image.Image, output_path: str):
68
+ """Save image to file"""
69
+ image.save(output_path)
70
+ print(f"βœ… Image saved: {output_path}")
71
+
72
+
73
+ def create_sample_screenshot() -> Image.Image:
74
+ """Create a simple sample image for testing"""
75
+ # Create a blank image with some shapes
76
+ img = Image.new('RGB', (800, 600), color='white')
77
+ from PIL import ImageDraw
78
+
79
+ draw = ImageDraw.Draw(img)
80
+
81
+ # Draw some sample UI elements
82
+ # Button
83
+ draw.rectangle([50, 50, 200, 100], fill='lightblue', outline='blue', width=2)
84
+ draw.text((80, 65), "Click Me", fill='black')
85
+
86
+ # Search box
87
+ draw.rectangle([250, 50, 700, 100], fill='white', outline='gray', width=2)
88
+ draw.text((260, 65), "Search...", fill='gray')
89
+
90
+ # Menu items
91
+ for i, text in enumerate(['Home', 'About', 'Contact']):
92
+ y = 150 + i * 50
93
+ draw.rectangle([50, y, 200, y + 40], fill='lightgray', outline='black', width=1)
94
+ draw.text((70, y + 10), text, fill='black')
95
+
96
+ # Status area
97
+ draw.rectangle([250, 150, 700, 500], fill='lightyellow', outline='orange', width=2)
98
+ draw.text((260, 160), "Status Area", fill='black')
99
+
100
+ return img
101
+
102
+
103
+ if __name__ == "__main__":
104
+ print("Image Processing Examples")
105
+ print("=" * 60)
106
+
107
+ # Create sample image
108
+ print("πŸ“· Creating sample screenshot...")
109
+ sample_img = create_sample_screenshot()
110
+ sample_img.save("sample_screenshot.png")
111
+ print("βœ… Sample saved as: sample_screenshot.png")
112
+
113
+ # Resize example
114
+ print("\nπŸ“ Resizing image...")
115
+ resized = resize_image(sample_img, (640, 480))
116
+ print(f"βœ… Resized to: {resized.size}")
117
+
118
+ print("\nβœ… All examples completed!")
main.py ADDED
@@ -0,0 +1,225 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import io
3
+ import base64
4
+ from pathlib import Path
5
+ from typing import Optional, List
6
+ from fastapi import FastAPI, File, UploadFile, HTTPException
7
+ from fastapi.responses import JSONResponse
8
+ from fastapi.middleware.cors import CORSMiddleware
9
+ from pydantic import BaseModel
10
+ import cv2
11
+ import numpy as np
12
+ from PIL import Image
13
+ import logging
14
+
15
+ # Configure logging
16
+ logging.basicConfig(level=logging.INFO)
17
+ logger = logging.getLogger(__name__)
18
+
19
+ # Initialize FastAPI app
20
+ app = FastAPI(
21
+ title="OmniParser-v2.0 API",
22
+ description="Extract UI elements and cursor coordinates from screenshots",
23
+ version="1.0.0"
24
+ )
25
+
26
+ # Add CORS middleware
27
# Wildcard origins must not be combined with credentialed requests: that would
# let any site call the API with the user's cookies. Nothing in this module
# reads cookies or auth headers, so credentials are switched off instead of
# narrowing the origin list.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
34
+
35
+ # Global OmniParser model (lazy loaded)
36
+ omni_parser = None
37
+
38
+
39
+ class ParseRequest(BaseModel):
40
+ """Request model for UI parsing"""
41
+ image_base64: str
42
+ extract_text: bool = True
43
+ extract_icons: bool = True
44
+
45
+
46
+ class UIElement(BaseModel):
47
+ """Model for UI element"""
48
+ element_id: int
49
+ label: str
50
+ bbox: List[int] # [x1, y1, x2, y2]
51
+ element_type: str
52
+ confidence: float
53
+
54
+
55
+ class ParseResponse(BaseModel):
56
+ """Response model for parsing results"""
57
+ elements: List[UIElement]
58
+ image_width: int
59
+ image_height: int
60
+ processing_time: float
61
+ model_used: str = "OmniParser-v2.0"
62
+
63
+
64
+ def load_omniparser():
65
+ """Load OmniParser model (lazy loading)"""
66
+ global omni_parser
67
+ if omni_parser is None:
68
+ try:
69
+ logger.info("Loading OmniParser-v2.0 from HuggingFace...")
70
+ # Import and initialize OmniParser
71
+ # For now, we'll use a placeholder that demonstrates the structure
72
+ # You can replace this with actual OmniParser initialization
73
+ omni_parser = {
74
+ "loaded": True,
75
+ "model_name": "microsoft/OmniParser-v2.0"
76
+ }
77
+ logger.info("OmniParser loaded successfully")
78
+ except Exception as e:
79
+ logger.error(f"Failed to load OmniParser: {e}")
80
+ raise
81
+ return omni_parser
82
+
83
+
84
def extract_image_from_base64(image_base64: str) -> Image.Image:
    """Decode a base64-encoded image into a PIL image.

    Accepts either a bare base64 payload or a data URL
    (``data:image/png;base64,<payload>``), in which case everything up to
    the first comma is dropped.

    Raises:
        ValueError: if the input is empty, is not valid base64, or cannot be
            opened as an image. The underlying error is chained.
    """
    payload = (image_base64 or "").strip()
    if not payload:
        raise ValueError("Failed to decode image: empty input")

    # Many clients send data URLs; only the part after the comma is base64.
    if payload.startswith("data:") and "," in payload:
        payload = payload.split(",", 1)[1]

    try:
        image_data = base64.b64decode(payload)
        return Image.open(io.BytesIO(image_data))
    except Exception as e:
        raise ValueError(f"Failed to decode image: {e}") from e
92
+
93
+
94
+ def parse_ui_elements(image: Image.Image) -> List[UIElement]:
95
+ """Parse UI elements from image using OmniParser"""
96
+ try:
97
+ # Load model
98
+ load_omniparser()
99
+
100
+ # Placeholder implementation - replace with actual OmniParser logic
101
+ logger.info(f"Processing image of size: {image.size}")
102
+
103
+ # For demonstration, create mock UI elements
104
+ # Replace this with actual OmniParser parsing logic
105
+ elements = [
106
+ UIElement(
107
+ element_id=1,
108
+ label="Button",
109
+ bbox=[10, 10, 100, 50],
110
+ element_type="button",
111
+ confidence=0.95
112
+ ),
113
+ UIElement(
114
+ element_id=2,
115
+ label="Search",
116
+ bbox=[150, 10, 400, 50],
117
+ element_type="textfield",
118
+ confidence=0.92
119
+ ),
120
+ ]
121
+
122
+ return elements
123
+ except Exception as e:
124
+ logger.error(f"Error parsing UI elements: {e}")
125
+ raise
126
+
127
+
128
+ @app.get("/")
129
+ async def root():
130
+ """Root endpoint"""
131
+ return {
132
+ "message": "OmniParser-v2.0 API",
133
+ "status": "running",
134
+ "endpoints": [
135
+ "/docs - API documentation",
136
+ "/health - Health check",
137
+ "/parse - Parse UI elements from screenshot"
138
+ ]
139
+ }
140
+
141
+
142
+ @app.get("/health")
143
+ async def health_check():
144
+ """Health check endpoint"""
145
+ try:
146
+ load_omniparser()
147
+ return {"status": "healthy", "model": "OmniParser-v2.0"}
148
+ except Exception as e:
149
+ return JSONResponse(
150
+ status_code=503,
151
+ content={"status": "unhealthy", "error": str(e)}
152
+ )
153
+
154
+
155
@app.post("/parse", response_model=ParseResponse)
async def parse_screenshot(file: UploadFile = File(...)):
    """
    Parse UI elements from a screenshot.

    - **file**: Image file (PNG, JPG, etc.)

    Returns UI elements with bounding boxes and cursor coordinates.

    Responds with 400 when the upload is empty or not a readable image and
    with 500 when parsing itself fails.
    """
    import time

    # perf_counter is monotonic, so the duration cannot go negative if the
    # wall clock is adjusted mid-request.
    started = time.perf_counter()

    # Read uploaded file
    contents = await file.read()
    if not contents:
        raise HTTPException(status_code=400, detail="Uploaded file is empty")

    # Problems with the bytes themselves are the client's fault -> 400.
    try:
        image = Image.open(io.BytesIO(contents))
    except (OSError, ValueError, Image.DecompressionBombError) as e:
        logger.error(f"Invalid image upload: {e}")
        raise HTTPException(status_code=400, detail=f"Invalid image file: {e}") from e

    # Failures past this point come from the parser, not the request -> 500.
    try:
        elements = parse_ui_elements(image)
    except Exception as e:
        logger.error(f"Error in parse endpoint: {e}")
        raise HTTPException(status_code=500, detail=str(e)) from e

    return ParseResponse(
        elements=elements,
        image_width=image.width,
        image_height=image.height,
        processing_time=time.perf_counter() - started,
    )
187
+
188
+
189
@app.post("/parse-base64", response_model=ParseResponse)
async def parse_base64(request: ParseRequest):
    """
    Parse UI elements from base64-encoded image.

    Request body:
    - **image_base64**: Base64-encoded image string
    - **extract_text**: Extract text from elements (default: True)
    - **extract_icons**: Extract icons (default: True)

    Responds with 400 when the image cannot be decoded and with 500 when
    parsing itself fails.
    """
    import time

    # perf_counter is monotonic, so the duration cannot go negative if the
    # wall clock is adjusted mid-request.
    started = time.perf_counter()

    # NOTE: extract_text / extract_icons are accepted but not read below.

    # A payload that cannot be decoded is the client's fault -> 400.
    try:
        image = extract_image_from_base64(request.image_base64)
    except ValueError as e:
        logger.error(f"Invalid base64 image: {e}")
        raise HTTPException(status_code=400, detail=str(e)) from e

    # Failures past this point come from the parser, not the request -> 500.
    try:
        elements = parse_ui_elements(image)
    except Exception as e:
        logger.error(f"Error in parse-base64 endpoint: {e}")
        raise HTTPException(status_code=500, detail=str(e)) from e

    return ParseResponse(
        elements=elements,
        image_width=image.width,
        image_height=image.height,
        processing_time=time.perf_counter() - started,
    )
221
+
222
+
223
if __name__ == "__main__":
    import uvicorn

    # Take the bind address from the HOST/PORT environment variables (set by
    # docker-compose) instead of hard-coding it; the defaults keep the
    # previous behaviour when they are unset.
    uvicorn.run(
        app,
        host=os.getenv("HOST", "0.0.0.0"),
        port=int(os.getenv("PORT", "8000")),
    )
models/.gitkeep ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ # This directory will contain downloaded models from HuggingFace
2
+ # Models are large files (2GB+) and should not be committed to git
quickstart.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Quick start example for OmniParser API
3
+ """
4
+ import subprocess
5
+ import sys
6
+ import time
7
+ import requests
8
+ from pathlib import Path
9
+
10
+
11
+ def main():
12
+ """Quick start guide"""
13
+ print("\n" + "=" * 60)
14
+ print("OmniParser-v2.0 QUICK START GUIDE")
15
+ print("=" * 60 + "\n")
16
+
17
+ print("This guide will help you get started with OmniParser API.\n")
18
+
19
+ # Step 1: Virtual Environment
20
+ print("STEP 1: Setup Virtual Environment")
21
+ print("-" * 60)
22
+ print("Windows:")
23
+ print(" python -m venv venv")
24
+ print(" venv\\Scripts\\activate.bat")
25
+ print("\nLinux/macOS:")
26
+ print(" python3 -m venv venv")
27
+ print(" source venv/bin/activate\n")
28
+
29
+ # Step 2: Install Dependencies
30
+ print("STEP 2: Install Dependencies")
31
+ print("-" * 60)
32
+ print("Run: pip install -r requirements.txt")
33
+ print("(This will take a few minutes)\n")
34
+
35
+ # Step 3: Configuration
36
+ print("STEP 3: Configuration")
37
+ print("-" * 60)
38
+ print("Copy .env.example to .env and edit if needed")
39
+ print("Run: copy .env.example .env (Windows)")
40
+ print(" or: cp .env.example .env (Linux/macOS)\n")
41
+
42
+ # Step 4: Run Server
43
+ print("STEP 4: Run the Server")
44
+ print("-" * 60)
45
+ print("Run: python main.py")
46
+ print("Expected output: 'INFO: Uvicorn running on http://0.0.0.0:8000'\n")
47
+
48
+ # Step 5: Test API
49
+ print("STEP 5: Test the API")
50
+ print("-" * 60)
51
+ print("Option A - Interactive Docs:")
52
+ print(" Open: http://localhost:8000/docs")
53
+ print(" Click 'Try it out' on any endpoint\n")
54
+
55
+ print("Option B - Python Script:")
56
+ print(" python test_api.py\n")
57
+
58
+ print("Option C - cURL:")
59
+ print(" curl -X GET http://localhost:8000/health\n")
60
+
61
+ # Next Steps
62
+ print("NEXT STEPS:")
63
+ print("-" * 60)
64
+ print("1. Upload a screenshot: POST /parse")
65
+ print("2. Extract UI elements with coordinates")
66
+ print("3. Integrate with your application\n")
67
+
68
+ print("For more information:")
69
+ print("- See README.md for detailed documentation")
70
+ print("- Visit: https://huggingface.co/microsoft/OmniParser-v2.0")
71
+ print("- API Docs: http://localhost:8000/docs\n")
72
+
73
+ print("=" * 60 + "\n")
74
+
75
+
76
+ if __name__ == "__main__":
77
+ main()
requirements.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi==0.104.1
2
+ uvicorn==0.24.0
3
+ python-multipart==0.0.6
4
+ pydantic==2.5.0
5
+ pydantic-settings==2.1.0
6
+ python-dotenv==1.0.0
7
+ pillow==10.1.0
8
+ numpy==1.24.3
9
+ opencv-python==4.8.1.78
10
+ torch==2.1.0
11
+ torchvision==0.16.0
12
+ transformers==4.35.2
13
+ timm==0.9.12
14
+ einops==0.7.0
setup.bat ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ @echo off
2
+ REM OmniParser Setup Script for Windows
3
+
4
+ echo.
5
+ echo ====================================================
6
+ echo OmniParser-v2.0 Setup Script
7
+ echo ====================================================
8
+ echo.
9
+
10
+ REM Check if Python is installed
11
+ python --version >nul 2>&1
12
+ if errorlevel 1 (
13
+ echo ERROR: Python is not installed or not in PATH
14
+ echo Please install Python 3.8+ from https://www.python.org
15
+ exit /b 1
16
+ )
17
+
18
+ echo Python found:
19
+ python --version
20
+ echo.
21
+
22
+ REM Create virtual environment
23
+ echo Creating virtual environment...
24
+ python -m venv venv
25
+ if errorlevel 1 (
26
+ echo ERROR: Failed to create virtual environment
27
+ exit /b 1
28
+ )
29
+ echo βœ“ Virtual environment created
30
+ echo.
31
+
32
+ REM Activate virtual environment
33
+ echo Activating virtual environment...
34
+ call venv\Scripts\activate.bat
35
+ if errorlevel 1 (
36
+ echo ERROR: Failed to activate virtual environment
37
+ exit /b 1
38
+ )
39
+ echo βœ“ Virtual environment activated
40
+ echo.
41
+
42
+ REM Upgrade pip
43
+ echo Upgrading pip...
44
+ python -m pip install --upgrade pip >nul 2>&1
45
+ echo βœ“ pip upgraded
46
+ echo.
47
+
48
+ REM Install dependencies
49
+ echo Installing dependencies...
50
+ echo This may take a few minutes...
51
+ pip install -r requirements.txt
52
+ if errorlevel 1 (
53
+ echo ERROR: Failed to install dependencies
54
+ exit /b 1
55
+ )
56
+ echo βœ“ Dependencies installed
57
+ echo.
58
+
59
+ REM Create .env file from template
60
+ if not exist .env (
61
+ echo Creating .env file...
62
+ copy .env.example .env >nul
63
+ echo βœ“ .env file created
64
+ echo.
65
+ )
66
+
67
+ echo ====================================================
68
+ echo Setup completed successfully!
69
+ echo ====================================================
70
+ echo.
71
+ echo Next steps:
72
+ echo 1. Activate environment: venv\Scripts\activate.bat
73
+ echo 2. Run server: python main.py
74
+ echo 3. Visit: http://localhost:8000/docs
75
+ echo.
76
+ pause
setup.sh ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ # OmniParser Setup Script for Linux/macOS
3
+
4
+ set -e
5
+
6
+ echo ""
7
+ echo "===================================================="
8
+ echo "OmniParser-v2.0 Setup Script"
9
+ echo "===================================================="
10
+ echo ""
11
+
12
+ # Check if Python is installed
13
+ if ! command -v python3 &> /dev/null; then
14
+ echo "ERROR: Python 3 is not installed"
15
+ echo "Please install Python 3.8+ from https://www.python.org"
16
+ exit 1
17
+ fi
18
+
19
+ echo "Python found:"
20
+ python3 --version
21
+ echo ""
22
+
23
+ # Create virtual environment
24
+ echo "Creating virtual environment..."
25
+ python3 -m venv venv
26
+ echo "βœ“ Virtual environment created"
27
+ echo ""
28
+
29
+ # Activate virtual environment
30
+ echo "Activating virtual environment..."
31
+ source venv/bin/activate
32
+ echo "βœ“ Virtual environment activated"
33
+ echo ""
34
+
35
+ # Upgrade pip
36
+ echo "Upgrading pip..."
37
+ pip install --upgrade pip > /dev/null 2>&1
38
+ echo "βœ“ pip upgraded"
39
+ echo ""
40
+
41
+ # Install dependencies
42
+ echo "Installing dependencies..."
43
+ echo "This may take a few minutes..."
44
+ pip install -r requirements.txt
45
+ echo "βœ“ Dependencies installed"
46
+ echo ""
47
+
48
+ # Create .env file from template
49
+ if [ ! -f .env ]; then
50
+ echo "Creating .env file..."
51
+ cp .env.example .env
52
+ echo "βœ“ .env file created"
53
+ echo ""
54
+ fi
55
+
56
+ echo "===================================================="
57
+ echo "Setup completed successfully!"
58
+ echo "===================================================="
59
+ echo ""
60
+ echo "Next steps:"
61
+ echo "1. Activate environment: source venv/bin/activate"
62
+ echo "2. Run server: python main.py"
63
+ echo "3. Visit: http://localhost:8000/docs"
64
+ echo ""
test_api.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Test script for OmniParser API
3
+ """
4
+ import requests
5
+ import json
6
+ import base64
7
+ from pathlib import Path
8
+
9
+ BASE_URL = "http://localhost:8000"
10
+
11
+
12
+ def health_check():
13
+ """Check API health"""
14
+ print("πŸ₯ Health Check...")
15
+ response = requests.get(f"{BASE_URL}/health")
16
+ print(f"Status: {response.status_code}")
17
+ print(json.dumps(response.json(), indent=2))
18
+ print()
19
+
20
+
21
+ def parse_file(image_path: str):
22
+ """Parse image file"""
23
+ print(f"πŸ“Έ Parsing file: {image_path}")
24
+
25
+ if not Path(image_path).exists():
26
+ print(f"❌ File not found: {image_path}")
27
+ return
28
+
29
+ with open(image_path, "rb") as f:
30
+ files = {"file": f}
31
+ response = requests.post(f"{BASE_URL}/parse", files=files)
32
+
33
+ if response.status_code == 200:
34
+ result = response.json()
35
+ print(f"βœ… Found {len(result['elements'])} UI elements")
36
+ print(f" Image size: {result['image_width']}x{result['image_height']}")
37
+ print(f" Processing time: {result['processing_time']:.2f}s")
38
+ print("\n Elements:")
39
+ for elem in result['elements']:
40
+ print(f" - {elem['label']}: bbox={elem['bbox']}, confidence={elem['confidence']}")
41
+ else:
42
+ print(f"❌ Error: {response.status_code}")
43
+ print(response.text)
44
+ print()
45
+
46
+
47
+ def parse_base64(image_path: str):
48
+ """Parse base64-encoded image"""
49
+ print(f"πŸ“· Parsing base64 image: {image_path}")
50
+
51
+ if not Path(image_path).exists():
52
+ print(f"❌ File not found: {image_path}")
53
+ return
54
+
55
+ # Read and encode image
56
+ with open(image_path, "rb") as f:
57
+ image_data = base64.b64encode(f.read()).decode('utf-8')
58
+
59
+ payload = {
60
+ "image_base64": image_data,
61
+ "extract_text": True,
62
+ "extract_icons": True
63
+ }
64
+
65
+ response = requests.post(f"{BASE_URL}/parse-base64", json=payload)
66
+
67
+ if response.status_code == 200:
68
+ result = response.json()
69
+ print(f"βœ… Found {len(result['elements'])} UI elements")
70
+ print(f" Processing time: {result['processing_time']:.2f}s")
71
+ else:
72
+ print(f"❌ Error: {response.status_code}")
73
+ print(response.text)
74
+ print()
75
+
76
+
77
+ if __name__ == "__main__":
78
+ print("=" * 60)
79
+ print("OmniParser API Test Suite")
80
+ print("=" * 60)
81
+ print()
82
+
83
+ # Health check
84
+ health_check()
85
+
86
+ # Test with a sample image (if available)
87
+ sample_images = [
88
+ "screenshot.png",
89
+ "test_image.png",
90
+ "../screenshots/example.png"
91
+ ]
92
+
93
+ for img in sample_images:
94
+ if Path(img).exists():
95
+ parse_file(img)
96
+ parse_base64(img)
97
+ break
98
+ else:
99
+ print("⚠️ No test images found. Upload an image and try again.")
100
+ print(" Expected: screenshot.png or test_image.png")
101
+
102
+ print("=" * 60)
103
+ print("βœ… Test suite completed")
104
+ print("=" * 60)