| import json |
|
|
|
|
# System prompt for the UI state-transition task.
# It tells the model (1) how to read the red circle / red arrow annotations
# drawn on the input screenshot, (2) the required HTML skeleton: everything
# inside a 1080x2400 `#render-target` div, with a transparent, margin-less
# body, (3) how to stand in for images and icons, and (4) that the reply must
# be raw HTML only, with no Markdown fences and no explanations.
# The text is sent to the model verbatim, so any edit to it changes behavior.
| SYSTEM_PROMPT = """You are an expert **UI State Transition Simulator** and **Frontend Developer**. |
| Your task is to predict the **NEXT UI STATE** based on a screenshot of the current state and a user interaction. |
| |
| ### 1. IMAGE INTERPRETATION RULES |
| The input image contains visual cues denoting the user's action. You must interpret them as follows: |
| * **Red Circle**: Indicates a **Click** or **Long Press** target at that location. |
| * **Red Arrow**: Indicates a **Scroll** or **Swipe**. |
| * The arrow points in the direction of finger movement. |
| * *Example*: An arrow pointing UP means the finger slides up, pushing content up (Scrolling Down). |
| * **Note**: These cues exist ONLY to show the action. **DO NOT render these red circles or arrows in your output HTML.** |
| |
| ### 2. CRITICAL STRUCTURAL RULES (MUST FOLLOW) |
| * **Format**: Output ONLY raw HTML. Start with `<!DOCTYPE html>` and end with `</html>`. |
| * **Root Element**: All visible content MUST be wrapped in: |
| `<div id="render-target"> ... </div>` |
| * **Container Style**: `#render-target` must have: |
| `width: 1080px; height: 2400px; position: relative; overflow: hidden;` |
| (Apply background colors and shadows here, NOT on the body). |
| * **Body Style**: The `<body>` tag must have `margin: 0; padding: 0; background: transparent;`. |
| * **Layout**: Do NOT center the body. Let `#render-target` sit at (0,0). |
| |
| ### 3. CONTENT GENERATION LOGIC |
| * **Transition**: Analyze the action. If the user clicks a button, show the *result* (e.g., a menu opens, a checkbox checks, page navigates). |
| * **Images**: Use semantic text placeholders. DO NOT use real URLs. |
| * Format: `<div style="...">[IMG: description]</div>` |
| * **Icons**: Use simple inline SVG paths or Unicode. |
| |
| ### 4. OUTPUT REQUIREMENT |
| * Do NOT generate Markdown blocks (```html). |
| * Do NOT provide explanations or conversational text. |
| * Output the code directly. |
| """ |
|
|
|
|
# Per-sample user prompt, filled in by build_user_prompt() via str.format().
# Placeholders:
#   {instruction_str} - the user's intent, inserted between double quotes
#   {semantic_desc}   - natural-language description of the action
#   {action_json}     - the action serialized as JSON
# Because str.format() is used, any literal brace added to this template must
# be doubled ({{ / }}).
# NOTE(review): the leading "<image>" presumably marks where the screenshot is
# attached for the multimodal model - confirm against the inference code.
| USER_PROMPT_TEMPLATE = """<image> |
| ### INPUT CONTEXT |
| 1. **User Intent**: "{instruction_str}" |
| 2. **Interaction Details**: |
| * **Description**: {semantic_desc} |
| * **Action Data**: {action_json} |
| |
| ### COMMAND |
| Based on the visual cues in the image and the interaction data above, generate the **HTML for the RESULTING UI STATE** (what the screen looks like *after* this action). |
| """ |
|
|
|
|
def _action_field(action, key, default):
    """Return ``action[key]``, or *default* when the key is absent or None."""
    value = action.get(key)
    return default if value is None else value


def get_action_semantic_description(action):
    """Return a short English description of a UI action for the prompt.

    Args:
        action: Mapping describing the action. ``action_type`` selects the
            wording. Depending on the type, ``x``/``y``, ``direction``,
            ``text`` or ``app_name`` are read as well. For the last three a
            missing key and an explicit ``None`` value are treated the same
            way and replaced by a default (``"down"``, ``""`` and ``"app"``).

    Returns:
        The description string. A missing or unrecognized ``action_type``
        yields the generic ``"Perform action: <action_type>."``.
    """
    action_type = action.get("action_type")

    if action_type == "click":
        x, y = action.get("x"), action.get("y")
        return (
            f"User performed a CLICK at coordinates ({x}, {y}). "
            "Expect the button/element at this location to trigger."
        )

    if action_type == "long_press":
        x, y = action.get("x"), action.get("y")
        return (
            f"User performed a LONG PRESS at coordinates ({x}, {y}). "
            "Expect a context menu or selection state."
        )

    if action_type in ("scroll", "swipe"):
        # dict.get() only applies its default when the key is missing, so a
        # null direction used to reach .upper() and raise AttributeError.
        # str() additionally guards against non-string direction values.
        direction = str(_action_field(action, "direction", "down"))
        return (
            f"User SCROLLED {direction.upper()}. "
            f"The content should move, revealing new items from the {direction} direction."
        )

    if action_type == "input_text":
        # A null text is rendered as empty rather than the literal 'None'.
        text = _action_field(action, "text", "")
        return (
            f"User is TYPING the text: '{text}'. "
            "The focused input field MUST now contain this text."
        )

    if action_type == "open_app":
        app_name = _action_field(action, "app_name", "app")
        return (
            f"System Context Switch: The user opened the app '{app_name}'. "
            "Show the home screen of this app."
        )

    if action_type == "navigate_back":
        return "System Navigation: The user pressed BACK. Return to the previous screen."

    if action_type == "navigate_home":
        return "System Navigation: The user pressed HOME. Show the Desktop."

    if action_type == "wait":
        return "Action: WAIT. Keep the UI mostly unchanged unless loading completes."

    # Unknown or missing action types still get a (generic) description.
    return f"Perform action: {action_type}."
|
|
|
|
def build_user_prompt(instruction_str, action, semantic_desc=None):
    """Render USER_PROMPT_TEMPLATE for one instruction/action pair.

    Args:
        instruction_str: The user's intent, inserted into the template as-is.
        action: The action object. It is serialized with ``json.dumps``
            (non-ASCII characters kept unescaped) and, when no description is
            supplied, also passed to ``get_action_semantic_description``.
        semantic_desc: Optional ready-made description of the action. Only
            ``None`` triggers automatic generation.

    Returns:
        The formatted user prompt string.
    """
    description = (
        get_action_semantic_description(action)
        if semantic_desc is None
        else semantic_desc
    )

    template_fields = {
        "instruction_str": instruction_str,
        "semantic_desc": description,
        "action_json": json.dumps(action, ensure_ascii=False),
    }
    return USER_PROMPT_TEMPLATE.format(**template_fields)