import json

# System prompt sent to the model: it describes how to read the annotated
# screenshot and the structural rules for the generated HTML.
# NOTE(review): several inline code spans below are empty (e.g. "Start with ``
# and end with ``", "The `` tag must have") -- presumably HTML tags that were
# stripped somewhere upstream. Confirm against the intended prompt text.
SYSTEM_PROMPT = """You are an expert **UI State Transition Simulator** and **Frontend Developer**. Your task is to predict the **NEXT UI STATE** based on a screenshot of the current state and a user interaction. ### 1. IMAGE INTERPRETATION RULES The input image contains visual cues denoting the user's action. You must interpret them as follows: * **Red Circle**: Indicates a **Click** or **Long Press** target at that location. * **Red Arrow**: Indicates a **Scroll** or **Swipe**. * The arrow points in the direction of finger movement. * *Example*: An arrow pointing UP means the finger slides up, pushing content up (Scrolling Down). * **Note**: These cues exist ONLY to show the action. **DO NOT render these red circles or arrows in your output HTML.** ### 2. CRITICAL STRUCTURAL RULES (MUST FOLLOW) * **Format**: Output ONLY raw HTML. Start with `` and end with ``. * **Root Element**: All visible content MUST be wrapped in: `
...
` * **Container Style**: `#render-target` must have: `width: 1080px; height: 2400px; position: relative; overflow: hidden;` (Apply background colors and shadows here, NOT on the body). * **Body Style**: The `` tag must have `margin: 0; padding: 0; background: transparent;`. * **Layout**: Do NOT center the body. Let `#render-target` sit at (0,0). ### 3. CONTENT GENERATION LOGIC * **Transition**: Analyze the action. If the user clicks a button, show the *result* (e.g., a menu opens, a checkbox checks, page navigates). * **Images**: Use semantic text placeholders. DO NOT use real URLs. * Format: `
[IMG: description]
` * **Icons**: Use simple inline SVG paths or Unicode. ### 4. OUTPUT REQUIREMENT * Do NOT generate Markdown blocks (```html). * Do NOT provide explanations or conversational text. * Output the code directly. """

# Per-step user message; filled via str.format with the fields
# ``instruction_str``, ``semantic_desc`` and ``action_json``.
USER_PROMPT_TEMPLATE = """ ### INPUT CONTEXT 1. **User Intent**: "{instruction_str}" 2. **Interaction Details**: * **Description**: {semantic_desc} * **Action Data**: {action_json} ### COMMAND Based on the visual cues in the image and the interaction data above, generate the **HTML for the RESULTING UI STATE** (what the screen looks like *after* this action). """


def _action_field(action, key, default):
    """Return ``action[key]``, or *default* when the key is absent or None."""
    value = action.get(key)
    return default if value is None else value


def get_action_semantic_description(action):
    """Describe a UI action in plain English for use in the user prompt.

    Args:
        action: Mapping describing the action. ``action_type`` selects the
            wording; depending on the type, ``x``/``y``, ``direction``,
            ``text`` or ``app_name`` are read as well.

    Returns:
        A short description string. Action types without dedicated wording
        produce ``"Perform action: <action_type>."``.
    """
    action_type = action.get("action_type")

    if action_type == "click":
        x, y = action.get("x"), action.get("y")
        return (
            f"User performed a CLICK at coordinates ({x}, {y}). "
            "Expect the button/element at this location to trigger."
        )

    if action_type == "long_press":
        x, y = action.get("x"), action.get("y")
        return (
            f"User performed a LONG PRESS at coordinates ({x}, {y}). "
            "Expect a context menu or selection state."
        )

    if action_type in ("scroll", "swipe"):
        # An explicit None (e.g. JSON null) falls back to "down" just like a
        # missing key; str() keeps .upper() safe for non-string values.
        direction = str(_action_field(action, "direction", "down"))
        return (
            f"User SCROLLED {direction.upper()}. "
            f"The content should move, revealing new items from the {direction} direction."
        )

    if action_type == "input_text":
        # None is treated as empty text rather than rendered as 'None'.
        text = _action_field(action, "text", "")
        return (
            f"User is TYPING the text: '{text}'. "
            "The focused input field MUST now contain this text."
        )

    if action_type == "open_app":
        app_name = _action_field(action, "app_name", "app")
        return (
            f"System Context Switch: The user opened the app '{app_name}'. "
            "Show the home screen of this app."
        )

    if action_type == "navigate_back":
        return "System Navigation: The user pressed BACK. Return to the previous screen."

    if action_type == "navigate_home":
        return "System Navigation: The user pressed HOME. Show the Desktop."

    if action_type == "wait":
        return "Action: WAIT. Keep the UI mostly unchanged unless loading completes."

    return f"Perform action: {action_type}."
def build_user_prompt(instruction_str, action, semantic_desc=None):
    """Render USER_PROMPT_TEMPLATE for a single interaction.

    Args:
        instruction_str: Text substituted into the template's
            ``{instruction_str}`` field.
        action: Action data; serialized with ``json.dumps`` (non-ASCII
            characters kept as-is) for the ``{action_json}`` field.
        semantic_desc: Optional description for the ``{semantic_desc}``
            field. When None, it is derived from ``action`` via
            ``get_action_semantic_description``.

    Returns:
        The formatted prompt string.
    """
    # Only an explicit None triggers auto-generation; an empty string is
    # passed through unchanged.
    description = (
        get_action_semantic_description(action)
        if semantic_desc is None
        else semantic_desc
    )
    return USER_PROMPT_TEMPLATE.format(
        instruction_str=instruction_str,
        semantic_desc=description,
        action_json=json.dumps(action, ensure_ascii=False),
    )