# File size: 4,404 Bytes
# 4da1734
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import json


# Instruction prompt for the model that predicts the next UI state.
# It tells the model how to read the annotations drawn on the screenshot
# (red circle = click/long press, red arrow = scroll/swipe), fixes the output
# contract (raw HTML only, all content inside a 1080x2400 `#render-target`
# div, transparent zero-margin body), and forbids real image URLs, Markdown
# fences and explanatory text.
# NOTE(review): presumably sent as the system message of the request; the
# consumer of this constant is not visible in this file, so confirm.
SYSTEM_PROMPT = """You are an expert **UI State Transition Simulator** and **Frontend Developer**.
Your task is to predict the **NEXT UI STATE** based on a screenshot of the current state and a user interaction.

### 1. IMAGE INTERPRETATION RULES
The input image contains visual cues denoting the user's action. You must interpret them as follows:
*   **Red Circle**: Indicates a **Click** or **Long Press** target at that location.
*   **Red Arrow**: Indicates a **Scroll** or **Swipe**.
    *   The arrow points in the direction of finger movement.
    *   *Example*: An arrow pointing UP means the finger slides up, pushing content up (Scrolling Down).
*   **Note**: These cues exist ONLY to show the action. **DO NOT render these red circles or arrows in your output HTML.**

### 2. CRITICAL STRUCTURAL RULES (MUST FOLLOW)
*   **Format**: Output ONLY raw HTML. Start with `<!DOCTYPE html>` and end with `</html>`.
*   **Root Element**: All visible content MUST be wrapped in:
    `<div id="render-target"> ... </div>`
*   **Container Style**: `#render-target` must have:
    `width: 1080px; height: 2400px; position: relative; overflow: hidden;`
    (Apply background colors and shadows here, NOT on the body).
*   **Body Style**: The `<body>` tag must have `margin: 0; padding: 0; background: transparent;`.
*   **Layout**: Do NOT center the body. Let `#render-target` sit at (0,0).

### 3. CONTENT GENERATION LOGIC
*   **Transition**: Analyze the action. If the user clicks a button, show the *result* (e.g., a menu opens, a checkbox checks, page navigates).
*   **Images**: Use semantic text placeholders. DO NOT use real URLs.
    *   Format: `<div style="...">[IMG: description]</div>`
*   **Icons**: Use simple inline SVG paths or Unicode.

### 4. OUTPUT REQUIREMENT
*   Do NOT generate Markdown blocks (```html).
*   Do NOT provide explanations or conversational text.
*   Output the code directly.
"""


# Per-sample user prompt, filled in by `build_user_prompt` via `str.format`.
# Placeholders:
#   {instruction_str} - the user's intent, inserted between double quotes
#   {semantic_desc}   - natural-language description of the action
#   {action_json}     - the raw action serialized as JSON
# Because the template goes through `str.format`, any literal `{` or `}`
# added to this text must be doubled (`{{` / `}}`).
# NOTE(review): the leading `<image>` token is presumably the marker where the
# downstream pipeline inserts the screenshot; confirm against the consumer.
USER_PROMPT_TEMPLATE = """<image>
### INPUT CONTEXT
1.  **User Intent**: "{instruction_str}"
2.  **Interaction Details**:
    *   **Description**: {semantic_desc}
    *   **Action Data**: {action_json}

### COMMAND
Based on the visual cues in the image and the interaction data above, generate the **HTML for the RESULTING UI STATE** (what the screen looks like *after* this action).
"""


def get_action_semantic_description(action):
    """Return a short English description of a UI action for the prompt.

    Args:
        action: Mapping describing the interaction. ``"action_type"`` selects
            the wording; the other keys read depend on the type:
            ``"x"``/``"y"`` for ``click`` and ``long_press``, ``"direction"``
            for ``scroll``/``swipe`` (defaults to ``"down"``), ``"text"`` for
            ``input_text`` (defaults to ``""``) and ``"app_name"`` for
            ``open_app`` (defaults to ``"app"``).

    Returns:
        A description string. Action types without a dedicated wording fall
        back to ``"Perform action: <action_type>."``.
    """

    def _field(key, default):
        # dict.get() only uses its default when the key is absent. An explicit
        # None (e.g. a JSON null) would otherwise crash on .upper() or be
        # rendered as the literal text "None" in the prompt.
        value = action.get(key)
        return default if value is None else value

    action_type = action.get("action_type")

    if action_type == "click":
        x, y = action.get("x"), action.get("y")
        return (
            f"User performed a CLICK at coordinates ({x}, {y}). "
            f"Expect the button/element at this location to trigger."
        )

    if action_type == "long_press":
        x, y = action.get("x"), action.get("y")
        return (
            f"User performed a LONG PRESS at coordinates ({x}, {y}). "
            f"Expect a context menu or selection state."
        )

    if action_type in ("scroll", "swipe"):
        # Coerce to str so a non-string direction cannot break .upper().
        direction = str(_field("direction", "down"))
        return (
            f"User SCROLLED {direction.upper()}. "
            f"The content should move, revealing new items from the {direction} direction."
        )

    if action_type == "input_text":
        text = _field("text", "")
        return (
            f"User is TYPING the text: '{text}'. "
            f"The focused input field MUST now contain this text."
        )

    if action_type == "open_app":
        app_name = _field("app_name", "app")
        return (
            f"System Context Switch: The user opened the app '{app_name}'. "
            f"Show the home screen of this app."
        )

    if action_type == "navigate_back":
        return "System Navigation: The user pressed BACK. Return to the previous screen."

    if action_type == "navigate_home":
        return "System Navigation: The user pressed HOME. Show the Desktop."

    if action_type == "wait":
        return "Action: WAIT. Keep the UI mostly unchanged unless loading completes."

    # Unknown or missing action types still yield a usable, generic sentence.
    return f"Perform action: {action_type}."


def build_user_prompt(instruction_str, action, semantic_desc=None):
    """Render the user prompt for one (instruction, action) pair.

    Args:
        instruction_str: Text substituted for the template's user-intent slot.
        action: Action mapping; serialized to JSON (non-ASCII kept as-is) and
            also used to derive the description when none is supplied.
        semantic_desc: Optional ready-made description of the action. When
            ``None``, one is generated by ``get_action_semantic_description``.

    Returns:
        ``USER_PROMPT_TEMPLATE`` with all three placeholders filled in.
    """
    description = (
        get_action_semantic_description(action)
        if semantic_desc is None
        else semantic_desc
    )
    fields = {
        "instruction_str": instruction_str,
        "semantic_desc": description,
        "action_json": json.dumps(action, ensure_ascii=False),
    }
    return USER_PROMPT_TEMPLATE.format(**fields)