# Code2World / prompt_builder.py
# Uploaded by yhzheng1031 via huggingface_hub (commit 4da1734, verified)
import json

# System prompt for the LLM: frames it as a "UI State Transition Simulator"
# that, given a screenshot annotated with red click-circles / swipe-arrows,
# must emit the raw HTML of the NEXT UI state inside a fixed-size
# #render-target container (1080x2400). The annotation cues themselves must
# NOT appear in the generated HTML, and output must be bare HTML with no
# Markdown fences or commentary.
SYSTEM_PROMPT = """You are an expert **UI State Transition Simulator** and **Frontend Developer**.
Your task is to predict the **NEXT UI STATE** based on a screenshot of the current state and a user interaction.
### 1. IMAGE INTERPRETATION RULES
The input image contains visual cues denoting the user's action. You must interpret them as follows:
* **Red Circle**: Indicates a **Click** or **Long Press** target at that location.
* **Red Arrow**: Indicates a **Scroll** or **Swipe**.
* The arrow points in the direction of finger movement.
* *Example*: An arrow pointing UP means the finger slides up, pushing content up (Scrolling Down).
* **Note**: These cues exist ONLY to show the action. **DO NOT render these red circles or arrows in your output HTML.**
### 2. CRITICAL STRUCTURAL RULES (MUST FOLLOW)
* **Format**: Output ONLY raw HTML. Start with `<!DOCTYPE html>` and end with `</html>`.
* **Root Element**: All visible content MUST be wrapped in:
`<div id="render-target"> ... </div>`
* **Container Style**: `#render-target` must have:
`width: 1080px; height: 2400px; position: relative; overflow: hidden;`
(Apply background colors and shadows here, NOT on the body).
* **Body Style**: The `<body>` tag must have `margin: 0; padding: 0; background: transparent;`.
* **Layout**: Do NOT center the body. Let `#render-target` sit at (0,0).
### 3. CONTENT GENERATION LOGIC
* **Transition**: Analyze the action. If the user clicks a button, show the *result* (e.g., a menu opens, a checkbox checks, page navigates).
* **Images**: Use semantic text placeholders. DO NOT use real URLs.
* Format: `<div style="...">[IMG: description]</div>`
* **Icons**: Use simple inline SVG paths or Unicode.
### 4. OUTPUT REQUIREMENT
* Do NOT generate Markdown blocks (```html).
* Do NOT provide explanations or conversational text.
* Output the code directly.
"""
# Per-example user prompt. `<image>` marks where the annotated screenshot is
# injected by the chat pipeline. Placeholders filled by build_user_prompt():
#   {instruction_str} - the high-level user intent,
#   {semantic_desc}   - English description of the action (see
#                       get_action_semantic_description),
#   {action_json}     - the raw action dict serialized as JSON.
USER_PROMPT_TEMPLATE = """<image>
### INPUT CONTEXT
1. **User Intent**: "{instruction_str}"
2. **Interaction Details**:
* **Description**: {semantic_desc}
* **Action Data**: {action_json}
### COMMAND
Based on the visual cues in the image and the interaction data above, generate the **HTML for the RESULTING UI STATE** (what the screen looks like *after* this action).
"""
def get_action_semantic_description(action):
    """Render a structured action dict as a one-sentence English description.

    Args:
        action: dict carrying an "action_type" key plus type-specific fields
            ("x"/"y" for click and long_press, "direction" for scroll/swipe,
            "text" for input_text, "app_name" for open_app).

    Returns:
        A short English sentence describing the action and the UI change the
        simulator should expect. Unrecognized action types get a generic
        fallback that echoes the raw type string.
    """
    kind = action.get("action_type")

    # Parameter-free system actions map straight to canned sentences.
    fixed = {
        "navigate_back": "System Navigation: The user pressed BACK. Return to the previous screen.",
        "navigate_home": "System Navigation: The user pressed HOME. Show the Desktop.",
        "wait": "Action: WAIT. Keep the UI mostly unchanged unless loading completes.",
    }
    if kind in fixed:
        return fixed[kind]

    if kind == "click":
        x, y = action.get("x"), action.get("y")
        return (
            f"User performed a CLICK at coordinates ({x}, {y}). "
            f"Expect the button/element at this location to trigger."
        )

    if kind == "long_press":
        x, y = action.get("x"), action.get("y")
        return (
            f"User performed a LONG PRESS at coordinates ({x}, {y}). "
            f"Expect a context menu or selection state."
        )

    if kind in ("scroll", "swipe"):
        # Direction defaults to "down" when the action omits it.
        direction = action.get("direction", "down")
        return (
            f"User SCROLLED {direction.upper()}. "
            f"The content should move, revealing new items from the {direction} direction."
        )

    if kind == "input_text":
        typed = action.get("text", "")
        return (
            f"User is TYPING the text: '{typed}'. "
            f"The focused input field MUST now contain this text."
        )

    if kind == "open_app":
        app = action.get("app_name", "app")
        return (
            f"System Context Switch: The user opened the app '{app}'. "
            f"Show the home screen of this app."
        )

    # Unknown action type: fall back to echoing it verbatim.
    return f"Perform action: {kind}."
def build_user_prompt(instruction_str, action, semantic_desc=None):
    """Fill USER_PROMPT_TEMPLATE for one interaction example.

    Args:
        instruction_str: high-level user intent, inserted verbatim.
        action: action dict; serialized to JSON for the prompt's
            "Action Data" field.
        semantic_desc: optional pre-built English description of the action.
            When None, one is derived via get_action_semantic_description.

    Returns:
        The formatted user prompt string.
    """
    description = (
        get_action_semantic_description(action)
        if semantic_desc is None
        else semantic_desc
    )
    # ensure_ascii=False keeps any non-ASCII text (e.g. typed input)
    # readable in the prompt instead of \uXXXX escapes.
    serialized_action = json.dumps(action, ensure_ascii=False)
    return USER_PROMPT_TEMPLATE.format(
        instruction_str=instruction_str,
        semantic_desc=description,
        action_json=serialized_action,
    )