# Code2World / prompt_builder.py
# Uploaded by yhzheng1031 via huggingface_hub (commit 4da1734, verified)
import json

# System prompt for the LLM: frames it as a "UI State Transition Simulator"
# that, given a screenshot annotated with red click-circles / swipe-arrows,
# must emit the raw HTML of the NEXT UI state inside a fixed-size
# #render-target container (1080x2400). The annotation cues themselves must
# NOT appear in the generated HTML, and output must be bare HTML with no
# Markdown fences or commentary.
SYSTEM_PROMPT = """You are an expert **UI State Transition Simulator** and **Frontend Developer**.
Your task is to predict the **NEXT UI STATE** based on a screenshot of the current state and a user interaction.
### 1. IMAGE INTERPRETATION RULES
The input image contains visual cues denoting the user's action. You must interpret them as follows:
* **Red Circle**: Indicates a **Click** or **Long Press** target at that location.
* **Red Arrow**: Indicates a **Scroll** or **Swipe**.
* The arrow points in the direction of finger movement.
* *Example*: An arrow pointing UP means the finger slides up, pushing content up (Scrolling Down).
* **Note**: These cues exist ONLY to show the action. **DO NOT render these red circles or arrows in your output HTML.**
### 2. CRITICAL STRUCTURAL RULES (MUST FOLLOW)
* **Format**: Output ONLY raw HTML. Start with `<!DOCTYPE html>` and end with `</html>`.
* **Root Element**: All visible content MUST be wrapped in:
`<div id="render-target"> ... </div>`
* **Container Style**: `#render-target` must have:
`width: 1080px; height: 2400px; position: relative; overflow: hidden;`
(Apply background colors and shadows here, NOT on the body).
* **Body Style**: The `<body>` tag must have `margin: 0; padding: 0; background: transparent;`.
* **Layout**: Do NOT center the body. Let `#render-target` sit at (0,0).
### 3. CONTENT GENERATION LOGIC
* **Transition**: Analyze the action. If the user clicks a button, show the *result* (e.g., a menu opens, a checkbox checks, page navigates).
* **Images**: Use semantic text placeholders. DO NOT use real URLs.
* Format: `<div style="...">[IMG: description]</div>`
* **Icons**: Use simple inline SVG paths or Unicode.
### 4. OUTPUT REQUIREMENT
* Do NOT generate Markdown blocks (```html).
* Do NOT provide explanations or conversational text.
* Output the code directly.
"""
# Per-example user prompt. `<image>` marks where the annotated screenshot is
# injected by the chat pipeline. Placeholders filled by build_user_prompt():
#   {instruction_str} - the high-level user intent,
#   {semantic_desc}   - English description of the action (see
#                       get_action_semantic_description),
#   {action_json}     - the raw action dict serialized as JSON.
USER_PROMPT_TEMPLATE = """<image>
### INPUT CONTEXT
1. **User Intent**: "{instruction_str}"
2. **Interaction Details**:
* **Description**: {semantic_desc}
* **Action Data**: {action_json}
### COMMAND
Based on the visual cues in the image and the interaction data above, generate the **HTML for the RESULTING UI STATE** (what the screen looks like *after* this action).
"""
def get_action_semantic_description(action):
    """Render a structured action dict as a one-sentence English description.

    Args:
        action: dict carrying an "action_type" key plus type-specific fields
            ("x"/"y" for click and long_press, "direction" for scroll/swipe,
            "text" for input_text, "app_name" for open_app).

    Returns:
        A short English sentence describing the action and the UI change the
        simulator should expect. Unrecognized action types get a generic
        fallback that echoes the raw type string.
    """
    kind = action.get("action_type")

    # Parameter-free system actions map straight to canned sentences.
    fixed = {
        "navigate_back": "System Navigation: The user pressed BACK. Return to the previous screen.",
        "navigate_home": "System Navigation: The user pressed HOME. Show the Desktop.",
        "wait": "Action: WAIT. Keep the UI mostly unchanged unless loading completes.",
    }
    if kind in fixed:
        return fixed[kind]

    if kind == "click":
        x, y = action.get("x"), action.get("y")
        return (
            f"User performed a CLICK at coordinates ({x}, {y}). "
            f"Expect the button/element at this location to trigger."
        )

    if kind == "long_press":
        x, y = action.get("x"), action.get("y")
        return (
            f"User performed a LONG PRESS at coordinates ({x}, {y}). "
            f"Expect a context menu or selection state."
        )

    if kind in ("scroll", "swipe"):
        # Direction defaults to "down" when the action omits it.
        direction = action.get("direction", "down")
        return (
            f"User SCROLLED {direction.upper()}. "
            f"The content should move, revealing new items from the {direction} direction."
        )

    if kind == "input_text":
        typed = action.get("text", "")
        return (
            f"User is TYPING the text: '{typed}'. "
            f"The focused input field MUST now contain this text."
        )

    if kind == "open_app":
        app = action.get("app_name", "app")
        return (
            f"System Context Switch: The user opened the app '{app}'. "
            f"Show the home screen of this app."
        )

    # Unknown action type: fall back to echoing it verbatim.
    return f"Perform action: {kind}."
def build_user_prompt(instruction_str, action, semantic_desc=None):
    """Fill USER_PROMPT_TEMPLATE for one interaction example.

    Args:
        instruction_str: high-level user intent, inserted verbatim.
        action: action dict; serialized to JSON for the prompt's
            "Action Data" field.
        semantic_desc: optional pre-built English description of the action.
            When None, one is derived via get_action_semantic_description.

    Returns:
        The formatted user prompt string.
    """
    description = (
        get_action_semantic_description(action)
        if semantic_desc is None
        else semantic_desc
    )
    # ensure_ascii=False keeps any non-ASCII text (e.g. typed input)
    # readable in the prompt instead of \uXXXX escapes.
    serialized_action = json.dumps(action, ensure_ascii=False)
    return USER_PROMPT_TEMPLATE.format(
        instruction_str=instruction_str,
        semantic_desc=description,
        action_json=serialized_action,
    )