Update README.md

README.md

The code of Code2World has been included in the latest Hugging Face transformers, and we advise installing it with:

```
pip install transformers==4.57.0
```
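
If you want to confirm that the pinned release is what actually got installed (a quick sanity check, not part of the original instructions), you can print the version before loading the model:

```python
import transformers

# The demo below imports Qwen3VLForConditionalGeneration, which is available
# in the transformers release pinned above.
print(transformers.__version__)
```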

```python
import os
import re
import json
import math

import torch
from PIL import Image, ImageDraw
from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from playwright.sync_api import sync_playwright

from prompt_builder import SYSTEM_PROMPT, build_user_prompt
from visual_hint import build_visual_hint
from render_utils import render_html_to_image, save_demo_outputs


MODEL_NAME = "GD-ML/Code2World"

# Load Code2World and its processor.
model = Qwen3VLForConditionalGeneration.from_pretrained(
    MODEL_NAME,
    dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(MODEL_NAME)


def extract_clean_html(text: str) -> str:
    # Drop markdown fences and keep only the <!DOCTYPE html> ... </html> span.
    text = text.replace("```html", "").replace("```", "")
    start_match = re.search(r"<!DOCTYPE html>", text, re.IGNORECASE)
    end_match = re.search(r"</html>", text, re.IGNORECASE)

    if start_match and end_match:
        start_idx = start_match.start()
        end_idx = end_match.end()
        if end_idx > start_idx:
            return text[start_idx:end_idx]

    return text.strip()


def build_messages(image: Image.Image, instruction: str, action: dict, semantic_desc=None):
    user_prompt = build_user_prompt(
        instruction_str=instruction,
        action=action,
        semantic_desc=semantic_desc,
    )

    return [
        {"role": "system", "content": [{"type": "text", "text": SYSTEM_PROMPT}]},
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image.convert("RGB")},
                {"type": "text", "text": user_prompt},
            ],
        },
    ]


@torch.inference_mode()
def generate_html(image: Image.Image, instruction: str, action: dict, semantic_desc=None, max_new_tokens: int = 8192):
    messages = build_messages(
        image=image,
        instruction=instruction,
        action=action,
        semantic_desc=semantic_desc,
    )

    inputs = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    )
    inputs = inputs.to(model.device)

    generated_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
    # Decode only the newly generated tokens, not the prompt.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]

    output_text = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]

    return extract_clean_html(output_text)


def run_demo(
    image_path: str,
    instruction: str,
    action: dict,
    step_pam: dict | None = None,
    semantic_desc: str | None = None,
    use_visual_hint: bool = True,
    max_new_tokens: int = 8192,
    output_dir: str = "./demo_outputs",
):
    image = Image.open(image_path).convert("RGB")

    # Optionally overlay a visual hint for the action on the screenshot.
    if use_visual_hint:
        hinted_image = build_visual_hint(image, action, step_pam)
    else:
        hinted_image = image

    html = generate_html(
        image=hinted_image,
        instruction=instruction,
        action=action,
        semantic_desc=semantic_desc,
        max_new_tokens=max_new_tokens,
    )

    # Render the predicted HTML and save all demo artifacts.
    rendered_image = render_html_to_image(html)
    save_demo_outputs(output_dir, hinted_image, html, rendered_image)

    return hinted_image, html, rendered_image


if __name__ == "__main__":
    example_action = {
        "action_type": "click",
        "x": 540,
        "y": 320,
    }
    example_step_pam = {
        "coordinate": [540, 320]
    }

    run_demo(
        image_path="./examples/current.png",
        instruction="Tap the search bar to start searching.",
        action=example_action,
        step_pam=example_step_pam,
        output_dir="./demo_outputs",
    )
```
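
The script above relies on `prompt_builder`, `visual_hint`, and `render_utils`, which are helper modules that accompany the demo rather than parts of this README. If you want a self-contained starting point without those files, the sketch below shows one possible minimal stand-in for `build_visual_hint` and `render_html_to_image`; the marker style, viewport size, and signatures are assumptions for illustration, not the reference implementations.

```python
# Minimal stand-in sketch for the helper modules used above (assumed behavior,
# not the reference implementation shipped with Code2World).
from io import BytesIO

from PIL import Image, ImageDraw
from playwright.sync_api import sync_playwright


def build_visual_hint(image: Image.Image, action: dict, step_pam: dict | None = None, radius: int = 24) -> Image.Image:
    """Draw a simple marker at the action coordinate so the model can see where the action lands."""
    hinted = image.copy()
    draw = ImageDraw.Draw(hinted)
    # Prefer the step_pam coordinate if present, otherwise fall back to the action's x/y.
    if step_pam and "coordinate" in step_pam:
        x, y = step_pam["coordinate"]
    else:
        x, y = action.get("x"), action.get("y")
    if x is not None and y is not None:
        draw.ellipse([x - radius, y - radius, x + radius, y + radius], outline="red", width=6)
    return hinted


def render_html_to_image(html: str, width: int = 1080, height: int = 2400) -> Image.Image:
    """Render the generated HTML in headless Chromium and return the screenshot as a PIL image."""
    with sync_playwright() as p:
        browser = p.chromium.launch()
        page = browser.new_page(viewport={"width": width, "height": height})
        page.set_content(html, wait_until="networkidle")
        png_bytes = page.screenshot(full_page=True)
        browser.close()
    return Image.open(BytesIO(png_bytes))
```

`save_demo_outputs` is expected to write the hinted screenshot, the generated HTML, and the rendered result under `./demo_outputs`; its exact file layout is defined by the repo's helper. If `flash-attn` is not installed on your machine, dropping the `attn_implementation="flash_attention_2"` argument lets `from_pretrained` fall back to the default attention implementation.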

## Citation

If you find our work helpful, feel free to cite us.