yhzheng1031 committed on
Commit
0983922
·
verified ·
1 Parent(s): 62bdb05

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +145 -0
README.md CHANGED
@@ -21,6 +21,151 @@ The code of Code2World has been in the latest Hugging Face transformers and we a
21
  pip install transformers==4.57.0
22
  ```
23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  ## Citation
25
 
26
  If you find our work helpful, please consider citing it.
 
21
  pip install transformers==4.57.0
22
  ```
23
 
24
+ ```python
25
+ import os
26
+ import re
27
+ import json
28
+ import math
29
+
30
+ import torch
31
+ from PIL import Image, ImageDraw
32
+ from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
33
+ from playwright.sync_api import sync_playwright
34
+
35
+ from prompt_builder import SYSTEM_PROMPT, build_user_prompt
36
+ from visual_hint import build_visual_hint
37
+ from render_utils import render_html_to_image, save_demo_outputs
38
+
39
+
40
# Hub identifier shared by the processor and the model weights.
MODEL_NAME = "GD-ML/Code2World"

# Processor that applies the chat template and decodes generated ids
# (see generate_html).
processor = AutoProcessor.from_pretrained(MODEL_NAME)

# Model weights are requested in bfloat16 with the FlashAttention-2 attention
# backend; device_map="auto" leaves device placement to the library.
# NOTE(review): "flash_attention_2" presumably needs the flash-attn package
# installed -- confirm for the target environment.
model = Qwen3VLForConditionalGeneration.from_pretrained(
    MODEL_NAME,
    dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="auto",
)
49
+
50
+
51
+
52
# Compiled once at import time; both markers are matched case-insensitively.
_DOCTYPE_RE = re.compile(r"<!DOCTYPE html>", re.IGNORECASE)
_HTML_CLOSE_RE = re.compile(r"</html>", re.IGNORECASE)


def extract_clean_html(text: str) -> str:
    """Return the HTML document embedded in raw model output.

    Markdown code fences (```html and ```) are removed first. If the text
    contains a ``<!DOCTYPE html>`` marker followed by a closing ``</html>``
    tag, the span from the doctype through the first such closing tag is
    returned. Otherwise the fence-stripped text is returned with surrounding
    whitespace removed.

    Args:
        text: Raw decoded text, possibly wrapped in code fences or
            surrounded by extra commentary.

    Returns:
        The extracted HTML document, or the stripped text when no complete
        document is found.
    """
    text = text.replace("```html", "").replace("```", "")

    start_match = _DOCTYPE_RE.search(text)
    if start_match is None:
        return text.strip()

    # Search for the closing tag only *after* the doctype. Searching from the
    # beginning of the text would let a stray "</html>" that precedes the
    # document hide the real closing tag and defeat the extraction.
    end_match = _HTML_CLOSE_RE.search(text, start_match.end())
    if end_match is None:
        return text.strip()

    return text[start_match.start():end_match.end()]
64
+
65
+
66
+
67
def build_messages(image: Image.Image, instruction: str, action: dict, semantic_desc=None):
    """Assemble the two-turn chat (system + user) fed to the processor.

    Args:
        image: Screenshot to attach to the user turn; it is converted to RGB.
        instruction: Forwarded to ``build_user_prompt`` as ``instruction_str``.
        action: Forwarded to ``build_user_prompt`` unchanged.
        semantic_desc: Optional value forwarded to ``build_user_prompt``.

    Returns:
        A list with a system message carrying ``SYSTEM_PROMPT`` and a user
        message carrying the RGB image followed by the prompt text.
    """
    prompt_text = build_user_prompt(
        instruction_str=instruction,
        action=action,
        semantic_desc=semantic_desc,
    )

    system_turn = {
        "role": "system",
        "content": [{"type": "text", "text": SYSTEM_PROMPT}],
    }

    # The image entry precedes the text entry inside the user turn.
    image_part = {"type": "image", "image": image.convert("RGB")}
    text_part = {"type": "text", "text": prompt_text}
    user_turn = {"role": "user", "content": [image_part, text_part]}

    return [system_turn, user_turn]
84
+
85
+
86
@torch.inference_mode()
def generate_html(image: Image.Image, instruction: str, action: dict, semantic_desc=None, max_new_tokens: int = 8192):
    """Generate HTML for the given screenshot, instruction and action.

    The chat built by ``build_messages`` is tokenized through the processor's
    chat template, passed to ``model.generate``, and the newly generated
    tokens are decoded and cleaned with ``extract_clean_html``.

    Args:
        image: Screenshot passed to ``build_messages``.
        instruction: Instruction text passed to ``build_messages``.
        action: Action dict passed to ``build_messages``.
        semantic_desc: Optional value passed to ``build_messages``.
        max_new_tokens: Upper bound on generated tokens, forwarded to
            ``model.generate``.

    Returns:
        The cleaned text of the first decoded sequence.
    """
    chat = build_messages(
        image=image,
        instruction=instruction,
        action=action,
        semantic_desc=semantic_desc,
    )

    model_inputs = processor.apply_chat_template(
        chat,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device)

    sequences = model.generate(**model_inputs, max_new_tokens=max_new_tokens)

    # Drop the leading prompt tokens from every returned sequence so that only
    # the continuation is decoded.
    completions = []
    for prompt_ids, full_ids in zip(model_inputs.input_ids, sequences):
        completions.append(full_ids[len(prompt_ids):])

    decoded = processor.batch_decode(
        completions,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
    return extract_clean_html(decoded[0])
116
+
117
+
118
+
119
def run_demo(
    image_path: str,
    instruction: str,
    action: dict,
    step_pam: dict | None = None,
    semantic_desc: str | None = None,
    use_visual_hint: bool = True,
    max_new_tokens: int = 8192,
    output_dir: str = "./demo_outputs",
):
    """Run the full demo: load image, generate HTML, render it, save outputs.

    Args:
        image_path: Path of the screenshot to open; it is converted to RGB.
        instruction: Instruction text forwarded to ``generate_html``.
        action: Action dict forwarded to ``build_visual_hint`` (when enabled)
            and to ``generate_html``.
        step_pam: Optional dict forwarded to ``build_visual_hint``.
        semantic_desc: Optional description forwarded to ``generate_html``.
        use_visual_hint: When true, the model sees the image returned by
            ``build_visual_hint``; otherwise it sees the plain RGB image.
        max_new_tokens: Generation budget forwarded to ``generate_html``.
        output_dir: Directory handed to ``save_demo_outputs``.

    Returns:
        A tuple ``(hinted_image, html, rendered_image)``.
    """
    source_image = Image.open(image_path).convert("RGB")

    hinted_image = (
        build_visual_hint(source_image, action, step_pam)
        if use_visual_hint
        else source_image
    )

    html = generate_html(
        image=hinted_image,
        instruction=instruction,
        action=action,
        semantic_desc=semantic_desc,
        max_new_tokens=max_new_tokens,
    )
    rendered_image = render_html_to_image(html)

    # Persist the model input image, the generated HTML and its rendering.
    save_demo_outputs(output_dir, hinted_image, html, rendered_image)

    return hinted_image, html, rendered_image
148
+
149
+
150
if __name__ == "__main__":
    # Example: a click at (540, 320). The same coordinates are supplied both
    # in the action and in the step_pam dict passed to run_demo.
    tap_x, tap_y = 540, 320

    run_demo(
        image_path="./examples/current.png",
        instruction="Tap the search bar to start searching.",
        action={"action_type": "click", "x": tap_x, "y": tap_y},
        step_pam={"coordinate": [tap_x, tap_y]},
        output_dir="./demo_outputs",
    )
167
+ ```
168
+
169
  ## Citation
170
 
171
  If you find our work helpful, please consider citing it.