snorfyang commited on
Commit
f073607
·
1 Parent(s): 1e97ae4
Files changed (3) hide show
  1. README.md +1 -1
  2. app.py +163 -0
  3. requirements.txt +2 -0
README.md CHANGED
@@ -1,6 +1,6 @@
1
  ---
2
  title: Token Visualizer
3
- emoji: 🏢
4
  colorFrom: blue
5
  colorTo: purple
6
  sdk: gradio
 
1
  ---
2
  title: Token Visualizer
3
+ emoji: 🔍
4
  colorFrom: blue
5
  colorTo: purple
6
  sdk: gradio
app.py ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from transformers import AutoTokenizer
3
+ import json
4
+ import re
5
+
6
# Display name -> Hugging Face hub id for every tokenizer the app offers.
SUPPORTED_MODELS = {
    "Llama-2": "meta-llama/Llama-2-7b-chat-hf",
    "Llama-3": "meta-llama/Meta-Llama-3-8B-Instruct",
    "Qwen2": "Qwen/Qwen2-7B-Instruct",
    "Gemma-2": "google/gemma-2-9b-it",
    "GPT-2": "gpt2",
    "BERT": "bert-base-uncased",
}

# Tokenizer most recently loaded via load_tokenizer(); None before any load.
current_tokenizer = None

# Light background colors cycled through so adjacent tokens are visually
# distinct in the rendered HTML.
TOKEN_COLORS = [
    "#e3f2fd",  # light blue
    "#f3e5f5",  # light purple
    "#e8f5e8",  # light green
    "#fff3e0",  # light orange
    "#fce4ec",  # light pink
    "#e0f2f1",  # light teal
    "#f1f8e9",  # light lime
    "#fafafa",  # light gray
    "#fff8e1",  # light amber
    "#f3e5f5",  # light indigo (NOTE: same hex as "light purple" above)
]
32
+
33
def load_tokenizer(model_name):
    """Load the tokenizer for *model_name* and cache it module-globally.

    Parameters
    ----------
    model_name : str
        A display-name key of ``SUPPORTED_MODELS``.

    Returns
    -------
    str
        A human-readable status message (success or failure) shown in the
        Gradio "Loading Status" textbox.
    """
    global current_tokenizer
    try:
        model_path = SUPPORTED_MODELS[model_name]
        # NOTE(review): trust_remote_code=True executes code shipped inside
        # the model repository. Acceptable for this fixed, curated model
        # list, but do not extend to arbitrary user-supplied repo ids
        # without revisiting this flag.
        current_tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
        # Fixed: repaired mojibake in the status emoji (was "βœ…").
        return f"✅ Successfully loaded {model_name} tokenizer"
    except Exception as e:
        # Broad catch is deliberate: any failure (unknown key, network,
        # gated repo) becomes a visible status message instead of a crash.
        # Fixed: repaired mojibake in the status emoji (was "❌").
        return f"❌ Loading failed: {str(e)}"
42
+
43
def visualize_tokens(text, model_name):
    """Tokenize *text* with the currently loaded tokenizer and render HTML.

    Parameters
    ----------
    text : str
        The text to analyze.
    model_name : str
        Unused here — tokenization uses whatever ``load_tokenizer()`` last
        stored in ``current_tokenizer``. Kept so the Gradio click handler
        can pass the dropdown value unchanged.

    Returns
    -------
    tuple
        ``(html_markup, stats_text)`` — always exactly two elements, to
        match the two Gradio output components bound to this function. On
        any error the first element is a plain message and the second is
        ``None``.
    """
    global current_tokenizer

    # Fixed: these guard paths previously returned THREE values while the
    # click handler binds only two outputs, so Gradio errored out on
    # exactly the paths meant to show a friendly message.
    if not current_tokenizer:
        return "Please select and load a model first", None

    if not text.strip():
        return "Please enter text to analyze", None

    try:
        import html  # stdlib; local import keeps the module header untouched

        # Tokenize with special tokens so BOS/EOS markers are visible.
        encoding = current_tokenizer(text, return_tensors="pt", add_special_tokens=True)
        token_ids = encoding['input_ids'][0].tolist()
        tokens = current_tokenizer.convert_ids_to_tokens(token_ids)

        html_output = "<div style='font-family: monospace; font-size: 14px; line-height: 1.5;'>"

        # --- Token strings -------------------------------------------------
        html_output += "<h3>Tokenization Results:</h3>"
        html_output += "<div style='margin-bottom: 20px;'>"

        for i, token in enumerate(tokens):
            # Cycle through the palette so neighbouring tokens differ.
            current_color_index = i % len(TOKEN_COLORS)
            bg_color = TOKEN_COLORS[current_color_index]
            border_color = "#2196f3"

            # Fixed: escape '&' and quotes in addition to '<'/'>', and use
            # the escaped form inside the data-token-string attribute too —
            # previously a token containing '"' broke out of the attribute.
            escaped_token = html.escape(token, quote=True)
            token_html = (
                f'<span class="token" data-token-id="{token_ids[i]}" '
                f'data-token-string="{escaped_token}" data-token-index="{i}" '
                f'data-color-index="{current_color_index}" '
                f'style="display: inline-block; margin: 2px; padding: 4px 8px; '
                f'background-color: {bg_color}; border: 1px solid {border_color}; '
                f'border-radius: 4px; color: black;">{escaped_token}</span>'
            )
            html_output += token_html

        html_output += "</div>"

        # --- Token IDs -----------------------------------------------------
        html_output += "<h3>Token IDs:</h3>"
        html_output += "<div style='margin-bottom: 20px;'>"

        for i, token_id in enumerate(token_ids):
            # Same color cycle so each ID lines up visually with its token.
            current_color_index = i % len(TOKEN_COLORS)
            bg_color = TOKEN_COLORS[current_color_index]
            border_color = "#2196f3"

            token_id_html = (
                f'<span class="token-id" data-token-id="{token_id}" '
                f'data-token-index="{i}" data-color-index="{current_color_index}" '
                f'style="display: inline-block; margin: 2px; padding: 4px 8px; '
                f'background-color: {bg_color}; border: 1px solid {border_color}; '
                f'border-radius: 4px; color: black; '
                f'text-decoration: none !important;">{token_id}</span>'
            )
            html_output += token_id_html

        html_output += "</div>"
        html_output += "</div>"

        # Vocabulary size for the statistics pane.
        vocab_size = current_tokenizer.vocab_size

        return html_output, f"Total tokens: {len(tokens)}\nVocabulary size: {vocab_size:,}"

    except Exception as e:
        # Surface tokenizer failures as a message in the HTML pane.
        # Fixed: repaired mojibake in the status emoji (was "❌").
        return f"❌ Processing failed: {str(e)}", None
114
+
115
+
116
+
117
# ---------------------------------------------------------------------------
# Gradio UI: model picker + loader + text input on the left, rendered
# tokenization results on the right.
# ---------------------------------------------------------------------------
with gr.Blocks(title="Token Visualizer", theme=gr.themes.Soft()) as demo:
    # Fixed: repaired mojibake in the heading emoji (was "πŸ”", i.e. a
    # mis-encoded magnifying glass).
    gr.Markdown("# 🔍 Token Visualizer")
    gr.Markdown("This is a tool for visualizing the text tokenization process. Select a model, input text, and view the tokenization results.")

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### 1. Select Model")
            model_dropdown = gr.Dropdown(
                choices=list(SUPPORTED_MODELS.keys()),
                label="Select Model",
                value="GPT-2",
            )
            load_btn = gr.Button("Load Tokenizer", variant="primary")
            load_status = gr.Textbox(label="Loading Status", interactive=False)

            gr.Markdown("### 2. Input Text")
            text_input = gr.Textbox(
                label="Enter text to tokenize",
                placeholder="Example: Hello, how are you today?",
                lines=4,
            )
            visualize_btn = gr.Button("Visualize", variant="primary")

        with gr.Column(scale=2):
            gr.Markdown("### 3. Visualization Results")
            html_output = gr.HTML(label="Token Visualization")
            stats_output = gr.Textbox(label="Statistics", interactive=False)

    # Wire the buttons to their handlers. Note visualize_tokens must return
    # exactly two values to match [html_output, stats_output].
    load_btn.click(
        fn=load_tokenizer,
        inputs=[model_dropdown],
        outputs=[load_status],
    )

    visualize_btn.click(
        fn=visualize_tokens,
        inputs=[text_input, model_dropdown],
        outputs=[html_output, stats_output],
    )


if __name__ == "__main__":
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ torch
2
+ transformers