snorfyang commited on
Commit
f073607
·
1 Parent(s): 1e97ae4
Files changed (3) hide show
  1. README.md +1 -1
  2. app.py +163 -0
  3. requirements.txt +2 -0
README.md CHANGED
@@ -1,6 +1,6 @@
1
  ---
2
  title: Token Visualizer
3
- emoji: 🏢
4
  colorFrom: blue
5
  colorTo: purple
6
  sdk: gradio
 
1
  ---
2
  title: Token Visualizer
3
+ emoji: 🔍
4
  colorFrom: blue
5
  colorTo: purple
6
  sdk: gradio
app.py ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from transformers import AutoTokenizer
3
+ import json
4
+ import re
5
+
6
# Display name -> Hugging Face hub id for every tokenizer the app offers.
SUPPORTED_MODELS = {
    "Llama-2": "meta-llama/Llama-2-7b-chat-hf",
    "Llama-3": "meta-llama/Meta-Llama-3-8B-Instruct",
    "Qwen2": "Qwen/Qwen2-7B-Instruct",
    "Gemma-2": "google/gemma-2-9b-it",
    "GPT-2": "gpt2",
    "BERT": "bert-base-uncased",
}

# Tokenizer most recently loaded via load_tokenizer(); None before any load.
current_tokenizer = None

# Light background colors cycled through so adjacent tokens are visually
# distinct in the rendered HTML.
TOKEN_COLORS = [
    "#e3f2fd",  # light blue
    "#f3e5f5",  # light purple
    "#e8f5e8",  # light green
    "#fff3e0",  # light orange
    "#fce4ec",  # light pink
    "#e0f2f1",  # light teal
    "#f1f8e9",  # light lime
    "#fafafa",  # light gray
    "#fff8e1",  # light amber
    "#f3e5f5",  # light indigo (NOTE: same hex as "light purple" above)
]
32
+
33
def load_tokenizer(model_name):
    """Load the tokenizer for *model_name* and cache it module-globally.

    Parameters
    ----------
    model_name : str
        A display-name key of ``SUPPORTED_MODELS``.

    Returns
    -------
    str
        A human-readable status message (success or failure) shown in the
        Gradio "Loading Status" textbox.
    """
    global current_tokenizer
    try:
        model_path = SUPPORTED_MODELS[model_name]
        # NOTE(review): trust_remote_code=True executes code shipped inside
        # the model repository. Acceptable for this fixed, curated model
        # list, but do not extend to arbitrary user-supplied repo ids
        # without revisiting this flag.
        current_tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
        # Fixed: repaired mojibake in the status emoji (was "βœ…").
        return f"✅ Successfully loaded {model_name} tokenizer"
    except Exception as e:
        # Broad catch is deliberate: any failure (unknown key, network,
        # gated repo) becomes a visible status message instead of a crash.
        # Fixed: repaired mojibake in the status emoji (was "❌").
        return f"❌ Loading failed: {str(e)}"
42
+
43
def visualize_tokens(text, model_name):
    """Tokenize *text* with the currently loaded tokenizer and render HTML.

    Parameters
    ----------
    text : str
        The text to analyze.
    model_name : str
        Unused here — tokenization uses whatever ``load_tokenizer()`` last
        stored in ``current_tokenizer``. Kept so the Gradio click handler
        can pass the dropdown value unchanged.

    Returns
    -------
    tuple
        ``(html_markup, stats_text)`` — always exactly two elements, to
        match the two Gradio output components bound to this function. On
        any error the first element is a plain message and the second is
        ``None``.
    """
    global current_tokenizer

    # Fixed: these guard paths previously returned THREE values while the
    # click handler binds only two outputs, so Gradio errored out on
    # exactly the paths meant to show a friendly message.
    if not current_tokenizer:
        return "Please select and load a model first", None

    if not text.strip():
        return "Please enter text to analyze", None

    try:
        import html  # stdlib; local import keeps the module header untouched

        # Tokenize with special tokens so BOS/EOS markers are visible.
        encoding = current_tokenizer(text, return_tensors="pt", add_special_tokens=True)
        token_ids = encoding['input_ids'][0].tolist()
        tokens = current_tokenizer.convert_ids_to_tokens(token_ids)

        html_output = "<div style='font-family: monospace; font-size: 14px; line-height: 1.5;'>"

        # --- Token strings -------------------------------------------------
        html_output += "<h3>Tokenization Results:</h3>"
        html_output += "<div style='margin-bottom: 20px;'>"

        for i, token in enumerate(tokens):
            # Cycle through the palette so neighbouring tokens differ.
            current_color_index = i % len(TOKEN_COLORS)
            bg_color = TOKEN_COLORS[current_color_index]
            border_color = "#2196f3"

            # Fixed: escape '&' and quotes in addition to '<'/'>', and use
            # the escaped form inside the data-token-string attribute too —
            # previously a token containing '"' broke out of the attribute.
            escaped_token = html.escape(token, quote=True)
            token_html = (
                f'<span class="token" data-token-id="{token_ids[i]}" '
                f'data-token-string="{escaped_token}" data-token-index="{i}" '
                f'data-color-index="{current_color_index}" '
                f'style="display: inline-block; margin: 2px; padding: 4px 8px; '
                f'background-color: {bg_color}; border: 1px solid {border_color}; '
                f'border-radius: 4px; color: black;">{escaped_token}</span>'
            )
            html_output += token_html

        html_output += "</div>"

        # --- Token IDs -----------------------------------------------------
        html_output += "<h3>Token IDs:</h3>"
        html_output += "<div style='margin-bottom: 20px;'>"

        for i, token_id in enumerate(token_ids):
            # Same color cycle so each ID lines up visually with its token.
            current_color_index = i % len(TOKEN_COLORS)
            bg_color = TOKEN_COLORS[current_color_index]
            border_color = "#2196f3"

            token_id_html = (
                f'<span class="token-id" data-token-id="{token_id}" '
                f'data-token-index="{i}" data-color-index="{current_color_index}" '
                f'style="display: inline-block; margin: 2px; padding: 4px 8px; '
                f'background-color: {bg_color}; border: 1px solid {border_color}; '
                f'border-radius: 4px; color: black; '
                f'text-decoration: none !important;">{token_id}</span>'
            )
            html_output += token_id_html

        html_output += "</div>"
        html_output += "</div>"

        # Vocabulary size for the statistics pane.
        vocab_size = current_tokenizer.vocab_size

        return html_output, f"Total tokens: {len(tokens)}\nVocabulary size: {vocab_size:,}"

    except Exception as e:
        # Surface tokenizer failures as a message in the HTML pane.
        # Fixed: repaired mojibake in the status emoji (was "❌").
        return f"❌ Processing failed: {str(e)}", None
114
+
115
+
116
+
117
# ---------------------------------------------------------------------------
# Gradio UI: model picker + loader + text input on the left, rendered
# tokenization results on the right.
# ---------------------------------------------------------------------------
with gr.Blocks(title="Token Visualizer", theme=gr.themes.Soft()) as demo:
    # Fixed: repaired mojibake in the heading emoji (was "πŸ”", i.e. a
    # mis-encoded magnifying glass).
    gr.Markdown("# 🔍 Token Visualizer")
    gr.Markdown("This is a tool for visualizing the text tokenization process. Select a model, input text, and view the tokenization results.")

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### 1. Select Model")
            model_dropdown = gr.Dropdown(
                choices=list(SUPPORTED_MODELS.keys()),
                label="Select Model",
                value="GPT-2",
            )
            load_btn = gr.Button("Load Tokenizer", variant="primary")
            load_status = gr.Textbox(label="Loading Status", interactive=False)

            gr.Markdown("### 2. Input Text")
            text_input = gr.Textbox(
                label="Enter text to tokenize",
                placeholder="Example: Hello, how are you today?",
                lines=4,
            )
            visualize_btn = gr.Button("Visualize", variant="primary")

        with gr.Column(scale=2):
            gr.Markdown("### 3. Visualization Results")
            html_output = gr.HTML(label="Token Visualization")
            stats_output = gr.Textbox(label="Statistics", interactive=False)

    # Wire the buttons to their handlers. Note visualize_tokens must return
    # exactly two values to match [html_output, stats_output].
    load_btn.click(
        fn=load_tokenizer,
        inputs=[model_dropdown],
        outputs=[load_status],
    )

    visualize_btn.click(
        fn=visualize_tokens,
        inputs=[text_input, model_dropdown],
        outputs=[html_output, stats_output],
    )


if __name__ == "__main__":
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ torch
2
+ transformers