| import gradio as gr |
| from transformers import AutoTokenizer |
| import random |
| import colorsys |
| import html |
|
|
def get_distinct_colors(n):
    """Return *n* visually distinct hex color strings (e.g. ``"#b24747"``).

    Hues are spaced evenly around the HSV color wheel; saturation (0.6) and
    value (0.7) are fixed so white text stays readable on every swatch.
    Returns an empty list for ``n == 0``.
    """
    def _to_hex(hue):
        # Convert one HSV point to "#rrggbb"; truncation (int()) matches
        # the usual 0-255 channel mapping.
        rgb = colorsys.hsv_to_rgb(hue, 0.6, 0.7)
        return "#" + "".join(f"{int(channel * 255):02x}" for channel in rgb)

    return [_to_hex(index / n) for index in range(n)]
|
|
def tokenize_text(hf_model_id, text, token=None):
    """Tokenize *text* with the tokenizer of *hf_model_id* and color each token.

    Args:
        hf_model_id: HuggingFace Hub model id (e.g. ``"unsloth/gemma-3-27b-it"``).
        text: The input string to tokenize.
        token: Optional HuggingFace access token for gated models; an empty
            string from the UI textbox is treated as "no token".

    Returns:
        A ``(token_count, html_string)`` tuple. On failure, ``(0, error_html)``
        so the ``gr.Number`` output always receives a number and the error is
        shown in the HTML pane.
    """
    try:
        # The from_pretrained kwarg is `token`, not `access_token` — the old
        # name was silently ignored, so gated-model auth never worked.
        tokenizer = AutoTokenizer.from_pretrained(hf_model_id, token=token or None)
        tokens = tokenizer.tokenize(text)
        token_count = len(tokens)
        colors = get_distinct_colors(token_count)
        colored_tokens = []
        # Loop variable renamed from `token` to avoid shadowing the credential
        # parameter of the same name.
        for i, tok in enumerate(tokens):
            # BPE tokenizers mark a leading space with 'Ġ'; make it visible,
            # then escape so token text cannot inject HTML.
            display_token = html.escape(tok.replace('Ġ', '<space>'))
            colored_tokens.append(f'<span style="background-color: {colors[i]}; color: white; padding: 2px 4px; border-radius: 3px; margin: 2px; display: inline-block;">{display_token}</span>')
        return token_count, "".join(colored_tokens)
    except Exception as e:
        # Report the failure in the HTML pane (escaped); returning a string
        # for the count would break the gr.Number component.
        return 0, f"Error: {html.escape(str(e))}"
|
|
# Wire the tokenizer function to a simple 3-input / 2-output Gradio UI.
demo = gr.Interface(
    fn=tokenize_text,
    inputs=[
        # Model id defaults to a concrete example so the demo works on load.
        gr.Textbox(label="Hugging Face Model ID", placeholder="unsloth/gemma-3-27b-it", value="unsloth/gemma-3-27b-it"),
        gr.Textbox(label="Text to Tokenize", lines=5, placeholder="Enter your text here..."),
        # Optional credential for gated models; empty string when omitted.
        gr.Textbox(label="HuggingFace Token (optional)", placeholder="hf_...", lines=1)
    ],
    outputs=[
        # Positional outputs match tokenize_text's (count, html) return tuple.
        gr.Number(label="Token Count"),
        gr.HTML(label="Tokens", container=True, show_label=True)
    ],
    title="HuggingFace Tokenizer",
    description="Enter a HuggingFace model ID and text to see how it gets tokenized. Provide a huggingface token if the model is gated.",
    # NOTE(review): `allow_flagging` is deprecated in Gradio 4.x in favor of
    # `flagging_mode` — confirm against the pinned gradio version.
    allow_flagging="never"
)


# Start the local Gradio server (blocking call).
demo.launch()