tiny-random
/

gemma-4e

@@ -15,34 +15,73 @@ This tiny model is intended for debugging. It is randomly initialized using the
 ```python
 import torch
-from transformers import pipeline, AutoProcessor, AutoModelForCausalLM
 model_id = "tiny-random/gemma-4e"
 processor = AutoProcessor.from_pretrained(model_id)
 model = AutoModelForCausalLM.from_pretrained(
-    model_id,
-    dtype=torch.bfloat16,
-    device_map="auto"
 )
 messages = [
-    {"role": "system", "content": "You are a helpful assistant."},
-    {"role": "user",  "content": [
-        {"type": "audio", "audio": "https://raw.githubusercontent.com/google-gemma/cookbook/refs/heads/main/Demos/sample-data/journal1.wav"},
-        {"type": "text", "text": "Transcribe the following speech segment in its original language. Follow these specific instructions for formatting the answer:\n* Only output the transcription, with no newlines.\n* When transcribing numbers, write the digits, i.e. write 1.7 and not one point seven, and write 3 instead of three."},
-    ]},
 ]
-text = processor.apply_chat_template(
-    messages,
-    tokenize=False,
-    add_generation_prompt=True,
-    enable_thinking=True,
-)
-inputs = processor(text=text, return_tensors="pt").to(model.device)
 input_len = inputs["input_ids"].shape[-1]
-outputs = model.generate(**inputs, max_new_tokens=16)
-response = processor.decode(outputs[0][input_len:], skip_special_tokens=False)
-print(processor.parse_response(response))
 ```
 ### Code to create this repo:
@@ -55,9 +94,8 @@ import json
 from pathlib import Path
 import torch
 from huggingface_hub import file_exists, hf_hub_download
-# from timm.models.mobilenetv5 import decode_arch_def
 from transformers import (
     AutoConfig,
     AutoModelForCausalLM,
@@ -74,38 +112,53 @@ save_folder = "/tmp/tiny-random/gemma-4e"
 processor = AutoProcessor.from_pretrained(source_model_id)
 processor.save_pretrained(save_folder)
-with open(hf_hub_download(source_model_id, filename='config.json', repo_type='model'), 'r', encoding='utf-8') as f:
     config_json = json.load(f)
-config_json['audio_config'].update({
-    "num_attention_heads": 2,
-    "num_hidden_layers": 2,
-    "hidden_size": 64,
-    'output_proj_dims': 32,
-})
-config_json['text_config'].update({
-    "global_head_dim": 64,
-    "head_dim": 32,
-    "hidden_size": 8,
-    "hidden_size_per_layer_input": 2,
-    "intermediate_size": 64,
-    "layer_types": ['sliding_attention', 'full_attention', 'sliding_attention', 'full_attention'],
-    "num_attention_heads": 8,
-    "num_hidden_layers": 4,
-    "num_key_value_heads": 4,
-    "num_kv_shared_layers": 2,
-})
-config_json['vision_config'].update({
-    'num_hidden_layers': 2,
-    'hidden_size': 8,
-    'intermediate_size': 64,
-    'head_dim': 32,
-    'global_head_dim': 32,
-    'num_attention_heads': 4,
-    "num_key_value_heads": 4,
-})
-with open(f"{save_folder}/config.json", "w", encoding='utf-8') as f:
     json.dump(config_json, f, indent=2)
 config = AutoConfig.from_pretrained(
@@ -117,9 +170,12 @@ print(config)
 torch.set_default_dtype(torch.bfloat16)
 model = Gemma4ForConditionalGeneration(config)
 torch.set_default_dtype(torch.float32)
-if file_exists(filename="generation_config.json", repo_id=source_model_id, repo_type='model'):
     model.generation_config = GenerationConfig.from_pretrained(
-        source_model_id, trust_remote_code=True,
     )
 set_seed(42)
 model = model.cpu()
@@ -129,7 +185,7 @@ for name, p in sorted(model.named_parameters()):
 with torch.no_grad():
     for name, p in sorted(model.named_parameters()):
         torch.nn.init.normal_(p, 0, 0.2)
-        print(name, p.shape, f'{p.numel() / all_numels * 100: .4f}%')
 model.save_pretrained(save_folder)
 ```

 ```python
 import torch
+from transformers import AutoModelForCausalLM, AutoProcessor
 model_id = "tiny-random/gemma-4e"
 processor = AutoProcessor.from_pretrained(model_id)
 model = AutoModelForCausalLM.from_pretrained(
+    model_id, dtype=torch.bfloat16, device_map="auto"
 )
 messages = [
+    # System-message tokenization is currently buggy, so the system turn is commented out for now.
+    # {
+    #     "role": "system",
+    #     "content": [{"type": "text", "text": "You are a helpful assistant."}],
+    # },
+    {
+        "role": "user",
+        "content": [
+            {
+                "type": "audio",
+                "audio": "https://raw.githubusercontent.com/google-gemma/cookbook/refs/heads/main/Demos/sample-data/journal1.wav",
+            },
+            {"type": "text", "text": "Transcribe the following speech segment."},
+        ],
+    },
+    {
+        "role": "assistant",
+        "content": [{"type": "text", "text": "Dummy response for audio"}],
+    },
+    {
+        "role": "user",
+        "content": [
+            {
+                "type": "image",
+                "url": "https://raw.githubusercontent.com/google-gemma/cookbook/refs/heads/main/Demos/sample-data/GoldenGate.png",
+            },
+            {"type": "text", "text": "What is shown in this image?"},
+        ],
+    },
+    {
+        "role": "assistant",
+        "content": [{"type": "text", "text": "Dummy response for image"}],
+    },
+    {
+        "role": "user",
+        "content": [
+            {
+                "type": "video",
+                "video": "https://github.com/bebechien/gemma/raw/refs/heads/main/videos/ForBiggerBlazes.mp4",
+            },
+            {"type": "text", "text": "Describe this video."},
+        ],
+    },
 ]
+inputs = processor.apply_chat_template(
+    messages,
+    tokenize=True,
+    return_dict=True,
+    return_tensors="pt",
+    add_generation_prompt=True,
+).to(model.device)
 input_len = inputs["input_ids"].shape[-1]
+print("input_len:", input_len)
+outputs = model.generate(**inputs, max_new_tokens=32)
+response = processor.decode(outputs[0], skip_special_tokens=False)
+response = response.replace("<|audio|>", "A")
+response = response.replace("<|image|>", "I")
+response = response.replace("<|video|>", "V")
+print(response)
 ```
 ### Code to create this repo:
 from pathlib import Path
 import torch
 from huggingface_hub import file_exists, hf_hub_download
 from transformers import (
     AutoConfig,
     AutoModelForCausalLM,
 processor = AutoProcessor.from_pretrained(source_model_id)
 processor.save_pretrained(save_folder)
+with open(
+    hf_hub_download(source_model_id, filename="config.json", repo_type="model"),
+    "r",
+    encoding="utf-8",
+) as f:
     config_json = json.load(f)
+config_json["audio_config"].update(
+    {
+        "num_attention_heads": 2,
+        "num_hidden_layers": 2,
+        "hidden_size": 64,
+        "output_proj_dims": 32,
+    }
+)
+config_json["text_config"].update(
+    {
+        "global_head_dim": 64,
+        "head_dim": 32,
+        "hidden_size": 8,
+        "hidden_size_per_layer_input": 2,
+        "intermediate_size": 64,
+        "layer_types": [
+            "sliding_attention",
+            "full_attention",
+            "sliding_attention",
+            "full_attention",
+        ],
+        "num_attention_heads": 8,
+        "num_hidden_layers": 4,
+        "num_key_value_heads": 4,
+        "num_kv_shared_layers": 2,
+    }
+)
+config_json["vision_config"].update(
+    {
+        "num_hidden_layers": 2,
+        "hidden_size": 8,
+        "intermediate_size": 64,
+        "head_dim": 32,
+        "global_head_dim": 32,
+        "num_attention_heads": 4,
+        "num_key_value_heads": 4,
+    }
+)
+with open(f"{save_folder}/config.json", "w", encoding="utf-8") as f:
     json.dump(config_json, f, indent=2)
 config = AutoConfig.from_pretrained(
 torch.set_default_dtype(torch.bfloat16)
 model = Gemma4ForConditionalGeneration(config)
 torch.set_default_dtype(torch.float32)
+if file_exists(
+    filename="generation_config.json", repo_id=source_model_id, repo_type="model"
+):
     model.generation_config = GenerationConfig.from_pretrained(
+        source_model_id,
+        trust_remote_code=True,
     )
 set_seed(42)
 model = model.cpu()
 with torch.no_grad():
     for name, p in sorted(model.named_parameters()):
         torch.nn.init.normal_(p, 0, 0.2)
+        print(name, p.shape, f"{p.numel() / all_numels * 100: .4f}%")
 model.save_pretrained(save_folder)
 ```