# Model-card metadata: top-level descriptive fields plus a nested "config"
# mapping of numeric settings and feature flags.
model_card_metadata = {
    "license": "apache-2.0",
    "language": ["en"],
    "metrics": ["accuracy", "bertscore"],
    # NOTE(review): the key is singular but the value is a list -- confirm
    # that whatever consumes this mapping accepts a list here.
    "library_name": ["adapter-transformers", "transformers"],
    "model_name": "AutoModel",
    "model_type": "multimodal-transformer",
    "tags": ["multimodal", "transformer"],
    "datasets": ["dataset1", "dataset2"],
    "finetuned_from": "pretrained-model",
    "config": {
        # Keys without a modality prefix.
        "hidden_size": 768,
        "num_attention_heads": 12,
        "num_hidden_layers": 12,
        "intermediate_size": 2048,
        "hidden_dropout_prob": 0.1,
        "attention_probs_dropout_prob": 0.1,
        # image_* / patch_* keys.
        "image_size": 224,
        "image_channels": 3,
        "patch_size": 16,
        # Position / vocabulary sizes.
        "max_position_embeddings": 512,
        "vocab_size": 30522,
        "type_vocab_size": 2,
        # audio_* keys (units are not stated in this file).
        "audio_sample_rate": 16000,
        "audio_frame_size": 1024,
        "audio_hop_size": 512,
        # enable_* feature flags; all are switched on.
        "enable_vqa": True,
        "enable_caption": True,
        "enable_retrieval": True,
        "enable_asr": True,
        "enable_realtime_asr": True,
        # Batch / optimiser / schedule values.
        "batch_size": 32,
        "learning_rate": 0.0001,
        "weight_decay": 0.01,
        "warmup_steps": 10000,
        "max_steps": 100000,
    },
}