{
  "model_name": "Lance",
  "model_size": "3B active parameters",
  "license": "apache-2.0",
  "organization": "bytedance-research",
  "description": "Lance is a lightweight native unified multimodal model trained from scratch for image and video understanding, generation, and editing in a single framework.",
  "supported_tasks": [
    "text-to-image generation",
    "text-to-video generation",
    "image editing",
    "video editing",
    "image understanding",
    "video understanding"
  ],
  "task_names": [
    "t2i",
    "t2v",
    "image_edit",
    "video_edit",
    "x2t_image",
    "x2t_video"
  ],
  "recommended_environment": {
    "python": ">=3.10",
    "cuda": ">=12.4",
    "gpu_memory": ">=40GB VRAM for inference"
  },
  "checkpoint_directories": [
    "Lance_3B",
    "Lance_3B_Video",
    "Qwen2.5-VL-ViT"
  ],
  "homepage": "https://huggingface.co/bytedance-research/Lance"
}