{
  "model_name": "Lance",
  "model_size": "3B active parameters",
  "license": "apache-2.0",
  "organization": "bytedance-research",
  "description": "Lance is a lightweight native unified multimodal model trained from scratch for image and video understanding, generation, and editing in a single framework.",
  "supported_tasks": [
    "text-to-image generation",
    "text-to-video generation",
    "image editing",
    "video editing",
    "image understanding",
    "video understanding"
  ],
  "task_names": [
    "t2i",
    "t2v",
    "image_edit",
    "video_edit",
    "x2t_image",
    "x2t_video"
  ],
  "recommended_environment": {
    "python": ">=3.10",
    "cuda": ">=12.4",
    "gpu_memory": ">=40GB VRAM for inference"
  },
  "checkpoint_directories": [
    "Lance_3B",
    "Lance_3B_Video",
    "Qwen2.5-VL-ViT"
  ],
  "homepage": "https://huggingface.co/bytedance-research/Lance"
}