{ "model_name": "Lance", "model_size": "3B active parameters", "license": "apache-2.0", "organization": "bytedance-research", "description": "Lance is a lightweight native unified multimodal model trained from scratch for image and video understanding, generation, and editing in a single framework.", "supported_tasks": [ "text-to-image generation", "text-to-video generation", "image editing", "video editing", "image understanding", "video understanding" ], "task_names": [ "t2i", "t2v", "image_edit", "video_edit", "x2t_image", "x2t_video" ], "recommended_environment": { "python": ">=3.10", "cuda": ">=12.4", "gpu_memory": ">=40GB VRAM for inference" }, "checkpoint_directories": [ "Lance_3B", "Lance_3B_Video", "Qwen2.5-VL-ViT" ], "homepage": "https://huggingface.co/bytedance-research/Lance" }