{
  "model_name": "Lance",
  "model_size": "3B active parameters",
  "license": "apache-2.0",
  "organization": "bytedance-research",
  "description": "Lance is a lightweight native unified multimodal model trained from scratch for image and video understanding, generation, and editing in a single framework.",
  "supported_tasks": [
    "text-to-image generation",
    "text-to-video generation",
    "image editing",
    "video editing",
    "image understanding",
    "video understanding"
  ],
  "task_names": [
    "t2i",
    "t2v",
    "image_edit",
    "video_edit",
    "x2t_image",
    "x2t_video"
  ],
  "recommended_environment": {
    "python": ">=3.10",
    "cuda": ">=12.4",
    "gpu_memory": ">=40GB VRAM for inference"
  },
  "checkpoint_directories": [
    "Lance_3B",
    "Lance_3B_Video",
    "Qwen2.5-VL-ViT"
  ],
  "homepage": "https://huggingface.co/bytedance-research/Lance"
}