""" finetune_owlv2.py — CLI for fine-tuning OWLv2 on a COCO-format dataset. Usage: uv run python scripts/finetune_owlv2.py uv run python scripts/finetune_owlv2.py --epochs 20 --lr 5e-5 uv run python scripts/finetune_owlv2.py --unfreeze-vision --backbone-lr 1e-5 uv run python scripts/finetune_owlv2.py --resume models/owlv2-finetuned/checkpoint-epoch-005 Recommended hardware: CUDA (Windows/Linux) — use fp16 for speed, set --device cuda MPS (Apple Silicon) — fp32 only, slower but functional for small datasets CPU — very slow, only for tiny sanity-check runs Typical first run: 1. make export # build data/labeled/coco_export.json 2. make finetune # train with defaults 3. Update app.py to load from models/owlv2-finetuned/best """ from __future__ import annotations import logging from pathlib import Path import click import torch from dotenv import load_dotenv load_dotenv() from autolabel.config import settings from autolabel.finetune import FinetuneConfig, run_finetune from autolabel.utils import setup_logging PROJECT_ROOT = Path(__file__).resolve().parent.parent DEFAULT_OUTPUT = PROJECT_ROOT / "models" / "owlv2-finetuned" @click.command() @click.option( "--coco-json", default=str(settings.labeled_dir / "coco_export.json"), show_default=True, type=click.Path(exists=True, path_type=Path), help="COCO JSON file produced by `make export`.", ) @click.option( "--image-dir", default=str(settings.raw_dir), show_default=True, type=click.Path(exists=True, file_okay=False, path_type=Path), help="Directory containing the source images (matched by file_name in COCO JSON).", ) @click.option( "--output-dir", default=str(DEFAULT_OUTPUT), show_default=True, type=click.Path(file_okay=False, path_type=Path), help="Directory to save checkpoints and the best model.", ) @click.option("--model", default=settings.model, show_default=True, help="Base model to fine-tune.") @click.option("--epochs", default=10, show_default=True, type=int) @click.option("--batch-size", default=1, show_default=True, type=int, help="Images per forward pass. Keep at 1 for OWLv2-large on ≤8 GB VRAM.") @click.option("--grad-accum", default=4, show_default=True, type=int, help="Gradient accumulation steps. Effective batch = batch_size * grad_accum.") @click.option("--lr", default=1e-4, show_default=True, type=float, help="Learning rate for detection heads.") @click.option("--val-split", default=0.2, show_default=True, type=float, help="Fraction of data to use for validation.") @click.option("--warmup-steps", default=50, show_default=True, type=int) @click.option("--save-every", default=1, show_default=True, type=int, help="Save a checkpoint every N epochs.") @click.option( "--unfreeze-vision", is_flag=True, default=False, help="Also fine-tune the ViT image encoder (needs more VRAM, slower).", ) @click.option( "--backbone-lr", default=1e-5, show_default=True, type=float, help="LR for the vision encoder when --unfreeze-vision is set.", ) @click.option( "--resume", default=None, type=click.Path(path_type=Path), help="Path to a saved checkpoint to resume from.", ) @click.option( "--device", default=settings.device, show_default=True, help="Torch device: cuda | mps | cpu.", ) @click.option("--verbose", "-v", is_flag=True, default=False) def main( coco_json: Path, image_dir: Path, output_dir: Path, model: str, epochs: int, batch_size: int, grad_accum: int, lr: float, val_split: float, warmup_steps: int, save_every: int, unfreeze_vision: bool, backbone_lr: float, resume: Path | None, device: str, verbose: bool, ) -> None: """Fine-tune OWLv2 on your labeled COCO dataset.""" setup_logging(logging.DEBUG if verbose else logging.INFO) dtype = torch.float16 if device == "cuda" else torch.float32 cfg = FinetuneConfig( coco_json=coco_json, image_dir=image_dir, output_dir=output_dir, model_name=model, device=device, torch_dtype=dtype, epochs=epochs, batch_size=batch_size, grad_accum_steps=grad_accum, lr=lr, backbone_lr=backbone_lr if unfreeze_vision else 0.0, val_split=val_split, warmup_steps=warmup_steps, save_every=save_every, unfreeze_vision=unfreeze_vision, resume_from=resume, ) click.echo(f"Fine-tuning OWLv2 on {coco_json}") click.echo(f" device : {device} ({dtype})") click.echo(f" epochs : {epochs}") click.echo(f" effective bs : {batch_size * grad_accum}") click.echo(f" heads lr : {lr}") click.echo(f" unfreeze ViT : {unfreeze_vision}") click.echo(f" output : {output_dir}") click.echo() run_finetune(cfg) if __name__ == "__main__": main()