# Pick which vLLM stack runs via COMPOSE_PROFILES (or --profile). Set VLLM_OPENAI_MODEL in .env
# to match the served model (required for 27b; 9b defaults below if omitted):
#   COMPOSE_PROFILES=vllm-9b  -> QuantTrio/Qwen3.5-9B-AWQ (default VLLM_OPENAI_MODEL)
#   COMPOSE_PROFILES=vllm-27b -> set VLLM_OPENAI_MODEL=QuantTrio/Qwen3.5-27B-AWQ
# App uses http://vllm-inference:8000 (shared network alias on both vLLM services).
# Example CLI commands:
#   docker compose -f docker-compose_vllm.yml --profile vllm-9b up -d
#   docker compose -f docker-compose_vllm.yml --profile vllm-27b up -d

services:
  vllm-server-qwen35-9b:
    profiles: ["vllm-9b"]
    image: vllm/vllm-openai:latest
    shm_size: '8gb'
    # Args appended to the vLLM OpenAI server entrypoint; Compose shell-word-splits
    # this block scalar, so one flag per line is safe.
    command: |
      --model QuantTrio/Qwen3.5-9B-AWQ
      --gpu-memory-utilization 0.7
      --tensor-parallel-size 1
      --max-num-seqs 1
      --reasoning-parser qwen3
      --max-model-len 32768
      --speculative-config '{"method":"mtp","num_speculative_tokens":3}'
      --max-num-batched-tokens 2048
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    healthcheck:
      test: ["CMD-SHELL", "curl -fsS http://localhost:8000/v1/models >/dev/null || exit 1"]
      interval: 30s
      timeout: 15s
      retries: 8
      # Generous start window: first boot downloads + loads model weights.
      start_period: 1200s
    ports:
      - "8000:8000"
    volumes:
      - hf-model-cache:/root/.cache/huggingface
    networks:
      redaction-net-vllm:
        aliases:
          - vllm-inference

  vllm-server-qwen35-27b:
    profiles: ["vllm-27b"]
    image: vllm/vllm-openai:latest
    shm_size: '16gb'
    command: |
      --model QuantTrio/Qwen3.5-27B-AWQ
      --gpu-memory-utilization 0.94
      --tensor-parallel-size 1
      --max-num-seqs 2
      --reasoning-parser qwen3
      --max-model-len 16384
      --max-num-batched-tokens 4096
      --enforce-eager
      --kv-cache-dtype fp8
      --enable-chunked-prefill
      --enable-prefix-caching
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    healthcheck:
      test: ["CMD-SHELL", "curl -fsS http://localhost:8000/v1/models >/dev/null || exit 1"]
      interval: 30s
      timeout: 15s
      retries: 8
      start_period: 1200s
    ports:
      # Host port 8001 so both profiles can coexist on one host if needed.
      - "8001:8000"
    volumes:
      - hf-model-cache:/root/.cache/huggingface
    networks:
      redaction-net-vllm:
        aliases:
          - vllm-inference

  redaction-app-vllm:
    profiles: ["vllm-9b", "vllm-27b"]
    image: redaction-app-main
    build:
      context: .  # Look in the current folder
      dockerfile: Dockerfile  # Use this file
      target: gradio  # Use the 'gradio' stage from your Dockerfile
      args:
        # Pass your build-time variables here!
        - TORCH_GPU_ENABLED=False
        - INSTALL_VLM=False
        - PADDLE_GPU_ENABLED=True
        - INSTALL_PADDLEOCR=True
    shm_size: '8gb'
    # required: false lets the app start under either profile — only the vLLM
    # service selected by the active profile must be healthy (Compose v2.20+).
    depends_on:
      vllm-server-qwen35-9b:
        condition: service_healthy
        required: false
      vllm-server-qwen35-27b:
        condition: service_healthy
        required: false
    environment:
      - FLAGS_fraction_of_gpu_memory_to_use=0.05
      - RUN_FASTAPI=True
      - APP_MODE=fastapi
      - SHOW_PADDLE_MODEL_OPTIONS=True
      - SHOW_LOCAL_OCR_MODEL_OPTIONS=True
      - SHOW_INFERENCE_SERVER_PII_OPTIONS=True
      - SHOW_INFERENCE_SERVER_VLM_OPTIONS=True
      - SHOW_HYBRID_MODELS=True
      - SHOW_DIFFICULT_OCR_EXAMPLES=True
      - SHOW_ALL_OUTPUTS_IN_OUTPUT_FOLDER=True
      - SHOW_SUMMARISATION=True
      - SHOW_AWS_API_KEYS=True
      - DEFAULT_TEXT_EXTRACTION_MODEL=Local OCR model - PDFs without selectable text
      - DEFAULT_LOCAL_OCR_MODEL=paddle
      - DEFAULT_PII_DETECTION_MODEL=Local
      - CUSTOM_VLM_BACKEND=inference_vlm
      - MAX_WORKERS=12
      - TESSERACT_MAX_WORKERS=8
      - PADDLE_MAX_WORKERS=1  # Keep this to 1 to avoid VRAM overflow or errors
      - LOAD_PADDLE_AT_STARTUP=False
      - INFERENCE_SERVER_API_URL=http://vllm-inference:8000
      - DEFAULT_INFERENCE_SERVER_VLM_MODEL=${VLLM_OPENAI_MODEL:-QuantTrio/Qwen3.5-9B-AWQ}  # Change this to QuantTrio/Qwen3.5-27B-AWQ if running that model
      - DEFAULT_INFERENCE_SERVER_PII_MODEL=${VLLM_OPENAI_MODEL:-QuantTrio/Qwen3.5-9B-AWQ}  # Change this to QuantTrio/Qwen3.5-27B-AWQ if running that model
      - EFFICIENT_OCR=True
      - SHOW_CUSTOM_VLM_ENTITIES=True
      - SESSION_OUTPUT_FOLDER=True
      - SAVE_PAGE_OCR_VISUALISATIONS=False
      - HYBRID_OCR_CONFIDENCE_THRESHOLD=97
      - INCLUDE_OCR_VISUALISATION_IN_OUTPUT_FILES=True
      - PREPROCESS_LOCAL_OCR_IMAGES=False
      - INFERENCE_SERVER_DISABLE_THINKING=True
      - MAX_NEW_TOKENS=16384
      - SAVE_EXAMPLE_HYBRID_IMAGES=False
      - SAVE_VLM_INPUT_IMAGES=False
      - VLM_MAX_DPI=200.0
      - DEFAULT_NEW_BATCH_CHAR_COUNT=1250
      - REPORT_VLM_OUTPUTS_TO_GUI=True
      - REPORT_LLM_OUTPUTS_TO_GUI=True
      - ADD_VLM_BOUNDING_BOX_RULES=False
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    ports:
      - "7860:7860"
    networks:
      - redaction-net-vllm

networks:
  redaction-net-vllm:
    driver: bridge

volumes:
  # Shared Hugging Face cache so both vLLM profiles reuse downloaded weights.
  hf-model-cache: