# Pick which GGUF model runs by setting COMPOSE_PROFILES in .env (or pass --profile):
# COMPOSE_PROFILES=35b -> qwen35-35b_q4_gguf
# COMPOSE_PROFILES=27b -> qwen35-27b_q4_gguf
# The app always talks to http://llama-inference:8080 (shared network alias on both model services).
# Each model service uses its own llama.cpp and Hugging Face hub cache volumes so mmproj-F16.gguf
# (same filename per repo) and -hf downloads are not shared across profiles.
# Example CLI commands:
# docker compose -f docker-compose_llama.yml --profile 35b_36 up -d
# docker compose -f docker-compose_llama.yml --profile 27b_36 up -d
# docker compose -f docker-compose_llama.yml --profile 35b up -d
# docker compose -f docker-compose_llama.yml --profile 27b up -d
# docker compose -f docker-compose_llama.yml --profile 9b up -d
# docker compose -f docker-compose_llama.yml --profile gemma4-31b up -d
# docker compose -f docker-compose_llama.yml --profile gemma4-26b up -d
# Add --build to the above if you want to rebuild the app image.

services:
  # Qwen 3.6 27B model setup below requires 24GB of VRAM to run.
qwen36-27b_q4_gguf: profiles: ["27b_36"] image: ghcr.io/ggml-org/llama.cpp:server-cuda12 command: - -hf - unsloth/Qwen3.6-27B-GGUF - --hf-file - Qwen3.6-27B-UD-Q4_K_XL.gguf - --mmproj-url - https://huggingface.co/unsloth/Qwen3.6-27B-GGUF/resolve/main/mmproj-F16.gguf - --n-gpu-layers - "999" - --ctx-size - "32768" - --fit - "off" - --temp - "0.7" - --top-k - "20" - --top-p - "0.8" - --min-p - "0.0" - --frequency-penalty - "1" - --presence-penalty - "1" - --host - "0.0.0.0" - --port - "8080" - --no-warmup - --seed - "42" - --image_min_tokens - "300" - --image_max_tokens - "2000" # To fit within batch size 2048 (default) ports: - "8000:8080" volumes: - ./models:/models - hf-llama-cache-qwen36-27b:/root/.cache/llama.cpp - hf-hub-cache-qwen36-27b:/root/.cache/huggingface pull_policy: always deploy: resources: reservations: devices: - driver: nvidia count: all capabilities: [gpu] healthcheck: test: ["CMD-SHELL", "curl -fsS http://localhost:8080/v1/models >/dev/null || exit 1"] interval: 30s timeout: 15s retries: 8 start_period: 1200s networks: redaction-net-llama: aliases: - llama-inference # Qwen 3.6 35B model setup below requires 24GB of VRAM with n-cpu-moe set to 0. For lower VRAM systems, n-cpu-moe ~ 40 could work for a 12GB VRAM system, and n-cpu-moe ~ 20 for a 16GB VRAM system. 
qwen36-35b_q4_gguf: profiles: ["35b_36"] image: ghcr.io/ggml-org/llama.cpp:server-cuda12 command: - -hf - unsloth/Qwen3.6-35B-A3B-GGUF - --hf-file - Qwen3.6-35B-A3B-UD-IQ4_NL.gguf - --mmproj-url - https://huggingface.co/unsloth/Qwen3.6-35B-A3B-GGUF/resolve/main/mmproj-F16.gguf - --n-gpu-layers - "999" - --ctx-size - "32768" - --fit - "off" - --temp - "0.7" - --top-k - "20" - --top-p - "0.8" - --min-p - "0.0" - --frequency-penalty - "1" - --presence-penalty - "1" - --host - "0.0.0.0" - --port - "8080" - --no-warmup - --seed - "42" - --n-cpu-moe - "0" # Increase this value to fit within your available VRAM - --image_min_tokens - "300" - --image_max_tokens - "2000" # To fit within batch size 2048 (default) ports: - "8005:8080" volumes: - ./models:/models - hf-llama-cache-qwen36-35b:/root/.cache/llama.cpp - hf-hub-cache-qwen36-35b:/root/.cache/huggingface pull_policy: always deploy: resources: reservations: devices: - driver: nvidia count: all capabilities: [gpu] healthcheck: test: ["CMD-SHELL", "curl -fsS http://localhost:8080/v1/models >/dev/null || exit 1"] interval: 30s timeout: 15s retries: 8 start_period: 1200s networks: redaction-net-llama: aliases: - llama-inference # Qwen 3.5 35B model setup below requires 24GB of VRAM with n-cpu-moe set to 0. For lower VRAM systems, n-cpu-moe ~ 40 could work for a 12GB VRAM system, and n-cpu-moe ~ 20 for a 16GB VRAM system. 
qwen35-35b_q4_gguf: profiles: ["35b"] image: ghcr.io/ggml-org/llama.cpp:server-cuda12 command: - -hf - unsloth/Qwen3.5-35B-A3B-GGUF - --hf-file - Qwen3.5-35B-A3B-UD-IQ4_NL.gguf - --mmproj-url - https://huggingface.co/unsloth/Qwen3.5-35B-A3B-GGUF/resolve/main/mmproj-F16.gguf - --n-gpu-layers - "999" - --ctx-size - "32768" - --fit - "off" - --temp - "0.7" - --top-k - "20" - --top-p - "0.8" - --min-p - "0.0" - --frequency-penalty - "1" - --presence-penalty - "1" - --host - "0.0.0.0" - --port - "8080" - --no-warmup - --seed - "42" - --n-cpu-moe - "0" # Increase this value to fit within your available VRAM - --image_min_tokens - "300" - --image_max_tokens - "2000" # To fit within batch size 2048 (default) ports: - "8001:8080" volumes: - ./models:/models - hf-llama-cache-qwen35-35b:/root/.cache/llama.cpp - hf-hub-cache-qwen35-35b:/root/.cache/huggingface pull_policy: always deploy: resources: reservations: devices: - driver: nvidia count: all capabilities: [gpu] healthcheck: test: ["CMD-SHELL", "curl -fsS http://localhost:8080/v1/models >/dev/null || exit 1"] interval: 30s timeout: 15s retries: 8 start_period: 1200s networks: redaction-net-llama: aliases: - llama-inference # Qwen 3.5 27B model setup below requires 24GB of VRAM to run. 
qwen35-27b_q4_gguf: profiles: ["27b"] image: ghcr.io/ggml-org/llama.cpp:server-cuda12 command: - -hf - unsloth/Qwen3.5-27B-GGUF - --hf-file - Qwen3.5-27B-UD-Q4_K_XL.gguf - --mmproj-url - https://huggingface.co/unsloth/Qwen3.5-27B-GGUF/resolve/main/mmproj-F16.gguf - --n-gpu-layers - "999" - --ctx-size - "32768" - --fit - "off" - --temp - "0.7" - --top-k - "20" - --top-p - "0.8" - --min-p - "0.0" - --frequency-penalty - "1" - --presence-penalty - "1" - --host - "0.0.0.0" - --port - "8080" - --no-warmup - --seed - "42" - --image_min_tokens - "300" - --image_max_tokens - "2000" # To fit within batch size 2048 (default) ports: - "8000:8080" volumes: - ./models:/models - hf-llama-cache-qwen35-27b:/root/.cache/llama.cpp - hf-hub-cache-qwen35-27b:/root/.cache/huggingface pull_policy: always deploy: resources: reservations: devices: - driver: nvidia count: all capabilities: [gpu] healthcheck: test: ["CMD-SHELL", "curl -fsS http://localhost:8080/v1/models >/dev/null || exit 1"] interval: 30s timeout: 15s retries: 8 start_period: 1200s networks: redaction-net-llama: aliases: - llama-inference qwen9b_q4_gguf: profiles: ["9b"] image: ghcr.io/ggml-org/llama.cpp:server-cuda12 command: - -hf - unsloth/Qwen3.5-9B-GGUF - --hf-file - Qwen3.5-9B-UD-Q4_K_XL.gguf - --mmproj-url - https://huggingface.co/unsloth/Qwen3.5-9B-A3B-GGUF/resolve/main/mmproj-F16.gguf - --n-gpu-layers - "999" - --ctx-size - "16384" - --fit - "off" - --temp - "0.7" - --top-k - "20" - --top-p - "0.8" - --min-p - "0.0" - --frequency-penalty - "1" - --presence-penalty - "1" - --host - "0.0.0.0" - --port - "8080" - --no-warmup - --seed - "42" - --n-cpu-moe - "0" # Increase this value to fit within your availableVRAM - --cache-type-k - "q8_0" - --cache-type-v - "q8_0" - --image_min_tokens - "300" - --image_max_tokens - "2000" # To fit within batch size 2048 (default) ports: - "8003:8080" volumes: - ./models:/models - hf-llama-cache-qwen9b:/root/.cache/llama.cpp - hf-hub-cache-qwen9b:/root/.cache/huggingface 
pull_policy: always deploy: resources: reservations: devices: - driver: nvidia count: all capabilities: [gpu] healthcheck: test: ["CMD-SHELL", "curl -fsS http://localhost:8080/v1/models >/dev/null || exit 1"] # Gemma 4 31B model setup below requires 24GB+ of VRAM to run. gemma4-31b_q4_gguf: profiles: ["gemma4-31b"] image: ghcr.io/ggml-org/llama.cpp:server-cuda12 command: - -hf - unsloth/gemma-4-31B-it-GGUF - --hf-file - gemma-4-31B-it-IQ4_NL.gguf - --mmproj-url - https://huggingface.co/unsloth/gemma-4-31B-it-GGUF/resolve/main/mmproj-F16.gguf - --n-gpu-layers - "999" - --ctx-size - "16384" - --fit - "off" - --temp - "1.0" - --top-k - "64" - --top-p - "0.95" - --host - "0.0.0.0" - --port - "8080" - --no-warmup - --seed - "42" - -np - "1" - --cache-type-k - "q8_0" - --cache-type-v - "q8_0" - --image_min_tokens - "300" - --image_max_tokens - "2000" # To fit within batch size 2048 (default) ports: - "8002:8080" volumes: - ./models:/models - hf-llama-cache-gemma4-31b:/root/.cache/llama.cpp - hf-hub-cache-gemma4-31b:/root/.cache/huggingface pull_policy: always deploy: resources: reservations: devices: - driver: nvidia count: all capabilities: [gpu] healthcheck: test: ["CMD-SHELL", "curl -fsS http://localhost:8080/v1/models >/dev/null || exit 1"] interval: 30s timeout: 15s retries: 8 start_period: 1200s networks: redaction-net-llama: aliases: - llama-inference # Gemma 4 26B model setup below requires 24GB+ of VRAM to run. 
gemma4-26b_q4_gguf: profiles: ["gemma4-26b"] image: ghcr.io/ggml-org/llama.cpp:server-cuda12 command: - -hf - unsloth/gemma-4-26B-A4B-it-GGUF - --hf-file - gemma-4-26B-A4B-it-UD-IQ4_NL.gguf - --mmproj-url - https://huggingface.co/unsloth/gemma-4-26B-A4B-it-GGUF/resolve/main/mmproj-F16.gguf - --n-gpu-layers - "999" - --ctx-size - "65536" - --fit - "off" - --temp - "1.0" - --top-k - "64" - --top-p - "0.95" - --host - "0.0.0.0" - --port - "8080" - --no-warmup - --seed - "42" - -np - "1" - --cache-type-k - "q8_0" - --cache-type-v - "q8_0" - --image_min_tokens - "300" - --image_max_tokens - "2000" # To fit within batch size 2048 (default) ports: - "8002:8080" volumes: - ./models:/models - hf-llama-cache-gemma4-26b:/root/.cache/llama.cpp - hf-hub-cache-gemma4-26b:/root/.cache/huggingface pull_policy: always deploy: resources: reservations: devices: - driver: nvidia count: all capabilities: [gpu] healthcheck: test: ["CMD-SHELL", "curl -fsS http://localhost:8080/v1/models >/dev/null || exit 1"] interval: 30s timeout: 15s retries: 8 start_period: 1200s networks: redaction-net-llama: aliases: - llama-inference redaction-app-llama: profiles: ["35b_36", "27b_36", "35b", "27b", "9b", "gemma4-31b", "gemma4-26b"] image: redaction-app-main build: context: . # Look in the current folder dockerfile: Dockerfile # Use this file target: gradio # Use the 'gradio' stage from your Dockerfile args: # Pass your build-time variables here! 
- TORCH_GPU_ENABLED=False - INSTALL_VLM=False - PADDLE_GPU_ENABLED=True - INSTALL_PADDLEOCR=True shm_size: '8gb' depends_on: qwen36-35b_q4_gguf: condition: service_healthy required: false qwen36-27b_q4_gguf: condition: service_healthy required: false qwen35-35b_q4_gguf: condition: service_healthy required: false qwen35-27b_q4_gguf: condition: service_healthy required: false qwen9b_q4_gguf: condition: service_healthy required: false gemma4-31b_q4_gguf: condition: service_healthy required: false gemma4-26b_q4_gguf: condition: service_healthy required: false environment: - FLAGS_fraction_of_gpu_memory_to_use=0.05 - RUN_FASTAPI=True - APP_MODE=fastapi - SHOW_PADDLE_MODEL_OPTIONS=True - SHOW_LOCAL_OCR_MODEL_OPTIONS=True - SHOW_LOCAL_PII_DETECTION_OPTIONS=True - SHOW_INFERENCE_SERVER_PII_OPTIONS=True - SHOW_INFERENCE_SERVER_VLM_OPTIONS=True - SHOW_HYBRID_MODELS=True - SHOW_DIFFICULT_OCR_EXAMPLES=True - SHOW_ALL_OUTPUTS_IN_OUTPUT_FOLDER=True - SHOW_SUMMARISATION=True - SHOW_AWS_API_KEYS=True - DEFAULT_TEXT_EXTRACTION_MODEL=Local OCR model - PDFs without selectable text - DEFAULT_LOCAL_OCR_MODEL=paddle - DEFAULT_PII_DETECTION_MODEL=Local - INFERENCE_SERVER_API_URL=http://llama-inference:8080 - DEFAULT_INFERENCE_SERVER_VLM_MODEL="" - DEFAULT_INFERENCE_SERVER_PII_MODEL="" - CUSTOM_VLM_BACKEND=inference_vlm - MAX_WORKERS=12 - TESSERACT_MAX_WORKERS=8 - PADDLE_MAX_WORKERS=1 # Keep this to 1 to avoid VRAM overflow or errors - LOAD_PADDLE_AT_STARTUP=False - EFFICIENT_OCR=True - SHOW_CUSTOM_VLM_ENTITIES=True - SESSION_OUTPUT_FOLDER=True - SAVE_PAGE_OCR_VISUALISATIONS=False - HYBRID_OCR_CONFIDENCE_THRESHOLD=97 - INCLUDE_OCR_VISUALISATION_IN_OUTPUT_FILES=True - PREPROCESS_LOCAL_OCR_IMAGES=False - INFERENCE_SERVER_DISABLE_THINKING=True - MAX_NEW_TOKENS=16384 - SAVE_EXAMPLE_HYBRID_IMAGES=False - SAVE_VLM_INPUT_IMAGES=False - VLM_MAX_DPI=200.0 - DEFAULT_NEW_BATCH_CHAR_COUNT=1250 - REPORT_VLM_OUTPUTS_TO_GUI=True - REPORT_LLM_OUTPUTS_TO_GUI=True - ADD_VLM_BOUNDING_BOX_RULES=False deploy: 
resources: reservations: devices: - driver: nvidia count: all capabilities: [gpu] ports: - "7861:7860" networks: - redaction-net-llama networks: redaction-net-llama: driver: bridge volumes: hf-llama-cache-qwen36-35b: hf-llama-cache-qwen36-27b: hf-llama-cache-qwen35-35b: hf-llama-cache-qwen35-27b: hf-llama-cache-qwen9b: hf-llama-cache-gemma4-31b: hf-llama-cache-gemma4-26b: hf-hub-cache-qwen36-35b: hf-hub-cache-qwen35-35b: hf-hub-cache-qwen35-27b: hf-hub-cache-qwen36-27b: hf-hub-cache-qwen9b: hf-hub-cache-gemma4-31b: hf-hub-cache-gemma4-26b: