# Pick which GGUF model runs by setting COMPOSE_PROFILES in .env (or pass --profile):
# COMPOSE_PROFILES=35b -> qwen35-35b_q4_gguf
# COMPOSE_PROFILES=27b -> qwen35-27b_q4_gguf
# The app always talks to http://llama-inference:8080 (shared network alias on both model services).
# Each model service uses its own llama.cpp and Hugging Face hub cache volumes so mmproj-F16.gguf
# (same filename per repo) and -hf downloads are not shared across profiles.
# Example CLI commands:
# docker compose -f docker-compose_llama.yml --profile 35b_36 up -d
# docker compose -f docker-compose_llama.yml --profile 27b_36 up -d
# docker compose -f docker-compose_llama.yml --profile 35b up -d
# docker compose -f docker-compose_llama.yml --profile 27b up -d
# docker compose -f docker-compose_llama.yml --profile 9b up -d
# docker compose -f docker-compose_llama.yml --profile gemma4-31b up -d
# docker compose -f docker-compose_llama.yml --profile gemma4-26b up -d
# Add --build to the above if you want to rebuild the app image.

services:
  # Qwen 3.6 27B model setup below requires 24GB of VRAM to run.
qwen36-27b_q4_gguf: profiles: ["27b_36"] image: ghcr.io/ggml-org/llama.cpp:server-cuda12 command: - -hf - unsloth/Qwen3.6-27B-GGUF - --hf-file - Qwen3.6-27B-UD-Q4_K_XL.gguf - --mmproj-url - https://huggingface.co/unsloth/Qwen3.6-27B-GGUF/resolve/main/mmproj-F16.gguf - --n-gpu-layers - "999" - --ctx-size - "32768" - --fit - "off" - --temp - "0.7" - --top-k - "20" - --top-p - "0.8" - --min-p - "0.0" - --frequency-penalty - "1" - --presence-penalty - "1" - --host - "0.0.0.0" - --port - "8080" - --no-warmup - --seed - "42" - --image_min_tokens - "300" - --image_max_tokens - "2000" # To fit within batch size 2048 (default) ports: - "8000:8080" volumes: - ./models:/models - hf-llama-cache-qwen36-27b:/root/.cache/llama.cpp - hf-hub-cache-qwen36-27b:/root/.cache/huggingface pull_policy: always deploy: resources: reservations: devices: - driver: nvidia count: all capabilities: [gpu] healthcheck: test: ["CMD-SHELL", "curl -fsS http://localhost:8080/v1/models >/dev/null || exit 1"] interval: 30s timeout: 15s retries: 8 start_period: 1200s networks: redaction-net-llama: aliases: - llama-inference # Qwen 3.6 35B model setup below requires 24GB of VRAM with n-cpu-moe set to 0. For lower VRAM systems, n-cpu-moe ~ 40 could work for a 12GB VRAM system, and n-cpu-moe ~ 20 for a 16GB VRAM system. 
qwen36-35b_q4_gguf: profiles: ["35b_36"] image: ghcr.io/ggml-org/llama.cpp:server-cuda12 command: - -hf - unsloth/Qwen3.6-35B-A3B-GGUF - --hf-file - Qwen3.6-35B-A3B-UD-IQ4_NL.gguf - --mmproj-url - https://huggingface.co/unsloth/Qwen3.6-35B-A3B-GGUF/resolve/main/mmproj-F16.gguf - --n-gpu-layers - "999" - --ctx-size - "32768" - --fit - "off" - --temp - "0.7" - --top-k - "20" - --top-p - "0.8" - --min-p - "0.0" - --frequency-penalty - "1" - --presence-penalty - "1" - --host - "0.0.0.0" - --port - "8080" - --no-warmup - --seed - "42" - --n-cpu-moe - "0" # Increase this value to fit within your available VRAM - --image_min_tokens - "300" - --image_max_tokens - "2000" # To fit within batch size 2048 (default) ports: - "8005:8080" volumes: - ./models:/models - hf-llama-cache-qwen36-35b:/root/.cache/llama.cpp - hf-hub-cache-qwen36-35b:/root/.cache/huggingface pull_policy: always deploy: resources: reservations: devices: - driver: nvidia count: all capabilities: [gpu] healthcheck: test: ["CMD-SHELL", "curl -fsS http://localhost:8080/v1/models >/dev/null || exit 1"] interval: 30s timeout: 15s retries: 8 start_period: 1200s networks: redaction-net-llama: aliases: - llama-inference # Qwen 3.5 35B model setup below requires 24GB of VRAM with n-cpu-moe set to 0. For lower VRAM systems, n-cpu-moe ~ 40 could work for a 12GB VRAM system, and n-cpu-moe ~ 20 for a 16GB VRAM system. 
qwen35-35b_q4_gguf: profiles: ["35b"] image: ghcr.io/ggml-org/llama.cpp:server-cuda12 command: - -hf - unsloth/Qwen3.5-35B-A3B-GGUF - --hf-file - Qwen3.5-35B-A3B-UD-IQ4_NL.gguf - --mmproj-url - https://huggingface.co/unsloth/Qwen3.5-35B-A3B-GGUF/resolve/main/mmproj-F16.gguf - --n-gpu-layers - "999" - --ctx-size - "32768" - --fit - "off" - --temp - "0.7" - --top-k - "20" - --top-p - "0.8" - --min-p - "0.0" - --frequency-penalty - "1" - --presence-penalty - "1" - --host - "0.0.0.0" - --port - "8080" - --no-warmup - --seed - "42" - --n-cpu-moe - "0" # Increase this value to fit within your available VRAM - --image_min_tokens - "300" - --image_max_tokens - "2000" # To fit within batch size 2048 (default) ports: - "8001:8080" volumes: - ./models:/models - hf-llama-cache-qwen35-35b:/root/.cache/llama.cpp - hf-hub-cache-qwen35-35b:/root/.cache/huggingface pull_policy: always deploy: resources: reservations: devices: - driver: nvidia count: all capabilities: [gpu] healthcheck: test: ["CMD-SHELL", "curl -fsS http://localhost:8080/v1/models >/dev/null || exit 1"] interval: 30s timeout: 15s retries: 8 start_period: 1200s networks: redaction-net-llama: aliases: - llama-inference # Qwen 3.5 27B model setup below requires 24GB of VRAM to run. 
qwen35-27b_q4_gguf: profiles: ["27b"] image: ghcr.io/ggml-org/llama.cpp:server-cuda12 command: - -hf - unsloth/Qwen3.5-27B-GGUF - --hf-file - Qwen3.5-27B-UD-Q4_K_XL.gguf - --mmproj-url - https://huggingface.co/unsloth/Qwen3.5-27B-GGUF/resolve/main/mmproj-F16.gguf - --n-gpu-layers - "999" - --ctx-size - "32768" - --fit - "off" - --temp - "0.7" - --top-k - "20" - --top-p - "0.8" - --min-p - "0.0" - --frequency-penalty - "1" - --presence-penalty - "1" - --host - "0.0.0.0" - --port - "8080" - --no-warmup - --seed - "42" - --image_min_tokens - "300" - --image_max_tokens - "2000" # To fit within batch size 2048 (default) ports: - "8000:8080" volumes: - ./models:/models - hf-llama-cache-qwen35-27b:/root/.cache/llama.cpp - hf-hub-cache-qwen35-27b:/root/.cache/huggingface pull_policy: always deploy: resources: reservations: devices: - driver: nvidia count: all capabilities: [gpu] healthcheck: test: ["CMD-SHELL", "curl -fsS http://localhost:8080/v1/models >/dev/null || exit 1"] interval: 30s timeout: 15s retries: 8 start_period: 1200s networks: redaction-net-llama: aliases: - llama-inference qwen9b_q4_gguf: profiles: ["9b"] image: ghcr.io/ggml-org/llama.cpp:server-cuda12 command: - -hf - unsloth/Qwen3.5-9B-GGUF - --hf-file - Qwen3.5-9B-UD-Q4_K_XL.gguf - --mmproj-url - https://huggingface.co/unsloth/Qwen3.5-9B-A3B-GGUF/resolve/main/mmproj-F16.gguf - --n-gpu-layers - "999" - --ctx-size - "16384" - --fit - "off" - --temp - "0.7" - --top-k - "20" - --top-p - "0.8" - --min-p - "0.0" - --frequency-penalty - "1" - --presence-penalty - "1" - --host - "0.0.0.0" - --port - "8080" - --no-warmup - --seed - "42" - --n-cpu-moe - "0" # Increase this value to fit within your availableVRAM - --cache-type-k - "q8_0" - --cache-type-v - "q8_0" - --image_min_tokens - "300" - --image_max_tokens - "2000" # To fit within batch size 2048 (default) ports: - "8003:8080" volumes: - ./models:/models - hf-llama-cache-qwen9b:/root/.cache/llama.cpp - hf-hub-cache-qwen9b:/root/.cache/huggingface 
pull_policy: always deploy: resources: reservations: devices: - driver: nvidia count: all capabilities: [gpu] healthcheck: test: ["CMD-SHELL", "curl -fsS http://localhost:8080/v1/models >/dev/null || exit 1"] # Gemma 4 31B model setup below requires 24GB+ of VRAM to run. gemma4-31b_q4_gguf: profiles: ["gemma4-31b"] image: ghcr.io/ggml-org/llama.cpp:server-cuda12 command: - -hf - unsloth/gemma-4-31B-it-GGUF - --hf-file - gemma-4-31B-it-IQ4_NL.gguf - --mmproj-url - https://huggingface.co/unsloth/gemma-4-31B-it-GGUF/resolve/main/mmproj-F16.gguf - --n-gpu-layers - "999" - --ctx-size - "16384" - --fit - "off" - --temp - "1.0" - --top-k - "64" - --top-p - "0.95" - --host - "0.0.0.0" - --port - "8080" - --no-warmup - --seed - "42" - -np - "1" - --cache-type-k - "q8_0" - --cache-type-v - "q8_0" - --image_min_tokens - "300" - --image_max_tokens - "2000" # To fit within batch size 2048 (default) ports: - "8002:8080" volumes: - ./models:/models - hf-llama-cache-gemma4-31b:/root/.cache/llama.cpp - hf-hub-cache-gemma4-31b:/root/.cache/huggingface pull_policy: always deploy: resources: reservations: devices: - driver: nvidia count: all capabilities: [gpu] healthcheck: test: ["CMD-SHELL", "curl -fsS http://localhost:8080/v1/models >/dev/null || exit 1"] interval: 30s timeout: 15s retries: 8 start_period: 1200s networks: redaction-net-llama: aliases: - llama-inference # Gemma 4 26B model setup below requires 24GB+ of VRAM to run. 
gemma4-26b_q4_gguf: profiles: ["gemma4-26b"] image: ghcr.io/ggml-org/llama.cpp:server-cuda12 command: - -hf - unsloth/gemma-4-26B-A4B-it-GGUF - --hf-file - gemma-4-26B-A4B-it-UD-IQ4_NL.gguf - --mmproj-url - https://huggingface.co/unsloth/gemma-4-26B-A4B-it-GGUF/resolve/main/mmproj-F16.gguf - --n-gpu-layers - "999" - --ctx-size - "65536" - --fit - "off" - --temp - "1.0" - --top-k - "64" - --top-p - "0.95" - --host - "0.0.0.0" - --port - "8080" - --no-warmup - --seed - "42" - -np - "1" - --cache-type-k - "q8_0" - --cache-type-v - "q8_0" - --image_min_tokens - "300" - --image_max_tokens - "2000" # To fit within batch size 2048 (default) ports: - "8002:8080" volumes: - ./models:/models - hf-llama-cache-gemma4-26b:/root/.cache/llama.cpp - hf-hub-cache-gemma4-26b:/root/.cache/huggingface pull_policy: always deploy: resources: reservations: devices: - driver: nvidia count: all capabilities: [gpu] healthcheck: test: ["CMD-SHELL", "curl -fsS http://localhost:8080/v1/models >/dev/null || exit 1"] interval: 30s timeout: 15s retries: 8 start_period: 1200s networks: redaction-net-llama: aliases: - llama-inference redaction-app-llama: profiles: ["35b_36", "27b_36", "35b", "27b", "9b", "gemma4-31b", "gemma4-26b"] image: redaction-app-main build: context: . # Look in the current folder dockerfile: Dockerfile # Use this file target: gradio # Use the 'gradio' stage from your Dockerfile args: # Pass your build-time variables here! 
- TORCH_GPU_ENABLED=False - INSTALL_VLM=False - PADDLE_GPU_ENABLED=True - INSTALL_PADDLEOCR=True shm_size: '8gb' depends_on: qwen36-35b_q4_gguf: condition: service_healthy required: false qwen36-27b_q4_gguf: condition: service_healthy required: false qwen35-35b_q4_gguf: condition: service_healthy required: false qwen35-27b_q4_gguf: condition: service_healthy required: false qwen9b_q4_gguf: condition: service_healthy required: false gemma4-31b_q4_gguf: condition: service_healthy required: false gemma4-26b_q4_gguf: condition: service_healthy required: false environment: - FLAGS_fraction_of_gpu_memory_to_use=0.05 - RUN_FASTAPI=True - APP_MODE=fastapi - SHOW_PADDLE_MODEL_OPTIONS=True - SHOW_LOCAL_OCR_MODEL_OPTIONS=True - SHOW_LOCAL_PII_DETECTION_OPTIONS=True - SHOW_INFERENCE_SERVER_PII_OPTIONS=True - SHOW_INFERENCE_SERVER_VLM_OPTIONS=True - SHOW_HYBRID_MODELS=True - SHOW_DIFFICULT_OCR_EXAMPLES=True - SHOW_ALL_OUTPUTS_IN_OUTPUT_FOLDER=True - SHOW_SUMMARISATION=True - SHOW_AWS_API_KEYS=True - DEFAULT_TEXT_EXTRACTION_MODEL=Local OCR model - PDFs without selectable text - DEFAULT_LOCAL_OCR_MODEL=paddle - DEFAULT_PII_DETECTION_MODEL=Local - INFERENCE_SERVER_API_URL=http://llama-inference:8080 - DEFAULT_INFERENCE_SERVER_VLM_MODEL="" - DEFAULT_INFERENCE_SERVER_PII_MODEL="" - CUSTOM_VLM_BACKEND=inference_vlm - MAX_WORKERS=12 - TESSERACT_MAX_WORKERS=8 - PADDLE_MAX_WORKERS=1 # Keep this to 1 to avoid VRAM overflow or errors - LOAD_PADDLE_AT_STARTUP=False - EFFICIENT_OCR=True - SHOW_CUSTOM_VLM_ENTITIES=True - SESSION_OUTPUT_FOLDER=True - SAVE_PAGE_OCR_VISUALISATIONS=False - HYBRID_OCR_CONFIDENCE_THRESHOLD=97 - INCLUDE_OCR_VISUALISATION_IN_OUTPUT_FILES=True - PREPROCESS_LOCAL_OCR_IMAGES=False - INFERENCE_SERVER_DISABLE_THINKING=True - MAX_NEW_TOKENS=16384 - SAVE_EXAMPLE_HYBRID_IMAGES=False - SAVE_VLM_INPUT_IMAGES=False - VLM_MAX_DPI=200.0 - DEFAULT_NEW_BATCH_CHAR_COUNT=1250 - REPORT_VLM_OUTPUTS_TO_GUI=True - REPORT_LLM_OUTPUTS_TO_GUI=True - ADD_VLM_BOUNDING_BOX_RULES=False deploy: 
resources: reservations: devices: - driver: nvidia count: all capabilities: [gpu] ports: - "7861:7860" networks: - redaction-net-llama networks: redaction-net-llama: driver: bridge volumes: hf-llama-cache-qwen36-35b: hf-llama-cache-qwen36-27b: hf-llama-cache-qwen35-35b: hf-llama-cache-qwen35-27b: hf-llama-cache-qwen9b: hf-llama-cache-gemma4-31b: hf-llama-cache-gemma4-26b: hf-hub-cache-qwen36-35b: hf-hub-cache-qwen35-35b: hf-hub-cache-qwen35-27b: hf-hub-cache-qwen36-27b: hf-hub-cache-qwen9b: hf-hub-cache-gemma4-31b: hf-hub-cache-gemma4-26b: