# document_redaction / docker-compose_llama.yml
# seanpedrickcase — Sync: Updated review data modification method. Added page OCR
# visualisation save on demand. Updated agentic API routes. Can now set full-width GUI
# with a variable. (commit 4a56925)
# Pick which GGUF model runs by setting COMPOSE_PROFILES in .env (or pass --profile):
# COMPOSE_PROFILES=35b_36     -> qwen36-35b_q4_gguf
# COMPOSE_PROFILES=35b        -> qwen35-35b_q4_gguf
# COMPOSE_PROFILES=27b        -> qwen35-27b_q4_gguf
# COMPOSE_PROFILES=9b         -> qwen9b_q4_gguf
# COMPOSE_PROFILES=gemma4-31b -> gemma4-31b_q4_gguf
# COMPOSE_PROFILES=gemma4-26b -> gemma4-26b_q4_gguf
# The app always talks to http://llama-inference:8080 (shared network alias on both model services).
# Each model service uses its own llama.cpp and Hugging Face hub cache volumes so mmproj-F16.gguf
# (same filename per repo) and -hf downloads are not shared across profiles.
# Example CLI commands:
# docker compose -f docker-compose_llama.yml --profile 35b_36 up -d
# docker compose -f docker-compose_llama.yml --profile 35b up -d
# docker compose -f docker-compose_llama.yml --profile 27b up -d
# docker compose -f docker-compose_llama.yml --profile 9b up -d
# docker compose -f docker-compose_llama.yml --profile gemma4-31b up -d
# docker compose -f docker-compose_llama.yml --profile gemma4-26b up -d
# Add --build to the above if you want to rebuild the app image.
services:
  # Qwen 3.6 35B (A3B MoE) setup below requires 24GB of VRAM with n-cpu-moe set to 0.
  # For lower VRAM systems, n-cpu-moe ~ 40 could work for a 12GB VRAM system, and
  # n-cpu-moe ~ 20 for a 16GB VRAM system.
  qwen36-35b_q4_gguf:
    profiles: ["35b_36"]
    image: ghcr.io/ggml-org/llama.cpp:server-cuda12
    command:
      # Pull the GGUF weights from Hugging Face on first start (cached in the named volumes below).
      - -hf
      - unsloth/Qwen3.6-35B-A3B-GGUF
      - --hf-file
      - Qwen3.6-35B-A3B-UD-IQ4_NL.gguf
      # Multimodal projector for vision input; same filename in every repo, hence per-model caches.
      - --mmproj-url
      - https://huggingface.co/unsloth/Qwen3.6-35B-A3B-GGUF/resolve/main/mmproj-F16.gguf
      - --n-gpu-layers
      - "999"
      - --ctx-size
      - "32768"
      - --fit
      - "off"
      # Sampling parameters.
      - --temp
      - "0.7"
      - --top-k
      - "20"
      - --top-p
      - "0.8"
      - --min-p
      - "0.0"
      - --frequency-penalty
      - "1"
      - --presence-penalty
      - "1"
      - --host
      - "0.0.0.0"
      - --port
      - "8080"
      - --no-warmup
      # Fixed seed for reproducible generations.
      - --seed
      - "42"
      - --n-cpu-moe
      - "0" # Increase this value to fit within your available VRAM
    ports:
      - "8005:8080"
    volumes:
      - ./models:/models
      - hf-llama-cache-qwen36-35b:/root/.cache/llama.cpp
      - hf-hub-cache-qwen36-35b:/root/.cache/huggingface
    pull_policy: always
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    healthcheck:
      test: ["CMD-SHELL", "curl -fsS http://localhost:8080/v1/models >/dev/null || exit 1"]
      interval: 30s
      timeout: 15s
      retries: 8
      # Long start period: the first boot has to download the model weights.
      start_period: 1200s
    networks:
      redaction-net-llama:
        aliases:
          # Shared alias so the app always reaches the active model at http://llama-inference:8080.
          - llama-inference
  # Qwen 3.5 35B (A3B MoE) setup below requires 24GB of VRAM with n-cpu-moe set to 0.
  # For lower VRAM systems, n-cpu-moe ~ 40 could work for a 12GB VRAM system, and
  # n-cpu-moe ~ 20 for a 16GB VRAM system.
  qwen35-35b_q4_gguf:
    profiles: ["35b"]
    image: ghcr.io/ggml-org/llama.cpp:server-cuda12
    command:
      # Pull the GGUF weights from Hugging Face on first start (cached in the named volumes below).
      - -hf
      - unsloth/Qwen3.5-35B-A3B-GGUF
      - --hf-file
      - Qwen3.5-35B-A3B-UD-IQ4_NL.gguf
      # Multimodal projector for vision input; same filename in every repo, hence per-model caches.
      - --mmproj-url
      - https://huggingface.co/unsloth/Qwen3.5-35B-A3B-GGUF/resolve/main/mmproj-F16.gguf
      - --n-gpu-layers
      - "999"
      - --ctx-size
      - "32768"
      - --fit
      - "off"
      # Sampling parameters.
      - --temp
      - "0.7"
      - --top-k
      - "20"
      - --top-p
      - "0.8"
      - --min-p
      - "0.0"
      - --frequency-penalty
      - "1"
      - --presence-penalty
      - "1"
      - --host
      - "0.0.0.0"
      - --port
      - "8080"
      - --no-warmup
      # Fixed seed for reproducible generations.
      - --seed
      - "42"
      - --n-cpu-moe
      - "0" # Increase this value to fit within your available VRAM
    ports:
      - "8001:8080"
    volumes:
      - ./models:/models
      - hf-llama-cache-qwen35-35b:/root/.cache/llama.cpp
      - hf-hub-cache-qwen35-35b:/root/.cache/huggingface
    pull_policy: always
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    healthcheck:
      test: ["CMD-SHELL", "curl -fsS http://localhost:8080/v1/models >/dev/null || exit 1"]
      interval: 30s
      timeout: 15s
      retries: 8
      # Long start period: the first boot has to download the model weights.
      start_period: 1200s
    networks:
      redaction-net-llama:
        aliases:
          # Shared alias so the app always reaches the active model at http://llama-inference:8080.
          - llama-inference
  # Qwen 3.5 27B (dense) model setup below requires 24GB of VRAM to run.
  qwen35-27b_q4_gguf:
    profiles: ["27b"]
    image: ghcr.io/ggml-org/llama.cpp:server-cuda12
    command:
      # Pull the GGUF weights from Hugging Face on first start (cached in the named volumes below).
      - -hf
      - unsloth/Qwen3.5-27B-GGUF
      - --hf-file
      - Qwen3.5-27B-UD-Q4_K_XL.gguf
      # Multimodal projector for vision input; same filename in every repo, hence per-model caches.
      - --mmproj-url
      - https://huggingface.co/unsloth/Qwen3.5-27B-GGUF/resolve/main/mmproj-F16.gguf
      - --n-gpu-layers
      - "999"
      - --ctx-size
      - "32768"
      - --fit
      - "off"
      # Sampling parameters.
      - --temp
      - "0.7"
      - --top-k
      - "20"
      - --top-p
      - "0.8"
      - --min-p
      - "0.0"
      - --frequency-penalty
      - "1"
      - --presence-penalty
      - "1"
      - --host
      - "0.0.0.0"
      - --port
      - "8080"
      - --no-warmup
      # Fixed seed for reproducible generations.
      - --seed
      - "42"
    ports:
      - "8000:8080"
    volumes:
      - ./models:/models
      - hf-llama-cache-qwen35-27b:/root/.cache/llama.cpp
      - hf-hub-cache-qwen35-27b:/root/.cache/huggingface
    pull_policy: always
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    healthcheck:
      test: ["CMD-SHELL", "curl -fsS http://localhost:8080/v1/models >/dev/null || exit 1"]
      interval: 30s
      timeout: 15s
      retries: 8
      # Long start period: the first boot has to download the model weights.
      start_period: 1200s
    networks:
      redaction-net-llama:
        aliases:
          # Shared alias so the app always reaches the active model at http://llama-inference:8080.
          - llama-inference
qwen9b_q4_gguf:
profiles: ["qwen9b"]
image: ghcr.io/ggml-org/llama.cpp:server-cuda12
command:
- -hf
- unsloth/Qwen3.5-9B-A3B-GGUF
- --hf-file
- Qwen3.5-9B-A3B-UD-IQ4_NL.gguf
- --mmproj-url
- https://huggingface.co/unsloth/Qwen3.5-9B-A3B-GGUF/resolve/main/mmproj-F16.gguf
- --n-gpu-layers
- "999"
- --ctx-size
- "16384"
- --fit
- "off"
- --temp
- "0.7"
- --top-k
- "20"
- --top-p
- "0.8"
- --min-p
- "0.0"
- --frequency-penalty
- "1"
- --presence-penalty
- "1"
- --host
- "0.0.0.0"
- --port
- "8080"
- --no-warmup
- --seed
- "42"
- --n-cpu-moe
- "0" # Increase this value to fit within your availableVRAM
- --cache-type-k
- "q8_0"
- --cache-type-v
- "q8_0"
ports:
- "8003:8080"
volumes:
- ./models:/models
- hf-llama-cache-qwen9b:/root/.cache/llama.cpp
- hf-hub-cache-qwen9b:/root/.cache/huggingface
pull_policy: always
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: all
capabilities: [gpu]
healthcheck:
test: ["CMD-SHELL", "curl -fsS http://localhost:8080/v1/models >/dev/null || exit 1"]
  # Gemma 4 31B model setup below requires 24GB+ of VRAM to run.
  gemma4-31b_q4_gguf:
    profiles: ["gemma4-31b"]
    image: ghcr.io/ggml-org/llama.cpp:server-cuda12
    command:
      # Pull the GGUF weights from Hugging Face on first start (cached in the named volumes below).
      - -hf
      - unsloth/gemma-4-31B-it-GGUF
      - --hf-file
      - gemma-4-31B-it-IQ4_NL.gguf
      # Multimodal projector for vision input; same filename in every repo, hence per-model caches.
      - --mmproj-url
      - https://huggingface.co/unsloth/gemma-4-31B-it-GGUF/resolve/main/mmproj-F16.gguf
      - --n-gpu-layers
      - "999"
      - --ctx-size
      - "16384"
      - --fit
      - "off"
      # Sampling parameters (note: different from the Qwen services above).
      - --temp
      - "1.0"
      - --top-k
      - "64"
      - --top-p
      - "0.95"
      - --host
      - "0.0.0.0"
      - --port
      - "8080"
      - --no-warmup
      # Fixed seed for reproducible generations.
      - --seed
      - "42"
      # Single parallel slot.
      - -np
      - "1"
      # Quantised KV cache to reduce VRAM usage.
      - --cache-type-k
      - "q8_0"
      - --cache-type-v
      - "q8_0"
    ports:
      - "8002:8080"
    volumes:
      - ./models:/models
      - hf-llama-cache-gemma4-31b:/root/.cache/llama.cpp
      - hf-hub-cache-gemma4-31b:/root/.cache/huggingface
    pull_policy: always
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    healthcheck:
      test: ["CMD-SHELL", "curl -fsS http://localhost:8080/v1/models >/dev/null || exit 1"]
      interval: 30s
      timeout: 15s
      retries: 8
      # Long start period: the first boot has to download the model weights.
      start_period: 1200s
    networks:
      redaction-net-llama:
        aliases:
          # Shared alias so the app always reaches the active model at http://llama-inference:8080.
          - llama-inference
# Gemma 4 26B model setup below requires 24GB+ of VRAM to run.
gemma4-26b_q4_gguf:
profiles: ["gemma4-26b"]
image: ghcr.io/ggml-org/llama.cpp:server-cuda12
command:
- -hf
- unsloth/gemma-4-26B-A4B-it-GGUF
- --hf-file
- gemma-4-26B-A4B-it-UD-IQ4_NL.gguf
- --mmproj-url
- https://huggingface.co/unsloth/gemma-4-26B-A4B-it-GGUF/resolve/main/mmproj-F16.gguf
- --n-gpu-layers
- "999"
- --ctx-size
- "65536"
- --fit
- "off"
- --temp
- "1.0"
- --top-k
- "64"
- --top-p
- "0.95"
- --host
- "0.0.0.0"
- --port
- "8080"
- --no-warmup
- --seed
- "42"
- -np
- "1"
- --cache-type-k
- "q8_0"
- --cache-type-v
- "q8_0"
ports:
- "8002:8080"
volumes:
- ./models:/models
- hf-llama-cache-gemma4-26b:/root/.cache/llama.cpp
- hf-hub-cache-gemma4-26b:/root/.cache/huggingface
pull_policy: always
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: all
capabilities: [gpu]
healthcheck:
test: ["CMD-SHELL", "curl -fsS http://localhost:8080/v1/models >/dev/null || exit 1"]
interval: 30s
timeout: 15s
retries: 8
start_period: 1200s
networks:
redaction-net-llama:
aliases:
- llama-inference
redaction-app-llama:
profiles: ["35b_36", "35b", "27b", "9b", "gemma4-31b", "gemma4-26b"]
image: redaction-app-main
build:
context: . # Look in the current folder
dockerfile: Dockerfile # Use this file
target: gradio # Use the 'gradio' stage from your Dockerfile
args: # Pass your build-time variables here!
- TORCH_GPU_ENABLED=False
- INSTALL_VLM=False
- PADDLE_GPU_ENABLED=True
- INSTALL_PADDLEOCR=True
shm_size: '8gb'
depends_on:
qwen36-35b_q4_gguf:
condition: service_healthy
required: false
qwen35-35b_q4_gguf:
condition: service_healthy
required: false
qwen35-27b_q4_gguf:
condition: service_healthy
required: false
qwen9b_q4_gguf:
condition: service_healthy
required: false
gemma4-31b_q4_gguf:
condition: service_healthy
required: false
gemma4-26b_q4_gguf:
condition: service_healthy
required: false
environment:
- FLAGS_fraction_of_gpu_memory_to_use=0.05
- RUN_FASTAPI=True
- APP_MODE=fastapi
- SHOW_PADDLE_MODEL_OPTIONS=True
- SHOW_LOCAL_OCR_MODEL_OPTIONS=True
- SHOW_LOCAL_PII_DETECTION_OPTIONS=True
- SHOW_INFERENCE_SERVER_PII_OPTIONS=True
- SHOW_INFERENCE_SERVER_VLM_OPTIONS=True
- SHOW_HYBRID_MODELS=True
- SHOW_DIFFICULT_OCR_EXAMPLES=True
- SHOW_ALL_OUTPUTS_IN_OUTPUT_FOLDER=True
- SHOW_SUMMARISATION=True
- SHOW_AWS_API_KEYS=True
- DEFAULT_TEXT_EXTRACTION_MODEL=Local OCR model - PDFs without selectable text
- DEFAULT_LOCAL_OCR_MODEL=paddle
- DEFAULT_PII_DETECTION_MODEL=Local
- INFERENCE_SERVER_API_URL=http://llama-inference:8080
- DEFAULT_INFERENCE_SERVER_VLM_MODEL=""
- DEFAULT_INFERENCE_SERVER_PII_MODEL=""
- CUSTOM_VLM_BACKEND=inference_vlm
- MAX_WORKERS=12
- TESSERACT_MAX_WORKERS=8
- PADDLE_MAX_WORKERS=1 # Keep this to 1 to avoid VRAM overflow or errors
- LOAD_PADDLE_AT_STARTUP=False
- EFFICIENT_OCR=True
- SHOW_CUSTOM_VLM_ENTITIES=True
- SESSION_OUTPUT_FOLDER=True
- SAVE_PAGE_OCR_VISUALISATIONS=False
- HYBRID_OCR_CONFIDENCE_THRESHOLD=97
- INCLUDE_OCR_VISUALISATION_IN_OUTPUT_FILES=True
- PREPROCESS_LOCAL_OCR_IMAGES=False
- INFERENCE_SERVER_DISABLE_THINKING=True
- MAX_NEW_TOKENS=16384
- SAVE_EXAMPLE_HYBRID_IMAGES=False
- SAVE_VLM_INPUT_IMAGES=False
- VLM_MAX_DPI=200.0
- DEFAULT_NEW_BATCH_CHAR_COUNT=1250
- REPORT_VLM_OUTPUTS_TO_GUI=True
- REPORT_LLM_OUTPUTS_TO_GUI=True
- ADD_VLM_BOUNDING_BOX_RULES=False
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: all
capabilities: [gpu]
ports:
- "7861:7860"
networks:
- redaction-net-llama
# Private bridge network shared by the app and whichever model service is active.
networks:
  redaction-net-llama:
    driver: bridge
# Per-model named volumes: each service gets its own llama.cpp and Hugging Face hub
# cache so mmproj-F16.gguf (same filename in every repo) and -hf downloads are not
# shared — or clobbered — across profiles.
volumes:
  hf-llama-cache-qwen36-35b:
  hf-llama-cache-qwen35-35b:
  hf-llama-cache-qwen35-27b:
  hf-llama-cache-qwen9b:
  hf-llama-cache-gemma4-31b:
  hf-llama-cache-gemma4-26b:
  hf-hub-cache-qwen36-35b:
  hf-hub-cache-qwen35-35b:
  hf-hub-cache-qwen35-27b:
  hf-hub-cache-qwen9b:
  hf-hub-cache-gemma4-31b:
  hf-hub-cache-gemma4-26b: