# document_redaction / docker-compose_llama.yml
# seanpedrickcase — Sync: Updated review data modification method. Added page OCR
# visualisation save on demand. Updated agentic API routes. Can now set full-width GUI
# with a variable. (commit 4a56925)
# Pick which GGUF model runs by setting COMPOSE_PROFILES in .env (or pass --profile):
# COMPOSE_PROFILES=35b_36     -> qwen36-35b_q4_gguf
# COMPOSE_PROFILES=35b        -> qwen35-35b_q4_gguf
# COMPOSE_PROFILES=27b        -> qwen35-27b_q4_gguf
# COMPOSE_PROFILES=9b         -> qwen9b_q4_gguf
# COMPOSE_PROFILES=gemma4-31b -> gemma4-31b_q4_gguf
# COMPOSE_PROFILES=gemma4-26b -> gemma4-26b_q4_gguf
# The app always talks to http://llama-inference:8080 (shared network alias on both model services).
# Each model service uses its own llama.cpp and Hugging Face hub cache volumes so mmproj-F16.gguf
# (same filename per repo) and -hf downloads are not shared across profiles.
# Example CLI commands:
# docker compose -f docker-compose_llama.yml --profile 35b_36 up -d
# docker compose -f docker-compose_llama.yml --profile 35b up -d
# docker compose -f docker-compose_llama.yml --profile 27b up -d
# docker compose -f docker-compose_llama.yml --profile 9b up -d
# docker compose -f docker-compose_llama.yml --profile gemma4-31b up -d
# docker compose -f docker-compose_llama.yml --profile gemma4-26b up -d
# Add --build to the above if you want to rebuild the app image.
services:
  # Qwen 3.6 35B (A3B MoE) setup below requires 24GB of VRAM with n-cpu-moe set to 0.
  # For lower VRAM systems, n-cpu-moe ~ 40 could work for a 12GB VRAM system, and
  # n-cpu-moe ~ 20 for a 16GB VRAM system.
  qwen36-35b_q4_gguf:
    profiles: ["35b_36"]
    image: ghcr.io/ggml-org/llama.cpp:server-cuda12
    command:
      # Pull the GGUF weights from Hugging Face on first start (cached in the named volumes below).
      - -hf
      - unsloth/Qwen3.6-35B-A3B-GGUF
      - --hf-file
      - Qwen3.6-35B-A3B-UD-IQ4_NL.gguf
      # Multimodal projector for vision input; same filename in every repo, hence per-model caches.
      - --mmproj-url
      - https://huggingface.co/unsloth/Qwen3.6-35B-A3B-GGUF/resolve/main/mmproj-F16.gguf
      - --n-gpu-layers
      - "999"
      - --ctx-size
      - "32768"
      - --fit
      - "off"
      # Sampling parameters.
      - --temp
      - "0.7"
      - --top-k
      - "20"
      - --top-p
      - "0.8"
      - --min-p
      - "0.0"
      - --frequency-penalty
      - "1"
      - --presence-penalty
      - "1"
      - --host
      - "0.0.0.0"
      - --port
      - "8080"
      - --no-warmup
      # Fixed seed for reproducible generations.
      - --seed
      - "42"
      - --n-cpu-moe
      - "0" # Increase this value to fit within your available VRAM
    ports:
      - "8005:8080"
    volumes:
      - ./models:/models
      - hf-llama-cache-qwen36-35b:/root/.cache/llama.cpp
      - hf-hub-cache-qwen36-35b:/root/.cache/huggingface
    pull_policy: always
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    healthcheck:
      test: ["CMD-SHELL", "curl -fsS http://localhost:8080/v1/models >/dev/null || exit 1"]
      interval: 30s
      timeout: 15s
      retries: 8
      # Long start period: the first boot has to download the model weights.
      start_period: 1200s
    networks:
      redaction-net-llama:
        aliases:
          # Shared alias so the app always reaches the active model at http://llama-inference:8080.
          - llama-inference
  # Qwen 3.5 35B (A3B MoE) setup below requires 24GB of VRAM with n-cpu-moe set to 0.
  # For lower VRAM systems, n-cpu-moe ~ 40 could work for a 12GB VRAM system, and
  # n-cpu-moe ~ 20 for a 16GB VRAM system.
  qwen35-35b_q4_gguf:
    profiles: ["35b"]
    image: ghcr.io/ggml-org/llama.cpp:server-cuda12
    command:
      # Pull the GGUF weights from Hugging Face on first start (cached in the named volumes below).
      - -hf
      - unsloth/Qwen3.5-35B-A3B-GGUF
      - --hf-file
      - Qwen3.5-35B-A3B-UD-IQ4_NL.gguf
      # Multimodal projector for vision input; same filename in every repo, hence per-model caches.
      - --mmproj-url
      - https://huggingface.co/unsloth/Qwen3.5-35B-A3B-GGUF/resolve/main/mmproj-F16.gguf
      - --n-gpu-layers
      - "999"
      - --ctx-size
      - "32768"
      - --fit
      - "off"
      # Sampling parameters.
      - --temp
      - "0.7"
      - --top-k
      - "20"
      - --top-p
      - "0.8"
      - --min-p
      - "0.0"
      - --frequency-penalty
      - "1"
      - --presence-penalty
      - "1"
      - --host
      - "0.0.0.0"
      - --port
      - "8080"
      - --no-warmup
      # Fixed seed for reproducible generations.
      - --seed
      - "42"
      - --n-cpu-moe
      - "0" # Increase this value to fit within your available VRAM
    ports:
      - "8001:8080"
    volumes:
      - ./models:/models
      - hf-llama-cache-qwen35-35b:/root/.cache/llama.cpp
      - hf-hub-cache-qwen35-35b:/root/.cache/huggingface
    pull_policy: always
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    healthcheck:
      test: ["CMD-SHELL", "curl -fsS http://localhost:8080/v1/models >/dev/null || exit 1"]
      interval: 30s
      timeout: 15s
      retries: 8
      # Long start period: the first boot has to download the model weights.
      start_period: 1200s
    networks:
      redaction-net-llama:
        aliases:
          # Shared alias so the app always reaches the active model at http://llama-inference:8080.
          - llama-inference
  # Qwen 3.5 27B (dense) model setup below requires 24GB of VRAM to run.
  qwen35-27b_q4_gguf:
    profiles: ["27b"]
    image: ghcr.io/ggml-org/llama.cpp:server-cuda12
    command:
      # Pull the GGUF weights from Hugging Face on first start (cached in the named volumes below).
      - -hf
      - unsloth/Qwen3.5-27B-GGUF
      - --hf-file
      - Qwen3.5-27B-UD-Q4_K_XL.gguf
      # Multimodal projector for vision input; same filename in every repo, hence per-model caches.
      - --mmproj-url
      - https://huggingface.co/unsloth/Qwen3.5-27B-GGUF/resolve/main/mmproj-F16.gguf
      - --n-gpu-layers
      - "999"
      - --ctx-size
      - "32768"
      - --fit
      - "off"
      # Sampling parameters.
      - --temp
      - "0.7"
      - --top-k
      - "20"
      - --top-p
      - "0.8"
      - --min-p
      - "0.0"
      - --frequency-penalty
      - "1"
      - --presence-penalty
      - "1"
      - --host
      - "0.0.0.0"
      - --port
      - "8080"
      - --no-warmup
      # Fixed seed for reproducible generations.
      - --seed
      - "42"
    ports:
      - "8000:8080"
    volumes:
      - ./models:/models
      - hf-llama-cache-qwen35-27b:/root/.cache/llama.cpp
      - hf-hub-cache-qwen35-27b:/root/.cache/huggingface
    pull_policy: always
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    healthcheck:
      test: ["CMD-SHELL", "curl -fsS http://localhost:8080/v1/models >/dev/null || exit 1"]
      interval: 30s
      timeout: 15s
      retries: 8
      # Long start period: the first boot has to download the model weights.
      start_period: 1200s
    networks:
      redaction-net-llama:
        aliases:
          # Shared alias so the app always reaches the active model at http://llama-inference:8080.
          - llama-inference
qwen9b_q4_gguf:
profiles: ["qwen9b"]
image: ghcr.io/ggml-org/llama.cpp:server-cuda12
command:
- -hf
- unsloth/Qwen3.5-9B-A3B-GGUF
- --hf-file
- Qwen3.5-9B-A3B-UD-IQ4_NL.gguf
- --mmproj-url
- https://huggingface.co/unsloth/Qwen3.5-9B-A3B-GGUF/resolve/main/mmproj-F16.gguf
- --n-gpu-layers
- "999"
- --ctx-size
- "16384"
- --fit
- "off"
- --temp
- "0.7"
- --top-k
- "20"
- --top-p
- "0.8"
- --min-p
- "0.0"
- --frequency-penalty
- "1"
- --presence-penalty
- "1"
- --host
- "0.0.0.0"
- --port
- "8080"
- --no-warmup
- --seed
- "42"
- --n-cpu-moe
- "0" # Increase this value to fit within your availableVRAM
- --cache-type-k
- "q8_0"
- --cache-type-v
- "q8_0"
ports:
- "8003:8080"
volumes:
- ./models:/models
- hf-llama-cache-qwen9b:/root/.cache/llama.cpp
- hf-hub-cache-qwen9b:/root/.cache/huggingface
pull_policy: always
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: all
capabilities: [gpu]
healthcheck:
test: ["CMD-SHELL", "curl -fsS http://localhost:8080/v1/models >/dev/null || exit 1"]
  # Gemma 4 31B model setup below requires 24GB+ of VRAM to run.
  gemma4-31b_q4_gguf:
    profiles: ["gemma4-31b"]
    image: ghcr.io/ggml-org/llama.cpp:server-cuda12
    command:
      # Pull the GGUF weights from Hugging Face on first start (cached in the named volumes below).
      - -hf
      - unsloth/gemma-4-31B-it-GGUF
      - --hf-file
      - gemma-4-31B-it-IQ4_NL.gguf
      # Multimodal projector for vision input; same filename in every repo, hence per-model caches.
      - --mmproj-url
      - https://huggingface.co/unsloth/gemma-4-31B-it-GGUF/resolve/main/mmproj-F16.gguf
      - --n-gpu-layers
      - "999"
      - --ctx-size
      - "16384"
      - --fit
      - "off"
      # Sampling parameters (note: different from the Qwen services above).
      - --temp
      - "1.0"
      - --top-k
      - "64"
      - --top-p
      - "0.95"
      - --host
      - "0.0.0.0"
      - --port
      - "8080"
      - --no-warmup
      # Fixed seed for reproducible generations.
      - --seed
      - "42"
      # Single parallel slot.
      - -np
      - "1"
      # Quantised KV cache to reduce VRAM usage.
      - --cache-type-k
      - "q8_0"
      - --cache-type-v
      - "q8_0"
    ports:
      - "8002:8080"
    volumes:
      - ./models:/models
      - hf-llama-cache-gemma4-31b:/root/.cache/llama.cpp
      - hf-hub-cache-gemma4-31b:/root/.cache/huggingface
    pull_policy: always
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    healthcheck:
      test: ["CMD-SHELL", "curl -fsS http://localhost:8080/v1/models >/dev/null || exit 1"]
      interval: 30s
      timeout: 15s
      retries: 8
      # Long start period: the first boot has to download the model weights.
      start_period: 1200s
    networks:
      redaction-net-llama:
        aliases:
          # Shared alias so the app always reaches the active model at http://llama-inference:8080.
          - llama-inference
# Gemma 4 26B model setup below requires 24GB+ of VRAM to run.
gemma4-26b_q4_gguf:
profiles: ["gemma4-26b"]
image: ghcr.io/ggml-org/llama.cpp:server-cuda12
command:
- -hf
- unsloth/gemma-4-26B-A4B-it-GGUF
- --hf-file
- gemma-4-26B-A4B-it-UD-IQ4_NL.gguf
- --mmproj-url
- https://huggingface.co/unsloth/gemma-4-26B-A4B-it-GGUF/resolve/main/mmproj-F16.gguf
- --n-gpu-layers
- "999"
- --ctx-size
- "65536"
- --fit
- "off"
- --temp
- "1.0"
- --top-k
- "64"
- --top-p
- "0.95"
- --host
- "0.0.0.0"
- --port
- "8080"
- --no-warmup
- --seed
- "42"
- -np
- "1"
- --cache-type-k
- "q8_0"
- --cache-type-v
- "q8_0"
ports:
- "8002:8080"
volumes:
- ./models:/models
- hf-llama-cache-gemma4-26b:/root/.cache/llama.cpp
- hf-hub-cache-gemma4-26b:/root/.cache/huggingface
pull_policy: always
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: all
capabilities: [gpu]
healthcheck:
test: ["CMD-SHELL", "curl -fsS http://localhost:8080/v1/models >/dev/null || exit 1"]
interval: 30s
timeout: 15s
retries: 8
start_period: 1200s
networks:
redaction-net-llama:
aliases:
- llama-inference
redaction-app-llama:
profiles: ["35b_36", "35b", "27b", "9b", "gemma4-31b", "gemma4-26b"]
image: redaction-app-main
build:
context: . # Look in the current folder
dockerfile: Dockerfile # Use this file
target: gradio # Use the 'gradio' stage from your Dockerfile
args: # Pass your build-time variables here!
- TORCH_GPU_ENABLED=False
- INSTALL_VLM=False
- PADDLE_GPU_ENABLED=True
- INSTALL_PADDLEOCR=True
shm_size: '8gb'
depends_on:
qwen36-35b_q4_gguf:
condition: service_healthy
required: false
qwen35-35b_q4_gguf:
condition: service_healthy
required: false
qwen35-27b_q4_gguf:
condition: service_healthy
required: false
qwen9b_q4_gguf:
condition: service_healthy
required: false
gemma4-31b_q4_gguf:
condition: service_healthy
required: false
gemma4-26b_q4_gguf:
condition: service_healthy
required: false
environment:
- FLAGS_fraction_of_gpu_memory_to_use=0.05
- RUN_FASTAPI=True
- APP_MODE=fastapi
- SHOW_PADDLE_MODEL_OPTIONS=True
- SHOW_LOCAL_OCR_MODEL_OPTIONS=True
- SHOW_LOCAL_PII_DETECTION_OPTIONS=True
- SHOW_INFERENCE_SERVER_PII_OPTIONS=True
- SHOW_INFERENCE_SERVER_VLM_OPTIONS=True
- SHOW_HYBRID_MODELS=True
- SHOW_DIFFICULT_OCR_EXAMPLES=True
- SHOW_ALL_OUTPUTS_IN_OUTPUT_FOLDER=True
- SHOW_SUMMARISATION=True
- SHOW_AWS_API_KEYS=True
- DEFAULT_TEXT_EXTRACTION_MODEL=Local OCR model - PDFs without selectable text
- DEFAULT_LOCAL_OCR_MODEL=paddle
- DEFAULT_PII_DETECTION_MODEL=Local
- INFERENCE_SERVER_API_URL=http://llama-inference:8080
- DEFAULT_INFERENCE_SERVER_VLM_MODEL=""
- DEFAULT_INFERENCE_SERVER_PII_MODEL=""
- CUSTOM_VLM_BACKEND=inference_vlm
- MAX_WORKERS=12
- TESSERACT_MAX_WORKERS=8
- PADDLE_MAX_WORKERS=1 # Keep this to 1 to avoid VRAM overflow or errors
- LOAD_PADDLE_AT_STARTUP=False
- EFFICIENT_OCR=True
- SHOW_CUSTOM_VLM_ENTITIES=True
- SESSION_OUTPUT_FOLDER=True
- SAVE_PAGE_OCR_VISUALISATIONS=False
- HYBRID_OCR_CONFIDENCE_THRESHOLD=97
- INCLUDE_OCR_VISUALISATION_IN_OUTPUT_FILES=True
- PREPROCESS_LOCAL_OCR_IMAGES=False
- INFERENCE_SERVER_DISABLE_THINKING=True
- MAX_NEW_TOKENS=16384
- SAVE_EXAMPLE_HYBRID_IMAGES=False
- SAVE_VLM_INPUT_IMAGES=False
- VLM_MAX_DPI=200.0
- DEFAULT_NEW_BATCH_CHAR_COUNT=1250
- REPORT_VLM_OUTPUTS_TO_GUI=True
- REPORT_LLM_OUTPUTS_TO_GUI=True
- ADD_VLM_BOUNDING_BOX_RULES=False
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: all
capabilities: [gpu]
ports:
- "7861:7860"
networks:
- redaction-net-llama
# Private bridge network shared by the app and whichever model service is active.
networks:
  redaction-net-llama:
    driver: bridge
# Per-model named volumes: each service gets its own llama.cpp and Hugging Face hub
# cache so mmproj-F16.gguf (same filename in every repo) and -hf downloads are not
# shared — or clobbered — across profiles.
volumes:
  hf-llama-cache-qwen36-35b:
  hf-llama-cache-qwen35-35b:
  hf-llama-cache-qwen35-27b:
  hf-llama-cache-qwen9b:
  hf-llama-cache-gemma4-31b:
  hf-llama-cache-gemma4-26b:
  hf-hub-cache-qwen36-35b:
  hf-hub-cache-qwen35-35b:
  hf-hub-cache-qwen35-27b:
  hf-hub-cache-qwen9b:
  hf-hub-cache-gemma4-31b:
  hf-hub-cache-gemma4-26b: