# document_redaction / docker-compose_vllm.yml
# seanpedrickcase — commit a0a1e81: "Sync: Adjustments on redaction overlay formatting"
# Pick which vLLM stack runs via COMPOSE_PROFILES (or --profile). Set VLLM_OPENAI_MODEL in .env
# to match the served model (required for 27b; 9b defaults below if omitted):
# COMPOSE_PROFILES=vllm-9b -> QuantTrio/Qwen3.5-9B-AWQ (default VLLM_OPENAI_MODEL)
# COMPOSE_PROFILES=vllm-27b -> set VLLM_OPENAI_MODEL=QuantTrio/Qwen3.5-27B-AWQ
# App uses http://vllm-inference:8000 (shared network alias on both vLLM services).
# Example CLI commands:
# docker compose -f docker-compose_vllm.yml --profile vllm-9b up -d
# docker compose -f docker-compose_vllm.yml --profile vllm-27b up -d
services:
  # 9B vLLM server — started only when the "vllm-9b" profile is active.
  vllm-server-qwen35-9b:
    profiles: ["vllm-9b"]
    image: vllm/vllm-openai:latest
    # Extra shared memory for inter-process tensor transfers.
    shm_size: '8gb'
    # Arguments appended to the image entrypoint. The literal block scalar
    # keeps one flag per line; Compose shell-splits the string, so the
    # newlines act as argument separators (and the quoted JSON stays intact).
    command: |
      --model QuantTrio/Qwen3.5-9B-AWQ
      --gpu-memory-utilization 0.7
      --tensor-parallel-size 1
      --max-num-seqs 1
      --reasoning-parser qwen3
      --max-model-len 32768
      --speculative-config '{"method":"mtp","num_speculative_tokens":3}'
      --max-num-batched-tokens 2048
    deploy:
      resources:
        reservations:
          devices:
            # Reserve all host NVIDIA GPUs for this container.
            - driver: nvidia
              count: all
              capabilities: [gpu]
    healthcheck:
      # Healthy once the OpenAI-compatible /v1/models endpoint answers.
      # NOTE(review): assumes curl is available inside the image — confirm.
      test: ["CMD-SHELL", "curl -fsS http://localhost:8000/v1/models >/dev/null || exit 1"]
      interval: 30s
      timeout: 15s
      retries: 8
      # Generous grace period: first start may download model weights.
      start_period: 1200s
    ports:
      - "8000:8000"
    volumes:
      # Persist Hugging Face downloads across container recreations.
      - hf-model-cache:/root/.cache/huggingface
    networks:
      redaction-net-vllm:
        aliases:
          # Shared alias: the app always targets http://vllm-inference:8000,
          # whichever vLLM service the active profile starts.
          - vllm-inference
  # 27B vLLM server — started only when the "vllm-27b" profile is active.
  # NOTE(review): the model name is hard-coded here, while the app defaults
  # DEFAULT_INFERENCE_SERVER_*_MODEL to the 9B name; per the header comments,
  # VLLM_OPENAI_MODEL must be set in .env when using this profile — confirm.
  vllm-server-qwen35-27b:
    profiles: ["vllm-27b"]
    image: vllm/vllm-openai:latest
    # Larger shared memory than the 9B service, matching the bigger model.
    shm_size: '16gb'
    # One flag per line; Compose shell-splits the block scalar into argv.
    command: |
      --model QuantTrio/Qwen3.5-27B-AWQ
      --gpu-memory-utilization 0.94
      --tensor-parallel-size 1
      --max-num-seqs 2
      --reasoning-parser qwen3
      --max-model-len 16384
      --max-num-batched-tokens 4096
      --enforce-eager
      --kv-cache-dtype fp8
      --enable-chunked-prefill
      --enable-prefix-caching
    deploy:
      resources:
        reservations:
          devices:
            # Reserve all host NVIDIA GPUs for this container.
            - driver: nvidia
              count: all
              capabilities: [gpu]
    healthcheck:
      # Healthy once the OpenAI-compatible /v1/models endpoint answers.
      # NOTE(review): assumes curl is available inside the image — confirm.
      test: ["CMD-SHELL", "curl -fsS http://localhost:8000/v1/models >/dev/null || exit 1"]
      interval: 30s
      timeout: 15s
      retries: 8
      # Generous grace period: first start may download model weights.
      start_period: 1200s
    ports:
      # Host port 8001 so this service cannot clash with the 9B mapping.
      - "8001:8000"
    volumes:
      # Same named cache volume as the 9B service — weights download once.
      - hf-model-cache:/root/.cache/huggingface
    networks:
      redaction-net-vllm:
        aliases:
          # Shared alias so the app reaches whichever vLLM server is active.
          - vllm-inference
  # Redaction application — included in BOTH profiles, so it starts alongside
  # whichever vLLM server was selected.
  redaction-app-vllm:
    profiles: ["vllm-9b", "vllm-27b"]
    image: redaction-app-main
    build:
      context: . # Look in the current folder
      dockerfile: Dockerfile # Use this file
      target: gradio # Use the 'gradio' stage from your Dockerfile
      args: # Pass your build-time variables here!
        - TORCH_GPU_ENABLED=False
        - INSTALL_VLM=False
        - PADDLE_GPU_ENABLED=True
        - INSTALL_PADDLEOCR=True
    shm_size: '8gb'
    # Wait for the selected vLLM server to pass its healthcheck before
    # starting. required: false makes each dependency optional, so the app
    # still starts when only the other profile's server exists.
    depends_on:
      vllm-server-qwen35-9b:
        condition: service_healthy
        required: false
      vllm-server-qwen35-27b:
        condition: service_healthy
        required: false
    environment:
      - FLAGS_fraction_of_gpu_memory_to_use=0.05
      - RUN_FASTAPI=True
      - APP_MODE=fastapi
      - SHOW_PADDLE_MODEL_OPTIONS=True
      - SHOW_LOCAL_OCR_MODEL_OPTIONS=True
      - SHOW_INFERENCE_SERVER_PII_OPTIONS=True
      - SHOW_INFERENCE_SERVER_VLM_OPTIONS=True
      - SHOW_HYBRID_MODELS=True
      - SHOW_DIFFICULT_OCR_EXAMPLES=True
      - SHOW_ALL_OUTPUTS_IN_OUTPUT_FOLDER=True
      - SHOW_SUMMARISATION=True
      - SHOW_AWS_API_KEYS=True
      - DEFAULT_TEXT_EXTRACTION_MODEL=Local OCR model - PDFs without selectable text
      - DEFAULT_LOCAL_OCR_MODEL=paddle
      - DEFAULT_PII_DETECTION_MODEL=Local
      - CUSTOM_VLM_BACKEND=inference_vlm
      - MAX_WORKERS=12
      - TESSERACT_MAX_WORKERS=8
      - PADDLE_MAX_WORKERS=1 # Keep this to 1 to avoid VRAM overflow or errors
      - LOAD_PADDLE_AT_STARTUP=False
      # Talks to the shared network alias, not a concrete service name.
      - INFERENCE_SERVER_API_URL=http://vllm-inference:8000
      # Must match the model the active vLLM server actually serves.
      - DEFAULT_INFERENCE_SERVER_VLM_MODEL=${VLLM_OPENAI_MODEL:-QuantTrio/Qwen3.5-9B-AWQ} # Change this to QuantTrio/Qwen3.5-27B-AWQ if running that model
      - DEFAULT_INFERENCE_SERVER_PII_MODEL=${VLLM_OPENAI_MODEL:-QuantTrio/Qwen3.5-9B-AWQ} # Change this to QuantTrio/Qwen3.5-27B-AWQ if running that model
      - EFFICIENT_OCR=True
      - SHOW_CUSTOM_VLM_ENTITIES=True
      - SESSION_OUTPUT_FOLDER=True
      - SAVE_PAGE_OCR_VISUALISATIONS=False
      - HYBRID_OCR_CONFIDENCE_THRESHOLD=97
      - INCLUDE_OCR_VISUALISATION_IN_OUTPUT_FILES=True
      - PREPROCESS_LOCAL_OCR_IMAGES=False
      - INFERENCE_SERVER_DISABLE_THINKING=True
      - MAX_NEW_TOKENS=16384
      - SAVE_EXAMPLE_HYBRID_IMAGES=False
      - SAVE_VLM_INPUT_IMAGES=False
      - VLM_MAX_DPI=200.0
      - DEFAULT_NEW_BATCH_CHAR_COUNT=1250
      - REPORT_VLM_OUTPUTS_TO_GUI=True
      - REPORT_LLM_OUTPUTS_TO_GUI=True
      - ADD_VLM_BOUNDING_BOX_RULES=False
    # GPU access for PaddleOCR (PADDLE_GPU_ENABLED=True above).
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    ports:
      - "7860:7860"
    networks:
      - redaction-net-vllm
# Dedicated bridge network shared by the app and both vLLM services.
networks:
  redaction-net-vllm:
    driver: bridge

# Named volume caching Hugging Face model downloads between runs.
volumes:
  hf-model-cache: