# Pick which vLLM stack runs via COMPOSE_PROFILES (or --profile). Set VLLM_OPENAI_MODEL in .env
# to match the served model (required for 27b; 9b defaults below if omitted):
#   COMPOSE_PROFILES=vllm-9b  -> QuantTrio/Qwen3.5-9B-AWQ (default VLLM_OPENAI_MODEL)
#   COMPOSE_PROFILES=vllm-27b -> set VLLM_OPENAI_MODEL=QuantTrio/Qwen3.5-27B-AWQ
# App uses http://vllm-inference:8000 (shared network alias on both vLLM services).
# Example CLI commands:
#   docker compose -f docker-compose_vllm.yml --profile vllm-9b up -d
#   docker compose -f docker-compose_vllm.yml --profile vllm-27b up -d

services:
  vllm-server-qwen35-9b:
    profiles: ["vllm-9b"]
    image: vllm/vllm-openai:latest
    shm_size: '8gb'
    # Args appended to the vLLM OpenAI server entrypoint; Compose shell-word-splits
    # this block scalar, so one flag per line is safe.
    command: |
      --model QuantTrio/Qwen3.5-9B-AWQ
      --gpu-memory-utilization 0.7
      --tensor-parallel-size 1
      --max-num-seqs 1
      --reasoning-parser qwen3
      --max-model-len 32768
      --speculative-config '{"method":"mtp","num_speculative_tokens":3}'
      --max-num-batched-tokens 2048
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    healthcheck:
      test: ["CMD-SHELL", "curl -fsS http://localhost:8000/v1/models >/dev/null || exit 1"]
      interval: 30s
      timeout: 15s
      retries: 8
      # Generous start window: first boot downloads + loads model weights.
      start_period: 1200s
    ports:
      - "8000:8000"
    volumes:
      - hf-model-cache:/root/.cache/huggingface
    networks:
      redaction-net-vllm:
        aliases:
          - vllm-inference

  vllm-server-qwen35-27b:
    profiles: ["vllm-27b"]
    image: vllm/vllm-openai:latest
    shm_size: '16gb'
    command: |
      --model QuantTrio/Qwen3.5-27B-AWQ
      --gpu-memory-utilization 0.94
      --tensor-parallel-size 1
      --max-num-seqs 2
      --reasoning-parser qwen3
      --max-model-len 16384
      --max-num-batched-tokens 4096
      --enforce-eager
      --kv-cache-dtype fp8
      --enable-chunked-prefill
      --enable-prefix-caching
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    healthcheck:
      test: ["CMD-SHELL", "curl -fsS http://localhost:8000/v1/models >/dev/null || exit 1"]
      interval: 30s
      timeout: 15s
      retries: 8
      start_period: 1200s
    ports:
      # Host port 8001 so both profiles can coexist on one host if needed.
      - "8001:8000"
    volumes:
      - hf-model-cache:/root/.cache/huggingface
    networks:
      redaction-net-vllm:
        aliases:
          - vllm-inference

  redaction-app-vllm:
    profiles: ["vllm-9b", "vllm-27b"]
    image: redaction-app-main
    build:
      context: .  # Look in the current folder
      dockerfile: Dockerfile  # Use this file
      target: gradio  # Use the 'gradio' stage from your Dockerfile
      args:
        # Pass your build-time variables here!
        - TORCH_GPU_ENABLED=False
        - INSTALL_VLM=False
        - PADDLE_GPU_ENABLED=True
        - INSTALL_PADDLEOCR=True
    shm_size: '8gb'
    # required: false lets the app start under either profile — only the vLLM
    # service selected by the active profile must be healthy (Compose v2.20+).
    depends_on:
      vllm-server-qwen35-9b:
        condition: service_healthy
        required: false
      vllm-server-qwen35-27b:
        condition: service_healthy
        required: false
    environment:
      - FLAGS_fraction_of_gpu_memory_to_use=0.05
      - RUN_FASTAPI=True
      - APP_MODE=fastapi
      - SHOW_PADDLE_MODEL_OPTIONS=True
      - SHOW_LOCAL_OCR_MODEL_OPTIONS=True
      - SHOW_INFERENCE_SERVER_PII_OPTIONS=True
      - SHOW_INFERENCE_SERVER_VLM_OPTIONS=True
      - SHOW_HYBRID_MODELS=True
      - SHOW_DIFFICULT_OCR_EXAMPLES=True
      - SHOW_ALL_OUTPUTS_IN_OUTPUT_FOLDER=True
      - SHOW_SUMMARISATION=True
      - SHOW_AWS_API_KEYS=True
      - DEFAULT_TEXT_EXTRACTION_MODEL=Local OCR model - PDFs without selectable text
      - DEFAULT_LOCAL_OCR_MODEL=paddle
      - DEFAULT_PII_DETECTION_MODEL=Local
      - CUSTOM_VLM_BACKEND=inference_vlm
      - MAX_WORKERS=12
      - TESSERACT_MAX_WORKERS=8
      - PADDLE_MAX_WORKERS=1  # Keep this to 1 to avoid VRAM overflow or errors
      - LOAD_PADDLE_AT_STARTUP=False
      - INFERENCE_SERVER_API_URL=http://vllm-inference:8000
      - DEFAULT_INFERENCE_SERVER_VLM_MODEL=${VLLM_OPENAI_MODEL:-QuantTrio/Qwen3.5-9B-AWQ}  # Change this to QuantTrio/Qwen3.5-27B-AWQ if running that model
      - DEFAULT_INFERENCE_SERVER_PII_MODEL=${VLLM_OPENAI_MODEL:-QuantTrio/Qwen3.5-9B-AWQ}  # Change this to QuantTrio/Qwen3.5-27B-AWQ if running that model
      - EFFICIENT_OCR=True
      - SHOW_CUSTOM_VLM_ENTITIES=True
      - SESSION_OUTPUT_FOLDER=True
      - SAVE_PAGE_OCR_VISUALISATIONS=False
      - HYBRID_OCR_CONFIDENCE_THRESHOLD=97
      - INCLUDE_OCR_VISUALISATION_IN_OUTPUT_FILES=True
      - PREPROCESS_LOCAL_OCR_IMAGES=False
      - INFERENCE_SERVER_DISABLE_THINKING=True
      - MAX_NEW_TOKENS=16384
      - SAVE_EXAMPLE_HYBRID_IMAGES=False
      - SAVE_VLM_INPUT_IMAGES=False
      - VLM_MAX_DPI=200.0
      - DEFAULT_NEW_BATCH_CHAR_COUNT=1250
      - REPORT_VLM_OUTPUTS_TO_GUI=True
      - REPORT_LLM_OUTPUTS_TO_GUI=True
      - ADD_VLM_BOUNDING_BOX_RULES=False
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    ports:
      - "7860:7860"
    networks:
      - redaction-net-vllm

networks:
  redaction-net-vllm:
    driver: bridge

volumes:
  # Shared Hugging Face cache so both vLLM profiles reuse downloaded weights.
  hf-model-cache: