---
services:
  # vLLM OpenAI-compatible inference server hosting the 9B AWQ-quantised
  # Qwen model. Started only when the "vllm-9b" profile is active.
  vllm-server-qwen35-9b:
    profiles: ["vllm-9b"]
    image: vllm/vllm-openai:latest
    shm_size: '8gb'
    # Arguments appended to the image entrypoint. Compose shlex-splits the
    # string form of `command`, so the newlines in this block scalar act as
    # argument separators and the single-quoted JSON survives intact.
    command: |
      --model QuantTrio/Qwen3.5-9B-AWQ
      --gpu-memory-utilization 0.7
      --tensor-parallel-size 1
      --max-num-seqs 1
      --reasoning-parser qwen3
      --max-model-len 32768
      --speculative-config '{"method":"mtp","num_speculative_tokens":3}'
      --max-num-batched-tokens 2048
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    healthcheck:
      # NOTE(review): assumes `curl` is available inside the vllm-openai
      # image — confirm, or switch to a python urllib one-liner.
      test: ["CMD-SHELL", "curl -fsS http://localhost:8000/v1/models >/dev/null || exit 1"]
      interval: 30s
      timeout: 15s
      retries: 8
      # Generous grace period: first start may download + load model weights.
      start_period: 1200s
    ports:
      - "8000:8000"
    volumes:
      - hf-model-cache:/root/.cache/huggingface
    networks:
      redaction-net-vllm:
        aliases:
          # Stable DNS name the app uses regardless of which model size runs.
          - vllm-inference
| vllm-server-qwen35-27b: |
| profiles: ["vllm-27b"] |
| image: vllm/vllm-openai:latest |
| shm_size: '16gb' |
| command: | |
| --model QuantTrio/Qwen3.5-27B-AWQ |
| --gpu-memory-utilization 0.94 |
| --tensor-parallel-size 1 |
| --max-num-seqs 2 |
| --reasoning-parser qwen3 |
| --max-model-len 16384 |
| --max-num-batched-tokens 4096 |
| --enforce-eager |
| --kv-cache-dtype fp8 |
| --enable-chunked-prefill |
| --enable-prefix-caching |
| |
| deploy: |
| resources: |
| reservations: |
| devices: |
| - driver: nvidia |
| count: all |
| capabilities: [gpu] |
| healthcheck: |
| test: ["CMD-SHELL", "curl -fsS http://localhost:8000/v1/models >/dev/null || exit 1"] |
| interval: 30s |
| timeout: 15s |
| retries: 8 |
| start_period: 1200s |
| ports: |
| - "8001:8000" |
| volumes: |
| - hf-model-cache:/root/.cache/huggingface |
| networks: |
| redaction-net-vllm: |
| aliases: |
| - vllm-inference |
| redaction-app-vllm: |
| profiles: ["vllm-9b", "vllm-27b"] |
| image: redaction-app-main |
| build: |
| context: . |
| dockerfile: Dockerfile |
| target: gradio |
| args: |
| - TORCH_GPU_ENABLED=False |
| - INSTALL_VLM=False |
| - PADDLE_GPU_ENABLED=True |
| - INSTALL_PADDLEOCR=True |
| shm_size: '8gb' |
| depends_on: |
| vllm-server-qwen35-9b: |
| condition: service_healthy |
| required: false |
| vllm-server-qwen35-27b: |
| condition: service_healthy |
| required: false |
| environment: |
| - FLAGS_fraction_of_gpu_memory_to_use=0.05 |
| - RUN_FASTAPI=True |
| - APP_MODE=fastapi |
| - SHOW_PADDLE_MODEL_OPTIONS=True |
| - SHOW_LOCAL_OCR_MODEL_OPTIONS=True |
| - SHOW_INFERENCE_SERVER_PII_OPTIONS=True |
| - SHOW_INFERENCE_SERVER_VLM_OPTIONS=True |
| - SHOW_HYBRID_MODELS=True |
| - SHOW_DIFFICULT_OCR_EXAMPLES=True |
| - SHOW_ALL_OUTPUTS_IN_OUTPUT_FOLDER=True |
| - SHOW_SUMMARISATION=True |
| - SHOW_AWS_API_KEYS=True |
| - DEFAULT_TEXT_EXTRACTION_MODEL=Local OCR model - PDFs without selectable text |
| - DEFAULT_LOCAL_OCR_MODEL=paddle |
| - DEFAULT_PII_DETECTION_MODEL=Local |
| - CUSTOM_VLM_BACKEND=inference_vlm |
| - MAX_WORKERS=12 |
| - TESSERACT_MAX_WORKERS=8 |
| - PADDLE_MAX_WORKERS=1 |
| - LOAD_PADDLE_AT_STARTUP=False |
| - INFERENCE_SERVER_API_URL=http://vllm-inference:8000 |
| - DEFAULT_INFERENCE_SERVER_VLM_MODEL=${VLLM_OPENAI_MODEL:-QuantTrio/Qwen3.5-9B-AWQ} |
| - DEFAULT_INFERENCE_SERVER_PII_MODEL=${VLLM_OPENAI_MODEL:-QuantTrio/Qwen3.5-9B-AWQ} |
| - EFFICIENT_OCR=True |
| - SHOW_CUSTOM_VLM_ENTITIES=True |
| - SESSION_OUTPUT_FOLDER=True |
| - SAVE_PAGE_OCR_VISUALISATIONS=False |
| - HYBRID_OCR_CONFIDENCE_THRESHOLD=97 |
| - INCLUDE_OCR_VISUALISATION_IN_OUTPUT_FILES=True |
| - PREPROCESS_LOCAL_OCR_IMAGES=False |
| - INFERENCE_SERVER_DISABLE_THINKING=True |
| - MAX_NEW_TOKENS=16384 |
| - SAVE_EXAMPLE_HYBRID_IMAGES=False |
| - SAVE_VLM_INPUT_IMAGES=False |
| - VLM_MAX_DPI=200.0 |
| - DEFAULT_NEW_BATCH_CHAR_COUNT=1250 |
| - REPORT_VLM_OUTPUTS_TO_GUI=True |
| - REPORT_LLM_OUTPUTS_TO_GUI=True |
| - ADD_VLM_BOUNDING_BOX_RULES=False |
|
|
| deploy: |
| resources: |
| reservations: |
| devices: |
| - driver: nvidia |
| count: all |
| capabilities: [gpu] |
| ports: |
| - "7860:7860" |
| networks: |
| - redaction-net-vllm |
# Private bridge network shared by the app and the inference servers.
networks:
  redaction-net-vllm:
    driver: bridge
# Shared Hugging Face cache so both model servers reuse downloaded weights.
volumes:
  hf-model-cache: