# Sync: Updated review data modification method. Added page OCR visualisation save on demand. Updated agentic API routes. Can now set full width of GUI with a variable.
# Pick which GGUF model runs by setting COMPOSE_PROFILES in .env (or pass --profile):
#   COMPOSE_PROFILES=35b_36     -> qwen36-35b_q4_gguf
#   COMPOSE_PROFILES=35b        -> qwen35-35b_q4_gguf
#   COMPOSE_PROFILES=27b        -> qwen35-27b_q4_gguf
#   COMPOSE_PROFILES=9b         -> qwen9b_q4_gguf
#   COMPOSE_PROFILES=gemma4-31b -> gemma4-31b_q4_gguf
#   COMPOSE_PROFILES=gemma4-26b -> gemma4-26b_q4_gguf
# The app always talks to http://llama-inference:8080 (shared network alias on the model services).
# Each model service uses its own llama.cpp and Hugging Face hub cache volumes so mmproj-F16.gguf
# (same filename per repo) and -hf downloads are not shared across profiles.
# Example CLI commands:
#   docker compose -f docker-compose_llama.yml --profile 35b_36 up -d
#   docker compose -f docker-compose_llama.yml --profile 35b up -d
#   docker compose -f docker-compose_llama.yml --profile 27b up -d
#   docker compose -f docker-compose_llama.yml --profile 9b up -d
#   docker compose -f docker-compose_llama.yml --profile gemma4-31b up -d
#   docker compose -f docker-compose_llama.yml --profile gemma4-26b up -d
# Add --build to the above if you want to rebuild the app image.
services:
  # Qwen 3.6 35B: requires ~24GB of VRAM with --n-cpu-moe set to 0. For lower-VRAM systems,
  # --n-cpu-moe ~40 could work for a 12GB VRAM system, and ~20 for a 16GB VRAM system.
  qwen36-35b_q4_gguf:
    profiles: ["35b_36"]
    image: ghcr.io/ggml-org/llama.cpp:server-cuda12
    command:
      - -hf
      - unsloth/Qwen3.6-35B-A3B-GGUF
      - --hf-file
      - Qwen3.6-35B-A3B-UD-IQ4_NL.gguf
      - --mmproj-url
      - https://huggingface.co/unsloth/Qwen3.6-35B-A3B-GGUF/resolve/main/mmproj-F16.gguf
      - --n-gpu-layers
      - "999"
      - --ctx-size
      - "32768"
      - --fit
      - "off"
      - --temp
      - "0.7"
      - --top-k
      - "20"
      - --top-p
      - "0.8"
      - --min-p
      - "0.0"
      - --frequency-penalty
      - "1"
      - --presence-penalty
      - "1"
      - --host
      - "0.0.0.0"
      - --port
      - "8080"
      - --no-warmup
      - --seed
      - "42"
      - --n-cpu-moe
      - "0"  # Increase this value to fit within your available VRAM
    ports:
      - "8005:8080"
    volumes:
      - ./models:/models
      - hf-llama-cache-qwen36-35b:/root/.cache/llama.cpp
      - hf-hub-cache-qwen36-35b:/root/.cache/huggingface
    pull_policy: always
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    healthcheck:
      test: ["CMD-SHELL", "curl -fsS http://localhost:8080/v1/models >/dev/null || exit 1"]
      interval: 30s
      timeout: 15s
      retries: 8
      start_period: 1200s  # generous: first start downloads the model weights
    networks:
      redaction-net-llama:
        aliases:
          - llama-inference  # shared alias the app targets
| # Qwen 3.5 35B model setup below requires 24GB of VRAM with n-cpu-moe set to 0. For lower VRAM systems, n-cpu-moe ~ 40 could work for a 12GB VRAM system, and n-cpu-moe ~ 20 for a 16GB VRAM system. | |
| qwen35-35b_q4_gguf: | |
| profiles: ["35b"] | |
| image: ghcr.io/ggml-org/llama.cpp:server-cuda12 | |
| command: | |
| - -hf | |
| - unsloth/Qwen3.5-35B-A3B-GGUF | |
| - --hf-file | |
| - Qwen3.5-35B-A3B-UD-IQ4_NL.gguf | |
| - --mmproj-url | |
| - https://huggingface.co/unsloth/Qwen3.5-35B-A3B-GGUF/resolve/main/mmproj-F16.gguf | |
| - --n-gpu-layers | |
| - "999" | |
| - --ctx-size | |
| - "32768" | |
| - --fit | |
| - "off" | |
| - --temp | |
| - "0.7" | |
| - --top-k | |
| - "20" | |
| - --top-p | |
| - "0.8" | |
| - --min-p | |
| - "0.0" | |
| - --frequency-penalty | |
| - "1" | |
| - --presence-penalty | |
| - "1" | |
| - --host | |
| - "0.0.0.0" | |
| - --port | |
| - "8080" | |
| - --no-warmup | |
| - --seed | |
| - "42" | |
| - --n-cpu-moe | |
| - "0" # Increase this value to fit within your availableVRAM | |
| ports: | |
| - "8001:8080" | |
| volumes: | |
| - ./models:/models | |
| - hf-llama-cache-qwen35-35b:/root/.cache/llama.cpp | |
| - hf-hub-cache-qwen35-35b:/root/.cache/huggingface | |
| pull_policy: always | |
| deploy: | |
| resources: | |
| reservations: | |
| devices: | |
| - driver: nvidia | |
| count: all | |
| capabilities: [gpu] | |
| healthcheck: | |
| test: ["CMD-SHELL", "curl -fsS http://localhost:8080/v1/models >/dev/null || exit 1"] | |
| interval: 30s | |
| timeout: 15s | |
| retries: 8 | |
| start_period: 1200s | |
| networks: | |
| redaction-net-llama: | |
| aliases: | |
| - llama-inference | |
| # Qwen 3.5 27B model setup below requires 24GB of VRAM to run. | |
| qwen35-27b_q4_gguf: | |
| profiles: ["27b"] | |
| image: ghcr.io/ggml-org/llama.cpp:server-cuda12 | |
| command: | |
| - -hf | |
| - unsloth/Qwen3.5-27B-GGUF | |
| - --hf-file | |
| - Qwen3.5-27B-UD-Q4_K_XL.gguf | |
| - --mmproj-url | |
| - https://huggingface.co/unsloth/Qwen3.5-27B-GGUF/resolve/main/mmproj-F16.gguf | |
| - --n-gpu-layers | |
| - "999" | |
| - --ctx-size | |
| - "32768" | |
| - --fit | |
| - "off" | |
| - --temp | |
| - "0.7" | |
| - --top-k | |
| - "20" | |
| - --top-p | |
| - "0.8" | |
| - --min-p | |
| - "0.0" | |
| - --frequency-penalty | |
| - "1" | |
| - --presence-penalty | |
| - "1" | |
| - --host | |
| - "0.0.0.0" | |
| - --port | |
| - "8080" | |
| - --no-warmup | |
| - --seed | |
| - "42" | |
| ports: | |
| - "8000:8080" | |
| volumes: | |
| - ./models:/models | |
| - hf-llama-cache-qwen35-27b:/root/.cache/llama.cpp | |
| - hf-hub-cache-qwen35-27b:/root/.cache/huggingface | |
| pull_policy: always | |
| deploy: | |
| resources: | |
| reservations: | |
| devices: | |
| - driver: nvidia | |
| count: all | |
| capabilities: [gpu] | |
| healthcheck: | |
| test: ["CMD-SHELL", "curl -fsS http://localhost:8080/v1/models >/dev/null || exit 1"] | |
| interval: 30s | |
| timeout: 15s | |
| retries: 8 | |
| start_period: 1200s | |
| networks: | |
| redaction-net-llama: | |
| aliases: | |
| - llama-inference | |
| qwen9b_q4_gguf: | |
| profiles: ["qwen9b"] | |
| image: ghcr.io/ggml-org/llama.cpp:server-cuda12 | |
| command: | |
| - -hf | |
| - unsloth/Qwen3.5-9B-A3B-GGUF | |
| - --hf-file | |
| - Qwen3.5-9B-A3B-UD-IQ4_NL.gguf | |
| - --mmproj-url | |
| - https://huggingface.co/unsloth/Qwen3.5-9B-A3B-GGUF/resolve/main/mmproj-F16.gguf | |
| - --n-gpu-layers | |
| - "999" | |
| - --ctx-size | |
| - "16384" | |
| - --fit | |
| - "off" | |
| - --temp | |
| - "0.7" | |
| - --top-k | |
| - "20" | |
| - --top-p | |
| - "0.8" | |
| - --min-p | |
| - "0.0" | |
| - --frequency-penalty | |
| - "1" | |
| - --presence-penalty | |
| - "1" | |
| - --host | |
| - "0.0.0.0" | |
| - --port | |
| - "8080" | |
| - --no-warmup | |
| - --seed | |
| - "42" | |
| - --n-cpu-moe | |
| - "0" # Increase this value to fit within your availableVRAM | |
| - --cache-type-k | |
| - "q8_0" | |
| - --cache-type-v | |
| - "q8_0" | |
| ports: | |
| - "8003:8080" | |
| volumes: | |
| - ./models:/models | |
| - hf-llama-cache-qwen9b:/root/.cache/llama.cpp | |
| - hf-hub-cache-qwen9b:/root/.cache/huggingface | |
| pull_policy: always | |
| deploy: | |
| resources: | |
| reservations: | |
| devices: | |
| - driver: nvidia | |
| count: all | |
| capabilities: [gpu] | |
| healthcheck: | |
| test: ["CMD-SHELL", "curl -fsS http://localhost:8080/v1/models >/dev/null || exit 1"] | |
| # Gemma 4 31B model setup below requires 24GB+ of VRAM to run. | |
| gemma4-31b_q4_gguf: | |
| profiles: ["gemma4-31b"] | |
| image: ghcr.io/ggml-org/llama.cpp:server-cuda12 | |
| command: | |
| - -hf | |
| - unsloth/gemma-4-31B-it-GGUF | |
| - --hf-file | |
| - gemma-4-31B-it-IQ4_NL.gguf | |
| - --mmproj-url | |
| - https://huggingface.co/unsloth/gemma-4-31B-it-GGUF/resolve/main/mmproj-F16.gguf | |
| - --n-gpu-layers | |
| - "999" | |
| - --ctx-size | |
| - "16384" | |
| - --fit | |
| - "off" | |
| - --temp | |
| - "1.0" | |
| - --top-k | |
| - "64" | |
| - --top-p | |
| - "0.95" | |
| - --host | |
| - "0.0.0.0" | |
| - --port | |
| - "8080" | |
| - --no-warmup | |
| - --seed | |
| - "42" | |
| - -np | |
| - "1" | |
| - --cache-type-k | |
| - "q8_0" | |
| - --cache-type-v | |
| - "q8_0" | |
| ports: | |
| - "8002:8080" | |
| volumes: | |
| - ./models:/models | |
| - hf-llama-cache-gemma4-31b:/root/.cache/llama.cpp | |
| - hf-hub-cache-gemma4-31b:/root/.cache/huggingface | |
| pull_policy: always | |
| deploy: | |
| resources: | |
| reservations: | |
| devices: | |
| - driver: nvidia | |
| count: all | |
| capabilities: [gpu] | |
| healthcheck: | |
| test: ["CMD-SHELL", "curl -fsS http://localhost:8080/v1/models >/dev/null || exit 1"] | |
| interval: 30s | |
| timeout: 15s | |
| retries: 8 | |
| start_period: 1200s | |
| networks: | |
| redaction-net-llama: | |
| aliases: | |
| - llama-inference | |
| # Gemma 4 26B model setup below requires 24GB+ of VRAM to run. | |
| gemma4-26b_q4_gguf: | |
| profiles: ["gemma4-26b"] | |
| image: ghcr.io/ggml-org/llama.cpp:server-cuda12 | |
| command: | |
| - -hf | |
| - unsloth/gemma-4-26B-A4B-it-GGUF | |
| - --hf-file | |
| - gemma-4-26B-A4B-it-UD-IQ4_NL.gguf | |
| - --mmproj-url | |
| - https://huggingface.co/unsloth/gemma-4-26B-A4B-it-GGUF/resolve/main/mmproj-F16.gguf | |
| - --n-gpu-layers | |
| - "999" | |
| - --ctx-size | |
| - "65536" | |
| - --fit | |
| - "off" | |
| - --temp | |
| - "1.0" | |
| - --top-k | |
| - "64" | |
| - --top-p | |
| - "0.95" | |
| - --host | |
| - "0.0.0.0" | |
| - --port | |
| - "8080" | |
| - --no-warmup | |
| - --seed | |
| - "42" | |
| - -np | |
| - "1" | |
| - --cache-type-k | |
| - "q8_0" | |
| - --cache-type-v | |
| - "q8_0" | |
| ports: | |
| - "8002:8080" | |
| volumes: | |
| - ./models:/models | |
| - hf-llama-cache-gemma4-26b:/root/.cache/llama.cpp | |
| - hf-hub-cache-gemma4-26b:/root/.cache/huggingface | |
| pull_policy: always | |
| deploy: | |
| resources: | |
| reservations: | |
| devices: | |
| - driver: nvidia | |
| count: all | |
| capabilities: [gpu] | |
| healthcheck: | |
| test: ["CMD-SHELL", "curl -fsS http://localhost:8080/v1/models >/dev/null || exit 1"] | |
| interval: 30s | |
| timeout: 15s | |
| retries: 8 | |
| start_period: 1200s | |
| networks: | |
| redaction-net-llama: | |
| aliases: | |
| - llama-inference | |
| redaction-app-llama: | |
| profiles: ["35b_36", "35b", "27b", "9b", "gemma4-31b", "gemma4-26b"] | |
| image: redaction-app-main | |
| build: | |
| context: . # Look in the current folder | |
| dockerfile: Dockerfile # Use this file | |
| target: gradio # Use the 'gradio' stage from your Dockerfile | |
| args: # Pass your build-time variables here! | |
| - TORCH_GPU_ENABLED=False | |
| - INSTALL_VLM=False | |
| - PADDLE_GPU_ENABLED=True | |
| - INSTALL_PADDLEOCR=True | |
| shm_size: '8gb' | |
| depends_on: | |
| qwen36-35b_q4_gguf: | |
| condition: service_healthy | |
| required: false | |
| qwen35-35b_q4_gguf: | |
| condition: service_healthy | |
| required: false | |
| qwen35-27b_q4_gguf: | |
| condition: service_healthy | |
| required: false | |
| qwen9b_q4_gguf: | |
| condition: service_healthy | |
| required: false | |
| gemma4-31b_q4_gguf: | |
| condition: service_healthy | |
| required: false | |
| gemma4-26b_q4_gguf: | |
| condition: service_healthy | |
| required: false | |
| environment: | |
| - FLAGS_fraction_of_gpu_memory_to_use=0.05 | |
| - RUN_FASTAPI=True | |
| - APP_MODE=fastapi | |
| - SHOW_PADDLE_MODEL_OPTIONS=True | |
| - SHOW_LOCAL_OCR_MODEL_OPTIONS=True | |
| - SHOW_LOCAL_PII_DETECTION_OPTIONS=True | |
| - SHOW_INFERENCE_SERVER_PII_OPTIONS=True | |
| - SHOW_INFERENCE_SERVER_VLM_OPTIONS=True | |
| - SHOW_HYBRID_MODELS=True | |
| - SHOW_DIFFICULT_OCR_EXAMPLES=True | |
| - SHOW_ALL_OUTPUTS_IN_OUTPUT_FOLDER=True | |
| - SHOW_SUMMARISATION=True | |
| - SHOW_AWS_API_KEYS=True | |
| - DEFAULT_TEXT_EXTRACTION_MODEL=Local OCR model - PDFs without selectable text | |
| - DEFAULT_LOCAL_OCR_MODEL=paddle | |
| - DEFAULT_PII_DETECTION_MODEL=Local | |
| - INFERENCE_SERVER_API_URL=http://llama-inference:8080 | |
| - DEFAULT_INFERENCE_SERVER_VLM_MODEL="" | |
| - DEFAULT_INFERENCE_SERVER_PII_MODEL="" | |
| - CUSTOM_VLM_BACKEND=inference_vlm | |
| - MAX_WORKERS=12 | |
| - TESSERACT_MAX_WORKERS=8 | |
| - PADDLE_MAX_WORKERS=1 # Keep this to 1 to avoid VRAM overflow or errors | |
| - LOAD_PADDLE_AT_STARTUP=False | |
| - EFFICIENT_OCR=True | |
| - SHOW_CUSTOM_VLM_ENTITIES=True | |
| - SESSION_OUTPUT_FOLDER=True | |
| - SAVE_PAGE_OCR_VISUALISATIONS=False | |
| - HYBRID_OCR_CONFIDENCE_THRESHOLD=97 | |
| - INCLUDE_OCR_VISUALISATION_IN_OUTPUT_FILES=True | |
| - PREPROCESS_LOCAL_OCR_IMAGES=False | |
| - INFERENCE_SERVER_DISABLE_THINKING=True | |
| - MAX_NEW_TOKENS=16384 | |
| - SAVE_EXAMPLE_HYBRID_IMAGES=False | |
| - SAVE_VLM_INPUT_IMAGES=False | |
| - VLM_MAX_DPI=200.0 | |
| - DEFAULT_NEW_BATCH_CHAR_COUNT=1250 | |
| - REPORT_VLM_OUTPUTS_TO_GUI=True | |
| - REPORT_LLM_OUTPUTS_TO_GUI=True | |
| - ADD_VLM_BOUNDING_BOX_RULES=False | |
| deploy: | |
| resources: | |
| reservations: | |
| devices: | |
| - driver: nvidia | |
| count: all | |
| capabilities: [gpu] | |
| ports: | |
| - "7861:7860" | |
| networks: | |
| - redaction-net-llama | |
networks:
  redaction-net-llama:
    driver: bridge

# One llama.cpp cache + one HF hub cache per model, so downloads (including the
# identically-named mmproj-F16.gguf per repo) are never shared across profiles.
volumes:
  hf-llama-cache-qwen36-35b:
  hf-llama-cache-qwen35-35b:
  hf-llama-cache-qwen35-27b:
  hf-llama-cache-qwen9b:
  hf-llama-cache-gemma4-31b:
  hf-llama-cache-gemma4-26b:
  hf-hub-cache-qwen36-35b:
  hf-hub-cache-qwen35-35b:
  hf-hub-cache-qwen35-27b:
  hf-hub-cache-qwen9b:
  hf-hub-cache-gemma4-31b:
  hf-hub-cache-gemma4-26b: