| # Set number of threads. | |
| # Note: prefer the number of physical cores. Overbooking the CPU degrades performance notably. | |
| LOCALAI_THREADS=14 | |
| # Specify a different bind address (defaults to ":8080") | |
| LOCALAI_ADDRESS=127.0.0.1:8080 | |
| # Default models context size | |
| LOCALAI_CONTEXT_SIZE=512 | |
| ## Define galleries. | |
| # Models available to install will be visible in `/models/available` | |
| LOCALAI_GALLERIES=[{"name":"localai", "url":"github:mudler/LocalAI/gallery/index.yaml@master"}] | |
| # CORS settings | |
| LOCALAI_CORS=true | |
| LOCALAI_CORS_ALLOW_ORIGINS=* | |
| # Default path for models | |
| # LOCALAI_MODELS_PATH=/models | |
| # Enable debug mode | |
| LOCALAI_LOG_LEVEL=debug | |
| # Disables COMPEL (Diffusers) | |
| COMPEL=0 | |
| # Enable/Disable single backend (useful if only one GPU is available) | |
| LOCALAI_SINGLE_ACTIVE_BACKEND=true | |
| # Forces shutdown of the backends if busy (only if LOCALAI_SINGLE_ACTIVE_BACKEND is set) | |
| LOCALAI_FORCE_BACKEND_SHUTDOWN=true | |
| # Path where to store generated images | |
| LOCALAI_IMAGE_PATH=/tmp/generated/images | |
| # Specify a default upload limit in MB (whisper) | |
| LOCALAI_UPLOAD_LIMIT=15 | |
| # List of external GRPC backends (note: on the container image, this variable is already set to use the extra backends available in extra/) | |
| LOCALAI_EXTERNAL_GRPC_BACKENDS=my-backend:127.0.0.1:9000,my-backend2:/usr/bin/backend.py | |
| ## Advanced settings ### | |
| ## These are not used by LocalAI itself, but by other components in the stack ### | |
| # | |
| ## Preload libraries | |
| LD_PRELOAD= | |
| ## Huggingface cache for models | |
| HUGGINGFACE_HUB_CACHE=/usr/local/huggingface | |
| ## Python backends GRPC max workers | |
| ## Default number of workers for GRPC Python backends. | |
| ## This actually controls whether a backend can process multiple requests or not. | |
| PYTHON_GRPC_MAX_WORKERS=1 | |
| ## Define the number of parallel LLAMA.cpp workers (Defaults to 1) | |
| LLAMACPP_PARALLEL=1 | |
| ## Define a list of GRPC Servers for llama-cpp workers to distribute the load | |
| ## https://github.com/ggerganov/llama.cpp/pull/6829 | |
| ## https://github.com/ggerganov/llama.cpp/blob/master/tools/rpc/README.md | |
| LLAMACPP_GRPC_SERVERS="" | |
| ## Enable to run parallel requests | |
| LOCALAI_PARALLEL_REQUESTS=true | |
| ## Enable to allow p2p mode | |
| LOCALAI_P2P=true | |
| ## Enable to use federated mode | |
| LOCALAI_FEDERATED=true | |
| ## Enable to start the federation server | |
| FEDERATED_SERVER=true | |
| ## Define the federation token to use | |
| TOKEN="" | |
| ## Watchdog settings | |
| ## | |
| # Enables watchdog to kill backends that are inactive for too much time | |
| LOCALAI_WATCHDOG_IDLE=true | |
| # Time in duration format (e.g. 1h30m) after which a backend is considered idle | |
| LOCALAI_WATCHDOG_IDLE_TIMEOUT=5m | |
| # Enables watchdog to kill backends that are busy for too much time | |
| LOCALAI_WATCHDOG_BUSY=true | |
| # Time in duration format (e.g. 1h30m) after which a backend is considered busy | |
| LOCALAI_WATCHDOG_BUSY_TIMEOUT=5m | |