Is this the way?
#1
by datayoda - opened
Is this the best way to run this model with llama-server on Strix Halo for coding?
# Launch llama-server (llama.cpp) on an AMD Strix Halo APU via the RADV Vulkan driver.
#
# AMD_VULKAN_ICD=RADV        — select the Mesa RADV ICD instead of AMDVLK for this process.
# -ngl 999 --no-mmap         — offload all layers to the GPU; load weights into RAM
#                              instead of mmap-ing the file.
# -fa on                     — enable flash attention.
# -ctk/-ctv q8_0             — quantize the KV cache to q8_0 to fit the 256K context.
# -c 262144 -n 32768         — 256K context window, up to 32K generated tokens.
# --jinja                    — use the model's built-in Jinja chat template.
# Sampling (--temp/--top-k/--top-p/--min-p/--repeat-*) is tuned low-temperature for coding.
#
# NOTE(review): `--reasoning off` may not be a recognized llama-server flag —
# upstream builds expose `--reasoning-format` / `--reasoning-budget` instead;
# verify against `llama-server --help` for your build.
#
# Without the trailing backslashes the original paste is not runnable: bash would
# run the first line alone and treat every following `--flag` line as its own command.
AMD_VULKAN_ICD=RADV llama-server \
  --host 127.0.0.1 --port 8080 \
  --no-webui \
  --jinja \
  --reasoning off \
  -t 24 -tb 24 \
  --parallel 1 \
  -ngl 999 --no-mmap -fa on \
  -ctk q8_0 -ctv q8_0 \
  --spec-type ngram-mod \
  --cache-reuse 4096 \
  --cache-ram 4096 \
  --batch-size 1024 \
  --ubatch-size 1024 \
  -c 262144 -n 32768 \
  --temp 0.1 \
  --top-k 40 \
  --top-p 0.95 \
  --min-p 0.05 \
  --repeat-penalty 1.05 \
  --repeat-last-n 256 \
  --alias mistral-small4-119b \
  --model /mnt/models/mistral/mistral-small-4-119b-q80-q6k_ffn.gguf