LilaRest committed on
Commit
819f9e2
·
1 Parent(s): 2dac6d0

Update README

Browse files
Files changed (2) hide show
  1. README.md +23 -1
  2. config.json +1 -0
README.md CHANGED
@@ -17,6 +17,28 @@ tags:
17
  - Gemma-4-31B-IT
18
  - lighthouse
19
  pipeline_tag: text-generation
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
  ---
22
 
@@ -104,7 +126,7 @@ vllm serve LilaRest/gemma-4-31B-it-NVFP4-turbo \
104
 
105
  - `--quantization modelopt` — required, activates NVIDIA's optimized CUTLASS kernels
106
  - `--kv-cache-dtype fp8` — halves KV cache memory on Blackwell
107
- - `--max-model-len 16384` — maximum context length per request. Limited to ~30-40K on RTX 5090, full 262K on PRO 6000
108
 
109
  ## Compatibility
110
 
 
17
  - Gemma-4-31B-IT
18
  - lighthouse
19
  pipeline_tag: text-generation
20
+ model-index:
21
+ - name: gemma-4-31B-it-NVFP4-turbo
22
+ results:
23
+ - task:
24
+ type: text-generation
25
+ dataset:
26
+ name: GPQA Diamond
27
+ type: Idavidrein/gpqa
28
+ config: gpqa_diamond
29
+ metrics:
30
+ - name: Accuracy
31
+ type: accuracy
32
+ value: 72.73
33
+ - task:
34
+ type: text-generation
35
+ dataset:
36
+ name: MMLU Pro
37
+ type: TIGER-Lab/MMLU-Pro
38
+ metrics:
39
+ - name: Accuracy
40
+ type: accuracy
41
+ value: 83.93
42
 
43
  ---
44
 
 
126
 
127
  - `--quantization modelopt` — required, activates NVIDIA's optimized CUTLASS kernels
128
  - `--kv-cache-dtype fp8` — halves KV cache memory on Blackwell
129
+ - `--max-model-len 16384` — maximum context length per request. See [Compatibility](#compatibility) for max value per GPU.
130
 
131
  ## Compatibility
132
 
config.json CHANGED
@@ -154,6 +154,7 @@
154
  "model.embed_vision*",
155
  "model.vision_tower*"
156
  ],
 
157
  "quant_algo": "NVFP4",
158
  "kv_cache_scheme": {
159
  "dynamic": false,
 
154
  "model.embed_vision*",
155
  "model.vision_tower*"
156
  ],
157
+ "bits": 4,
158
  "quant_algo": "NVFP4",
159
  "kv_cache_scheme": {
160
  "dynamic": false,