clemsail commited on
Commit
611a651
·
verified ·
1 Parent(s): afb5696

chore: upload lm-eval-harness results

Browse files
evals/results_2026-04-15T15-32-15.795505.json ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "ifeval": {
4
+ "alias": "ifeval",
5
+ "prompt_level_strict_acc,none": 0.6913123844731978,
6
+ "prompt_level_strict_acc_stderr,none": 0.019879245251116444,
7
+ "inst_level_strict_acc,none": 0.7733812949640287,
8
+ "inst_level_strict_acc_stderr,none": "N/A",
9
+ "prompt_level_loose_acc,none": 0.7245841035120147,
10
+ "prompt_level_loose_acc_stderr,none": 0.01922392319624193,
11
+ "inst_level_loose_acc,none": 0.8009592326139089,
12
+ "inst_level_loose_acc_stderr,none": "N/A"
13
+ }
14
+ },
15
+ "group_subtasks": {
16
+ "ifeval": []
17
+ },
18
+ "configs": {
19
+ "ifeval": {
20
+ "task": "ifeval",
21
+ "dataset_path": "google/IFEval",
22
+ "test_split": "train",
23
+ "doc_to_text": "prompt",
24
+ "doc_to_target": 0,
25
+ "unsafe_code": false,
26
+ "process_results": "def process_results(doc, results):\n inp = InputExample(\n key=doc[\"key\"],\n instruction_id_list=doc[\"instruction_id_list\"],\n prompt=doc[\"prompt\"],\n kwargs=doc[\"kwargs\"],\n )\n response = results[0]\n\n out_strict = test_instruction_following_strict(inp, response)\n out_loose = test_instruction_following_loose(inp, response)\n\n return {\n \"prompt_level_strict_acc\": out_strict.follow_all_instructions,\n \"inst_level_strict_acc\": out_strict.follow_instruction_list,\n \"prompt_level_loose_acc\": out_loose.follow_all_instructions,\n \"inst_level_loose_acc\": out_loose.follow_instruction_list,\n }\n",
27
+ "description": "",
28
+ "target_delimiter": " ",
29
+ "fewshot_delimiter": "\n\n",
30
+ "fewshot_config": {
31
+ "sampler": "default",
32
+ "split": null,
33
+ "process_docs": null,
34
+ "fewshot_indices": null,
35
+ "samples": null,
36
+ "doc_to_text": "prompt",
37
+ "doc_to_choice": null,
38
+ "doc_to_target": 0,
39
+ "gen_prefix": null,
40
+ "fewshot_delimiter": "\n\n",
41
+ "target_delimiter": " "
42
+ },
43
+ "num_fewshot": 0,
44
+ "metric_list": [
45
+ {
46
+ "metric": "prompt_level_strict_acc",
47
+ "aggregation": "mean",
48
+ "higher_is_better": true
49
+ },
50
+ {
51
+ "metric": "inst_level_strict_acc",
52
+ "aggregation": "def agg_inst_level_acc(items):\n flat_items = [item for sublist in items for item in sublist]\n inst_level_acc = sum(flat_items) / len(flat_items)\n return inst_level_acc\n",
53
+ "higher_is_better": true
54
+ },
55
+ {
56
+ "metric": "prompt_level_loose_acc",
57
+ "aggregation": "mean",
58
+ "higher_is_better": true
59
+ },
60
+ {
61
+ "metric": "inst_level_loose_acc",
62
+ "aggregation": "def agg_inst_level_acc(items):\n flat_items = [item for sublist in items for item in sublist]\n inst_level_acc = sum(flat_items) / len(flat_items)\n return inst_level_acc\n",
63
+ "higher_is_better": true
64
+ }
65
+ ],
66
+ "output_type": "generate_until",
67
+ "generation_kwargs": {
68
+ "until": [],
69
+ "do_sample": false,
70
+ "temperature": 0.0,
71
+ "max_gen_toks": 1280
72
+ },
73
+ "repeats": 1,
74
+ "should_decontaminate": false,
75
+ "metadata": {
76
+ "version": 4.0,
77
+ "base_url": "http://localhost:8000/v1/chat/completions",
78
+ "model": "devstral",
79
+ "num_concurrent": 1
80
+ }
81
+ }
82
+ },
83
+ "versions": {
84
+ "ifeval": 4.0
85
+ },
86
+ "n-shot": {
87
+ "ifeval": 0
88
+ },
89
+ "higher_is_better": {
90
+ "ifeval": {
91
+ "prompt_level_strict_acc": true,
92
+ "inst_level_strict_acc": true,
93
+ "prompt_level_loose_acc": true,
94
+ "inst_level_loose_acc": true
95
+ }
96
+ },
97
+ "n-samples": {
98
+ "ifeval": {
99
+ "original": 541,
100
+ "effective": 541
101
+ }
102
+ },
103
+ "config": {
104
+ "model": "local-chat-completions",
105
+ "model_args": {
106
+ "base_url": "http://localhost:8000/v1/chat/completions",
107
+ "model": "devstral",
108
+ "num_concurrent": 1
109
+ },
110
+ "batch_size": "1",
111
+ "batch_sizes": [],
112
+ "device": "cuda:0",
113
+ "use_cache": null,
114
+ "limit": null,
115
+ "bootstrap_iters": 100000,
116
+ "gen_kwargs": {},
117
+ "random_seed": 0,
118
+ "numpy_seed": 1234,
119
+ "torch_seed": 1234,
120
+ "fewshot_seed": 1234
121
+ },
122
+ "git_hash": null,
123
+ "date": 1776256793.262322,
124
+ "pretty_env_info": "N/A (torch not installed)",
125
+ "transformers_version": "N/A",
126
+ "lm_eval_version": "0.4.11",
127
+ "upper_git_hash": null,
128
+ "task_hashes": {
129
+ "ifeval": "33ea6c342001769a1e568f8fb1f146e8b8c2d7ad1420ed0770c7e67a7a121685"
130
+ },
131
+ "model_source": "local-chat-completions",
132
+ "model_name": "devstral",
133
+ "model_name_sanitized": "devstral",
134
+ "system_instruction": null,
135
+ "system_instruction_sha": null,
136
+ "fewshot_as_multiturn": true,
137
+ "chat_template": "",
138
+ "chat_template_sha": null,
139
+ "total_evaluation_time_seconds": "3143.920960575342"
140
+ }