DJLougen committed on
Commit
8f8a4dc
·
verified ·
1 Parent(s): c9d1ad0

Promote best BusyBeaver checkpoint 250

Browse files
README.md CHANGED
@@ -184,10 +184,22 @@ BusyBeaver-50M predicts tool calls; it does not execute them. Production harness
184
  - Browser-agent data was not the primary training target yet.
185
  - The architecture is custom, so ordinary inference engines need a BusyBeaver adapter unless exported through a compatible runtime wrapper.
186
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
187
  ## Provenance
188
 
189
- - Internal run label: V11 grounding
190
- - Training hardware: RunPod GPU pod
191
- - Promoted checkpoint: 200
192
- - Full checkpoint archive: `GestaltLabs/BusyBeaver-50M-v11-grounding-runpod`
193
- - Training payload: `DJLougen/busybeaver-training-payload-v11-grounding`
 
184
  - Browser-agent data was not the primary training target yet.
185
  - The architecture is custom, so ordinary inference engines need a BusyBeaver adapter unless exported through a compatible runtime wrapper.
186
 
187
+ ## Latest Promotion
188
+
189
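+ Promoted from `GestaltLabs/BusyBeaver-50M-v12-path-grounding-runpod` checkpoint `250`. The scores below are the aggregate results over the 384 evaluation examples recorded in `busybeaver_eval/metrics.json`.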
190
+
191
+ | Metric | Score |
192
+ | --- | ---: |
193
+ | json_validity_rate | 1.0000 |
194
+ | strict_json_rate | 1.0000 |
195
+ | schema_validity_rate | 0.9792 |
196
+ | valid_tool_rate | 0.9974 |
197
+ | correct_tool_accuracy | 0.9818 |
198
+ | argument_exact_match | 0.6432 |
199
+ | argument_semantic_match | 0.6510 |
200
+
201
  ## Provenance
202
 
203
+ - Promoted checkpoint: 250
204
+ - Source checkpoint archive: `GestaltLabs/BusyBeaver-50M-v12-path-grounding-runpod`
205
+
 
 
busybeaver_eval/metrics.json CHANGED
@@ -1,24 +1,24 @@
1
  {
2
- "json_validity_rate": 0.99609375,
3
- "strict_json_rate": 0.99609375,
4
- "schema_validity_rate": 0.99609375,
5
- "valid_tool_rate": 0.99609375,
6
- "correct_tool_accuracy": 0.98046875,
7
- "argument_exact_match": 0.8984375,
8
- "argument_semantic_match": 0.90234375,
9
  "groups": {
10
  "edit": {
11
- "n": 33,
12
  "json_validity_rate": 1.0,
13
  "strict_json_rate": 1.0,
14
  "schema_validity_rate": 1.0,
15
  "valid_tool_rate": 1.0,
16
  "correct_tool_accuracy": 1.0,
17
- "argument_exact_match": 0.7878787878787878,
18
- "argument_semantic_match": 0.7878787878787878
19
  },
20
  "escalate": {
21
- "n": 3,
22
  "json_validity_rate": 1.0,
23
  "strict_json_rate": 1.0,
24
  "schema_validity_rate": 1.0,
@@ -28,44 +28,54 @@
28
  "argument_semantic_match": 1.0
29
  },
30
  "execute": {
31
- "n": 32,
32
  "json_validity_rate": 1.0,
33
  "strict_json_rate": 1.0,
34
  "schema_validity_rate": 1.0,
35
  "valid_tool_rate": 1.0,
36
  "correct_tool_accuracy": 1.0,
37
- "argument_exact_match": 1.0,
38
- "argument_semantic_match": 1.0
39
  },
40
  "inspect": {
41
- "n": 143,
42
- "json_validity_rate": 0.993006993006993,
43
- "strict_json_rate": 0.993006993006993,
44
- "schema_validity_rate": 0.993006993006993,
45
- "valid_tool_rate": 0.993006993006993,
46
- "correct_tool_accuracy": 0.965034965034965,
47
- "argument_exact_match": 0.9020979020979021,
48
- "argument_semantic_match": 0.9090909090909091
 
 
 
 
 
 
 
 
 
 
49
  },
50
  "other": {
51
- "n": 21,
52
  "json_validity_rate": 1.0,
53
  "strict_json_rate": 1.0,
54
  "schema_validity_rate": 1.0,
55
  "valid_tool_rate": 1.0,
56
  "correct_tool_accuracy": 1.0,
57
- "argument_exact_match": 1.0,
58
- "argument_semantic_match": 1.0
59
  },
60
  "test": {
61
- "n": 24,
62
  "json_validity_rate": 1.0,
63
  "strict_json_rate": 1.0,
64
  "schema_validity_rate": 1.0,
65
  "valid_tool_rate": 1.0,
66
  "correct_tool_accuracy": 1.0,
67
- "argument_exact_match": 0.7916666666666666,
68
- "argument_semantic_match": 0.7916666666666666
69
  }
70
  }
71
  }
 
1
  {
2
+ "json_validity_rate": 1.0,
3
+ "strict_json_rate": 1.0,
4
+ "schema_validity_rate": 0.9791666666666666,
5
+ "valid_tool_rate": 0.9973958333333334,
6
+ "correct_tool_accuracy": 0.9817708333333334,
7
+ "argument_exact_match": 0.6432291666666666,
8
+ "argument_semantic_match": 0.6510416666666666,
9
  "groups": {
10
  "edit": {
11
+ "n": 49,
12
  "json_validity_rate": 1.0,
13
  "strict_json_rate": 1.0,
14
  "schema_validity_rate": 1.0,
15
  "valid_tool_rate": 1.0,
16
  "correct_tool_accuracy": 1.0,
17
+ "argument_exact_match": 0.6530612244897959,
18
+ "argument_semantic_match": 0.6530612244897959
19
  },
20
  "escalate": {
21
+ "n": 14,
22
  "json_validity_rate": 1.0,
23
  "strict_json_rate": 1.0,
24
  "schema_validity_rate": 1.0,
 
28
  "argument_semantic_match": 1.0
29
  },
30
  "execute": {
31
+ "n": 45,
32
  "json_validity_rate": 1.0,
33
  "strict_json_rate": 1.0,
34
  "schema_validity_rate": 1.0,
35
  "valid_tool_rate": 1.0,
36
  "correct_tool_accuracy": 1.0,
37
+ "argument_exact_match": 0.7111111111111111,
38
+ "argument_semantic_match": 0.7111111111111111
39
  },
40
  "inspect": {
41
+ "n": 193,
42
+ "json_validity_rate": 1.0,
43
+ "strict_json_rate": 1.0,
44
+ "schema_validity_rate": 0.9585492227979274,
45
+ "valid_tool_rate": 0.9948186528497409,
46
+ "correct_tool_accuracy": 0.9637305699481865,
47
+ "argument_exact_match": 0.7150259067357513,
48
+ "argument_semantic_match": 0.7202072538860104
49
+ },
50
+ "memory": {
51
+ "n": 12,
52
+ "json_validity_rate": 1.0,
53
+ "strict_json_rate": 1.0,
54
+ "schema_validity_rate": 1.0,
55
+ "valid_tool_rate": 1.0,
56
+ "correct_tool_accuracy": 1.0,
57
+ "argument_exact_match": 0.0,
58
+ "argument_semantic_match": 0.0
59
  },
60
  "other": {
61
+ "n": 45,
62
  "json_validity_rate": 1.0,
63
  "strict_json_rate": 1.0,
64
  "schema_validity_rate": 1.0,
65
  "valid_tool_rate": 1.0,
66
  "correct_tool_accuracy": 1.0,
67
+ "argument_exact_match": 0.4444444444444444,
68
+ "argument_semantic_match": 0.4888888888888889
69
  },
70
  "test": {
71
+ "n": 26,
72
  "json_validity_rate": 1.0,
73
  "strict_json_rate": 1.0,
74
  "schema_validity_rate": 1.0,
75
  "valid_tool_rate": 1.0,
76
  "correct_tool_accuracy": 1.0,
77
+ "argument_exact_match": 0.4230769230769231,
78
+ "argument_semantic_match": 0.4230769230769231
79
  }
80
  }
81
  }
busybeaver_eval/report.md CHANGED
@@ -1,26 +1,26 @@
1
  # BusyBeaver Checkpoint Evaluation
2
 
3
- - Step: 200
4
 
5
- - json_validity_rate: 0.9961
6
- - strict_json_rate: 0.9961
7
- - schema_validity_rate: 0.9961
8
- - valid_tool_rate: 0.9961
9
- - correct_tool_accuracy: 0.9805
10
- - argument_exact_match: 0.8984
11
- - argument_semantic_match: 0.9023
12
 
13
  ## Grouped Metrics
14
 
15
- ### edit (n=33)
16
  - json_validity_rate: 1.0000
17
  - strict_json_rate: 1.0000
18
  - schema_validity_rate: 1.0000
19
  - valid_tool_rate: 1.0000
20
  - correct_tool_accuracy: 1.0000
21
- - argument_exact_match: 0.7879
22
- - argument_semantic_match: 0.7879
23
- ### escalate (n=3)
24
  - json_validity_rate: 1.0000
25
  - strict_json_rate: 1.0000
26
  - schema_validity_rate: 1.0000
@@ -28,35 +28,43 @@
28
  - correct_tool_accuracy: 1.0000
29
  - argument_exact_match: 1.0000
30
  - argument_semantic_match: 1.0000
31
- ### execute (n=32)
32
  - json_validity_rate: 1.0000
33
  - strict_json_rate: 1.0000
34
  - schema_validity_rate: 1.0000
35
  - valid_tool_rate: 1.0000
36
  - correct_tool_accuracy: 1.0000
37
- - argument_exact_match: 1.0000
38
- - argument_semantic_match: 1.0000
39
- ### inspect (n=143)
40
- - json_validity_rate: 0.9930
41
- - strict_json_rate: 0.9930
42
- - schema_validity_rate: 0.9930
43
- - valid_tool_rate: 0.9930
44
- - correct_tool_accuracy: 0.9650
45
- - argument_exact_match: 0.9021
46
- - argument_semantic_match: 0.9091
47
- ### other (n=21)
48
  - json_validity_rate: 1.0000
49
  - strict_json_rate: 1.0000
50
  - schema_validity_rate: 1.0000
51
  - valid_tool_rate: 1.0000
52
  - correct_tool_accuracy: 1.0000
53
- - argument_exact_match: 1.0000
54
- - argument_semantic_match: 1.0000
55
- ### test (n=24)
 
 
 
 
 
 
 
 
56
  - json_validity_rate: 1.0000
57
  - strict_json_rate: 1.0000
58
  - schema_validity_rate: 1.0000
59
  - valid_tool_rate: 1.0000
60
  - correct_tool_accuracy: 1.0000
61
- - argument_exact_match: 0.7917
62
- - argument_semantic_match: 0.7917
 
1
  # BusyBeaver Checkpoint Evaluation
2
 
3
+ - Step: 250
4
 
5
+ - json_validity_rate: 1.0000
6
+ - strict_json_rate: 1.0000
7
+ - schema_validity_rate: 0.9792
8
+ - valid_tool_rate: 0.9974
9
+ - correct_tool_accuracy: 0.9818
10
+ - argument_exact_match: 0.6432
11
+ - argument_semantic_match: 0.6510
12
 
13
  ## Grouped Metrics
14
 
15
+ ### edit (n=49)
16
  - json_validity_rate: 1.0000
17
  - strict_json_rate: 1.0000
18
  - schema_validity_rate: 1.0000
19
  - valid_tool_rate: 1.0000
20
  - correct_tool_accuracy: 1.0000
21
+ - argument_exact_match: 0.6531
22
+ - argument_semantic_match: 0.6531
23
+ ### escalate (n=14)
24
  - json_validity_rate: 1.0000
25
  - strict_json_rate: 1.0000
26
  - schema_validity_rate: 1.0000
 
28
  - correct_tool_accuracy: 1.0000
29
  - argument_exact_match: 1.0000
30
  - argument_semantic_match: 1.0000
31
+ ### execute (n=45)
32
  - json_validity_rate: 1.0000
33
  - strict_json_rate: 1.0000
34
  - schema_validity_rate: 1.0000
35
  - valid_tool_rate: 1.0000
36
  - correct_tool_accuracy: 1.0000
37
+ - argument_exact_match: 0.7111
38
+ - argument_semantic_match: 0.7111
39
+ ### inspect (n=193)
40
+ - json_validity_rate: 1.0000
41
+ - strict_json_rate: 1.0000
42
+ - schema_validity_rate: 0.9585
43
+ - valid_tool_rate: 0.9948
44
+ - correct_tool_accuracy: 0.9637
45
+ - argument_exact_match: 0.7150
46
+ - argument_semantic_match: 0.7202
47
+ ### memory (n=12)
48
  - json_validity_rate: 1.0000
49
  - strict_json_rate: 1.0000
50
  - schema_validity_rate: 1.0000
51
  - valid_tool_rate: 1.0000
52
  - correct_tool_accuracy: 1.0000
53
+ - argument_exact_match: 0.0000
54
+ - argument_semantic_match: 0.0000
55
+ ### other (n=45)
56
+ - json_validity_rate: 1.0000
57
+ - strict_json_rate: 1.0000
58
+ - schema_validity_rate: 1.0000
59
+ - valid_tool_rate: 1.0000
60
+ - correct_tool_accuracy: 1.0000
61
+ - argument_exact_match: 0.4444
62
+ - argument_semantic_match: 0.4889
63
+ ### test (n=26)
64
  - json_validity_rate: 1.0000
65
  - strict_json_rate: 1.0000
66
  - schema_validity_rate: 1.0000
67
  - valid_tool_rate: 1.0000
68
  - correct_tool_accuracy: 1.0000
69
+ - argument_exact_match: 0.4231
70
+ - argument_semantic_match: 0.4231
busybeaver_eval/traces.jsonl CHANGED
The diff for this file is too large to render. See raw diff
 
busybeaver_state.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f3727f8dd1c1c7a2f41a14735ce950b84a51a370ab727cdc2871c11959b43019
3
  size 222742359
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a7c3da29992eaf6c5f8fea2dcbb3c1b66d93ef6c6ff2fe59f38be71d18de4d80
3
  size 222742359
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dde654cf20eee4f96217f749599ac06efd21c9c3d3b786e82844812e86c4a49c
3
  size 197545296
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bb10cfbe0360a75512146cadff5aa5606106084d406809423a75138a372943d1
3
  size 197545296