amirali1985 commited on
Commit
e793093
·
verified ·
1 Parent(s): e905fdb

Upload add_sub_baseline_10K/metrics.json with huggingface_hub

Browse files
Files changed (1) hide show
  1. add_sub_baseline_10K/metrics.json +238 -173
add_sub_baseline_10K/metrics.json CHANGED
@@ -330,502 +330,567 @@
330
  "K": null,
331
  "mode": "sft",
332
  "n_digits": 6,
333
- "n_per_split": 100
334
  },
335
  "splits": {
336
  "add_S0": {
337
  "full_accuracy": 0.94,
338
- "n_examples": 100,
 
339
  "per_subtask": {
340
  "SA": {
341
- "accuracy": 0.9900826446280991,
342
- "count": 605
343
  },
344
  "SS": {
345
  "accuracy": 1.0,
346
- "count": 95
347
  }
348
  }
349
  },
350
  "add_S1": {
351
- "full_accuracy": 0.99,
352
- "n_examples": 100,
 
353
  "per_subtask": {
354
  "SA": {
355
- "accuracy": 0.9950980392156863,
356
- "count": 204
357
  },
358
  "SC": {
359
  "accuracy": 1.0,
360
- "count": 169
361
  },
362
  "SS": {
363
  "accuracy": 1.0,
364
- "count": 31
365
  },
366
  "UC": {
367
- "accuracy": 1.0,
368
- "count": 296
369
  }
370
  }
371
  },
372
  "add_S2": {
373
- "full_accuracy": 0.87,
374
- "n_examples": 100,
 
375
  "per_subtask": {
376
  "SA": {
377
- "accuracy": 0.9877300613496932,
378
- "count": 163
379
  },
380
  "SC": {
381
  "accuracy": 1.0,
382
- "count": 130
383
  },
384
  "SS": {
385
- "accuracy": 0.9885057471264368,
386
- "count": 87
387
  },
388
  "UC": {
389
- "accuracy": 0.9408866995073891,
390
- "count": 203
391
  },
392
  "US": {
393
  "accuracy": 1.0,
394
- "count": 117
395
  }
396
  }
397
  },
398
  "add_S3": {
399
- "full_accuracy": 0.69,
400
- "n_examples": 100,
 
401
  "per_subtask": {
402
  "SA": {
403
  "accuracy": 1.0,
404
- "count": 121
405
  },
406
  "SC": {
407
- "accuracy": 0.9917355371900827,
408
- "count": 121
409
  },
410
  "SS": {
411
  "accuracy": 1.0,
412
- "count": 49
413
  },
414
  "UC": {
415
- "accuracy": 0.8387096774193549,
416
- "count": 186
417
  },
418
  "US": {
419
  "accuracy": 1.0,
420
- "count": 223
421
  }
422
  }
423
  },
424
  "add_S4": {
425
- "full_accuracy": 0.57,
426
- "n_examples": 100,
 
427
  "per_subtask": {
428
  "SA": {
429
  "accuracy": 1.0,
430
- "count": 104
431
  },
432
  "SC": {
433
  "accuracy": 1.0,
434
- "count": 106
435
  },
436
  "SS": {
437
  "accuracy": 1.0,
438
- "count": 23
439
  },
440
  "UC": {
441
- "accuracy": 0.74375,
442
- "count": 160
443
  },
444
  "US": {
445
- "accuracy": 0.9511400651465798,
446
- "count": 307
447
  }
448
  }
449
  },
450
  "add_S5": {
451
- "full_accuracy": 0.32,
452
- "n_examples": 100,
 
453
  "per_subtask": {
454
  "SA": {
455
  "accuracy": 1.0,
456
- "count": 100
457
  },
458
  "SC": {
459
  "accuracy": 1.0,
460
- "count": 100
461
  },
462
  "UC": {
463
- "accuracy": 0.39,
464
- "count": 100
465
  },
466
  "US": {
467
- "accuracy": 0.7075,
468
- "count": 400
469
  }
470
  }
471
  },
472
  "add_S6": {
473
- "full_accuracy": 0.46,
474
- "n_examples": 100,
 
475
  "per_subtask": {
476
  "SC": {
477
  "accuracy": 1.0,
478
- "count": 100
479
  },
480
  "UC": {
481
- "accuracy": 0.55,
482
- "count": 100
483
  },
484
  "US": {
485
- "accuracy": 0.718,
486
- "count": 500
487
  }
488
  }
489
  },
490
  "add_random": {
491
- "full_accuracy": 0.945,
 
492
  "n_examples": 200,
493
  "per_subtask": {
494
  "SA": {
495
- "accuracy": 0.9932885906040269,
496
- "count": 447
497
  },
498
  "SC": {
499
  "accuracy": 1.0,
500
- "count": 320
501
  },
502
  "SS": {
503
  "accuracy": 1.0,
504
- "count": 56
505
  },
506
  "UC": {
507
- "accuracy": 0.9848771266540642,
508
- "count": 529
509
  },
510
  "US": {
511
  "accuracy": 1.0,
512
- "count": 48
513
  }
514
  }
515
  },
516
- "add_C3": {
517
- "full_accuracy": 0.83,
518
- "n_examples": 100,
 
519
  "per_subtask": {
520
  "SA": {
521
  "accuracy": 1.0,
522
- "count": 300
523
  },
524
  "SC": {
525
  "accuracy": 1.0,
526
- "count": 100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
527
  },
528
  "UC": {
529
- "accuracy": 0.9119170984455959,
530
- "count": 193
531
  },
532
  "US": {
533
  "accuracy": 1.0,
534
- "count": 107
535
  }
536
  }
537
  },
538
  "add_C4": {
539
  "full_accuracy": 0.88,
540
- "n_examples": 100,
 
541
  "per_subtask": {
542
  "SA": {
543
- "accuracy": 0.995,
544
- "count": 200
545
  },
546
  "SC": {
547
  "accuracy": 1.0,
548
- "count": 100
549
  },
550
  "UC": {
551
- "accuracy": 0.95703125,
552
- "count": 256
553
  },
554
  "US": {
555
  "accuracy": 1.0,
556
- "count": 144
557
  }
558
  }
559
  },
560
  "add_C5": {
561
- "full_accuracy": 0.8,
562
- "n_examples": 100,
 
563
  "per_subtask": {
564
  "SA": {
565
  "accuracy": 1.0,
566
- "count": 100
567
  },
568
  "SC": {
569
  "accuracy": 1.0,
570
- "count": 100
571
  },
572
  "UC": {
573
- "accuracy": 0.934640522875817,
574
- "count": 306
575
  },
576
  "US": {
577
- "accuracy": 0.9690721649484536,
578
- "count": 194
579
  }
580
  }
581
  },
582
  "add_C6": {
583
- "full_accuracy": 0.78,
584
- "n_examples": 100,
 
585
  "per_subtask": {
586
  "SC": {
587
  "accuracy": 1.0,
588
- "count": 100
589
  },
590
  "UC": {
591
- "accuracy": 0.9398907103825137,
592
- "count": 366
593
  },
594
  "US": {
595
- "accuracy": 0.9871794871794872,
596
- "count": 234
597
  }
598
  }
599
  },
600
  "sub_M0": {
601
- "full_accuracy": 0.96,
602
- "n_examples": 100,
 
603
  "per_subtask": {
604
  "MD": {
605
- "accuracy": 0.9933444259567388,
606
- "count": 601
607
  },
608
  "ME": {
609
  "accuracy": 1.0,
610
- "count": 99
611
  }
612
  }
613
  },
614
  "sub_M1": {
615
- "full_accuracy": 1.0,
616
- "n_examples": 100,
 
617
  "per_subtask": {
618
  "MD": {
619
- "accuracy": 1.0,
620
- "count": 279
621
  },
622
  "MB": {
623
- "accuracy": 1.0,
624
- "count": 145
625
  },
626
  "ME": {
627
  "accuracy": 1.0,
628
- "count": 24
629
  },
630
  "UB": {
631
  "accuracy": 1.0,
632
- "count": 252
633
  }
634
  }
635
  },
636
  "sub_M2": {
637
- "full_accuracy": 0.9,
638
- "n_examples": 100,
 
639
  "per_subtask": {
640
  "MD": {
641
  "accuracy": 1.0,
642
- "count": 213
643
  },
644
  "MB": {
645
- "accuracy": 0.9911504424778761,
646
- "count": 113
647
  },
648
  "ME": {
649
  "accuracy": 1.0,
650
- "count": 85
651
  },
652
  "UB": {
653
- "accuracy": 0.9447513812154696,
654
- "count": 181
655
  },
656
  "UD": {
657
  "accuracy": 1.0,
658
- "count": 108
659
  }
660
  }
661
  },
662
  "sub_M3": {
663
- "full_accuracy": 0.47,
664
- "n_examples": 100,
 
665
  "per_subtask": {
666
  "MD": {
667
  "accuracy": 1.0,
668
- "count": 179
669
  },
670
  "MB": {
671
- "accuracy": 0.9902912621359223,
672
- "count": 103
673
  },
674
  "ME": {
675
  "accuracy": 1.0,
676
- "count": 56
677
  },
678
  "UB": {
679
- "accuracy": 0.6375838926174496,
680
- "count": 149
681
  },
682
  "UD": {
683
  "accuracy": 1.0,
684
- "count": 213
685
  }
686
  }
687
  },
688
  "sub_M4": {
689
- "full_accuracy": 0.08,
690
- "n_examples": 100,
 
691
  "per_subtask": {
692
  "MD": {
693
  "accuracy": 1.0,
694
- "count": 200
695
  },
696
  "MB": {
697
  "accuracy": 1.0,
698
- "count": 100
699
  },
700
  "UB": {
701
- "accuracy": 0.3,
702
- "count": 100
703
  },
704
  "UD": {
705
- "accuracy": 0.7433333333333333,
706
- "count": 300
707
  }
708
  }
709
  },
710
  "sub_M5": {
711
- "full_accuracy": 0.02,
712
- "n_examples": 100,
 
713
  "per_subtask": {
714
  "MD": {
715
  "accuracy": 1.0,
716
- "count": 100
717
  },
718
  "MB": {
719
  "accuracy": 1.0,
720
- "count": 100
721
  },
722
  "UB": {
723
  "accuracy": 0.38,
724
- "count": 100
725
  },
726
  "UD": {
727
- "accuracy": 0.5575,
728
- "count": 400
729
  }
730
  }
731
  },
732
  "sub_random": {
733
  "full_accuracy": 0.96,
 
734
  "n_examples": 200,
735
  "per_subtask": {
736
  "MD": {
737
- "accuracy": 0.9983333333333333,
738
- "count": 600
739
  },
740
  "MB": {
741
- "accuracy": 0.9925093632958801,
742
- "count": 267
743
  },
744
  "ME": {
745
  "accuracy": 1.0,
746
  "count": 53
747
  },
748
  "UB": {
749
- "accuracy": 0.9886104783599089,
750
- "count": 439
751
  },
752
  "UD": {
753
  "accuracy": 1.0,
754
- "count": 41
755
  }
756
  }
757
  },
758
  "sub_B3": {
759
- "full_accuracy": 0.75,
760
- "n_examples": 100,
 
761
  "per_subtask": {
762
  "MD": {
763
  "accuracy": 1.0,
764
- "count": 300
765
  },
766
  "MB": {
767
  "accuracy": 1.0,
768
- "count": 100
769
  },
770
  "UB": {
771
- "accuracy": 0.8730964467005076,
772
- "count": 197
773
  },
774
  "UD": {
775
  "accuracy": 1.0,
776
- "count": 103
777
  }
778
  }
779
  },
780
  "sub_B4": {
781
- "full_accuracy": 0.68,
782
- "n_examples": 100,
 
783
  "per_subtask": {
784
  "MD": {
785
  "accuracy": 1.0,
786
- "count": 200
787
  },
788
  "MB": {
789
  "accuracy": 1.0,
790
- "count": 100
791
  },
792
  "UB": {
793
- "accuracy": 0.8866396761133604,
794
- "count": 247
795
  },
796
  "UD": {
797
- "accuracy": 0.934640522875817,
798
- "count": 153
799
  }
800
  }
801
  },
802
  "sub_B5": {
803
- "full_accuracy": 0.59,
804
- "n_examples": 100,
 
805
  "per_subtask": {
806
  "MD": {
807
  "accuracy": 1.0,
808
- "count": 100
809
  },
810
  "MB": {
811
  "accuracy": 1.0,
812
- "count": 100
813
  },
814
  "UB": {
815
- "accuracy": 0.87248322147651,
816
- "count": 298
817
  },
818
  "UD": {
819
- "accuracy": 0.9158415841584159,
820
- "count": 202
821
  }
822
  }
823
  }
824
  },
825
  "summary": {
826
- "overall_accuracy": 0.7241666666666666,
827
- "total_examples": 2400,
828
- "n_splits": 22
 
829
  }
830
  }
831
  }
 
330
  "K": null,
331
  "mode": "sft",
332
  "n_digits": 6,
333
+ "n_per_split": 50
334
  },
335
  "splits": {
336
  "add_S0": {
337
  "full_accuracy": 0.94,
338
+ "digit_accuracy": 0.9914285714285714,
339
+ "n_examples": 50,
340
  "per_subtask": {
341
  "SA": {
342
+ "accuracy": 0.9898305084745763,
343
+ "count": 295
344
  },
345
  "SS": {
346
  "accuracy": 1.0,
347
+ "count": 55
348
  }
349
  }
350
  },
351
  "add_S1": {
352
+ "full_accuracy": 0.98,
353
+ "digit_accuracy": 0.9971428571428571,
354
+ "n_examples": 50,
355
  "per_subtask": {
356
  "SA": {
357
+ "accuracy": 1.0,
358
+ "count": 126
359
  },
360
  "SC": {
361
  "accuracy": 1.0,
362
+ "count": 79
363
  },
364
  "SS": {
365
  "accuracy": 1.0,
366
+ "count": 21
367
  },
368
  "UC": {
369
+ "accuracy": 0.9919354838709677,
370
+ "count": 124
371
  }
372
  }
373
  },
374
  "add_S2": {
375
+ "full_accuracy": 0.9,
376
+ "digit_accuracy": 0.9828571428571429,
377
+ "n_examples": 50,
378
  "per_subtask": {
379
  "SA": {
380
+ "accuracy": 0.9866666666666667,
381
+ "count": 75
382
  },
383
  "SC": {
384
  "accuracy": 1.0,
385
+ "count": 62
386
  },
387
  "SS": {
388
+ "accuracy": 1.0,
389
+ "count": 39
390
  },
391
  "UC": {
392
+ "accuracy": 0.954954954954955,
393
+ "count": 111
394
  },
395
  "US": {
396
  "accuracy": 1.0,
397
+ "count": 63
398
  }
399
  }
400
  },
401
  "add_S3": {
402
+ "full_accuracy": 0.7,
403
+ "digit_accuracy": 0.9571428571428572,
404
+ "n_examples": 50,
405
  "per_subtask": {
406
  "SA": {
407
  "accuracy": 1.0,
408
+ "count": 60
409
  },
410
  "SC": {
411
+ "accuracy": 1.0,
412
+ "count": 57
413
  },
414
  "SS": {
415
  "accuracy": 1.0,
416
+ "count": 19
417
  },
418
  "UC": {
419
+ "accuracy": 0.8557692307692307,
420
+ "count": 104
421
  },
422
  "US": {
423
  "accuracy": 1.0,
424
+ "count": 110
425
  }
426
  }
427
  },
428
  "add_S4": {
429
+ "full_accuracy": 0.64,
430
+ "digit_accuracy": 0.9342857142857143,
431
+ "n_examples": 50,
432
  "per_subtask": {
433
  "SA": {
434
  "accuracy": 1.0,
435
+ "count": 48
436
  },
437
  "SC": {
438
  "accuracy": 1.0,
439
+ "count": 52
440
  },
441
  "SS": {
442
  "accuracy": 1.0,
443
+ "count": 7
444
  },
445
  "UC": {
446
+ "accuracy": 0.8202247191011236,
447
+ "count": 89
448
  },
449
  "US": {
450
+ "accuracy": 0.9545454545454546,
451
+ "count": 154
452
  }
453
  }
454
  },
455
  "add_S5": {
456
+ "full_accuracy": 0.3,
457
+ "digit_accuracy": 0.7342857142857143,
458
+ "n_examples": 50,
459
  "per_subtask": {
460
  "SA": {
461
  "accuracy": 1.0,
462
+ "count": 50
463
  },
464
  "SC": {
465
  "accuracy": 1.0,
466
+ "count": 50
467
  },
468
  "UC": {
469
+ "accuracy": 0.38,
470
+ "count": 50
471
  },
472
  "US": {
473
+ "accuracy": 0.69,
474
+ "count": 200
475
  }
476
  }
477
  },
478
  "add_S6": {
479
+ "full_accuracy": 0.36,
480
+ "digit_accuracy": 0.6685714285714286,
481
+ "n_examples": 50,
482
  "per_subtask": {
483
  "SC": {
484
  "accuracy": 1.0,
485
+ "count": 50
486
  },
487
  "UC": {
488
+ "accuracy": 0.46,
489
+ "count": 50
490
  },
491
  "US": {
492
+ "accuracy": 0.644,
493
+ "count": 250
494
  }
495
  }
496
  },
497
  "add_random": {
498
+ "full_accuracy": 0.96,
499
+ "digit_accuracy": 0.9935714285714285,
500
  "n_examples": 200,
501
  "per_subtask": {
502
  "SA": {
503
+ "accuracy": 0.9953596287703016,
504
+ "count": 431
505
  },
506
  "SC": {
507
  "accuracy": 1.0,
508
+ "count": 316
509
  },
510
  "SS": {
511
  "accuracy": 1.0,
512
+ "count": 39
513
  },
514
  "UC": {
515
+ "accuracy": 0.9875,
516
+ "count": 560
517
  },
518
  "US": {
519
  "accuracy": 1.0,
520
+ "count": 54
521
  }
522
  }
523
  },
524
+ "add_C1": {
525
+ "full_accuracy": 1.0,
526
+ "digit_accuracy": 1.0,
527
+ "n_examples": 50,
528
  "per_subtask": {
529
  "SA": {
530
  "accuracy": 1.0,
531
+ "count": 250
532
  },
533
  "SC": {
534
  "accuracy": 1.0,
535
+ "count": 50
536
+ },
537
+ "UC": {
538
+ "accuracy": 1.0,
539
+ "count": 50
540
+ }
541
+ }
542
+ },
543
+ "add_C2": {
544
+ "full_accuracy": 0.98,
545
+ "digit_accuracy": 0.9971428571428571,
546
+ "n_examples": 50,
547
+ "per_subtask": {
548
+ "SA": {
549
+ "accuracy": 1.0,
550
+ "count": 200
551
+ },
552
+ "SC": {
553
+ "accuracy": 1.0,
554
+ "count": 50
555
+ },
556
+ "UC": {
557
+ "accuracy": 0.9879518072289156,
558
+ "count": 83
559
+ },
560
+ "US": {
561
+ "accuracy": 1.0,
562
+ "count": 17
563
+ }
564
+ }
565
+ },
566
+ "add_C3": {
567
+ "full_accuracy": 0.78,
568
+ "digit_accuracy": 0.9685714285714285,
569
+ "n_examples": 50,
570
+ "per_subtask": {
571
+ "SA": {
572
+ "accuracy": 0.9933333333333333,
573
+ "count": 150
574
+ },
575
+ "SC": {
576
+ "accuracy": 1.0,
577
+ "count": 50
578
  },
579
  "UC": {
580
+ "accuracy": 0.9,
581
+ "count": 100
582
  },
583
  "US": {
584
  "accuracy": 1.0,
585
+ "count": 50
586
  }
587
  }
588
  },
589
  "add_C4": {
590
  "full_accuracy": 0.88,
591
+ "digit_accuracy": 0.9828571428571429,
592
+ "n_examples": 50,
593
  "per_subtask": {
594
  "SA": {
595
+ "accuracy": 1.0,
596
+ "count": 100
597
  },
598
  "SC": {
599
  "accuracy": 1.0,
600
+ "count": 50
601
  },
602
  "UC": {
603
+ "accuracy": 0.9545454545454546,
604
+ "count": 132
605
  },
606
  "US": {
607
  "accuracy": 1.0,
608
+ "count": 68
609
  }
610
  }
611
  },
612
  "add_C5": {
613
+ "full_accuracy": 0.76,
614
+ "digit_accuracy": 0.9571428571428572,
615
+ "n_examples": 50,
616
  "per_subtask": {
617
  "SA": {
618
  "accuracy": 1.0,
619
+ "count": 50
620
  },
621
  "SC": {
622
  "accuracy": 1.0,
623
+ "count": 50
624
  },
625
  "UC": {
626
+ "accuracy": 0.9178082191780822,
627
+ "count": 146
628
  },
629
  "US": {
630
+ "accuracy": 0.9711538461538461,
631
+ "count": 104
632
  }
633
  }
634
  },
635
  "add_C6": {
636
+ "full_accuracy": 0.92,
637
+ "digit_accuracy": 0.9885714285714285,
638
+ "n_examples": 50,
639
  "per_subtask": {
640
  "SC": {
641
  "accuracy": 1.0,
642
+ "count": 50
643
  },
644
  "UC": {
645
+ "accuracy": 0.9788359788359788,
646
+ "count": 189
647
  },
648
  "US": {
649
+ "accuracy": 1.0,
650
+ "count": 111
651
  }
652
  }
653
  },
654
  "sub_M0": {
655
+ "full_accuracy": 1.0,
656
+ "digit_accuracy": 1.0,
657
+ "n_examples": 50,
658
  "per_subtask": {
659
  "MD": {
660
+ "accuracy": 1.0,
661
+ "count": 303
662
  },
663
  "ME": {
664
  "accuracy": 1.0,
665
+ "count": 47
666
  }
667
  }
668
  },
669
  "sub_M1": {
670
+ "full_accuracy": 0.92,
671
+ "digit_accuracy": 0.9885714285714285,
672
+ "n_examples": 50,
673
  "per_subtask": {
674
  "MD": {
675
+ "accuracy": 0.9858156028368794,
676
+ "count": 141
677
  },
678
  "MB": {
679
+ "accuracy": 0.9722222222222222,
680
+ "count": 72
681
  },
682
  "ME": {
683
  "accuracy": 1.0,
684
+ "count": 18
685
  },
686
  "UB": {
687
  "accuracy": 1.0,
688
+ "count": 119
689
  }
690
  }
691
  },
692
  "sub_M2": {
693
+ "full_accuracy": 0.76,
694
+ "digit_accuracy": 0.9657142857142857,
695
+ "n_examples": 50,
696
  "per_subtask": {
697
  "MD": {
698
  "accuracy": 1.0,
699
+ "count": 112
700
  },
701
  "MB": {
702
+ "accuracy": 0.9056603773584906,
703
+ "count": 53
704
  },
705
  "ME": {
706
  "accuracy": 1.0,
707
+ "count": 47
708
  },
709
  "UB": {
710
+ "accuracy": 0.9176470588235294,
711
+ "count": 85
712
  },
713
  "UD": {
714
  "accuracy": 1.0,
715
+ "count": 53
716
  }
717
  }
718
  },
719
  "sub_M3": {
720
+ "full_accuracy": 0.42,
721
+ "digit_accuracy": 0.9171428571428571,
722
+ "n_examples": 50,
723
  "per_subtask": {
724
  "MD": {
725
  "accuracy": 1.0,
726
+ "count": 97
727
  },
728
  "MB": {
729
+ "accuracy": 1.0,
730
+ "count": 51
731
  },
732
  "ME": {
733
  "accuracy": 1.0,
734
+ "count": 27
735
  },
736
  "UB": {
737
+ "accuracy": 0.6081081081081081,
738
+ "count": 74
739
  },
740
  "UD": {
741
  "accuracy": 1.0,
742
+ "count": 101
743
  }
744
  }
745
  },
746
  "sub_M4": {
747
+ "full_accuracy": 0.1,
748
+ "digit_accuracy": 0.8228571428571428,
749
+ "n_examples": 50,
750
  "per_subtask": {
751
  "MD": {
752
  "accuracy": 1.0,
753
+ "count": 100
754
  },
755
  "MB": {
756
  "accuracy": 1.0,
757
+ "count": 50
758
  },
759
  "UB": {
760
+ "accuracy": 0.44,
761
+ "count": 50
762
  },
763
  "UD": {
764
+ "accuracy": 0.7733333333333333,
765
+ "count": 150
766
  }
767
  }
768
  },
769
  "sub_M5": {
770
+ "full_accuracy": 0.06,
771
+ "digit_accuracy": 0.6885714285714286,
772
+ "n_examples": 50,
773
  "per_subtask": {
774
  "MD": {
775
  "accuracy": 1.0,
776
+ "count": 50
777
  },
778
  "MB": {
779
  "accuracy": 1.0,
780
+ "count": 50
781
  },
782
  "UB": {
783
  "accuracy": 0.38,
784
+ "count": 50
785
  },
786
  "UD": {
787
+ "accuracy": 0.61,
788
+ "count": 200
789
  }
790
  }
791
  },
792
  "sub_random": {
793
  "full_accuracy": 0.96,
794
+ "digit_accuracy": 0.9942857142857143,
795
  "n_examples": 200,
796
  "per_subtask": {
797
  "MD": {
798
+ "accuracy": 0.9964912280701754,
799
+ "count": 570
800
  },
801
  "MB": {
802
+ "accuracy": 0.9819494584837545,
803
+ "count": 277
804
  },
805
  "ME": {
806
  "accuracy": 1.0,
807
  "count": 53
808
  },
809
  "UB": {
810
+ "accuracy": 0.9978768577494692,
811
+ "count": 471
812
  },
813
  "UD": {
814
  "accuracy": 1.0,
815
+ "count": 29
816
  }
817
  }
818
  },
819
  "sub_B3": {
820
+ "full_accuracy": 0.82,
821
+ "digit_accuracy": 0.9742857142857143,
822
+ "n_examples": 50,
823
  "per_subtask": {
824
  "MD": {
825
  "accuracy": 1.0,
826
+ "count": 150
827
  },
828
  "MB": {
829
  "accuracy": 1.0,
830
+ "count": 50
831
  },
832
  "UB": {
833
+ "accuracy": 0.9108910891089109,
834
+ "count": 101
835
  },
836
  "UD": {
837
  "accuracy": 1.0,
838
+ "count": 49
839
  }
840
  }
841
  },
842
  "sub_B4": {
843
+ "full_accuracy": 0.62,
844
+ "digit_accuracy": 0.9371428571428572,
845
+ "n_examples": 50,
846
  "per_subtask": {
847
  "MD": {
848
  "accuracy": 1.0,
849
+ "count": 100
850
  },
851
  "MB": {
852
  "accuracy": 1.0,
853
+ "count": 50
854
  },
855
  "UB": {
856
+ "accuracy": 0.859504132231405,
857
+ "count": 121
858
  },
859
  "UD": {
860
+ "accuracy": 0.9367088607594937,
861
+ "count": 79
862
  }
863
  }
864
  },
865
  "sub_B5": {
866
+ "full_accuracy": 0.46,
867
+ "digit_accuracy": 0.9142857142857143,
868
+ "n_examples": 50,
869
  "per_subtask": {
870
  "MD": {
871
  "accuracy": 1.0,
872
+ "count": 50
873
  },
874
  "MB": {
875
  "accuracy": 1.0,
876
+ "count": 50
877
  },
878
  "UB": {
879
+ "accuracy": 0.8289473684210527,
880
+ "count": 152
881
  },
882
  "UD": {
883
+ "accuracy": 0.9591836734693877,
884
+ "count": 98
885
  }
886
  }
887
  }
888
  },
889
  "summary": {
890
+ "overall_accuracy": 0.766,
891
+ "digit_accuracy": 0.9439047619047619,
892
+ "total_examples": 1500,
893
+ "n_splits": 24
894
  }
895
  }
896
  }