amirali1985 commited on
Commit
e905fdb
·
verified ·
1 Parent(s): ee8506a

Upload add_sub_baseline_10K_1L2H256d/metrics.json with huggingface_hub

Browse files
add_sub_baseline_10K_1L2H256d/metrics.json CHANGED
@@ -330,502 +330,567 @@
330
  "K": null,
331
  "mode": "sft",
332
  "n_digits": 6,
333
- "n_per_split": 100
334
  },
335
  "splits": {
336
  "add_S0": {
337
  "full_accuracy": 0.0,
338
- "n_examples": 100,
 
339
  "per_subtask": {
340
  "SA": {
341
- "accuracy": 0.2066115702479339,
342
- "count": 605
343
  },
344
  "SS": {
345
  "accuracy": 1.0,
346
- "count": 95
347
  }
348
  }
349
  },
350
  "add_S1": {
351
  "full_accuracy": 0.0,
352
- "n_examples": 100,
 
353
  "per_subtask": {
354
  "SA": {
355
- "accuracy": 0.2647058823529412,
356
- "count": 204
357
  },
358
  "SC": {
359
- "accuracy": 0.1242603550295858,
360
- "count": 169
361
  },
362
  "SS": {
363
- "accuracy": 0.8709677419354839,
364
- "count": 31
365
  },
366
  "UC": {
367
- "accuracy": 0.2533783783783784,
368
- "count": 296
369
  }
370
  }
371
  },
372
  "add_S2": {
373
  "full_accuracy": 0.0,
374
- "n_examples": 100,
 
375
  "per_subtask": {
376
  "SA": {
377
- "accuracy": 0.3987730061349693,
378
- "count": 163
379
  },
380
  "SC": {
381
- "accuracy": 0.09230769230769231,
382
- "count": 130
383
  },
384
  "SS": {
385
- "accuracy": 0.5517241379310345,
386
- "count": 87
387
  },
388
  "UC": {
389
- "accuracy": 0.3842364532019704,
390
- "count": 203
391
  },
392
  "US": {
393
- "accuracy": 0.4700854700854701,
394
- "count": 117
395
  }
396
  }
397
  },
398
  "add_S3": {
399
  "full_accuracy": 0.0,
400
- "n_examples": 100,
 
401
  "per_subtask": {
402
  "SA": {
403
- "accuracy": 0.49586776859504134,
404
- "count": 121
405
  },
406
  "SC": {
407
- "accuracy": 0.024793388429752067,
408
- "count": 121
409
  },
410
  "SS": {
411
- "accuracy": 0.8571428571428571,
412
- "count": 49
413
  },
414
  "UC": {
415
- "accuracy": 0.3118279569892473,
416
- "count": 186
417
  },
418
  "US": {
419
- "accuracy": 0.3632286995515695,
420
- "count": 223
421
  }
422
  }
423
  },
424
  "add_S4": {
425
  "full_accuracy": 0.0,
426
- "n_examples": 100,
 
427
  "per_subtask": {
428
  "SA": {
429
- "accuracy": 0.4807692307692308,
430
- "count": 104
431
  },
432
  "SC": {
433
- "accuracy": 0.0660377358490566,
434
- "count": 106
435
  },
436
  "SS": {
437
- "accuracy": 0.6956521739130435,
438
- "count": 23
439
  },
440
  "UC": {
441
- "accuracy": 0.38125,
442
- "count": 160
443
  },
444
  "US": {
445
- "accuracy": 0.30293159609120524,
446
- "count": 307
447
  }
448
  }
449
  },
450
  "add_S5": {
451
  "full_accuracy": 0.0,
452
- "n_examples": 100,
 
453
  "per_subtask": {
454
  "SA": {
455
- "accuracy": 0.46,
456
- "count": 100
457
  },
458
  "SC": {
459
- "accuracy": 0.02,
460
- "count": 100
461
  },
462
  "UC": {
463
- "accuracy": 0.15,
464
- "count": 100
465
  },
466
  "US": {
467
- "accuracy": 0.0525,
468
- "count": 400
469
  }
470
  }
471
  },
472
  "add_S6": {
473
- "full_accuracy": 0.04,
474
- "n_examples": 100,
 
475
  "per_subtask": {
476
  "SC": {
477
- "accuracy": 0.05,
478
- "count": 100
479
  },
480
  "UC": {
481
- "accuracy": 0.64,
482
- "count": 100
483
  },
484
  "US": {
485
- "accuracy": 0.562,
486
- "count": 500
487
  }
488
  }
489
  },
490
  "add_random": {
491
  "full_accuracy": 0.0,
 
492
  "n_examples": 200,
493
  "per_subtask": {
494
  "SA": {
495
- "accuracy": 0.2684563758389262,
496
- "count": 447
497
  },
498
  "SC": {
499
- "accuracy": 0.078125,
500
- "count": 320
501
  },
502
  "SS": {
503
- "accuracy": 0.8571428571428571,
504
- "count": 56
505
  },
506
  "UC": {
507
- "accuracy": 0.2646502835538752,
508
- "count": 529
509
  },
510
  "US": {
511
- "accuracy": 0.3958333333333333,
512
- "count": 48
513
  }
514
  }
515
  },
516
- "add_C3": {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
517
  "full_accuracy": 0.0,
518
- "n_examples": 100,
 
519
  "per_subtask": {
520
  "SA": {
521
- "accuracy": 0.24333333333333335,
522
- "count": 300
523
  },
524
  "SC": {
525
  "accuracy": 0.02,
526
- "count": 100
527
  },
528
  "UC": {
529
- "accuracy": 0.11917098445595854,
530
- "count": 193
531
  },
532
  "US": {
533
- "accuracy": 0.07476635514018691,
534
- "count": 107
535
  }
536
  }
537
  },
538
- "add_C4": {
539
  "full_accuracy": 0.0,
540
- "n_examples": 100,
 
541
  "per_subtask": {
542
  "SA": {
543
- "accuracy": 0.36,
544
- "count": 200
545
  },
546
  "SC": {
547
- "accuracy": 0.03,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
548
  "count": 100
549
  },
 
 
 
 
550
  "UC": {
551
- "accuracy": 0.10546875,
552
- "count": 256
553
  },
554
  "US": {
555
- "accuracy": 0.1736111111111111,
556
- "count": 144
557
  }
558
  }
559
  },
560
  "add_C5": {
561
  "full_accuracy": 0.0,
562
- "n_examples": 100,
 
563
  "per_subtask": {
564
  "SA": {
565
- "accuracy": 0.53,
566
- "count": 100
567
  },
568
  "SC": {
569
- "accuracy": 0.06,
570
- "count": 100
571
  },
572
  "UC": {
573
- "accuracy": 0.1437908496732026,
574
- "count": 306
575
  },
576
  "US": {
577
- "accuracy": 0.2268041237113402,
578
- "count": 194
579
  }
580
  }
581
  },
582
  "add_C6": {
583
  "full_accuracy": 0.0,
584
- "n_examples": 100,
 
585
  "per_subtask": {
586
  "SC": {
587
- "accuracy": 0.07,
588
- "count": 100
589
  },
590
  "UC": {
591
- "accuracy": 0.2923497267759563,
592
- "count": 366
593
  },
594
  "US": {
595
- "accuracy": 0.6068376068376068,
596
- "count": 234
597
  }
598
  }
599
  },
600
  "sub_M0": {
601
  "full_accuracy": 0.0,
602
- "n_examples": 100,
 
603
  "per_subtask": {
604
  "MD": {
605
- "accuracy": 0.20465890183028287,
606
- "count": 601
607
  },
608
  "ME": {
609
  "accuracy": 1.0,
610
- "count": 99
611
  }
612
  }
613
  },
614
  "sub_M1": {
615
  "full_accuracy": 0.0,
616
- "n_examples": 100,
 
617
  "per_subtask": {
618
  "MD": {
619
- "accuracy": 0.3835125448028674,
620
- "count": 279
621
  },
622
  "MB": {
623
  "accuracy": 0.0,
624
- "count": 145
625
  },
626
  "ME": {
627
  "accuracy": 1.0,
628
- "count": 24
629
  },
630
  "UB": {
631
- "accuracy": 0.09523809523809523,
632
- "count": 252
633
  }
634
  }
635
  },
636
  "sub_M2": {
637
  "full_accuracy": 0.0,
638
- "n_examples": 100,
 
639
  "per_subtask": {
640
  "MD": {
641
- "accuracy": 0.6150234741784038,
642
- "count": 213
643
  },
644
  "MB": {
645
  "accuracy": 0.0,
646
- "count": 113
647
  },
648
  "ME": {
649
  "accuracy": 1.0,
650
- "count": 85
651
  },
652
  "UB": {
653
- "accuracy": 0.16574585635359115,
654
- "count": 181
655
  },
656
  "UD": {
657
  "accuracy": 0.0,
658
- "count": 108
659
  }
660
  }
661
  },
662
  "sub_M3": {
663
  "full_accuracy": 0.0,
664
- "n_examples": 100,
 
665
  "per_subtask": {
666
  "MD": {
667
- "accuracy": 0.7597765363128491,
668
- "count": 179
669
  },
670
  "MB": {
671
  "accuracy": 0.0,
672
- "count": 103
673
  },
674
  "ME": {
675
  "accuracy": 1.0,
676
- "count": 56
677
  },
678
  "UB": {
679
- "accuracy": 0.12080536912751678,
680
- "count": 149
681
  },
682
  "UD": {
683
  "accuracy": 0.0,
684
- "count": 213
685
  }
686
  }
687
  },
688
  "sub_M4": {
689
  "full_accuracy": 0.0,
690
- "n_examples": 100,
 
691
  "per_subtask": {
692
  "MD": {
693
  "accuracy": 0.5,
694
- "count": 200
695
  },
696
  "MB": {
697
  "accuracy": 0.0,
698
- "count": 100
699
  },
700
  "UB": {
701
- "accuracy": 0.3,
702
- "count": 100
703
  },
704
  "UD": {
705
  "accuracy": 0.0,
706
- "count": 300
707
  }
708
  }
709
  },
710
  "sub_M5": {
711
  "full_accuracy": 0.0,
712
- "n_examples": 100,
 
713
  "per_subtask": {
714
  "MD": {
715
  "accuracy": 1.0,
716
- "count": 100
717
  },
718
  "MB": {
719
  "accuracy": 0.0,
720
- "count": 100
721
  },
722
  "UB": {
723
- "accuracy": 0.31,
724
- "count": 100
725
  },
726
  "UD": {
727
  "accuracy": 0.0,
728
- "count": 400
729
  }
730
  }
731
  },
732
  "sub_random": {
733
  "full_accuracy": 0.0,
 
734
  "n_examples": 200,
735
  "per_subtask": {
736
  "MD": {
737
- "accuracy": 0.3616666666666667,
738
- "count": 600
739
  },
740
  "MB": {
741
  "accuracy": 0.0,
742
- "count": 267
743
  },
744
  "ME": {
745
  "accuracy": 1.0,
746
  "count": 53
747
  },
748
  "UB": {
749
- "accuracy": 0.12072892938496584,
750
- "count": 439
751
  },
752
  "UD": {
753
  "accuracy": 0.0,
754
- "count": 41
755
  }
756
  }
757
  },
758
  "sub_B3": {
759
  "full_accuracy": 0.0,
760
- "n_examples": 100,
 
761
  "per_subtask": {
762
  "MD": {
763
  "accuracy": 0.3333333333333333,
764
- "count": 300
765
  },
766
  "MB": {
767
  "accuracy": 0.0,
768
- "count": 100
769
  },
770
  "UB": {
771
- "accuracy": 0.17766497461928935,
772
- "count": 197
773
  },
774
  "UD": {
775
  "accuracy": 0.0,
776
- "count": 103
777
  }
778
  }
779
  },
780
  "sub_B4": {
781
  "full_accuracy": 0.0,
782
- "n_examples": 100,
 
783
  "per_subtask": {
784
  "MD": {
785
  "accuracy": 0.5,
786
- "count": 200
787
  },
788
  "MB": {
789
  "accuracy": 0.0,
790
- "count": 100
791
  },
792
  "UB": {
793
- "accuracy": 0.145748987854251,
794
- "count": 247
795
  },
796
  "UD": {
797
  "accuracy": 0.0,
798
- "count": 153
799
  }
800
  }
801
  },
802
  "sub_B5": {
803
  "full_accuracy": 0.0,
804
- "n_examples": 100,
 
805
  "per_subtask": {
806
  "MD": {
807
  "accuracy": 1.0,
808
- "count": 100
809
  },
810
  "MB": {
811
  "accuracy": 0.0,
812
- "count": 100
813
  },
814
  "UB": {
815
- "accuracy": 0.11073825503355705,
816
- "count": 298
817
  },
818
  "UD": {
819
  "accuracy": 0.0,
820
- "count": 202
821
  }
822
  }
823
  }
824
  },
825
  "summary": {
826
- "overall_accuracy": 0.0016666666666666668,
827
- "total_examples": 2400,
828
- "n_splits": 22
 
829
  }
830
  }
831
  }
 
330
  "K": null,
331
  "mode": "sft",
332
  "n_digits": 6,
333
+ "n_per_split": 50
334
  },
335
  "splits": {
336
  "add_S0": {
337
  "full_accuracy": 0.0,
338
+ "digit_accuracy": 0.3457142857142857,
339
+ "n_examples": 50,
340
  "per_subtask": {
341
  "SA": {
342
+ "accuracy": 0.22372881355932203,
343
+ "count": 295
344
  },
345
  "SS": {
346
  "accuracy": 1.0,
347
+ "count": 55
348
  }
349
  }
350
  },
351
  "add_S1": {
352
  "full_accuracy": 0.0,
353
+ "digit_accuracy": 0.24571428571428572,
354
+ "n_examples": 50,
355
  "per_subtask": {
356
  "SA": {
357
+ "accuracy": 0.29365079365079366,
358
+ "count": 126
359
  },
360
  "SC": {
361
+ "accuracy": 0.08860759493670886,
362
+ "count": 79
363
  },
364
  "SS": {
365
+ "accuracy": 0.9047619047619048,
366
+ "count": 21
367
  },
368
  "UC": {
369
+ "accuracy": 0.18548387096774194,
370
+ "count": 124
371
  }
372
  }
373
  },
374
  "add_S2": {
375
  "full_accuracy": 0.0,
376
+ "digit_accuracy": 0.37142857142857144,
377
+ "n_examples": 50,
378
  "per_subtask": {
379
  "SA": {
380
+ "accuracy": 0.4,
381
+ "count": 75
382
  },
383
  "SC": {
384
+ "accuracy": 0.12903225806451613,
385
+ "count": 62
386
  },
387
  "SS": {
388
+ "accuracy": 0.5128205128205128,
389
+ "count": 39
390
  },
391
  "UC": {
392
+ "accuracy": 0.36936936936936937,
393
+ "count": 111
394
  },
395
  "US": {
396
+ "accuracy": 0.49206349206349204,
397
+ "count": 63
398
  }
399
  }
400
  },
401
  "add_S3": {
402
  "full_accuracy": 0.0,
403
+ "digit_accuracy": 0.37142857142857144,
404
+ "n_examples": 50,
405
  "per_subtask": {
406
  "SA": {
407
+ "accuracy": 0.5,
408
+ "count": 60
409
  },
410
  "SC": {
411
+ "accuracy": 0.10526315789473684,
412
+ "count": 57
413
  },
414
  "SS": {
415
+ "accuracy": 0.6842105263157895,
416
+ "count": 19
417
  },
418
  "UC": {
419
+ "accuracy": 0.3557692307692308,
420
+ "count": 104
421
  },
422
  "US": {
423
+ "accuracy": 0.4,
424
+ "count": 110
425
  }
426
  }
427
  },
428
  "add_S4": {
429
  "full_accuracy": 0.0,
430
+ "digit_accuracy": 0.32,
431
+ "n_examples": 50,
432
  "per_subtask": {
433
  "SA": {
434
+ "accuracy": 0.4791666666666667,
435
+ "count": 48
436
  },
437
  "SC": {
438
+ "accuracy": 0.057692307692307696,
439
+ "count": 52
440
  },
441
  "SS": {
442
+ "accuracy": 0.8571428571428571,
443
+ "count": 7
444
  },
445
  "UC": {
446
+ "accuracy": 0.3146067415730337,
447
+ "count": 89
448
  },
449
  "US": {
450
+ "accuracy": 0.33766233766233766,
451
+ "count": 154
452
  }
453
  }
454
  },
455
  "add_S5": {
456
  "full_accuracy": 0.0,
457
+ "digit_accuracy": 0.07714285714285714,
458
+ "n_examples": 50,
459
  "per_subtask": {
460
  "SA": {
461
+ "accuracy": 0.5,
462
+ "count": 50
463
  },
464
  "SC": {
465
+ "accuracy": 0.0,
466
+ "count": 50
467
  },
468
  "UC": {
469
+ "accuracy": 0.04,
470
+ "count": 50
471
  },
472
  "US": {
473
+ "accuracy": 0.0,
474
+ "count": 200
475
  }
476
  }
477
  },
478
  "add_S6": {
479
+ "full_accuracy": 0.12,
480
+ "digit_accuracy": 0.3485714285714286,
481
+ "n_examples": 50,
482
  "per_subtask": {
483
  "SC": {
484
+ "accuracy": 0.12,
485
+ "count": 50
486
  },
487
  "UC": {
488
+ "accuracy": 0.52,
489
+ "count": 50
490
  },
491
  "US": {
492
+ "accuracy": 0.36,
493
+ "count": 250
494
  }
495
  }
496
  },
497
  "add_random": {
498
  "full_accuracy": 0.0,
499
+ "digit_accuracy": 0.24357142857142858,
500
  "n_examples": 200,
501
  "per_subtask": {
502
  "SA": {
503
+ "accuracy": 0.2505800464037123,
504
+ "count": 431
505
  },
506
  "SC": {
507
+ "accuracy": 0.10126582278481013,
508
+ "count": 316
509
  },
510
  "SS": {
511
+ "accuracy": 0.8974358974358975,
512
+ "count": 39
513
  },
514
  "UC": {
515
+ "accuracy": 0.2571428571428571,
516
+ "count": 560
517
  },
518
  "US": {
519
+ "accuracy": 0.4074074074074074,
520
+ "count": 54
521
  }
522
  }
523
  },
524
+ "add_C1": {
525
+ "full_accuracy": 0.0,
526
+ "digit_accuracy": 0.15714285714285714,
527
+ "n_examples": 50,
528
+ "per_subtask": {
529
+ "SA": {
530
+ "accuracy": 0.184,
531
+ "count": 250
532
+ },
533
+ "SC": {
534
+ "accuracy": 0.06,
535
+ "count": 50
536
+ },
537
+ "UC": {
538
+ "accuracy": 0.12,
539
+ "count": 50
540
+ }
541
+ }
542
+ },
543
+ "add_C2": {
544
  "full_accuracy": 0.0,
545
+ "digit_accuracy": 0.16285714285714287,
546
+ "n_examples": 50,
547
  "per_subtask": {
548
  "SA": {
549
+ "accuracy": 0.21,
550
+ "count": 200
551
  },
552
  "SC": {
553
  "accuracy": 0.02,
554
+ "count": 50
555
  },
556
  "UC": {
557
+ "accuracy": 0.13253012048192772,
558
+ "count": 83
559
  },
560
  "US": {
561
+ "accuracy": 0.17647058823529413,
562
+ "count": 17
563
  }
564
  }
565
  },
566
+ "add_C3": {
567
  "full_accuracy": 0.0,
568
+ "digit_accuracy": 0.16857142857142857,
569
+ "n_examples": 50,
570
  "per_subtask": {
571
  "SA": {
572
+ "accuracy": 0.26,
573
+ "count": 150
574
  },
575
  "SC": {
576
+ "accuracy": 0.04,
577
+ "count": 50
578
+ },
579
+ "UC": {
580
+ "accuracy": 0.13,
581
+ "count": 100
582
+ },
583
+ "US": {
584
+ "accuracy": 0.1,
585
+ "count": 50
586
+ }
587
+ }
588
+ },
589
+ "add_C4": {
590
+ "full_accuracy": 0.0,
591
+ "digit_accuracy": 0.18857142857142858,
592
+ "n_examples": 50,
593
+ "per_subtask": {
594
+ "SA": {
595
+ "accuracy": 0.39,
596
  "count": 100
597
  },
598
+ "SC": {
599
+ "accuracy": 0.0,
600
+ "count": 50
601
+ },
602
  "UC": {
603
+ "accuracy": 0.09090909090909091,
604
+ "count": 132
605
  },
606
  "US": {
607
+ "accuracy": 0.22058823529411764,
608
+ "count": 68
609
  }
610
  }
611
  },
612
  "add_C5": {
613
  "full_accuracy": 0.0,
614
+ "digit_accuracy": 0.18857142857142858,
615
+ "n_examples": 50,
616
  "per_subtask": {
617
  "SA": {
618
+ "accuracy": 0.44,
619
+ "count": 50
620
  },
621
  "SC": {
622
+ "accuracy": 0.04,
623
+ "count": 50
624
  },
625
  "UC": {
626
+ "accuracy": 0.1506849315068493,
627
+ "count": 146
628
  },
629
  "US": {
630
+ "accuracy": 0.19230769230769232,
631
+ "count": 104
632
  }
633
  }
634
  },
635
  "add_C6": {
636
  "full_accuracy": 0.0,
637
+ "digit_accuracy": 0.33714285714285713,
638
+ "n_examples": 50,
639
  "per_subtask": {
640
  "SC": {
641
+ "accuracy": 0.06,
642
+ "count": 50
643
  },
644
  "UC": {
645
+ "accuracy": 0.2962962962962963,
646
+ "count": 189
647
  },
648
  "US": {
649
+ "accuracy": 0.5315315315315315,
650
+ "count": 111
651
  }
652
  }
653
  },
654
  "sub_M0": {
655
  "full_accuracy": 0.0,
656
+ "digit_accuracy": 0.3057142857142857,
657
+ "n_examples": 50,
658
  "per_subtask": {
659
  "MD": {
660
+ "accuracy": 0.19801980198019803,
661
+ "count": 303
662
  },
663
  "ME": {
664
  "accuracy": 1.0,
665
+ "count": 47
666
  }
667
  }
668
  },
669
  "sub_M1": {
670
  "full_accuracy": 0.0,
671
+ "digit_accuracy": 0.26285714285714284,
672
+ "n_examples": 50,
673
  "per_subtask": {
674
  "MD": {
675
+ "accuracy": 0.3900709219858156,
676
+ "count": 141
677
  },
678
  "MB": {
679
  "accuracy": 0.0,
680
+ "count": 72
681
  },
682
  "ME": {
683
  "accuracy": 1.0,
684
+ "count": 18
685
  },
686
  "UB": {
687
+ "accuracy": 0.15966386554621848,
688
+ "count": 119
689
  }
690
  }
691
  },
692
  "sub_M2": {
693
  "full_accuracy": 0.0,
694
+ "digit_accuracy": 0.38285714285714284,
695
+ "n_examples": 50,
696
  "per_subtask": {
697
  "MD": {
698
+ "accuracy": 0.6428571428571429,
699
+ "count": 112
700
  },
701
  "MB": {
702
  "accuracy": 0.0,
703
+ "count": 53
704
  },
705
  "ME": {
706
  "accuracy": 1.0,
707
+ "count": 47
708
  },
709
  "UB": {
710
+ "accuracy": 0.17647058823529413,
711
+ "count": 85
712
  },
713
  "UD": {
714
  "accuracy": 0.0,
715
+ "count": 53
716
  }
717
  }
718
  },
719
  "sub_M3": {
720
  "full_accuracy": 0.0,
721
+ "digit_accuracy": 0.28,
722
+ "n_examples": 50,
723
  "per_subtask": {
724
  "MD": {
725
+ "accuracy": 0.6494845360824743,
726
+ "count": 97
727
  },
728
  "MB": {
729
  "accuracy": 0.0,
730
+ "count": 51
731
  },
732
  "ME": {
733
  "accuracy": 1.0,
734
+ "count": 27
735
  },
736
  "UB": {
737
+ "accuracy": 0.10810810810810811,
738
+ "count": 74
739
  },
740
  "UD": {
741
  "accuracy": 0.0,
742
+ "count": 101
743
  }
744
  }
745
  },
746
  "sub_M4": {
747
  "full_accuracy": 0.0,
748
+ "digit_accuracy": 0.21142857142857144,
749
+ "n_examples": 50,
750
  "per_subtask": {
751
  "MD": {
752
  "accuracy": 0.5,
753
+ "count": 100
754
  },
755
  "MB": {
756
  "accuracy": 0.0,
757
+ "count": 50
758
  },
759
  "UB": {
760
+ "accuracy": 0.48,
761
+ "count": 50
762
  },
763
  "UD": {
764
  "accuracy": 0.0,
765
+ "count": 150
766
  }
767
  }
768
  },
769
  "sub_M5": {
770
  "full_accuracy": 0.0,
771
+ "digit_accuracy": 0.18285714285714286,
772
+ "n_examples": 50,
773
  "per_subtask": {
774
  "MD": {
775
  "accuracy": 1.0,
776
+ "count": 50
777
  },
778
  "MB": {
779
  "accuracy": 0.0,
780
+ "count": 50
781
  },
782
  "UB": {
783
+ "accuracy": 0.28,
784
+ "count": 50
785
  },
786
  "UD": {
787
  "accuracy": 0.0,
788
+ "count": 200
789
  }
790
  }
791
  },
792
  "sub_random": {
793
  "full_accuracy": 0.0,
794
+ "digit_accuracy": 0.2307142857142857,
795
  "n_examples": 200,
796
  "per_subtask": {
797
  "MD": {
798
+ "accuracy": 0.37719298245614036,
799
+ "count": 570
800
  },
801
  "MB": {
802
  "accuracy": 0.0,
803
+ "count": 277
804
  },
805
  "ME": {
806
  "accuracy": 1.0,
807
  "count": 53
808
  },
809
  "UB": {
810
+ "accuracy": 0.11677282377919321,
811
+ "count": 471
812
  },
813
  "UD": {
814
  "accuracy": 0.0,
815
+ "count": 29
816
  }
817
  }
818
  },
819
  "sub_B3": {
820
  "full_accuracy": 0.0,
821
+ "digit_accuracy": 0.18857142857142858,
822
+ "n_examples": 50,
823
  "per_subtask": {
824
  "MD": {
825
  "accuracy": 0.3333333333333333,
826
+ "count": 150
827
  },
828
  "MB": {
829
  "accuracy": 0.0,
830
+ "count": 50
831
  },
832
  "UB": {
833
+ "accuracy": 0.15841584158415842,
834
+ "count": 101
835
  },
836
  "UD": {
837
  "accuracy": 0.0,
838
+ "count": 49
839
  }
840
  }
841
  },
842
  "sub_B4": {
843
  "full_accuracy": 0.0,
844
+ "digit_accuracy": 0.17714285714285713,
845
+ "n_examples": 50,
846
  "per_subtask": {
847
  "MD": {
848
  "accuracy": 0.5,
849
+ "count": 100
850
  },
851
  "MB": {
852
  "accuracy": 0.0,
853
+ "count": 50
854
  },
855
  "UB": {
856
+ "accuracy": 0.09917355371900827,
857
+ "count": 121
858
  },
859
  "UD": {
860
  "accuracy": 0.0,
861
+ "count": 79
862
  }
863
  }
864
  },
865
  "sub_B5": {
866
  "full_accuracy": 0.0,
867
+ "digit_accuracy": 0.18,
868
+ "n_examples": 50,
869
  "per_subtask": {
870
  "MD": {
871
  "accuracy": 1.0,
872
+ "count": 50
873
  },
874
  "MB": {
875
  "accuracy": 0.0,
876
+ "count": 50
877
  },
878
  "UB": {
879
+ "accuracy": 0.08552631578947369,
880
+ "count": 152
881
  },
882
  "UD": {
883
  "accuracy": 0.0,
884
+ "count": 98
885
  }
886
  }
887
  }
888
  },
889
  "summary": {
890
+ "overall_accuracy": 0.004,
891
+ "digit_accuracy": 0.24485714285714286,
892
+ "total_examples": 1500,
893
+ "n_splits": 24
894
  }
895
  }
896
  }