amirali1985 commited on
Commit
d8f4ceb
·
verified ·
1 Parent(s): 56bc453

Upload add_sub_sorl_v1_abs30_10K/metrics.json with huggingface_hub

Browse files
Files changed (1) hide show
  1. add_sub_sorl_v1_abs30_10K/metrics.json +530 -400
add_sub_sorl_v1_abs30_10K/metrics.json CHANGED
@@ -290,502 +290,567 @@
290
  "K": null,
291
  "mode": "sft",
292
  "n_digits": 6,
293
- "n_per_split": 100
294
  },
295
  "splits": {
296
  "add_S0": {
297
  "full_accuracy": 0.0,
298
- "n_examples": 100,
 
299
  "per_subtask": {
300
  "SA": {
301
- "accuracy": 0.2528925619834711,
302
- "count": 605
303
  },
304
  "SS": {
305
- "accuracy": 0.8315789473684211,
306
- "count": 95
307
  }
308
  }
309
  },
310
  "add_S1": {
311
  "full_accuracy": 0.0,
312
- "n_examples": 100,
 
313
  "per_subtask": {
314
  "SA": {
315
- "accuracy": 0.30392156862745096,
316
- "count": 204
317
  },
318
  "SC": {
319
- "accuracy": 0.21893491124260356,
320
- "count": 169
321
  },
322
  "SS": {
323
- "accuracy": 0.7096774193548387,
324
- "count": 31
325
  },
326
  "UC": {
327
- "accuracy": 0.27702702702702703,
328
- "count": 296
329
  }
330
  }
331
  },
332
  "add_S2": {
333
  "full_accuracy": 0.0,
334
- "n_examples": 100,
 
335
  "per_subtask": {
336
  "SA": {
337
- "accuracy": 0.3312883435582822,
338
- "count": 163
339
  },
340
  "SC": {
341
- "accuracy": 0.13846153846153847,
342
- "count": 130
343
  },
344
  "SS": {
345
- "accuracy": 0.4482758620689655,
346
- "count": 87
347
  },
348
  "UC": {
349
- "accuracy": 0.3842364532019704,
350
- "count": 203
351
  },
352
  "US": {
353
- "accuracy": 0.5299145299145299,
354
- "count": 117
355
  }
356
  }
357
  },
358
  "add_S3": {
359
  "full_accuracy": 0.0,
360
- "n_examples": 100,
 
361
  "per_subtask": {
362
  "SA": {
363
- "accuracy": 0.36363636363636365,
364
- "count": 121
365
  },
366
  "SC": {
367
- "accuracy": 0.06611570247933884,
368
- "count": 121
369
  },
370
  "SS": {
371
- "accuracy": 0.42857142857142855,
372
- "count": 49
373
  },
374
  "UC": {
375
- "accuracy": 0.4032258064516129,
376
- "count": 186
377
  },
378
  "US": {
379
- "accuracy": 0.5919282511210763,
380
- "count": 223
381
  }
382
  }
383
  },
384
  "add_S4": {
385
  "full_accuracy": 0.0,
386
- "n_examples": 100,
 
387
  "per_subtask": {
388
  "SA": {
389
- "accuracy": 0.38461538461538464,
390
- "count": 104
391
  },
392
  "SC": {
393
- "accuracy": 0.11320754716981132,
394
- "count": 106
395
  },
396
  "SS": {
397
- "accuracy": 0.4782608695652174,
398
- "count": 23
399
  },
400
  "UC": {
401
- "accuracy": 0.41875,
402
- "count": 160
403
  },
404
  "US": {
405
- "accuracy": 0.49185667752442996,
406
- "count": 307
407
  }
408
  }
409
  },
410
  "add_S5": {
411
- "full_accuracy": 0.03,
412
- "n_examples": 100,
 
413
  "per_subtask": {
414
  "SA": {
415
- "accuracy": 0.47,
416
- "count": 100
417
  },
418
  "SC": {
419
- "accuracy": 0.09,
420
- "count": 100
421
  },
422
  "UC": {
423
- "accuracy": 0.43,
424
- "count": 100
425
  },
426
  "US": {
427
- "accuracy": 0.3375,
428
- "count": 400
429
  }
430
  }
431
  },
432
  "add_S6": {
433
- "full_accuracy": 0.12,
434
- "n_examples": 100,
 
435
  "per_subtask": {
436
  "SC": {
437
- "accuracy": 0.12,
438
- "count": 100
439
  },
440
  "UC": {
441
- "accuracy": 0.64,
442
- "count": 100
443
  },
444
  "US": {
445
- "accuracy": 0.636,
446
- "count": 500
447
  }
448
  }
449
  },
450
  "add_random": {
451
  "full_accuracy": 0.0,
 
452
  "n_examples": 200,
453
  "per_subtask": {
454
  "SA": {
455
- "accuracy": 0.3087248322147651,
456
- "count": 447
457
  },
458
  "SC": {
459
- "accuracy": 0.165625,
460
- "count": 320
461
  },
462
  "SS": {
463
- "accuracy": 0.5892857142857143,
464
- "count": 56
465
  },
466
  "UC": {
467
- "accuracy": 0.29300567107750475,
468
- "count": 529
469
  },
470
  "US": {
471
- "accuracy": 0.4791666666666667,
472
- "count": 48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
473
  }
474
  }
475
  },
476
  "add_C3": {
477
  "full_accuracy": 0.0,
478
- "n_examples": 100,
 
479
  "per_subtask": {
480
  "SA": {
481
- "accuracy": 0.35,
482
- "count": 300
483
  },
484
  "SC": {
485
  "accuracy": 0.1,
486
- "count": 100
487
  },
488
  "UC": {
489
- "accuracy": 0.21243523316062177,
490
- "count": 193
491
  },
492
  "US": {
493
- "accuracy": 0.35514018691588783,
494
- "count": 107
495
  }
496
  }
497
  },
498
  "add_C4": {
499
  "full_accuracy": 0.0,
500
- "n_examples": 100,
 
501
  "per_subtask": {
502
  "SA": {
503
- "accuracy": 0.46,
504
- "count": 200
505
  },
506
  "SC": {
507
- "accuracy": 0.07,
508
- "count": 100
509
  },
510
  "UC": {
511
- "accuracy": 0.15234375,
512
- "count": 256
513
  },
514
  "US": {
515
- "accuracy": 0.2916666666666667,
516
- "count": 144
517
  }
518
  }
519
  },
520
  "add_C5": {
521
- "full_accuracy": 0.02,
522
- "n_examples": 100,
 
523
  "per_subtask": {
524
  "SA": {
525
- "accuracy": 0.57,
526
- "count": 100
527
  },
528
  "SC": {
529
- "accuracy": 0.15,
530
- "count": 100
531
  },
532
  "UC": {
533
- "accuracy": 0.2647058823529412,
534
- "count": 306
535
  },
536
  "US": {
537
- "accuracy": 0.4793814432989691,
538
- "count": 194
539
  }
540
  }
541
  },
542
  "add_C6": {
543
  "full_accuracy": 0.0,
544
- "n_examples": 100,
 
545
  "per_subtask": {
546
  "SC": {
547
- "accuracy": 0.16,
548
- "count": 100
549
  },
550
  "UC": {
551
- "accuracy": 0.28688524590163933,
552
- "count": 366
553
  },
554
  "US": {
555
- "accuracy": 0.6794871794871795,
556
- "count": 234
557
  }
558
  }
559
  },
560
  "sub_M0": {
561
  "full_accuracy": 0.0,
562
- "n_examples": 100,
 
563
  "per_subtask": {
564
  "MD": {
565
- "accuracy": 0.2579034941763727,
566
- "count": 601
567
  },
568
  "ME": {
569
- "accuracy": 0.898989898989899,
570
- "count": 99
571
  }
572
  }
573
  },
574
  "sub_M1": {
575
  "full_accuracy": 0.0,
576
- "n_examples": 100,
 
577
  "per_subtask": {
578
  "MD": {
579
- "accuracy": 0.44086021505376344,
580
- "count": 279
581
  },
582
  "MB": {
583
- "accuracy": 0.013793103448275862,
584
- "count": 145
585
  },
586
  "ME": {
587
- "accuracy": 0.875,
588
- "count": 24
589
  },
590
  "UB": {
591
- "accuracy": 0.15873015873015872,
592
- "count": 252
593
  }
594
  }
595
  },
596
  "sub_M2": {
597
  "full_accuracy": 0.0,
598
- "n_examples": 100,
 
599
  "per_subtask": {
600
  "MD": {
601
- "accuracy": 0.6525821596244131,
602
- "count": 213
603
  },
604
  "MB": {
605
- "accuracy": 0.02654867256637168,
606
- "count": 113
607
  },
608
  "ME": {
609
- "accuracy": 0.8941176470588236,
610
- "count": 85
611
  },
612
  "UB": {
613
- "accuracy": 0.19889502762430938,
614
- "count": 181
615
  },
616
  "UD": {
617
- "accuracy": 0.1111111111111111,
618
- "count": 108
619
  }
620
  }
621
  },
622
  "sub_M3": {
623
  "full_accuracy": 0.0,
624
- "n_examples": 100,
 
625
  "per_subtask": {
626
  "MD": {
627
- "accuracy": 0.7821229050279329,
628
- "count": 179
629
  },
630
  "MB": {
631
- "accuracy": 0.02912621359223301,
632
- "count": 103
633
  },
634
  "ME": {
635
- "accuracy": 0.8392857142857143,
636
- "count": 56
637
  },
638
  "UB": {
639
- "accuracy": 0.2550335570469799,
640
- "count": 149
641
  },
642
  "UD": {
643
- "accuracy": 0.1267605633802817,
644
- "count": 213
645
  }
646
  }
647
  },
648
  "sub_M4": {
649
  "full_accuracy": 0.0,
650
- "n_examples": 100,
 
651
  "per_subtask": {
652
  "MD": {
653
- "accuracy": 0.59,
654
- "count": 200
655
  },
656
  "MB": {
657
- "accuracy": 0.03,
658
- "count": 100
659
  },
660
  "UB": {
661
- "accuracy": 0.35,
662
- "count": 100
663
  },
664
  "UD": {
665
- "accuracy": 0.14333333333333334,
666
- "count": 300
667
  }
668
  }
669
  },
670
  "sub_M5": {
671
  "full_accuracy": 0.02,
672
- "n_examples": 100,
 
673
  "per_subtask": {
674
  "MD": {
675
  "accuracy": 1.0,
676
- "count": 100
677
  },
678
  "MB": {
679
- "accuracy": 0.09,
680
- "count": 100
681
  },
682
  "UB": {
683
- "accuracy": 0.52,
684
- "count": 100
685
  },
686
  "UD": {
687
- "accuracy": 0.17,
688
- "count": 400
689
  }
690
  }
691
  },
692
  "sub_random": {
693
  "full_accuracy": 0.0,
 
694
  "n_examples": 200,
695
  "per_subtask": {
696
  "MD": {
697
- "accuracy": 0.4116666666666667,
698
- "count": 600
699
  },
700
  "MB": {
701
- "accuracy": 0.02247191011235955,
702
- "count": 267
703
  },
704
  "ME": {
705
  "accuracy": 0.8679245283018868,
706
  "count": 53
707
  },
708
  "UB": {
709
- "accuracy": 0.1867881548974943,
710
- "count": 439
711
  },
712
  "UD": {
713
- "accuracy": 0.1951219512195122,
714
- "count": 41
715
  }
716
  }
717
  },
718
  "sub_B3": {
719
  "full_accuracy": 0.0,
720
- "n_examples": 100,
 
721
  "per_subtask": {
722
  "MD": {
723
- "accuracy": 0.38333333333333336,
724
- "count": 300
725
  },
726
  "MB": {
727
- "accuracy": 0.01,
728
- "count": 100
729
  },
730
  "UB": {
731
- "accuracy": 0.2131979695431472,
732
- "count": 197
733
  },
734
  "UD": {
735
- "accuracy": 0.038834951456310676,
736
- "count": 103
737
  }
738
  }
739
  },
740
  "sub_B4": {
741
  "full_accuracy": 0.0,
742
- "n_examples": 100,
 
743
  "per_subtask": {
744
  "MD": {
745
- "accuracy": 0.555,
746
- "count": 200
747
  },
748
  "MB": {
749
- "accuracy": 0.02,
750
- "count": 100
751
  },
752
  "UB": {
753
- "accuracy": 0.21862348178137653,
754
- "count": 247
755
  },
756
  "UD": {
757
- "accuracy": 0.0784313725490196,
758
- "count": 153
759
  }
760
  }
761
  },
762
  "sub_B5": {
763
  "full_accuracy": 0.0,
764
- "n_examples": 100,
 
765
  "per_subtask": {
766
  "MD": {
767
  "accuracy": 1.0,
768
- "count": 100
769
  },
770
  "MB": {
771
- "accuracy": 0.03,
772
- "count": 100
773
  },
774
  "UB": {
775
- "accuracy": 0.174496644295302,
776
- "count": 298
777
  },
778
  "UD": {
779
- "accuracy": 0.04455445544554455,
780
- "count": 202
781
  }
782
  }
783
  }
784
  },
785
  "summary": {
786
- "overall_accuracy": 0.007916666666666667,
787
- "total_examples": 2400,
788
- "n_splits": 22
 
789
  }
790
  },
791
  "sorl_eval": {
@@ -794,502 +859,567 @@
794
  "K": 4,
795
  "mode": "sorl",
796
  "n_digits": 6,
797
- "n_per_split": 100
798
  },
799
  "splits": {
800
  "add_S0": {
801
  "full_accuracy": 0.0,
802
- "n_examples": 100,
 
803
  "per_subtask": {
804
  "SA": {
805
- "accuracy": 0.39669421487603307,
806
- "count": 605
807
  },
808
  "SS": {
809
- "accuracy": 0.9263157894736842,
810
- "count": 95
811
  }
812
  }
813
  },
814
  "add_S1": {
815
  "full_accuracy": 0.0,
816
- "n_examples": 100,
 
817
  "per_subtask": {
818
  "SA": {
819
- "accuracy": 0.45098039215686275,
820
- "count": 204
821
  },
822
  "SC": {
823
- "accuracy": 0.2603550295857988,
824
- "count": 169
825
  },
826
  "SS": {
827
- "accuracy": 0.8709677419354839,
828
- "count": 31
829
  },
830
  "UC": {
831
- "accuracy": 0.3141891891891892,
832
- "count": 296
833
  }
834
  }
835
  },
836
  "add_S2": {
837
  "full_accuracy": 0.0,
838
- "n_examples": 100,
 
839
  "per_subtask": {
840
  "SA": {
841
- "accuracy": 0.4785276073619632,
842
- "count": 163
843
  },
844
  "SC": {
845
- "accuracy": 0.12307692307692308,
846
- "count": 130
847
  },
848
  "SS": {
849
- "accuracy": 0.3563218390804598,
850
- "count": 87
851
  },
852
  "UC": {
853
- "accuracy": 0.4187192118226601,
854
- "count": 203
855
  },
856
  "US": {
857
- "accuracy": 0.5641025641025641,
858
- "count": 117
859
  }
860
  }
861
  },
862
  "add_S3": {
863
  "full_accuracy": 0.0,
864
- "n_examples": 100,
 
865
  "per_subtask": {
866
  "SA": {
867
- "accuracy": 0.512396694214876,
868
- "count": 121
869
  },
870
  "SC": {
871
- "accuracy": 0.09917355371900827,
872
- "count": 121
873
  },
874
  "SS": {
875
- "accuracy": 0.5102040816326531,
876
- "count": 49
877
  },
878
  "UC": {
879
- "accuracy": 0.4032258064516129,
880
- "count": 186
881
  },
882
  "US": {
883
- "accuracy": 0.5560538116591929,
884
- "count": 223
885
  }
886
  }
887
  },
888
  "add_S4": {
889
  "full_accuracy": 0.0,
890
- "n_examples": 100,
 
891
  "per_subtask": {
892
  "SA": {
893
- "accuracy": 0.5865384615384616,
894
- "count": 104
895
  },
896
  "SC": {
897
- "accuracy": 0.11320754716981132,
898
- "count": 106
899
  },
900
  "SS": {
901
- "accuracy": 0.6521739130434783,
902
- "count": 23
903
  },
904
  "UC": {
905
- "accuracy": 0.43125,
906
- "count": 160
907
  },
908
  "US": {
909
- "accuracy": 0.48534201954397393,
910
- "count": 307
911
  }
912
  }
913
  },
914
  "add_S5": {
915
  "full_accuracy": 0.0,
916
- "n_examples": 100,
 
917
  "per_subtask": {
918
  "SA": {
919
- "accuracy": 0.63,
920
- "count": 100
921
  },
922
  "SC": {
923
- "accuracy": 0.06,
924
- "count": 100
925
  },
926
  "UC": {
927
- "accuracy": 0.34,
928
- "count": 100
929
  },
930
  "US": {
931
- "accuracy": 0.2575,
932
- "count": 400
933
  }
934
  }
935
  },
936
  "add_S6": {
937
- "full_accuracy": 0.1,
938
- "n_examples": 100,
 
939
  "per_subtask": {
940
  "SC": {
941
- "accuracy": 0.1,
942
- "count": 100
943
  },
944
  "UC": {
945
- "accuracy": 0.53,
946
- "count": 100
947
  },
948
  "US": {
949
- "accuracy": 0.506,
950
- "count": 500
951
  }
952
  }
953
  },
954
  "add_random": {
955
  "full_accuracy": 0.0,
 
956
  "n_examples": 200,
957
  "per_subtask": {
958
  "SA": {
959
- "accuracy": 0.46308724832214765,
960
- "count": 447
961
  },
962
  "SC": {
963
- "accuracy": 0.1875,
964
- "count": 320
965
  },
966
  "SS": {
967
- "accuracy": 0.6785714285714286,
968
- "count": 56
969
  },
970
  "UC": {
971
- "accuracy": 0.3856332703213611,
972
- "count": 529
973
  },
974
  "US": {
975
- "accuracy": 0.4791666666666667,
976
- "count": 48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
977
  }
978
  }
979
  },
980
  "add_C3": {
981
  "full_accuracy": 0.0,
982
- "n_examples": 100,
 
983
  "per_subtask": {
984
  "SA": {
985
- "accuracy": 0.47333333333333333,
986
- "count": 300
987
  },
988
  "SC": {
989
- "accuracy": 0.04,
990
- "count": 100
991
  },
992
  "UC": {
993
- "accuracy": 0.21761658031088082,
994
- "count": 193
995
  },
996
  "US": {
997
- "accuracy": 0.2336448598130841,
998
- "count": 107
999
  }
1000
  }
1001
  },
1002
  "add_C4": {
1003
  "full_accuracy": 0.0,
1004
- "n_examples": 100,
 
1005
  "per_subtask": {
1006
  "SA": {
1007
- "accuracy": 0.625,
1008
- "count": 200
1009
  },
1010
  "SC": {
1011
- "accuracy": 0.04,
1012
- "count": 100
1013
  },
1014
  "UC": {
1015
- "accuracy": 0.19140625,
1016
- "count": 256
1017
  },
1018
  "US": {
1019
- "accuracy": 0.2986111111111111,
1020
- "count": 144
1021
  }
1022
  }
1023
  },
1024
  "add_C5": {
1025
  "full_accuracy": 0.0,
1026
- "n_examples": 100,
 
1027
  "per_subtask": {
1028
  "SA": {
1029
- "accuracy": 0.62,
1030
- "count": 100
1031
  },
1032
  "SC": {
1033
- "accuracy": 0.08,
1034
- "count": 100
1035
  },
1036
  "UC": {
1037
- "accuracy": 0.25163398692810457,
1038
- "count": 306
1039
  },
1040
  "US": {
1041
- "accuracy": 0.38144329896907214,
1042
- "count": 194
1043
  }
1044
  }
1045
  },
1046
  "add_C6": {
1047
  "full_accuracy": 0.0,
1048
- "n_examples": 100,
 
1049
  "per_subtask": {
1050
  "SC": {
1051
- "accuracy": 0.14,
1052
- "count": 100
1053
  },
1054
  "UC": {
1055
- "accuracy": 0.32786885245901637,
1056
- "count": 366
1057
  },
1058
  "US": {
1059
- "accuracy": 0.7307692307692307,
1060
- "count": 234
1061
  }
1062
  }
1063
  },
1064
  "sub_M0": {
1065
  "full_accuracy": 0.0,
1066
- "n_examples": 100,
 
1067
  "per_subtask": {
1068
  "MD": {
1069
- "accuracy": 0.36605657237936773,
1070
- "count": 601
1071
  },
1072
  "ME": {
1073
- "accuracy": 0.8888888888888888,
1074
- "count": 99
1075
  }
1076
  }
1077
  },
1078
  "sub_M1": {
1079
  "full_accuracy": 0.0,
1080
- "n_examples": 100,
 
1081
  "per_subtask": {
1082
  "MD": {
1083
- "accuracy": 0.5913978494623656,
1084
- "count": 279
1085
  },
1086
  "MB": {
1087
- "accuracy": 0.034482758620689655,
1088
- "count": 145
1089
  },
1090
  "ME": {
1091
- "accuracy": 0.75,
1092
- "count": 24
1093
  },
1094
  "UB": {
1095
- "accuracy": 0.2222222222222222,
1096
- "count": 252
1097
  }
1098
  }
1099
  },
1100
  "sub_M2": {
1101
  "full_accuracy": 0.0,
1102
- "n_examples": 100,
 
1103
  "per_subtask": {
1104
  "MD": {
1105
- "accuracy": 0.704225352112676,
1106
- "count": 213
1107
  },
1108
  "MB": {
1109
- "accuracy": 0.04424778761061947,
1110
- "count": 113
1111
  },
1112
  "ME": {
1113
- "accuracy": 0.9647058823529412,
1114
- "count": 85
1115
  },
1116
  "UB": {
1117
- "accuracy": 0.281767955801105,
1118
- "count": 181
1119
  },
1120
  "UD": {
1121
- "accuracy": 0.12037037037037036,
1122
- "count": 108
1123
  }
1124
  }
1125
  },
1126
  "sub_M3": {
1127
  "full_accuracy": 0.0,
1128
- "n_examples": 100,
 
1129
  "per_subtask": {
1130
  "MD": {
1131
- "accuracy": 0.8156424581005587,
1132
- "count": 179
1133
  },
1134
  "MB": {
1135
  "accuracy": 0.0,
1136
- "count": 103
1137
  },
1138
  "ME": {
1139
- "accuracy": 0.9285714285714286,
1140
- "count": 56
1141
  },
1142
  "UB": {
1143
- "accuracy": 0.348993288590604,
1144
- "count": 149
1145
  },
1146
  "UD": {
1147
- "accuracy": 0.0892018779342723,
1148
- "count": 213
1149
  }
1150
  }
1151
  },
1152
  "sub_M4": {
1153
  "full_accuracy": 0.0,
1154
- "n_examples": 100,
 
1155
  "per_subtask": {
1156
  "MD": {
1157
- "accuracy": 0.76,
1158
- "count": 200
1159
  },
1160
  "MB": {
1161
- "accuracy": 0.01,
1162
- "count": 100
1163
  },
1164
  "UB": {
1165
- "accuracy": 0.44,
1166
- "count": 100
1167
  },
1168
  "UD": {
1169
- "accuracy": 0.08333333333333333,
1170
- "count": 300
1171
  }
1172
  }
1173
  },
1174
  "sub_M5": {
1175
- "full_accuracy": 0.01,
1176
- "n_examples": 100,
 
1177
  "per_subtask": {
1178
  "MD": {
1179
  "accuracy": 1.0,
1180
- "count": 100
1181
  },
1182
  "MB": {
1183
- "accuracy": 0.04,
1184
- "count": 100
1185
  },
1186
  "UB": {
1187
- "accuracy": 0.7,
1188
- "count": 100
1189
  },
1190
  "UD": {
1191
- "accuracy": 0.065,
1192
- "count": 400
1193
  }
1194
  }
1195
  },
1196
  "sub_random": {
1197
  "full_accuracy": 0.0,
 
1198
  "n_examples": 200,
1199
  "per_subtask": {
1200
  "MD": {
1201
- "accuracy": 0.555,
1202
- "count": 600
1203
  },
1204
  "MB": {
1205
- "accuracy": 0.03745318352059925,
1206
- "count": 267
1207
  },
1208
  "ME": {
1209
- "accuracy": 0.9245283018867925,
1210
  "count": 53
1211
  },
1212
  "UB": {
1213
- "accuracy": 0.275626423690205,
1214
- "count": 439
1215
  },
1216
  "UD": {
1217
- "accuracy": 0.1951219512195122,
1218
- "count": 41
1219
  }
1220
  }
1221
  },
1222
  "sub_B3": {
1223
  "full_accuracy": 0.0,
1224
- "n_examples": 100,
 
1225
  "per_subtask": {
1226
  "MD": {
1227
- "accuracy": 0.55,
1228
- "count": 300
1229
  },
1230
  "MB": {
1231
- "accuracy": 0.07,
1232
- "count": 100
1233
  },
1234
  "UB": {
1235
- "accuracy": 0.2233502538071066,
1236
- "count": 197
1237
  },
1238
  "UD": {
1239
- "accuracy": 0.1941747572815534,
1240
- "count": 103
1241
  }
1242
  }
1243
  },
1244
  "sub_B4": {
1245
  "full_accuracy": 0.0,
1246
- "n_examples": 100,
 
1247
  "per_subtask": {
1248
  "MD": {
1249
- "accuracy": 0.68,
1250
- "count": 200
1251
  },
1252
  "MB": {
1253
- "accuracy": 0.03,
1254
- "count": 100
1255
  },
1256
  "UB": {
1257
- "accuracy": 0.2550607287449393,
1258
- "count": 247
1259
  },
1260
  "UD": {
1261
- "accuracy": 0.1568627450980392,
1262
- "count": 153
1263
  }
1264
  }
1265
  },
1266
  "sub_B5": {
1267
  "full_accuracy": 0.0,
1268
- "n_examples": 100,
 
1269
  "per_subtask": {
1270
  "MD": {
1271
  "accuracy": 1.0,
1272
- "count": 100
1273
  },
1274
  "MB": {
1275
- "accuracy": 0.06,
1276
- "count": 100
1277
  },
1278
  "UB": {
1279
- "accuracy": 0.31208053691275167,
1280
- "count": 298
1281
  },
1282
  "UD": {
1283
- "accuracy": 0.19801980198019803,
1284
- "count": 202
1285
  }
1286
  }
1287
  }
1288
  },
1289
  "summary": {
1290
- "overall_accuracy": 0.004583333333333333,
1291
- "total_examples": 2400,
1292
- "n_splits": 22
 
1293
  }
1294
  },
1295
  "sorl_overall_accuracy": 0.004583333333333333,
 
290
  "K": null,
291
  "mode": "sft",
292
  "n_digits": 6,
293
+ "n_per_split": 50
294
  },
295
  "splits": {
296
  "add_S0": {
297
  "full_accuracy": 0.0,
298
+ "digit_accuracy": 0.3628571428571429,
299
+ "n_examples": 50,
300
  "per_subtask": {
301
  "SA": {
302
+ "accuracy": 0.2711864406779661,
303
+ "count": 295
304
  },
305
  "SS": {
306
+ "accuracy": 0.8545454545454545,
307
+ "count": 55
308
  }
309
  }
310
  },
311
  "add_S1": {
312
  "full_accuracy": 0.0,
313
+ "digit_accuracy": 0.22857142857142856,
314
+ "n_examples": 50,
315
  "per_subtask": {
316
  "SA": {
317
+ "accuracy": 0.2857142857142857,
318
+ "count": 126
319
  },
320
  "SC": {
321
+ "accuracy": 0.12658227848101267,
322
+ "count": 79
323
  },
324
  "SS": {
325
+ "accuracy": 0.3333333333333333,
326
+ "count": 21
327
  },
328
  "UC": {
329
+ "accuracy": 0.21774193548387097,
330
+ "count": 124
331
  }
332
  }
333
  },
334
  "add_S2": {
335
  "full_accuracy": 0.0,
336
+ "digit_accuracy": 0.3457142857142857,
337
+ "n_examples": 50,
338
  "per_subtask": {
339
  "SA": {
340
+ "accuracy": 0.32,
341
+ "count": 75
342
  },
343
  "SC": {
344
+ "accuracy": 0.16129032258064516,
345
+ "count": 62
346
  },
347
  "SS": {
348
+ "accuracy": 0.358974358974359,
349
+ "count": 39
350
  },
351
  "UC": {
352
+ "accuracy": 0.35135135135135137,
353
+ "count": 111
354
  },
355
  "US": {
356
+ "accuracy": 0.5396825396825397,
357
+ "count": 63
358
  }
359
  }
360
  },
361
  "add_S3": {
362
  "full_accuracy": 0.0,
363
+ "digit_accuracy": 0.39714285714285713,
364
+ "n_examples": 50,
365
  "per_subtask": {
366
  "SA": {
367
+ "accuracy": 0.4166666666666667,
368
+ "count": 60
369
  },
370
  "SC": {
371
+ "accuracy": 0.10526315789473684,
372
+ "count": 57
373
  },
374
  "SS": {
375
+ "accuracy": 0.631578947368421,
376
+ "count": 19
377
  },
378
  "UC": {
379
+ "accuracy": 0.3942307692307692,
380
+ "count": 104
381
  },
382
  "US": {
383
+ "accuracy": 0.5,
384
+ "count": 110
385
  }
386
  }
387
  },
388
  "add_S4": {
389
  "full_accuracy": 0.0,
390
+ "digit_accuracy": 0.41714285714285715,
391
+ "n_examples": 50,
392
  "per_subtask": {
393
  "SA": {
394
+ "accuracy": 0.3541666666666667,
395
+ "count": 48
396
  },
397
  "SC": {
398
+ "accuracy": 0.11538461538461539,
399
+ "count": 52
400
  },
401
  "SS": {
402
+ "accuracy": 0.5714285714285714,
403
+ "count": 7
404
  },
405
  "UC": {
406
+ "accuracy": 0.3595505617977528,
407
+ "count": 89
408
  },
409
  "US": {
410
+ "accuracy": 0.564935064935065,
411
+ "count": 154
412
  }
413
  }
414
  },
415
  "add_S5": {
416
+ "full_accuracy": 0.0,
417
+ "digit_accuracy": 0.34,
418
+ "n_examples": 50,
419
  "per_subtask": {
420
  "SA": {
421
+ "accuracy": 0.54,
422
+ "count": 50
423
  },
424
  "SC": {
425
+ "accuracy": 0.12,
426
+ "count": 50
427
  },
428
  "UC": {
429
+ "accuracy": 0.44,
430
+ "count": 50
431
  },
432
  "US": {
433
+ "accuracy": 0.32,
434
+ "count": 200
435
  }
436
  }
437
  },
438
  "add_S6": {
439
+ "full_accuracy": 0.18,
440
+ "digit_accuracy": 0.3514285714285714,
441
+ "n_examples": 50,
442
  "per_subtask": {
443
  "SC": {
444
+ "accuracy": 0.18,
445
+ "count": 50
446
  },
447
  "UC": {
448
+ "accuracy": 0.38,
449
+ "count": 50
450
  },
451
  "US": {
452
+ "accuracy": 0.38,
453
+ "count": 250
454
  }
455
  }
456
  },
457
  "add_random": {
458
  "full_accuracy": 0.0,
459
+ "digit_accuracy": 0.26785714285714285,
460
  "n_examples": 200,
461
  "per_subtask": {
462
  "SA": {
463
+ "accuracy": 0.2691415313225058,
464
+ "count": 431
465
  },
466
  "SC": {
467
+ "accuracy": 0.14556962025316456,
468
+ "count": 316
469
  },
470
  "SS": {
471
+ "accuracy": 0.6410256410256411,
472
+ "count": 39
473
  },
474
  "UC": {
475
+ "accuracy": 0.29464285714285715,
476
+ "count": 560
477
  },
478
  "US": {
479
+ "accuracy": 0.42592592592592593,
480
+ "count": 54
481
+ }
482
+ }
483
+ },
484
+ "add_C1": {
485
+ "full_accuracy": 0.0,
486
+ "digit_accuracy": 0.21428571428571427,
487
+ "n_examples": 50,
488
+ "per_subtask": {
489
+ "SA": {
490
+ "accuracy": 0.256,
491
+ "count": 250
492
+ },
493
+ "SC": {
494
+ "accuracy": 0.12,
495
+ "count": 50
496
+ },
497
+ "UC": {
498
+ "accuracy": 0.1,
499
+ "count": 50
500
+ }
501
+ }
502
+ },
503
+ "add_C2": {
504
+ "full_accuracy": 0.0,
505
+ "digit_accuracy": 0.24857142857142858,
506
+ "n_examples": 50,
507
+ "per_subtask": {
508
+ "SA": {
509
+ "accuracy": 0.255,
510
+ "count": 200
511
+ },
512
+ "SC": {
513
+ "accuracy": 0.14,
514
+ "count": 50
515
+ },
516
+ "UC": {
517
+ "accuracy": 0.26506024096385544,
518
+ "count": 83
519
+ },
520
+ "US": {
521
+ "accuracy": 0.4117647058823529,
522
+ "count": 17
523
  }
524
  }
525
  },
526
  "add_C3": {
527
  "full_accuracy": 0.0,
528
+ "digit_accuracy": 0.26285714285714284,
529
+ "n_examples": 50,
530
  "per_subtask": {
531
  "SA": {
532
+ "accuracy": 0.30666666666666664,
533
+ "count": 150
534
  },
535
  "SC": {
536
  "accuracy": 0.1,
537
+ "count": 50
538
  },
539
  "UC": {
540
+ "accuracy": 0.23,
541
+ "count": 100
542
  },
543
  "US": {
544
+ "accuracy": 0.36,
545
+ "count": 50
546
  }
547
  }
548
  },
549
  "add_C4": {
550
  "full_accuracy": 0.0,
551
+ "digit_accuracy": 0.28,
552
+ "n_examples": 50,
553
  "per_subtask": {
554
  "SA": {
555
+ "accuracy": 0.47,
556
+ "count": 100
557
  },
558
  "SC": {
559
+ "accuracy": 0.08,
560
+ "count": 50
561
  },
562
  "UC": {
563
+ "accuracy": 0.13636363636363635,
564
+ "count": 132
565
  },
566
  "US": {
567
+ "accuracy": 0.4264705882352941,
568
+ "count": 68
569
  }
570
  }
571
  },
572
  "add_C5": {
573
+ "full_accuracy": 0.0,
574
+ "digit_accuracy": 0.26,
575
+ "n_examples": 50,
576
  "per_subtask": {
577
  "SA": {
578
+ "accuracy": 0.4,
579
+ "count": 50
580
  },
581
  "SC": {
582
+ "accuracy": 0.04,
583
+ "count": 50
584
  },
585
  "UC": {
586
+ "accuracy": 0.2328767123287671,
587
+ "count": 146
588
  },
589
  "US": {
590
+ "accuracy": 0.33653846153846156,
591
+ "count": 104
592
  }
593
  }
594
  },
595
  "add_C6": {
596
  "full_accuracy": 0.0,
597
+ "digit_accuracy": 0.4085714285714286,
598
+ "n_examples": 50,
599
  "per_subtask": {
600
  "SC": {
601
+ "accuracy": 0.14,
602
+ "count": 50
603
  },
604
  "UC": {
605
+ "accuracy": 0.31216931216931215,
606
+ "count": 189
607
  },
608
  "US": {
609
+ "accuracy": 0.6936936936936937,
610
+ "count": 111
611
  }
612
  }
613
  },
614
  "sub_M0": {
615
  "full_accuracy": 0.0,
616
+ "digit_accuracy": 0.3457142857142857,
617
+ "n_examples": 50,
618
  "per_subtask": {
619
  "MD": {
620
+ "accuracy": 0.26732673267326734,
621
+ "count": 303
622
  },
623
  "ME": {
624
+ "accuracy": 0.851063829787234,
625
+ "count": 47
626
  }
627
  }
628
  },
629
  "sub_M1": {
630
  "full_accuracy": 0.0,
631
+ "digit_accuracy": 0.3171428571428571,
632
+ "n_examples": 50,
633
  "per_subtask": {
634
  "MD": {
635
+ "accuracy": 0.46099290780141844,
636
+ "count": 141
637
  },
638
  "MB": {
639
+ "accuracy": 0.027777777777777776,
640
+ "count": 72
641
  },
642
  "ME": {
643
+ "accuracy": 1.0,
644
+ "count": 18
645
  },
646
  "UB": {
647
+ "accuracy": 0.2184873949579832,
648
+ "count": 119
649
  }
650
  }
651
  },
652
  "sub_M2": {
653
  "full_accuracy": 0.0,
654
+ "digit_accuracy": 0.4085714285714286,
655
+ "n_examples": 50,
656
  "per_subtask": {
657
  "MD": {
658
+ "accuracy": 0.6696428571428571,
659
+ "count": 112
660
  },
661
  "MB": {
662
+ "accuracy": 0.018867924528301886,
663
+ "count": 53
664
  },
665
  "ME": {
666
+ "accuracy": 0.9148936170212766,
667
+ "count": 47
668
  },
669
  "UB": {
670
+ "accuracy": 0.21176470588235294,
671
+ "count": 85
672
  },
673
  "UD": {
674
+ "accuracy": 0.11320754716981132,
675
+ "count": 53
676
  }
677
  }
678
  },
679
  "sub_M3": {
680
  "full_accuracy": 0.0,
681
+ "digit_accuracy": 0.32,
682
+ "n_examples": 50,
683
  "per_subtask": {
684
  "MD": {
685
+ "accuracy": 0.6804123711340206,
686
+ "count": 97
687
  },
688
  "MB": {
689
+ "accuracy": 0.0,
690
+ "count": 51
691
  },
692
  "ME": {
693
+ "accuracy": 0.8518518518518519,
694
+ "count": 27
695
  },
696
  "UB": {
697
+ "accuracy": 0.20270270270270271,
698
+ "count": 74
699
  },
700
  "UD": {
701
+ "accuracy": 0.07920792079207921,
702
+ "count": 101
703
  }
704
  }
705
  },
706
  "sub_M4": {
707
  "full_accuracy": 0.0,
708
+ "digit_accuracy": 0.29428571428571426,
709
+ "n_examples": 50,
710
  "per_subtask": {
711
  "MD": {
712
+ "accuracy": 0.55,
713
+ "count": 100
714
  },
715
  "MB": {
716
+ "accuracy": 0.02,
717
+ "count": 50
718
  },
719
  "UB": {
720
+ "accuracy": 0.58,
721
+ "count": 50
722
  },
723
  "UD": {
724
+ "accuracy": 0.12,
725
+ "count": 150
726
  }
727
  }
728
  },
729
  "sub_M5": {
730
  "full_accuracy": 0.02,
731
+ "digit_accuracy": 0.29428571428571426,
732
+ "n_examples": 50,
733
  "per_subtask": {
734
  "MD": {
735
  "accuracy": 1.0,
736
+ "count": 50
737
  },
738
  "MB": {
739
+ "accuracy": 0.04,
740
+ "count": 50
741
  },
742
  "UB": {
743
+ "accuracy": 0.54,
744
+ "count": 50
745
  },
746
  "UD": {
747
+ "accuracy": 0.12,
748
+ "count": 200
749
  }
750
  }
751
  },
752
  "sub_random": {
753
  "full_accuracy": 0.0,
754
+ "digit_accuracy": 0.27,
755
  "n_examples": 200,
756
  "per_subtask": {
757
  "MD": {
758
+ "accuracy": 0.42105263157894735,
759
+ "count": 570
760
  },
761
  "MB": {
762
+ "accuracy": 0.021660649819494584,
763
+ "count": 277
764
  },
765
  "ME": {
766
  "accuracy": 0.8679245283018868,
767
  "count": 53
768
  },
769
  "UB": {
770
+ "accuracy": 0.17834394904458598,
771
+ "count": 471
772
  },
773
  "UD": {
774
+ "accuracy": 0.06896551724137931,
775
+ "count": 29
776
  }
777
  }
778
  },
779
  "sub_B3": {
780
  "full_accuracy": 0.0,
781
+ "digit_accuracy": 0.24,
782
+ "n_examples": 50,
783
  "per_subtask": {
784
  "MD": {
785
+ "accuracy": 0.38666666666666666,
786
+ "count": 150
787
  },
788
  "MB": {
789
+ "accuracy": 0.02,
790
+ "count": 50
791
  },
792
  "UB": {
793
+ "accuracy": 0.2376237623762376,
794
+ "count": 101
795
  },
796
  "UD": {
797
+ "accuracy": 0.02040816326530612,
798
+ "count": 49
799
  }
800
  }
801
  },
802
  "sub_B4": {
803
  "full_accuracy": 0.0,
804
+ "digit_accuracy": 0.24,
805
+ "n_examples": 50,
806
  "per_subtask": {
807
  "MD": {
808
+ "accuracy": 0.6,
809
+ "count": 100
810
  },
811
  "MB": {
812
+ "accuracy": 0.04,
813
+ "count": 50
814
  },
815
  "UB": {
816
+ "accuracy": 0.1652892561983471,
817
+ "count": 121
818
  },
819
  "UD": {
820
+ "accuracy": 0.02531645569620253,
821
+ "count": 79
822
  }
823
  }
824
  },
825
  "sub_B5": {
826
  "full_accuracy": 0.0,
827
+ "digit_accuracy": 0.24571428571428572,
828
+ "n_examples": 50,
829
  "per_subtask": {
830
  "MD": {
831
  "accuracy": 1.0,
832
+ "count": 50
833
  },
834
  "MB": {
835
+ "accuracy": 0.02,
836
+ "count": 50
837
  },
838
  "UB": {
839
+ "accuracy": 0.17763157894736842,
840
+ "count": 152
841
  },
842
  "UD": {
843
+ "accuracy": 0.08163265306122448,
844
+ "count": 98
845
  }
846
  }
847
  }
848
  },
849
  "summary": {
850
+ "overall_accuracy": 0.006666666666666667,
851
+ "digit_accuracy": 0.29895238095238097,
852
+ "total_examples": 1500,
853
+ "n_splits": 24
854
  }
855
  },
856
  "sorl_eval": {
 
859
  "K": 4,
860
  "mode": "sorl",
861
  "n_digits": 6,
862
+ "n_per_split": 50
863
  },
864
  "splits": {
865
  "add_S0": {
866
  "full_accuracy": 0.0,
867
+ "digit_accuracy": 0.48,
868
+ "n_examples": 50,
869
  "per_subtask": {
870
  "SA": {
871
+ "accuracy": 0.39661016949152544,
872
+ "count": 295
873
  },
874
  "SS": {
875
+ "accuracy": 0.9272727272727272,
876
+ "count": 55
877
  }
878
  }
879
  },
880
  "add_S1": {
881
  "full_accuracy": 0.0,
882
+ "digit_accuracy": 0.35714285714285715,
883
+ "n_examples": 50,
884
  "per_subtask": {
885
  "SA": {
886
+ "accuracy": 0.48412698412698413,
887
+ "count": 126
888
  },
889
  "SC": {
890
+ "accuracy": 0.13924050632911392,
891
+ "count": 79
892
  },
893
  "SS": {
894
+ "accuracy": 0.6666666666666666,
895
+ "count": 21
896
  },
897
  "UC": {
898
+ "accuracy": 0.31451612903225806,
899
+ "count": 124
900
  }
901
  }
902
  },
903
  "add_S2": {
904
  "full_accuracy": 0.0,
905
+ "digit_accuracy": 0.4142857142857143,
906
+ "n_examples": 50,
907
  "per_subtask": {
908
  "SA": {
909
+ "accuracy": 0.49333333333333335,
910
+ "count": 75
911
  },
912
  "SC": {
913
+ "accuracy": 0.1774193548387097,
914
+ "count": 62
915
  },
916
  "SS": {
917
+ "accuracy": 0.38461538461538464,
918
+ "count": 39
919
  },
920
  "UC": {
921
+ "accuracy": 0.40540540540540543,
922
+ "count": 111
923
  },
924
  "US": {
925
+ "accuracy": 0.5873015873015873,
926
+ "count": 63
927
  }
928
  }
929
  },
930
  "add_S3": {
931
  "full_accuracy": 0.0,
932
+ "digit_accuracy": 0.42,
933
+ "n_examples": 50,
934
  "per_subtask": {
935
  "SA": {
936
+ "accuracy": 0.5666666666666667,
937
+ "count": 60
938
  },
939
  "SC": {
940
+ "accuracy": 0.08771929824561403,
941
+ "count": 57
942
  },
943
  "SS": {
944
+ "accuracy": 0.7894736842105263,
945
+ "count": 19
946
  },
947
  "UC": {
948
+ "accuracy": 0.40384615384615385,
949
+ "count": 104
950
  },
951
  "US": {
952
+ "accuracy": 0.4636363636363636,
953
+ "count": 110
954
  }
955
  }
956
  },
957
  "add_S4": {
958
  "full_accuracy": 0.0,
959
+ "digit_accuracy": 0.44857142857142857,
960
+ "n_examples": 50,
961
  "per_subtask": {
962
  "SA": {
963
+ "accuracy": 0.4583333333333333,
964
+ "count": 48
965
  },
966
  "SC": {
967
+ "accuracy": 0.11538461538461539,
968
+ "count": 52
969
  },
970
  "SS": {
971
+ "accuracy": 0.7142857142857143,
972
+ "count": 7
973
  },
974
  "UC": {
975
+ "accuracy": 0.42696629213483145,
976
+ "count": 89
977
  },
978
  "US": {
979
+ "accuracy": 0.5584415584415584,
980
+ "count": 154
981
  }
982
  }
983
  },
984
  "add_S5": {
985
  "full_accuracy": 0.0,
986
+ "digit_accuracy": 0.32,
987
+ "n_examples": 50,
988
  "per_subtask": {
989
  "SA": {
990
+ "accuracy": 0.58,
991
+ "count": 50
992
  },
993
  "SC": {
994
+ "accuracy": 0.1,
995
+ "count": 50
996
  },
997
  "UC": {
998
+ "accuracy": 0.36,
999
+ "count": 50
1000
  },
1001
  "US": {
1002
+ "accuracy": 0.3,
1003
+ "count": 200
1004
  }
1005
  }
1006
  },
1007
  "add_S6": {
1008
+ "full_accuracy": 0.18,
1009
+ "digit_accuracy": 0.4657142857142857,
1010
+ "n_examples": 50,
1011
  "per_subtask": {
1012
  "SC": {
1013
+ "accuracy": 0.18,
1014
+ "count": 50
1015
  },
1016
  "UC": {
1017
+ "accuracy": 0.54,
1018
+ "count": 50
1019
  },
1020
  "US": {
1021
+ "accuracy": 0.508,
1022
+ "count": 250
1023
  }
1024
  }
1025
  },
1026
  "add_random": {
1027
  "full_accuracy": 0.0,
1028
+ "digit_accuracy": 0.35714285714285715,
1029
  "n_examples": 200,
1030
  "per_subtask": {
1031
  "SA": {
1032
+ "accuracy": 0.4361948955916473,
1033
+ "count": 431
1034
  },
1035
  "SC": {
1036
+ "accuracy": 0.17721518987341772,
1037
+ "count": 316
1038
  },
1039
  "SS": {
1040
+ "accuracy": 0.7948717948717948,
1041
+ "count": 39
1042
  },
1043
  "UC": {
1044
+ "accuracy": 0.35,
1045
+ "count": 560
1046
  },
1047
  "US": {
1048
+ "accuracy": 0.5370370370370371,
1049
+ "count": 54
1050
+ }
1051
+ }
1052
+ },
1053
+ "add_C1": {
1054
+ "full_accuracy": 0.0,
1055
+ "digit_accuracy": 0.3342857142857143,
1056
+ "n_examples": 50,
1057
+ "per_subtask": {
1058
+ "SA": {
1059
+ "accuracy": 0.416,
1060
+ "count": 250
1061
+ },
1062
+ "SC": {
1063
+ "accuracy": 0.12,
1064
+ "count": 50
1065
+ },
1066
+ "UC": {
1067
+ "accuracy": 0.14,
1068
+ "count": 50
1069
+ }
1070
+ }
1071
+ },
1072
+ "add_C2": {
1073
+ "full_accuracy": 0.0,
1074
+ "digit_accuracy": 0.3057142857142857,
1075
+ "n_examples": 50,
1076
+ "per_subtask": {
1077
+ "SA": {
1078
+ "accuracy": 0.42,
1079
+ "count": 200
1080
+ },
1081
+ "SC": {
1082
+ "accuracy": 0.06,
1083
+ "count": 50
1084
+ },
1085
+ "UC": {
1086
+ "accuracy": 0.1927710843373494,
1087
+ "count": 83
1088
+ },
1089
+ "US": {
1090
+ "accuracy": 0.23529411764705882,
1091
+ "count": 17
1092
  }
1093
  }
1094
  },
1095
  "add_C3": {
1096
  "full_accuracy": 0.0,
1097
+ "digit_accuracy": 0.3514285714285714,
1098
+ "n_examples": 50,
1099
  "per_subtask": {
1100
  "SA": {
1101
+ "accuracy": 0.52,
1102
+ "count": 150
1103
  },
1104
  "SC": {
1105
+ "accuracy": 0.1,
1106
+ "count": 50
1107
  },
1108
  "UC": {
1109
+ "accuracy": 0.24,
1110
+ "count": 100
1111
  },
1112
  "US": {
1113
+ "accuracy": 0.32,
1114
+ "count": 50
1115
  }
1116
  }
1117
  },
1118
  "add_C4": {
1119
  "full_accuracy": 0.0,
1120
+ "digit_accuracy": 0.35428571428571426,
1121
+ "n_examples": 50,
1122
  "per_subtask": {
1123
  "SA": {
1124
+ "accuracy": 0.61,
1125
+ "count": 100
1126
  },
1127
  "SC": {
1128
+ "accuracy": 0.1,
1129
+ "count": 50
1130
  },
1131
  "UC": {
1132
+ "accuracy": 0.1893939393939394,
1133
+ "count": 132
1134
  },
1135
  "US": {
1136
+ "accuracy": 0.4852941176470588,
1137
+ "count": 68
1138
  }
1139
  }
1140
  },
1141
  "add_C5": {
1142
  "full_accuracy": 0.0,
1143
+ "digit_accuracy": 0.36857142857142855,
1144
+ "n_examples": 50,
1145
  "per_subtask": {
1146
  "SA": {
1147
+ "accuracy": 0.56,
1148
+ "count": 50
1149
  },
1150
  "SC": {
1151
+ "accuracy": 0.1,
1152
+ "count": 50
1153
  },
1154
  "UC": {
1155
+ "accuracy": 0.2876712328767123,
1156
+ "count": 146
1157
  },
1158
  "US": {
1159
+ "accuracy": 0.5192307692307693,
1160
+ "count": 104
1161
  }
1162
  }
1163
  },
1164
  "add_C6": {
1165
  "full_accuracy": 0.0,
1166
+ "digit_accuracy": 0.3628571428571429,
1167
+ "n_examples": 50,
1168
  "per_subtask": {
1169
  "SC": {
1170
+ "accuracy": 0.06,
1171
+ "count": 50
1172
  },
1173
  "UC": {
1174
+ "accuracy": 0.32275132275132273,
1175
+ "count": 189
1176
  },
1177
  "US": {
1178
+ "accuracy": 0.5675675675675675,
1179
+ "count": 111
1180
  }
1181
  }
1182
  },
1183
  "sub_M0": {
1184
  "full_accuracy": 0.0,
1185
+ "digit_accuracy": 0.45714285714285713,
1186
+ "n_examples": 50,
1187
  "per_subtask": {
1188
  "MD": {
1189
+ "accuracy": 0.3795379537953795,
1190
+ "count": 303
1191
  },
1192
  "ME": {
1193
+ "accuracy": 0.9574468085106383,
1194
+ "count": 47
1195
  }
1196
  }
1197
  },
1198
  "sub_M1": {
1199
  "full_accuracy": 0.0,
1200
+ "digit_accuracy": 0.37142857142857144,
1201
+ "n_examples": 50,
1202
  "per_subtask": {
1203
  "MD": {
1204
+ "accuracy": 0.574468085106383,
1205
+ "count": 141
1206
  },
1207
  "MB": {
1208
+ "accuracy": 0.027777777777777776,
1209
+ "count": 72
1210
  },
1211
  "ME": {
1212
+ "accuracy": 0.8333333333333334,
1213
+ "count": 18
1214
  },
1215
  "UB": {
1216
+ "accuracy": 0.2689075630252101,
1217
+ "count": 119
1218
  }
1219
  }
1220
  },
1221
  "sub_M2": {
1222
  "full_accuracy": 0.0,
1223
+ "digit_accuracy": 0.4514285714285714,
1224
+ "n_examples": 50,
1225
  "per_subtask": {
1226
  "MD": {
1227
+ "accuracy": 0.7232142857142857,
1228
+ "count": 112
1229
  },
1230
  "MB": {
1231
+ "accuracy": 0.0,
1232
+ "count": 53
1233
  },
1234
  "ME": {
1235
+ "accuracy": 0.9361702127659575,
1236
+ "count": 47
1237
  },
1238
  "UB": {
1239
+ "accuracy": 0.3058823529411765,
1240
+ "count": 85
1241
  },
1242
  "UD": {
1243
+ "accuracy": 0.1320754716981132,
1244
+ "count": 53
1245
  }
1246
  }
1247
  },
1248
  "sub_M3": {
1249
  "full_accuracy": 0.0,
1250
+ "digit_accuracy": 0.3628571428571429,
1251
+ "n_examples": 50,
1252
  "per_subtask": {
1253
  "MD": {
1254
+ "accuracy": 0.7319587628865979,
1255
+ "count": 97
1256
  },
1257
  "MB": {
1258
  "accuracy": 0.0,
1259
+ "count": 51
1260
  },
1261
  "ME": {
1262
+ "accuracy": 0.9629629629629629,
1263
+ "count": 27
1264
  },
1265
  "UB": {
1266
+ "accuracy": 0.22972972972972974,
1267
+ "count": 74
1268
  },
1269
  "UD": {
1270
+ "accuracy": 0.12871287128712872,
1271
+ "count": 101
1272
  }
1273
  }
1274
  },
1275
  "sub_M4": {
1276
  "full_accuracy": 0.0,
1277
+ "digit_accuracy": 0.2914285714285714,
1278
+ "n_examples": 50,
1279
  "per_subtask": {
1280
  "MD": {
1281
+ "accuracy": 0.66,
1282
+ "count": 100
1283
  },
1284
  "MB": {
1285
+ "accuracy": 0.0,
1286
+ "count": 50
1287
  },
1288
  "UB": {
1289
+ "accuracy": 0.72,
1290
+ "count": 50
1291
  },
1292
  "UD": {
1293
+ "accuracy": 0.0,
1294
+ "count": 150
1295
  }
1296
  }
1297
  },
1298
  "sub_M5": {
1299
+ "full_accuracy": 0.0,
1300
+ "digit_accuracy": 0.24285714285714285,
1301
+ "n_examples": 50,
1302
  "per_subtask": {
1303
  "MD": {
1304
  "accuracy": 1.0,
1305
+ "count": 50
1306
  },
1307
  "MB": {
1308
+ "accuracy": 0.0,
1309
+ "count": 50
1310
  },
1311
  "UB": {
1312
+ "accuracy": 0.58,
1313
+ "count": 50
1314
  },
1315
  "UD": {
1316
+ "accuracy": 0.03,
1317
+ "count": 200
1318
  }
1319
  }
1320
  },
1321
  "sub_random": {
1322
  "full_accuracy": 0.0,
1323
+ "digit_accuracy": 0.35642857142857143,
1324
  "n_examples": 200,
1325
  "per_subtask": {
1326
  "MD": {
1327
+ "accuracy": 0.5508771929824562,
1328
+ "count": 570
1329
  },
1330
  "MB": {
1331
+ "accuracy": 0.04693140794223827,
1332
+ "count": 277
1333
  },
1334
  "ME": {
1335
+ "accuracy": 0.8113207547169812,
1336
  "count": 53
1337
  },
1338
  "UB": {
1339
+ "accuracy": 0.25902335456475584,
1340
+ "count": 471
1341
  },
1342
  "UD": {
1343
+ "accuracy": 0.2413793103448276,
1344
+ "count": 29
1345
  }
1346
  }
1347
  },
1348
  "sub_B3": {
1349
  "full_accuracy": 0.0,
1350
+ "digit_accuracy": 0.3314285714285714,
1351
+ "n_examples": 50,
1352
  "per_subtask": {
1353
  "MD": {
1354
+ "accuracy": 0.5333333333333333,
1355
+ "count": 150
1356
  },
1357
  "MB": {
1358
+ "accuracy": 0.02,
1359
+ "count": 50
1360
  },
1361
  "UB": {
1362
+ "accuracy": 0.27722772277227725,
1363
+ "count": 101
1364
  },
1365
  "UD": {
1366
+ "accuracy": 0.14285714285714285,
1367
+ "count": 49
1368
  }
1369
  }
1370
  },
1371
  "sub_B4": {
1372
  "full_accuracy": 0.0,
1373
+ "digit_accuracy": 0.3142857142857143,
1374
+ "n_examples": 50,
1375
  "per_subtask": {
1376
  "MD": {
1377
+ "accuracy": 0.74,
1378
+ "count": 100
1379
  },
1380
  "MB": {
1381
+ "accuracy": 0.0,
1382
+ "count": 50
1383
  },
1384
  "UB": {
1385
+ "accuracy": 0.21487603305785125,
1386
+ "count": 121
1387
  },
1388
  "UD": {
1389
+ "accuracy": 0.12658227848101267,
1390
+ "count": 79
1391
  }
1392
  }
1393
  },
1394
  "sub_B5": {
1395
  "full_accuracy": 0.0,
1396
+ "digit_accuracy": 0.3142857142857143,
1397
+ "n_examples": 50,
1398
  "per_subtask": {
1399
  "MD": {
1400
  "accuracy": 1.0,
1401
+ "count": 50
1402
  },
1403
  "MB": {
1404
+ "accuracy": 0.04,
1405
+ "count": 50
1406
  },
1407
  "UB": {
1408
+ "accuracy": 0.29605263157894735,
1409
+ "count": 152
1410
  },
1411
  "UD": {
1412
+ "accuracy": 0.1326530612244898,
1413
+ "count": 98
1414
  }
1415
  }
1416
  }
1417
  },
1418
  "summary": {
1419
+ "overall_accuracy": 0.006,
1420
+ "digit_accuracy": 0.36552380952380953,
1421
+ "total_examples": 1500,
1422
+ "n_splits": 24
1423
  }
1424
  },
1425
  "sorl_overall_accuracy": 0.004583333333333333,