amirali1985 commited on
Commit
3e26788
·
verified ·
1 Parent(s): de47b00

Upload add_sub_baseline_100K_1L3H510d/metrics.json with huggingface_hub

Browse files
add_sub_baseline_100K_1L3H510d/metrics.json CHANGED
@@ -2582,502 +2582,567 @@
2582
  "K": null,
2583
  "mode": "sft",
2584
  "n_digits": 6,
2585
- "n_per_split": 100
2586
  },
2587
  "splits": {
2588
  "add_S0": {
2589
- "full_accuracy": 0.99,
2590
- "n_examples": 100,
 
2591
  "per_subtask": {
2592
  "SA": {
2593
- "accuracy": 0.9983471074380166,
2594
- "count": 605
2595
  },
2596
  "SS": {
2597
  "accuracy": 1.0,
2598
- "count": 95
2599
  }
2600
  }
2601
  },
2602
  "add_S1": {
2603
- "full_accuracy": 0.96,
2604
- "n_examples": 100,
 
2605
  "per_subtask": {
2606
  "SA": {
2607
- "accuracy": 0.9901960784313726,
2608
- "count": 204
2609
  },
2610
  "SC": {
2611
- "accuracy": 1.0,
2612
- "count": 169
2613
  },
2614
  "SS": {
2615
  "accuracy": 1.0,
2616
- "count": 31
2617
  },
2618
  "UC": {
2619
- "accuracy": 0.9932432432432432,
2620
- "count": 296
2621
  }
2622
  }
2623
  },
2624
  "add_S2": {
2625
- "full_accuracy": 0.8,
2626
- "n_examples": 100,
 
2627
  "per_subtask": {
2628
  "SA": {
2629
- "accuracy": 1.0,
2630
- "count": 163
2631
  },
2632
  "SC": {
2633
- "accuracy": 0.9846153846153847,
2634
- "count": 130
2635
  },
2636
  "SS": {
2637
- "accuracy": 0.9540229885057471,
2638
- "count": 87
2639
  },
2640
  "UC": {
2641
- "accuracy": 0.9211822660098522,
2642
- "count": 203
2643
  },
2644
  "US": {
2645
  "accuracy": 1.0,
2646
- "count": 117
2647
  }
2648
  }
2649
  },
2650
  "add_S3": {
2651
- "full_accuracy": 0.57,
2652
- "n_examples": 100,
 
2653
  "per_subtask": {
2654
  "SA": {
2655
- "accuracy": 0.9917355371900827,
2656
- "count": 121
2657
  },
2658
  "SC": {
2659
- "accuracy": 0.9917355371900827,
2660
- "count": 121
2661
  },
2662
  "SS": {
2663
  "accuracy": 1.0,
2664
- "count": 49
2665
  },
2666
  "UC": {
2667
- "accuracy": 0.7795698924731183,
2668
- "count": 186
2669
  },
2670
  "US": {
2671
- "accuracy": 0.968609865470852,
2672
- "count": 223
2673
  }
2674
  }
2675
  },
2676
  "add_S4": {
2677
- "full_accuracy": 0.48,
2678
- "n_examples": 100,
 
2679
  "per_subtask": {
2680
  "SA": {
2681
  "accuracy": 1.0,
2682
- "count": 104
2683
  },
2684
  "SC": {
2685
  "accuracy": 1.0,
2686
- "count": 106
2687
  },
2688
  "SS": {
2689
  "accuracy": 1.0,
2690
- "count": 23
2691
  },
2692
  "UC": {
2693
- "accuracy": 0.76875,
2694
- "count": 160
2695
  },
2696
  "US": {
2697
- "accuracy": 0.7947882736156352,
2698
- "count": 307
2699
  }
2700
  }
2701
  },
2702
  "add_S5": {
2703
- "full_accuracy": 0.18,
2704
- "n_examples": 100,
 
2705
  "per_subtask": {
2706
  "SA": {
2707
  "accuracy": 1.0,
2708
- "count": 100
2709
  },
2710
  "SC": {
2711
  "accuracy": 1.0,
2712
- "count": 100
2713
  },
2714
  "UC": {
2715
- "accuracy": 0.36,
2716
- "count": 100
2717
  },
2718
  "US": {
2719
- "accuracy": 0.495,
2720
- "count": 400
2721
  }
2722
  }
2723
  },
2724
  "add_S6": {
2725
- "full_accuracy": 0.49,
2726
- "n_examples": 100,
 
2727
  "per_subtask": {
2728
  "SC": {
2729
  "accuracy": 1.0,
2730
- "count": 100
2731
  },
2732
  "UC": {
2733
- "accuracy": 0.65,
2734
- "count": 100
2735
  },
2736
  "US": {
2737
- "accuracy": 0.682,
2738
- "count": 500
2739
  }
2740
  }
2741
  },
2742
  "add_random": {
2743
- "full_accuracy": 0.925,
 
2744
  "n_examples": 200,
2745
  "per_subtask": {
2746
  "SA": {
2747
- "accuracy": 0.9932885906040269,
2748
- "count": 447
2749
  },
2750
  "SC": {
2751
- "accuracy": 0.9875,
2752
- "count": 320
2753
  },
2754
  "SS": {
2755
- "accuracy": 0.9821428571428571,
2756
- "count": 56
2757
  },
2758
  "UC": {
2759
- "accuracy": 0.9848771266540642,
2760
- "count": 529
2761
  },
2762
  "US": {
 
 
 
 
 
 
 
 
 
 
 
2763
  "accuracy": 1.0,
2764
- "count": 48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2765
  }
2766
  }
2767
  },
2768
  "add_C3": {
2769
- "full_accuracy": 0.67,
2770
- "n_examples": 100,
 
2771
  "per_subtask": {
2772
  "SA": {
2773
  "accuracy": 1.0,
2774
- "count": 300
2775
  },
2776
  "SC": {
2777
  "accuracy": 1.0,
2778
- "count": 100
2779
  },
2780
  "UC": {
2781
- "accuracy": 0.8393782383419689,
2782
- "count": 193
2783
  },
2784
  "US": {
2785
- "accuracy": 0.9065420560747663,
2786
- "count": 107
2787
  }
2788
  }
2789
  },
2790
  "add_C4": {
2791
- "full_accuracy": 0.67,
2792
- "n_examples": 100,
 
2793
  "per_subtask": {
2794
  "SA": {
2795
  "accuracy": 1.0,
2796
- "count": 200
2797
  },
2798
  "SC": {
2799
  "accuracy": 1.0,
2800
- "count": 100
2801
  },
2802
  "UC": {
2803
- "accuracy": 0.875,
2804
- "count": 256
2805
  },
2806
  "US": {
2807
- "accuracy": 0.8958333333333334,
2808
- "count": 144
2809
  }
2810
  }
2811
  },
2812
  "add_C5": {
2813
- "full_accuracy": 0.57,
2814
- "n_examples": 100,
 
2815
  "per_subtask": {
2816
  "SA": {
2817
  "accuracy": 1.0,
2818
- "count": 100
2819
  },
2820
  "SC": {
2821
  "accuracy": 1.0,
2822
- "count": 100
2823
  },
2824
  "UC": {
2825
- "accuracy": 0.8758169934640523,
2826
- "count": 306
2827
  },
2828
  "US": {
2829
- "accuracy": 0.8350515463917526,
2830
- "count": 194
2831
  }
2832
  }
2833
  },
2834
  "add_C6": {
2835
- "full_accuracy": 0.65,
2836
- "n_examples": 100,
 
2837
  "per_subtask": {
2838
  "SC": {
2839
  "accuracy": 1.0,
2840
- "count": 100
2841
  },
2842
  "UC": {
2843
- "accuracy": 0.912568306010929,
2844
- "count": 366
2845
  },
2846
  "US": {
2847
- "accuracy": 0.9316239316239316,
2848
- "count": 234
2849
  }
2850
  }
2851
  },
2852
  "sub_M0": {
2853
- "full_accuracy": 0.91,
2854
- "n_examples": 100,
 
2855
  "per_subtask": {
2856
  "MD": {
2857
- "accuracy": 0.9866888519134775,
2858
- "count": 601
2859
  },
2860
  "ME": {
2861
- "accuracy": 0.9797979797979798,
2862
- "count": 99
2863
  }
2864
  }
2865
  },
2866
  "sub_M1": {
2867
  "full_accuracy": 0.98,
2868
- "n_examples": 100,
 
2869
  "per_subtask": {
2870
  "MD": {
2871
- "accuracy": 0.996415770609319,
2872
- "count": 279
2873
  },
2874
  "MB": {
2875
- "accuracy": 0.993103448275862,
2876
- "count": 145
2877
  },
2878
  "ME": {
2879
  "accuracy": 1.0,
2880
- "count": 24
2881
  },
2882
  "UB": {
2883
  "accuracy": 1.0,
2884
- "count": 252
2885
  }
2886
  }
2887
  },
2888
  "sub_M2": {
2889
- "full_accuracy": 0.64,
2890
- "n_examples": 100,
 
2891
  "per_subtask": {
2892
  "MD": {
2893
- "accuracy": 0.9859154929577465,
2894
- "count": 213
2895
  },
2896
  "MB": {
2897
- "accuracy": 1.0,
2898
- "count": 113
2899
  },
2900
  "ME": {
2901
  "accuracy": 1.0,
2902
- "count": 85
2903
  },
2904
  "UB": {
2905
- "accuracy": 0.8011049723756906,
2906
- "count": 181
2907
  },
2908
  "UD": {
2909
- "accuracy": 1.0,
2910
- "count": 108
2911
  }
2912
  }
2913
  },
2914
  "sub_M3": {
2915
- "full_accuracy": 0.16,
2916
- "n_examples": 100,
 
2917
  "per_subtask": {
2918
  "MD": {
2919
- "accuracy": 1.0,
2920
- "count": 179
2921
  },
2922
  "MB": {
2923
- "accuracy": 1.0,
2924
- "count": 103
2925
  },
2926
  "ME": {
2927
  "accuracy": 1.0,
2928
- "count": 56
2929
  },
2930
  "UB": {
2931
- "accuracy": 0.4966442953020134,
2932
- "count": 149
2933
  },
2934
  "UD": {
2935
- "accuracy": 0.8356807511737089,
2936
- "count": 213
2937
  }
2938
  }
2939
  },
2940
  "sub_M4": {
2941
- "full_accuracy": 0.05,
2942
- "n_examples": 100,
 
2943
  "per_subtask": {
2944
  "MD": {
2945
  "accuracy": 1.0,
2946
- "count": 200
2947
  },
2948
  "MB": {
2949
  "accuracy": 1.0,
2950
- "count": 100
2951
  },
2952
  "UB": {
2953
- "accuracy": 0.43,
2954
- "count": 100
2955
  },
2956
  "UD": {
2957
  "accuracy": 0.46,
2958
- "count": 300
2959
  }
2960
  }
2961
  },
2962
  "sub_M5": {
2963
- "full_accuracy": 0.06,
2964
- "n_examples": 100,
 
2965
  "per_subtask": {
2966
  "MD": {
2967
  "accuracy": 1.0,
2968
- "count": 100
2969
  },
2970
  "MB": {
2971
  "accuracy": 1.0,
2972
- "count": 100
2973
  },
2974
  "UB": {
2975
- "accuracy": 0.44,
2976
- "count": 100
2977
  },
2978
  "UD": {
2979
- "accuracy": 0.405,
2980
- "count": 400
2981
  }
2982
  }
2983
  },
2984
  "sub_random": {
2985
- "full_accuracy": 0.895,
 
2986
  "n_examples": 200,
2987
  "per_subtask": {
2988
  "MD": {
2989
- "accuracy": 0.9916666666666667,
2990
- "count": 600
2991
  },
2992
  "MB": {
2993
- "accuracy": 1.0,
2994
- "count": 267
2995
  },
2996
  "ME": {
2997
  "accuracy": 1.0,
2998
  "count": 53
2999
  },
3000
  "UB": {
3001
- "accuracy": 0.9635535307517085,
3002
- "count": 439
3003
  },
3004
  "UD": {
3005
  "accuracy": 1.0,
3006
- "count": 41
3007
  }
3008
  }
3009
  },
3010
  "sub_B3": {
3011
- "full_accuracy": 0.51,
3012
- "n_examples": 100,
 
3013
  "per_subtask": {
3014
  "MD": {
3015
  "accuracy": 1.0,
3016
- "count": 300
3017
  },
3018
  "MB": {
3019
  "accuracy": 1.0,
3020
- "count": 100
3021
  },
3022
  "UB": {
3023
- "accuracy": 0.7868020304568528,
3024
- "count": 197
3025
  },
3026
  "UD": {
3027
- "accuracy": 0.8252427184466019,
3028
- "count": 103
3029
  }
3030
  }
3031
  },
3032
  "sub_B4": {
3033
- "full_accuracy": 0.33,
3034
- "n_examples": 100,
 
3035
  "per_subtask": {
3036
  "MD": {
3037
  "accuracy": 1.0,
3038
- "count": 200
3039
  },
3040
  "MB": {
3041
  "accuracy": 1.0,
3042
- "count": 100
3043
  },
3044
  "UB": {
3045
- "accuracy": 0.7732793522267206,
3046
- "count": 247
3047
  },
3048
  "UD": {
3049
- "accuracy": 0.7777777777777778,
3050
- "count": 153
3051
  }
3052
  }
3053
  },
3054
  "sub_B5": {
3055
- "full_accuracy": 0.29,
3056
- "n_examples": 100,
 
3057
  "per_subtask": {
3058
  "MD": {
3059
  "accuracy": 1.0,
3060
- "count": 100
3061
  },
3062
  "MB": {
3063
  "accuracy": 1.0,
3064
- "count": 100
3065
  },
3066
  "UB": {
3067
- "accuracy": 0.7751677852348994,
3068
- "count": 298
3069
  },
3070
  "UD": {
3071
- "accuracy": 0.7425742574257426,
3072
- "count": 202
3073
  }
3074
  }
3075
  }
3076
  },
3077
  "summary": {
3078
- "overall_accuracy": 0.6070833333333333,
3079
- "total_examples": 2400,
3080
- "n_splits": 22
 
3081
  }
3082
  }
3083
  }
 
2582
  "K": null,
2583
  "mode": "sft",
2584
  "n_digits": 6,
2585
+ "n_per_split": 50
2586
  },
2587
  "splits": {
2588
  "add_S0": {
2589
+ "full_accuracy": 0.98,
2590
+ "digit_accuracy": 0.9971428571428571,
2591
+ "n_examples": 50,
2592
  "per_subtask": {
2593
  "SA": {
2594
+ "accuracy": 0.9966101694915255,
2595
+ "count": 295
2596
  },
2597
  "SS": {
2598
  "accuracy": 1.0,
2599
+ "count": 55
2600
  }
2601
  }
2602
  },
2603
  "add_S1": {
2604
+ "full_accuracy": 0.94,
2605
+ "digit_accuracy": 0.9914285714285714,
2606
+ "n_examples": 50,
2607
  "per_subtask": {
2608
  "SA": {
2609
+ "accuracy": 1.0,
2610
+ "count": 126
2611
  },
2612
  "SC": {
2613
+ "accuracy": 0.9746835443037974,
2614
+ "count": 79
2615
  },
2616
  "SS": {
2617
  "accuracy": 1.0,
2618
+ "count": 21
2619
  },
2620
  "UC": {
2621
+ "accuracy": 0.9919354838709677,
2622
+ "count": 124
2623
  }
2624
  }
2625
  },
2626
  "add_S2": {
2627
+ "full_accuracy": 0.74,
2628
+ "digit_accuracy": 0.9542857142857143,
2629
+ "n_examples": 50,
2630
  "per_subtask": {
2631
  "SA": {
2632
+ "accuracy": 0.9733333333333334,
2633
+ "count": 75
2634
  },
2635
  "SC": {
2636
+ "accuracy": 0.9838709677419355,
2637
+ "count": 62
2638
  },
2639
  "SS": {
2640
+ "accuracy": 0.9230769230769231,
2641
+ "count": 39
2642
  },
2643
  "UC": {
2644
+ "accuracy": 0.9099099099099099,
2645
+ "count": 111
2646
  },
2647
  "US": {
2648
  "accuracy": 1.0,
2649
+ "count": 63
2650
  }
2651
  }
2652
  },
2653
  "add_S3": {
2654
+ "full_accuracy": 0.6,
2655
+ "digit_accuracy": 0.9285714285714286,
2656
+ "n_examples": 50,
2657
  "per_subtask": {
2658
  "SA": {
2659
+ "accuracy": 1.0,
2660
+ "count": 60
2661
  },
2662
  "SC": {
2663
+ "accuracy": 1.0,
2664
+ "count": 57
2665
  },
2666
  "SS": {
2667
  "accuracy": 1.0,
2668
+ "count": 19
2669
  },
2670
  "UC": {
2671
+ "accuracy": 0.8076923076923077,
2672
+ "count": 104
2673
  },
2674
  "US": {
2675
+ "accuracy": 0.9545454545454546,
2676
+ "count": 110
2677
  }
2678
  }
2679
  },
2680
  "add_S4": {
2681
+ "full_accuracy": 0.44,
2682
+ "digit_accuracy": 0.8228571428571428,
2683
+ "n_examples": 50,
2684
  "per_subtask": {
2685
  "SA": {
2686
  "accuracy": 1.0,
2687
+ "count": 48
2688
  },
2689
  "SC": {
2690
  "accuracy": 1.0,
2691
+ "count": 52
2692
  },
2693
  "SS": {
2694
  "accuracy": 1.0,
2695
+ "count": 7
2696
  },
2697
  "UC": {
2698
+ "accuracy": 0.7415730337078652,
2699
+ "count": 89
2700
  },
2701
  "US": {
2702
+ "accuracy": 0.7467532467532467,
2703
+ "count": 154
2704
  }
2705
  }
2706
  },
2707
  "add_S5": {
2708
+ "full_accuracy": 0.1,
2709
+ "digit_accuracy": 0.58,
2710
+ "n_examples": 50,
2711
  "per_subtask": {
2712
  "SA": {
2713
  "accuracy": 1.0,
2714
+ "count": 50
2715
  },
2716
  "SC": {
2717
  "accuracy": 1.0,
2718
+ "count": 50
2719
  },
2720
  "UC": {
2721
+ "accuracy": 0.32,
2722
+ "count": 50
2723
  },
2724
  "US": {
2725
+ "accuracy": 0.435,
2726
+ "count": 200
2727
  }
2728
  }
2729
  },
2730
  "add_S6": {
2731
+ "full_accuracy": 0.52,
2732
+ "digit_accuracy": 0.7428571428571429,
2733
+ "n_examples": 50,
2734
  "per_subtask": {
2735
  "SC": {
2736
  "accuracy": 1.0,
2737
+ "count": 50
2738
  },
2739
  "UC": {
2740
+ "accuracy": 0.66,
2741
+ "count": 50
2742
  },
2743
  "US": {
2744
+ "accuracy": 0.708,
2745
+ "count": 250
2746
  }
2747
  }
2748
  },
2749
  "add_random": {
2750
+ "full_accuracy": 0.94,
2751
+ "digit_accuracy": 0.99,
2752
  "n_examples": 200,
2753
  "per_subtask": {
2754
  "SA": {
2755
+ "accuracy": 0.9930394431554525,
2756
+ "count": 431
2757
  },
2758
  "SC": {
2759
+ "accuracy": 0.9968354430379747,
2760
+ "count": 316
2761
  },
2762
  "SS": {
2763
+ "accuracy": 0.9743589743589743,
2764
+ "count": 39
2765
  },
2766
  "UC": {
2767
+ "accuracy": 0.9857142857142858,
2768
+ "count": 560
2769
  },
2770
  "US": {
2771
+ "accuracy": 0.9814814814814815,
2772
+ "count": 54
2773
+ }
2774
+ }
2775
+ },
2776
+ "add_C1": {
2777
+ "full_accuracy": 0.94,
2778
+ "digit_accuracy": 0.9914285714285714,
2779
+ "n_examples": 50,
2780
+ "per_subtask": {
2781
+ "SA": {
2782
  "accuracy": 1.0,
2783
+ "count": 250
2784
+ },
2785
+ "SC": {
2786
+ "accuracy": 1.0,
2787
+ "count": 50
2788
+ },
2789
+ "UC": {
2790
+ "accuracy": 0.94,
2791
+ "count": 50
2792
+ }
2793
+ }
2794
+ },
2795
+ "add_C2": {
2796
+ "full_accuracy": 0.82,
2797
+ "digit_accuracy": 0.9657142857142857,
2798
+ "n_examples": 50,
2799
+ "per_subtask": {
2800
+ "SA": {
2801
+ "accuracy": 1.0,
2802
+ "count": 200
2803
+ },
2804
+ "SC": {
2805
+ "accuracy": 1.0,
2806
+ "count": 50
2807
+ },
2808
+ "UC": {
2809
+ "accuracy": 0.891566265060241,
2810
+ "count": 83
2811
+ },
2812
+ "US": {
2813
+ "accuracy": 0.8235294117647058,
2814
+ "count": 17
2815
  }
2816
  }
2817
  },
2818
  "add_C3": {
2819
+ "full_accuracy": 0.68,
2820
+ "digit_accuracy": 0.94,
2821
+ "n_examples": 50,
2822
  "per_subtask": {
2823
  "SA": {
2824
  "accuracy": 1.0,
2825
+ "count": 150
2826
  },
2827
  "SC": {
2828
  "accuracy": 1.0,
2829
+ "count": 50
2830
  },
2831
  "UC": {
2832
+ "accuracy": 0.86,
2833
+ "count": 100
2834
  },
2835
  "US": {
2836
+ "accuracy": 0.86,
2837
+ "count": 50
2838
  }
2839
  }
2840
  },
2841
  "add_C4": {
2842
+ "full_accuracy": 0.76,
2843
+ "digit_accuracy": 0.9571428571428572,
2844
+ "n_examples": 50,
2845
  "per_subtask": {
2846
  "SA": {
2847
  "accuracy": 1.0,
2848
+ "count": 100
2849
  },
2850
  "SC": {
2851
  "accuracy": 1.0,
2852
+ "count": 50
2853
  },
2854
  "UC": {
2855
+ "accuracy": 0.9166666666666666,
2856
+ "count": 132
2857
  },
2858
  "US": {
2859
+ "accuracy": 0.9411764705882353,
2860
+ "count": 68
2861
  }
2862
  }
2863
  },
2864
  "add_C5": {
2865
+ "full_accuracy": 0.62,
2866
+ "digit_accuracy": 0.9,
2867
+ "n_examples": 50,
2868
  "per_subtask": {
2869
  "SA": {
2870
  "accuracy": 1.0,
2871
+ "count": 50
2872
  },
2873
  "SC": {
2874
  "accuracy": 1.0,
2875
+ "count": 50
2876
  },
2877
  "UC": {
2878
+ "accuracy": 0.8561643835616438,
2879
+ "count": 146
2880
  },
2881
  "US": {
2882
+ "accuracy": 0.8653846153846154,
2883
+ "count": 104
2884
  }
2885
  }
2886
  },
2887
  "add_C6": {
2888
+ "full_accuracy": 0.68,
2889
+ "digit_accuracy": 0.94,
2890
+ "n_examples": 50,
2891
  "per_subtask": {
2892
  "SC": {
2893
  "accuracy": 1.0,
2894
+ "count": 50
2895
  },
2896
  "UC": {
2897
+ "accuracy": 0.9206349206349206,
2898
+ "count": 189
2899
  },
2900
  "US": {
2901
+ "accuracy": 0.9459459459459459,
2902
+ "count": 111
2903
  }
2904
  }
2905
  },
2906
  "sub_M0": {
2907
+ "full_accuracy": 0.92,
2908
+ "digit_accuracy": 0.9885714285714285,
2909
+ "n_examples": 50,
2910
  "per_subtask": {
2911
  "MD": {
2912
+ "accuracy": 0.9867986798679867,
2913
+ "count": 303
2914
  },
2915
  "ME": {
2916
+ "accuracy": 1.0,
2917
+ "count": 47
2918
  }
2919
  }
2920
  },
2921
  "sub_M1": {
2922
  "full_accuracy": 0.98,
2923
+ "digit_accuracy": 0.9971428571428571,
2924
+ "n_examples": 50,
2925
  "per_subtask": {
2926
  "MD": {
2927
+ "accuracy": 1.0,
2928
+ "count": 141
2929
  },
2930
  "MB": {
2931
+ "accuracy": 0.9861111111111112,
2932
+ "count": 72
2933
  },
2934
  "ME": {
2935
  "accuracy": 1.0,
2936
+ "count": 18
2937
  },
2938
  "UB": {
2939
  "accuracy": 1.0,
2940
+ "count": 119
2941
  }
2942
  }
2943
  },
2944
  "sub_M2": {
2945
+ "full_accuracy": 0.6,
2946
+ "digit_accuracy": 0.9371428571428572,
2947
+ "n_examples": 50,
2948
  "per_subtask": {
2949
  "MD": {
2950
+ "accuracy": 0.9910714285714286,
2951
+ "count": 112
2952
  },
2953
  "MB": {
2954
+ "accuracy": 0.9622641509433962,
2955
+ "count": 53
2956
  },
2957
  "ME": {
2958
  "accuracy": 1.0,
2959
+ "count": 47
2960
  },
2961
  "UB": {
2962
+ "accuracy": 0.788235294117647,
2963
+ "count": 85
2964
  },
2965
  "UD": {
2966
+ "accuracy": 0.9811320754716981,
2967
+ "count": 53
2968
  }
2969
  }
2970
  },
2971
  "sub_M3": {
2972
+ "full_accuracy": 0.24,
2973
+ "digit_accuracy": 0.86,
2974
+ "n_examples": 50,
2975
  "per_subtask": {
2976
  "MD": {
2977
+ "accuracy": 0.9896907216494846,
2978
+ "count": 97
2979
  },
2980
  "MB": {
2981
+ "accuracy": 0.9803921568627451,
2982
+ "count": 51
2983
  },
2984
  "ME": {
2985
  "accuracy": 1.0,
2986
+ "count": 27
2987
  },
2988
  "UB": {
2989
+ "accuracy": 0.581081081081081,
2990
+ "count": 74
2991
  },
2992
  "UD": {
2993
+ "accuracy": 0.8415841584158416,
2994
+ "count": 101
2995
  }
2996
  }
2997
  },
2998
  "sub_M4": {
2999
+ "full_accuracy": 0.04,
3000
+ "digit_accuracy": 0.6742857142857143,
3001
+ "n_examples": 50,
3002
  "per_subtask": {
3003
  "MD": {
3004
  "accuracy": 1.0,
3005
+ "count": 100
3006
  },
3007
  "MB": {
3008
  "accuracy": 1.0,
3009
+ "count": 50
3010
  },
3011
  "UB": {
3012
+ "accuracy": 0.34,
3013
+ "count": 50
3014
  },
3015
  "UD": {
3016
  "accuracy": 0.46,
3017
+ "count": 150
3018
  }
3019
  }
3020
  },
3021
  "sub_M5": {
3022
+ "full_accuracy": 0.12,
3023
+ "digit_accuracy": 0.6314285714285715,
3024
+ "n_examples": 50,
3025
  "per_subtask": {
3026
  "MD": {
3027
  "accuracy": 1.0,
3028
+ "count": 50
3029
  },
3030
  "MB": {
3031
  "accuracy": 1.0,
3032
+ "count": 50
3033
  },
3034
  "UB": {
3035
+ "accuracy": 0.52,
3036
+ "count": 50
3037
  },
3038
  "UD": {
3039
+ "accuracy": 0.475,
3040
+ "count": 200
3041
  }
3042
  }
3043
  },
3044
  "sub_random": {
3045
+ "full_accuracy": 0.94,
3046
+ "digit_accuracy": 0.9914285714285714,
3047
  "n_examples": 200,
3048
  "per_subtask": {
3049
  "MD": {
3050
+ "accuracy": 0.9982456140350877,
3051
+ "count": 570
3052
  },
3053
  "MB": {
3054
+ "accuracy": 0.9927797833935018,
3055
+ "count": 277
3056
  },
3057
  "ME": {
3058
  "accuracy": 1.0,
3059
  "count": 53
3060
  },
3061
  "UB": {
3062
+ "accuracy": 0.9808917197452229,
3063
+ "count": 471
3064
  },
3065
  "UD": {
3066
  "accuracy": 1.0,
3067
+ "count": 29
3068
  }
3069
  }
3070
  },
3071
  "sub_B3": {
3072
+ "full_accuracy": 0.56,
3073
+ "digit_accuracy": 0.9228571428571428,
3074
+ "n_examples": 50,
3075
  "per_subtask": {
3076
  "MD": {
3077
  "accuracy": 1.0,
3078
+ "count": 150
3079
  },
3080
  "MB": {
3081
  "accuracy": 1.0,
3082
+ "count": 50
3083
  },
3084
  "UB": {
3085
+ "accuracy": 0.8118811881188119,
3086
+ "count": 101
3087
  },
3088
  "UD": {
3089
+ "accuracy": 0.8367346938775511,
3090
+ "count": 49
3091
  }
3092
  }
3093
  },
3094
  "sub_B4": {
3095
+ "full_accuracy": 0.36,
3096
+ "digit_accuracy": 0.8628571428571429,
3097
+ "n_examples": 50,
3098
  "per_subtask": {
3099
  "MD": {
3100
  "accuracy": 1.0,
3101
+ "count": 100
3102
  },
3103
  "MB": {
3104
  "accuracy": 1.0,
3105
+ "count": 50
3106
  },
3107
  "UB": {
3108
+ "accuracy": 0.7768595041322314,
3109
+ "count": 121
3110
  },
3111
  "UD": {
3112
+ "accuracy": 0.7341772151898734,
3113
+ "count": 79
3114
  }
3115
  }
3116
  },
3117
  "sub_B5": {
3118
+ "full_accuracy": 0.18,
3119
+ "digit_accuracy": 0.8314285714285714,
3120
+ "n_examples": 50,
3121
  "per_subtask": {
3122
  "MD": {
3123
  "accuracy": 1.0,
3124
+ "count": 50
3125
  },
3126
  "MB": {
3127
  "accuracy": 1.0,
3128
+ "count": 50
3129
  },
3130
  "UB": {
3131
+ "accuracy": 0.7631578947368421,
3132
+ "count": 152
3133
  },
3134
  "UD": {
3135
+ "accuracy": 0.7653061224489796,
3136
+ "count": 98
3137
  }
3138
  }
3139
  }
3140
  },
3141
  "summary": {
3142
+ "overall_accuracy": 0.678,
3143
+ "digit_accuracy": 0.9113333333333333,
3144
+ "total_examples": 1500,
3145
+ "n_splits": 24
3146
  }
3147
  }
3148
  }