decodeshare/artifacts/rebuttal/reasoning/quick_reasoning_summary.json
{
"config": {
"model": "meta-llama/Llama-2-7b-chat-hf",
"device": "cuda",
"model_dtype": "fp16",
"tasks": "gsm8k,commonsenseqa,strategyqa,arc_challenge,openbookqa,qasc,logiqa",
"heldout_tasks": "gsm8k,logiqa",
"layer": 10,
"n_subspace": 64,
"n_eval": 32,
"use_forced_choice": true,
"do_sample": false,
"offline": true
},
"commands": {
"gsm8k": [
"/home/zs89/miniconda3/envs/flashsvd/bin/python",
"/home/zs89/decodeshare/reasoning/disturb_CoT_shared_loto_reasoning.py",
"--model",
"meta-llama/Llama-2-7b-chat-hf",
"--device",
"cuda",
"--model_dtype",
"fp16",
"--layer",
"10",
"--tasks",
"gsm8k,commonsenseqa,strategyqa,arc_challenge,openbookqa,qasc,logiqa",
"--mode",
"loto",
"--loto_eval_mode",
"heldout",
"--loto_only",
"gsm8k",
"--n_subspace",
"64",
"--n_eval",
"32",
"--pca_var",
"0.95",
"--tau",
"0.001",
"--m_shared",
"all",
"--calib_decode_max_new_tokens",
"48",
"--per_task_max_states",
"2048",
"--alpha_remove",
"1.0",
"--reasoning_tokens",
"64",
"--max_new_tokens",
"96",
"--temperature",
"0.7",
"--top_p",
"0.9",
"--top_k",
"0",
"--do_sample",
"0",
"--template_randomization",
"1",
"--template_seed",
"1234",
"--shuffle_choices",
"1",
"--add_answer_prefix",
"1",
"--answer_prefix",
"\nFinal answer:",
"--use_forced_choice",
"1",
"--fc_warmup_tokens",
"0",
"--fc_prefix_mode",
"auto",
"--fc_answer_prefix",
"\nFinal answer:",
"--batch_size",
"2",
"--max_prompt_len",
"512",
"--bootstrap_iters",
"500",
"--perm_iters",
"1000",
"--ci_alpha",
"0.05",
"--seed",
"42",
"--sample_seed",
"12345",
"--out_json",
"/home/zs89/decodeshare/rebuttal/reasoning/raw/gsm8k_loto.json",
"--out_md",
"/home/zs89/decodeshare/rebuttal/reasoning/raw/gsm8k_loto.md"
],
"logiqa": [
"/home/zs89/miniconda3/envs/flashsvd/bin/python",
"/home/zs89/decodeshare/reasoning/disturb_CoT_shared_loto_reasoning.py",
"--model",
"meta-llama/Llama-2-7b-chat-hf",
"--device",
"cuda",
"--model_dtype",
"fp16",
"--layer",
"10",
"--tasks",
"gsm8k,commonsenseqa,strategyqa,arc_challenge,openbookqa,qasc,logiqa",
"--mode",
"loto",
"--loto_eval_mode",
"heldout",
"--loto_only",
"logiqa",
"--n_subspace",
"64",
"--n_eval",
"32",
"--pca_var",
"0.95",
"--tau",
"0.001",
"--m_shared",
"all",
"--calib_decode_max_new_tokens",
"48",
"--per_task_max_states",
"2048",
"--alpha_remove",
"1.0",
"--reasoning_tokens",
"64",
"--max_new_tokens",
"96",
"--temperature",
"0.7",
"--top_p",
"0.9",
"--top_k",
"0",
"--do_sample",
"0",
"--template_randomization",
"1",
"--template_seed",
"1234",
"--shuffle_choices",
"1",
"--add_answer_prefix",
"1",
"--answer_prefix",
"\nFinal answer:",
"--use_forced_choice",
"1",
"--fc_warmup_tokens",
"0",
"--fc_prefix_mode",
"auto",
"--fc_answer_prefix",
"\nFinal answer:",
"--batch_size",
"2",
"--max_prompt_len",
"512",
"--bootstrap_iters",
"500",
"--perm_iters",
"1000",
"--ci_alpha",
"0.05",
"--seed",
"42",
"--sample_seed",
"12345",
"--out_json",
"/home/zs89/decodeshare/rebuttal/reasoning/raw/logiqa_loto.json",
"--out_md",
"/home/zs89/decodeshare/rebuttal/reasoning/raw/logiqa_loto.md"
]
},
"raw_files": {
"gsm8k": {
"json": "/home/zs89/decodeshare/rebuttal/reasoning/raw/gsm8k_loto.json",
"md": "/home/zs89/decodeshare/rebuttal/reasoning/raw/gsm8k_loto.md"
},
"logiqa": {
"json": "/home/zs89/decodeshare/rebuttal/reasoning/raw/logiqa_loto.json",
"md": "/home/zs89/decodeshare/rebuttal/reasoning/raw/logiqa_loto.md"
}
},
"per_task": [
{
"task": "gsm8k",
"task_label": "Open-ended numeric reasoning",
"n_eval": 32,
"protocol": "generation",
"k_eval": 77,
"chance_acc": null,
"baseline_near_floor": true,
"baseline_acc": 0.0,
"decode_shared_acc": 0.0,
"prefill_shared_acc": 0.0,
"random_acc": 0.0,
"decode_minus_prefill": 0.0,
"decode_minus_prefill_ci_low": 0.0,
"decode_minus_prefill_ci_high": 0.0,
"decode_minus_prefill_p": 1.0,
"decode_minus_baseline": 0.0,
"prefill_minus_baseline": 0.0,
"random_minus_baseline": 0.0
},
{
"task": "logiqa",
"task_label": "Logical reasoning multiple choice",
"n_eval": 32,
"protocol": "forced_choice",
"k_eval": 70,
"chance_acc": 0.25,
"baseline_near_floor": false,
"baseline_acc": 0.3125,
"decode_shared_acc": 0.15625,
"prefill_shared_acc": 0.34375,
"random_acc": 0.34375,
"decode_minus_prefill": -0.1875,
"decode_minus_prefill_ci_low": -0.3125,
"decode_minus_prefill_ci_high": -0.0625,
"decode_minus_prefill_p": 0.03596403596403597,
"decode_minus_baseline": -0.15625,
"prefill_minus_baseline": 0.03125,
"random_minus_baseline": 0.03125
}
],
"aggregate": {
"baseline_acc_mean": 0.15625,
"decode_shared_acc_mean": 0.078125,
"prefill_shared_acc_mean": 0.171875,
"random_acc_mean": 0.171875,
"decode_minus_baseline_mean": -0.078125,
"prefill_minus_baseline_mean": 0.015625,
"random_minus_baseline_mean": 0.015625,
"decode_minus_prefill_mean": -0.09375,
"informative_tasks": [
"logiqa"
],
"inconclusive_tasks": [
"gsm8k"
]
}
}
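
The aggregate block is the unweighted mean of the two per_task entries (for example, decode_minus_prefill_mean = (0.0 + (-0.1875)) / 2 = -0.09375). A minimal sketch, assuming the JSON above is saved locally as quick_reasoning_summary.json, that recomputes each *_mean field and checks it against the reported value:

import json

# Load the summary artifact (the local path is an assumption; adjust as needed).
with open("quick_reasoning_summary.json") as f:
    summary = json.load(f)

per_task = summary["per_task"]
fields = [
    "baseline_acc",
    "decode_shared_acc",
    "prefill_shared_acc",
    "random_acc",
    "decode_minus_baseline",
    "prefill_minus_baseline",
    "random_minus_baseline",
    "decode_minus_prefill",
]

for field in fields:
    # Unweighted mean over the tasks present in per_task (here gsm8k and logiqa).
    recomputed = sum(t[field] for t in per_task) / len(per_task)
    reported = summary["aggregate"][field + "_mean"]
    print(f"{field}_mean: recomputed={recomputed:.6f} reported={reported:.6f}")

Because gsm8k has baseline_near_floor set and all of its accuracies are 0.0 (it is listed under inconclusive_tasks), the aggregate deltas are driven entirely by the logiqa entry.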