{
  "config": {
    "model": "meta-llama/Llama-2-7b-chat-hf",
    "device": "cuda",
    "model_dtype": "fp16",
    "tasks": "gsm8k,commonsenseqa,strategyqa,arc_challenge,openbookqa,qasc,logiqa",
    "heldout_tasks": "gsm8k,logiqa",
    "layer": 10,
    "n_subspace": 64,
    "n_eval": 32,
    "use_forced_choice": true,
    "do_sample": false,
    "offline": true
  },
  "commands": {
    "gsm8k": [
      "/home/zs89/miniconda3/envs/flashsvd/bin/python",
      "/home/zs89/decodeshare/reasoning/disturb_CoT_shared_loto_reasoning.py",
      "--model",
      "meta-llama/Llama-2-7b-chat-hf",
      "--device",
      "cuda",
      "--model_dtype",
      "fp16",
      "--layer",
      "10",
      "--tasks",
      "gsm8k,commonsenseqa,strategyqa,arc_challenge,openbookqa,qasc,logiqa",
      "--mode",
      "loto",
      "--loto_eval_mode",
      "heldout",
      "--loto_only",
      "gsm8k",
      "--n_subspace",
      "64",
      "--n_eval",
      "32",
      "--pca_var",
      "0.95",
      "--tau",
      "0.001",
      "--m_shared",
      "all",
      "--calib_decode_max_new_tokens",
      "48",
      "--per_task_max_states",
      "2048",
      "--alpha_remove",
      "1.0",
      "--reasoning_tokens",
      "64",
      "--max_new_tokens",
      "96",
      "--temperature",
      "0.7",
      "--top_p",
      "0.9",
      "--top_k",
      "0",
      "--do_sample",
      "0",
      "--template_randomization",
      "1",
      "--template_seed",
      "1234",
      "--shuffle_choices",
      "1",
      "--add_answer_prefix",
      "1",
      "--answer_prefix",
      "\nFinal answer:",
      "--use_forced_choice",
      "1",
      "--fc_warmup_tokens",
      "0",
      "--fc_prefix_mode",
      "auto",
      "--fc_answer_prefix",
      "\nFinal answer:",
      "--batch_size",
      "2",
      "--max_prompt_len",
      "512",
      "--bootstrap_iters",
      "500",
      "--perm_iters",
      "1000",
      "--ci_alpha",
      "0.05",
      "--seed",
      "42",
      "--sample_seed",
      "12345",
      "--out_json",
      "/home/zs89/decodeshare/rebuttal/reasoning/raw/gsm8k_loto.json",
      "--out_md",
      "/home/zs89/decodeshare/rebuttal/reasoning/raw/gsm8k_loto.md"
    ],
    "logiqa": [
      "/home/zs89/miniconda3/envs/flashsvd/bin/python",
      "/home/zs89/decodeshare/reasoning/disturb_CoT_shared_loto_reasoning.py",
      "--model",
      "meta-llama/Llama-2-7b-chat-hf",
      "--device",
      "cuda",
      "--model_dtype",
      "fp16",
      "--layer",
      "10",
      "--tasks",
      "gsm8k,commonsenseqa,strategyqa,arc_challenge,openbookqa,qasc,logiqa",
      "--mode",
      "loto",
      "--loto_eval_mode",
      "heldout",
      "--loto_only",
      "logiqa",
      "--n_subspace",
      "64",
      "--n_eval",
      "32",
      "--pca_var",
      "0.95",
      "--tau",
      "0.001",
      "--m_shared",
      "all",
      "--calib_decode_max_new_tokens",
      "48",
      "--per_task_max_states",
      "2048",
      "--alpha_remove",
      "1.0",
      "--reasoning_tokens",
      "64",
      "--max_new_tokens",
      "96",
      "--temperature",
      "0.7",
      "--top_p",
      "0.9",
      "--top_k",
      "0",
      "--do_sample",
      "0",
      "--template_randomization",
      "1",
      "--template_seed",
      "1234",
      "--shuffle_choices",
      "1",
      "--add_answer_prefix",
      "1",
      "--answer_prefix",
      "\nFinal answer:",
      "--use_forced_choice",
      "1",
      "--fc_warmup_tokens",
      "0",
      "--fc_prefix_mode",
      "auto",
      "--fc_answer_prefix",
      "\nFinal answer:",
      "--batch_size",
      "2",
      "--max_prompt_len",
      "512",
      "--bootstrap_iters",
      "500",
      "--perm_iters",
      "1000",
      "--ci_alpha",
      "0.05",
      "--seed",
      "42",
      "--sample_seed",
      "12345",
      "--out_json",
      "/home/zs89/decodeshare/rebuttal/reasoning/raw/logiqa_loto.json",
      "--out_md",
      "/home/zs89/decodeshare/rebuttal/reasoning/raw/logiqa_loto.md"
    ]
  },
  "raw_files": {
    "gsm8k": {
      "json": "/home/zs89/decodeshare/rebuttal/reasoning/raw/gsm8k_loto.json",
      "md": "/home/zs89/decodeshare/rebuttal/reasoning/raw/gsm8k_loto.md"
    },
    "logiqa": {
      "json": "/home/zs89/decodeshare/rebuttal/reasoning/raw/logiqa_loto.json",
      "md": "/home/zs89/decodeshare/rebuttal/reasoning/raw/logiqa_loto.md"
    }
  },
  "per_task": [
    {
      "task": "gsm8k",
      "task_label": "Open-ended numeric reasoning",
      "n_eval": 32,
      "protocol": "generation",
      "k_eval": 77,
      "chance_acc": null,
      "baseline_near_floor": true,
      "baseline_acc": 0.0,
      "decode_shared_acc": 0.0,
      "prefill_shared_acc": 0.0,
      "random_acc": 0.0,
      "decode_minus_prefill": 0.0,
      "decode_minus_prefill_ci_low": 0.0,
      "decode_minus_prefill_ci_high": 0.0,
      "decode_minus_prefill_p": 1.0,
      "decode_minus_baseline": 0.0,
      "prefill_minus_baseline": 0.0,
      "random_minus_baseline": 0.0
    },
    {
      "task": "logiqa",
      "task_label": "Logical reasoning multiple choice",
      "n_eval": 32,
      "protocol": "forced_choice",
      "k_eval": 70,
      "chance_acc": 0.25,
      "baseline_near_floor": false,
      "baseline_acc": 0.3125,
      "decode_shared_acc": 0.15625,
      "prefill_shared_acc": 0.34375,
      "random_acc": 0.34375,
      "decode_minus_prefill": -0.1875,
      "decode_minus_prefill_ci_low": -0.3125,
      "decode_minus_prefill_ci_high": -0.0625,
      "decode_minus_prefill_p": 0.03596403596403597,
      "decode_minus_baseline": -0.15625,
      "prefill_minus_baseline": 0.03125,
      "random_minus_baseline": 0.03125
    }
  ],
  "aggregate": {
    "baseline_acc_mean": 0.15625,
    "decode_shared_acc_mean": 0.078125,
    "prefill_shared_acc_mean": 0.171875,
    "random_acc_mean": 0.171875,
    "decode_minus_baseline_mean": -0.078125,
    "prefill_minus_baseline_mean": 0.015625,
    "random_minus_baseline_mean": 0.015625,
    "decode_minus_prefill_mean": -0.09375,
    "informative_tasks": [
      "logiqa"
    ],
    "inconclusive_tasks": [
      "gsm8k"
    ]
  }
}