{ "config": { "model": "meta-llama/Llama-2-7b-chat-hf", "device": "cuda", "model_dtype": "fp16", "tasks": "gsm8k,commonsenseqa,strategyqa,arc_challenge,openbookqa,qasc,logiqa", "heldout_tasks": "gsm8k,logiqa", "layer": 10, "n_subspace": 64, "n_eval": 32, "use_forced_choice": true, "do_sample": false, "offline": true }, "commands": { "gsm8k": [ "/home/zs89/miniconda3/envs/flashsvd/bin/python", "/home/zs89/decodeshare/reasoning/disturb_CoT_shared_loto_reasoning.py", "--model", "meta-llama/Llama-2-7b-chat-hf", "--device", "cuda", "--model_dtype", "fp16", "--layer", "10", "--tasks", "gsm8k,commonsenseqa,strategyqa,arc_challenge,openbookqa,qasc,logiqa", "--mode", "loto", "--loto_eval_mode", "heldout", "--loto_only", "gsm8k", "--n_subspace", "64", "--n_eval", "32", "--pca_var", "0.95", "--tau", "0.001", "--m_shared", "all", "--calib_decode_max_new_tokens", "48", "--per_task_max_states", "2048", "--alpha_remove", "1.0", "--reasoning_tokens", "64", "--max_new_tokens", "96", "--temperature", "0.7", "--top_p", "0.9", "--top_k", "0", "--do_sample", "0", "--template_randomization", "1", "--template_seed", "1234", "--shuffle_choices", "1", "--add_answer_prefix", "1", "--answer_prefix", "\nFinal answer:", "--use_forced_choice", "1", "--fc_warmup_tokens", "0", "--fc_prefix_mode", "auto", "--fc_answer_prefix", "\nFinal answer:", "--batch_size", "2", "--max_prompt_len", "512", "--bootstrap_iters", "500", "--perm_iters", "1000", "--ci_alpha", "0.05", "--seed", "42", "--sample_seed", "12345", "--out_json", "/home/zs89/decodeshare/rebuttal/reasoning/raw/gsm8k_loto.json", "--out_md", "/home/zs89/decodeshare/rebuttal/reasoning/raw/gsm8k_loto.md" ], "logiqa": [ "/home/zs89/miniconda3/envs/flashsvd/bin/python", "/home/zs89/decodeshare/reasoning/disturb_CoT_shared_loto_reasoning.py", "--model", "meta-llama/Llama-2-7b-chat-hf", "--device", "cuda", "--model_dtype", "fp16", "--layer", "10", "--tasks", "gsm8k,commonsenseqa,strategyqa,arc_challenge,openbookqa,qasc,logiqa", "--mode", "loto", "--loto_eval_mode", "heldout", "--loto_only", "logiqa", "--n_subspace", "64", "--n_eval", "32", "--pca_var", "0.95", "--tau", "0.001", "--m_shared", "all", "--calib_decode_max_new_tokens", "48", "--per_task_max_states", "2048", "--alpha_remove", "1.0", "--reasoning_tokens", "64", "--max_new_tokens", "96", "--temperature", "0.7", "--top_p", "0.9", "--top_k", "0", "--do_sample", "0", "--template_randomization", "1", "--template_seed", "1234", "--shuffle_choices", "1", "--add_answer_prefix", "1", "--answer_prefix", "\nFinal answer:", "--use_forced_choice", "1", "--fc_warmup_tokens", "0", "--fc_prefix_mode", "auto", "--fc_answer_prefix", "\nFinal answer:", "--batch_size", "2", "--max_prompt_len", "512", "--bootstrap_iters", "500", "--perm_iters", "1000", "--ci_alpha", "0.05", "--seed", "42", "--sample_seed", "12345", "--out_json", "/home/zs89/decodeshare/rebuttal/reasoning/raw/logiqa_loto.json", "--out_md", "/home/zs89/decodeshare/rebuttal/reasoning/raw/logiqa_loto.md" ] }, "raw_files": { "gsm8k": { "json": "/home/zs89/decodeshare/rebuttal/reasoning/raw/gsm8k_loto.json", "md": "/home/zs89/decodeshare/rebuttal/reasoning/raw/gsm8k_loto.md" }, "logiqa": { "json": "/home/zs89/decodeshare/rebuttal/reasoning/raw/logiqa_loto.json", "md": "/home/zs89/decodeshare/rebuttal/reasoning/raw/logiqa_loto.md" } }, "per_task": [ { "task": "gsm8k", "task_label": "Open-ended numeric reasoning", "n_eval": 32, "protocol": "generation", "k_eval": 77, "chance_acc": null, "baseline_near_floor": true, "baseline_acc": 0.0, "decode_shared_acc": 0.0, "prefill_shared_acc": 0.0, "random_acc": 0.0, "decode_minus_prefill": 0.0, "decode_minus_prefill_ci_low": 0.0, "decode_minus_prefill_ci_high": 0.0, "decode_minus_prefill_p": 1.0, "decode_minus_baseline": 0.0, "prefill_minus_baseline": 0.0, "random_minus_baseline": 0.0 }, { "task": "logiqa", "task_label": "Logical reasoning multiple choice", "n_eval": 32, "protocol": "forced_choice", "k_eval": 70, "chance_acc": 0.25, "baseline_near_floor": false, "baseline_acc": 0.3125, "decode_shared_acc": 0.15625, "prefill_shared_acc": 0.34375, "random_acc": 0.34375, "decode_minus_prefill": -0.1875, "decode_minus_prefill_ci_low": -0.3125, "decode_minus_prefill_ci_high": -0.0625, "decode_minus_prefill_p": 0.03596403596403597, "decode_minus_baseline": -0.15625, "prefill_minus_baseline": 0.03125, "random_minus_baseline": 0.03125 } ], "aggregate": { "baseline_acc_mean": 0.15625, "decode_shared_acc_mean": 0.078125, "prefill_shared_acc_mean": 0.171875, "random_acc_mean": 0.171875, "decode_minus_baseline_mean": -0.078125, "prefill_minus_baseline_mean": 0.015625, "random_minus_baseline_mean": 0.015625, "decode_minus_prefill_mean": -0.09375, "informative_tasks": [ "logiqa" ], "inconclusive_tasks": [ "gsm8k" ] } }