diff --git "a/evaluation2.log" "b/evaluation2.log" new file mode 100644--- /dev/null +++ "b/evaluation2.log" @@ -0,0 +1,1003 @@ +INFO 03-30 20:49:54 [__init__.py:239] Automatically detected platform cuda. +2026-03-30:20:50:21 INFO [__main__:429] Passed `--trust_remote_code`, setting environment variable `HF_DATASETS_TRUST_REMOTE_CODE=true` +2026-03-30:20:50:21 INFO [__main__:446] Selected Tasks: ['arc_challenge', 'arc_easy', 'bbh', 'hellaswag', 'humaneval', 'mmlu', 'piqa', 'social_iqa', 'winogrande'] +2026-03-30:20:50:21 INFO [evaluator:202] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234 +2026-03-30:20:50:21 WARNING [evaluator:214] generation_kwargs: {'max_length': 4096} specified through cli, these settings will update set parameters in yaml tasks. Ensure 'do_sample=True' for non-greedy decoding! +2026-03-30:20:50:21 INFO [evaluator:240] Initializing hf model, with arguments: {'pretrained': 'results/hf_ckpts/blockffn_02b_mul1002_withmean_d64_s128_lr93e4_b128/', 'dtype': + 'bfloat16', 'trust_remote_code': True} +2026-03-30:20:50:21 WARNING [accelerate.utils.other:512] Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher. +2026-03-30:20:50:21 INFO [models.huggingface:147] Using device 'cuda:0' +2026-03-30:20:50:21 INFO [models.huggingface:535] Model type cannot be determined. Using default model type 'causal' +2026-03-30:20:50:22 INFO [models.huggingface:414] Model parallel was set to False, max memory was not set, and device map was set to {'': 'cuda:0'} +/home/test1267/test-6/miniconda3/envs/lmeval/lib/python3.10/site-packages/datasets/load.py:1298: FutureWarning: The repository for social_i_qa contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/social_i_qa +You can avoid this message in future by passing the argument `trust_remote_code=True`. +Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`. + warnings.warn( +2026-03-30:20:56:48 INFO [evaluator:305] humaneval: Using gen_kwargs: {'until': ['\nclass', '\ndef', '\n#', '\nif', '\nprint'], 'max_length': 4096, 'do_sample': False} +2026-03-30:20:56:48 INFO [evaluator:305] bbh_cot_fewshot_boolean_expressions: Using gen_kwargs: {'max_new_tokens': 1024, 'until': ['', 'Q', '\n\n'], 'do_sample': False, 'temperature': 0.0, 'max_length': 4096} +2026-03-30:20:56:48 INFO [evaluator:305] bbh_cot_fewshot_causal_judgement: Using gen_kwargs: {'max_new_tokens': 1024, 'until': ['', 'Q', '\n\n'], 'do_sample': False, 'temperature': 0.0, 'max_length': 4096} +2026-03-30:20:56:48 INFO [evaluator:305] bbh_cot_fewshot_date_understanding: Using gen_kwargs: {'max_new_tokens': 1024, 'until': ['', 'Q', '\n\n'], 'do_sample': False, 'temperature': 0.0, 'max_length': 4096} +2026-03-30:20:56:48 INFO [evaluator:305] bbh_cot_fewshot_disambiguation_qa: Using gen_kwargs: {'max_new_tokens': 1024, 'until': ['', 'Q', '\n\n'], 'do_sample': False, 'temperature': 0.0, 'max_length': 4096} +2026-03-30:20:56:48 INFO [evaluator:305] bbh_cot_fewshot_dyck_languages: Using gen_kwargs: {'max_new_tokens': 1024, 'until': ['', 'Q', '\n\n'], 'do_sample': False, 'temperature': 0.0, 'max_length': 4096} +2026-03-30:20:56:48 INFO [evaluator:305] bbh_cot_fewshot_formal_fallacies: Using gen_kwargs: {'max_new_tokens': 1024, 'until': ['', 'Q', '\n\n'], 'do_sample': False, 'temperature': 0.0, 'max_length': 4096} +2026-03-30:20:56:48 INFO [evaluator:305] bbh_cot_fewshot_geometric_shapes: Using gen_kwargs: {'max_new_tokens': 1024, 'until': ['', 'Q', '\n\n'], 'do_sample': False, 'temperature': 0.0, 'max_length': 4096} +2026-03-30:20:56:48 INFO [evaluator:305] bbh_cot_fewshot_hyperbaton: Using gen_kwargs: {'max_new_tokens': 1024, 'until': ['', 'Q', '\n\n'], 'do_sample': False, 'temperature': 0.0, 'max_length': 4096} +2026-03-30:20:56:48 INFO [evaluator:305] bbh_cot_fewshot_logical_deduction_five_objects: Using gen_kwargs: {'max_new_tokens': 1024, 'until': ['', 'Q', '\n\n'], 'do_sample': False, 'temperature': 0.0, 'max_length': 4096} +2026-03-30:20:56:48 INFO [evaluator:305] bbh_cot_fewshot_logical_deduction_seven_objects: Using gen_kwargs: {'max_new_tokens': 1024, 'until': ['', 'Q', '\n\n'], 'do_sample': False, 'temperature': 0.0, 'max_length': 4096} +2026-03-30:20:56:48 INFO [evaluator:305] bbh_cot_fewshot_logical_deduction_three_objects: Using gen_kwargs: {'max_new_tokens': 1024, 'until': ['', 'Q', '\n\n'], 'do_sample': False, 'temperature': 0.0, 'max_length': 4096} +2026-03-30:20:56:48 INFO [evaluator:305] bbh_cot_fewshot_movie_recommendation: Using gen_kwargs: {'max_new_tokens': 1024, 'until': ['', 'Q', '\n\n'], 'do_sample': False, 'temperature': 0.0, 'max_length': 4096} +2026-03-30:20:56:48 INFO [evaluator:305] bbh_cot_fewshot_multistep_arithmetic_two: Using gen_kwargs: {'max_new_tokens': 1024, 'until': ['', 'Q', '\n\n'], 'do_sample': False, 'temperature': 0.0, 'max_length': 4096} +2026-03-30:20:56:48 INFO [evaluator:305] bbh_cot_fewshot_navigate: Using gen_kwargs: {'max_new_tokens': 1024, 'until': ['', 'Q', '\n\n'], 'do_sample': False, 'temperature': 0.0, 'max_length': 4096} +2026-03-30:20:56:48 INFO [evaluator:305] bbh_cot_fewshot_object_counting: Using gen_kwargs: {'max_new_tokens': 1024, 'until': ['', 'Q', '\n\n'], 'do_sample': False, 'temperature': 0.0, 'max_length': 4096} +2026-03-30:20:56:48 INFO [evaluator:305] bbh_cot_fewshot_penguins_in_a_table: Using gen_kwargs: {'max_new_tokens': 1024, 'until': ['', 'Q', '\n\n'], 'do_sample': False, 'temperature': 0.0, 'max_length': 4096} +2026-03-30:20:56:48 INFO [evaluator:305] bbh_cot_fewshot_reasoning_about_colored_objects: Using gen_kwargs: {'max_new_tokens': 1024, 'until': ['', 'Q', '\n\n'], 'do_sample': False, 'temperature': 0.0, 'max_length': 4096} +2026-03-30:20:56:48 INFO [evaluator:305] bbh_cot_fewshot_ruin_names: Using gen_kwargs: {'max_new_tokens': 1024, 'until': ['', 'Q', '\n\n'], 'do_sample': False, 'temperature': 0.0, 'max_length': 4096} +2026-03-30:20:56:48 INFO [evaluator:305] bbh_cot_fewshot_salient_translation_error_detection: Using gen_kwargs: {'max_new_tokens': 1024, 'until': ['', 'Q', '\n\n'], 'do_sample': False, 'temperature': 0.0, 'max_length': 4096} +2026-03-30:20:56:48 INFO [evaluator:305] bbh_cot_fewshot_snarks: Using gen_kwargs: {'max_new_tokens': 1024, 'until': ['', 'Q', '\n\n'], 'do_sample': False, 'temperature': 0.0, 'max_length': 4096} +2026-03-30:20:56:48 INFO [evaluator:305] bbh_cot_fewshot_sports_understanding: Using gen_kwargs: {'max_new_tokens': 1024, 'until': ['', 'Q', '\n\n'], 'do_sample': False, 'temperature': 0.0, 'max_length': 4096} +2026-03-30:20:56:48 INFO [evaluator:305] bbh_cot_fewshot_temporal_sequences: Using gen_kwargs: {'max_new_tokens': 1024, 'until': ['', 'Q', '\n\n'], 'do_sample': False, 'temperature': 0.0, 'max_length': 4096} +2026-03-30:20:56:48 INFO [evaluator:305] bbh_cot_fewshot_tracking_shuffled_objects_five_objects: Using gen_kwargs: {'max_new_tokens': 1024, 'until': ['', 'Q', '\n\n'], 'do_sample': False, 'temperature': 0.0, 'max_length': 4096} +2026-03-30:20:56:48 INFO [evaluator:305] bbh_cot_fewshot_tracking_shuffled_objects_seven_objects: Using gen_kwargs: {'max_new_tokens': 1024, 'until': ['', 'Q', '\n\n'], 'do_sample': False, 'temperature': 0.0, 'max_length': 4096} +2026-03-30:20:56:48 INFO [evaluator:305] bbh_cot_fewshot_tracking_shuffled_objects_three_objects: Using gen_kwargs: {'max_new_tokens': 1024, 'until': ['', 'Q', '\n\n'], 'do_sample': False, 'temperature': 0.0, 'max_length': 4096} +2026-03-30:20:56:48 INFO [evaluator:305] bbh_cot_fewshot_web_of_lies: Using gen_kwargs: {'max_new_tokens': 1024, 'until': ['', 'Q', '\n\n'], 'do_sample': False, 'temperature': 0.0, 'max_length': 4096} +2026-03-30:20:56:48 INFO [evaluator:305] bbh_cot_fewshot_word_sorting: Using gen_kwargs: {'max_new_tokens': 1024, 'until': ['', 'Q', '\n\n'], 'do_sample': False, 'temperature': 0.0, 'max_length': 4096} +2026-03-30:20:56:48 INFO [api.task:434] Building contexts for winogrande on rank 0... + 0%| | 0/1267 [00:00 + sys.exit(cli_evaluate()) + File "/home/test/test06/lm_eval/lm_eval/__main__.py", line 455, in cli_evaluate + results = evaluator.simple_evaluate( + File "/home/test/test06/lm_eval/lm_eval/utils.py", line 456, in _wrapper + return fn(*args, **kwargs) + File "/home/test/test06/lm_eval/lm_eval/evaluator.py", line 357, in simple_evaluate + results = evaluate( + File "/home/test/test06/lm_eval/lm_eval/utils.py", line 456, in _wrapper + return fn(*args, **kwargs) + File "/home/test/test06/lm_eval/lm_eval/evaluator.py", line 585, in evaluate + resps = getattr(lm, reqtype)(cloned_reqs) + File "/home/test/test06/lm_eval/lm_eval/models/huggingface.py", line 1434, in generate_until + cont = self._model_generate( + File "/home/test/test06/lm_eval/lm_eval/models/huggingface.py", line 983, in _model_generate + return self.model.generate( + File "/home/test1267/test-6/miniconda3/envs/lmeval/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context + return func(*args, **kwargs) + File "/home/test1267/test-6/miniconda3/envs/lmeval/lib/python3.10/site-packages/transformers/generation/utils.py", line 2629, in generate + result = self._sample( + File "/home/test1267/test-6/miniconda3/envs/lmeval/lib/python3.10/site-packages/transformers/generation/utils.py", line 3613, in _sample + outputs = model_forward(**model_inputs, return_dict=True) + File "/home/test1267/test-6/miniconda3/envs/lmeval/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/test1267/test-6/miniconda3/envs/lmeval/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl + return forward_call(*args, **kwargs) + File "/home/test1267/test-6/miniconda3/envs/lmeval/lib/python3.10/site-packages/transformers/utils/generic.py", line 959, in wrapper + output = func(self, *args, **kwargs) + File "/home/test/test06/.cache/huggingface/modules/transformers_modules/modeling_blockffn.py", line 979, in forward + File "/home/test1267/test-6/miniconda3/envs/lmeval/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/test1267/test-6/miniconda3/envs/lmeval/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl + return forward_call(*args, **kwargs) + File "/home/test1267/test-6/miniconda3/envs/lmeval/lib/python3.10/site-packages/transformers/utils/generic.py", line 1083, in wrapper + outputs = func(self, *args, **kwargs) + File "/home/test/test06/.cache/huggingface/modules/transformers_modules/modeling_blockffn.py", line 925, in forward + + File "/home/test1267/test-6/miniconda3/envs/lmeval/lib/python3.10/site-packages/transformers/modeling_layers.py", line 94, in __call__ + return super().__call__(*args, **kwargs) + File "/home/test1267/test-6/miniconda3/envs/lmeval/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/test1267/test-6/miniconda3/envs/lmeval/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl + return forward_call(*args, **kwargs) + File "/home/test/test06/.cache/huggingface/modules/transformers_modules/modeling_blockffn.py", line 831, in forward + use_cache: Optional[bool] = False, + File "/home/test1267/test-6/miniconda3/envs/lmeval/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/test1267/test-6/miniconda3/envs/lmeval/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl + return forward_call(*args, **kwargs) + File "/home/test/test06/.cache/huggingface/modules/transformers_modules/modeling_blockffn.py", line 256, in forward + def apply_expert_bias(self, router_scores: torch.Tensor) -> torch.Tensor: + File "/home/test1267/test-6/miniconda3/envs/lmeval/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/test1267/test-6/miniconda3/envs/lmeval/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl + return forward_call(*args, **kwargs) + File "/home/test1267/test-6/miniconda3/envs/lmeval/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 574, in _fn + return fn(*args, **kwargs) + File "/home/test/test06/.cache/huggingface/modules/transformers_modules/modeling_blockffn.py", line 144, in forward + self.weight = init_var + File "/home/test1267/test-6/miniconda3/envs/lmeval/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 745, in _fn + return fn(*args, **kwargs) + File "/home/test1267/test-6/miniconda3/envs/lmeval/lib/python3.10/site-packages/torch/_functorch/aot_autograd.py", line 1184, in forward + return compiled_fn(full_args) + File "/home/test1267/test-6/miniconda3/envs/lmeval/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 426, in runtime_wrapper + for t, o in zip(ret_outs, runtime_metadata.output_info): + File "/home/test1267/test-6/miniconda3/envs/lmeval/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/utils.py", line 52, in strict_zip + return original_zip(*iterables, **kwargs) +KeyboardInterrupt +Exception ignored in atexit callback: +Traceback (most recent call last): + File "/home/test1267/test-6/miniconda3/envs/lmeval/lib/python3.10/site-packages/ray/_private/client_mode_hook.py", line 104, in wrapper + return func(*args, **kwargs) + File "/home/test1267/test-6/miniconda3/envs/lmeval/lib/python3.10/site-packages/ray/_private/worker.py", line 1989, in shutdown + from ray.dag.compiled_dag_node import _shutdown_all_compiled_dags + File "/home/test1267/test-6/miniconda3/envs/lmeval/lib/python3.10/site-packages/ray/dag/__init__.py", line 1, in + from ray.dag.dag_node import DAGNode + File "/home/test1267/test-6/miniconda3/envs/lmeval/lib/python3.10/site-packages/ray/dag/dag_node.py", line 2, in + from ray.experimental.channel.auto_transport_type import AutoTransportType + File "/home/test1267/test-6/miniconda3/envs/lmeval/lib/python3.10/site-packages/ray/experimental/channel/__init__.py", line 1, in + from ray.experimental.channel.cached_channel import CachedChannel + File "/home/test1267/test-6/miniconda3/envs/lmeval/lib/python3.10/site-packages/ray/experimental/channel/cached_channel.py", line 4, in + File "/home/test1267/test-6/miniconda3/envs/lmeval/lib/python3.10/site-packages/ray/experimental/channel/common.py", line 21, in + from ray.experimental.channel.communicator import Communicator + File "", line 1027, in _find_and_load + File "", line 1006, in _find_and_load_unlocked + File "", line 688, in _load_unlocked + File "", line 879, in exec_module + File "", line 975, in get_code + File "", line 1073, in get_data +KeyboardInterrupt: +Exception ignored in atexit callback: > +Traceback (most recent call last): + File "/home/test1267/test-6/miniconda3/envs/lmeval/lib/python3.10/weakref.py", line 667, in _exitfunc + f() + File "/home/test1267/test-6/miniconda3/envs/lmeval/lib/python3.10/weakref.py", line 591, in __call__ + return info.func(*info.args, **(info.kwargs or {})) + File "/home/test1267/test-6/miniconda3/envs/lmeval/lib/python3.10/site-packages/urllib3/connectionpool.py", line 1176, in _close_pool_connections + conn.close() + File "/home/test1267/test-6/miniconda3/envs/lmeval/lib/python3.10/site-packages/urllib3/connection.py", line 371, in close + super().close() + File "/home/test1267/test-6/miniconda3/envs/lmeval/lib/python3.10/http/client.py", line 961, in close + sock.close() # close it manually... there may be other refs + File "/home/test1267/test-6/miniconda3/envs/lmeval/lib/python3.10/socket.py", line 502, in close + self._real_close() + File "/home/test1267/test-6/miniconda3/envs/lmeval/lib/python3.10/ssl.py", line 1366, in _real_close + super()._real_close() + File "/home/test1267/test-6/miniconda3/envs/lmeval/lib/python3.10/socket.py", line 496, in _real_close + _ss.close(self) +KeyboardInterrupt: