| import json | |
| import random | |
def get_random_indices(array_length, num_indices):
    """Return distinct random indices drawn from ``range(array_length)``.

    Args:
        array_length: Size of the index population; indices are drawn from
            ``0 .. array_length - 1``.
        num_indices: How many indices are requested.

    Returns:
        A list of unique indices in random order. When more indices are
        requested than exist, every index is returned (shuffled).
    """
    # Cap the request at the population size so random.sample never gets
    # asked for more items than are available.
    sample_size = min(num_indices, array_length)
    return random.sample(range(array_length), sample_size)
def find_first_non_ac(array):
    """Return the first element of *array* that is not ``"AC"``.

    Args:
        array: Iterable of status values, scanned in order.

    Returns:
        The first element different from ``"AC"``, or ``"AC"`` when every
        element equals ``"AC"`` (including when *array* is empty).
    """
    return next((status for status in array if status != "AC"), "AC")
| import random | |
def get_e_r_indices(code, type_of_test="random"):
    """Return the positions in ``code["types"]`` that equal *type_of_test*.

    Args:
        code: Mapping that may carry a ``"types"`` sequence; a missing key is
            treated as an empty sequence.
        type_of_test: The type label to look for (defaults to ``"random"``).

    Returns:
        A list of zero-based positions, in ascending order, whose entry in
        ``code["types"]`` equals *type_of_test*.
    """
    matching_positions = []
    for position, test_type in enumerate(code.get("types", [])):
        if test_type == type_of_test:
            matching_positions.append(position)
    return matching_positions
# Algorithm labels to evaluate; each one is substituted into the result-file
# name and into the "Algorithm & Model" column of the report.
test_als = ["lcb"]

# Model names to evaluate; one result file is looked up per (model, algorithm).
model_name_list = ["claude4", "gpt-4o"]
| import os | |
# Status buckets in the same order as the Markdown table columns.
_STATUS_KEYS = ("AC", "CE", "WA", "RE", "TLE", "MLE", "EXE")
# Only the "rank5" bucket is evaluated (the original loop ran over range(4, 5)).
_RANK_KEY = "rank5"


def _tally_codes(codes, test_indices, status_counts, hack_counts, hacked_ids):
    """Bucket each code by its first non-AC status on the selected tests.

    Args:
        codes: List of code records; each has a ``'status'`` list and a
            ``'code_id'``.
        test_indices: Positions of the tests of one type inside ``'status'``.
        status_counts: Dict of status -> count, updated in place.
        hack_counts: Dict with a ``"hacked"`` counter, updated in place.
        hacked_ids: List that receives the ``'code_id'`` of every code whose
            verdict is not ``"AC"``.
    """
    for code in codes:
        if not test_indices:
            # No test of this type exists, so the code counts as AC.
            status_counts["AC"] += 1
            continue
        statuses = code['status']
        if max(test_indices) < len(statuses):
            selected = [statuses[idx] for idx in test_indices]
        else:
            # An index falls outside this code's status list: fall back to
            # judging the code on all of its statuses.
            selected = statuses
        verdict = find_first_non_ac(selected)
        status_counts[verdict] += 1
        if verdict != "AC":
            hack_counts["hacked"] += 1
            hacked_ids.append(code['code_id'])


def _render_rows(label, rank_results, success_counts):
    """Render one Markdown table row per rank bucket and return them joined."""
    rows = []
    for rank_key, status_counts in rank_results.items():
        total = success_counts[rank_key]["total"]
        hacked = success_counts[rank_key]["hacked"]
        hack_rate = round((hacked / total * 100) if total > 0 else 0, 2)
        # Each cell shows the share of all codes and the raw count.
        cells = []
        for count in status_counts.values():
            percentage = (count / total * 100) if total > 0 else 0
            cells.append(f"{percentage:.2f}% ({count})")
        rows.append(f"| {label} | {rank_key} | " + " | ".join(cells) + f" | {hack_rate}% |\n")
    return "".join(rows)


for model_name in model_name_list:
    for test_al in test_als:
        # Per-problem lists of code ids hacked by edge / random tests.
        edge_hacking_list = {}
        random_hacking_list = {}
        result_file = f"/home/luoxianzhen/yang/eval_wrong_code/ALLmode_results/tcb-{model_name}-{test_al}-type-{test_al}-rank5-all.json"
        if not os.path.exists(result_file):
            print(f"{model_name}-{test_al} NOT EXSIT!")
            continue
        # Use a context manager so the file handle is closed deterministically.
        with open(result_file, "r", encoding="utf-8") as result_fh:
            results = json.load(result_fh)

        rank_result_random = {_RANK_KEY: dict.fromkeys(_STATUS_KEYS, 0)}
        success_k_random = {_RANK_KEY: {"total": 0, "hacked": 0}}
        rank_result_edge = {_RANK_KEY: dict.fromkeys(_STATUS_KEYS, 0)}
        success_k_edge = {_RANK_KEY: {"total": 0, "hacked": 0}}

        for k, v in results.items():
            codes = v['codes']
            rank = len(codes)
            random_hacking_list[k] = []
            edge_hacking_list[k] = []
            success_k_random[_RANK_KEY]["total"] += rank
            success_k_edge[_RANK_KEY]["total"] += rank

            # default=0 keeps an empty 'codes' list from raising ValueError.
            array_length = max((len(code['status']) for code in codes), default=0)
            if array_length == 0:
                # No statuses were recorded: every code counts as AC. The AC
                # bucket lives in rank_result_*; success_k_* only holds
                # "total"/"hacked", so incrementing 'AC' there raised KeyError.
                rank_result_random[_RANK_KEY]["AC"] += rank
                rank_result_edge[_RANK_KEY]["AC"] += rank
                continue

            # The test-type layout is read from the first code and applied to
            # every code of the same problem.
            tests_index_random = get_e_r_indices(codes[0], "random")
            tests_index_edge = get_e_r_indices(codes[0], "edge")

            _tally_codes(codes, tests_index_random, rank_result_random[_RANK_KEY],
                         success_k_random[_RANK_KEY], random_hacking_list[k])
            _tally_codes(codes, tests_index_edge, rank_result_edge[_RANK_KEY],
                         success_k_edge[_RANK_KEY], edge_hacking_list[k])

        # Build the Markdown table: random-test rows first, then edge-test rows.
        algorithm_model = f"{test_al}({model_name})"
        markdown_table = "| Algorithm & Model | Rank | AC | CE | WA | RE | TLE | MLE | EXE | Hack Rate |\n"
        markdown_table += "|-------------------|------|----|----|----|----|-----|-----|-----|-----------|\n"
        markdown_table += _render_rows(algorithm_model, rank_result_random, success_k_random)
        markdown_table += _render_rows(algorithm_model, rank_result_edge, success_k_edge)

        # Save the table to a .md file.
        with open(f"/home/luoxianzhen/yang/eval_wrong_code/edge_random/rank_result-{model_name}-{test_al}-type.md", "w") as file:
            file.write(markdown_table)
        print("Markdown 文件已生成: rank_result.md")

        # Count, per problem, the code ids hacked by both test types, by
        # random tests only, and by edge tests only.
        both_count = 0
        random_count = 0
        edge_count = 0
        for k, random_hacked in random_hacking_list.items():
            random_ids = set(random_hacked)
            edge_ids = set(edge_hacking_list[k])
            both_count += len(random_ids & edge_ids)
            random_count += len(random_ids - edge_ids)
            edge_count += len(edge_ids - random_ids)
        print(f"{model_name}: Random-Only Hacked: {random_count} & Edge-Only Hacked {edge_count} & Both Hacked {both_count}")
Xet Storage Details
- Size:
- 7.13 kB
- Xet hash:
- e7939cb323fa612e41d9dd37b832a52af13610248e2e5d1d85e32cd4029fa242
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.