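"""Model-based evaluation of a paper-implementation repository.

Builds a judge prompt from the paper's JSON and the target repository's
source files, samples `generated_n` completions from an OpenAI model,
parses a 1-5 score and a critique list out of each JSON response, and
writes the averaged result to `eval_result_dir`. Supports a reference-free
mode and a reference-based mode that additionally embeds a gold repository
in the prompt.
"""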
from openai import OpenAI
import json
import os
import sys
import argparse
from utils import read_python_files, extract_planning, content_to_json, \
    num_tokens_from_messages, read_all_files, extract_json_from_string, get_now_str, print_log_cost

client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])


def api_call(request_json):
    """Send a chat-completion request and return the raw completion object."""
    completion = client.chat.completions.create(**request_json)
    return completion


def main(args):
    paper_name = args.paper_name
    pdf_json_path = args.pdf_json_path
    output_dir = args.output_dir
    target_repo_dir = args.target_repo_dir
    eval_result_dir = args.eval_result_dir
    gpt_version = args.gpt_version
    generated_n = args.generated_n
    data_dir = args.data_dir
    eval_type = args.eval_type
    is_papercoder = bool(args.papercoder)
    gold_repo_dir = args.gold_repo_dir

    # paper contents (cleaned JSON produced from the PDF)
    with open(pdf_json_path) as f:
        paper_json = json.load(f)

    # Collect the code to be judged.
    codes = ""
    if is_papercoder:
        # Only the Python files named in the planning stage's task list.
        target_files_dict = read_python_files(target_repo_dir)

        # configuration
        with open(f'{output_dir}/planning_config.yaml') as f:
            config_yaml = f.read()

        context_lst = extract_planning(f'{output_dir}/planning_trajectories.json')

        if os.path.exists(f'{output_dir}/task_list.json'):
            with open(f'{output_dir}/task_list.json') as f:
                task_list = json.load(f)
        else:
            task_list = content_to_json(context_lst[2])

        todo_file_lst = task_list['Task list']

        for todo_file in todo_file_lst:
            if todo_file.endswith(".yaml"):
                continue
            codes += f"```python\n## File name: {todo_file}\n{target_files_dict[todo_file]}\n```\n\n"
        codes += f"```yaml\n## File name: config.yaml\n{config_yaml}\n```\n\n"
    else:
        # Otherwise, include every source-like file in the target repository.
        target_files_dict = read_all_files(target_repo_dir, allowed_ext=[".py", ".yaml", ".yml", ".md", ".sh", ".bash"], is_print=False)
        for file_name, code in target_files_dict.items():
            codes += f"```## File name: {file_name}\n{code}\n```\n\n"

    # Fill the evaluation prompt template.
    with open(f"{data_dir}/prompts/{eval_type}.txt") as f:
        prompt = f.read()
    cur_prompt = prompt.replace('{{Paper}}', f"{paper_json}").replace('{{Code}}', codes)

    # reference-based: also embed the gold (author-released) repository
    if eval_type == "ref_based" and len(gold_repo_dir) > 0:
        all_files_dict = read_all_files(gold_repo_dir, allowed_ext=[".py", ".yaml", ".yml", ".md", ".sh", ".bash"], is_print=False)

        goldcodes = ""
        gold_cnt = 0
        if len(args.selected_file_path) > 0:
            # Restrict the gold code to an explicit list of file names.
            with open(args.selected_file_path) as f:
                selected_file_lst = [line.strip() for line in f]

            for all_file, all_file_code in all_files_dict.items():
                if all_file not in selected_file_lst:
                    continue
                goldcodes += f"```## File name: {all_file}\n{all_file_code}\n```\n\n"
                gold_cnt += 1
        else:
            for all_file, all_file_code in all_files_dict.items():
                goldcodes += f"```## File name: {all_file}\n{all_file_code}\n```\n\n"
                gold_cnt += 1

        cur_prompt = cur_prompt.replace('{{GoldCode}}', goldcodes)

    msg = [{"role": "system", "content": cur_prompt}]

    try:
        num_tokens = num_tokens_from_messages(msg)
    except Exception as e:
        print(f"[WARNING] An exception was raised while counting tokens for the target repository of {args.paper_name}.")
        print(e)
        print("-" * 40)
        num_tokens = 0
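
    # Skip evaluation when the assembled prompt exceeds a 128k-token budget,
    # the context-window size assumed for the judge models.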
    if num_tokens > 128000:
        print(f"[ERROR] {args.paper_name}: the prompt exceeds 128k tokens.")
        sys.exit(0)
| if "o3-mini" in gpt_version: | |
| if generated_n > 8: | |
| print(f"[WARNING] o3-mini does not support n > 8. Setting generated_n to 8.") | |
| generated_n = 8 | |
| request_json = { | |
| "model": gpt_version, | |
| "messages": msg, | |
| "reasoning_effort": "high", | |
| "n": generated_n | |
| } | |
| else: | |
| request_json = { | |
| "model": gpt_version, | |
| "messages": msg, | |
| "temperature": 1, | |
| "frequency_penalty": 0, | |
| "presence_penalty": 0, | |
| "stop": None, | |
| "n": generated_n # 10 | |
| } | |

    completion = api_call(request_json)
    completion_json = json.loads(completion.model_dump_json())

    score_key = "score"
    rationale_key = "critique_list"

    all_scores = []
    rationales = []
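
    # Each sampled completion should be a JSON object of the form
    # {"score": <int in 1-5>, "critique_list": <str or list>}. Responses that
    # fail to parse, even after extracting an embedded JSON block, are skipped.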
    for n in range(generated_n):
        choice = completion_json['choices'][n]
        output = choice['message']['content'].strip()

        try:
            output_json2 = json.loads(output)
            score = int(output_json2[score_key])
            if isinstance(output_json2[rationale_key], str):
                rationale = output_json2[rationale_key]
            else:
                rationale = json.dumps(output_json2[rationale_key])
        except Exception:
            # Fall back to extracting a JSON object embedded in surrounding text.
            try:
                output_json2 = json.loads(extract_json_from_string(output))
                score = int(output_json2[score_key])
                if isinstance(output_json2[rationale_key], str):
                    rationale = output_json2[rationale_key]
                else:
                    rationale = json.dumps(output_json2[rationale_key])
            except Exception as e2:  # parsing error
                print("[WARNING] Invalid response: parsing error")
                print(e2)
                print("-" * 40)
                continue

        # Discard out-of-range scores.
        if score < 1 or score > 5:
            print(f"[WARNING] Invalid response: score {score}. Score must be in the range of 1-5.")
            continue

        all_scores.append(int(score))
        rationales.append(rationale)

    avg_score = sum(all_scores) / len(all_scores)
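    # NOTE: assumes at least one sample parsed successfully; if every response
    # was invalid, the division above raises ZeroDivisionError.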

    output_json = {
        "paper_name": paper_name,
        "target_repo_dir": target_repo_dir,
        "eval_type": eval_type,
        "gold_repo_dir": gold_repo_dir,
        "generated_n": generated_n,
        "request_json": request_json,
        "completion_json": completion_json,
        "eval_result": {
            "score": avg_score,
            "valid_n": len(all_scores),
            "score_lst": all_scores,
            "rationale_lst": rationales,
        },
    }

    now_str = get_now_str()
    os.makedirs(eval_result_dir, exist_ok=True)
    with open(f"{eval_result_dir}/{paper_name}_eval_{eval_type}_{gpt_version}_{now_str}.json", 'w', encoding='utf-8') as f:
        json.dump(output_json, f)

    # ---------------
    print()
    print("=" * 40)
    print("Evaluation Summary")
    print(f"Paper name: {paper_name}")
    print(f"Evaluation type: {eval_type}")
    print(f"Target repo directory: {target_repo_dir}")
    print("Evaluation result:")
    print(f"\tScore: {avg_score:.4f}")
    print(f"\tValid: {output_json['eval_result']['valid_n']}/{generated_n}")
    print("=" * 40)
    print_log_cost(completion_json, gpt_version, f"[Evaluation] {paper_name} - {eval_type}", output_dir, 0)
    # ---------------


if __name__ == '__main__':
    argparser = argparse.ArgumentParser()
    argparser.add_argument('--paper_name', type=str)
    argparser.add_argument('--pdf_json_path', type=str)
    argparser.add_argument('--data_dir', type=str, default="../data")
    argparser.add_argument('--output_dir', type=str)
    argparser.add_argument('--target_repo_dir', type=str)
    argparser.add_argument('--gold_repo_dir', type=str, default="")
    argparser.add_argument('--eval_result_dir', type=str)
    argparser.add_argument('--eval_type', type=str, default="ref_free", choices=["ref_free", "ref_based"])
    argparser.add_argument('--generated_n', type=int, default=8)
    argparser.add_argument('--gpt_version', type=str, default="o3-mini")
    argparser.add_argument('--selected_file_path', type=str, default="")
    argparser.add_argument('--papercoder', action="store_true")

    args = argparser.parse_args()
    main(args)

# ref-free:
# python eval.py \
#     --paper_name Transformer \
#     --pdf_json_path ../examples/Transformer_cleaned.json \
#     --data_dir ../data \
#     --output_dir ../outputs/Transformer \
#     --target_repo_dir ../outputs/Transformer_repo \
#     --eval_result_dir ../results \
#     --eval_type ref_free \
#     --generated_n 8 \
#     --papercoder

# ref-based:
# python eval.py \
#     --paper_name Transformer \
#     --pdf_json_path ../examples/Transformer_cleaned.json \
#     --data_dir ../data \
#     --output_dir ../outputs/Transformer \
#     --target_repo_dir ../outputs/Transformer_repo \
#     --gold_repo_dir ../examples/Transformer_gold_repo \
#     --eval_result_dir ../results \
#     --eval_type ref_based \
#     --generated_n 8 \
#     --papercoder