| from dataset_all import get_ours | |
| from response import TurboResponser | |
| import json | |
def write_json_to_file(data, filepath):
    """Serialize *data* as pretty-printed UTF-8 JSON and write it to *filepath*.

    Non-ASCII characters (e.g. Chinese) are written verbatim rather than
    as \\uXXXX escapes; output is indented with 4 spaces.
    """
    serialized = json.dumps(data, ensure_ascii=False, indent=4)
    with open(filepath, 'w', encoding='utf-8') as out:
        out.write(serialized)
| import re | |
def extract_content_from_tags(text):
    """Return the content of the first <query>...</query> pair in *text*.

    Matches across newlines (DOTALL); returns None when no pair is found.
    """
    found = re.search(r'<query>(.*?)</query>', text, re.DOTALL)
    return found.group(1) if found else None
def extract_title_from_tags(text):
    """Return the content of the first <title>...</title> pair in *text*.

    Matches across newlines (DOTALL); returns None when no pair is found.
    """
    found = re.search(r'<title>(.*?)</title>', text, re.DOTALL)
    return found.group(1) if found else None
def read_json(file_path):
    """Load and return the JSON document stored at *file_path*.

    Opens the file as UTF-8 explicitly — matching write_json_to_file — instead
    of relying on the platform-default encoding, which fails on non-ASCII
    (Chinese) content under e.g. a C/POSIX locale.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        return json.load(f)
| import os | |
# NOTE(review): hard-coded API credential committed to source — move it to an
# environment variable or a secrets manager and rotate the key.
key = "ak-63d1efgh47i8jkl26mno95pqrs34tuv7x2"
# Base URL of the OpenAI-compatible model proxy used for the translation calls.
api_base = "https://models-proxy.stepfun-inc.com/v1"
# Project chat client (see response.TurboResponser) targeting gpt-4o.
responser = TurboResponser(api_key=key, api_base=api_base, model= "gpt-4o")
# Prompts for Chinese -> English translation of competitive-programming problem
# statements. The user template embeds the same heading-translation table as
# the module-level `title_dict` and instructs the model to wrap the translated
# body in <query>...</query> and the title in <title>...</title>, which the
# extract_* helpers above then parse out of the response.
system_prompt = "You are an expert in Chinese-to-English translation."
user_template = """
Please translate the following question description into English.
For these subtitles, please retain the markdown format and translate them into the specified terms
title_dict = {
'输出格式': 'Output Format',
'题目描述': 'Problem Description',
'评分方式': 'Scoring Method',
'数据范围与提示': 'Data Range and Hints',
'输入格式': 'Input Format',
'数据范围': 'Data Range',
'评分标准': 'Scoring Criteria',
'题目背景': 'Problem Background',
'大样例': 'Large Example',
'样例输入输出': 'Sample Input and Output',
'题面描述': 'Problem Statement',
'提示': 'Hints',
'测试点约束': 'Test Case Constraints',
'样例': 'Examples',
'后记': 'Postscript',
'校验器': 'Validator',
'子任务': 'Subtasks',
'题目使用协议': 'Problem Usage Agreement'
}
Place the content between the <query> and </query> tags.
Also, please specifically list the title of the problem separately again, enclosed within <title> </title> tags, ensuring the English title is consistent.
====QUESTION====
"""
# Canonical Chinese -> English mapping for markdown section headings.
# Used after each model call to verify that every "## <heading>" present
# exactly once in the Chinese source appears exactly once (translated)
# in the English output. Keys/values must stay in sync with the copy of
# this table embedded in the prompt template.
title_dict = {
    '输出格式': 'Output Format',
    '题目描述': 'Problem Description',
    '评分方式': 'Scoring Method',
    '数据范围与提示': 'Data Range and Hints',
    '输入格式': 'Input Format',
    '数据范围': 'Data Range',
    '评分标准': 'Scoring Criteria',
    '题目背景': 'Problem Background',
    '大样例': 'Large Example',
    '样例输入输出': 'Sample Input and Output',
    '题面描述': 'Problem Statement',
    '提示': 'Hints',
    '测试点约束': 'Test Case Constraints',
    '样例': 'Examples',
    '后记': 'Postscript',
    '校验器': 'Validator',
    '子任务': 'Subtasks',
    '题目使用协议': 'Problem Usage Agreement'
}
# Translate every untranslated item in the dataset, retrying each model call
# until both the body and the title are extracted and all section headings
# survive translation, then persist the whole dataset.
ds_v3 = read_json("/home/i-luoxianzhen/data/TestCase-Gen/data/Ours/tcb_v7_add_data_check_cn.jsonl")
count = 0  # NOTE(review): appears unused in the visible file; kept for compatibility
pos = 0    # NOTE(review): appears unused in the visible file; kept for compatibility
for item in ds_v3:
    try:
        # Access validates the key; a missing 'tcb_id' raises KeyError and the
        # item is intentionally skipped by the except below.
        _ = item['tcb_id']
        query = item['query']
        # Skip items already translated in a previous run (resumable processing).
        if item.get('tcb_id_en', "") != "" and item.get('query_en', "") != "":
            continue
        query_en = ""
        title = ""
        check_false = True
        MAX_TRY = 30
        while check_false and MAX_TRY > 0:
            MAX_TRY -= 1
            temp = responser.respond(system_info=system_prompt, user_prompt=user_template + query)
            query_en = extract_content_from_tags(temp)
            title = extract_title_from_tags(temp)
            # BUGFIX: retry when EITHER piece is missing. The original used
            # `and`, which accepted responses missing only one of the two and
            # then crashed on `query_en.count` when query_en was None — the
            # broad except silently dropped the item.
            is_none = (query_en is None or query_en == "") or (title is None or title == "")
            sub_title_check = True
            if not is_none:
                # Every heading present exactly once in the Chinese source must
                # appear exactly once (translated) in the English output.
                for sub_title_cn, sub_title_en in title_dict.items():
                    if query.count(f"\n## {sub_title_cn}\n") == 1:
                        # Local name: the original rebound the module-level
                        # `count` here.
                        en_hits = query_en.count(f"\n## {sub_title_en}\n")
                        sub_title_check = sub_title_check and (en_hits == 1)
            check_false = is_none or (not sub_title_check)
        item["query_en"] = query_en
        item["tcb_id_en"] = title
    except Exception:
        # Best-effort per-item processing: a failed item is left untranslated
        # and the loop moves on to the next one.
        continue
write_json_to_file(ds_v3, '/home/i-luoxianzhen/data/TestCase-Gen/data/Ours/tcb_v7_add_data_en.jsonl')
| # ds_v1 = read_json("/home/i-luoxianzhen/data/TestCase-Gen/data/Ours/datasets_v11_en.json") | |
| # ds_v2 = read_json("/home/i-luoxianzhen/data/TestCase-Gen/data/Ours/datasets_v11_en_v2.json") | |
| # new_ds = [] | |
| # ds_v2_dict = {} | |
| # for item in ds_v2: | |
| # ds_v2_dict[item['tcb_id']] = item | |
| # ds_v1_dict = {} | |
| # for item in ds_v2: | |
| # ds_v1_dict[item['tcb_id']] = item | |
| # for item in ds_v1: | |
| # if ('tcb_id_en' in item.keys()) and ('query_en' in item.keys()) and (item['tcb_id_en'] != "") and (item['query_en'] != ""): | |
| # new_ds.append(item) | |
| # continue | |
| # new_ds.append(ds_v2_dict[item['tcb_id']]) | |
| # write_json_to_file(new_ds, "/home/i-luoxianzhen/data/TestCase-Gen/data/Ours/datasets_v11_en_v3.json") | |
| # ds_v3 = read_json("/home/i-luoxianzhen/data/TestCase-Gen/data/Ours/datasets_v11_en_v4.json") | |
| # ds_v3 = read_json("/home/i-luoxianzhen/data/TestCase-Gen/data/Ours/EN_section-5.json") | |
| # aa = len(ds_v3) // 2 | |
| # write_json_to_file(ds_v3[ : aa], f"/home/i-luoxianzhen/data/TestCase-Gen/data/Ours/EN_section-{6}.json") | |
| # write_json_to_file(ds_v3[aa: ], f"/home/i-luoxianzhen/data/TestCase-Gen/data/Ours/EN_section-{7}.json") | |
Xet Storage Details
- Size:
- 5.65 kB
- Xet hash:
- 3a6668fc1482d15b597a80ac03c16acb6cbde79eae27a30cb54d4bb19bb53f76
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.