# Tsukihjy/testcase/methods/utils/tran_to_en.py
# (web-page scrape artifacts "download / raw / 5.65 kB" removed from this header)
from dataset_all import get_ours
from response import TurboResponser
import json
def write_json_to_file(data, filepath):
    """Serialize *data* as pretty-printed, UTF-8 JSON to *filepath*.

    Non-ASCII characters (e.g. Chinese text) are written verbatim rather
    than escaped, and the output is indented with 4 spaces.
    """
    serialized = json.dumps(data, ensure_ascii=False, indent=4)
    with open(filepath, 'w', encoding='utf-8') as out:
        out.write(serialized)
import re
def extract_content_from_tags(text):
    """Return the content of the first ``<query>...</query>`` pair in *text*.

    ``re.DOTALL`` lets the capture span newlines. Returns ``None`` when no
    matching tag pair is found.
    """
    found = re.search(r'<query>(.*?)</query>', text, re.DOTALL)
    return found.group(1) if found else None
def extract_title_from_tags(text):
    """Return the content of the first ``<title>...</title>`` pair in *text*.

    ``re.DOTALL`` lets the capture span newlines. Returns ``None`` when no
    matching tag pair is found.
    """
    found = re.search(r'<title>(.*?)</title>', text, re.DOTALL)
    return found.group(1) if found else None
def read_json(file_path):
    """Load and return the JSON document at *file_path*.

    Opens the file explicitly as UTF-8: the companion writer
    (``write_json_to_file``) emits UTF-8 with ``ensure_ascii=False``, so
    relying on the platform-default encoding could mis-decode or crash on
    the Chinese text this script processes.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    return data
import os

# NOTE(review): hard-coded API credential committed to source — move it to an
# environment variable or a secrets manager before sharing this file.
key = "ak-63d1efgh47i8jkl26mno95pqrs34tuv7x2"
api_base = "https://models-proxy.stepfun-inc.com/v1"
# Project-local chat-completion client wrapper, pointed at the proxy above
# and configured to use the gpt-4o model for every translation request.
responser = TurboResponser(api_key=key, api_base=api_base, model= "gpt-4o")
# Role prompt: pins the model as a Chinese-to-English translation expert.
system_prompt = "You are an expert in Chinese-to-English translation."
# Instruction template sent as the user message; the Chinese question text is
# appended after the ====QUESTION==== sentinel. The title_dict embedded in
# this string is prompt TEXT shown to the model — it must be kept in sync
# with the runtime `title_dict` defined below, which re-checks the output.
# The model is asked to wrap the translated body in <query></query> and the
# translated title in <title></title>, which the extract_* helpers parse.
user_template = """
Please translate the following question description into English.
For these subtitles, please retain the markdown format and translate them into the specified terms
title_dict = {
'输出格式': 'Output Format',
'题目描述': 'Problem Description',
'评分方式': 'Scoring Method',
'数据范围与提示': 'Data Range and Hints',
'输入格式': 'Input Format',
'数据范围': 'Data Range',
'评分标准': 'Scoring Criteria',
'题目背景': 'Problem Background',
'大样例': 'Large Example',
'样例输入输出': 'Sample Input and Output',
'题面描述': 'Problem Statement',
'提示': 'Hints',
'测试点约束': 'Test Case Constraints',
'样例': 'Examples',
'后记': 'Postscript',
'校验器': 'Validator',
'子任务': 'Subtasks',
'题目使用协议': 'Problem Usage Agreement'
}
Place the content between the <query> and </query> tags.
Also, please specifically list the title of the problem separately again, enclosed within <title> </title> tags, ensuring the English title is consistent.
====QUESTION====
"""
# Canonical zh→en section-header mapping. Used to verify that every markdown
# "## <header>" present exactly once in the Chinese source appears exactly
# once, translated, in the model's English output. Must stay in sync with
# the copy of the same dict embedded as text inside `user_template`.
title_dict = {
'输出格式': 'Output Format',
'题目描述': 'Problem Description',
'评分方式': 'Scoring Method',
'数据范围与提示': 'Data Range and Hints',
'输入格式': 'Input Format',
'数据范围': 'Data Range',
'评分标准': 'Scoring Criteria',
'题目背景': 'Problem Background',
'大样例': 'Large Example',
'样例输入输出': 'Sample Input and Output',
'题面描述': 'Problem Statement',
'提示': 'Hints',
'测试点约束': 'Test Case Constraints',
'样例': 'Examples',
'后记': 'Postscript',
'校验器': 'Validator',
'子任务': 'Subtasks',
'题目使用协议': 'Problem Usage Agreement'
}
# Translate each dataset record's Chinese `query` to English via the model,
# retrying until both the body and the title extract cleanly and every
# "## <section>" header is translated exactly once; then persist the dataset.
ds_v3 = read_json("/home/i-luoxianzhen/data/TestCase-Gen/data/Ours/tcb_v7_add_data_check_cn.jsonl")
for item in ds_v3:
    try:
        # KeyError here (malformed record) is caught below and skips the item,
        # preserving the original best-effort behavior.
        tcb_id = item['tcb_id']
        query = item['query']
        # Skip records that already carry a non-empty English translation.
        if item.get('tcb_id_en') and item.get('query_en'):
            continue
        query_en = ""
        title = ""
        check_false = True
        MAX_TRY = 30
        while check_false and MAX_TRY > 0:
            MAX_TRY -= 1
            temp = responser.respond(system_info=system_prompt, user_prompt=user_template + query)
            query_en = extract_content_from_tags(temp)
            title = extract_title_from_tags(temp)
            # BUGFIX: retry when EITHER piece is missing. The original used
            # `and`, so a half-extracted response slipped through and then
            # crashed on `None.count(...)` below, silently dropping the item
            # via the blanket except.
            is_none = (not query_en) or (not title)
            sub_title_check = True
            if not is_none:
                for sub_title_cn, sub_title_en in title_dict.items():
                    # Every section header appearing exactly once in the
                    # Chinese source must appear exactly once, translated,
                    # in the English output.
                    if query.count(f"\n## {sub_title_cn}\n") == 1:
                        # renamed from `count` to avoid shadowing confusion
                        occurrences = query_en.count(f"\n## {sub_title_en}\n")
                        sub_title_check = sub_title_check and (occurrences == 1)
            check_false = is_none or (not sub_title_check)
        # As in the original, the last attempt is stored even if all retries
        # failed validation; downstream passes re-check for empty fields.
        item["query_en"] = query_en
        item["tcb_id_en"] = title
    except Exception:
        # Best-effort per record: leave it untranslated and move on.
        continue
write_json_to_file(ds_v3, '/home/i-luoxianzhen/data/TestCase-Gen/data/Ours/tcb_v7_add_data_en.jsonl')
# ds_v1 = read_json("/home/i-luoxianzhen/data/TestCase-Gen/data/Ours/datasets_v11_en.json")
# ds_v2 = read_json("/home/i-luoxianzhen/data/TestCase-Gen/data/Ours/datasets_v11_en_v2.json")
# new_ds = []
# ds_v2_dict = {}
# for item in ds_v2:
# ds_v2_dict[item['tcb_id']] = item
# ds_v1_dict = {}
# for item in ds_v2:
# ds_v1_dict[item['tcb_id']] = item
# for item in ds_v1:
# if ('tcb_id_en' in item.keys()) and ('query_en' in item.keys()) and (item['tcb_id_en'] != "") and (item['query_en'] != ""):
# new_ds.append(item)
# continue
# new_ds.append(ds_v2_dict[item['tcb_id']])
# write_json_to_file(new_ds, "/home/i-luoxianzhen/data/TestCase-Gen/data/Ours/datasets_v11_en_v3.json")
# ds_v3 = read_json("/home/i-luoxianzhen/data/TestCase-Gen/data/Ours/datasets_v11_en_v4.json")
# ds_v3 = read_json("/home/i-luoxianzhen/data/TestCase-Gen/data/Ours/EN_section-5.json")
# aa = len(ds_v3) // 2
# write_json_to_file(ds_v3[ : aa], f"/home/i-luoxianzhen/data/TestCase-Gen/data/Ours/EN_section-{6}.json")
# write_json_to_file(ds_v3[aa: ], f"/home/i-luoxianzhen/data/TestCase-Gen/data/Ours/EN_section-{7}.json")
# (web-page scrape footer removed: "Xet Storage Details", file size, and Xet hash
#  3a6668fc1482d15b597a80ac03c16acb6cbde79eae27a30cb54d4bb19bb53f76)