# Tsukihjy/testcase/methods/utils/tran_to_en.py
# (web-page scrape artifacts "download / raw / 5.65 kB" removed from this header)
from dataset_all import get_ours
from response import TurboResponser
import json
def write_json_to_file(data, filepath):
    """Serialize *data* as pretty-printed, UTF-8 JSON to *filepath*.

    Non-ASCII characters (e.g. Chinese text) are written verbatim rather
    than escaped, and the output is indented with 4 spaces.
    """
    serialized = json.dumps(data, ensure_ascii=False, indent=4)
    with open(filepath, 'w', encoding='utf-8') as out:
        out.write(serialized)
import re
def extract_content_from_tags(text):
    """Return the content of the first ``<query>...</query>`` pair in *text*.

    ``re.DOTALL`` lets the capture span newlines. Returns ``None`` when no
    matching tag pair is found.
    """
    found = re.search(r'<query>(.*?)</query>', text, re.DOTALL)
    return found.group(1) if found else None
def extract_title_from_tags(text):
    """Return the content of the first ``<title>...</title>`` pair in *text*.

    ``re.DOTALL`` lets the capture span newlines. Returns ``None`` when no
    matching tag pair is found.
    """
    found = re.search(r'<title>(.*?)</title>', text, re.DOTALL)
    return found.group(1) if found else None
def read_json(file_path):
    """Load and return the JSON document at *file_path*.

    Opens the file explicitly as UTF-8: the companion writer
    (``write_json_to_file``) emits UTF-8 with ``ensure_ascii=False``, so
    relying on the platform-default encoding could mis-decode or crash on
    the Chinese text this script processes.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    return data
import os

# NOTE(review): hard-coded API credential committed to source — move it to an
# environment variable or a secrets manager before sharing this file.
key = "ak-63d1efgh47i8jkl26mno95pqrs34tuv7x2"
api_base = "https://models-proxy.stepfun-inc.com/v1"
# Project-local chat-completion client wrapper, pointed at the proxy above
# and configured to use the gpt-4o model for every translation request.
responser = TurboResponser(api_key=key, api_base=api_base, model= "gpt-4o")
# Role prompt: pins the model as a Chinese-to-English translation expert.
system_prompt = "You are an expert in Chinese-to-English translation."
# Instruction template sent as the user message; the Chinese question text is
# appended after the ====QUESTION==== sentinel. The title_dict embedded in
# this string is prompt TEXT shown to the model — it must be kept in sync
# with the runtime `title_dict` defined below, which re-checks the output.
# The model is asked to wrap the translated body in <query></query> and the
# translated title in <title></title>, which the extract_* helpers parse.
user_template = """
Please translate the following question description into English.
For these subtitles, please retain the markdown format and translate them into the specified terms
title_dict = {
'输出格式': 'Output Format',
'题目描述': 'Problem Description',
'评分方式': 'Scoring Method',
'数据范围与提示': 'Data Range and Hints',
'输入格式': 'Input Format',
'数据范围': 'Data Range',
'评分标准': 'Scoring Criteria',
'题目背景': 'Problem Background',
'大样例': 'Large Example',
'样例输入输出': 'Sample Input and Output',
'题面描述': 'Problem Statement',
'提示': 'Hints',
'测试点约束': 'Test Case Constraints',
'样例': 'Examples',
'后记': 'Postscript',
'校验器': 'Validator',
'子任务': 'Subtasks',
'题目使用协议': 'Problem Usage Agreement'
}
Place the content between the <query> and </query> tags.
Also, please specifically list the title of the problem separately again, enclosed within <title> </title> tags, ensuring the English title is consistent.
====QUESTION====
"""
# Canonical zh→en section-header mapping. Used to verify that every markdown
# "## <header>" present exactly once in the Chinese source appears exactly
# once, translated, in the model's English output. Must stay in sync with
# the copy of the same dict embedded as text inside `user_template`.
title_dict = {
'输出格式': 'Output Format',
'题目描述': 'Problem Description',
'评分方式': 'Scoring Method',
'数据范围与提示': 'Data Range and Hints',
'输入格式': 'Input Format',
'数据范围': 'Data Range',
'评分标准': 'Scoring Criteria',
'题目背景': 'Problem Background',
'大样例': 'Large Example',
'样例输入输出': 'Sample Input and Output',
'题面描述': 'Problem Statement',
'提示': 'Hints',
'测试点约束': 'Test Case Constraints',
'样例': 'Examples',
'后记': 'Postscript',
'校验器': 'Validator',
'子任务': 'Subtasks',
'题目使用协议': 'Problem Usage Agreement'
}
# Translate each dataset record's Chinese `query` to English via the model,
# retrying until both the body and the title extract cleanly and every
# "## <section>" header is translated exactly once; then persist the dataset.
ds_v3 = read_json("/home/i-luoxianzhen/data/TestCase-Gen/data/Ours/tcb_v7_add_data_check_cn.jsonl")
for item in ds_v3:
    try:
        # KeyError here (malformed record) is caught below and skips the item,
        # preserving the original best-effort behavior.
        tcb_id = item['tcb_id']
        query = item['query']
        # Skip records that already carry a non-empty English translation.
        if item.get('tcb_id_en') and item.get('query_en'):
            continue
        query_en = ""
        title = ""
        check_false = True
        MAX_TRY = 30
        while check_false and MAX_TRY > 0:
            MAX_TRY -= 1
            temp = responser.respond(system_info=system_prompt, user_prompt=user_template + query)
            query_en = extract_content_from_tags(temp)
            title = extract_title_from_tags(temp)
            # BUGFIX: retry when EITHER piece is missing. The original used
            # `and`, so a half-extracted response slipped through and then
            # crashed on `None.count(...)` below, silently dropping the item
            # via the blanket except.
            is_none = (not query_en) or (not title)
            sub_title_check = True
            if not is_none:
                for sub_title_cn, sub_title_en in title_dict.items():
                    # Every section header appearing exactly once in the
                    # Chinese source must appear exactly once, translated,
                    # in the English output.
                    if query.count(f"\n## {sub_title_cn}\n") == 1:
                        # renamed from `count` to avoid shadowing confusion
                        occurrences = query_en.count(f"\n## {sub_title_en}\n")
                        sub_title_check = sub_title_check and (occurrences == 1)
            check_false = is_none or (not sub_title_check)
        # As in the original, the last attempt is stored even if all retries
        # failed validation; downstream passes re-check for empty fields.
        item["query_en"] = query_en
        item["tcb_id_en"] = title
    except Exception:
        # Best-effort per record: leave it untranslated and move on.
        continue
write_json_to_file(ds_v3, '/home/i-luoxianzhen/data/TestCase-Gen/data/Ours/tcb_v7_add_data_en.jsonl')
# ds_v1 = read_json("/home/i-luoxianzhen/data/TestCase-Gen/data/Ours/datasets_v11_en.json")
# ds_v2 = read_json("/home/i-luoxianzhen/data/TestCase-Gen/data/Ours/datasets_v11_en_v2.json")
# new_ds = []
# ds_v2_dict = {}
# for item in ds_v2:
# ds_v2_dict[item['tcb_id']] = item
# ds_v1_dict = {}
# for item in ds_v2:
# ds_v1_dict[item['tcb_id']] = item
# for item in ds_v1:
# if ('tcb_id_en' in item.keys()) and ('query_en' in item.keys()) and (item['tcb_id_en'] != "") and (item['query_en'] != ""):
# new_ds.append(item)
# continue
# new_ds.append(ds_v2_dict[item['tcb_id']])
# write_json_to_file(new_ds, "/home/i-luoxianzhen/data/TestCase-Gen/data/Ours/datasets_v11_en_v3.json")
# ds_v3 = read_json("/home/i-luoxianzhen/data/TestCase-Gen/data/Ours/datasets_v11_en_v4.json")
# ds_v3 = read_json("/home/i-luoxianzhen/data/TestCase-Gen/data/Ours/EN_section-5.json")
# aa = len(ds_v3) // 2
# write_json_to_file(ds_v3[ : aa], f"/home/i-luoxianzhen/data/TestCase-Gen/data/Ours/EN_section-{6}.json")
# write_json_to_file(ds_v3[aa: ], f"/home/i-luoxianzhen/data/TestCase-Gen/data/Ours/EN_section-{7}.json")
# (web-page scrape footer removed: "Xet Storage Details", file size, and Xet hash
#  3a6668fc1482d15b597a80ac03c16acb6cbde79eae27a30cb54d4bb19bb53f76)