Tsukihjy/testcase / methods /LeetCodeDataset /preprocess_datasets.py
download
raw
3.46 kB
# from datasets import load_dataset
# import ast
# # Login using e.g. `huggingface-cli login` to access this dataset
# ds = load_dataset("newfacade/LeetCodeDataset")['train']
# tag_set = set()
# tag_count = {}
# for item in ds:
# for tag in item['tags']:
# tag_set.add(tag)
# if tag in tag_count.keys():
# tag_count[tag] += 1
# else:
# tag_count[tag] = 1
# top_10 = sorted(tag_count.items(), key=lambda x: x[1], reverse=True)[:10]
# print(top_10)
# {'Ordered Set', 'Union Find', 'Greedy', 'Minimum Spanning Tree', 'Linked List', 'Trie',
# 'Game Theory', 'Suffix Array', 'Probability and Statistics', 'Bit Manipulation', 'Topological Sort',
# 'Rolling Hash', 'Combinatorics', 'Prefix Sum', 'Hash Table', 'Two Pointers', 'Eulerian Circuit', 'Randomized',
# 'Queue', 'Binary Search', 'Graph', 'Line Sweep', 'Tree', 'Array', 'Memoization', 'Binary Tree', 'Enumeration',
# 'Binary Indexed Tree', 'Brainteaser', 'Breadth-First Search', 'Quickselect', 'Radix Sort', 'Interactive',
# 'Monotonic Stack', 'Recursion', 'Math', 'Merge Sort', 'Bucket Sort', 'Backtracking', 'Sorting', 'Bitmask',
# 'String', 'Dynamic Programming', 'Biconnected Component', 'String Matching', 'Divide and Conquer', 'Stack',
# 'Matrix', 'Simulation', 'Depth-First Search', 'Heap (Priority Queue)', 'Monotonic Queue', 'Counting Sort',
# 'Segment Tree', 'Geometry', 'Hash Function', 'Number Theory', 'Shortest Path', 'Counting', 'Sliding Window',
# 'Strongly Connected Component', 'Concurrency', 'Binary Search Tree'}
# Array(数组)
# Linked List(链表)
# Tree(树)
# Graph(图)
# Queue(队列)
# Stack(栈)
# Hash Table(哈希表)
# Here we use GPT to annotate the dataset items with category labels.
# The prompt roughly consists of: task definition -- problem statement -- reference solution.
import sys
import re
sys.path.append("/home/i-luoxianzhen/data/TestCase-Gen/methods/utils")
from response import TurboResponser, OpenResponser
from dataset_all import get_datasets_by_name
from prompt import system_pompt, tag_prompt
def extract_between_tags(text, start_tag="<tag>", end_tag="</tag>"):
    """
    Extract the text enclosed between the first start_tag/end_tag pair.

    Args:
        text (str): The full text containing the tagged span.
        start_tag (str): The opening tag (default is "<tag>").
        end_tag (str): The closing tag (default is "</tag>").

    Returns:
        str: The stripped content between the first matching tag pair,
             or an empty string if no pair is found.
    """
    # re.escape guards against regex metacharacters inside the tags;
    # re.DOTALL lets the captured span cross newlines; the non-greedy
    # (.*?) stops at the first closing tag.
    pattern = f"{re.escape(start_tag)}\\s*(.*?)\\s*{re.escape(end_tag)}"
    match = re.search(pattern, text, re.DOTALL)
    return match.group(1).strip() if match else ""
import json
def write_json_to_file(data, filepath):
    """Serialize `data` as pretty-printed, UTF-8 JSON at `filepath`.

    Non-ASCII characters are written verbatim (ensure_ascii=False) and
    the output is indented with 4 spaces, overwriting any existing file.
    """
    serialized = json.dumps(data, ensure_ascii=False, indent=4)
    with open(filepath, 'w', encoding='utf-8') as out:
        out.write(serialized)
# SECURITY(review): hardcoded API credential checked into source — move this
# to an environment variable or a secrets manager before sharing this file.
key = "ak-8f3d147b2c9a5e6m0n4p8x2v7y1k3l9"
base_api = "https://models-proxy.stepfun-inc.com/v1"
import time

# Destination for the problem_id -> tag mapping produced below.
OUTPUT_PATH = "/home/i-luoxianzhen/data/TestCase-Gen/methods/LeetCodeDataset/tags.json"

count = 0
al_dataset = get_datasets_by_name('ours')
tag_dict = {}
# Hoisted out of the loop: one client instance serves every request instead
# of constructing a fresh TurboResponser per item.
responser = TurboResponser(key, base_api)
for item in al_dataset:
    problem_id = item['problem_id']  # renamed: 'id' shadows the builtin
    question = item['query']
    solution = item['solutions'][0]  # only the first reference solution is used
    res = responser.respond(system_pompt, user_prompt=tag_prompt.format(question, solution))
    tag_dict[problem_id] = extract_between_tags(res)
    time.sleep(0.5)  # crude rate limiting between API calls
    count += 1
    # Checkpoint every 100 items so a crash loses at most 100 labels.
    if count % 100 == 0:
        write_json_to_file(tag_dict, OUTPUT_PATH)
# BUG FIX: the original only wrote on exact multiples of 100, silently
# dropping the final partial batch; flush the complete mapping at the end.
write_json_to_file(tag_dict, OUTPUT_PATH)

Xet Storage Details

Size:
3.46 kB
·
Xet hash:
712e7c901116b271164d5246e1e9d97133579a480aba97dcbfbe5ab07a3357c9

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.