Tsukihjy/testcase / methods /LeetCodeDataset /preprocess_datasets.py
download
raw
3.46 kB
# from datasets import load_dataset
# import ast
# # Login using e.g. `huggingface-cli login` to access this dataset
# ds = load_dataset("newfacade/LeetCodeDataset")['train']
# tag_set = set()
# tag_count = {}
# for item in ds:
# for tag in item['tags']:
# tag_set.add(tag)
# if tag in tag_count.keys():
# tag_count[tag] += 1
# else:
# tag_count[tag] = 1
# top_10 = sorted(tag_count.items(), key=lambda x: x[1], reverse=True)[:10]
# print(top_10)
# {'Ordered Set', 'Union Find', 'Greedy', 'Minimum Spanning Tree', 'Linked List', 'Trie',
# 'Game Theory', 'Suffix Array', 'Probability and Statistics', 'Bit Manipulation', 'Topological Sort',
# 'Rolling Hash', 'Combinatorics', 'Prefix Sum', 'Hash Table', 'Two Pointers', 'Eulerian Circuit', 'Randomized',
# 'Queue', 'Binary Search', 'Graph', 'Line Sweep', 'Tree', 'Array', 'Memoization', 'Binary Tree', 'Enumeration',
# 'Binary Indexed Tree', 'Brainteaser', 'Breadth-First Search', 'Quickselect', 'Radix Sort', 'Interactive',
# 'Monotonic Stack', 'Recursion', 'Math', 'Merge Sort', 'Bucket Sort', 'Backtracking', 'Sorting', 'Bitmask',
# 'String', 'Dynamic Programming', 'Biconnected Component', 'String Matching', 'Divide and Conquer', 'Stack',
# 'Matrix', 'Simulation', 'Depth-First Search', 'Heap (Priority Queue)', 'Monotonic Queue', 'Counting Sort',
# 'Segment Tree', 'Geometry', 'Hash Function', 'Number Theory', 'Shortest Path', 'Counting', 'Sliding Window',
# 'Strongly Connected Component', 'Concurrency', 'Binary Search Tree'}
# Array(数组)
# Linked List(链表)
# Tree(树)
# Graph(图)
# Queue(队列)
# Stack(栈)
# Hash Table(哈希表)
# Here we use GPT to annotate the dataset items with category labels.
# The prompt roughly consists of: task definition -- problem statement -- reference solution.
import sys
import re
sys.path.append("/home/i-luoxianzhen/data/TestCase-Gen/methods/utils")
from response import TurboResponser, OpenResponser
from dataset_all import get_datasets_by_name
from prompt import system_pompt, tag_prompt
def extract_between_tags(text, start_tag="<tag>", end_tag="</tag>"):
    """
    Extract the text enclosed between the first start_tag/end_tag pair.

    Args:
        text (str): The full text containing the tagged span.
        start_tag (str): The opening tag (default is "<tag>").
        end_tag (str): The closing tag (default is "</tag>").

    Returns:
        str: The stripped content between the first matching tag pair,
             or an empty string if no pair is found.
    """
    # re.escape guards against regex metacharacters inside the tags;
    # re.DOTALL lets the captured span cross newlines; the non-greedy
    # (.*?) stops at the first closing tag.
    pattern = f"{re.escape(start_tag)}\\s*(.*?)\\s*{re.escape(end_tag)}"
    match = re.search(pattern, text, re.DOTALL)
    return match.group(1).strip() if match else ""
import json
def write_json_to_file(data, filepath):
    """Serialize `data` as pretty-printed, UTF-8 JSON at `filepath`.

    Non-ASCII characters are written verbatim (ensure_ascii=False) and
    the output is indented with 4 spaces, overwriting any existing file.
    """
    serialized = json.dumps(data, ensure_ascii=False, indent=4)
    with open(filepath, 'w', encoding='utf-8') as out:
        out.write(serialized)
# SECURITY(review): hardcoded API credential checked into source — move this
# to an environment variable or a secrets manager before sharing this file.
key = "ak-8f3d147b2c9a5e6m0n4p8x2v7y1k3l9"
base_api = "https://models-proxy.stepfun-inc.com/v1"
import time

# Destination for the problem_id -> tag mapping produced below.
OUTPUT_PATH = "/home/i-luoxianzhen/data/TestCase-Gen/methods/LeetCodeDataset/tags.json"

count = 0
al_dataset = get_datasets_by_name('ours')
tag_dict = {}
# Hoisted out of the loop: one client instance serves every request instead
# of constructing a fresh TurboResponser per item.
responser = TurboResponser(key, base_api)
for item in al_dataset:
    problem_id = item['problem_id']  # renamed: 'id' shadows the builtin
    question = item['query']
    solution = item['solutions'][0]  # only the first reference solution is used
    res = responser.respond(system_pompt, user_prompt=tag_prompt.format(question, solution))
    tag_dict[problem_id] = extract_between_tags(res)
    time.sleep(0.5)  # crude rate limiting between API calls
    count += 1
    # Checkpoint every 100 items so a crash loses at most 100 labels.
    if count % 100 == 0:
        write_json_to_file(tag_dict, OUTPUT_PATH)
# BUG FIX: the original only wrote on exact multiples of 100, silently
# dropping the final partial batch; flush the complete mapping at the end.
write_json_to_file(tag_dict, OUTPUT_PATH)

Xet Storage Details

Size:
3.46 kB
·
Xet hash:
712e7c901116b271164d5246e1e9d97133579a480aba97dcbfbe5ab07a3357c9

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.