| # from datasets import load_dataset | |
| # import ast | |
| # # Login using e.g. `huggingface-cli login` to access this dataset | |
| # ds = load_dataset("newfacade/LeetCodeDataset")['train'] | |
| # tag_set = set() | |
| # tag_count = {} | |
| # for item in ds: | |
| # for tag in item['tags']: | |
| # tag_set.add(tag) | |
| # if tag in tag_count.keys(): | |
| # tag_count[tag] += 1 | |
| # else: | |
| # tag_count[tag] = 1 | |
| # top_10 = sorted(tag_count.items(), key=lambda x: x[1], reverse=True)[:10] | |
| # print(top_10) | |
| # {'Ordered Set', 'Union Find', 'Greedy', 'Minimum Spanning Tree', 'Linked List', 'Trie', | |
| # 'Game Theory', 'Suffix Array', 'Probability and Statistics', 'Bit Manipulation', 'Topological Sort', | |
| # 'Rolling Hash', 'Combinatorics', 'Prefix Sum', 'Hash Table', 'Two Pointers', 'Eulerian Circuit', 'Randomized', | |
| # 'Queue', 'Binary Search', 'Graph', 'Line Sweep', 'Tree', 'Array', 'Memoization', 'Binary Tree', 'Enumeration', | |
| # 'Binary Indexed Tree', 'Brainteaser', 'Breadth-First Search', 'Quickselect', 'Radix Sort', 'Interactive', | |
| # 'Monotonic Stack', 'Recursion', 'Math', 'Merge Sort', 'Bucket Sort', 'Backtracking', 'Sorting', 'Bitmask', | |
| # 'String', 'Dynamic Programming', 'Biconnected Component', 'String Matching', 'Divide and Conquer', 'Stack', | |
| # 'Matrix', 'Simulation', 'Depth-First Search', 'Heap (Priority Queue)', 'Monotonic Queue', 'Counting Sort', | |
| # 'Segment Tree', 'Geometry', 'Hash Function', 'Number Theory', 'Shortest Path', 'Counting', 'Sliding Window', | |
| # 'Strongly Connected Component', 'Concurrency', 'Binary Search Tree'} | |
| # Array(数组) | |
| # Linked List(链表) | |
| # Tree(树) | |
| # Graph(图) | |
| # Queue(队列) | |
| # Stack(栈) | |
| # Hash Table(哈希表) | |
# Use GPT to assign a category tag to each item in the dataset.
# The prompt roughly consists of: task definition -- problem statement -- reference solution.
| import sys | |
| import re | |
| sys.path.append("/home/i-luoxianzhen/data/TestCase-Gen/methods/utils") | |
| from response import TurboResponser, OpenResponser | |
| from dataset_all import get_datasets_by_name | |
| from prompt import system_pompt, tag_prompt | |
def extract_between_tags(text, start_tag="<tag>", end_tag="</tag>"):
    """
    Extract the text enclosed between ``start_tag`` and ``end_tag``.

    Args:
        text (str): The full text containing the tagged span.
        start_tag (str): The opening tag (default ``"<tag>"``).
        end_tag (str): The closing tag (default ``"</tag>"``).

    Returns:
        str: The stripped content of the first tagged span, or an empty
        string if no matching pair of tags is found.
    """
    # re.DOTALL lets the span cross newlines; the non-greedy group stops
    # at the first closing tag.  re.escape guards against tags that
    # contain regex metacharacters.
    pattern = f"{re.escape(start_tag)}\\s*(.*?)\\s*{re.escape(end_tag)}"
    match = re.search(pattern, text, re.DOTALL)
    return match.group(1).strip() if match else ""
| import json | |
def write_json_to_file(data, filepath):
    """Serialize *data* as pretty-printed (indent=4), UTF-8 JSON at *filepath*."""
    serialized = json.dumps(data, ensure_ascii=False, indent=4)
    with open(filepath, "w", encoding="utf-8") as out:
        out.write(serialized)
# Driver: ask the LLM for a category tag for every problem in the dataset
# and checkpoint the accumulated {problem_id: tag} mapping to disk.

# NOTE(review): hard-coded API credential — move this to an environment
# variable / secrets store and rotate the key; it should not live in source.
key = "ak-8f3d147b2c9a5e6m0n4p8x2v7y1k3l9"
base_api = "https://models-proxy.stepfun-inc.com/v1"
import time

TAGS_PATH = "/home/i-luoxianzhen/data/TestCase-Gen/methods/LeetCodeDataset/tags.json"

count = 0
al_dataset = get_datasets_by_name('ours')
tag_dict = {}
# The responder is loop-invariant: build it once instead of per item.
responser = TurboResponser(key, base_api)
for item in al_dataset:
    problem_id = item['problem_id']   # renamed from `id` (shadowed the builtin)
    question = item['query']
    solutions = item['solutions'][0]  # only the first reference solution is sent
    res = responser.respond(system_pompt, user_prompt=tag_prompt.format(question, solutions))
    tag = extract_between_tags(res)
    tag_dict[problem_id] = tag
    time.sleep(0.5)  # crude rate limiting against the proxy API
    count += 1
    if count % 100 == 0:
        # Periodic checkpoint: a crash loses at most the last 100 items.
        write_json_to_file(tag_dict, TAGS_PATH)
# Bug fix: always persist the tail — previously the final `count % 100`
# items were silently dropped when the dataset size was not a multiple of 100.
write_json_to_file(tag_dict, TAGS_PATH)
# (Removed: trailing Hugging Face "Xet Storage" web-UI residue — size/hash
# details captured along with the file; not part of this script's source.)