tazwarrrr committed on
Commit
d6ef445
·
1 Parent(s): 786f63c

Add dataset section with upload_dataset.py script

Browse files
Files changed (1) hide show
  1. dataset/upload_dataset.py +107 -0
dataset/upload_dataset.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Upload cuda-to-rocm-wavefront-bugs dataset to HuggingFace Hub.
3
+
4
+ Supports either:
5
+ - 17 individual batch JSON files (recommended)
6
+ - A single combined JSONL file
7
+
8
+ Usage (individual files in current dir):
9
+ python upload_dataset.py --token hf_xxxx --files_dir .
10
+
11
+ Usage (specific directory):
12
+ python upload_dataset.py --token hf_xxxx --files_dir ./my_batches/
13
+
14
+ Usage (single JSONL fallback):
15
+ python upload_dataset.py --token hf_xxxx --jsonl cuda_rocm_bugs.jsonl
16
+ """
17
+ import json
18
+ import os
19
+ import glob
20
+ import argparse
21
+ from collections import Counter
22
+ from datasets import Dataset, DatasetDict, Features, Value
23
+
24
# Explicit column schema for the dataset: ten string columns followed by two
# boolean flags. The insertion order of the merged mappings below is the
# resulting column order.
FEATURES = Features(
    {
        **{
            column: Value("string")
            for column in (
                "id",
                "bug_category",
                "risk_level",
                "kernel_type",
                "cuda_snippet",
                "hip_naive",
                "hip_corrected",
                "explanation",
                "amd_hardware",
                "rocm_version",
            )
        },
        **{
            column: Value("bool")
            for column in (
                "verified_on_mi300x",
                "hipify_catches_this",
            )
        },
    }
)
38
+
39
def load_from_files(files_dir):
    """Load and concatenate every batch JSON file found in a directory.

    All files matching ``*.json`` directly inside ``files_dir`` are read in
    sorted path order. Each file must hold a JSON array of examples.

    Args:
        files_dir: Directory to search for ``.json`` files.

    Returns:
        One list containing the examples of every file, in file order.

    Raises:
        FileNotFoundError: If ``files_dir`` contains no ``.json`` files.
        ValueError: If a file's top-level JSON value is not an array.
    """
    pattern = os.path.join(files_dir, "*.json")
    files = sorted(glob.glob(pattern))
    if not files:
        raise FileNotFoundError(f"No .json files found in {files_dir}")

    all_examples = []
    for path in files:
        # Read as UTF-8 explicitly: the platform default encoding (e.g. cp1252
        # on Windows) can fail on or mis-decode non-ASCII text in the examples.
        with open(path, encoding="utf-8") as fp:
            batch = json.load(fp)
        # Extending with a dict would silently add its keys as "examples",
        # so reject anything that is not a JSON array up front.
        if not isinstance(batch, list):
            raise ValueError(
                f"Expected a JSON array of examples in {path}, "
                f"got {type(batch).__name__}"
            )
        all_examples.extend(batch)
        print(f" Loaded {len(batch):>3} examples from {os.path.basename(path)}")

    return all_examples
54
+
55
def load_from_jsonl(jsonl_path):
    """Load examples from a JSON Lines file, one JSON document per line.

    Blank and whitespace-only lines (such as a trailing empty line at the end
    of the file) are skipped instead of being passed to the JSON parser.

    Args:
        jsonl_path: Path to the ``.jsonl`` file.

    Returns:
        A list with one parsed value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's default text encoding.
    with open(jsonl_path, encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]
58
+
59
def main():
    """Parse CLI arguments, load the examples, and push a 90/10 split to the Hub.

    Input selection:
      * ``--files_dir`` takes precedence over ``--jsonl`` when both are given.
      * With neither flag, the current directory is probed: if any
        ``batch_*.json`` file exists the directory is used as ``--files_dir``,
        otherwise ``cuda_rocm_bugs.jsonl`` is used if it exists.

    The loaded examples are summarized per ``bug_category``, split 90/10 into
    train/test with a fixed seed, and uploaded to ``--repo`` with
    ``private=False``.

    Exits through ``parser.error`` when no input can be found or when fewer
    than two examples were loaded (a train/test split needs at least one
    example on each side).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--token", required=True, help="HuggingFace write token")
    parser.add_argument("--repo", default="tazwarrrr/cuda-to-rocm-wavefront-bugs")
    parser.add_argument("--files_dir", default=None, help="Directory with batch .json files")
    parser.add_argument("--jsonl", default=None, help="Single combined JSONL file")
    args = parser.parse_args()

    # Auto-detect if nothing specified
    if not args.files_dir and not args.jsonl:
        # NOTE(review): detection keys off batch_*.json, but load_from_files
        # reads every *.json in the directory, so unrelated JSON files sitting
        # next to the batches would be loaded too.
        json_files = sorted(glob.glob("batch_*.json"))
        if json_files:
            print(f"Auto-detected {len(json_files)} batch files in current directory")
            args.files_dir = "."
        elif os.path.exists("cuda_rocm_bugs.jsonl"):
            args.jsonl = "cuda_rocm_bugs.jsonl"
        else:
            parser.error("Provide --files_dir or --jsonl")

    # Load data
    print("\nLoading data...")
    if args.files_dir:
        data = load_from_files(args.files_dir)
    else:
        data = load_from_jsonl(args.jsonl)

    # Fail fast with a clear message: with 0 or 1 examples the 90/10 split
    # below cannot produce a non-empty train and test set.
    if len(data) < 2:
        parser.error(
            f"Need at least 2 examples to build a train/test split, got {len(data)}"
        )

    print(f"\nTotal: {len(data)} examples")
    print("\nBy category:")
    for cat, count in sorted(Counter(e["bug_category"] for e in data).items()):
        print(f" {cat}: {count}")

    # Build dataset with 90/10 split; the fixed seed keeps the split
    # reproducible across runs.
    ds = Dataset.from_list(data, features=FEATURES)
    split = ds.train_test_split(test_size=0.1, seed=42)
    dataset_dict = DatasetDict({"train": split["train"], "test": split["test"]})

    print(f"\nSplit: {len(dataset_dict['train'])} train / {len(dataset_dict['test'])} test")
    print(f"\nUploading to https://huggingface.co/datasets/{args.repo} ...")

    # private=False publishes the dataset repository publicly.
    dataset_dict.push_to_hub(args.repo, token=args.token, private=False)

    print("\n✅ Done!")
    print(f" https://huggingface.co/datasets/{args.repo}")
    print("\nNext steps:")
    print(" 1. Paste dataset_README.md as the Dataset Card on HuggingFace")
    print(" 2. Link dataset in lablab.ai submission under 'HuggingFace' track")
105
+
106
+ if __name__ == "__main__":
107
+ main()