Upload code/step5_push_dataset.py with huggingface_hub
Browse files- code/step5_push_dataset.py +77 -0
code/step5_push_dataset.py
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Step 5: Create a HuggingFace dataset from the results and push to Hub.
|
| 3 |
+
|
| 4 |
+
This creates a dataset with:
|
| 5 |
+
- The original problem and ground truth answer
|
| 6 |
+
- The greedy (N=1) solution and whether it was correct
|
| 7 |
+
- The Best-of-N (N=16) weighted, standard, and majority-vote answers and whether each was correct
|
| 8 |
+
- All 16 sampled solutions with their PRM scores
|
| 9 |
+
- The PRM score breakdown per answer group
|
| 10 |
+
|
| 11 |
+
Co-authored with Claude (Anthropic). I can explain all code logic.
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
import json
|
| 15 |
+
from datasets import Dataset, Features, Value, Sequence
|
| 16 |
+
from huggingface_hub import HfApi
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
# ──────────────────────────────────────────────────────────────────────────────
# Load all results
# ──────────────────────────────────────────────────────────────────────────────
# Directory holding the three JSON result files read below.
OUTPUT_DIR = "/Users/cmpatino/Projects/ml-intern/exercise/outputs"


def _load_results(filename):
    """Return the parsed contents of the JSON file *filename* in OUTPUT_DIR."""
    # Decode explicitly as UTF-8 so the parsed text does not depend on the
    # locale's default encoding.
    with open(f"{OUTPUT_DIR}/{filename}", encoding="utf-8") as f:
        return json.load(f)


greedy_results = _load_results("greedy_results.json")
scored_results = _load_results("scored_results.json")
bon_results = _load_results("bon_results.json")
|
| 30 |
+
|
| 31 |
+
# ──────────────────────────────────────────────────────────────────────────────
# Build dataset rows
# ──────────────────────────────────────────────────────────────────────────────
# The three result lists are combined position by position. zip() stops at the
# shortest input without complaint, so reject mismatched lengths up front
# instead of silently dropping trailing records.
if not (len(greedy_results) == len(scored_results) == len(bon_results)):
    raise ValueError(
        "Result files have different lengths: "
        f"greedy={len(greedy_results)}, scored={len(scored_results)}, "
        f"bon={len(bon_results)}"
    )

rows = []
for greedy, scored, bon in zip(greedy_results, scored_results, bon_results):
    row = {
        # Original problem info
        "problem": greedy["problem"],
        "ground_truth_solution": greedy["solution"],
        "ground_truth_answer": greedy["answer"],
        "subject": greedy["subject"],
        "level": greedy["level"],
        "unique_id": greedy["unique_id"],
        # Greedy solution (first entry of the generated solutions)
        "greedy_solution": greedy["generated_solutions"][0],
        "greedy_extracted_answer": greedy["greedy_extracted_answer"],
        "greedy_correct": greedy["greedy_correct"],
        # Best-of-N results: weighted, standard, and majority-vote variants
        "bon_weighted_answer": bon["weighted_bon_answer"],
        "bon_weighted_correct": bon["weighted_bon_correct"],
        "bon_standard_answer": bon["standard_bon_answer"],
        "bon_standard_correct": bon["standard_bon_correct"],
        "bon_majority_answer": bon["majority_vote_answer"],
        "bon_majority_correct": bon["majority_vote_correct"],
        # All N=16 sampled solutions
        "sampled_solutions": scored["sampled_solutions"],
        "sampled_extracted_answers": scored["extracted_answers"],
        "sampled_prm_scores": scored["prm_scores"],
        # Summary stats; the breakdown is stored as a JSON-encoded string
        "n_correct_in_16": bon["n_correct_in_16"],
        "answer_score_breakdown": json.dumps(bon["answer_score_breakdown"]),
    }
    rows.append(row)
|
| 64 |
+
|
| 65 |
+
# ──────────────────────────────────────────────────────────────────────────────
# Create and push dataset
# ──────────────────────────────────────────────────────────────────────────────
# Stop before touching the Hub when there is nothing to upload; otherwise the
# sample-row preview below would fail on dataset[0] with an IndexError.
if not rows:
    raise ValueError("No rows were built; refusing to push an empty dataset")

dataset = Dataset.from_list(rows)
print(f"Created dataset with {len(dataset)} rows")
print(f"Columns: {dataset.column_names}")

# Preview a few columns of the first row, fetching that row only once.
sample_row = dataset[0]
print("\nSample row:")
for col in ["unique_id", "level", "subject", "ground_truth_answer", "greedy_correct", "bon_weighted_correct"]:
    print(f" {col}: {sample_row[col]}")

# Target dataset repository on the Hub; rows are uploaded as the "test" split.
DATASET_ID = "cmpatino/math500-bon-weighted-results"
dataset.push_to_hub(DATASET_ID, split="test")
print(f"\nDataset pushed to: https://huggingface.co/datasets/{DATASET_ID}")
|