| |
| |
| |
| |
| |
| |
| |
|
|
| """ |
| Build Agent Zero SFT v2 mixed dataset. |
| |
| Composition (~5K-8K examples): |
| 40% Agent tasks — agent-zero-sft-v1 (1,200) + agent-zero-training-data agentic split (~300) |
| 40% Math reasoning — MetaMathQA chain-of-thought samples (~3,000) |
| 20% General — OpenHermes-2.5 high-quality instruction samples (~1,500) |
| |
| All formatted as multi-turn conversations in HF messages format. |
| Pushed to: wheattoast11/agent-zero-sft-v2 |
| """ |
|
|
| import json |
| import os |
| import random |
| from pathlib import Path |
|
|
| from datasets import Dataset, DatasetDict, load_dataset |
| from huggingface_hub import login |
|
|
# Global seed so sampling (random.sample) and shuffling (random.shuffle)
# are reproducible across runs.
SEED = 42
random.seed(SEED)


# System prompt injected into every agent-task example: establishes the
# Agent Zero MCP-server persona and its tool-calling conventions.
AGENT_SYSTEM_PROMPT = (
    "You are Agent Zero, an intelligent MCP (Model Context Protocol) server that provides "
    "research, knowledge base, and tool orchestration capabilities. You understand:\n"
    "- MCP tool calling with parameter normalization and schema validation\n"
    "- Intent classification for routing queries to appropriate handlers\n"
    "- Signal protocol for multi-model consensus and crystallization detection\n"
    "- Async job management with status tracking\n"
    "- Rail protocol for inter-agent communication with backpressure\n"
    "- Sandbox security configuration and permission management\n\n"
    "Always respond with valid JSON tool calls when appropriate, classify user intents "
    "accurately, and maintain security boundaries."
)


# System prompt for the MetaMathQA chain-of-thought slice.
MATH_SYSTEM_PROMPT = (
    "You are a helpful assistant skilled in mathematical reasoning. "
    "Show your work step-by-step before giving the final answer."
)


# Default system prompt for the general-instruction (OpenHermes) slice.
GENERAL_SYSTEM_PROMPT = (
    "You are a helpful, harmless, and honest assistant."
)
|
|
|
|
def load_agent_data():
    """Collect agent-task conversations from both agent datasets.

    Combines the full train split of agent-zero-sft-v1 (already in messages
    format) with the optional 'agentic' split of agent-zero-training-data,
    whose instruction/output pairs are wrapped into a three-turn
    system/user/assistant conversation. If the agentic split cannot be
    loaded, proceeds with sft-v1 alone (best-effort, with a warning).

    Returns:
        list of {"messages": [...]} examples.
    """
    print("Loading agent-zero-sft-v1...")
    base = load_dataset(
        "wheattoast11/agent-zero-sft-v1",
        data_files="data/train.jsonl",
        split="train",
    )
    print(f" sft-v1 train: {len(base)} examples")

    examples = list(base)

    print("Loading agent-zero-training-data (agentic split)...")
    try:
        agentic = load_dataset(
            "wheattoast11/agent-zero-training-data",
            split="agentic",
        )
        print(f" training-data agentic: {len(agentic)} examples")
        # Wrap each instruction/output pair in the agent persona.
        examples.extend(
            {
                "messages": [
                    {"role": "system", "content": AGENT_SYSTEM_PROMPT},
                    {"role": "user", "content": rec["instruction"]},
                    {"role": "assistant", "content": rec["output"]},
                ]
            }
            for rec in agentic
        )
    except Exception as e:
        # Deliberate best-effort: the agentic split is optional.
        print(f" Warning: Could not load agentic split: {e}")
        print(" Continuing with sft-v1 only.")

    print(f" Total agent examples: {len(examples)}")
    return examples
|
|
|
|
def load_math_data(n=3000):
    """Sample n chain-of-thought examples from MetaMathQA.

    Args:
        n: number of examples to draw (capped at the dataset size).

    Returns:
        list of {"messages": [...]} examples with query as the user turn
        and the reference response as the assistant turn.
    """
    print(f"Loading MetaMathQA (sampling {n})...")
    full = load_dataset("meta-math/MetaMathQA", split="train")
    print(f" Full dataset: {len(full)} examples")

    # Seeded global RNG keeps the selection reproducible.
    picked = full.select(random.sample(range(len(full)), min(n, len(full))))

    result = [
        {
            "messages": [
                {"role": "system", "content": MATH_SYSTEM_PROMPT},
                {"role": "user", "content": rec["query"]},
                {"role": "assistant", "content": rec["response"]},
            ]
        }
        for rec in picked
    ]

    print(f" Sampled {len(result)} math examples")
    return result
|
|
|
|
def load_general_data(n=1500):
    """Sample n high-quality instruction examples from OpenHermes-2.5.

    Each ShareGPT-style conversation is converted to HF messages format.
    A row that carries its own system turn uses that text as the system
    message; otherwise GENERAL_SYSTEM_PROMPT is used. Rows whose final
    turn is not an assistant reply are dropped so every example ends with
    a training target; note the returned count may therefore be < n.

    Args:
        n: number of rows to sample (capped at the dataset size).

    Returns:
        list of {"messages": [...]} examples.
    """
    print(f"Loading OpenHermes-2.5 (sampling {n})...")
    ds = load_dataset("teknium/OpenHermes-2.5", split="train")
    print(f" Full dataset: {len(ds)} examples")

    indices = random.sample(range(len(ds)), min(n, len(ds)))
    samples = ds.select(indices)

    general_examples = []
    for row in samples:
        convos = row.get("conversations", [])
        if not convos:
            continue
        messages = _hermes_to_messages(convos)
        if messages is not None:
            general_examples.append({"messages": messages})

    print(f" Sampled {len(general_examples)} general examples")
    return general_examples


def _hermes_to_messages(convos):
    """Convert one ShareGPT turn list to messages format, or None if unusable.

    BUG FIX: OpenHermes rows may include a turn with from == "system";
    previously any non-human turn (including "system") was mapped to
    "assistant", inserting a spurious assistant message right after the
    injected default system prompt. A system turn now replaces the default
    system prompt instead.
    """
    system_content = GENERAL_SYSTEM_PROMPT
    turns = []
    for turn in convos:
        src = turn["from"]
        if src == "system":
            system_content = turn["value"]
        elif src in ("human", "user"):
            turns.append({"role": "user", "content": turn["value"]})
        else:
            # "gpt" and other model-side labels become the assistant role.
            turns.append({"role": "assistant", "content": turn["value"]})

    # Keep only conversations that end with an assistant reply.
    if not turns or turns[-1]["role"] != "assistant":
        return None
    return [{"role": "system", "content": system_content}] + turns
|
|
|
|
def build_splits(agent, math, general, val_ratio=0.1):
    """Combine the three example pools, shuffle, and split train/validation.

    Shuffling uses the module-level seeded `random` state, so splits are
    reproducible across runs.

    Args:
        agent: list of agent-task examples ({"messages": [...]}).
        math: list of math-reasoning examples.
        general: list of general-instruction examples.
        val_ratio: fraction of the combined data held out for validation.

    Returns:
        (train_data, val_data) tuple of example lists.

    Raises:
        ValueError: if all three pools are empty (previously this crashed
            with a ZeroDivisionError in the composition report).
    """
    all_examples = agent + math + general
    total = len(all_examples)
    if total == 0:
        raise ValueError("No examples to split: all source pools are empty.")
    random.shuffle(all_examples)

    print("\nDataset composition:")
    print(f" Agent: {len(agent):>5} ({100*len(agent)/total:.1f}%)")
    print(f" Math: {len(math):>5} ({100*len(math)/total:.1f}%)")
    print(f" General: {len(general):>5} ({100*len(general)/total:.1f}%)")
    print(f" Total: {total:>5}")

    val_size = int(total * val_ratio)
    val_data = all_examples[:val_size]
    train_data = all_examples[val_size:]

    print("\nSplit sizes:")
    print(f" Train: {len(train_data)}")
    print(f" Validation: {len(val_data)}")

    return train_data, val_data
|
|
|
|
def main():
    """Build the mixed SFT dataset, write JSONL files, and push to the Hub.

    Authenticates with HF_TOKEN when present, assembles the three example
    pools, splits them, writes train/validation JSONL under
    /tmp/agent-zero-sft-v2/data, and pushes a private DatasetDict to
    wheattoast11/agent-zero-sft-v2.
    """
    hf_token = os.getenv("HF_TOKEN")
    if hf_token:
        login(token=hf_token)

    # Arguments are evaluated left-to-right, preserving load order.
    train_data, val_data = build_splits(
        load_agent_data(),
        load_math_data(n=3000),
        load_general_data(n=1500),
    )

    # Local JSONL copies alongside the Hub push.
    data_dir = Path("/tmp/agent-zero-sft-v2") / "data"
    data_dir.mkdir(parents=True, exist_ok=True)

    for split_name, rows in (("train", train_data), ("validation", val_data)):
        path = data_dir / f"{split_name}.jsonl"
        with open(path, "w") as fh:
            fh.writelines(
                json.dumps(ex, ensure_ascii=False) + "\n" for ex in rows
            )
        print(f"Wrote {path} ({len(rows)} examples)")

    print("\nPushing to Hub as wheattoast11/agent-zero-sft-v2...")
    DatasetDict(
        {
            "train": Dataset.from_list(train_data),
            "validation": Dataset.from_list(val_data),
        }
    ).push_to_hub(
        "wheattoast11/agent-zero-sft-v2",
        private=True,
    )
    print("Done! Dataset at: https://huggingface.co/datasets/wheattoast11/agent-zero-sft-v2")
|
|
|
|
| if __name__ == "__main__": |
| main() |
|
|