forgeenv-source / warmstart / data / repair_pairs.jsonl
forgeenv source snapshot for training job
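Each line below is one self-contained supervised pair for the repair_agent role: a fixed system prompt, a user turn carrying the library version snapshot, the broken script, and the error trace, and an assistant turn holding the gold unified diff. A minimal loading sketch follows, assuming the file sits at a local data/repair_pairs.jsonl path; the path and the printed summary fields are illustrative assumptions, and only the record schema (role_target, task_id, primitive_type, category, messages) comes from the records themselves.

```python
# Minimal sketch: load repair_pairs.jsonl and summarize it. The local path is
# a hypothetical assumption; the field names match the records in this file.
import json
from collections import Counter

path = "data/repair_pairs.jsonl"  # hypothetical local path

records = []
with open(path, encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if line:  # tolerate blank lines between records
            records.append(json.loads(line))

# Each record is one (system, user, assistant) chat triple: the system prompt
# fixes the diff-only output contract, the user turn carries the broken script
# plus error trace, and the assistant turn is the gold unified diff.
print(f"{len(records)} pairs")
print("categories:", dict(Counter(r["category"] for r in records)))
print("primitives:", dict(Counter(r["primitive_type"] for r in records)))

# Example: pull the gold diff out of the first record.
first = records[0]
gold_diff = next(m["content"] for m in first["messages"] if m["role"] == "assistant")
print(gold_diff)
```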
{"role_target": "repair_agent", "task_id": "albert_qa", "primitive_type": "RenameApiCall", "category": "api_drift", "messages": [{"role": "system", "content": "You are the Repair Agent.\nYou see a broken HuggingFace training script, an error trace, and the current\nlibrary version snapshot. Output ONLY a unified diff that fixes the script.\n\nRules:\n1. Use canonical unified-diff format with `--- a/train.py` / `+++ b/train.py`\n headers and `@@ ... @@` hunk markers.\n2. Make the MINIMAL change that resolves the error AND preserves the original\n training intent. Do NOT add bare-except blocks, monkey-patches, or sys.exit\n calls.\n3. Do NOT add any prose, markdown fences, or thinking output \u2014 diff only.\n4. If the error is unfixable, output an empty diff.\n"}, {"role": "user", "content": "Library versions: transformers=4.40.0, datasets=2.18.0, trl=0.10.0\nTarget category hint: api_drift\n\nBroken script:\n```python\n\"\"\"ALBERT-tiny extractive QA on 100-sample SQuAD subset.\"\"\"\nfrom transformers import (\n AutoTokenizer,\n AutoModelForQuestionAnswering,\n Trainer,\n TrainingArguments,\n DefaultDataCollator,\n)\nfrom datasets import load_dataset\n\ndataset = load_dataset(\"squad\", split=\"train[:100]\")\ntokenizer = AutoTokenizer.from_pretrained(\"albert-base-v2\")\n\n\ndef prepare(examples):\n enc = tokenizer(\n examples[\"question\"],\n examples[\"context\"],\n max_length=128,\n truncation=\"only_second\",\n padding=\"max_length\",\n return_offsets_mapping=True,\n )\n start_positions, end_positions = [], []\n for i, offsets in enumerate(enc[\"offset_mapping\"]):\n answer = examples[\"answers\"][i]\n start_char = answer[\"answer_start\"][0]\n end_char = start_char + len(answer[\"text\"][0])\n\n token_start = next(\n (idx for idx, (a, b) in enumerate(offsets) if a <= start_char < b), 0\n )\n token_end = next(\n (idx for idx, (a, b) in enumerate(offsets) if a < end_char <= b), token_start\n )\n start_positions.append(token_start)\n end_positions.append(token_end)\n\n enc[\"start_positions\"] = start_positions\n enc[\"end_positions\"] = end_positions\n enc.pop(\"offset_mapping\")\n return enc\n\n\ndataset = dataset.map(prepare, batched=True, remove_columns=dataset.column_names)\n\nmodel = AutoModelForQuestionAnswering.from_pretrained(\"albert-base-v2\")\n\ntraining_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n num_train_epochs=1,\n per_device_train_batch_size=4,\n logging_steps=5,\n save_strategy=\"epoch\",\n no_cuda=True,\n report_to=\"none\",\n)\n\ntrainer = Trainer(\n model=model,\n args=training_args,\n train_dataset=dataset,\n data_collator=DefaultDataCollator(),\n)\ntrainer.start_training()\ntrainer.save_model(\"/tmp/forge_output/checkpoint\")\nprint(\"TRAINING_COMPLETE\")\n\n```\n\nError trace:\n[simulated] Rename trainer.train -> trainer.start_training\n\nOutput unified diff (no prose, no fences):"}, {"role": "assistant", "content": "--- a/train.py\n+++ b/train.py\n@@ -63,5 +63,5 @@\n data_collator=DefaultDataCollator(),\n )\n-trainer.start_training()\n+trainer.train()\n trainer.save_model(\"/tmp/forge_output/checkpoint\")\n print(\"TRAINING_COMPLETE\")\n"}]}
{"role_target": "repair_agent", "task_id": "albert_qa", "primitive_type": "RenameApiCall", "category": "api_drift", "messages": [{"role": "system", "content": "You are the Repair Agent.\nYou see a broken HuggingFace training script, an error trace, and the current\nlibrary version snapshot. Output ONLY a unified diff that fixes the script.\n\nRules:\n1. Use canonical unified-diff format with `--- a/train.py` / `+++ b/train.py`\n headers and `@@ ... @@` hunk markers.\n2. Make the MINIMAL change that resolves the error AND preserves the original\n training intent. Do NOT add bare-except blocks, monkey-patches, or sys.exit\n calls.\n3. Do NOT add any prose, markdown fences, or thinking output \u2014 diff only.\n4. If the error is unfixable, output an empty diff.\n"}, {"role": "user", "content": "Library versions: transformers=4.40.0, datasets=2.18.0, trl=0.10.0\nTarget category hint: api_drift\n\nBroken script:\n```python\n\"\"\"ALBERT-tiny extractive QA on 100-sample SQuAD subset.\"\"\"\nfrom transformers import (\n AutoTokenizer,\n AutoModelForQuestionAnswering,\n Trainer,\n TrainingArguments,\n DefaultDataCollator,\n)\nfrom datasets import load_dataset\n\ndataset = load_dataset(\"squad\", split=\"train[:100]\")\ntokenizer = AutoTokenizer.load_from_hub(\"albert-base-v2\")\n\n\ndef prepare(examples):\n enc = tokenizer(\n examples[\"question\"],\n examples[\"context\"],\n max_length=128,\n truncation=\"only_second\",\n padding=\"max_length\",\n return_offsets_mapping=True,\n )\n start_positions, end_positions = [], []\n for i, offsets in enumerate(enc[\"offset_mapping\"]):\n answer = examples[\"answers\"][i]\n start_char = answer[\"answer_start\"][0]\n end_char = start_char + len(answer[\"text\"][0])\n\n token_start = next(\n (idx for idx, (a, b) in enumerate(offsets) if a <= start_char < b), 0\n )\n token_end = next(\n (idx for idx, (a, b) in enumerate(offsets) if a < end_char <= b), token_start\n )\n start_positions.append(token_start)\n end_positions.append(token_end)\n\n enc[\"start_positions\"] = start_positions\n enc[\"end_positions\"] = end_positions\n enc.pop(\"offset_mapping\")\n return enc\n\n\ndataset = dataset.map(prepare, batched=True, remove_columns=dataset.column_names)\n\nmodel = AutoModelForQuestionAnswering.load_from_hub(\"albert-base-v2\")\n\ntraining_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n num_train_epochs=1,\n per_device_train_batch_size=4,\n logging_steps=5,\n save_strategy=\"epoch\",\n no_cuda=True,\n report_to=\"none\",\n)\n\ntrainer = Trainer(\n model=model,\n args=training_args,\n train_dataset=dataset,\n data_collator=DefaultDataCollator(),\n)\ntrainer.train()\ntrainer.save_model(\"/tmp/forge_output/checkpoint\")\nprint(\"TRAINING_COMPLETE\")\n\n```\n\nError trace:\n[simulated] Rename from_pretrained -> load_from_hub\n\nOutput unified diff (no prose, no fences):"}, {"role": "assistant", "content": "--- a/train.py\n+++ b/train.py\n@@ -10,5 +10,5 @@\n \n dataset = load_dataset(\"squad\", split=\"train[:100]\")\n-tokenizer = AutoTokenizer.load_from_hub(\"albert-base-v2\")\n+tokenizer = AutoTokenizer.from_pretrained(\"albert-base-v2\")\n \n \n@@ -45,5 +45,5 @@\n dataset = dataset.map(prepare, batched=True, remove_columns=dataset.column_names)\n \n-model = AutoModelForQuestionAnswering.load_from_hub(\"albert-base-v2\")\n+model = AutoModelForQuestionAnswering.from_pretrained(\"albert-base-v2\")\n \n training_args = TrainingArguments(\n"}]}
{"role_target": "repair_agent", "task_id": "albert_qa", "primitive_type": "ChangeArgumentSignature", "category": "api_drift", "messages": [{"role": "system", "content": "You are the Repair Agent.\nYou see a broken HuggingFace training script, an error trace, and the current\nlibrary version snapshot. Output ONLY a unified diff that fixes the script.\n\nRules:\n1. Use canonical unified-diff format with `--- a/train.py` / `+++ b/train.py`\n headers and `@@ ... @@` hunk markers.\n2. Make the MINIMAL change that resolves the error AND preserves the original\n training intent. Do NOT add bare-except blocks, monkey-patches, or sys.exit\n calls.\n3. Do NOT add any prose, markdown fences, or thinking output \u2014 diff only.\n4. If the error is unfixable, output an empty diff.\n"}, {"role": "user", "content": "Library versions: transformers=4.40.0, datasets=2.18.0, trl=0.10.0\nTarget category hint: api_drift\n\nBroken script:\n```python\n\"\"\"ALBERT-tiny extractive QA on 100-sample SQuAD subset.\"\"\"\nfrom transformers import (\n AutoTokenizer,\n AutoModelForQuestionAnswering,\n Trainer,\n TrainingArguments,\n DefaultDataCollator,\n)\nfrom datasets import load_dataset\n\ndataset = load_dataset(\"squad\", split=\"train[:100]\")\ntokenizer = AutoTokenizer.from_pretrained(\"albert-base-v2\")\n\n\ndef prepare(examples):\n enc = tokenizer(\n examples[\"question\"],\n examples[\"context\"],\n max_length=128,\n truncation=\"only_second\",\n padding=\"max_length\",\n return_offsets_mapping=True,\n )\n start_positions, end_positions = [], []\n for i, offsets in enumerate(enc[\"offset_mapping\"]):\n answer = examples[\"answers\"][i]\n start_char = answer[\"answer_start\"][0]\n end_char = start_char + len(answer[\"text\"][0])\n\n token_start = next(\n (idx for idx, (a, b) in enumerate(offsets) if a <= start_char < b), 0\n )\n token_end = next(\n (idx for idx, (a, b) in enumerate(offsets) if a < end_char <= b), token_start\n )\n start_positions.append(token_start)\n end_positions.append(token_end)\n\n enc[\"start_positions\"] = start_positions\n enc[\"end_positions\"] = end_positions\n enc.pop(\"offset_mapping\")\n return enc\n\n\ndataset = dataset.map(prepare, batched=True, remove_columns=dataset.column_names)\n\nmodel = AutoModelForQuestionAnswering.from_pretrained(\"albert-base-v2\")\n\ntraining_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n per_device_train_batch_size=4,\n logging_steps=5,\n save_strategy=\"epoch\",\n no_cuda=True,\n report_to=\"none\",\n)\n\ntrainer = Trainer(\n model=model,\n args=training_args,\n train_dataset=dataset,\n data_collator=DefaultDataCollator(),\n)\ntrainer.train()\ntrainer.save_model(\"/tmp/forge_output/checkpoint\")\nprint(\"TRAINING_COMPLETE\")\n\n```\n\nError trace:\n[simulated] Change args of TrainingArguments: -num_train_epochs +max_steps\n\nOutput unified diff (no prose, no fences):"}, {"role": "assistant", "content": "--- a/train.py\n+++ b/train.py\n@@ -49,4 +49,5 @@\n training_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n+ num_train_epochs=1,\n per_device_train_batch_size=4,\n logging_steps=5,\n"}]}
{"role_target": "repair_agent", "task_id": "albert_qa", "primitive_type": "ModifyConfigField", "category": "config_drift", "messages": [{"role": "system", "content": "You are the Repair Agent.\nYou see a broken HuggingFace training script, an error trace, and the current\nlibrary version snapshot. Output ONLY a unified diff that fixes the script.\n\nRules:\n1. Use canonical unified-diff format with `--- a/train.py` / `+++ b/train.py`\n headers and `@@ ... @@` hunk markers.\n2. Make the MINIMAL change that resolves the error AND preserves the original\n training intent. Do NOT add bare-except blocks, monkey-patches, or sys.exit\n calls.\n3. Do NOT add any prose, markdown fences, or thinking output \u2014 diff only.\n4. If the error is unfixable, output an empty diff.\n"}, {"role": "user", "content": "Library versions: transformers=4.40.0, datasets=2.18.0, trl=0.10.0\nTarget category hint: config_drift\n\nBroken script:\n```python\n\"\"\"ALBERT-tiny extractive QA on 100-sample SQuAD subset.\"\"\"\nfrom transformers import (\n AutoTokenizer,\n AutoModelForQuestionAnswering,\n Trainer,\n TrainingArguments,\n DefaultDataCollator,\n)\nfrom datasets import load_dataset\n\ndataset = load_dataset(\"squad\", split=\"train[:100]\")\ntokenizer = AutoTokenizer.from_pretrained(\"albert-base-v2\")\n\n\ndef prepare(examples):\n enc = tokenizer(\n examples[\"question\"],\n examples[\"context\"],\n max_length=128,\n truncation=\"only_second\",\n padding=\"max_length\",\n return_offsets_mapping=True,\n )\n start_positions, end_positions = [], []\n for i, offsets in enumerate(enc[\"offset_mapping\"]):\n answer = examples[\"answers\"][i]\n start_char = answer[\"answer_start\"][0]\n end_char = start_char + len(answer[\"text\"][0])\n\n token_start = next(\n (idx for idx, (a, b) in enumerate(offsets) if a <= start_char < b), 0\n )\n token_end = next(\n (idx for idx, (a, b) in enumerate(offsets) if a < end_char <= b), token_start\n )\n start_positions.append(token_start)\n end_positions.append(token_end)\n\n enc[\"start_positions\"] = start_positions\n enc[\"end_positions\"] = end_positions\n enc.pop(\"offset_mapping\")\n return enc\n\n\ndataset = dataset.map(prepare, batched=True, remove_columns=dataset.column_names)\n\nmodel = AutoModelForQuestionAnswering.from_pretrained(\"albert-base-v2\")\n\ntraining_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n num_train_epochs=1,\n per_device_train_batch_size=1,\n logging_steps=5,\n save_strategy=\"epoch\",\n no_cuda=True,\n report_to=\"none\",\n)\n\ntrainer = Trainer(\n model=model,\n args=training_args,\n train_dataset=dataset,\n data_collator=DefaultDataCollator(),\n)\ntrainer.train()\ntrainer.save_model(\"/tmp/forge_output/checkpoint\")\nprint(\"TRAINING_COMPLETE\")\n\n```\n\nError trace:\n[simulated] Change TrainingArguments.per_device_train_batch_size\n\nOutput unified diff (no prose, no fences):"}, {"role": "assistant", "content": "--- a/train.py\n+++ b/train.py\n@@ -50,5 +50,5 @@\n output_dir=\"/tmp/forge_output/checkpoint\",\n num_train_epochs=1,\n- per_device_train_batch_size=1,\n+ per_device_train_batch_size=4,\n logging_steps=5,\n save_strategy=\"epoch\",\n"}]}
{"role_target": "repair_agent", "task_id": "albert_qa", "primitive_type": "RestructureDatasetSchema", "category": "dataset_drift", "messages": [{"role": "system", "content": "You are the Repair Agent.\nYou see a broken HuggingFace training script, an error trace, and the current\nlibrary version snapshot. Output ONLY a unified diff that fixes the script.\n\nRules:\n1. Use canonical unified-diff format with `--- a/train.py` / `+++ b/train.py`\n headers and `@@ ... @@` hunk markers.\n2. Make the MINIMAL change that resolves the error AND preserves the original\n training intent. Do NOT add bare-except blocks, monkey-patches, or sys.exit\n calls.\n3. Do NOT add any prose, markdown fences, or thinking output \u2014 diff only.\n4. If the error is unfixable, output an empty diff.\n"}, {"role": "user", "content": "Library versions: transformers=4.40.0, datasets=2.18.0, trl=0.10.0\nTarget category hint: dataset_drift\n\nBroken script:\n```python\n\"\"\"ALBERT-tiny extractive QA on 100-sample SQuAD subset.\"\"\"\nfrom transformers import (\n AutoTokenizer,\n AutoModelForQuestionAnswering,\n Trainer,\n TrainingArguments,\n DefaultDataCollator,\n)\nfrom datasets import load_dataset\n\ndataset = load_dataset(\"squad\", split=\"train[:100]\")\ntokenizer = AutoTokenizer.from_pretrained(\"albert-base-v2\")\n\n\ndef prepare(examples):\n enc = tokenizer(\n examples[\"question\"],\n examples[\"context\"],\n max_length=128,\n truncation=\"only_second\",\n padding=\"max_length\",\n return_offsets_mapping=True,\n )\n start_positions, end_positions = [], []\n for i, offsets in enumerate(enc[\"offset_mapping\"]):\n answer = examples[\"answers\"][i]\n start_char = answer[\"answer_start\"][0]\n end_char = start_char + len(answer[\"input_text\"][0])\n\n token_start = next(\n (idx for idx, (a, b) in enumerate(offsets) if a <= start_char < b), 0\n )\n token_end = next(\n (idx for idx, (a, b) in enumerate(offsets) if a < end_char <= b), token_start\n )\n start_positions.append(token_start)\n end_positions.append(token_end)\n\n enc[\"start_positions\"] = start_positions\n enc[\"end_positions\"] = end_positions\n enc.pop(\"offset_mapping\")\n return enc\n\n\ndataset = dataset.map(prepare, batched=True, remove_columns=dataset.column_names)\n\nmodel = AutoModelForQuestionAnswering.from_pretrained(\"albert-base-v2\")\n\ntraining_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n num_train_epochs=1,\n per_device_train_batch_size=4,\n logging_steps=5,\n save_strategy=\"epoch\",\n no_cuda=True,\n report_to=\"none\",\n)\n\ntrainer = Trainer(\n model=model,\n args=training_args,\n train_dataset=dataset,\n data_collator=DefaultDataCollator(),\n)\ntrainer.train()\ntrainer.save_model(\"/tmp/forge_output/checkpoint\")\nprint(\"TRAINING_COMPLETE\")\n\n```\n\nError trace:\n[simulated] Rename column text -> input_text\n\nOutput unified diff (no prose, no fences):"}, {"role": "assistant", "content": "--- a/train.py\n+++ b/train.py\n@@ -26,5 +26,5 @@\n answer = examples[\"answers\"][i]\n start_char = answer[\"answer_start\"][0]\n- end_char = start_char + len(answer[\"input_text\"][0])\n+ end_char = start_char + len(answer[\"text\"][0])\n \n token_start = next(\n"}]}
{"role_target": "repair_agent", "task_id": "albert_qa", "primitive_type": "RemoveDeprecatedMethod", "category": "api_drift", "messages": [{"role": "system", "content": "You are the Repair Agent.\nYou see a broken HuggingFace training script, an error trace, and the current\nlibrary version snapshot. Output ONLY a unified diff that fixes the script.\n\nRules:\n1. Use canonical unified-diff format with `--- a/train.py` / `+++ b/train.py`\n headers and `@@ ... @@` hunk markers.\n2. Make the MINIMAL change that resolves the error AND preserves the original\n training intent. Do NOT add bare-except blocks, monkey-patches, or sys.exit\n calls.\n3. Do NOT add any prose, markdown fences, or thinking output \u2014 diff only.\n4. If the error is unfixable, output an empty diff.\n"}, {"role": "user", "content": "Library versions: transformers=4.40.0, datasets=2.18.0, trl=0.10.0\nTarget category hint: api_drift\n\nBroken script:\n```python\n\"\"\"ALBERT-tiny extractive QA on 100-sample SQuAD subset.\"\"\"\nfrom transformers import (\n AutoTokenizer,\n AutoModelForQuestionAnswering,\n Trainer,\n TrainingArguments,\n DefaultDataCollator,\n)\nfrom datasets import load_dataset\n\ndataset = load_dataset(\"squad\", split=\"train[:100]\")\ntokenizer = AutoTokenizer.from_pretrained(\"albert-base-v2\")\n\n\ndef prepare(examples):\n enc = tokenizer(\n examples[\"question\"],\n examples[\"context\"],\n max_length=128,\n truncation=\"only_second\",\n padding=\"max_length\",\n return_offsets_mapping=True,\n )\n start_positions, end_positions = [], []\n for i, offsets in enumerate(enc[\"offset_mapping\"]):\n answer = examples[\"answers\"][i]\n start_char = answer[\"answer_start\"][0]\n end_char = start_char + len(answer[\"text\"][0])\n\n token_start = next(\n (idx for idx, (a, b) in enumerate(offsets) if a <= start_char < b), 0\n )\n token_end = next(\n (idx for idx, (a, b) in enumerate(offsets) if a < end_char <= b), token_start\n )\n start_positions.append(token_start)\n end_positions.append(token_end)\n\n enc[\"start_positions\"] = start_positions\n enc[\"end_positions\"] = end_positions\n enc.pop(\"offset_mapping\")\n return enc\n\n\ndataset = dataset.map(prepare, batched=True, remove_columns=dataset.column_names)\n\nmodel = AutoModelForQuestionAnswering.from_pretrained(\"albert-base-v2\")\n\ntraining_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n num_train_epochs=1,\n per_device_train_batch_size=4,\n logging_steps=5,\n save_strategy=\"epoch\",\n no_cuda=True,\n report_to=\"none\",\n)\n\ntrainer = Trainer(\n model=model,\n args=training_args,\n train_dataset=dataset,\n data_collator=DefaultDataCollator(),\n)\ntrainer.train()\ntrainer.save_model_DEPRECATED(\"/tmp/forge_output/checkpoint\")\nprint(\"TRAINING_COMPLETE\")\n\n```\n\nError trace:\n[simulated] Remove Trainer.save_model\n\nOutput unified diff (no prose, no fences):"}, {"role": "assistant", "content": "--- a/train.py\n+++ b/train.py\n@@ -64,4 +64,4 @@\n )\n trainer.train()\n-trainer.save_model_DEPRECATED(\"/tmp/forge_output/checkpoint\")\n+trainer.save_model(\"/tmp/forge_output/checkpoint\")\n print(\"TRAINING_COMPLETE\")\n"}]}
{"role_target": "repair_agent", "task_id": "bert_ner", "primitive_type": "RenameApiCall", "category": "api_drift", "messages": [{"role": "system", "content": "You are the Repair Agent.\nYou see a broken HuggingFace training script, an error trace, and the current\nlibrary version snapshot. Output ONLY a unified diff that fixes the script.\n\nRules:\n1. Use canonical unified-diff format with `--- a/train.py` / `+++ b/train.py`\n headers and `@@ ... @@` hunk markers.\n2. Make the MINIMAL change that resolves the error AND preserves the original\n training intent. Do NOT add bare-except blocks, monkey-patches, or sys.exit\n calls.\n3. Do NOT add any prose, markdown fences, or thinking output \u2014 diff only.\n4. If the error is unfixable, output an empty diff.\n"}, {"role": "user", "content": "Library versions: transformers=4.40.0, datasets=2.18.0, trl=0.10.0\nTarget category hint: api_drift\n\nBroken script:\n```python\n\"\"\"Bert tiny NER fine-tuning on a 200-sample CoNLL-2003 subset.\"\"\"\nfrom transformers import (\n AutoTokenizer,\n AutoModelForTokenClassification,\n Trainer,\n TrainingArguments,\n DataCollatorForTokenClassification,\n)\nfrom datasets import load_dataset\n\ndataset = load_dataset(\"conll2003\", split=\"train[:200]\")\ntokenizer = AutoTokenizer.from_pretrained(\"prajjwal1/bert-tiny\")\n\n\ndef tokenize_and_align(example):\n enc = tokenizer(example[\"tokens\"], is_split_into_words=True, truncation=True, max_length=64)\n word_ids = enc.word_ids()\n labels = []\n prev_id = None\n for wid in word_ids:\n if wid is None:\n labels.append(-100)\n elif wid != prev_id:\n labels.append(example[\"ner_tags\"][wid])\n else:\n labels.append(-100)\n prev_id = wid\n enc[\"labels\"] = labels\n return enc\n\n\ndataset = dataset.map(tokenize_and_align, remove_columns=dataset.column_names)\n\nmodel = AutoModelForTokenClassification.from_pretrained(\"prajjwal1/bert-tiny\", num_labels=9)\n\ntraining_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n num_train_epochs=1,\n per_device_train_batch_size=8,\n logging_steps=5,\n save_strategy=\"epoch\",\n no_cuda=True,\n report_to=\"none\",\n)\n\ntrainer = Trainer(\n model=model,\n args=training_args,\n train_dataset=dataset,\n data_collator=DataCollatorForTokenClassification(tokenizer),\n)\n\ntrainer.start_training()\ntrainer.save_model(\"/tmp/forge_output/checkpoint\")\nprint(\"TRAINING_COMPLETE\")\n\n```\n\nError trace:\n[simulated] Rename trainer.train -> trainer.start_training\n\nOutput unified diff (no prose, no fences):"}, {"role": "assistant", "content": "--- a/train.py\n+++ b/train.py\n@@ -51,5 +51,5 @@\n )\n \n-trainer.start_training()\n+trainer.train()\n trainer.save_model(\"/tmp/forge_output/checkpoint\")\n print(\"TRAINING_COMPLETE\")\n"}]}
{"role_target": "repair_agent", "task_id": "bert_ner", "primitive_type": "RenameApiCall", "category": "api_drift", "messages": [{"role": "system", "content": "You are the Repair Agent.\nYou see a broken HuggingFace training script, an error trace, and the current\nlibrary version snapshot. Output ONLY a unified diff that fixes the script.\n\nRules:\n1. Use canonical unified-diff format with `--- a/train.py` / `+++ b/train.py`\n headers and `@@ ... @@` hunk markers.\n2. Make the MINIMAL change that resolves the error AND preserves the original\n training intent. Do NOT add bare-except blocks, monkey-patches, or sys.exit\n calls.\n3. Do NOT add any prose, markdown fences, or thinking output \u2014 diff only.\n4. If the error is unfixable, output an empty diff.\n"}, {"role": "user", "content": "Library versions: transformers=4.40.0, datasets=2.18.0, trl=0.10.0\nTarget category hint: api_drift\n\nBroken script:\n```python\n\"\"\"Bert tiny NER fine-tuning on a 200-sample CoNLL-2003 subset.\"\"\"\nfrom transformers import (\n AutoTokenizer,\n AutoModelForTokenClassification,\n Trainer,\n TrainingArguments,\n DataCollatorForTokenClassification,\n)\nfrom datasets import load_dataset\n\ndataset = load_dataset(\"conll2003\", split=\"train[:200]\")\ntokenizer = AutoTokenizer.load_from_hub(\"prajjwal1/bert-tiny\")\n\n\ndef tokenize_and_align(example):\n enc = tokenizer(example[\"tokens\"], is_split_into_words=True, truncation=True, max_length=64)\n word_ids = enc.word_ids()\n labels = []\n prev_id = None\n for wid in word_ids:\n if wid is None:\n labels.append(-100)\n elif wid != prev_id:\n labels.append(example[\"ner_tags\"][wid])\n else:\n labels.append(-100)\n prev_id = wid\n enc[\"labels\"] = labels\n return enc\n\n\ndataset = dataset.map(tokenize_and_align, remove_columns=dataset.column_names)\n\nmodel = AutoModelForTokenClassification.load_from_hub(\"prajjwal1/bert-tiny\", num_labels=9)\n\ntraining_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n num_train_epochs=1,\n per_device_train_batch_size=8,\n logging_steps=5,\n save_strategy=\"epoch\",\n no_cuda=True,\n report_to=\"none\",\n)\n\ntrainer = Trainer(\n model=model,\n args=training_args,\n train_dataset=dataset,\n data_collator=DataCollatorForTokenClassification(tokenizer),\n)\n\ntrainer.train()\ntrainer.save_model(\"/tmp/forge_output/checkpoint\")\nprint(\"TRAINING_COMPLETE\")\n\n```\n\nError trace:\n[simulated] Rename from_pretrained -> load_from_hub\n\nOutput unified diff (no prose, no fences):"}, {"role": "assistant", "content": "--- a/train.py\n+++ b/train.py\n@@ -10,5 +10,5 @@\n \n dataset = load_dataset(\"conll2003\", split=\"train[:200]\")\n-tokenizer = AutoTokenizer.load_from_hub(\"prajjwal1/bert-tiny\")\n+tokenizer = AutoTokenizer.from_pretrained(\"prajjwal1/bert-tiny\")\n \n \n@@ -32,5 +32,5 @@\n dataset = dataset.map(tokenize_and_align, remove_columns=dataset.column_names)\n \n-model = AutoModelForTokenClassification.load_from_hub(\"prajjwal1/bert-tiny\", num_labels=9)\n+model = AutoModelForTokenClassification.from_pretrained(\"prajjwal1/bert-tiny\", num_labels=9)\n \n training_args = TrainingArguments(\n"}]}
{"role_target": "repair_agent", "task_id": "bert_ner", "primitive_type": "ChangeArgumentSignature", "category": "api_drift", "messages": [{"role": "system", "content": "You are the Repair Agent.\nYou see a broken HuggingFace training script, an error trace, and the current\nlibrary version snapshot. Output ONLY a unified diff that fixes the script.\n\nRules:\n1. Use canonical unified-diff format with `--- a/train.py` / `+++ b/train.py`\n headers and `@@ ... @@` hunk markers.\n2. Make the MINIMAL change that resolves the error AND preserves the original\n training intent. Do NOT add bare-except blocks, monkey-patches, or sys.exit\n calls.\n3. Do NOT add any prose, markdown fences, or thinking output \u2014 diff only.\n4. If the error is unfixable, output an empty diff.\n"}, {"role": "user", "content": "Library versions: transformers=4.40.0, datasets=2.18.0, trl=0.10.0\nTarget category hint: api_drift\n\nBroken script:\n```python\n\"\"\"Bert tiny NER fine-tuning on a 200-sample CoNLL-2003 subset.\"\"\"\nfrom transformers import (\n AutoTokenizer,\n AutoModelForTokenClassification,\n Trainer,\n TrainingArguments,\n DataCollatorForTokenClassification,\n)\nfrom datasets import load_dataset\n\ndataset = load_dataset(\"conll2003\", split=\"train[:200]\")\ntokenizer = AutoTokenizer.from_pretrained(\"prajjwal1/bert-tiny\")\n\n\ndef tokenize_and_align(example):\n enc = tokenizer(example[\"tokens\"], is_split_into_words=True, truncation=True, max_length=64)\n word_ids = enc.word_ids()\n labels = []\n prev_id = None\n for wid in word_ids:\n if wid is None:\n labels.append(-100)\n elif wid != prev_id:\n labels.append(example[\"ner_tags\"][wid])\n else:\n labels.append(-100)\n prev_id = wid\n enc[\"labels\"] = labels\n return enc\n\n\ndataset = dataset.map(tokenize_and_align, remove_columns=dataset.column_names)\n\nmodel = AutoModelForTokenClassification.from_pretrained(\"prajjwal1/bert-tiny\", num_labels=9)\n\ntraining_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n per_device_train_batch_size=8,\n logging_steps=5,\n save_strategy=\"epoch\",\n no_cuda=True,\n report_to=\"none\",\n)\n\ntrainer = Trainer(\n model=model,\n args=training_args,\n train_dataset=dataset,\n data_collator=DataCollatorForTokenClassification(tokenizer),\n)\n\ntrainer.train()\ntrainer.save_model(\"/tmp/forge_output/checkpoint\")\nprint(\"TRAINING_COMPLETE\")\n\n```\n\nError trace:\n[simulated] Change args of TrainingArguments: -num_train_epochs +max_steps\n\nOutput unified diff (no prose, no fences):"}, {"role": "assistant", "content": "--- a/train.py\n+++ b/train.py\n@@ -36,4 +36,5 @@\n training_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n+ num_train_epochs=1,\n per_device_train_batch_size=8,\n logging_steps=5,\n"}]}
{"role_target": "repair_agent", "task_id": "bert_ner", "primitive_type": "ModifyConfigField", "category": "config_drift", "messages": [{"role": "system", "content": "You are the Repair Agent.\nYou see a broken HuggingFace training script, an error trace, and the current\nlibrary version snapshot. Output ONLY a unified diff that fixes the script.\n\nRules:\n1. Use canonical unified-diff format with `--- a/train.py` / `+++ b/train.py`\n headers and `@@ ... @@` hunk markers.\n2. Make the MINIMAL change that resolves the error AND preserves the original\n training intent. Do NOT add bare-except blocks, monkey-patches, or sys.exit\n calls.\n3. Do NOT add any prose, markdown fences, or thinking output \u2014 diff only.\n4. If the error is unfixable, output an empty diff.\n"}, {"role": "user", "content": "Library versions: transformers=4.40.0, datasets=2.18.0, trl=0.10.0\nTarget category hint: config_drift\n\nBroken script:\n```python\n\"\"\"Bert tiny NER fine-tuning on a 200-sample CoNLL-2003 subset.\"\"\"\nfrom transformers import (\n AutoTokenizer,\n AutoModelForTokenClassification,\n Trainer,\n TrainingArguments,\n DataCollatorForTokenClassification,\n)\nfrom datasets import load_dataset\n\ndataset = load_dataset(\"conll2003\", split=\"train[:200]\")\ntokenizer = AutoTokenizer.from_pretrained(\"prajjwal1/bert-tiny\")\n\n\ndef tokenize_and_align(example):\n enc = tokenizer(example[\"tokens\"], is_split_into_words=True, truncation=True, max_length=64)\n word_ids = enc.word_ids()\n labels = []\n prev_id = None\n for wid in word_ids:\n if wid is None:\n labels.append(-100)\n elif wid != prev_id:\n labels.append(example[\"ner_tags\"][wid])\n else:\n labels.append(-100)\n prev_id = wid\n enc[\"labels\"] = labels\n return enc\n\n\ndataset = dataset.map(tokenize_and_align, remove_columns=dataset.column_names)\n\nmodel = AutoModelForTokenClassification.from_pretrained(\"prajjwal1/bert-tiny\", num_labels=9)\n\ntraining_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n num_train_epochs=1,\n per_device_train_batch_size=1,\n logging_steps=5,\n save_strategy=\"epoch\",\n no_cuda=True,\n report_to=\"none\",\n)\n\ntrainer = Trainer(\n model=model,\n args=training_args,\n train_dataset=dataset,\n data_collator=DataCollatorForTokenClassification(tokenizer),\n)\n\ntrainer.train()\ntrainer.save_model(\"/tmp/forge_output/checkpoint\")\nprint(\"TRAINING_COMPLETE\")\n\n```\n\nError trace:\n[simulated] Change TrainingArguments.per_device_train_batch_size\n\nOutput unified diff (no prose, no fences):"}, {"role": "assistant", "content": "--- a/train.py\n+++ b/train.py\n@@ -37,5 +37,5 @@\n output_dir=\"/tmp/forge_output/checkpoint\",\n num_train_epochs=1,\n- per_device_train_batch_size=1,\n+ per_device_train_batch_size=8,\n logging_steps=5,\n save_strategy=\"epoch\",\n"}]}
{"role_target": "repair_agent", "task_id": "bert_ner", "primitive_type": "RestructureDatasetSchema", "category": "dataset_drift", "messages": [{"role": "system", "content": "You are the Repair Agent.\nYou see a broken HuggingFace training script, an error trace, and the current\nlibrary version snapshot. Output ONLY a unified diff that fixes the script.\n\nRules:\n1. Use canonical unified-diff format with `--- a/train.py` / `+++ b/train.py`\n headers and `@@ ... @@` hunk markers.\n2. Make the MINIMAL change that resolves the error AND preserves the original\n training intent. Do NOT add bare-except blocks, monkey-patches, or sys.exit\n calls.\n3. Do NOT add any prose, markdown fences, or thinking output \u2014 diff only.\n4. If the error is unfixable, output an empty diff.\n"}, {"role": "user", "content": "Library versions: transformers=4.40.0, datasets=2.18.0, trl=0.10.0\nTarget category hint: dataset_drift\n\nBroken script:\n```python\n\"\"\"Bert tiny NER fine-tuning on a 200-sample CoNLL-2003 subset.\"\"\"\nfrom transformers import (\n AutoTokenizer,\n AutoModelForTokenClassification,\n Trainer,\n TrainingArguments,\n DataCollatorForTokenClassification,\n)\nfrom datasets import load_dataset\n\ndataset = load_dataset(\"conll2003\", split=\"train[:200]\")\ntokenizer = AutoTokenizer.from_pretrained(\"prajjwal1/bert-tiny\")\n\n\ndef tokenize_and_align(example):\n enc = tokenizer(example[\"words\"], is_split_into_words=True, truncation=True, max_length=64)\n word_ids = enc.word_ids()\n labels = []\n prev_id = None\n for wid in word_ids:\n if wid is None:\n labels.append(-100)\n elif wid != prev_id:\n labels.append(example[\"ner_tags\"][wid])\n else:\n labels.append(-100)\n prev_id = wid\n enc[\"labels\"] = labels\n return enc\n\n\ndataset = dataset.map(tokenize_and_align, remove_columns=dataset.column_names)\n\nmodel = AutoModelForTokenClassification.from_pretrained(\"prajjwal1/bert-tiny\", num_labels=9)\n\ntraining_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n num_train_epochs=1,\n per_device_train_batch_size=8,\n logging_steps=5,\n save_strategy=\"epoch\",\n no_cuda=True,\n report_to=\"none\",\n)\n\ntrainer = Trainer(\n model=model,\n args=training_args,\n train_dataset=dataset,\n data_collator=DataCollatorForTokenClassification(tokenizer),\n)\n\ntrainer.train()\ntrainer.save_model(\"/tmp/forge_output/checkpoint\")\nprint(\"TRAINING_COMPLETE\")\n\n```\n\nError trace:\n[simulated] Rename column tokens -> words\n\nOutput unified diff (no prose, no fences):"}, {"role": "assistant", "content": "--- a/train.py\n+++ b/train.py\n@@ -14,5 +14,5 @@\n \n def tokenize_and_align(example):\n- enc = tokenizer(example[\"words\"], is_split_into_words=True, truncation=True, max_length=64)\n+ enc = tokenizer(example[\"tokens\"], is_split_into_words=True, truncation=True, max_length=64)\n word_ids = enc.word_ids()\n labels = []\n"}]}
{"role_target": "repair_agent", "task_id": "bert_ner", "primitive_type": "ChangeTokenizerBehavior", "category": "tokenizer_drift", "messages": [{"role": "system", "content": "You are the Repair Agent.\nYou see a broken HuggingFace training script, an error trace, and the current\nlibrary version snapshot. Output ONLY a unified diff that fixes the script.\n\nRules:\n1. Use canonical unified-diff format with `--- a/train.py` / `+++ b/train.py`\n headers and `@@ ... @@` hunk markers.\n2. Make the MINIMAL change that resolves the error AND preserves the original\n training intent. Do NOT add bare-except blocks, monkey-patches, or sys.exit\n calls.\n3. Do NOT add any prose, markdown fences, or thinking output \u2014 diff only.\n4. If the error is unfixable, output an empty diff.\n"}, {"role": "user", "content": "Library versions: transformers=4.40.0, datasets=2.18.0, trl=0.10.0\nTarget category hint: tokenizer_drift\n\nBroken script:\n```python\n\"\"\"Bert tiny NER fine-tuning on a 200-sample CoNLL-2003 subset.\"\"\"\nfrom transformers import (\n AutoTokenizer,\n AutoModelForTokenClassification,\n Trainer,\n TrainingArguments,\n DataCollatorForTokenClassification,\n)\nfrom datasets import load_dataset\n\ndataset = load_dataset(\"conll2003\", split=\"train[:200]\")\ntokenizer = AutoTokenizer.from_pretrained(\"prajjwal1/bert-tiny\")\n\n\ndef tokenize_and_align(example):\n enc = tokenizer(example[\"tokens\"], is_split_into_words=True, truncate=True, max_length=64)\n word_ids = enc.word_ids()\n labels = []\n prev_id = None\n for wid in word_ids:\n if wid is None:\n labels.append(-100)\n elif wid != prev_id:\n labels.append(example[\"ner_tags\"][wid])\n else:\n labels.append(-100)\n prev_id = wid\n enc[\"labels\"] = labels\n return enc\n\n\ndataset = dataset.map(tokenize_and_align, remove_columns=dataset.column_names)\n\nmodel = AutoModelForTokenClassification.from_pretrained(\"prajjwal1/bert-tiny\", num_labels=9)\n\ntraining_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n num_train_epochs=1,\n per_device_train_batch_size=8,\n logging_steps=5,\n save_strategy=\"epoch\",\n no_cuda=True,\n report_to=\"none\",\n)\n\ntrainer = Trainer(\n model=model,\n args=training_args,\n train_dataset=dataset,\n data_collator=DataCollatorForTokenClassification(tokenizer),\n)\n\ntrainer.train()\ntrainer.save_model(\"/tmp/forge_output/checkpoint\")\nprint(\"TRAINING_COMPLETE\")\n\n```\n\nError trace:\n[simulated] Change tokenizer kwarg truncation=True -> truncate=True\n\nOutput unified diff (no prose, no fences):"}, {"role": "assistant", "content": "--- a/train.py\n+++ b/train.py\n@@ -14,5 +14,5 @@\n \n def tokenize_and_align(example):\n- enc = tokenizer(example[\"tokens\"], is_split_into_words=True, truncate=True, max_length=64)\n+ enc = tokenizer(example[\"tokens\"], is_split_into_words=True, truncation=True, max_length=64)\n word_ids = enc.word_ids()\n labels = []\n"}]}
{"role_target": "repair_agent", "task_id": "bert_ner", "primitive_type": "RemoveDeprecatedMethod", "category": "api_drift", "messages": [{"role": "system", "content": "You are the Repair Agent.\nYou see a broken HuggingFace training script, an error trace, and the current\nlibrary version snapshot. Output ONLY a unified diff that fixes the script.\n\nRules:\n1. Use canonical unified-diff format with `--- a/train.py` / `+++ b/train.py`\n headers and `@@ ... @@` hunk markers.\n2. Make the MINIMAL change that resolves the error AND preserves the original\n training intent. Do NOT add bare-except blocks, monkey-patches, or sys.exit\n calls.\n3. Do NOT add any prose, markdown fences, or thinking output \u2014 diff only.\n4. If the error is unfixable, output an empty diff.\n"}, {"role": "user", "content": "Library versions: transformers=4.40.0, datasets=2.18.0, trl=0.10.0\nTarget category hint: api_drift\n\nBroken script:\n```python\n\"\"\"Bert tiny NER fine-tuning on a 200-sample CoNLL-2003 subset.\"\"\"\nfrom transformers import (\n AutoTokenizer,\n AutoModelForTokenClassification,\n Trainer,\n TrainingArguments,\n DataCollatorForTokenClassification,\n)\nfrom datasets import load_dataset\n\ndataset = load_dataset(\"conll2003\", split=\"train[:200]\")\ntokenizer = AutoTokenizer.from_pretrained(\"prajjwal1/bert-tiny\")\n\n\ndef tokenize_and_align(example):\n enc = tokenizer(example[\"tokens\"], is_split_into_words=True, truncation=True, max_length=64)\n word_ids = enc.word_ids()\n labels = []\n prev_id = None\n for wid in word_ids:\n if wid is None:\n labels.append(-100)\n elif wid != prev_id:\n labels.append(example[\"ner_tags\"][wid])\n else:\n labels.append(-100)\n prev_id = wid\n enc[\"labels\"] = labels\n return enc\n\n\ndataset = dataset.map(tokenize_and_align, remove_columns=dataset.column_names)\n\nmodel = AutoModelForTokenClassification.from_pretrained(\"prajjwal1/bert-tiny\", num_labels=9)\n\ntraining_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n num_train_epochs=1,\n per_device_train_batch_size=8,\n logging_steps=5,\n save_strategy=\"epoch\",\n no_cuda=True,\n report_to=\"none\",\n)\n\ntrainer = Trainer(\n model=model,\n args=training_args,\n train_dataset=dataset,\n data_collator=DataCollatorForTokenClassification(tokenizer),\n)\n\ntrainer.train()\ntrainer.save_model_DEPRECATED(\"/tmp/forge_output/checkpoint\")\nprint(\"TRAINING_COMPLETE\")\n\n```\n\nError trace:\n[simulated] Remove Trainer.save_model\n\nOutput unified diff (no prose, no fences):"}, {"role": "assistant", "content": "--- a/train.py\n+++ b/train.py\n@@ -52,4 +52,4 @@\n \n trainer.train()\n-trainer.save_model_DEPRECATED(\"/tmp/forge_output/checkpoint\")\n+trainer.save_model(\"/tmp/forge_output/checkpoint\")\n print(\"TRAINING_COMPLETE\")\n"}]}
{"role_target": "repair_agent", "task_id": "distilbert_sst2", "primitive_type": "RenameApiCall", "category": "api_drift", "messages": [{"role": "system", "content": "You are the Repair Agent.\nYou see a broken HuggingFace training script, an error trace, and the current\nlibrary version snapshot. Output ONLY a unified diff that fixes the script.\n\nRules:\n1. Use canonical unified-diff format with `--- a/train.py` / `+++ b/train.py`\n headers and `@@ ... @@` hunk markers.\n2. Make the MINIMAL change that resolves the error AND preserves the original\n training intent. Do NOT add bare-except blocks, monkey-patches, or sys.exit\n calls.\n3. Do NOT add any prose, markdown fences, or thinking output \u2014 diff only.\n4. If the error is unfixable, output an empty diff.\n"}, {"role": "user", "content": "Library versions: transformers=4.40.0, datasets=2.18.0, trl=0.10.0\nTarget category hint: api_drift\n\nBroken script:\n```python\n\"\"\"DistilBERT fine-tuning on a tiny SST-2 subset.\n\nMinimal HuggingFace text-classification training script. Should complete\nin ~60s on CPU.\n\"\"\"\nfrom transformers import (\n DistilBertTokenizer,\n DistilBertForSequenceClassification,\n Trainer,\n TrainingArguments,\n)\nfrom datasets import load_dataset\n\ndataset = load_dataset(\"glue\", \"sst2\", split=\"train[:500]\")\ntokenizer = DistilBertTokenizer.from_pretrained(\"distilbert-base-uncased\")\n\n\ndef tokenize_function(examples):\n return tokenizer(\n examples[\"sentence\"],\n padding=\"max_length\",\n truncation=True,\n max_length=64,\n )\n\n\ndataset = dataset.map(tokenize_function, batched=True)\ndataset = dataset.rename_column(\"label\", \"labels\")\ndataset.set_format(\"torch\", columns=[\"input_ids\", \"attention_mask\", \"labels\"])\n\nmodel = DistilBertForSequenceClassification.from_pretrained(\n \"distilbert-base-uncased\", num_labels=2\n)\n\ntraining_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n num_train_epochs=1,\n per_device_train_batch_size=16,\n logging_steps=5,\n save_strategy=\"epoch\",\n no_cuda=True,\n report_to=\"none\",\n)\n\ntrainer = Trainer(\n model=model,\n args=training_args,\n train_dataset=dataset,\n)\n\ntrainer.start_training()\ntrainer.save_model(\"/tmp/forge_output/checkpoint\")\nprint(\"TRAINING_COMPLETE\")\n\n```\n\nError trace:\n[simulated] Rename trainer.train -> trainer.start_training\n\nOutput unified diff (no prose, no fences):"}, {"role": "assistant", "content": "--- a/train.py\n+++ b/train.py\n@@ -49,5 +49,5 @@\n )\n \n-trainer.start_training()\n+trainer.train()\n trainer.save_model(\"/tmp/forge_output/checkpoint\")\n print(\"TRAINING_COMPLETE\")\n"}]}
{"role_target": "repair_agent", "task_id": "distilbert_sst2", "primitive_type": "RenameApiCall", "category": "api_drift", "messages": [{"role": "system", "content": "You are the Repair Agent.\nYou see a broken HuggingFace training script, an error trace, and the current\nlibrary version snapshot. Output ONLY a unified diff that fixes the script.\n\nRules:\n1. Use canonical unified-diff format with `--- a/train.py` / `+++ b/train.py`\n headers and `@@ ... @@` hunk markers.\n2. Make the MINIMAL change that resolves the error AND preserves the original\n training intent. Do NOT add bare-except blocks, monkey-patches, or sys.exit\n calls.\n3. Do NOT add any prose, markdown fences, or thinking output \u2014 diff only.\n4. If the error is unfixable, output an empty diff.\n"}, {"role": "user", "content": "Library versions: transformers=4.40.0, datasets=2.18.0, trl=0.10.0\nTarget category hint: api_drift\n\nBroken script:\n```python\n\"\"\"DistilBERT fine-tuning on a tiny SST-2 subset.\n\nMinimal HuggingFace text-classification training script. Should complete\nin ~60s on CPU.\n\"\"\"\nfrom transformers import (\n DistilBertTokenizer,\n DistilBertForSequenceClassification,\n Trainer,\n TrainingArguments,\n)\nfrom datasets import load_dataset\n\ndataset = load_dataset(\"glue\", \"sst2\", split=\"train[:500]\")\ntokenizer = DistilBertTokenizer.load_from_hub(\"distilbert-base-uncased\")\n\n\ndef tokenize_function(examples):\n return tokenizer(\n examples[\"sentence\"],\n padding=\"max_length\",\n truncation=True,\n max_length=64,\n )\n\n\ndataset = dataset.map(tokenize_function, batched=True)\ndataset = dataset.rename_column(\"label\", \"labels\")\ndataset.set_format(\"torch\", columns=[\"input_ids\", \"attention_mask\", \"labels\"])\n\nmodel = DistilBertForSequenceClassification.load_from_hub(\n \"distilbert-base-uncased\", num_labels=2\n)\n\ntraining_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n num_train_epochs=1,\n per_device_train_batch_size=16,\n logging_steps=5,\n save_strategy=\"epoch\",\n no_cuda=True,\n report_to=\"none\",\n)\n\ntrainer = Trainer(\n model=model,\n args=training_args,\n train_dataset=dataset,\n)\n\ntrainer.train()\ntrainer.save_model(\"/tmp/forge_output/checkpoint\")\nprint(\"TRAINING_COMPLETE\")\n\n```\n\nError trace:\n[simulated] Rename from_pretrained -> load_from_hub\n\nOutput unified diff (no prose, no fences):"}, {"role": "assistant", "content": "--- a/train.py\n+++ b/train.py\n@@ -13,5 +13,5 @@\n \n dataset = load_dataset(\"glue\", \"sst2\", split=\"train[:500]\")\n-tokenizer = DistilBertTokenizer.load_from_hub(\"distilbert-base-uncased\")\n+tokenizer = DistilBertTokenizer.from_pretrained(\"distilbert-base-uncased\")\n \n \n@@ -29,5 +29,5 @@\n dataset.set_format(\"torch\", columns=[\"input_ids\", \"attention_mask\", \"labels\"])\n \n-model = DistilBertForSequenceClassification.load_from_hub(\n+model = DistilBertForSequenceClassification.from_pretrained(\n \"distilbert-base-uncased\", num_labels=2\n )\n"}]}
{"role_target": "repair_agent", "task_id": "distilbert_sst2", "primitive_type": "ChangeArgumentSignature", "category": "api_drift", "messages": [{"role": "system", "content": "You are the Repair Agent.\nYou see a broken HuggingFace training script, an error trace, and the current\nlibrary version snapshot. Output ONLY a unified diff that fixes the script.\n\nRules:\n1. Use canonical unified-diff format with `--- a/train.py` / `+++ b/train.py`\n headers and `@@ ... @@` hunk markers.\n2. Make the MINIMAL change that resolves the error AND preserves the original\n training intent. Do NOT add bare-except blocks, monkey-patches, or sys.exit\n calls.\n3. Do NOT add any prose, markdown fences, or thinking output \u2014 diff only.\n4. If the error is unfixable, output an empty diff.\n"}, {"role": "user", "content": "Library versions: transformers=4.40.0, datasets=2.18.0, trl=0.10.0\nTarget category hint: api_drift\n\nBroken script:\n```python\n\"\"\"DistilBERT fine-tuning on a tiny SST-2 subset.\n\nMinimal HuggingFace text-classification training script. Should complete\nin ~60s on CPU.\n\"\"\"\nfrom transformers import (\n DistilBertTokenizer,\n DistilBertForSequenceClassification,\n Trainer,\n TrainingArguments,\n)\nfrom datasets import load_dataset\n\ndataset = load_dataset(\"glue\", \"sst2\", split=\"train[:500]\")\ntokenizer = DistilBertTokenizer.from_pretrained(\"distilbert-base-uncased\")\n\n\ndef tokenize_function(examples):\n return tokenizer(\n examples[\"sentence\"],\n padding=\"max_length\",\n truncation=True,\n max_length=64,\n )\n\n\ndataset = dataset.map(tokenize_function, batched=True)\ndataset = dataset.rename_column(\"label\", \"labels\")\ndataset.set_format(\"torch\", columns=[\"input_ids\", \"attention_mask\", \"labels\"])\n\nmodel = DistilBertForSequenceClassification.from_pretrained(\n \"distilbert-base-uncased\", num_labels=2\n)\n\ntraining_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n per_device_train_batch_size=16,\n logging_steps=5,\n save_strategy=\"epoch\",\n no_cuda=True,\n report_to=\"none\",\n)\n\ntrainer = Trainer(\n model=model,\n args=training_args,\n train_dataset=dataset,\n)\n\ntrainer.train()\ntrainer.save_model(\"/tmp/forge_output/checkpoint\")\nprint(\"TRAINING_COMPLETE\")\n\n```\n\nError trace:\n[simulated] Change args of TrainingArguments: -num_train_epochs +max_steps\n\nOutput unified diff (no prose, no fences):"}, {"role": "assistant", "content": "--- a/train.py\n+++ b/train.py\n@@ -35,4 +35,5 @@\n training_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n+ num_train_epochs=1,\n per_device_train_batch_size=16,\n logging_steps=5,\n"}]}
{"role_target": "repair_agent", "task_id": "distilbert_sst2", "primitive_type": "ModifyConfigField", "category": "config_drift", "messages": [{"role": "system", "content": "You are the Repair Agent.\nYou see a broken HuggingFace training script, an error trace, and the current\nlibrary version snapshot. Output ONLY a unified diff that fixes the script.\n\nRules:\n1. Use canonical unified-diff format with `--- a/train.py` / `+++ b/train.py`\n headers and `@@ ... @@` hunk markers.\n2. Make the MINIMAL change that resolves the error AND preserves the original\n training intent. Do NOT add bare-except blocks, monkey-patches, or sys.exit\n calls.\n3. Do NOT add any prose, markdown fences, or thinking output \u2014 diff only.\n4. If the error is unfixable, output an empty diff.\n"}, {"role": "user", "content": "Library versions: transformers=4.40.0, datasets=2.18.0, trl=0.10.0\nTarget category hint: config_drift\n\nBroken script:\n```python\n\"\"\"DistilBERT fine-tuning on a tiny SST-2 subset.\n\nMinimal HuggingFace text-classification training script. Should complete\nin ~60s on CPU.\n\"\"\"\nfrom transformers import (\n DistilBertTokenizer,\n DistilBertForSequenceClassification,\n Trainer,\n TrainingArguments,\n)\nfrom datasets import load_dataset\n\ndataset = load_dataset(\"glue\", \"sst2\", split=\"train[:500]\")\ntokenizer = DistilBertTokenizer.from_pretrained(\"distilbert-base-uncased\")\n\n\ndef tokenize_function(examples):\n return tokenizer(\n examples[\"sentence\"],\n padding=\"max_length\",\n truncation=True,\n max_length=64,\n )\n\n\ndataset = dataset.map(tokenize_function, batched=True)\ndataset = dataset.rename_column(\"label\", \"labels\")\ndataset.set_format(\"torch\", columns=[\"input_ids\", \"attention_mask\", \"labels\"])\n\nmodel = DistilBertForSequenceClassification.from_pretrained(\n \"distilbert-base-uncased\", num_labels=2\n)\n\ntraining_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n num_train_epochs=1,\n per_device_train_batch_size=1,\n logging_steps=5,\n save_strategy=\"epoch\",\n no_cuda=True,\n report_to=\"none\",\n)\n\ntrainer = Trainer(\n model=model,\n args=training_args,\n train_dataset=dataset,\n)\n\ntrainer.train()\ntrainer.save_model(\"/tmp/forge_output/checkpoint\")\nprint(\"TRAINING_COMPLETE\")\n\n```\n\nError trace:\n[simulated] Change TrainingArguments.per_device_train_batch_size\n\nOutput unified diff (no prose, no fences):"}, {"role": "assistant", "content": "--- a/train.py\n+++ b/train.py\n@@ -36,5 +36,5 @@\n output_dir=\"/tmp/forge_output/checkpoint\",\n num_train_epochs=1,\n- per_device_train_batch_size=1,\n+ per_device_train_batch_size=16,\n logging_steps=5,\n save_strategy=\"epoch\",\n"}]}
{"role_target": "repair_agent", "task_id": "distilbert_sst2", "primitive_type": "RestructureDatasetSchema", "category": "dataset_drift", "messages": [{"role": "system", "content": "You are the Repair Agent.\nYou see a broken HuggingFace training script, an error trace, and the current\nlibrary version snapshot. Output ONLY a unified diff that fixes the script.\n\nRules:\n1. Use canonical unified-diff format with `--- a/train.py` / `+++ b/train.py`\n headers and `@@ ... @@` hunk markers.\n2. Make the MINIMAL change that resolves the error AND preserves the original\n training intent. Do NOT add bare-except blocks, monkey-patches, or sys.exit\n calls.\n3. Do NOT add any prose, markdown fences, or thinking output \u2014 diff only.\n4. If the error is unfixable, output an empty diff.\n"}, {"role": "user", "content": "Library versions: transformers=4.40.0, datasets=2.18.0, trl=0.10.0\nTarget category hint: dataset_drift\n\nBroken script:\n```python\n\"\"\"DistilBERT fine-tuning on a tiny SST-2 subset.\n\nMinimal HuggingFace text-classification training script. Should complete\nin ~60s on CPU.\n\"\"\"\nfrom transformers import (\n DistilBertTokenizer,\n DistilBertForSequenceClassification,\n Trainer,\n TrainingArguments,\n)\nfrom datasets import load_dataset\n\ndataset = load_dataset(\"glue\", \"sst2\", split=\"train[:500]\")\ntokenizer = DistilBertTokenizer.from_pretrained(\"distilbert-base-uncased\")\n\n\ndef tokenize_function(examples):\n return tokenizer(\n examples[\"sentence\"],\n padding=\"max_length\",\n truncation=True,\n max_length=64,\n )\n\n\ndataset = dataset.map(tokenize_function, batched=True)\ndataset = dataset.rename_column(\"labels\", \"labels\")\ndataset.set_format(\"torch\", columns=[\"input_ids\", \"attention_mask\", \"labels\"])\n\nmodel = DistilBertForSequenceClassification.from_pretrained(\n \"distilbert-base-uncased\", num_labels=2\n)\n\ntraining_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n num_train_epochs=1,\n per_device_train_batch_size=16,\n logging_steps=5,\n save_strategy=\"epoch\",\n no_cuda=True,\n report_to=\"none\",\n)\n\ntrainer = Trainer(\n model=model,\n args=training_args,\n train_dataset=dataset,\n)\n\ntrainer.train()\ntrainer.save_model(\"/tmp/forge_output/checkpoint\")\nprint(\"TRAINING_COMPLETE\")\n\n```\n\nError trace:\n[simulated] Rename column label -> labels\n\nOutput unified diff (no prose, no fences):"}, {"role": "assistant", "content": "--- a/train.py\n+++ b/train.py\n@@ -26,5 +26,5 @@\n \n dataset = dataset.map(tokenize_function, batched=True)\n-dataset = dataset.rename_column(\"labels\", \"labels\")\n+dataset = dataset.rename_column(\"label\", \"labels\")\n dataset.set_format(\"torch\", columns=[\"input_ids\", \"attention_mask\", \"labels\"])\n \n"}]}
{"role_target": "repair_agent", "task_id": "distilbert_sst2", "primitive_type": "ChangeTokenizerBehavior", "category": "tokenizer_drift", "messages": [{"role": "system", "content": "You are the Repair Agent.\nYou see a broken HuggingFace training script, an error trace, and the current\nlibrary version snapshot. Output ONLY a unified diff that fixes the script.\n\nRules:\n1. Use canonical unified-diff format with `--- a/train.py` / `+++ b/train.py`\n headers and `@@ ... @@` hunk markers.\n2. Make the MINIMAL change that resolves the error AND preserves the original\n training intent. Do NOT add bare-except blocks, monkey-patches, or sys.exit\n calls.\n3. Do NOT add any prose, markdown fences, or thinking output \u2014 diff only.\n4. If the error is unfixable, output an empty diff.\n"}, {"role": "user", "content": "Library versions: transformers=4.40.0, datasets=2.18.0, trl=0.10.0\nTarget category hint: tokenizer_drift\n\nBroken script:\n```python\n\"\"\"DistilBERT fine-tuning on a tiny SST-2 subset.\n\nMinimal HuggingFace text-classification training script. Should complete\nin ~60s on CPU.\n\"\"\"\nfrom transformers import (\n DistilBertTokenizer,\n DistilBertForSequenceClassification,\n Trainer,\n TrainingArguments,\n)\nfrom datasets import load_dataset\n\ndataset = load_dataset(\"glue\", \"sst2\", split=\"train[:500]\")\ntokenizer = DistilBertTokenizer.from_pretrained(\"distilbert-base-uncased\")\n\n\ndef tokenize_function(examples):\n return tokenizer(\n examples[\"sentence\"],\n padding=\"max_length\",\n truncate=True,\n max_length=64,\n )\n\n\ndataset = dataset.map(tokenize_function, batched=True)\ndataset = dataset.rename_column(\"label\", \"labels\")\ndataset.set_format(\"torch\", columns=[\"input_ids\", \"attention_mask\", \"labels\"])\n\nmodel = DistilBertForSequenceClassification.from_pretrained(\n \"distilbert-base-uncased\", num_labels=2\n)\n\ntraining_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n num_train_epochs=1,\n per_device_train_batch_size=16,\n logging_steps=5,\n save_strategy=\"epoch\",\n no_cuda=True,\n report_to=\"none\",\n)\n\ntrainer = Trainer(\n model=model,\n args=training_args,\n train_dataset=dataset,\n)\n\ntrainer.train()\ntrainer.save_model(\"/tmp/forge_output/checkpoint\")\nprint(\"TRAINING_COMPLETE\")\n\n```\n\nError trace:\n[simulated] Change tokenizer kwarg truncation=True -> truncate=True\n\nOutput unified diff (no prose, no fences):"}, {"role": "assistant", "content": "--- a/train.py\n+++ b/train.py\n@@ -20,5 +20,5 @@\n examples[\"sentence\"],\n padding=\"max_length\",\n- truncate=True,\n+ truncation=True,\n max_length=64,\n )\n"}]}
{"role_target": "repair_agent", "task_id": "distilbert_sst2", "primitive_type": "RemoveDeprecatedMethod", "category": "api_drift", "messages": [{"role": "system", "content": "You are the Repair Agent.\nYou see a broken HuggingFace training script, an error trace, and the current\nlibrary version snapshot. Output ONLY a unified diff that fixes the script.\n\nRules:\n1. Use canonical unified-diff format with `--- a/train.py` / `+++ b/train.py`\n headers and `@@ ... @@` hunk markers.\n2. Make the MINIMAL change that resolves the error AND preserves the original\n training intent. Do NOT add bare-except blocks, monkey-patches, or sys.exit\n calls.\n3. Do NOT add any prose, markdown fences, or thinking output \u2014 diff only.\n4. If the error is unfixable, output an empty diff.\n"}, {"role": "user", "content": "Library versions: transformers=4.40.0, datasets=2.18.0, trl=0.10.0\nTarget category hint: api_drift\n\nBroken script:\n```python\n\"\"\"DistilBERT fine-tuning on a tiny SST-2 subset.\n\nMinimal HuggingFace text-classification training script. Should complete\nin ~60s on CPU.\n\"\"\"\nfrom transformers import (\n DistilBertTokenizer,\n DistilBertForSequenceClassification,\n Trainer,\n TrainingArguments,\n)\nfrom datasets import load_dataset\n\ndataset = load_dataset(\"glue\", \"sst2\", split=\"train[:500]\")\ntokenizer = DistilBertTokenizer.from_pretrained(\"distilbert-base-uncased\")\n\n\ndef tokenize_function(examples):\n return tokenizer(\n examples[\"sentence\"],\n padding=\"max_length\",\n truncation=True,\n max_length=64,\n )\n\n\ndataset = dataset.map(tokenize_function, batched=True)\ndataset = dataset.rename_column(\"label\", \"labels\")\ndataset.set_format(\"torch\", columns=[\"input_ids\", \"attention_mask\", \"labels\"])\n\nmodel = DistilBertForSequenceClassification.from_pretrained(\n \"distilbert-base-uncased\", num_labels=2\n)\n\ntraining_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n num_train_epochs=1,\n per_device_train_batch_size=16,\n logging_steps=5,\n save_strategy=\"epoch\",\n no_cuda=True,\n report_to=\"none\",\n)\n\ntrainer = Trainer(\n model=model,\n args=training_args,\n train_dataset=dataset,\n)\n\ntrainer.train()\ntrainer.save_model_DEPRECATED(\"/tmp/forge_output/checkpoint\")\nprint(\"TRAINING_COMPLETE\")\n\n```\n\nError trace:\n[simulated] Remove Trainer.save_model\n\nOutput unified diff (no prose, no fences):"}, {"role": "assistant", "content": "--- a/train.py\n+++ b/train.py\n@@ -50,4 +50,4 @@\n \n trainer.train()\n-trainer.save_model_DEPRECATED(\"/tmp/forge_output/checkpoint\")\n+trainer.save_model(\"/tmp/forge_output/checkpoint\")\n print(\"TRAINING_COMPLETE\")\n"}]}
{"role_target": "repair_agent", "task_id": "electra_classification", "primitive_type": "RenameApiCall", "category": "api_drift", "messages": [{"role": "system", "content": "You are the Repair Agent.\nYou see a broken HuggingFace training script, an error trace, and the current\nlibrary version snapshot. Output ONLY a unified diff that fixes the script.\n\nRules:\n1. Use canonical unified-diff format with `--- a/train.py` / `+++ b/train.py`\n headers and `@@ ... @@` hunk markers.\n2. Make the MINIMAL change that resolves the error AND preserves the original\n training intent. Do NOT add bare-except blocks, monkey-patches, or sys.exit\n calls.\n3. Do NOT add any prose, markdown fences, or thinking output \u2014 diff only.\n4. If the error is unfixable, output an empty diff.\n"}, {"role": "user", "content": "Library versions: transformers=4.40.0, datasets=2.18.0, trl=0.10.0\nTarget category hint: api_drift\n\nBroken script:\n```python\n\"\"\"ELECTRA-small classification on 400-sample AG News (4-way text classification).\"\"\"\nfrom transformers import (\n AutoTokenizer,\n AutoModelForSequenceClassification,\n Trainer,\n TrainingArguments,\n)\nfrom datasets import load_dataset\n\ndataset = load_dataset(\"ag_news\", split=\"train[:400]\")\ntokenizer = AutoTokenizer.from_pretrained(\"google/electra-small-discriminator\")\n\n\ndef tokenize(examples):\n return tokenizer(\n examples[\"text\"],\n padding=\"max_length\",\n truncation=True,\n max_length=64,\n )\n\n\ndataset = dataset.map(tokenize, batched=True)\ndataset = dataset.rename_column(\"label\", \"labels\")\ndataset.set_format(\"torch\", columns=[\"input_ids\", \"attention_mask\", \"labels\"])\n\nmodel = AutoModelForSequenceClassification.from_pretrained(\n \"google/electra-small-discriminator\", num_labels=4\n)\n\ntraining_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n num_train_epochs=1,\n per_device_train_batch_size=8,\n logging_steps=5,\n save_strategy=\"epoch\",\n no_cuda=True,\n report_to=\"none\",\n)\n\ntrainer = Trainer(model=model, args=training_args, train_dataset=dataset)\ntrainer.start_training()\ntrainer.save_model(\"/tmp/forge_output/checkpoint\")\nprint(\"TRAINING_COMPLETE\")\n\n```\n\nError trace:\n[simulated] Rename trainer.train -> trainer.start_training\n\nOutput unified diff (no prose, no fences):"}, {"role": "assistant", "content": "--- a/train.py\n+++ b/train.py\n@@ -40,5 +40,5 @@\n \n trainer = Trainer(model=model, args=training_args, train_dataset=dataset)\n-trainer.start_training()\n+trainer.train()\n trainer.save_model(\"/tmp/forge_output/checkpoint\")\n print(\"TRAINING_COMPLETE\")\n"}]}
{"role_target": "repair_agent", "task_id": "electra_classification", "primitive_type": "RenameApiCall", "category": "api_drift", "messages": [{"role": "system", "content": "You are the Repair Agent.\nYou see a broken HuggingFace training script, an error trace, and the current\nlibrary version snapshot. Output ONLY a unified diff that fixes the script.\n\nRules:\n1. Use canonical unified-diff format with `--- a/train.py` / `+++ b/train.py`\n headers and `@@ ... @@` hunk markers.\n2. Make the MINIMAL change that resolves the error AND preserves the original\n training intent. Do NOT add bare-except blocks, monkey-patches, or sys.exit\n calls.\n3. Do NOT add any prose, markdown fences, or thinking output \u2014 diff only.\n4. If the error is unfixable, output an empty diff.\n"}, {"role": "user", "content": "Library versions: transformers=4.40.0, datasets=2.18.0, trl=0.10.0\nTarget category hint: api_drift\n\nBroken script:\n```python\n\"\"\"ELECTRA-small classification on 400-sample AG News (4-way text classification).\"\"\"\nfrom transformers import (\n AutoTokenizer,\n AutoModelForSequenceClassification,\n Trainer,\n TrainingArguments,\n)\nfrom datasets import load_dataset\n\ndataset = load_dataset(\"ag_news\", split=\"train[:400]\")\ntokenizer = AutoTokenizer.load_from_hub(\"google/electra-small-discriminator\")\n\n\ndef tokenize(examples):\n return tokenizer(\n examples[\"text\"],\n padding=\"max_length\",\n truncation=True,\n max_length=64,\n )\n\n\ndataset = dataset.map(tokenize, batched=True)\ndataset = dataset.rename_column(\"label\", \"labels\")\ndataset.set_format(\"torch\", columns=[\"input_ids\", \"attention_mask\", \"labels\"])\n\nmodel = AutoModelForSequenceClassification.load_from_hub(\n \"google/electra-small-discriminator\", num_labels=4\n)\n\ntraining_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n num_train_epochs=1,\n per_device_train_batch_size=8,\n logging_steps=5,\n save_strategy=\"epoch\",\n no_cuda=True,\n report_to=\"none\",\n)\n\ntrainer = Trainer(model=model, args=training_args, train_dataset=dataset)\ntrainer.train()\ntrainer.save_model(\"/tmp/forge_output/checkpoint\")\nprint(\"TRAINING_COMPLETE\")\n\n```\n\nError trace:\n[simulated] Rename from_pretrained -> load_from_hub\n\nOutput unified diff (no prose, no fences):"}, {"role": "assistant", "content": "--- a/train.py\n+++ b/train.py\n@@ -9,5 +9,5 @@\n \n dataset = load_dataset(\"ag_news\", split=\"train[:400]\")\n-tokenizer = AutoTokenizer.load_from_hub(\"google/electra-small-discriminator\")\n+tokenizer = AutoTokenizer.from_pretrained(\"google/electra-small-discriminator\")\n \n \n@@ -25,5 +25,5 @@\n dataset.set_format(\"torch\", columns=[\"input_ids\", \"attention_mask\", \"labels\"])\n \n-model = AutoModelForSequenceClassification.load_from_hub(\n+model = AutoModelForSequenceClassification.from_pretrained(\n \"google/electra-small-discriminator\", num_labels=4\n )\n"}]}
{"role_target": "repair_agent", "task_id": "electra_classification", "primitive_type": "ChangeArgumentSignature", "category": "api_drift", "messages": [{"role": "system", "content": "You are the Repair Agent.\nYou see a broken HuggingFace training script, an error trace, and the current\nlibrary version snapshot. Output ONLY a unified diff that fixes the script.\n\nRules:\n1. Use canonical unified-diff format with `--- a/train.py` / `+++ b/train.py`\n headers and `@@ ... @@` hunk markers.\n2. Make the MINIMAL change that resolves the error AND preserves the original\n training intent. Do NOT add bare-except blocks, monkey-patches, or sys.exit\n calls.\n3. Do NOT add any prose, markdown fences, or thinking output \u2014 diff only.\n4. If the error is unfixable, output an empty diff.\n"}, {"role": "user", "content": "Library versions: transformers=4.40.0, datasets=2.18.0, trl=0.10.0\nTarget category hint: api_drift\n\nBroken script:\n```python\n\"\"\"ELECTRA-small classification on 400-sample AG News (4-way text classification).\"\"\"\nfrom transformers import (\n AutoTokenizer,\n AutoModelForSequenceClassification,\n Trainer,\n TrainingArguments,\n)\nfrom datasets import load_dataset\n\ndataset = load_dataset(\"ag_news\", split=\"train[:400]\")\ntokenizer = AutoTokenizer.from_pretrained(\"google/electra-small-discriminator\")\n\n\ndef tokenize(examples):\n return tokenizer(\n examples[\"text\"],\n padding=\"max_length\",\n truncation=True,\n max_length=64,\n )\n\n\ndataset = dataset.map(tokenize, batched=True)\ndataset = dataset.rename_column(\"label\", \"labels\")\ndataset.set_format(\"torch\", columns=[\"input_ids\", \"attention_mask\", \"labels\"])\n\nmodel = AutoModelForSequenceClassification.from_pretrained(\n \"google/electra-small-discriminator\", num_labels=4\n)\n\ntraining_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n per_device_train_batch_size=8,\n logging_steps=5,\n save_strategy=\"epoch\",\n no_cuda=True,\n report_to=\"none\",\n)\n\ntrainer = Trainer(model=model, args=training_args, train_dataset=dataset)\ntrainer.train()\ntrainer.save_model(\"/tmp/forge_output/checkpoint\")\nprint(\"TRAINING_COMPLETE\")\n\n```\n\nError trace:\n[simulated] Change args of TrainingArguments: -num_train_epochs +max_steps\n\nOutput unified diff (no prose, no fences):"}, {"role": "assistant", "content": "--- a/train.py\n+++ b/train.py\n@@ -31,4 +31,5 @@\n training_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n+ num_train_epochs=1,\n per_device_train_batch_size=8,\n logging_steps=5,\n"}]}
{"role_target": "repair_agent", "task_id": "electra_classification", "primitive_type": "ModifyConfigField", "category": "config_drift", "messages": [{"role": "system", "content": "You are the Repair Agent.\nYou see a broken HuggingFace training script, an error trace, and the current\nlibrary version snapshot. Output ONLY a unified diff that fixes the script.\n\nRules:\n1. Use canonical unified-diff format with `--- a/train.py` / `+++ b/train.py`\n headers and `@@ ... @@` hunk markers.\n2. Make the MINIMAL change that resolves the error AND preserves the original\n training intent. Do NOT add bare-except blocks, monkey-patches, or sys.exit\n calls.\n3. Do NOT add any prose, markdown fences, or thinking output \u2014 diff only.\n4. If the error is unfixable, output an empty diff.\n"}, {"role": "user", "content": "Library versions: transformers=4.40.0, datasets=2.18.0, trl=0.10.0\nTarget category hint: config_drift\n\nBroken script:\n```python\n\"\"\"ELECTRA-small classification on 400-sample AG News (4-way text classification).\"\"\"\nfrom transformers import (\n AutoTokenizer,\n AutoModelForSequenceClassification,\n Trainer,\n TrainingArguments,\n)\nfrom datasets import load_dataset\n\ndataset = load_dataset(\"ag_news\", split=\"train[:400]\")\ntokenizer = AutoTokenizer.from_pretrained(\"google/electra-small-discriminator\")\n\n\ndef tokenize(examples):\n return tokenizer(\n examples[\"text\"],\n padding=\"max_length\",\n truncation=True,\n max_length=64,\n )\n\n\ndataset = dataset.map(tokenize, batched=True)\ndataset = dataset.rename_column(\"label\", \"labels\")\ndataset.set_format(\"torch\", columns=[\"input_ids\", \"attention_mask\", \"labels\"])\n\nmodel = AutoModelForSequenceClassification.from_pretrained(\n \"google/electra-small-discriminator\", num_labels=4\n)\n\ntraining_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n num_train_epochs=1,\n per_device_train_batch_size=1,\n logging_steps=5,\n save_strategy=\"epoch\",\n no_cuda=True,\n report_to=\"none\",\n)\n\ntrainer = Trainer(model=model, args=training_args, train_dataset=dataset)\ntrainer.train()\ntrainer.save_model(\"/tmp/forge_output/checkpoint\")\nprint(\"TRAINING_COMPLETE\")\n\n```\n\nError trace:\n[simulated] Change TrainingArguments.per_device_train_batch_size\n\nOutput unified diff (no prose, no fences):"}, {"role": "assistant", "content": "--- a/train.py\n+++ b/train.py\n@@ -32,5 +32,5 @@\n output_dir=\"/tmp/forge_output/checkpoint\",\n num_train_epochs=1,\n- per_device_train_batch_size=1,\n+ per_device_train_batch_size=8,\n logging_steps=5,\n save_strategy=\"epoch\",\n"}]}
{"role_target": "repair_agent", "task_id": "electra_classification", "primitive_type": "RestructureDatasetSchema", "category": "dataset_drift", "messages": [{"role": "system", "content": "You are the Repair Agent.\nYou see a broken HuggingFace training script, an error trace, and the current\nlibrary version snapshot. Output ONLY a unified diff that fixes the script.\n\nRules:\n1. Use canonical unified-diff format with `--- a/train.py` / `+++ b/train.py`\n headers and `@@ ... @@` hunk markers.\n2. Make the MINIMAL change that resolves the error AND preserves the original\n training intent. Do NOT add bare-except blocks, monkey-patches, or sys.exit\n calls.\n3. Do NOT add any prose, markdown fences, or thinking output \u2014 diff only.\n4. If the error is unfixable, output an empty diff.\n"}, {"role": "user", "content": "Library versions: transformers=4.40.0, datasets=2.18.0, trl=0.10.0\nTarget category hint: dataset_drift\n\nBroken script:\n```python\n\"\"\"ELECTRA-small classification on 400-sample AG News (4-way text classification).\"\"\"\nfrom transformers import (\n AutoTokenizer,\n AutoModelForSequenceClassification,\n Trainer,\n TrainingArguments,\n)\nfrom datasets import load_dataset\n\ndataset = load_dataset(\"ag_news\", split=\"train[:400]\")\ntokenizer = AutoTokenizer.from_pretrained(\"google/electra-small-discriminator\")\n\n\ndef tokenize(examples):\n return tokenizer(\n examples[\"input_text\"],\n padding=\"max_length\",\n truncation=True,\n max_length=64,\n )\n\n\ndataset = dataset.map(tokenize, batched=True)\ndataset = dataset.rename_column(\"label\", \"labels\")\ndataset.set_format(\"torch\", columns=[\"input_ids\", \"attention_mask\", \"labels\"])\n\nmodel = AutoModelForSequenceClassification.from_pretrained(\n \"google/electra-small-discriminator\", num_labels=4\n)\n\ntraining_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n num_train_epochs=1,\n per_device_train_batch_size=8,\n logging_steps=5,\n save_strategy=\"epoch\",\n no_cuda=True,\n report_to=\"none\",\n)\n\ntrainer = Trainer(model=model, args=training_args, train_dataset=dataset)\ntrainer.train()\ntrainer.save_model(\"/tmp/forge_output/checkpoint\")\nprint(\"TRAINING_COMPLETE\")\n\n```\n\nError trace:\n[simulated] Rename column text -> input_text\n\nOutput unified diff (no prose, no fences):"}, {"role": "assistant", "content": "--- a/train.py\n+++ b/train.py\n@@ -14,5 +14,5 @@\n def tokenize(examples):\n return tokenizer(\n- examples[\"input_text\"],\n+ examples[\"text\"],\n padding=\"max_length\",\n truncation=True,\n"}]}
{"role_target": "repair_agent", "task_id": "electra_classification", "primitive_type": "RestructureDatasetSchema", "category": "dataset_drift", "messages": [{"role": "system", "content": "You are the Repair Agent.\nYou see a broken HuggingFace training script, an error trace, and the current\nlibrary version snapshot. Output ONLY a unified diff that fixes the script.\n\nRules:\n1. Use canonical unified-diff format with `--- a/train.py` / `+++ b/train.py`\n headers and `@@ ... @@` hunk markers.\n2. Make the MINIMAL change that resolves the error AND preserves the original\n training intent. Do NOT add bare-except blocks, monkey-patches, or sys.exit\n calls.\n3. Do NOT add any prose, markdown fences, or thinking output \u2014 diff only.\n4. If the error is unfixable, output an empty diff.\n"}, {"role": "user", "content": "Library versions: transformers=4.40.0, datasets=2.18.0, trl=0.10.0\nTarget category hint: dataset_drift\n\nBroken script:\n```python\n\"\"\"ELECTRA-small classification on 400-sample AG News (4-way text classification).\"\"\"\nfrom transformers import (\n AutoTokenizer,\n AutoModelForSequenceClassification,\n Trainer,\n TrainingArguments,\n)\nfrom datasets import load_dataset\n\ndataset = load_dataset(\"ag_news\", split=\"train[:400]\")\ntokenizer = AutoTokenizer.from_pretrained(\"google/electra-small-discriminator\")\n\n\ndef tokenize(examples):\n return tokenizer(\n examples[\"text\"],\n padding=\"max_length\",\n truncation=True,\n max_length=64,\n )\n\n\ndataset = dataset.map(tokenize, batched=True)\ndataset = dataset.rename_column(\"labels\", \"labels\")\ndataset.set_format(\"torch\", columns=[\"input_ids\", \"attention_mask\", \"labels\"])\n\nmodel = AutoModelForSequenceClassification.from_pretrained(\n \"google/electra-small-discriminator\", num_labels=4\n)\n\ntraining_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n num_train_epochs=1,\n per_device_train_batch_size=8,\n logging_steps=5,\n save_strategy=\"epoch\",\n no_cuda=True,\n report_to=\"none\",\n)\n\ntrainer = Trainer(model=model, args=training_args, train_dataset=dataset)\ntrainer.train()\ntrainer.save_model(\"/tmp/forge_output/checkpoint\")\nprint(\"TRAINING_COMPLETE\")\n\n```\n\nError trace:\n[simulated] Rename column label -> labels\n\nOutput unified diff (no prose, no fences):"}, {"role": "assistant", "content": "--- a/train.py\n+++ b/train.py\n@@ -22,5 +22,5 @@\n \n dataset = dataset.map(tokenize, batched=True)\n-dataset = dataset.rename_column(\"labels\", \"labels\")\n+dataset = dataset.rename_column(\"label\", \"labels\")\n dataset.set_format(\"torch\", columns=[\"input_ids\", \"attention_mask\", \"labels\"])\n \n"}]}
{"role_target": "repair_agent", "task_id": "electra_classification", "primitive_type": "ChangeTokenizerBehavior", "category": "tokenizer_drift", "messages": [{"role": "system", "content": "You are the Repair Agent.\nYou see a broken HuggingFace training script, an error trace, and the current\nlibrary version snapshot. Output ONLY a unified diff that fixes the script.\n\nRules:\n1. Use canonical unified-diff format with `--- a/train.py` / `+++ b/train.py`\n headers and `@@ ... @@` hunk markers.\n2. Make the MINIMAL change that resolves the error AND preserves the original\n training intent. Do NOT add bare-except blocks, monkey-patches, or sys.exit\n calls.\n3. Do NOT add any prose, markdown fences, or thinking output \u2014 diff only.\n4. If the error is unfixable, output an empty diff.\n"}, {"role": "user", "content": "Library versions: transformers=4.40.0, datasets=2.18.0, trl=0.10.0\nTarget category hint: tokenizer_drift\n\nBroken script:\n```python\n\"\"\"ELECTRA-small classification on 400-sample AG News (4-way text classification).\"\"\"\nfrom transformers import (\n AutoTokenizer,\n AutoModelForSequenceClassification,\n Trainer,\n TrainingArguments,\n)\nfrom datasets import load_dataset\n\ndataset = load_dataset(\"ag_news\", split=\"train[:400]\")\ntokenizer = AutoTokenizer.from_pretrained(\"google/electra-small-discriminator\")\n\n\ndef tokenize(examples):\n return tokenizer(\n examples[\"text\"],\n padding=\"max_length\",\n truncate=True,\n max_length=64,\n )\n\n\ndataset = dataset.map(tokenize, batched=True)\ndataset = dataset.rename_column(\"label\", \"labels\")\ndataset.set_format(\"torch\", columns=[\"input_ids\", \"attention_mask\", \"labels\"])\n\nmodel = AutoModelForSequenceClassification.from_pretrained(\n \"google/electra-small-discriminator\", num_labels=4\n)\n\ntraining_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n num_train_epochs=1,\n per_device_train_batch_size=8,\n logging_steps=5,\n save_strategy=\"epoch\",\n no_cuda=True,\n report_to=\"none\",\n)\n\ntrainer = Trainer(model=model, args=training_args, train_dataset=dataset)\ntrainer.train()\ntrainer.save_model(\"/tmp/forge_output/checkpoint\")\nprint(\"TRAINING_COMPLETE\")\n\n```\n\nError trace:\n[simulated] Change tokenizer kwarg truncation=True -> truncate=True\n\nOutput unified diff (no prose, no fences):"}, {"role": "assistant", "content": "--- a/train.py\n+++ b/train.py\n@@ -16,5 +16,5 @@\n examples[\"text\"],\n padding=\"max_length\",\n- truncate=True,\n+ truncation=True,\n max_length=64,\n )\n"}]}
{"role_target": "repair_agent", "task_id": "electra_classification", "primitive_type": "RemoveDeprecatedMethod", "category": "api_drift", "messages": [{"role": "system", "content": "You are the Repair Agent.\nYou see a broken HuggingFace training script, an error trace, and the current\nlibrary version snapshot. Output ONLY a unified diff that fixes the script.\n\nRules:\n1. Use canonical unified-diff format with `--- a/train.py` / `+++ b/train.py`\n headers and `@@ ... @@` hunk markers.\n2. Make the MINIMAL change that resolves the error AND preserves the original\n training intent. Do NOT add bare-except blocks, monkey-patches, or sys.exit\n calls.\n3. Do NOT add any prose, markdown fences, or thinking output \u2014 diff only.\n4. If the error is unfixable, output an empty diff.\n"}, {"role": "user", "content": "Library versions: transformers=4.40.0, datasets=2.18.0, trl=0.10.0\nTarget category hint: api_drift\n\nBroken script:\n```python\n\"\"\"ELECTRA-small classification on 400-sample AG News (4-way text classification).\"\"\"\nfrom transformers import (\n AutoTokenizer,\n AutoModelForSequenceClassification,\n Trainer,\n TrainingArguments,\n)\nfrom datasets import load_dataset\n\ndataset = load_dataset(\"ag_news\", split=\"train[:400]\")\ntokenizer = AutoTokenizer.from_pretrained(\"google/electra-small-discriminator\")\n\n\ndef tokenize(examples):\n return tokenizer(\n examples[\"text\"],\n padding=\"max_length\",\n truncation=True,\n max_length=64,\n )\n\n\ndataset = dataset.map(tokenize, batched=True)\ndataset = dataset.rename_column(\"label\", \"labels\")\ndataset.set_format(\"torch\", columns=[\"input_ids\", \"attention_mask\", \"labels\"])\n\nmodel = AutoModelForSequenceClassification.from_pretrained(\n \"google/electra-small-discriminator\", num_labels=4\n)\n\ntraining_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n num_train_epochs=1,\n per_device_train_batch_size=8,\n logging_steps=5,\n save_strategy=\"epoch\",\n no_cuda=True,\n report_to=\"none\",\n)\n\ntrainer = Trainer(model=model, args=training_args, train_dataset=dataset)\ntrainer.train()\ntrainer.save_model_DEPRECATED(\"/tmp/forge_output/checkpoint\")\nprint(\"TRAINING_COMPLETE\")\n\n```\n\nError trace:\n[simulated] Remove Trainer.save_model\n\nOutput unified diff (no prose, no fences):"}, {"role": "assistant", "content": "--- a/train.py\n+++ b/train.py\n@@ -41,4 +41,4 @@\n trainer = Trainer(model=model, args=training_args, train_dataset=dataset)\n trainer.train()\n-trainer.save_model_DEPRECATED(\"/tmp/forge_output/checkpoint\")\n+trainer.save_model(\"/tmp/forge_output/checkpoint\")\n print(\"TRAINING_COMPLETE\")\n"}]}
{"role_target": "repair_agent", "task_id": "gpt2_textgen", "primitive_type": "RenameApiCall", "category": "api_drift", "messages": [{"role": "system", "content": "You are the Repair Agent.\nYou see a broken HuggingFace training script, an error trace, and the current\nlibrary version snapshot. Output ONLY a unified diff that fixes the script.\n\nRules:\n1. Use canonical unified-diff format with `--- a/train.py` / `+++ b/train.py`\n headers and `@@ ... @@` hunk markers.\n2. Make the MINIMAL change that resolves the error AND preserves the original\n training intent. Do NOT add bare-except blocks, monkey-patches, or sys.exit\n calls.\n3. Do NOT add any prose, markdown fences, or thinking output \u2014 diff only.\n4. If the error is unfixable, output an empty diff.\n"}, {"role": "user", "content": "Library versions: transformers=4.40.0, datasets=2.18.0, trl=0.10.0\nTarget category hint: api_drift\n\nBroken script:\n```python\n\"\"\"DistilGPT2 causal-LM fine-tuning on 300 lines of WikiText (text generation).\"\"\"\nfrom transformers import (\n AutoTokenizer,\n AutoModelForCausalLM,\n Trainer,\n TrainingArguments,\n DataCollatorForLanguageModeling,\n)\nfrom datasets import load_dataset\n\ndataset = load_dataset(\"wikitext\", \"wikitext-2-raw-v1\", split=\"train[:300]\")\ntokenizer = AutoTokenizer.from_pretrained(\"distilgpt2\")\ntokenizer.pad_token = tokenizer.eos_token\n\n\ndef tokenize(examples):\n return tokenizer(examples[\"text\"], truncation=True, max_length=64)\n\n\ndataset = dataset.map(tokenize, batched=True, remove_columns=dataset.column_names)\n\nmodel = AutoModelForCausalLM.from_pretrained(\"distilgpt2\")\n\ntraining_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n num_train_epochs=1,\n per_device_train_batch_size=4,\n logging_steps=5,\n save_strategy=\"epoch\",\n no_cuda=True,\n report_to=\"none\",\n)\n\ntrainer = Trainer(\n model=model,\n args=training_args,\n train_dataset=dataset,\n data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),\n)\n\ntrainer.start_training()\ntrainer.save_model(\"/tmp/forge_output/checkpoint\")\nprint(\"TRAINING_COMPLETE\")\n\n```\n\nError trace:\n[simulated] Rename trainer.train -> trainer.start_training\n\nOutput unified diff (no prose, no fences):"}, {"role": "assistant", "content": "--- a/train.py\n+++ b/train.py\n@@ -39,5 +39,5 @@\n )\n \n-trainer.start_training()\n+trainer.train()\n trainer.save_model(\"/tmp/forge_output/checkpoint\")\n print(\"TRAINING_COMPLETE\")\n"}]}
{"role_target": "repair_agent", "task_id": "gpt2_textgen", "primitive_type": "RenameApiCall", "category": "api_drift", "messages": [{"role": "system", "content": "You are the Repair Agent.\nYou see a broken HuggingFace training script, an error trace, and the current\nlibrary version snapshot. Output ONLY a unified diff that fixes the script.\n\nRules:\n1. Use canonical unified-diff format with `--- a/train.py` / `+++ b/train.py`\n headers and `@@ ... @@` hunk markers.\n2. Make the MINIMAL change that resolves the error AND preserves the original\n training intent. Do NOT add bare-except blocks, monkey-patches, or sys.exit\n calls.\n3. Do NOT add any prose, markdown fences, or thinking output \u2014 diff only.\n4. If the error is unfixable, output an empty diff.\n"}, {"role": "user", "content": "Library versions: transformers=4.40.0, datasets=2.18.0, trl=0.10.0\nTarget category hint: api_drift\n\nBroken script:\n```python\n\"\"\"DistilGPT2 causal-LM fine-tuning on 300 lines of WikiText (text generation).\"\"\"\nfrom transformers import (\n AutoTokenizer,\n AutoModelForCausalLM,\n Trainer,\n TrainingArguments,\n DataCollatorForLanguageModeling,\n)\nfrom datasets import load_dataset\n\ndataset = load_dataset(\"wikitext\", \"wikitext-2-raw-v1\", split=\"train[:300]\")\ntokenizer = AutoTokenizer.load_from_hub(\"distilgpt2\")\ntokenizer.pad_token = tokenizer.eos_token\n\n\ndef tokenize(examples):\n return tokenizer(examples[\"text\"], truncation=True, max_length=64)\n\n\ndataset = dataset.map(tokenize, batched=True, remove_columns=dataset.column_names)\n\nmodel = AutoModelForCausalLM.load_from_hub(\"distilgpt2\")\n\ntraining_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n num_train_epochs=1,\n per_device_train_batch_size=4,\n logging_steps=5,\n save_strategy=\"epoch\",\n no_cuda=True,\n report_to=\"none\",\n)\n\ntrainer = Trainer(\n model=model,\n args=training_args,\n train_dataset=dataset,\n data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),\n)\n\ntrainer.train()\ntrainer.save_model(\"/tmp/forge_output/checkpoint\")\nprint(\"TRAINING_COMPLETE\")\n\n```\n\nError trace:\n[simulated] Rename from_pretrained -> load_from_hub\n\nOutput unified diff (no prose, no fences):"}, {"role": "assistant", "content": "--- a/train.py\n+++ b/train.py\n@@ -10,5 +10,5 @@\n \n dataset = load_dataset(\"wikitext\", \"wikitext-2-raw-v1\", split=\"train[:300]\")\n-tokenizer = AutoTokenizer.load_from_hub(\"distilgpt2\")\n+tokenizer = AutoTokenizer.from_pretrained(\"distilgpt2\")\n tokenizer.pad_token = tokenizer.eos_token\n \n@@ -20,5 +20,5 @@\n dataset = dataset.map(tokenize, batched=True, remove_columns=dataset.column_names)\n \n-model = AutoModelForCausalLM.load_from_hub(\"distilgpt2\")\n+model = AutoModelForCausalLM.from_pretrained(\"distilgpt2\")\n \n training_args = TrainingArguments(\n"}]}
{"role_target": "repair_agent", "task_id": "gpt2_textgen", "primitive_type": "ChangeArgumentSignature", "category": "api_drift", "messages": [{"role": "system", "content": "You are the Repair Agent.\nYou see a broken HuggingFace training script, an error trace, and the current\nlibrary version snapshot. Output ONLY a unified diff that fixes the script.\n\nRules:\n1. Use canonical unified-diff format with `--- a/train.py` / `+++ b/train.py`\n headers and `@@ ... @@` hunk markers.\n2. Make the MINIMAL change that resolves the error AND preserves the original\n training intent. Do NOT add bare-except blocks, monkey-patches, or sys.exit\n calls.\n3. Do NOT add any prose, markdown fences, or thinking output \u2014 diff only.\n4. If the error is unfixable, output an empty diff.\n"}, {"role": "user", "content": "Library versions: transformers=4.40.0, datasets=2.18.0, trl=0.10.0\nTarget category hint: api_drift\n\nBroken script:\n```python\n\"\"\"DistilGPT2 causal-LM fine-tuning on 300 lines of WikiText (text generation).\"\"\"\nfrom transformers import (\n AutoTokenizer,\n AutoModelForCausalLM,\n Trainer,\n TrainingArguments,\n DataCollatorForLanguageModeling,\n)\nfrom datasets import load_dataset\n\ndataset = load_dataset(\"wikitext\", \"wikitext-2-raw-v1\", split=\"train[:300]\")\ntokenizer = AutoTokenizer.from_pretrained(\"distilgpt2\")\ntokenizer.pad_token = tokenizer.eos_token\n\n\ndef tokenize(examples):\n return tokenizer(examples[\"text\"], truncation=True, max_length=64)\n\n\ndataset = dataset.map(tokenize, batched=True, remove_columns=dataset.column_names)\n\nmodel = AutoModelForCausalLM.from_pretrained(\"distilgpt2\")\n\ntraining_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n per_device_train_batch_size=4,\n logging_steps=5,\n save_strategy=\"epoch\",\n no_cuda=True,\n report_to=\"none\",\n)\n\ntrainer = Trainer(\n model=model,\n args=training_args,\n train_dataset=dataset,\n data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),\n)\n\ntrainer.train()\ntrainer.save_model(\"/tmp/forge_output/checkpoint\")\nprint(\"TRAINING_COMPLETE\")\n\n```\n\nError trace:\n[simulated] Change args of TrainingArguments: -num_train_epochs +max_steps\n\nOutput unified diff (no prose, no fences):"}, {"role": "assistant", "content": "--- a/train.py\n+++ b/train.py\n@@ -24,4 +24,5 @@\n training_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n+ num_train_epochs=1,\n per_device_train_batch_size=4,\n logging_steps=5,\n"}]}
{"role_target": "repair_agent", "task_id": "gpt2_textgen", "primitive_type": "ModifyConfigField", "category": "config_drift", "messages": [{"role": "system", "content": "You are the Repair Agent.\nYou see a broken HuggingFace training script, an error trace, and the current\nlibrary version snapshot. Output ONLY a unified diff that fixes the script.\n\nRules:\n1. Use canonical unified-diff format with `--- a/train.py` / `+++ b/train.py`\n headers and `@@ ... @@` hunk markers.\n2. Make the MINIMAL change that resolves the error AND preserves the original\n training intent. Do NOT add bare-except blocks, monkey-patches, or sys.exit\n calls.\n3. Do NOT add any prose, markdown fences, or thinking output \u2014 diff only.\n4. If the error is unfixable, output an empty diff.\n"}, {"role": "user", "content": "Library versions: transformers=4.40.0, datasets=2.18.0, trl=0.10.0\nTarget category hint: config_drift\n\nBroken script:\n```python\n\"\"\"DistilGPT2 causal-LM fine-tuning on 300 lines of WikiText (text generation).\"\"\"\nfrom transformers import (\n AutoTokenizer,\n AutoModelForCausalLM,\n Trainer,\n TrainingArguments,\n DataCollatorForLanguageModeling,\n)\nfrom datasets import load_dataset\n\ndataset = load_dataset(\"wikitext\", \"wikitext-2-raw-v1\", split=\"train[:300]\")\ntokenizer = AutoTokenizer.from_pretrained(\"distilgpt2\")\ntokenizer.pad_token = tokenizer.eos_token\n\n\ndef tokenize(examples):\n return tokenizer(examples[\"text\"], truncation=True, max_length=64)\n\n\ndataset = dataset.map(tokenize, batched=True, remove_columns=dataset.column_names)\n\nmodel = AutoModelForCausalLM.from_pretrained(\"distilgpt2\")\n\ntraining_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n num_train_epochs=1,\n per_device_train_batch_size=1,\n logging_steps=5,\n save_strategy=\"epoch\",\n no_cuda=True,\n report_to=\"none\",\n)\n\ntrainer = Trainer(\n model=model,\n args=training_args,\n train_dataset=dataset,\n data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),\n)\n\ntrainer.train()\ntrainer.save_model(\"/tmp/forge_output/checkpoint\")\nprint(\"TRAINING_COMPLETE\")\n\n```\n\nError trace:\n[simulated] Change TrainingArguments.per_device_train_batch_size\n\nOutput unified diff (no prose, no fences):"}, {"role": "assistant", "content": "--- a/train.py\n+++ b/train.py\n@@ -25,5 +25,5 @@\n output_dir=\"/tmp/forge_output/checkpoint\",\n num_train_epochs=1,\n- per_device_train_batch_size=1,\n+ per_device_train_batch_size=4,\n logging_steps=5,\n save_strategy=\"epoch\",\n"}]}
{"role_target": "repair_agent", "task_id": "gpt2_textgen", "primitive_type": "RestructureDatasetSchema", "category": "dataset_drift", "messages": [{"role": "system", "content": "You are the Repair Agent.\nYou see a broken HuggingFace training script, an error trace, and the current\nlibrary version snapshot. Output ONLY a unified diff that fixes the script.\n\nRules:\n1. Use canonical unified-diff format with `--- a/train.py` / `+++ b/train.py`\n headers and `@@ ... @@` hunk markers.\n2. Make the MINIMAL change that resolves the error AND preserves the original\n training intent. Do NOT add bare-except blocks, monkey-patches, or sys.exit\n calls.\n3. Do NOT add any prose, markdown fences, or thinking output \u2014 diff only.\n4. If the error is unfixable, output an empty diff.\n"}, {"role": "user", "content": "Library versions: transformers=4.40.0, datasets=2.18.0, trl=0.10.0\nTarget category hint: dataset_drift\n\nBroken script:\n```python\n\"\"\"DistilGPT2 causal-LM fine-tuning on 300 lines of WikiText (text generation).\"\"\"\nfrom transformers import (\n AutoTokenizer,\n AutoModelForCausalLM,\n Trainer,\n TrainingArguments,\n DataCollatorForLanguageModeling,\n)\nfrom datasets import load_dataset\n\ndataset = load_dataset(\"wikitext\", \"wikitext-2-raw-v1\", split=\"train[:300]\")\ntokenizer = AutoTokenizer.from_pretrained(\"distilgpt2\")\ntokenizer.pad_token = tokenizer.eos_token\n\n\ndef tokenize(examples):\n return tokenizer(examples[\"input_text\"], truncation=True, max_length=64)\n\n\ndataset = dataset.map(tokenize, batched=True, remove_columns=dataset.column_names)\n\nmodel = AutoModelForCausalLM.from_pretrained(\"distilgpt2\")\n\ntraining_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n num_train_epochs=1,\n per_device_train_batch_size=4,\n logging_steps=5,\n save_strategy=\"epoch\",\n no_cuda=True,\n report_to=\"none\",\n)\n\ntrainer = Trainer(\n model=model,\n args=training_args,\n train_dataset=dataset,\n data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),\n)\n\ntrainer.train()\ntrainer.save_model(\"/tmp/forge_output/checkpoint\")\nprint(\"TRAINING_COMPLETE\")\n\n```\n\nError trace:\n[simulated] Rename column text -> input_text\n\nOutput unified diff (no prose, no fences):"}, {"role": "assistant", "content": "--- a/train.py\n+++ b/train.py\n@@ -15,5 +15,5 @@\n \n def tokenize(examples):\n- return tokenizer(examples[\"input_text\"], truncation=True, max_length=64)\n+ return tokenizer(examples[\"text\"], truncation=True, max_length=64)\n \n \n"}]}
{"role_target": "repair_agent", "task_id": "gpt2_textgen", "primitive_type": "ChangeTokenizerBehavior", "category": "tokenizer_drift", "messages": [{"role": "system", "content": "You are the Repair Agent.\nYou see a broken HuggingFace training script, an error trace, and the current\nlibrary version snapshot. Output ONLY a unified diff that fixes the script.\n\nRules:\n1. Use canonical unified-diff format with `--- a/train.py` / `+++ b/train.py`\n headers and `@@ ... @@` hunk markers.\n2. Make the MINIMAL change that resolves the error AND preserves the original\n training intent. Do NOT add bare-except blocks, monkey-patches, or sys.exit\n calls.\n3. Do NOT add any prose, markdown fences, or thinking output \u2014 diff only.\n4. If the error is unfixable, output an empty diff.\n"}, {"role": "user", "content": "Library versions: transformers=4.40.0, datasets=2.18.0, trl=0.10.0\nTarget category hint: tokenizer_drift\n\nBroken script:\n```python\n\"\"\"DistilGPT2 causal-LM fine-tuning on 300 lines of WikiText (text generation).\"\"\"\nfrom transformers import (\n AutoTokenizer,\n AutoModelForCausalLM,\n Trainer,\n TrainingArguments,\n DataCollatorForLanguageModeling,\n)\nfrom datasets import load_dataset\n\ndataset = load_dataset(\"wikitext\", \"wikitext-2-raw-v1\", split=\"train[:300]\")\ntokenizer = AutoTokenizer.from_pretrained(\"distilgpt2\")\ntokenizer.pad_token = tokenizer.eos_token\n\n\ndef tokenize(examples):\n return tokenizer(examples[\"text\"], truncate=True, max_length=64)\n\n\ndataset = dataset.map(tokenize, batched=True, remove_columns=dataset.column_names)\n\nmodel = AutoModelForCausalLM.from_pretrained(\"distilgpt2\")\n\ntraining_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n num_train_epochs=1,\n per_device_train_batch_size=4,\n logging_steps=5,\n save_strategy=\"epoch\",\n no_cuda=True,\n report_to=\"none\",\n)\n\ntrainer = Trainer(\n model=model,\n args=training_args,\n train_dataset=dataset,\n data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),\n)\n\ntrainer.train()\ntrainer.save_model(\"/tmp/forge_output/checkpoint\")\nprint(\"TRAINING_COMPLETE\")\n\n```\n\nError trace:\n[simulated] Change tokenizer kwarg truncation=True -> truncate=True\n\nOutput unified diff (no prose, no fences):"}, {"role": "assistant", "content": "--- a/train.py\n+++ b/train.py\n@@ -15,5 +15,5 @@\n \n def tokenize(examples):\n- return tokenizer(examples[\"text\"], truncate=True, max_length=64)\n+ return tokenizer(examples[\"text\"], truncation=True, max_length=64)\n \n \n"}]}
{"role_target": "repair_agent", "task_id": "gpt2_textgen", "primitive_type": "RemoveDeprecatedMethod", "category": "api_drift", "messages": [{"role": "system", "content": "You are the Repair Agent.\nYou see a broken HuggingFace training script, an error trace, and the current\nlibrary version snapshot. Output ONLY a unified diff that fixes the script.\n\nRules:\n1. Use canonical unified-diff format with `--- a/train.py` / `+++ b/train.py`\n headers and `@@ ... @@` hunk markers.\n2. Make the MINIMAL change that resolves the error AND preserves the original\n training intent. Do NOT add bare-except blocks, monkey-patches, or sys.exit\n calls.\n3. Do NOT add any prose, markdown fences, or thinking output \u2014 diff only.\n4. If the error is unfixable, output an empty diff.\n"}, {"role": "user", "content": "Library versions: transformers=4.40.0, datasets=2.18.0, trl=0.10.0\nTarget category hint: api_drift\n\nBroken script:\n```python\n\"\"\"DistilGPT2 causal-LM fine-tuning on 300 lines of WikiText (text generation).\"\"\"\nfrom transformers import (\n AutoTokenizer,\n AutoModelForCausalLM,\n Trainer,\n TrainingArguments,\n DataCollatorForLanguageModeling,\n)\nfrom datasets import load_dataset\n\ndataset = load_dataset(\"wikitext\", \"wikitext-2-raw-v1\", split=\"train[:300]\")\ntokenizer = AutoTokenizer.from_pretrained(\"distilgpt2\")\ntokenizer.pad_token = tokenizer.eos_token\n\n\ndef tokenize(examples):\n return tokenizer(examples[\"text\"], truncation=True, max_length=64)\n\n\ndataset = dataset.map(tokenize, batched=True, remove_columns=dataset.column_names)\n\nmodel = AutoModelForCausalLM.from_pretrained(\"distilgpt2\")\n\ntraining_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n num_train_epochs=1,\n per_device_train_batch_size=4,\n logging_steps=5,\n save_strategy=\"epoch\",\n no_cuda=True,\n report_to=\"none\",\n)\n\ntrainer = Trainer(\n model=model,\n args=training_args,\n train_dataset=dataset,\n data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),\n)\n\ntrainer.train()\ntrainer.save_model_DEPRECATED(\"/tmp/forge_output/checkpoint\")\nprint(\"TRAINING_COMPLETE\")\n\n```\n\nError trace:\n[simulated] Remove Trainer.save_model\n\nOutput unified diff (no prose, no fences):"}, {"role": "assistant", "content": "--- a/train.py\n+++ b/train.py\n@@ -40,4 +40,4 @@\n \n trainer.train()\n-trainer.save_model_DEPRECATED(\"/tmp/forge_output/checkpoint\")\n+trainer.save_model(\"/tmp/forge_output/checkpoint\")\n print(\"TRAINING_COMPLETE\")\n"}]}
{"role_target": "repair_agent", "task_id": "roberta_sentiment", "primitive_type": "RenameApiCall", "category": "api_drift", "messages": [{"role": "system", "content": "You are the Repair Agent.\nYou see a broken HuggingFace training script, an error trace, and the current\nlibrary version snapshot. Output ONLY a unified diff that fixes the script.\n\nRules:\n1. Use canonical unified-diff format with `--- a/train.py` / `+++ b/train.py`\n headers and `@@ ... @@` hunk markers.\n2. Make the MINIMAL change that resolves the error AND preserves the original\n training intent. Do NOT add bare-except blocks, monkey-patches, or sys.exit\n calls.\n3. Do NOT add any prose, markdown fences, or thinking output \u2014 diff only.\n4. If the error is unfixable, output an empty diff.\n"}, {"role": "user", "content": "Library versions: transformers=4.40.0, datasets=2.18.0, trl=0.10.0\nTarget category hint: api_drift\n\nBroken script:\n```python\n\"\"\"DistilRoberta sentiment classification on 400-sample IMDB subset.\"\"\"\nfrom transformers import (\n AutoTokenizer,\n AutoModelForSequenceClassification,\n Trainer,\n TrainingArguments,\n)\nfrom datasets import load_dataset\n\ndataset = load_dataset(\"imdb\", split=\"train[:400]\")\ntokenizer = AutoTokenizer.from_pretrained(\"distilroberta-base\")\n\n\ndef tokenize(examples):\n return tokenizer(\n examples[\"text\"],\n padding=\"max_length\",\n truncation=True,\n max_length=64,\n )\n\n\ndataset = dataset.map(tokenize, batched=True)\ndataset = dataset.rename_column(\"label\", \"labels\")\ndataset.set_format(\"torch\", columns=[\"input_ids\", \"attention_mask\", \"labels\"])\n\nmodel = AutoModelForSequenceClassification.from_pretrained(\n \"distilroberta-base\", num_labels=2\n)\n\ntraining_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n num_train_epochs=1,\n per_device_train_batch_size=8,\n logging_steps=5,\n save_strategy=\"epoch\",\n no_cuda=True,\n report_to=\"none\",\n)\n\ntrainer = Trainer(model=model, args=training_args, train_dataset=dataset)\ntrainer.start_training()\ntrainer.save_model(\"/tmp/forge_output/checkpoint\")\nprint(\"TRAINING_COMPLETE\")\n\n```\n\nError trace:\n[simulated] Rename trainer.train -> trainer.start_training\n\nOutput unified diff (no prose, no fences):"}, {"role": "assistant", "content": "--- a/train.py\n+++ b/train.py\n@@ -40,5 +40,5 @@\n \n trainer = Trainer(model=model, args=training_args, train_dataset=dataset)\n-trainer.start_training()\n+trainer.train()\n trainer.save_model(\"/tmp/forge_output/checkpoint\")\n print(\"TRAINING_COMPLETE\")\n"}]}
{"role_target": "repair_agent", "task_id": "roberta_sentiment", "primitive_type": "RenameApiCall", "category": "api_drift", "messages": [{"role": "system", "content": "You are the Repair Agent.\nYou see a broken HuggingFace training script, an error trace, and the current\nlibrary version snapshot. Output ONLY a unified diff that fixes the script.\n\nRules:\n1. Use canonical unified-diff format with `--- a/train.py` / `+++ b/train.py`\n headers and `@@ ... @@` hunk markers.\n2. Make the MINIMAL change that resolves the error AND preserves the original\n training intent. Do NOT add bare-except blocks, monkey-patches, or sys.exit\n calls.\n3. Do NOT add any prose, markdown fences, or thinking output \u2014 diff only.\n4. If the error is unfixable, output an empty diff.\n"}, {"role": "user", "content": "Library versions: transformers=4.40.0, datasets=2.18.0, trl=0.10.0\nTarget category hint: api_drift\n\nBroken script:\n```python\n\"\"\"DistilRoberta sentiment classification on 400-sample IMDB subset.\"\"\"\nfrom transformers import (\n AutoTokenizer,\n AutoModelForSequenceClassification,\n Trainer,\n TrainingArguments,\n)\nfrom datasets import load_dataset\n\ndataset = load_dataset(\"imdb\", split=\"train[:400]\")\ntokenizer = AutoTokenizer.load_from_hub(\"distilroberta-base\")\n\n\ndef tokenize(examples):\n return tokenizer(\n examples[\"text\"],\n padding=\"max_length\",\n truncation=True,\n max_length=64,\n )\n\n\ndataset = dataset.map(tokenize, batched=True)\ndataset = dataset.rename_column(\"label\", \"labels\")\ndataset.set_format(\"torch\", columns=[\"input_ids\", \"attention_mask\", \"labels\"])\n\nmodel = AutoModelForSequenceClassification.load_from_hub(\n \"distilroberta-base\", num_labels=2\n)\n\ntraining_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n num_train_epochs=1,\n per_device_train_batch_size=8,\n logging_steps=5,\n save_strategy=\"epoch\",\n no_cuda=True,\n report_to=\"none\",\n)\n\ntrainer = Trainer(model=model, args=training_args, train_dataset=dataset)\ntrainer.train()\ntrainer.save_model(\"/tmp/forge_output/checkpoint\")\nprint(\"TRAINING_COMPLETE\")\n\n```\n\nError trace:\n[simulated] Rename from_pretrained -> load_from_hub\n\nOutput unified diff (no prose, no fences):"}, {"role": "assistant", "content": "--- a/train.py\n+++ b/train.py\n@@ -9,5 +9,5 @@\n \n dataset = load_dataset(\"imdb\", split=\"train[:400]\")\n-tokenizer = AutoTokenizer.load_from_hub(\"distilroberta-base\")\n+tokenizer = AutoTokenizer.from_pretrained(\"distilroberta-base\")\n \n \n@@ -25,5 +25,5 @@\n dataset.set_format(\"torch\", columns=[\"input_ids\", \"attention_mask\", \"labels\"])\n \n-model = AutoModelForSequenceClassification.load_from_hub(\n+model = AutoModelForSequenceClassification.from_pretrained(\n \"distilroberta-base\", num_labels=2\n )\n"}]}
{"role_target": "repair_agent", "task_id": "roberta_sentiment", "primitive_type": "ChangeArgumentSignature", "category": "api_drift", "messages": [{"role": "system", "content": "You are the Repair Agent.\nYou see a broken HuggingFace training script, an error trace, and the current\nlibrary version snapshot. Output ONLY a unified diff that fixes the script.\n\nRules:\n1. Use canonical unified-diff format with `--- a/train.py` / `+++ b/train.py`\n headers and `@@ ... @@` hunk markers.\n2. Make the MINIMAL change that resolves the error AND preserves the original\n training intent. Do NOT add bare-except blocks, monkey-patches, or sys.exit\n calls.\n3. Do NOT add any prose, markdown fences, or thinking output \u2014 diff only.\n4. If the error is unfixable, output an empty diff.\n"}, {"role": "user", "content": "Library versions: transformers=4.40.0, datasets=2.18.0, trl=0.10.0\nTarget category hint: api_drift\n\nBroken script:\n```python\n\"\"\"DistilRoberta sentiment classification on 400-sample IMDB subset.\"\"\"\nfrom transformers import (\n AutoTokenizer,\n AutoModelForSequenceClassification,\n Trainer,\n TrainingArguments,\n)\nfrom datasets import load_dataset\n\ndataset = load_dataset(\"imdb\", split=\"train[:400]\")\ntokenizer = AutoTokenizer.from_pretrained(\"distilroberta-base\")\n\n\ndef tokenize(examples):\n return tokenizer(\n examples[\"text\"],\n padding=\"max_length\",\n truncation=True,\n max_length=64,\n )\n\n\ndataset = dataset.map(tokenize, batched=True)\ndataset = dataset.rename_column(\"label\", \"labels\")\ndataset.set_format(\"torch\", columns=[\"input_ids\", \"attention_mask\", \"labels\"])\n\nmodel = AutoModelForSequenceClassification.from_pretrained(\n \"distilroberta-base\", num_labels=2\n)\n\ntraining_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n per_device_train_batch_size=8,\n logging_steps=5,\n save_strategy=\"epoch\",\n no_cuda=True,\n report_to=\"none\",\n)\n\ntrainer = Trainer(model=model, args=training_args, train_dataset=dataset)\ntrainer.train()\ntrainer.save_model(\"/tmp/forge_output/checkpoint\")\nprint(\"TRAINING_COMPLETE\")\n\n```\n\nError trace:\n[simulated] Change args of TrainingArguments: -num_train_epochs +max_steps\n\nOutput unified diff (no prose, no fences):"}, {"role": "assistant", "content": "--- a/train.py\n+++ b/train.py\n@@ -31,4 +31,5 @@\n training_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n+ num_train_epochs=1,\n per_device_train_batch_size=8,\n logging_steps=5,\n"}]}
{"role_target": "repair_agent", "task_id": "roberta_sentiment", "primitive_type": "ModifyConfigField", "category": "config_drift", "messages": [{"role": "system", "content": "You are the Repair Agent.\nYou see a broken HuggingFace training script, an error trace, and the current\nlibrary version snapshot. Output ONLY a unified diff that fixes the script.\n\nRules:\n1. Use canonical unified-diff format with `--- a/train.py` / `+++ b/train.py`\n headers and `@@ ... @@` hunk markers.\n2. Make the MINIMAL change that resolves the error AND preserves the original\n training intent. Do NOT add bare-except blocks, monkey-patches, or sys.exit\n calls.\n3. Do NOT add any prose, markdown fences, or thinking output \u2014 diff only.\n4. If the error is unfixable, output an empty diff.\n"}, {"role": "user", "content": "Library versions: transformers=4.40.0, datasets=2.18.0, trl=0.10.0\nTarget category hint: config_drift\n\nBroken script:\n```python\n\"\"\"DistilRoberta sentiment classification on 400-sample IMDB subset.\"\"\"\nfrom transformers import (\n AutoTokenizer,\n AutoModelForSequenceClassification,\n Trainer,\n TrainingArguments,\n)\nfrom datasets import load_dataset\n\ndataset = load_dataset(\"imdb\", split=\"train[:400]\")\ntokenizer = AutoTokenizer.from_pretrained(\"distilroberta-base\")\n\n\ndef tokenize(examples):\n return tokenizer(\n examples[\"text\"],\n padding=\"max_length\",\n truncation=True,\n max_length=64,\n )\n\n\ndataset = dataset.map(tokenize, batched=True)\ndataset = dataset.rename_column(\"label\", \"labels\")\ndataset.set_format(\"torch\", columns=[\"input_ids\", \"attention_mask\", \"labels\"])\n\nmodel = AutoModelForSequenceClassification.from_pretrained(\n \"distilroberta-base\", num_labels=2\n)\n\ntraining_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n num_train_epochs=1,\n per_device_train_batch_size=1,\n logging_steps=5,\n save_strategy=\"epoch\",\n no_cuda=True,\n report_to=\"none\",\n)\n\ntrainer = Trainer(model=model, args=training_args, train_dataset=dataset)\ntrainer.train()\ntrainer.save_model(\"/tmp/forge_output/checkpoint\")\nprint(\"TRAINING_COMPLETE\")\n\n```\n\nError trace:\n[simulated] Change TrainingArguments.per_device_train_batch_size\n\nOutput unified diff (no prose, no fences):"}, {"role": "assistant", "content": "--- a/train.py\n+++ b/train.py\n@@ -32,5 +32,5 @@\n output_dir=\"/tmp/forge_output/checkpoint\",\n num_train_epochs=1,\n- per_device_train_batch_size=1,\n+ per_device_train_batch_size=8,\n logging_steps=5,\n save_strategy=\"epoch\",\n"}]}
{"role_target": "repair_agent", "task_id": "roberta_sentiment", "primitive_type": "RestructureDatasetSchema", "category": "dataset_drift", "messages": [{"role": "system", "content": "You are the Repair Agent.\nYou see a broken HuggingFace training script, an error trace, and the current\nlibrary version snapshot. Output ONLY a unified diff that fixes the script.\n\nRules:\n1. Use canonical unified-diff format with `--- a/train.py` / `+++ b/train.py`\n headers and `@@ ... @@` hunk markers.\n2. Make the MINIMAL change that resolves the error AND preserves the original\n training intent. Do NOT add bare-except blocks, monkey-patches, or sys.exit\n calls.\n3. Do NOT add any prose, markdown fences, or thinking output \u2014 diff only.\n4. If the error is unfixable, output an empty diff.\n"}, {"role": "user", "content": "Library versions: transformers=4.40.0, datasets=2.18.0, trl=0.10.0\nTarget category hint: dataset_drift\n\nBroken script:\n```python\n\"\"\"DistilRoberta sentiment classification on 400-sample IMDB subset.\"\"\"\nfrom transformers import (\n AutoTokenizer,\n AutoModelForSequenceClassification,\n Trainer,\n TrainingArguments,\n)\nfrom datasets import load_dataset\n\ndataset = load_dataset(\"imdb\", split=\"train[:400]\")\ntokenizer = AutoTokenizer.from_pretrained(\"distilroberta-base\")\n\n\ndef tokenize(examples):\n return tokenizer(\n examples[\"input_text\"],\n padding=\"max_length\",\n truncation=True,\n max_length=64,\n )\n\n\ndataset = dataset.map(tokenize, batched=True)\ndataset = dataset.rename_column(\"label\", \"labels\")\ndataset.set_format(\"torch\", columns=[\"input_ids\", \"attention_mask\", \"labels\"])\n\nmodel = AutoModelForSequenceClassification.from_pretrained(\n \"distilroberta-base\", num_labels=2\n)\n\ntraining_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n num_train_epochs=1,\n per_device_train_batch_size=8,\n logging_steps=5,\n save_strategy=\"epoch\",\n no_cuda=True,\n report_to=\"none\",\n)\n\ntrainer = Trainer(model=model, args=training_args, train_dataset=dataset)\ntrainer.train()\ntrainer.save_model(\"/tmp/forge_output/checkpoint\")\nprint(\"TRAINING_COMPLETE\")\n\n```\n\nError trace:\n[simulated] Rename column text -> input_text\n\nOutput unified diff (no prose, no fences):"}, {"role": "assistant", "content": "--- a/train.py\n+++ b/train.py\n@@ -14,5 +14,5 @@\n def tokenize(examples):\n return tokenizer(\n- examples[\"input_text\"],\n+ examples[\"text\"],\n padding=\"max_length\",\n truncation=True,\n"}]}
{"role_target": "repair_agent", "task_id": "roberta_sentiment", "primitive_type": "RestructureDatasetSchema", "category": "dataset_drift", "messages": [{"role": "system", "content": "You are the Repair Agent.\nYou see a broken HuggingFace training script, an error trace, and the current\nlibrary version snapshot. Output ONLY a unified diff that fixes the script.\n\nRules:\n1. Use canonical unified-diff format with `--- a/train.py` / `+++ b/train.py`\n headers and `@@ ... @@` hunk markers.\n2. Make the MINIMAL change that resolves the error AND preserves the original\n training intent. Do NOT add bare-except blocks, monkey-patches, or sys.exit\n calls.\n3. Do NOT add any prose, markdown fences, or thinking output \u2014 diff only.\n4. If the error is unfixable, output an empty diff.\n"}, {"role": "user", "content": "Library versions: transformers=4.40.0, datasets=2.18.0, trl=0.10.0\nTarget category hint: dataset_drift\n\nBroken script:\n```python\n\"\"\"DistilRoberta sentiment classification on 400-sample IMDB subset.\"\"\"\nfrom transformers import (\n AutoTokenizer,\n AutoModelForSequenceClassification,\n Trainer,\n TrainingArguments,\n)\nfrom datasets import load_dataset\n\ndataset = load_dataset(\"imdb\", split=\"train[:400]\")\ntokenizer = AutoTokenizer.from_pretrained(\"distilroberta-base\")\n\n\ndef tokenize(examples):\n return tokenizer(\n examples[\"text\"],\n padding=\"max_length\",\n truncation=True,\n max_length=64,\n )\n\n\ndataset = dataset.map(tokenize, batched=True)\ndataset = dataset.rename_column(\"labels\", \"labels\")\ndataset.set_format(\"torch\", columns=[\"input_ids\", \"attention_mask\", \"labels\"])\n\nmodel = AutoModelForSequenceClassification.from_pretrained(\n \"distilroberta-base\", num_labels=2\n)\n\ntraining_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n num_train_epochs=1,\n per_device_train_batch_size=8,\n logging_steps=5,\n save_strategy=\"epoch\",\n no_cuda=True,\n report_to=\"none\",\n)\n\ntrainer = Trainer(model=model, args=training_args, train_dataset=dataset)\ntrainer.train()\ntrainer.save_model(\"/tmp/forge_output/checkpoint\")\nprint(\"TRAINING_COMPLETE\")\n\n```\n\nError trace:\n[simulated] Rename column label -> labels\n\nOutput unified diff (no prose, no fences):"}, {"role": "assistant", "content": "--- a/train.py\n+++ b/train.py\n@@ -22,5 +22,5 @@\n \n dataset = dataset.map(tokenize, batched=True)\n-dataset = dataset.rename_column(\"labels\", \"labels\")\n+dataset = dataset.rename_column(\"label\", \"labels\")\n dataset.set_format(\"torch\", columns=[\"input_ids\", \"attention_mask\", \"labels\"])\n \n"}]}
{"role_target": "repair_agent", "task_id": "roberta_sentiment", "primitive_type": "ChangeTokenizerBehavior", "category": "tokenizer_drift", "messages": [{"role": "system", "content": "You are the Repair Agent.\nYou see a broken HuggingFace training script, an error trace, and the current\nlibrary version snapshot. Output ONLY a unified diff that fixes the script.\n\nRules:\n1. Use canonical unified-diff format with `--- a/train.py` / `+++ b/train.py`\n headers and `@@ ... @@` hunk markers.\n2. Make the MINIMAL change that resolves the error AND preserves the original\n training intent. Do NOT add bare-except blocks, monkey-patches, or sys.exit\n calls.\n3. Do NOT add any prose, markdown fences, or thinking output \u2014 diff only.\n4. If the error is unfixable, output an empty diff.\n"}, {"role": "user", "content": "Library versions: transformers=4.40.0, datasets=2.18.0, trl=0.10.0\nTarget category hint: tokenizer_drift\n\nBroken script:\n```python\n\"\"\"DistilRoberta sentiment classification on 400-sample IMDB subset.\"\"\"\nfrom transformers import (\n AutoTokenizer,\n AutoModelForSequenceClassification,\n Trainer,\n TrainingArguments,\n)\nfrom datasets import load_dataset\n\ndataset = load_dataset(\"imdb\", split=\"train[:400]\")\ntokenizer = AutoTokenizer.from_pretrained(\"distilroberta-base\")\n\n\ndef tokenize(examples):\n return tokenizer(\n examples[\"text\"],\n padding=\"max_length\",\n truncate=True,\n max_length=64,\n )\n\n\ndataset = dataset.map(tokenize, batched=True)\ndataset = dataset.rename_column(\"label\", \"labels\")\ndataset.set_format(\"torch\", columns=[\"input_ids\", \"attention_mask\", \"labels\"])\n\nmodel = AutoModelForSequenceClassification.from_pretrained(\n \"distilroberta-base\", num_labels=2\n)\n\ntraining_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n num_train_epochs=1,\n per_device_train_batch_size=8,\n logging_steps=5,\n save_strategy=\"epoch\",\n no_cuda=True,\n report_to=\"none\",\n)\n\ntrainer = Trainer(model=model, args=training_args, train_dataset=dataset)\ntrainer.train()\ntrainer.save_model(\"/tmp/forge_output/checkpoint\")\nprint(\"TRAINING_COMPLETE\")\n\n```\n\nError trace:\n[simulated] Change tokenizer kwarg truncation=True -> truncate=True\n\nOutput unified diff (no prose, no fences):"}, {"role": "assistant", "content": "--- a/train.py\n+++ b/train.py\n@@ -16,5 +16,5 @@\n examples[\"text\"],\n padding=\"max_length\",\n- truncate=True,\n+ truncation=True,\n max_length=64,\n )\n"}]}
{"role_target": "repair_agent", "task_id": "roberta_sentiment", "primitive_type": "RemoveDeprecatedMethod", "category": "api_drift", "messages": [{"role": "system", "content": "You are the Repair Agent.\nYou see a broken HuggingFace training script, an error trace, and the current\nlibrary version snapshot. Output ONLY a unified diff that fixes the script.\n\nRules:\n1. Use canonical unified-diff format with `--- a/train.py` / `+++ b/train.py`\n headers and `@@ ... @@` hunk markers.\n2. Make the MINIMAL change that resolves the error AND preserves the original\n training intent. Do NOT add bare-except blocks, monkey-patches, or sys.exit\n calls.\n3. Do NOT add any prose, markdown fences, or thinking output \u2014 diff only.\n4. If the error is unfixable, output an empty diff.\n"}, {"role": "user", "content": "Library versions: transformers=4.40.0, datasets=2.18.0, trl=0.10.0\nTarget category hint: api_drift\n\nBroken script:\n```python\n\"\"\"DistilRoberta sentiment classification on 400-sample IMDB subset.\"\"\"\nfrom transformers import (\n AutoTokenizer,\n AutoModelForSequenceClassification,\n Trainer,\n TrainingArguments,\n)\nfrom datasets import load_dataset\n\ndataset = load_dataset(\"imdb\", split=\"train[:400]\")\ntokenizer = AutoTokenizer.from_pretrained(\"distilroberta-base\")\n\n\ndef tokenize(examples):\n return tokenizer(\n examples[\"text\"],\n padding=\"max_length\",\n truncation=True,\n max_length=64,\n )\n\n\ndataset = dataset.map(tokenize, batched=True)\ndataset = dataset.rename_column(\"label\", \"labels\")\ndataset.set_format(\"torch\", columns=[\"input_ids\", \"attention_mask\", \"labels\"])\n\nmodel = AutoModelForSequenceClassification.from_pretrained(\n \"distilroberta-base\", num_labels=2\n)\n\ntraining_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n num_train_epochs=1,\n per_device_train_batch_size=8,\n logging_steps=5,\n save_strategy=\"epoch\",\n no_cuda=True,\n report_to=\"none\",\n)\n\ntrainer = Trainer(model=model, args=training_args, train_dataset=dataset)\ntrainer.train()\ntrainer.save_model_DEPRECATED(\"/tmp/forge_output/checkpoint\")\nprint(\"TRAINING_COMPLETE\")\n\n```\n\nError trace:\n[simulated] Remove Trainer.save_model\n\nOutput unified diff (no prose, no fences):"}, {"role": "assistant", "content": "--- a/train.py\n+++ b/train.py\n@@ -41,4 +41,4 @@\n trainer = Trainer(model=model, args=training_args, train_dataset=dataset)\n trainer.train()\n-trainer.save_model_DEPRECATED(\"/tmp/forge_output/checkpoint\")\n+trainer.save_model(\"/tmp/forge_output/checkpoint\")\n print(\"TRAINING_COMPLETE\")\n"}]}
{"role_target": "repair_agent", "task_id": "t5_summarization", "primitive_type": "RenameApiCall", "category": "api_drift", "messages": [{"role": "system", "content": "You are the Repair Agent.\nYou see a broken HuggingFace training script, an error trace, and the current\nlibrary version snapshot. Output ONLY a unified diff that fixes the script.\n\nRules:\n1. Use canonical unified-diff format with `--- a/train.py` / `+++ b/train.py`\n headers and `@@ ... @@` hunk markers.\n2. Make the MINIMAL change that resolves the error AND preserves the original\n training intent. Do NOT add bare-except blocks, monkey-patches, or sys.exit\n calls.\n3. Do NOT add any prose, markdown fences, or thinking output \u2014 diff only.\n4. If the error is unfixable, output an empty diff.\n"}, {"role": "user", "content": "Library versions: transformers=4.40.0, datasets=2.18.0, trl=0.10.0\nTarget category hint: api_drift\n\nBroken script:\n```python\n\"\"\"Tiny T5 fine-tuning for summarization on 100-sample CNN/DailyMail.\"\"\"\nfrom transformers import (\n AutoTokenizer,\n AutoModelForSeq2SeqLM,\n DataCollatorForSeq2Seq,\n Seq2SeqTrainer,\n Seq2SeqTrainingArguments,\n)\nfrom datasets import load_dataset\n\ndataset = load_dataset(\"cnn_dailymail\", \"3.0.0\", split=\"train[:100]\")\ntokenizer = AutoTokenizer.from_pretrained(\"google/flan-t5-small\")\n\n\ndef preprocess(examples):\n inputs = tokenizer(\n [\"summarize: \" + a for a in examples[\"article\"]],\n max_length=128,\n truncation=True,\n padding=\"max_length\",\n )\n targets = tokenizer(\n examples[\"highlights\"],\n max_length=32,\n truncation=True,\n padding=\"max_length\",\n )\n inputs[\"labels\"] = targets[\"input_ids\"]\n return inputs\n\n\ndataset = dataset.map(preprocess, batched=True, remove_columns=dataset.column_names)\n\nmodel = AutoModelForSeq2SeqLM.from_pretrained(\"google/flan-t5-small\")\n\ntraining_args = Seq2SeqTrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n num_train_epochs=1,\n per_device_train_batch_size=4,\n logging_steps=5,\n save_strategy=\"epoch\",\n no_cuda=True,\n report_to=\"none\",\n predict_with_generate=False,\n)\n\ntrainer = Seq2SeqTrainer(\n model=model,\n args=training_args,\n train_dataset=dataset,\n data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),\n)\ntrainer.start_training()\ntrainer.save_model(\"/tmp/forge_output/checkpoint\")\nprint(\"TRAINING_COMPLETE\")\n\n```\n\nError trace:\n[simulated] Rename trainer.train -> trainer.start_training\n\nOutput unified diff (no prose, no fences):"}, {"role": "assistant", "content": "--- a/train.py\n+++ b/train.py\n@@ -51,5 +51,5 @@\n data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),\n )\n-trainer.start_training()\n+trainer.train()\n trainer.save_model(\"/tmp/forge_output/checkpoint\")\n print(\"TRAINING_COMPLETE\")\n"}]}
{"role_target": "repair_agent", "task_id": "t5_summarization", "primitive_type": "RenameApiCall", "category": "api_drift", "messages": [{"role": "system", "content": "You are the Repair Agent.\nYou see a broken HuggingFace training script, an error trace, and the current\nlibrary version snapshot. Output ONLY a unified diff that fixes the script.\n\nRules:\n1. Use canonical unified-diff format with `--- a/train.py` / `+++ b/train.py`\n headers and `@@ ... @@` hunk markers.\n2. Make the MINIMAL change that resolves the error AND preserves the original\n training intent. Do NOT add bare-except blocks, monkey-patches, or sys.exit\n calls.\n3. Do NOT add any prose, markdown fences, or thinking output \u2014 diff only.\n4. If the error is unfixable, output an empty diff.\n"}, {"role": "user", "content": "Library versions: transformers=4.40.0, datasets=2.18.0, trl=0.10.0\nTarget category hint: api_drift\n\nBroken script:\n```python\n\"\"\"Tiny T5 fine-tuning for summarization on 100-sample CNN/DailyMail.\"\"\"\nfrom transformers import (\n AutoTokenizer,\n AutoModelForSeq2SeqLM,\n DataCollatorForSeq2Seq,\n Seq2SeqTrainer,\n Seq2SeqTrainingArguments,\n)\nfrom datasets import load_dataset\n\ndataset = load_dataset(\"cnn_dailymail\", \"3.0.0\", split=\"train[:100]\")\ntokenizer = AutoTokenizer.load_from_hub(\"google/flan-t5-small\")\n\n\ndef preprocess(examples):\n inputs = tokenizer(\n [\"summarize: \" + a for a in examples[\"article\"]],\n max_length=128,\n truncation=True,\n padding=\"max_length\",\n )\n targets = tokenizer(\n examples[\"highlights\"],\n max_length=32,\n truncation=True,\n padding=\"max_length\",\n )\n inputs[\"labels\"] = targets[\"input_ids\"]\n return inputs\n\n\ndataset = dataset.map(preprocess, batched=True, remove_columns=dataset.column_names)\n\nmodel = AutoModelForSeq2SeqLM.load_from_hub(\"google/flan-t5-small\")\n\ntraining_args = Seq2SeqTrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n num_train_epochs=1,\n per_device_train_batch_size=4,\n logging_steps=5,\n save_strategy=\"epoch\",\n no_cuda=True,\n report_to=\"none\",\n predict_with_generate=False,\n)\n\ntrainer = Seq2SeqTrainer(\n model=model,\n args=training_args,\n train_dataset=dataset,\n data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),\n)\ntrainer.train()\ntrainer.save_model(\"/tmp/forge_output/checkpoint\")\nprint(\"TRAINING_COMPLETE\")\n\n```\n\nError trace:\n[simulated] Rename from_pretrained -> load_from_hub\n\nOutput unified diff (no prose, no fences):"}, {"role": "assistant", "content": "--- a/train.py\n+++ b/train.py\n@@ -10,5 +10,5 @@\n \n dataset = load_dataset(\"cnn_dailymail\", \"3.0.0\", split=\"train[:100]\")\n-tokenizer = AutoTokenizer.load_from_hub(\"google/flan-t5-small\")\n+tokenizer = AutoTokenizer.from_pretrained(\"google/flan-t5-small\")\n \n \n@@ -32,5 +32,5 @@\n dataset = dataset.map(preprocess, batched=True, remove_columns=dataset.column_names)\n \n-model = AutoModelForSeq2SeqLM.load_from_hub(\"google/flan-t5-small\")\n+model = AutoModelForSeq2SeqLM.from_pretrained(\"google/flan-t5-small\")\n \n training_args = Seq2SeqTrainingArguments(\n"}]}
{"role_target": "repair_agent", "task_id": "t5_summarization", "primitive_type": "ChangeArgumentSignature", "category": "api_drift", "messages": [{"role": "system", "content": "You are the Repair Agent.\nYou see a broken HuggingFace training script, an error trace, and the current\nlibrary version snapshot. Output ONLY a unified diff that fixes the script.\n\nRules:\n1. Use canonical unified-diff format with `--- a/train.py` / `+++ b/train.py`\n headers and `@@ ... @@` hunk markers.\n2. Make the MINIMAL change that resolves the error AND preserves the original\n training intent. Do NOT add bare-except blocks, monkey-patches, or sys.exit\n calls.\n3. Do NOT add any prose, markdown fences, or thinking output \u2014 diff only.\n4. If the error is unfixable, output an empty diff.\n"}, {"role": "user", "content": "Library versions: transformers=4.40.0, datasets=2.18.0, trl=0.10.0\nTarget category hint: api_drift\n\nBroken script:\n```python\n\"\"\"Tiny T5 fine-tuning for summarization on 100-sample CNN/DailyMail.\"\"\"\nfrom transformers import (\n AutoTokenizer,\n AutoModelForSeq2SeqLM,\n DataCollatorForSeq2Seq,\n Seq2SeqTrainer,\n Seq2SeqTrainingArguments,\n)\nfrom datasets import load_dataset\n\ndataset = load_dataset(\"cnn_dailymail\", \"3.0.0\", split=\"train[:100]\")\ntokenizer = AutoTokenizer.from_pretrained(\"google/flan-t5-small\")\n\n\ndef preprocess(examples):\n inputs = tokenizer(\n [\"summarize: \" + a for a in examples[\"article\"]],\n max_length=128,\n truncation=True,\n padding=\"max_length\",\n )\n targets = tokenizer(\n examples[\"highlights\"],\n max_length=32,\n truncation=True,\n padding=\"max_length\",\n )\n inputs[\"labels\"] = targets[\"input_ids\"]\n return inputs\n\n\ndataset = dataset.map(preprocess, batched=True, remove_columns=dataset.column_names)\n\nmodel = AutoModelForSeq2SeqLM.from_pretrained(\"google/flan-t5-small\")\n\ntraining_args = Seq2SeqTrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n per_device_train_batch_size=4,\n logging_steps=5,\n save_strategy=\"epoch\",\n no_cuda=True,\n report_to=\"none\",\n predict_with_generate=False,\n)\n\ntrainer = Seq2SeqTrainer(\n model=model,\n args=training_args,\n train_dataset=dataset,\n data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),\n)\ntrainer.train()\ntrainer.save_model(\"/tmp/forge_output/checkpoint\")\nprint(\"TRAINING_COMPLETE\")\n\n```\n\nError trace:\n[simulated] Change args of TrainingArguments: -num_train_epochs +max_steps\n\nOutput unified diff (no prose, no fences):"}, {"role": "assistant", "content": "--- a/train.py\n+++ b/train.py\n@@ -36,4 +36,5 @@\n training_args = Seq2SeqTrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n+ num_train_epochs=1,\n per_device_train_batch_size=4,\n logging_steps=5,\n"}]}
{"role_target": "repair_agent", "task_id": "t5_summarization", "primitive_type": "ModifyConfigField", "category": "config_drift", "messages": [{"role": "system", "content": "You are the Repair Agent.\nYou see a broken HuggingFace training script, an error trace, and the current\nlibrary version snapshot. Output ONLY a unified diff that fixes the script.\n\nRules:\n1. Use canonical unified-diff format with `--- a/train.py` / `+++ b/train.py`\n headers and `@@ ... @@` hunk markers.\n2. Make the MINIMAL change that resolves the error AND preserves the original\n training intent. Do NOT add bare-except blocks, monkey-patches, or sys.exit\n calls.\n3. Do NOT add any prose, markdown fences, or thinking output \u2014 diff only.\n4. If the error is unfixable, output an empty diff.\n"}, {"role": "user", "content": "Library versions: transformers=4.40.0, datasets=2.18.0, trl=0.10.0\nTarget category hint: config_drift\n\nBroken script:\n```python\n\"\"\"Tiny T5 fine-tuning for summarization on 100-sample CNN/DailyMail.\"\"\"\nfrom transformers import (\n AutoTokenizer,\n AutoModelForSeq2SeqLM,\n DataCollatorForSeq2Seq,\n Seq2SeqTrainer,\n Seq2SeqTrainingArguments,\n)\nfrom datasets import load_dataset\n\ndataset = load_dataset(\"cnn_dailymail\", \"3.0.0\", split=\"train[:100]\")\ntokenizer = AutoTokenizer.from_pretrained(\"google/flan-t5-small\")\n\n\ndef preprocess(examples):\n inputs = tokenizer(\n [\"summarize: \" + a for a in examples[\"article\"]],\n max_length=128,\n truncation=True,\n padding=\"max_length\",\n )\n targets = tokenizer(\n examples[\"highlights\"],\n max_length=32,\n truncation=True,\n padding=\"max_length\",\n )\n inputs[\"labels\"] = targets[\"input_ids\"]\n return inputs\n\n\ndataset = dataset.map(preprocess, batched=True, remove_columns=dataset.column_names)\n\nmodel = AutoModelForSeq2SeqLM.from_pretrained(\"google/flan-t5-small\")\n\ntraining_args = Seq2SeqTrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n num_train_epochs=1,\n per_device_train_batch_size=1,\n logging_steps=5,\n save_strategy=\"epoch\",\n no_cuda=True,\n report_to=\"none\",\n predict_with_generate=False,\n)\n\ntrainer = Seq2SeqTrainer(\n model=model,\n args=training_args,\n train_dataset=dataset,\n data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),\n)\ntrainer.train()\ntrainer.save_model(\"/tmp/forge_output/checkpoint\")\nprint(\"TRAINING_COMPLETE\")\n\n```\n\nError trace:\n[simulated] Change TrainingArguments.per_device_train_batch_size\n\nOutput unified diff (no prose, no fences):"}, {"role": "assistant", "content": "--- a/train.py\n+++ b/train.py\n@@ -37,5 +37,5 @@\n output_dir=\"/tmp/forge_output/checkpoint\",\n num_train_epochs=1,\n- per_device_train_batch_size=1,\n+ per_device_train_batch_size=4,\n logging_steps=5,\n save_strategy=\"epoch\",\n"}]}
{"role_target": "repair_agent", "task_id": "t5_summarization", "primitive_type": "ChangeTokenizerBehavior", "category": "tokenizer_drift", "messages": [{"role": "system", "content": "You are the Repair Agent.\nYou see a broken HuggingFace training script, an error trace, and the current\nlibrary version snapshot. Output ONLY a unified diff that fixes the script.\n\nRules:\n1. Use canonical unified-diff format with `--- a/train.py` / `+++ b/train.py`\n headers and `@@ ... @@` hunk markers.\n2. Make the MINIMAL change that resolves the error AND preserves the original\n training intent. Do NOT add bare-except blocks, monkey-patches, or sys.exit\n calls.\n3. Do NOT add any prose, markdown fences, or thinking output \u2014 diff only.\n4. If the error is unfixable, output an empty diff.\n"}, {"role": "user", "content": "Library versions: transformers=4.40.0, datasets=2.18.0, trl=0.10.0\nTarget category hint: tokenizer_drift\n\nBroken script:\n```python\n\"\"\"Tiny T5 fine-tuning for summarization on 100-sample CNN/DailyMail.\"\"\"\nfrom transformers import (\n AutoTokenizer,\n AutoModelForSeq2SeqLM,\n DataCollatorForSeq2Seq,\n Seq2SeqTrainer,\n Seq2SeqTrainingArguments,\n)\nfrom datasets import load_dataset\n\ndataset = load_dataset(\"cnn_dailymail\", \"3.0.0\", split=\"train[:100]\")\ntokenizer = AutoTokenizer.from_pretrained(\"google/flan-t5-small\")\n\n\ndef preprocess(examples):\n inputs = tokenizer(\n [\"summarize: \" + a for a in examples[\"article\"]],\n max_length=128,\n truncate=True,\n padding=\"max_length\",\n )\n targets = tokenizer(\n examples[\"highlights\"],\n max_length=32,\n truncate=True,\n padding=\"max_length\",\n )\n inputs[\"labels\"] = targets[\"input_ids\"]\n return inputs\n\n\ndataset = dataset.map(preprocess, batched=True, remove_columns=dataset.column_names)\n\nmodel = AutoModelForSeq2SeqLM.from_pretrained(\"google/flan-t5-small\")\n\ntraining_args = Seq2SeqTrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n num_train_epochs=1,\n per_device_train_batch_size=4,\n logging_steps=5,\n save_strategy=\"epoch\",\n no_cuda=True,\n report_to=\"none\",\n predict_with_generate=False,\n)\n\ntrainer = Seq2SeqTrainer(\n model=model,\n args=training_args,\n train_dataset=dataset,\n data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),\n)\ntrainer.train()\ntrainer.save_model(\"/tmp/forge_output/checkpoint\")\nprint(\"TRAINING_COMPLETE\")\n\n```\n\nError trace:\n[simulated] Change tokenizer kwarg truncation=True -> truncate=True\n\nOutput unified diff (no prose, no fences):"}, {"role": "assistant", "content": "--- a/train.py\n+++ b/train.py\n@@ -17,5 +17,5 @@\n [\"summarize: \" + a for a in examples[\"article\"]],\n max_length=128,\n- truncate=True,\n+ truncation=True,\n padding=\"max_length\",\n )\n@@ -23,5 +23,5 @@\n examples[\"highlights\"],\n max_length=32,\n- truncate=True,\n+ truncation=True,\n padding=\"max_length\",\n )\n"}]}
{"role_target": "repair_agent", "task_id": "t5_summarization", "primitive_type": "RemoveDeprecatedMethod", "category": "api_drift", "messages": [{"role": "system", "content": "You are the Repair Agent.\nYou see a broken HuggingFace training script, an error trace, and the current\nlibrary version snapshot. Output ONLY a unified diff that fixes the script.\n\nRules:\n1. Use canonical unified-diff format with `--- a/train.py` / `+++ b/train.py`\n headers and `@@ ... @@` hunk markers.\n2. Make the MINIMAL change that resolves the error AND preserves the original\n training intent. Do NOT add bare-except blocks, monkey-patches, or sys.exit\n calls.\n3. Do NOT add any prose, markdown fences, or thinking output \u2014 diff only.\n4. If the error is unfixable, output an empty diff.\n"}, {"role": "user", "content": "Library versions: transformers=4.40.0, datasets=2.18.0, trl=0.10.0\nTarget category hint: api_drift\n\nBroken script:\n```python\n\"\"\"Tiny T5 fine-tuning for summarization on 100-sample CNN/DailyMail.\"\"\"\nfrom transformers import (\n AutoTokenizer,\n AutoModelForSeq2SeqLM,\n DataCollatorForSeq2Seq,\n Seq2SeqTrainer,\n Seq2SeqTrainingArguments,\n)\nfrom datasets import load_dataset\n\ndataset = load_dataset(\"cnn_dailymail\", \"3.0.0\", split=\"train[:100]\")\ntokenizer = AutoTokenizer.from_pretrained(\"google/flan-t5-small\")\n\n\ndef preprocess(examples):\n inputs = tokenizer(\n [\"summarize: \" + a for a in examples[\"article\"]],\n max_length=128,\n truncation=True,\n padding=\"max_length\",\n )\n targets = tokenizer(\n examples[\"highlights\"],\n max_length=32,\n truncation=True,\n padding=\"max_length\",\n )\n inputs[\"labels\"] = targets[\"input_ids\"]\n return inputs\n\n\ndataset = dataset.map(preprocess, batched=True, remove_columns=dataset.column_names)\n\nmodel = AutoModelForSeq2SeqLM.from_pretrained(\"google/flan-t5-small\")\n\ntraining_args = Seq2SeqTrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n num_train_epochs=1,\n per_device_train_batch_size=4,\n logging_steps=5,\n save_strategy=\"epoch\",\n no_cuda=True,\n report_to=\"none\",\n predict_with_generate=False,\n)\n\ntrainer = Seq2SeqTrainer(\n model=model,\n args=training_args,\n train_dataset=dataset,\n data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),\n)\ntrainer.train()\ntrainer.save_model_DEPRECATED(\"/tmp/forge_output/checkpoint\")\nprint(\"TRAINING_COMPLETE\")\n\n```\n\nError trace:\n[simulated] Remove Trainer.save_model\n\nOutput unified diff (no prose, no fences):"}, {"role": "assistant", "content": "--- a/train.py\n+++ b/train.py\n@@ -52,4 +52,4 @@\n )\n trainer.train()\n-trainer.save_model_DEPRECATED(\"/tmp/forge_output/checkpoint\")\n+trainer.save_model(\"/tmp/forge_output/checkpoint\")\n print(\"TRAINING_COMPLETE\")\n"}]}
{"role_target": "repair_agent", "task_id": "t5_summarization", "primitive_type": "ChangeReturnType", "category": "api_drift", "messages": [{"role": "system", "content": "You are the Repair Agent.\nYou see a broken HuggingFace training script, an error trace, and the current\nlibrary version snapshot. Output ONLY a unified diff that fixes the script.\n\nRules:\n1. Use canonical unified-diff format with `--- a/train.py` / `+++ b/train.py`\n headers and `@@ ... @@` hunk markers.\n2. Make the MINIMAL change that resolves the error AND preserves the original\n training intent. Do NOT add bare-except blocks, monkey-patches, or sys.exit\n calls.\n3. Do NOT add any prose, markdown fences, or thinking output \u2014 diff only.\n4. If the error is unfixable, output an empty diff.\n"}, {"role": "user", "content": "Library versions: transformers=4.40.0, datasets=2.18.0, trl=0.10.0\nTarget category hint: api_drift\n\nBroken script:\n```python\n\"\"\"Tiny T5 fine-tuning for summarization on 100-sample CNN/DailyMail.\"\"\"\nfrom transformers import (\n AutoTokenizer,\n AutoModelForSeq2SeqLM,\n DataCollatorForSeq2Seq,\n Seq2SeqTrainer,\n Seq2SeqTrainingArguments,\n)\nfrom datasets import load_dataset\n\ndataset = load_dataset(\"cnn_dailymail\", \"3.0.0\", split=\"train[:100]\")\ntokenizer = AutoTokenizer.from_pretrained(\"google/flan-t5-small\")\n\n\ndef preprocess(examples):\n inputs = tokenizer(\n [\"summarize: \" + a for a in examples[\"article\"]],\n max_length=128,\n truncation=True,\n padding=\"max_length\",\n )\n targets = tokenizer(\n examples[\"highlights\"],\n max_length=32,\n truncation=True,\n padding=\"max_length\",\n )\n inputs[\"labels\"] = targets.input_ids\n return inputs\n\n\ndataset = dataset.map(preprocess, batched=True, remove_columns=dataset.column_names)\n\nmodel = AutoModelForSeq2SeqLM.from_pretrained(\"google/flan-t5-small\")\n\ntraining_args = Seq2SeqTrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n num_train_epochs=1,\n per_device_train_batch_size=4,\n logging_steps=5,\n save_strategy=\"epoch\",\n no_cuda=True,\n report_to=\"none\",\n predict_with_generate=False,\n)\n\ntrainer = Seq2SeqTrainer(\n model=model,\n args=training_args,\n train_dataset=dataset,\n data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),\n)\ntrainer.train()\ntrainer.save_model(\"/tmp/forge_output/checkpoint\")\nprint(\"TRAINING_COMPLETE\")\n\n```\n\nError trace:\n[simulated] Change return type of tokenizer\n\nOutput unified diff (no prose, no fences):"}, {"role": "assistant", "content": "--- a/train.py\n+++ b/train.py\n@@ -26,5 +26,5 @@\n padding=\"max_length\",\n )\n- inputs[\"labels\"] = targets.input_ids\n+ inputs[\"labels\"] = targets[\"input_ids\"]\n return inputs\n \n"}]}
{"role_target": "repair_agent", "task_id": "tiny_mlp_mnist", "primitive_type": "RestructureDatasetSchema", "category": "dataset_drift", "messages": [{"role": "system", "content": "You are the Repair Agent.\nYou see a broken HuggingFace training script, an error trace, and the current\nlibrary version snapshot. Output ONLY a unified diff that fixes the script.\n\nRules:\n1. Use canonical unified-diff format with `--- a/train.py` / `+++ b/train.py`\n headers and `@@ ... @@` hunk markers.\n2. Make the MINIMAL change that resolves the error AND preserves the original\n training intent. Do NOT add bare-except blocks, monkey-patches, or sys.exit\n calls.\n3. Do NOT add any prose, markdown fences, or thinking output \u2014 diff only.\n4. If the error is unfixable, output an empty diff.\n"}, {"role": "user", "content": "Library versions: transformers=4.40.0, datasets=2.18.0, trl=0.10.0\nTarget category hint: dataset_drift\n\nBroken script:\n```python\n\"\"\"Tiny PyTorch MLP on a 1000-sample MNIST subset (image classification baseline).\"\"\"\nimport torch\nimport torch.nn as nn\nfrom torch.utils.data import DataLoader\nfrom datasets import load_dataset\n\ndataset = load_dataset(\"mnist\", split=\"train[:1000]\")\ndataset = dataset.with_format(\"torch\")\n\n\ndef collate(batch):\n pixel = torch.stack([b[\"image\"].float().flatten() / 255.0 for b in batch])\n labels = torch.tensor([b[\"labels\"] for b in batch])\n return pixel, labels\n\n\nloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=collate)\n\nmodel = nn.Sequential(\n nn.Linear(784, 64),\n nn.ReLU(),\n nn.Linear(64, 10),\n)\n\noptimizer = torch.optim.Adam(model.parameters(), lr=1e-3)\ncriterion = nn.CrossEntropyLoss()\n\nfor epoch in range(2):\n for step, (x, y) in enumerate(loader, start=1):\n optimizer.zero_grad()\n loss = criterion(model(x), y)\n loss.backward()\n optimizer.step()\n if step % 5 == 0:\n print(f\"step={step} loss={loss.item():.4f}\")\n\ntorch.save(model.state_dict(), \"/tmp/forge_output/checkpoint/mlp.pt\")\nprint(\"TRAINING_COMPLETE\")\n\n```\n\nError trace:\n[simulated] Rename column label -> labels\n\nOutput unified diff (no prose, no fences):"}, {"role": "assistant", "content": "--- a/train.py\n+++ b/train.py\n@@ -11,5 +11,5 @@\n def collate(batch):\n pixel = torch.stack([b[\"image\"].float().flatten() / 255.0 for b in batch])\n- labels = torch.tensor([b[\"labels\"] for b in batch])\n+ labels = torch.tensor([b[\"label\"] for b in batch])\n return pixel, labels\n \n"}]}
{"role_target": "repair_agent", "task_id": "vit_cifar10", "primitive_type": "RenameApiCall", "category": "api_drift", "messages": [{"role": "system", "content": "You are the Repair Agent.\nYou see a broken HuggingFace training script, an error trace, and the current\nlibrary version snapshot. Output ONLY a unified diff that fixes the script.\n\nRules:\n1. Use canonical unified-diff format with `--- a/train.py` / `+++ b/train.py`\n headers and `@@ ... @@` hunk markers.\n2. Make the MINIMAL change that resolves the error AND preserves the original\n training intent. Do NOT add bare-except blocks, monkey-patches, or sys.exit\n calls.\n3. Do NOT add any prose, markdown fences, or thinking output \u2014 diff only.\n4. If the error is unfixable, output an empty diff.\n"}, {"role": "user", "content": "Library versions: transformers=4.40.0, datasets=2.18.0, trl=0.10.0\nTarget category hint: api_drift\n\nBroken script:\n```python\n\"\"\"Tiny ViT image classification on 200-sample CIFAR-10 subset.\"\"\"\nfrom transformers import (\n AutoImageProcessor,\n AutoModelForImageClassification,\n Trainer,\n TrainingArguments,\n)\nfrom datasets import load_dataset\nimport torch\n\ndataset = load_dataset(\"cifar10\", split=\"train[:200]\")\nprocessor = AutoImageProcessor.from_pretrained(\"WinKawaks/vit-tiny-patch16-224\")\n\n\ndef transform(batch):\n images = [img.convert(\"RGB\") for img in batch[\"img\"]]\n inputs = processor(images=images, return_tensors=\"pt\")\n inputs[\"labels\"] = torch.tensor(batch[\"label\"])\n return inputs\n\n\ndataset = dataset.with_transform(transform)\n\nmodel = AutoModelForImageClassification.from_pretrained(\n \"WinKawaks/vit-tiny-patch16-224\", num_labels=10, ignore_mismatched_sizes=True\n)\n\ntraining_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n num_train_epochs=1,\n per_device_train_batch_size=4,\n logging_steps=5,\n save_strategy=\"epoch\",\n no_cuda=True,\n report_to=\"none\",\n)\n\ntrainer = Trainer(model=model, args=training_args, train_dataset=dataset)\ntrainer.start_training()\ntrainer.save_model(\"/tmp/forge_output/checkpoint\")\nprint(\"TRAINING_COMPLETE\")\n\n```\n\nError trace:\n[simulated] Rename trainer.train -> trainer.start_training\n\nOutput unified diff (no prose, no fences):"}, {"role": "assistant", "content": "--- a/train.py\n+++ b/train.py\n@@ -37,5 +37,5 @@\n \n trainer = Trainer(model=model, args=training_args, train_dataset=dataset)\n-trainer.start_training()\n+trainer.train()\n trainer.save_model(\"/tmp/forge_output/checkpoint\")\n print(\"TRAINING_COMPLETE\")\n"}]}
{"role_target": "repair_agent", "task_id": "vit_cifar10", "primitive_type": "RenameApiCall", "category": "api_drift", "messages": [{"role": "system", "content": "You are the Repair Agent.\nYou see a broken HuggingFace training script, an error trace, and the current\nlibrary version snapshot. Output ONLY a unified diff that fixes the script.\n\nRules:\n1. Use canonical unified-diff format with `--- a/train.py` / `+++ b/train.py`\n headers and `@@ ... @@` hunk markers.\n2. Make the MINIMAL change that resolves the error AND preserves the original\n training intent. Do NOT add bare-except blocks, monkey-patches, or sys.exit\n calls.\n3. Do NOT add any prose, markdown fences, or thinking output \u2014 diff only.\n4. If the error is unfixable, output an empty diff.\n"}, {"role": "user", "content": "Library versions: transformers=4.40.0, datasets=2.18.0, trl=0.10.0\nTarget category hint: api_drift\n\nBroken script:\n```python\n\"\"\"Tiny ViT image classification on 200-sample CIFAR-10 subset.\"\"\"\nfrom transformers import (\n AutoImageProcessor,\n AutoModelForImageClassification,\n Trainer,\n TrainingArguments,\n)\nfrom datasets import load_dataset\nimport torch\n\ndataset = load_dataset(\"cifar10\", split=\"train[:200]\")\nprocessor = AutoImageProcessor.load_from_hub(\"WinKawaks/vit-tiny-patch16-224\")\n\n\ndef transform(batch):\n images = [img.convert(\"RGB\") for img in batch[\"img\"]]\n inputs = processor(images=images, return_tensors=\"pt\")\n inputs[\"labels\"] = torch.tensor(batch[\"label\"])\n return inputs\n\n\ndataset = dataset.with_transform(transform)\n\nmodel = AutoModelForImageClassification.load_from_hub(\n \"WinKawaks/vit-tiny-patch16-224\", num_labels=10, ignore_mismatched_sizes=True\n)\n\ntraining_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n num_train_epochs=1,\n per_device_train_batch_size=4,\n logging_steps=5,\n save_strategy=\"epoch\",\n no_cuda=True,\n report_to=\"none\",\n)\n\ntrainer = Trainer(model=model, args=training_args, train_dataset=dataset)\ntrainer.train()\ntrainer.save_model(\"/tmp/forge_output/checkpoint\")\nprint(\"TRAINING_COMPLETE\")\n\n```\n\nError trace:\n[simulated] Rename from_pretrained -> load_from_hub\n\nOutput unified diff (no prose, no fences):"}, {"role": "assistant", "content": "--- a/train.py\n+++ b/train.py\n@@ -10,5 +10,5 @@\n \n dataset = load_dataset(\"cifar10\", split=\"train[:200]\")\n-processor = AutoImageProcessor.load_from_hub(\"WinKawaks/vit-tiny-patch16-224\")\n+processor = AutoImageProcessor.from_pretrained(\"WinKawaks/vit-tiny-patch16-224\")\n \n \n@@ -22,5 +22,5 @@\n dataset = dataset.with_transform(transform)\n \n-model = AutoModelForImageClassification.load_from_hub(\n+model = AutoModelForImageClassification.from_pretrained(\n \"WinKawaks/vit-tiny-patch16-224\", num_labels=10, ignore_mismatched_sizes=True\n )\n"}]}
{"role_target": "repair_agent", "task_id": "vit_cifar10", "primitive_type": "ChangeArgumentSignature", "category": "api_drift", "messages": [{"role": "system", "content": "You are the Repair Agent.\nYou see a broken HuggingFace training script, an error trace, and the current\nlibrary version snapshot. Output ONLY a unified diff that fixes the script.\n\nRules:\n1. Use canonical unified-diff format with `--- a/train.py` / `+++ b/train.py`\n headers and `@@ ... @@` hunk markers.\n2. Make the MINIMAL change that resolves the error AND preserves the original\n training intent. Do NOT add bare-except blocks, monkey-patches, or sys.exit\n calls.\n3. Do NOT add any prose, markdown fences, or thinking output \u2014 diff only.\n4. If the error is unfixable, output an empty diff.\n"}, {"role": "user", "content": "Library versions: transformers=4.40.0, datasets=2.18.0, trl=0.10.0\nTarget category hint: api_drift\n\nBroken script:\n```python\n\"\"\"Tiny ViT image classification on 200-sample CIFAR-10 subset.\"\"\"\nfrom transformers import (\n AutoImageProcessor,\n AutoModelForImageClassification,\n Trainer,\n TrainingArguments,\n)\nfrom datasets import load_dataset\nimport torch\n\ndataset = load_dataset(\"cifar10\", split=\"train[:200]\")\nprocessor = AutoImageProcessor.from_pretrained(\"WinKawaks/vit-tiny-patch16-224\")\n\n\ndef transform(batch):\n images = [img.convert(\"RGB\") for img in batch[\"img\"]]\n inputs = processor(images=images, return_tensors=\"pt\")\n inputs[\"labels\"] = torch.tensor(batch[\"label\"])\n return inputs\n\n\ndataset = dataset.with_transform(transform)\n\nmodel = AutoModelForImageClassification.from_pretrained(\n \"WinKawaks/vit-tiny-patch16-224\", num_labels=10, ignore_mismatched_sizes=True\n)\n\ntraining_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n per_device_train_batch_size=4,\n logging_steps=5,\n save_strategy=\"epoch\",\n no_cuda=True,\n report_to=\"none\",\n)\n\ntrainer = Trainer(model=model, args=training_args, train_dataset=dataset)\ntrainer.train()\ntrainer.save_model(\"/tmp/forge_output/checkpoint\")\nprint(\"TRAINING_COMPLETE\")\n\n```\n\nError trace:\n[simulated] Change args of TrainingArguments: -num_train_epochs +max_steps\n\nOutput unified diff (no prose, no fences):"}, {"role": "assistant", "content": "--- a/train.py\n+++ b/train.py\n@@ -28,4 +28,5 @@\n training_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n+ num_train_epochs=1,\n per_device_train_batch_size=4,\n logging_steps=5,\n"}]}
{"role_target": "repair_agent", "task_id": "vit_cifar10", "primitive_type": "ModifyConfigField", "category": "config_drift", "messages": [{"role": "system", "content": "You are the Repair Agent.\nYou see a broken HuggingFace training script, an error trace, and the current\nlibrary version snapshot. Output ONLY a unified diff that fixes the script.\n\nRules:\n1. Use canonical unified-diff format with `--- a/train.py` / `+++ b/train.py`\n headers and `@@ ... @@` hunk markers.\n2. Make the MINIMAL change that resolves the error AND preserves the original\n training intent. Do NOT add bare-except blocks, monkey-patches, or sys.exit\n calls.\n3. Do NOT add any prose, markdown fences, or thinking output \u2014 diff only.\n4. If the error is unfixable, output an empty diff.\n"}, {"role": "user", "content": "Library versions: transformers=4.40.0, datasets=2.18.0, trl=0.10.0\nTarget category hint: config_drift\n\nBroken script:\n```python\n\"\"\"Tiny ViT image classification on 200-sample CIFAR-10 subset.\"\"\"\nfrom transformers import (\n AutoImageProcessor,\n AutoModelForImageClassification,\n Trainer,\n TrainingArguments,\n)\nfrom datasets import load_dataset\nimport torch\n\ndataset = load_dataset(\"cifar10\", split=\"train[:200]\")\nprocessor = AutoImageProcessor.from_pretrained(\"WinKawaks/vit-tiny-patch16-224\")\n\n\ndef transform(batch):\n images = [img.convert(\"RGB\") for img in batch[\"img\"]]\n inputs = processor(images=images, return_tensors=\"pt\")\n inputs[\"labels\"] = torch.tensor(batch[\"label\"])\n return inputs\n\n\ndataset = dataset.with_transform(transform)\n\nmodel = AutoModelForImageClassification.from_pretrained(\n \"WinKawaks/vit-tiny-patch16-224\", num_labels=10, ignore_mismatched_sizes=True\n)\n\ntraining_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n num_train_epochs=1,\n per_device_train_batch_size=1,\n logging_steps=5,\n save_strategy=\"epoch\",\n no_cuda=True,\n report_to=\"none\",\n)\n\ntrainer = Trainer(model=model, args=training_args, train_dataset=dataset)\ntrainer.train()\ntrainer.save_model(\"/tmp/forge_output/checkpoint\")\nprint(\"TRAINING_COMPLETE\")\n\n```\n\nError trace:\n[simulated] Change TrainingArguments.per_device_train_batch_size\n\nOutput unified diff (no prose, no fences):"}, {"role": "assistant", "content": "--- a/train.py\n+++ b/train.py\n@@ -29,5 +29,5 @@\n output_dir=\"/tmp/forge_output/checkpoint\",\n num_train_epochs=1,\n- per_device_train_batch_size=1,\n+ per_device_train_batch_size=4,\n logging_steps=5,\n save_strategy=\"epoch\",\n"}]}
{"role_target": "repair_agent", "task_id": "vit_cifar10", "primitive_type": "RestructureDatasetSchema", "category": "dataset_drift", "messages": [{"role": "system", "content": "You are the Repair Agent.\nYou see a broken HuggingFace training script, an error trace, and the current\nlibrary version snapshot. Output ONLY a unified diff that fixes the script.\n\nRules:\n1. Use canonical unified-diff format with `--- a/train.py` / `+++ b/train.py`\n headers and `@@ ... @@` hunk markers.\n2. Make the MINIMAL change that resolves the error AND preserves the original\n training intent. Do NOT add bare-except blocks, monkey-patches, or sys.exit\n calls.\n3. Do NOT add any prose, markdown fences, or thinking output \u2014 diff only.\n4. If the error is unfixable, output an empty diff.\n"}, {"role": "user", "content": "Library versions: transformers=4.40.0, datasets=2.18.0, trl=0.10.0\nTarget category hint: dataset_drift\n\nBroken script:\n```python\n\"\"\"Tiny ViT image classification on 200-sample CIFAR-10 subset.\"\"\"\nfrom transformers import (\n AutoImageProcessor,\n AutoModelForImageClassification,\n Trainer,\n TrainingArguments,\n)\nfrom datasets import load_dataset\nimport torch\n\ndataset = load_dataset(\"cifar10\", split=\"train[:200]\")\nprocessor = AutoImageProcessor.from_pretrained(\"WinKawaks/vit-tiny-patch16-224\")\n\n\ndef transform(batch):\n images = [img.convert(\"RGB\") for img in batch[\"img\"]]\n inputs = processor(images=images, return_tensors=\"pt\")\n inputs[\"labels\"] = torch.tensor(batch[\"labels\"])\n return inputs\n\n\ndataset = dataset.with_transform(transform)\n\nmodel = AutoModelForImageClassification.from_pretrained(\n \"WinKawaks/vit-tiny-patch16-224\", num_labels=10, ignore_mismatched_sizes=True\n)\n\ntraining_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n num_train_epochs=1,\n per_device_train_batch_size=4,\n logging_steps=5,\n save_strategy=\"epoch\",\n no_cuda=True,\n report_to=\"none\",\n)\n\ntrainer = Trainer(model=model, args=training_args, train_dataset=dataset)\ntrainer.train()\ntrainer.save_model(\"/tmp/forge_output/checkpoint\")\nprint(\"TRAINING_COMPLETE\")\n\n```\n\nError trace:\n[simulated] Rename column label -> labels\n\nOutput unified diff (no prose, no fences):"}, {"role": "assistant", "content": "--- a/train.py\n+++ b/train.py\n@@ -16,5 +16,5 @@\n images = [img.convert(\"RGB\") for img in batch[\"img\"]]\n inputs = processor(images=images, return_tensors=\"pt\")\n- inputs[\"labels\"] = torch.tensor(batch[\"labels\"])\n+ inputs[\"labels\"] = torch.tensor(batch[\"label\"])\n return inputs\n \n"}]}
{"role_target": "repair_agent", "task_id": "vit_cifar10", "primitive_type": "RemoveDeprecatedMethod", "category": "api_drift", "messages": [{"role": "system", "content": "You are the Repair Agent.\nYou see a broken HuggingFace training script, an error trace, and the current\nlibrary version snapshot. Output ONLY a unified diff that fixes the script.\n\nRules:\n1. Use canonical unified-diff format with `--- a/train.py` / `+++ b/train.py`\n headers and `@@ ... @@` hunk markers.\n2. Make the MINIMAL change that resolves the error AND preserves the original\n training intent. Do NOT add bare-except blocks, monkey-patches, or sys.exit\n calls.\n3. Do NOT add any prose, markdown fences, or thinking output \u2014 diff only.\n4. If the error is unfixable, output an empty diff.\n"}, {"role": "user", "content": "Library versions: transformers=4.40.0, datasets=2.18.0, trl=0.10.0\nTarget category hint: api_drift\n\nBroken script:\n```python\n\"\"\"Tiny ViT image classification on 200-sample CIFAR-10 subset.\"\"\"\nfrom transformers import (\n AutoImageProcessor,\n AutoModelForImageClassification,\n Trainer,\n TrainingArguments,\n)\nfrom datasets import load_dataset\nimport torch\n\ndataset = load_dataset(\"cifar10\", split=\"train[:200]\")\nprocessor = AutoImageProcessor.from_pretrained(\"WinKawaks/vit-tiny-patch16-224\")\n\n\ndef transform(batch):\n images = [img.convert(\"RGB\") for img in batch[\"img\"]]\n inputs = processor(images=images, return_tensors=\"pt\")\n inputs[\"labels\"] = torch.tensor(batch[\"label\"])\n return inputs\n\n\ndataset = dataset.with_transform(transform)\n\nmodel = AutoModelForImageClassification.from_pretrained(\n \"WinKawaks/vit-tiny-patch16-224\", num_labels=10, ignore_mismatched_sizes=True\n)\n\ntraining_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n num_train_epochs=1,\n per_device_train_batch_size=4,\n logging_steps=5,\n save_strategy=\"epoch\",\n no_cuda=True,\n report_to=\"none\",\n)\n\ntrainer = Trainer(model=model, args=training_args, train_dataset=dataset)\ntrainer.train()\ntrainer.save_model_DEPRECATED(\"/tmp/forge_output/checkpoint\")\nprint(\"TRAINING_COMPLETE\")\n\n```\n\nError trace:\n[simulated] Remove Trainer.save_model\n\nOutput unified diff (no prose, no fences):"}, {"role": "assistant", "content": "--- a/train.py\n+++ b/train.py\n@@ -38,4 +38,4 @@\n trainer = Trainer(model=model, args=training_args, train_dataset=dataset)\n trainer.train()\n-trainer.save_model_DEPRECATED(\"/tmp/forge_output/checkpoint\")\n+trainer.save_model(\"/tmp/forge_output/checkpoint\")\n print(\"TRAINING_COMPLETE\")\n"}]}
{"role_target": "repair_agent", "task_id": "albert_qa", "primitive_type": "RenameApiCall", "category": "api_drift", "messages": [{"role": "system", "content": "You are the Repair Agent.\nYou see a broken HuggingFace training script, an error trace, and the current\nlibrary version snapshot. Output ONLY a unified diff that fixes the script.\n\nRules:\n1. Use canonical unified-diff format with `--- a/train.py` / `+++ b/train.py`\n headers and `@@ ... @@` hunk markers.\n2. Make the MINIMAL change that resolves the error AND preserves the original\n training intent. Do NOT add bare-except blocks, monkey-patches, or sys.exit\n calls.\n3. Do NOT add any prose, markdown fences, or thinking output \u2014 diff only.\n4. If the error is unfixable, output an empty diff.\n"}, {"role": "user", "content": "Library versions: transformers=4.40.0, datasets=2.18.0, trl=0.10.0\nTarget category hint: api_drift\n\nBroken script:\n```python\n\"\"\"ALBERT-tiny extractive QA on 100-sample SQuAD subset.\"\"\"\nfrom transformers import (\n AutoTokenizer,\n AutoModelForQuestionAnswering,\n Trainer,\n TrainingArguments,\n DefaultDataCollator,\n)\nfrom datasets import load_dataset\n\ndataset = load_dataset(\"squad\", split=\"train[:100]\")\ntokenizer = AutoTokenizer.from_pretrained(\"albert-base-v2\")\n\n\ndef prepare(examples):\n enc = tokenizer(\n examples[\"question\"],\n examples[\"context\"],\n max_length=128,\n truncation=\"only_second\",\n padding=\"max_length\",\n return_offsets_mapping=True,\n )\n start_positions, end_positions = [], []\n for i, offsets in enumerate(enc[\"offset_mapping\"]):\n answer = examples[\"answers\"][i]\n start_char = answer[\"answer_start\"][0]\n end_char = start_char + len(answer[\"text\"][0])\n\n token_start = next(\n (idx for idx, (a, b) in enumerate(offsets) if a <= start_char < b), 0\n )\n token_end = next(\n (idx for idx, (a, b) in enumerate(offsets) if a < end_char <= b), token_start\n )\n start_positions.append(token_start)\n end_positions.append(token_end)\n\n enc[\"start_positions\"] = start_positions\n enc[\"end_positions\"] = end_positions\n enc.pop(\"offset_mapping\")\n return enc\n\n\ndataset = dataset.map(prepare, batched=True, remove_columns=dataset.column_names)\n\nmodel = AutoModelForQuestionAnswering.from_pretrained(\"albert-base-v2\")\n\ntraining_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n num_train_epochs=1,\n per_device_train_batch_size=4,\n logging_steps=5,\n save_strategy=\"epoch\",\n no_cuda=True,\n report_to=\"none\",\n)\n\ntrainer = Trainer(\n model=model,\n args=training_args,\n train_dataset=dataset,\n data_collator=DefaultDataCollator(),\n)\ntrainer.start_training()\ntrainer.save_model(\"/tmp/forge_output/checkpoint\")\nprint(\"TRAINING_COMPLETE\")\n\n```\n\nError trace:\n[simulated] Rename trainer.train -> trainer.start_training\n\nOutput unified diff (no prose, no fences):"}, {"role": "assistant", "content": "--- a/train.py\n+++ b/train.py\n@@ -63,5 +63,5 @@\n data_collator=DefaultDataCollator(),\n )\n-trainer.start_training()\n+trainer.train()\n trainer.save_model(\"/tmp/forge_output/checkpoint\")\n print(\"TRAINING_COMPLETE\")\n"}]}
{"role_target": "repair_agent", "task_id": "albert_qa", "primitive_type": "RenameApiCall", "category": "api_drift", "messages": [{"role": "system", "content": "You are the Repair Agent.\nYou see a broken HuggingFace training script, an error trace, and the current\nlibrary version snapshot. Output ONLY a unified diff that fixes the script.\n\nRules:\n1. Use canonical unified-diff format with `--- a/train.py` / `+++ b/train.py`\n headers and `@@ ... @@` hunk markers.\n2. Make the MINIMAL change that resolves the error AND preserves the original\n training intent. Do NOT add bare-except blocks, monkey-patches, or sys.exit\n calls.\n3. Do NOT add any prose, markdown fences, or thinking output \u2014 diff only.\n4. If the error is unfixable, output an empty diff.\n"}, {"role": "user", "content": "Library versions: transformers=4.40.0, datasets=2.18.0, trl=0.10.0\nTarget category hint: api_drift\n\nBroken script:\n```python\n\"\"\"ALBERT-tiny extractive QA on 100-sample SQuAD subset.\"\"\"\nfrom transformers import (\n AutoTokenizer,\n AutoModelForQuestionAnswering,\n Trainer,\n TrainingArguments,\n DefaultDataCollator,\n)\nfrom datasets import load_dataset\n\ndataset = load_dataset(\"squad\", split=\"train[:100]\")\ntokenizer = AutoTokenizer.from_pretrained(\"albert-base-v2\")\n\n\ndef prepare(examples):\n enc = tokenizer(\n examples[\"question\"],\n examples[\"context\"],\n max_length=128,\n truncation=\"only_second\",\n padding=\"max_length\",\n return_offsets_mapping=True,\n )\n start_positions, end_positions = [], []\n for i, offsets in enumerate(enc[\"offset_mapping\"]):\n answer = examples[\"answers\"][i]\n start_char = answer[\"answer_start\"][0]\n end_char = start_char + len(answer[\"text\"][0])\n\n token_start = next(\n (idx for idx, (a, b) in enumerate(offsets) if a <= start_char < b), 0\n )\n token_end = next(\n (idx for idx, (a, b) in enumerate(offsets) if a < end_char <= b), token_start\n )\n start_positions.append(token_start)\n end_positions.append(token_end)\n\n enc[\"start_positions\"] = start_positions\n enc[\"end_positions\"] = end_positions\n enc.pop(\"offset_mapping\")\n return enc\n\n\ndataset = dataset.map(prepare, batched=True, remove_columns=dataset.column_names)\n\nmodel = AutoModelForQuestionAnswering.from_pretrained(\"albert-base-v2\")\n\ntraining_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n num_train_epochs=1,\n per_device_train_batch_size=4,\n logging_steps=5,\n save_strategy=\"epoch\",\n no_cuda=True,\n report_to=\"none\",\n)\n\ntrainer = Trainer(\n model=model,\n args=training_args,\n train_dataset=dataset,\n data_collator=DefaultDataCollator(),\n)\ntrainer.start_training()\ntrainer.save_model(\"/tmp/forge_output/checkpoint\")\nprint(\"TRAINING_COMPLETE\")\n\n```\n\nError trace:\n[simulated] Rename trainer.train -> trainer.start_training\n\nOutput unified diff (no prose, no fences):"}, {"role": "assistant", "content": "--- a/train.py\n+++ b/train.py\n@@ -63,5 +63,5 @@\n data_collator=DefaultDataCollator(),\n )\n-trainer.start_training()\n+trainer.train()\n trainer.save_model(\"/tmp/forge_output/checkpoint\")\n print(\"TRAINING_COMPLETE\")\n"}]}
{"role_target": "repair_agent", "task_id": "albert_qa", "primitive_type": "RenameApiCall", "category": "api_drift", "messages": [{"role": "system", "content": "You are the Repair Agent.\nYou see a broken HuggingFace training script, an error trace, and the current\nlibrary version snapshot. Output ONLY a unified diff that fixes the script.\n\nRules:\n1. Use canonical unified-diff format with `--- a/train.py` / `+++ b/train.py`\n headers and `@@ ... @@` hunk markers.\n2. Make the MINIMAL change that resolves the error AND preserves the original\n training intent. Do NOT add bare-except blocks, monkey-patches, or sys.exit\n calls.\n3. Do NOT add any prose, markdown fences, or thinking output \u2014 diff only.\n4. If the error is unfixable, output an empty diff.\n"}, {"role": "user", "content": "Library versions: transformers=4.40.0, datasets=2.18.0, trl=0.10.0\nTarget category hint: api_drift\n\nBroken script:\n```python\n\"\"\"ALBERT-tiny extractive QA on 100-sample SQuAD subset.\"\"\"\nfrom transformers import (\n AutoTokenizer,\n AutoModelForQuestionAnswering,\n Trainer,\n TrainingArguments,\n DefaultDataCollator,\n)\nfrom datasets import load_dataset\n\ndataset = load_dataset(\"squad\", split=\"train[:100]\")\ntokenizer = AutoTokenizer.from_pretrained(\"albert-base-v2\")\n\n\ndef prepare(examples):\n enc = tokenizer(\n examples[\"question\"],\n examples[\"context\"],\n max_length=128,\n truncation=\"only_second\",\n padding=\"max_length\",\n return_offsets_mapping=True,\n )\n start_positions, end_positions = [], []\n for i, offsets in enumerate(enc[\"offset_mapping\"]):\n answer = examples[\"answers\"][i]\n start_char = answer[\"answer_start\"][0]\n end_char = start_char + len(answer[\"text\"][0])\n\n token_start = next(\n (idx for idx, (a, b) in enumerate(offsets) if a <= start_char < b), 0\n )\n token_end = next(\n (idx for idx, (a, b) in enumerate(offsets) if a < end_char <= b), token_start\n )\n start_positions.append(token_start)\n end_positions.append(token_end)\n\n enc[\"start_positions\"] = start_positions\n enc[\"end_positions\"] = end_positions\n enc.pop(\"offset_mapping\")\n return enc\n\n\ndataset = dataset.map(prepare, batched=True, remove_columns=dataset.column_names)\n\nmodel = AutoModelForQuestionAnswering.from_pretrained(\"albert-base-v2\")\n\ntraining_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n num_train_epochs=1,\n per_device_train_batch_size=4,\n logging_steps=5,\n save_strategy=\"epoch\",\n no_cuda=True,\n report_to=\"none\",\n)\n\ntrainer = Trainer(\n model=model,\n args=training_args,\n train_dataset=dataset,\n data_collator=DefaultDataCollator(),\n)\ntrainer.start_training()\ntrainer.save_model(\"/tmp/forge_output/checkpoint\")\nprint(\"TRAINING_COMPLETE\")\n\n```\n\nError trace:\n[simulated] Rename trainer.train -> trainer.start_training\n\nOutput unified diff (no prose, no fences):"}, {"role": "assistant", "content": "--- a/train.py\n+++ b/train.py\n@@ -63,5 +63,5 @@\n data_collator=DefaultDataCollator(),\n )\n-trainer.start_training()\n+trainer.train()\n trainer.save_model(\"/tmp/forge_output/checkpoint\")\n print(\"TRAINING_COMPLETE\")\n"}]}
{"role_target": "repair_agent", "task_id": "albert_qa", "primitive_type": "RenameApiCall", "category": "api_drift", "messages": [{"role": "system", "content": "You are the Repair Agent.\nYou see a broken HuggingFace training script, an error trace, and the current\nlibrary version snapshot. Output ONLY a unified diff that fixes the script.\n\nRules:\n1. Use canonical unified-diff format with `--- a/train.py` / `+++ b/train.py`\n headers and `@@ ... @@` hunk markers.\n2. Make the MINIMAL change that resolves the error AND preserves the original\n training intent. Do NOT add bare-except blocks, monkey-patches, or sys.exit\n calls.\n3. Do NOT add any prose, markdown fences, or thinking output \u2014 diff only.\n4. If the error is unfixable, output an empty diff.\n"}, {"role": "user", "content": "Library versions: transformers=4.40.0, datasets=2.18.0, trl=0.10.0\nTarget category hint: api_drift\n\nBroken script:\n```python\n\"\"\"ALBERT-tiny extractive QA on 100-sample SQuAD subset.\"\"\"\nfrom transformers import (\n AutoTokenizer,\n AutoModelForQuestionAnswering,\n Trainer,\n TrainingArguments,\n DefaultDataCollator,\n)\nfrom datasets import load_dataset\n\ndataset = load_dataset(\"squad\", split=\"train[:100]\")\ntokenizer = AutoTokenizer.from_pretrained(\"albert-base-v2\")\n\n\ndef prepare(examples):\n enc = tokenizer(\n examples[\"question\"],\n examples[\"context\"],\n max_length=128,\n truncation=\"only_second\",\n padding=\"max_length\",\n return_offsets_mapping=True,\n )\n start_positions, end_positions = [], []\n for i, offsets in enumerate(enc[\"offset_mapping\"]):\n answer = examples[\"answers\"][i]\n start_char = answer[\"answer_start\"][0]\n end_char = start_char + len(answer[\"text\"][0])\n\n token_start = next(\n (idx for idx, (a, b) in enumerate(offsets) if a <= start_char < b), 0\n )\n token_end = next(\n (idx for idx, (a, b) in enumerate(offsets) if a < end_char <= b), token_start\n )\n start_positions.append(token_start)\n end_positions.append(token_end)\n\n enc[\"start_positions\"] = start_positions\n enc[\"end_positions\"] = end_positions\n enc.pop(\"offset_mapping\")\n return enc\n\n\ndataset = dataset.map(prepare, batched=True, remove_columns=dataset.column_names)\n\nmodel = AutoModelForQuestionAnswering.from_pretrained(\"albert-base-v2\")\n\ntraining_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n num_train_epochs=1,\n per_device_train_batch_size=4,\n logging_steps=5,\n save_strategy=\"epoch\",\n no_cuda=True,\n report_to=\"none\",\n)\n\ntrainer = Trainer(\n model=model,\n args=training_args,\n train_dataset=dataset,\n data_collator=DefaultDataCollator(),\n)\ntrainer.start_training()\ntrainer.save_model(\"/tmp/forge_output/checkpoint\")\nprint(\"TRAINING_COMPLETE\")\n\n```\n\nError trace:\n[simulated] Rename trainer.train -> trainer.start_training\n\nOutput unified diff (no prose, no fences):"}, {"role": "assistant", "content": "--- a/train.py\n+++ b/train.py\n@@ -63,5 +63,5 @@\n data_collator=DefaultDataCollator(),\n )\n-trainer.start_training()\n+trainer.train()\n trainer.save_model(\"/tmp/forge_output/checkpoint\")\n print(\"TRAINING_COMPLETE\")\n"}]}
{"role_target": "repair_agent", "task_id": "albert_qa", "primitive_type": "RenameApiCall", "category": "api_drift", "messages": [{"role": "system", "content": "You are the Repair Agent.\nYou see a broken HuggingFace training script, an error trace, and the current\nlibrary version snapshot. Output ONLY a unified diff that fixes the script.\n\nRules:\n1. Use canonical unified-diff format with `--- a/train.py` / `+++ b/train.py`\n headers and `@@ ... @@` hunk markers.\n2. Make the MINIMAL change that resolves the error AND preserves the original\n training intent. Do NOT add bare-except blocks, monkey-patches, or sys.exit\n calls.\n3. Do NOT add any prose, markdown fences, or thinking output \u2014 diff only.\n4. If the error is unfixable, output an empty diff.\n"}, {"role": "user", "content": "Library versions: transformers=4.40.0, datasets=2.18.0, trl=0.10.0\nTarget category hint: api_drift\n\nBroken script:\n```python\n\"\"\"ALBERT-tiny extractive QA on 100-sample SQuAD subset.\"\"\"\nfrom transformers import (\n AutoTokenizer,\n AutoModelForQuestionAnswering,\n Trainer,\n TrainingArguments,\n DefaultDataCollator,\n)\nfrom datasets import load_dataset\n\ndataset = load_dataset(\"squad\", split=\"train[:100]\")\ntokenizer = AutoTokenizer.from_pretrained(\"albert-base-v2\")\n\n\ndef prepare(examples):\n enc = tokenizer(\n examples[\"question\"],\n examples[\"context\"],\n max_length=128,\n truncation=\"only_second\",\n padding=\"max_length\",\n return_offsets_mapping=True,\n )\n start_positions, end_positions = [], []\n for i, offsets in enumerate(enc[\"offset_mapping\"]):\n answer = examples[\"answers\"][i]\n start_char = answer[\"answer_start\"][0]\n end_char = start_char + len(answer[\"text\"][0])\n\n token_start = next(\n (idx for idx, (a, b) in enumerate(offsets) if a <= start_char < b), 0\n )\n token_end = next(\n (idx for idx, (a, b) in enumerate(offsets) if a < end_char <= b), token_start\n )\n start_positions.append(token_start)\n end_positions.append(token_end)\n\n enc[\"start_positions\"] = start_positions\n enc[\"end_positions\"] = end_positions\n enc.pop(\"offset_mapping\")\n return enc\n\n\ndataset = dataset.map(prepare, batched=True, remove_columns=dataset.column_names)\n\nmodel = AutoModelForQuestionAnswering.from_pretrained(\"albert-base-v2\")\n\ntraining_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n num_train_epochs=1,\n per_device_train_batch_size=4,\n logging_steps=5,\n save_strategy=\"epoch\",\n no_cuda=True,\n report_to=\"none\",\n)\n\ntrainer = Trainer(\n model=model,\n args=training_args,\n train_dataset=dataset,\n data_collator=DefaultDataCollator(),\n)\ntrainer.start_training()\ntrainer.save_model(\"/tmp/forge_output/checkpoint\")\nprint(\"TRAINING_COMPLETE\")\n\n```\n\nError trace:\n[simulated] Rename trainer.train -> trainer.start_training\n\nOutput unified diff (no prose, no fences):"}, {"role": "assistant", "content": "--- a/train.py\n+++ b/train.py\n@@ -63,5 +63,5 @@\n data_collator=DefaultDataCollator(),\n )\n-trainer.start_training()\n+trainer.train()\n trainer.save_model(\"/tmp/forge_output/checkpoint\")\n print(\"TRAINING_COMPLETE\")\n"}]}
{"role_target": "repair_agent", "task_id": "albert_qa", "primitive_type": "RenameApiCall", "category": "api_drift", "messages": [{"role": "system", "content": "You are the Repair Agent.\nYou see a broken HuggingFace training script, an error trace, and the current\nlibrary version snapshot. Output ONLY a unified diff that fixes the script.\n\nRules:\n1. Use canonical unified-diff format with `--- a/train.py` / `+++ b/train.py`\n headers and `@@ ... @@` hunk markers.\n2. Make the MINIMAL change that resolves the error AND preserves the original\n training intent. Do NOT add bare-except blocks, monkey-patches, or sys.exit\n calls.\n3. Do NOT add any prose, markdown fences, or thinking output \u2014 diff only.\n4. If the error is unfixable, output an empty diff.\n"}, {"role": "user", "content": "Library versions: transformers=4.40.0, datasets=2.18.0, trl=0.10.0\nTarget category hint: api_drift\n\nBroken script:\n```python\n\"\"\"ALBERT-tiny extractive QA on 100-sample SQuAD subset.\"\"\"\nfrom transformers import (\n AutoTokenizer,\n AutoModelForQuestionAnswering,\n Trainer,\n TrainingArguments,\n DefaultDataCollator,\n)\nfrom datasets import load_dataset\n\ndataset = load_dataset(\"squad\", split=\"train[:100]\")\ntokenizer = AutoTokenizer.from_pretrained(\"albert-base-v2\")\n\n\ndef prepare(examples):\n enc = tokenizer(\n examples[\"question\"],\n examples[\"context\"],\n max_length=128,\n truncation=\"only_second\",\n padding=\"max_length\",\n return_offsets_mapping=True,\n )\n start_positions, end_positions = [], []\n for i, offsets in enumerate(enc[\"offset_mapping\"]):\n answer = examples[\"answers\"][i]\n start_char = answer[\"answer_start\"][0]\n end_char = start_char + len(answer[\"text\"][0])\n\n token_start = next(\n (idx for idx, (a, b) in enumerate(offsets) if a <= start_char < b), 0\n )\n token_end = next(\n (idx for idx, (a, b) in enumerate(offsets) if a < end_char <= b), token_start\n )\n start_positions.append(token_start)\n end_positions.append(token_end)\n\n enc[\"start_positions\"] = start_positions\n enc[\"end_positions\"] = end_positions\n enc.pop(\"offset_mapping\")\n return enc\n\n\ndataset = dataset.map(prepare, batched=True, remove_columns=dataset.column_names)\n\nmodel = AutoModelForQuestionAnswering.from_pretrained(\"albert-base-v2\")\n\ntraining_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n num_train_epochs=1,\n per_device_train_batch_size=4,\n logging_steps=5,\n save_strategy=\"epoch\",\n no_cuda=True,\n report_to=\"none\",\n)\n\ntrainer = Trainer(\n model=model,\n args=training_args,\n train_dataset=dataset,\n data_collator=DefaultDataCollator(),\n)\ntrainer.start_training()\ntrainer.save_model(\"/tmp/forge_output/checkpoint\")\nprint(\"TRAINING_COMPLETE\")\n\n```\n\nError trace:\n[simulated] Rename trainer.train -> trainer.start_training\n\nOutput unified diff (no prose, no fences):"}, {"role": "assistant", "content": "--- a/train.py\n+++ b/train.py\n@@ -63,5 +63,5 @@\n data_collator=DefaultDataCollator(),\n )\n-trainer.start_training()\n+trainer.train()\n trainer.save_model(\"/tmp/forge_output/checkpoint\")\n print(\"TRAINING_COMPLETE\")\n"}]}
{"role_target": "repair_agent", "task_id": "albert_qa", "primitive_type": "RenameApiCall", "category": "api_drift", "messages": [{"role": "system", "content": "You are the Repair Agent.\nYou see a broken HuggingFace training script, an error trace, and the current\nlibrary version snapshot. Output ONLY a unified diff that fixes the script.\n\nRules:\n1. Use canonical unified-diff format with `--- a/train.py` / `+++ b/train.py`\n headers and `@@ ... @@` hunk markers.\n2. Make the MINIMAL change that resolves the error AND preserves the original\n training intent. Do NOT add bare-except blocks, monkey-patches, or sys.exit\n calls.\n3. Do NOT add any prose, markdown fences, or thinking output \u2014 diff only.\n4. If the error is unfixable, output an empty diff.\n"}, {"role": "user", "content": "Library versions: transformers=4.40.0, datasets=2.18.0, trl=0.10.0\nTarget category hint: api_drift\n\nBroken script:\n```python\n\"\"\"ALBERT-tiny extractive QA on 100-sample SQuAD subset.\"\"\"\nfrom transformers import (\n AutoTokenizer,\n AutoModelForQuestionAnswering,\n Trainer,\n TrainingArguments,\n DefaultDataCollator,\n)\nfrom datasets import load_dataset\n\ndataset = load_dataset(\"squad\", split=\"train[:100]\")\ntokenizer = AutoTokenizer.from_pretrained(\"albert-base-v2\")\n\n\ndef prepare(examples):\n enc = tokenizer(\n examples[\"question\"],\n examples[\"context\"],\n max_length=128,\n truncation=\"only_second\",\n padding=\"max_length\",\n return_offsets_mapping=True,\n )\n start_positions, end_positions = [], []\n for i, offsets in enumerate(enc[\"offset_mapping\"]):\n answer = examples[\"answers\"][i]\n start_char = answer[\"answer_start\"][0]\n end_char = start_char + len(answer[\"text\"][0])\n\n token_start = next(\n (idx for idx, (a, b) in enumerate(offsets) if a <= start_char < b), 0\n )\n token_end = next(\n (idx for idx, (a, b) in enumerate(offsets) if a < end_char <= b), token_start\n )\n start_positions.append(token_start)\n end_positions.append(token_end)\n\n enc[\"start_positions\"] = start_positions\n enc[\"end_positions\"] = end_positions\n enc.pop(\"offset_mapping\")\n return enc\n\n\ndataset = dataset.map(prepare, batched=True, remove_columns=dataset.column_names)\n\nmodel = AutoModelForQuestionAnswering.from_pretrained(\"albert-base-v2\")\n\ntraining_args = TrainingArguments(\n output_dir=\"/tmp/forge_output/checkpoint\",\n num_train_epochs=1,\n per_device_train_batch_size=4,\n logging_steps=5,\n save_strategy=\"epoch\",\n no_cuda=True,\n report_to=\"none\",\n)\n\ntrainer = Trainer(\n model=model,\n args=training_args,\n train_dataset=dataset,\n data_collator=DefaultDataCollator(),\n)\ntrainer.start_training()\ntrainer.save_model(\"/tmp/forge_output/checkpoint\")\nprint(\"TRAINING_COMPLETE\")\n\n```\n\nError trace:\n[simulated] Rename trainer.train -> trainer.start_training\n\nOutput unified diff (no prose, no fences):"}, {"role": "assistant", "content": "--- a/train.py\n+++ b/train.py\n@@ -63,5 +63,5 @@\n data_collator=DefaultDataCollator(),\n )\n-trainer.start_training()\n+trainer.train()\n trainer.save_model(\"/tmp/forge_output/checkpoint\")\n print(\"TRAINING_COMPLETE\")\n"}]}