forgeenv-source / forgeenv /primitives /drift_taxonomy.yaml
akhiilll's picture
forgeenv source snapshot for training job
a15535e verified
# Drift taxonomy: real HuggingFace/PyTorch breakages observed across version bumps.
# Used to seed the Drift Generator's initial proposal distribution and to anchor
# warm-start pair generation in things that actually happened in the wild.
# Trainer.evaluate return-shape drift: access moves from `trainer.evaluate()`
# to `trainer.evaluate().metrics`.
- version_range: "transformers 4.36 -> 4.45"
  affected_api: "Trainer.evaluate"
  description: "Trainer.evaluate() return type changed shape; metrics now nested under .metrics"
  breakage_primitive: "ChangeReturnType"
  params:
    function_name: "evaluate"
    old_access: "trainer.evaluate()"
    new_access: "trainer.evaluate().metrics"
  repair_primitive: "RestoreReturnAccess"
  category: "api_drift"
# TrainingArguments keyword rename: evaluation_strategy -> eval_strategy.
- version_range: "transformers 4.30 -> 4.40"
  affected_api: "TrainingArguments.evaluation_strategy"
  description: "Renamed evaluation_strategy -> eval_strategy"
  breakage_primitive: "RenameApiCall"
  params:
    old_name: "evaluation_strategy"
    new_name: "eval_strategy"
  repair_primitive: "RestoreApiCall"
  category: "api_drift"
# load_dataset schema drift: column `label` becomes `labels`.
- version_range: "datasets 2.14 -> 3.0"
  affected_api: "load_dataset"
  description: "Default split column was renamed in some GLUE configs"
  breakage_primitive: "RestructureDatasetSchema"
  params:
    old_column: "label"
    new_column: "labels"
  repair_primitive: "RestoreColumn"
  category: "dataset_drift"
# Trainer.predict removal; `evaluate` is recorded as the replacement method.
- version_range: "transformers 4.40 -> 4.50"
  affected_api: "Trainer.predict"
  description: "Method removed; users should use evaluate() with prediction_loss_only=False"
  breakage_primitive: "RemoveDeprecatedMethod"
  params:
    class_name: "Trainer"
    method_name: "predict"
    replacement: "evaluate"
  repair_primitive: "RestoreMethod"
  category: "api_drift"
# num_train_epochs config drift on TrainingArguments; new_value is stored as
# the string "0", not as an integer.
- version_range: "transformers 4.36 -> 4.40"
  affected_api: "TrainingArguments"
  description: "num_train_epochs default behavior changed; max_steps now preferred"
  breakage_primitive: "ModifyConfigField"
  params:
    config_class: "TrainingArguments"
    field_name: "num_train_epochs"
    new_value: "0"
  repair_primitive: "RestoreConfigField"
  category: "config_drift"
# Tokenizer padding drift: the `padding` kwarg goes from True to "max_length".
# The inner double quotes are part of new_value itself.
- version_range: "transformers 4.34 -> 4.42"
  affected_api: "Tokenizer.__call__"
  description: "padding=True semantics changed; users should pass padding='max_length'"
  breakage_primitive: "ChangeTokenizerBehavior"
  params:
    old_kwarg: "padding"
    old_value: "True"
    new_kwarg: "padding"
    new_value: '"max_length"'
  repair_primitive: "RestoreTokenizerKwarg"
  category: "tokenizer_drift"
# Import-path drift: transformers.training_args -> transformers.training_args_pt.
# Both module values include the leading `from ` keyword.
- version_range: "transformers 4.20 -> 4.30"
  affected_api: "imports"
  description: "transformers.training_args moved to transformers.training_args_pt"
  breakage_primitive: "DeprecateImport"
  params:
    old_module: "from transformers.training_args"
    new_module: "from transformers.training_args_pt"
  repair_primitive: "RestoreImport"
  category: "import_drift"
# save_pretrained signature drift. removed_arg and added_arg both name
# safe_serialization; added_value is the string "True".
- version_range: "transformers 4.45 -> 4.50"
  affected_api: "save_pretrained"
  description: "save_pretrained() now requires safe_serialization to default True"
  breakage_primitive: "ChangeArgumentSignature"
  params:
    function_name: "save_pretrained"
    removed_arg: "safe_serialization"
    added_arg: "safe_serialization"
    added_value: "True"
  repair_primitive: "RestoreArgument"
  category: "api_drift"
# Dataset.set_format signature drift around the `columns` argument.
# added_value is a list literal kept as a single string, not a YAML sequence.
- version_range: "datasets 2.18 -> 3.0"
  affected_api: "Dataset.set_format"
  description: "set_format(type='torch') signature stricter, columns required"
  breakage_primitive: "ChangeArgumentSignature"
  params:
    function_name: "set_format"
    removed_arg: "columns"
    added_arg: "columns"
    added_value: '["input_ids", "attention_mask", "labels"]'
  repair_primitive: "RestoreArgument"
  category: "api_drift"
# Tokenizer max_length drift (512 -> 256 per the description), expressed as a
# ModifyConfigField on config_class "tokenizer" with new_value "256".
- version_range: "transformers 4.36 -> 4.45"
  affected_api: "Tokenizer.__call__"
  description: "max_length default reduced from 512 -> 256 for some tokenizers"
  breakage_primitive: "ModifyConfigField"
  params:
    config_class: "tokenizer"
    field_name: "max_length"
    new_value: "256"
  repair_primitive: "RestoreConfigField"
  category: "tokenizer_drift"
# DataCollator constructor keyword rename: tokenizer -> processing_class.
- version_range: "transformers 4.40 -> 4.45"
  affected_api: "DataCollatorWithPadding"
  description: "Renamed `tokenizer` -> `processing_class` in DataCollator constructors"
  breakage_primitive: "RenameApiCall"
  params:
    old_name: "tokenizer"
    new_name: "processing_class"
  repair_primitive: "RestoreApiCall"
  category: "api_drift"
# load_dataset schema drift: column `sentence` becomes `text`.
# NOTE(review): the description talks about split renames / train[:500]
# slicing, while params rename a column — confirm both describe the same drift.
- version_range: "datasets 2.14 -> 2.18"
  affected_api: "load_dataset"
  description: "Some splits renamed train[:500] semantics changed"
  breakage_primitive: "RestructureDatasetSchema"
  params:
    old_column: "sentence"
    new_column: "text"
  repair_primitive: "RestoreColumn"
  category: "dataset_drift"
# Trainer method removal: `evaluate` is removed, with `evaluate_legacy`
# recorded as its replacement. The description is worded to match the params
# below (method removal), not the evaluation_strategy keyword rename.
- version_range: "transformers 4.45 -> 4.50"
  affected_api: "Trainer"
  description: "Trainer.evaluate() was deprecated and removed; evaluate_legacy() is the replacement"
  breakage_primitive: "RemoveDeprecatedMethod"
  params:
    class_name: "Trainer"
    method_name: "evaluate"
    replacement: "evaluate_legacy"
  repair_primitive: "RestoreMethod"
  category: "api_drift"
# from_pretrained signature drift around torch_dtype. added_value carries the
# quoted literal "auto" (inner double quotes are part of the value).
- version_range: "transformers 4.30 -> 4.40"
  affected_api: "PreTrainedModel.from_pretrained"
  description: "torch_dtype now required for some quantized model paths"
  breakage_primitive: "ChangeArgumentSignature"
  params:
    function_name: "from_pretrained"
    removed_arg: "torch_dtype"
    added_arg: "torch_dtype"
    added_value: '"auto"'
  repair_primitive: "RestoreArgument"
  category: "api_drift"
# Dataset.rename_column drift, modelled as a column rename labels -> label.
- version_range: "datasets 3.0 -> 3.2"
  affected_api: "Dataset.rename_column"
  description: "rename_column raises if target name exists"
  breakage_primitive: "RestructureDatasetSchema"
  params:
    old_column: "labels"
    new_column: "label"
  repair_primitive: "RestoreColumn"
  category: "dataset_drift"
# TrainingArguments.report_to config drift; new_value carries the quoted
# literal "all" (inner double quotes are part of the value).
# NOTE(review): the description says the default moved 'all' -> 'none', yet
# new_value is "all" — confirm which direction the breakage should apply.
- version_range: "transformers 4.36 -> 4.42"
  affected_api: "TrainingArguments.report_to"
  description: "Default report_to changed from 'all' to 'none'"
  breakage_primitive: "ModifyConfigField"
  params:
    config_class: "TrainingArguments"
    field_name: "report_to"
    new_value: '"all"'
  repair_primitive: "RestoreConfigField"
  category: "config_drift"
# Import-path drift: transformers.deepspeed -> accelerate.utils.deepspeed.
# Both module values include the leading `from ` keyword.
- version_range: "transformers 4.40 -> 4.50"
  affected_api: "imports"
  description: "transformers.deepspeed moved to accelerate.utils.deepspeed"
  breakage_primitive: "DeprecateImport"
  params:
    old_module: "from transformers.deepspeed"
    new_module: "from accelerate.utils.deepspeed"
  repair_primitive: "RestoreImport"
  category: "import_drift"
# Tokenizer call return drift: access moves from `tokenizer(text)` to
# `tokenizer(text).encodings`.
- version_range: "transformers 4.45 -> 4.50"
  affected_api: "Tokenizer return"
  description: "Tokenizer call output now returns a BatchEncoding with .encodings attribute"
  breakage_primitive: "ChangeReturnType"
  params:
    function_name: "tokenizer"
    old_access: "tokenizer(text)"
    new_access: "tokenizer(text).encodings"
  repair_primitive: "RestoreReturnAccess"
  category: "api_drift"
# Method rename: save_pretrained -> save_pretrained_directory.
- version_range: "transformers 4.30 -> 4.40"
  affected_api: "save_pretrained"
  description: "save_pretrained -> save_pretrained_directory rename in some classes"
  breakage_primitive: "RenameApiCall"
  params:
    old_name: "save_pretrained"
    new_name: "save_pretrained_directory"
  repair_primitive: "RestoreApiCall"
  category: "api_drift"
# TrainingArguments keyword rename: no_cuda -> use_cpu. Both flags share the
# same polarity (True selects CPU), so a repair must not flip the boolean.
# NOTE(review): the other RenameApiCall entries in this file use category
# "api_drift"; this one uses "config_drift" — confirm that is intentional.
- version_range: "transformers 4.45 -> 4.50"
  affected_api: "TrainingArguments.no_cuda"
  description: "no_cuda renamed to use_cpu (same polarity: True selects CPU)"
  breakage_primitive: "RenameApiCall"
  params:
    old_name: "no_cuda"
    new_name: "use_cpu"
  repair_primitive: "RestoreApiCall"
  category: "config_drift"