# Drift taxonomy: real HuggingFace/PyTorch breakages observed across version bumps.
# Used to seed the Drift Generator's initial proposal distribution and to anchor
# warm-start pair generation in things that actually happened in the wild.
#
# Each entry is a mapping with these keys:
#   version_range      - library name and "old -> new" version span
#   affected_api       - the API surface the drift touches
#   description        - human-readable summary of the change
#   breakage_primitive - name of the primitive that injects the drift
#   params             - arguments for the breakage primitive (shape varies per primitive)
#   repair_primitive   - name of the primitive that undoes the drift
#   category           - one of: api_drift, dataset_drift, config_drift,
#                        tokenizer_drift, import_drift
---
- version_range: "transformers 4.36 -> 4.45"
  affected_api: "Trainer.evaluate"
  description: "Trainer.evaluate() return type changed shape; metrics now nested under .metrics"
  breakage_primitive: "ChangeReturnType"
  params:
    function_name: "evaluate"
    old_access: "trainer.evaluate()"
    new_access: "trainer.evaluate().metrics"
  repair_primitive: "RestoreReturnAccess"
  category: "api_drift"

- version_range: "transformers 4.30 -> 4.40"
  affected_api: "TrainingArguments.evaluation_strategy"
  description: "Renamed evaluation_strategy -> eval_strategy"
  breakage_primitive: "RenameApiCall"
  params:
    old_name: "evaluation_strategy"
    new_name: "eval_strategy"
  repair_primitive: "RestoreApiCall"
  category: "api_drift"

- version_range: "datasets 2.14 -> 3.0"
  affected_api: "load_dataset"
  description: "Default split column was renamed in some GLUE configs"
  breakage_primitive: "RestructureDatasetSchema"
  params:
    old_column: "label"
    new_column: "labels"
  repair_primitive: "RestoreColumn"
  category: "dataset_drift"

- version_range: "transformers 4.40 -> 4.50"
  affected_api: "Trainer.predict"
  description: "Method removed; users should use evaluate() with prediction_loss_only=False"
  breakage_primitive: "RemoveDeprecatedMethod"
  params:
    class_name: "Trainer"
    method_name: "predict"
    replacement: "evaluate"
  repair_primitive: "RestoreMethod"
  category: "api_drift"

- version_range: "transformers 4.36 -> 4.40"
  affected_api: "TrainingArguments"
  description: "num_train_epochs default behavior changed; max_steps now preferred"
  breakage_primitive: "ModifyConfigField"
  params:
    config_class: "TrainingArguments"
    field_name: "num_train_epochs"
    # Kept as a quoted string, as in every other *_value field in this file.
    new_value: "0"
  repair_primitive: "RestoreConfigField"
  category: "config_drift"

- version_range: "transformers 4.34 -> 4.42"
  affected_api: "Tokenizer.__call__"
  description: "padding=True semantics changed; users should pass padding='max_length'"
  breakage_primitive: "ChangeTokenizerBehavior"
  params:
    old_kwarg: "padding"
    old_value: "True"
    new_kwarg: "padding"
    # Inner double quotes are part of the value (the string is '"max_length"').
    new_value: '"max_length"'
  repair_primitive: "RestoreTokenizerKwarg"
  category: "tokenizer_drift"

- version_range: "transformers 4.20 -> 4.30"
  affected_api: "imports"
  description: "transformers.training_args moved to transformers.training_args_pt"
  breakage_primitive: "DeprecateImport"
  params:
    old_module: "from transformers.training_args"
    new_module: "from transformers.training_args_pt"
  repair_primitive: "RestoreImport"
  category: "import_drift"

- version_range: "transformers 4.45 -> 4.50"
  affected_api: "save_pretrained"
  description: "save_pretrained() now requires safe_serialization to default True"
  breakage_primitive: "ChangeArgumentSignature"
  params:
    function_name: "save_pretrained"
    removed_arg: "safe_serialization"
    added_arg: "safe_serialization"
    added_value: "True"
  repair_primitive: "RestoreArgument"
  category: "api_drift"

- version_range: "datasets 2.18 -> 3.0"
  affected_api: "Dataset.set_format"
  description: "set_format(type='torch') signature stricter, columns required"
  breakage_primitive: "ChangeArgumentSignature"
  params:
    function_name: "set_format"
    removed_arg: "columns"
    added_arg: "columns"
    # Single-quoted so the bracketed list stays a string, not a YAML sequence.
    added_value: '["input_ids", "attention_mask", "labels"]'
  repair_primitive: "RestoreArgument"
  category: "api_drift"

- version_range: "transformers 4.36 -> 4.45"
  affected_api: "Tokenizer.__call__"
  description: "max_length default reduced from 512 -> 256 for some tokenizers"
  breakage_primitive: "ModifyConfigField"
  params:
    config_class: "tokenizer"
    field_name: "max_length"
    new_value: "256"
  repair_primitive: "RestoreConfigField"
  category: "tokenizer_drift"

- version_range: "transformers 4.40 -> 4.45"
  affected_api: "DataCollatorWithPadding"
  description: "Renamed `tokenizer` -> `processing_class` in DataCollator constructors"
  breakage_primitive: "RenameApiCall"
  params:
    old_name: "tokenizer"
    new_name: "processing_class"
  repair_primitive: "RestoreApiCall"
  category: "api_drift"

# NOTE(review): the description talks about split/slice semantics, but the
# params describe a column rename (sentence -> text) - confirm which is intended.
- version_range: "datasets 2.14 -> 2.18"
  affected_api: "load_dataset"
  description: "Some splits renamed train[:500] semantics changed"
  breakage_primitive: "RestructureDatasetSchema"
  params:
    old_column: "sentence"
    new_column: "text"
  repair_primitive: "RestoreColumn"
  category: "dataset_drift"

# Description previously referred to evaluation_strategy (a TrainingArguments
# field); it now matches the params, which remove Trainer.evaluate.
- version_range: "transformers 4.45 -> 4.50"
  affected_api: "Trainer"
  description: "Trainer.evaluate() was deprecated and removed; users should call evaluate_legacy()"
  breakage_primitive: "RemoveDeprecatedMethod"
  params:
    class_name: "Trainer"
    method_name: "evaluate"
    replacement: "evaluate_legacy"
  repair_primitive: "RestoreMethod"
  category: "api_drift"

- version_range: "transformers 4.30 -> 4.40"
  affected_api: "PreTrainedModel.from_pretrained"
  description: "torch_dtype now required for some quantized model paths"
  breakage_primitive: "ChangeArgumentSignature"
  params:
    function_name: "from_pretrained"
    removed_arg: "torch_dtype"
    added_arg: "torch_dtype"
    added_value: '"auto"'
  repair_primitive: "RestoreArgument"
  category: "api_drift"

- version_range: "datasets 3.0 -> 3.2"
  affected_api: "Dataset.rename_column"
  description: "rename_column raises if target name exists"
  breakage_primitive: "RestructureDatasetSchema"
  params:
    old_column: "labels"
    new_column: "label"
  repair_primitive: "RestoreColumn"
  category: "dataset_drift"

- version_range: "transformers 4.36 -> 4.42"
  affected_api: "TrainingArguments.report_to"
  description: "Default report_to changed from 'all' to 'none'"
  breakage_primitive: "ModifyConfigField"
  params:
    config_class: "TrainingArguments"
    field_name: "report_to"
    # Post-drift value, per the description ('all' -> 'none') and the
    # convention used by the max_length entry (512 -> 256, new_value "256").
    new_value: '"none"'
  repair_primitive: "RestoreConfigField"
  category: "config_drift"

- version_range: "transformers 4.40 -> 4.50"
  affected_api: "imports"
  description: "transformers.deepspeed moved to accelerate.utils.deepspeed"
  breakage_primitive: "DeprecateImport"
  params:
    old_module: "from transformers.deepspeed"
    new_module: "from accelerate.utils.deepspeed"
  repair_primitive: "RestoreImport"
  category: "import_drift"

- version_range: "transformers 4.45 -> 4.50"
  affected_api: "Tokenizer return"
  description: "Tokenizer call output now returns a BatchEncoding with .encodings attribute"
  breakage_primitive: "ChangeReturnType"
  params:
    function_name: "tokenizer"
    old_access: "tokenizer(text)"
    new_access: "tokenizer(text).encodings"
  repair_primitive: "RestoreReturnAccess"
  category: "api_drift"

- version_range: "transformers 4.30 -> 4.40"
  affected_api: "save_pretrained"
  description: "save_pretrained -> save_pretrained_directory rename in some classes"
  breakage_primitive: "RenameApiCall"
  params:
    old_name: "save_pretrained"
    new_name: "save_pretrained_directory"
  repair_primitive: "RestoreApiCall"
  category: "api_drift"

# NOTE(review): "(logic inverted)" looks inaccurate - upstream, no_cuda=True and
# use_cpu=True both appear to select CPU. Confirm against the transformers docs.
- version_range: "transformers 4.45 -> 4.50"
  affected_api: "TrainingArguments.no_cuda"
  description: "no_cuda renamed to use_cpu (logic inverted)"
  breakage_primitive: "RenameApiCall"
  params:
    old_name: "no_cuda"
    new_name: "use_cpu"
  repair_primitive: "RestoreApiCall"
  category: "config_drift"