""" LLaMA Factory training entry-point wrapper for HYV3. This script: 1. Registers the hy_v3 chat template 2. Applies all HYV3 monkey-patches (checkpoint key rename, dtype fix, etc.) 3. Injects HYV3PatchCallback into the training loop 4. Calls run_exp() to start LLaMA Factory training How it works: - train_lf.sh launches this script via torchrun directly: torchrun ... train_hy_v3.py hy_v3_full_sft.yaml - Each torchrun worker executes this script, so all patches are applied in every worker process before training begins. - We call run_exp() directly (not the CLI launcher) to avoid the launcher re-spawning workers and losing our patches. Usage: # Via launch script (recommended): bash train_lf.sh # Direct single-node (8 GPUs): torchrun --nproc_per_node 8 train_hy_v3.py hy_v3_full_sft.yaml """ import sys import os # Add current directory to path so patches can be imported sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) # Step 1: Register HYV3 template (must be before training starts) import hy_v3_template # noqa: F401 # Step 2: Apply checkpoint key rename patch (must be before model loading) import hy_v3_patches # noqa: F401 # Step 3: Inject HYV3PatchCallback into LLaMA Factory's training flow from llamafactory.train.sft.workflow import run_sft as _orig_run_sft def _patched_run_sft(model_args, data_args, training_args, finetuning_args, generating_args, callbacks=None): """Wrap run_sft to inject HYV3PatchCallback.""" if callbacks is None: callbacks = [] # Determine tokenizer directory for the save callback tokenizer_dir = getattr(model_args, "model_name_or_path", None) callbacks.append(hy_v3_patches.HYV3PatchCallback(tokenizer_dir=tokenizer_dir)) return _orig_run_sft(model_args, data_args, training_args, finetuning_args, generating_args, callbacks=callbacks) # Monkey-patch the SFT workflow import llamafactory.train.sft.workflow as _sft_wf _sft_wf.run_sft = _patched_run_sft def main(): """Entry point: called by torchrun in each worker process. Since train_lf.sh launches us via torchrun directly, all patches (template registration, checkpoint key rename, SFT callback injection) are already applied in this process. We just call run_exp() to start training — no need to go through the CLI launcher. """ from llamafactory.train.tuner import run_exp run_exp() if __name__ == "__main__": main()