Code

#2
by YsK-dev - opened

You shared the model weights and data, but what about the pipeline you used for finetuning — will you share that as well? Nice job, btw.

TeichAI org

Let me get the files in order

TeichAI org

Ok so my file had a bunch of sanity checks for the dataset and a lot of bloated checking to make sure the dataset looked good. I had Claude remake the file, removing all of this extra stuff (just to explain the AI-like format of the file in advance). But more or less this is the exact file I used for training this model on both TeichAI/Gemini-3-Flash-Preview-VIBE and TeichAI/MiniMax-M2.1-Code-SFT:

import os
import re
import json
import hashlib
import multiprocessing as mp
from collections import Counter

# Environment setup (a couple of these are windows specific).
# These must be set BEFORE the unsloth/transformers imports below take effect.
os.environ["UNSLOTH_COMPILE_DISABLE"] = "1"  # skip unsloth's torch.compile path
os.environ["HF_DATASETS_DISABLE_MULTIPROCESSING"] = "1"  # single-process dataset ops
os.environ["TOKENIZERS_PARALLELISM"] = "false"  # silence tokenizers fork/parallelism warning
# NOTE(review): newer PyTorch reads PYTORCH_ALLOC_CONF, older versions read
# PYTORCH_CUDA_ALLOC_CONF — confirm this matches the installed torch version.
os.environ["PYTORCH_ALLOC_CONF"] = "expandable_segments:true"

from datasets import load_dataset, concatenate_datasets
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template
from transformers import AutoTokenizer
from trl import SFTTrainer, SFTConfig
import torch


# ============================================================================
# CONFIGURATION
# ============================================================================

# Model configuration
INPUT_MODEL = "unsloth/Qwen3-4B-Thinking-2507"  # base model to fine-tune (4-bit load below)
CHAT_TEMPLATE = "qwen3-thinking"  # unsloth chat-template name applied to the tokenizer
MAX_SEQ_LENGTH = 32768  # tokens; also used as SFTConfig.max_length

# Dataset configuration (can be a single string or list of dataset names)
HF_DATASETS = ["TeichAI/MiniMax-M2.1-Code-SFT"]
DATASET_FILES = []  # Optional: local JSONL files

# Training configuration
MAX_STEPS = 2000  # hard step cap (training stops here regardless of epochs)
BATCH_SIZE = 2  # per-device batch size
GRAD_ACCUMULATION = 4  # effective batch = BATCH_SIZE * GRAD_ACCUMULATION = 8
LEARNING_RATE = 2e-4
WARMUP_STEPS = 5
SAVE_STEPS = 200  # checkpoint every N steps
SAVE_TOTAL_LIMIT = 20  # keep at most this many checkpoints

# LoRA configuration
LORA_RANK = 32
LORA_ALPHA = 32  # alpha == rank -> effective scaling factor of 1.0
LORA_DROPOUT = 0

# Output configuration
HF_ACCOUNT = "TeichAI"
OUTPUT_MODEL_REPO = "Qwen3-4B-Thinking-Tools-SFT"
# NOTE(review): nothing in this file reads a token from the environment; with
# None the HF libraries fall back to the cached `huggingface-cli login` token.
HF_TOKEN = None  # Set your token here or via environment variable
PRIVATE_UPLOAD = True  # upload repos as private
RESUME_FROM_CHECKPOINT = False  # passed straight to trainer.train()


# ============================================================================
# DATASET LOADING & PREPROCESSING
# ============================================================================

def load_raw_dataset():
    """Build one combined training dataset from HF hub repos and local JSONL files.

    Returns:
        A single `datasets.Dataset` (train split) concatenated from every
        configured source.

    Raises:
        ValueError: if neither HF_DATASETS nor DATASET_FILES yields a source.
    """
    parts = []

    # Hub datasets — config accepts a single name or a list of names.
    names = HF_DATASETS if isinstance(HF_DATASETS, list) else [HF_DATASETS]
    for name in names:
        if name and name.strip():
            print(f"Loading HuggingFace dataset: {name}")
            parts.append(load_dataset(name.strip(), split="train"))

    # Local JSONL files, if any were configured.
    raw_files = DATASET_FILES if isinstance(DATASET_FILES, list) else [DATASET_FILES]
    files = [path.strip() for path in raw_files if path and path.strip()]
    if files:
        print(f"Loading local files: {files}")
        parts.append(load_dataset("json", data_files=files, split="train"))

    if not parts:
        raise ValueError("No datasets provided! Set HF_DATASETS or DATASET_FILES.")

    combined = parts[0] if len(parts) == 1 else concatenate_datasets(parts)
    print(f"Total rows loaded: {len(combined)}")

    return combined


def deduplicate_dataset(dataset):
    """Drop rows whose prompt has already been seen, keeping the first occurrence.

    The dedup key is every message EXCEPT a trailing assistant reply, so two
    rows with the same prompt but different completions count as duplicates.
    Relies on sequential filtering (num_proc=1) since `observed` is shared
    closure state.
    """
    print(f"Rows before deduplication: {len(dataset)}")

    observed = set()

    def keep_first(example):
        msgs = example.get("messages", [])
        if not isinstance(msgs, list) or not msgs:
            return False

        # Strip a trailing assistant turn so only the prompt is hashed.
        prompt = msgs
        if isinstance(msgs[-1], dict) and msgs[-1].get("role") == "assistant":
            prompt = msgs[:-1]

        # MD5 is fine here — used purely as a dedup fingerprint, not security.
        digest = hashlib.md5(
            json.dumps(prompt, sort_keys=True).encode("utf-8")
        ).hexdigest()

        if digest in observed:
            return False
        observed.add(digest)
        return True

    deduplicated = dataset.filter(keep_first, num_proc=1, load_from_cache_file=False)
    print(f"Rows after deduplication: {len(deduplicated)}")

    return deduplicated


def validate_messages(messages):
    """Check a conversation's structure; return "" if valid, else an error tag.

    Valid means: a non-empty list of dicts whose last non-system entry has
    role "assistant" (i.e. there is a completion to train on).
    """
    if not (isinstance(messages, list) and messages):
        return "messages_not_list_or_empty"

    final_role = None
    for entry in messages:
        if not isinstance(entry, dict):
            return "message_not_dict"
        # System turns are ignored when deciding what the conversation ends on.
        current = entry.get("role")
        if current != "system":
            final_role = current

    return "" if final_role == "assistant" else "does_not_end_with_assistant"


def filter_invalid_messages(dataset):
    """Drop rows whose message list fails structural validation."""
    # Tag every row with its failure reason first ("" means valid).
    dataset = dataset.map(
        lambda ex: {"bad_reason": validate_messages(ex.get("messages"))},
        num_proc=1,
    )

    original_size = len(dataset)
    dataset = dataset.filter(lambda ex: ex["bad_reason"] == "", num_proc=1)
    dataset = dataset.remove_columns(["bad_reason"])

    dropped = original_size - len(dataset)
    if dropped > 0:
        print(f"Filtered out {dropped} invalid rows")

    return dataset


def prepare_dataset():
    """Run the full data pipeline: load, deduplicate, shuffle, then validate."""
    print("=" * 80)
    print("LOADING DATASET")
    print("=" * 80)

    # Order matters: dedup before shuffle keeps "first occurrence" deterministic
    # with respect to the source ordering.
    dataset = filter_invalid_messages(
        deduplicate_dataset(load_raw_dataset()).shuffle(seed=42)
    )

    print(f"\nFinal dataset size: {len(dataset)} rows")
    return dataset


# ============================================================================
# MODEL LOADING & TRAINING
# ============================================================================

def load_model_and_tokenizer():
    """Load the base model in 4-bit, attach LoRA adapters, and set the chat template.

    Returns:
        (model, tokenizer) ready for SFT training.
    """
    print("\n" + "=" * 80)
    print("LOADING MODEL")
    print("=" * 80)

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=INPUT_MODEL,
        max_seq_length=MAX_SEQ_LENGTH,
        load_in_4bit=True,
        load_in_8bit=False,
        full_finetuning=False,
        token=HF_TOKEN,
        attn_implementation="eager",
    )

    # LoRA on every attention projection plus the MLP projections.
    adapter_targets = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ]
    model = FastLanguageModel.get_peft_model(
        model,
        r=LORA_RANK,
        target_modules=adapter_targets,
        lora_alpha=LORA_ALPHA,
        lora_dropout=LORA_DROPOUT,
        bias="none",
        use_gradient_checkpointing="unsloth",
        random_state=3407,
        use_rslora=False,
        loftq_config=None,
    )

    # Swap in the configured chat template so formatting matches training data.
    tokenizer = get_chat_template(tokenizer, chat_template=CHAT_TEMPLATE)

    print(f"Model loaded: {INPUT_MODEL}")
    print(f"LoRA rank: {LORA_RANK}, alpha: {LORA_ALPHA}")
    print(f"Max sequence length: {MAX_SEQ_LENGTH}")

    return model, tokenizer


def formatting_prompts_func(examples, tokenizer):
    """Render each conversation in a batch into a single training string.

    Args:
        examples: batched mapping with a "messages" column (list of
            conversations) and optionally a "tools" column holding per-row
            tool schemas (entries may be None or empty).
        tokenizer: tokenizer whose chat template is applied.

    Returns:
        dict with a "text" column of rendered strings, aligned with the input rows.
    """
    convos = examples["messages"]
    # Rows without tool definitions get None so zip stays aligned with convos.
    tools_list = examples.get("tools", [None] * len(convos))

    texts = []
    for convo, tools in zip(convos, tools_list):
        # apply_chat_template's `tools` parameter defaults to None, so passing
        # None explicitly is identical to omitting it — a single call covers
        # both the with-tools and without-tools rows.
        texts.append(
            tokenizer.apply_chat_template(
                convo,
                tools=tools if tools else None,
                tokenize=False,
                add_generation_prompt=False,
            )
        )

    return {"text": texts}


def train_model(model, tokenizer, dataset):
    """Train the model using SFTTrainer.

    Renders the dataset to text via the chat template, builds the trainer,
    wires the tokenizer's special-token ids into the model configs, runs
    training, and prints GPU memory statistics.

    Args:
        model: PEFT-wrapped model returned by load_model_and_tokenizer().
        tokenizer: tokenizer with the chat template already applied.
        dataset: cleaned dataset with a "messages" column (and optional "tools").

    Returns:
        (model, tokenizer) after training.
    """
    print("\n" + "=" * 80)
    print("PREPARING TRAINING")
    print("=" * 80)

    # Format dataset: batched map adds a "text" column that SFTConfig's
    # dataset_text_field below points at.
    train_dataset = dataset.map(
        lambda ex: formatting_prompts_func(ex, tokenizer),
        batched=True,
    )

    # Create trainer
    trainer = SFTTrainer(
        model=model,
        processing_class=tokenizer,
        train_dataset=train_dataset,
        eval_dataset=None,
        dataset_num_proc=1,
        args=SFTConfig(
            dataset_text_field="text",
            max_length=MAX_SEQ_LENGTH,
            per_device_train_batch_size=BATCH_SIZE,
            gradient_accumulation_steps=GRAD_ACCUMULATION,
            warmup_steps=WARMUP_STEPS,
            max_steps=MAX_STEPS,
            learning_rate=LEARNING_RATE,
            logging_steps=1,
            optim="paged_adamw_8bit",
            weight_decay=0.01,
            lr_scheduler_type="linear",
            # NOTE(review): 3447 differs from the LoRA random_state (3407)
            # used in load_model_and_tokenizer — possibly a typo; confirm.
            seed=3447,
            report_to="none",
            dataloader_num_workers=0,
            output_dir="outputs",
            save_strategy="steps",
            save_steps=SAVE_STEPS,
            save_total_limit=SAVE_TOTAL_LIMIT,
        ),
    )

    # Print GPU stats. start_gpu_memory is the pre-training baseline so the
    # delta after train() isolates memory used by training itself.
    gpu_stats = torch.cuda.get_device_properties(0)
    start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
    max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
    print(f"GPU: {gpu_stats.name}")
    print(f"Max memory: {max_memory} GB")
    print(f"Reserved memory: {start_gpu_memory} GB")

    # Configure token IDs: mirror the tokenizer's special tokens into both the
    # model config and generation config — presumably because get_chat_template
    # can change them; verify against the unsloth template used.
    model.config.pad_token_id = tokenizer.pad_token_id
    model.config.eos_token_id = tokenizer.eos_token_id
    model.config.bos_token_id = tokenizer.bos_token_id
    model.generation_config.eos_token_id = tokenizer.eos_token_id
    model.generation_config.pad_token_id = tokenizer.pad_token_id
    model.generation_config.bos_token_id = tokenizer.bos_token_id

    print("\n" + "=" * 80)
    print("STARTING TRAINING")
    print("=" * 80)

    # Train
    trainer_stats = trainer.train(resume_from_checkpoint=RESUME_FROM_CHECKPOINT)

    # Print training stats (all figures are reserved CUDA memory, in GiB)
    used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
    used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
    used_percentage = round(used_memory / max_memory * 100, 3)
    lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

    print("\n" + "=" * 80)
    print("TRAINING COMPLETE")
    print("=" * 80)
    print(f"Training time: {trainer_stats.metrics['train_runtime']:.2f} seconds")
    print(f"Training time: {trainer_stats.metrics['train_runtime']/60:.2f} minutes")
    print(f"Peak reserved memory: {used_memory} GB")
    print(f"Peak reserved memory for training: {used_memory_for_lora} GB")
    print(f"Peak reserved memory % of max: {used_percentage}%")
    print(f"Peak reserved memory for training % of max: {lora_percentage}%")

    return model, tokenizer


def save_model(model, tokenizer):
    """Push the merged 16-bit model and GGUF quantizations to the HF Hub."""
    print("\n" + "=" * 80)
    print("SAVING MODEL")
    print("=" * 80)

    merged_repo = f"{HF_ACCOUNT}/{OUTPUT_MODEL_REPO}"
    gguf_repo = f"{HF_ACCOUNT}/{OUTPUT_MODEL_REPO}-GGUF"

    # LoRA adapters merged back into the base weights at 16-bit precision.
    print(f"Pushing merged model to {merged_repo}...")
    model.push_to_hub_merged(
        merged_repo,
        tokenizer,
        save_method="merged_16bit",
        token=HF_TOKEN,
        private=PRIVATE_UPLOAD,
    )
    print("✓ Merged model saved")

    # llama.cpp-compatible quantized exports.
    print(f"Pushing GGUF models to {gguf_repo}...")
    model.push_to_hub_gguf(
        gguf_repo,
        tokenizer,
        quantization_method=["bf16", "f16", "q8_0"],
        token=HF_TOKEN,
        private=PRIVATE_UPLOAD,
    )
    print("✓ GGUF models saved")

    print("\n" + "=" * 80)
    print("ALL DONE!")
    print("=" * 80)


# ============================================================================
# MAIN EXECUTION
# ============================================================================

def main():
    """Main training pipeline: data prep -> model load -> train -> upload."""
    dataset = prepare_dataset()
    model, tokenizer = load_model_and_tokenizer()
    model, tokenizer = train_model(model, tokenizer, dataset)
    save_model(model, tokenizer)


if __name__ == "__main__":
    # Required for multiprocessing in frozen Windows executables; harmless elsewhere.
    mp.freeze_support()
    main()

Maybe you could upload the code you use for each finetune to docs.teichai.com

Sign up or log in to comment