Huihui-Qwen3.5-27B-abliterated-Athanorlite-ORPO-v2

Fixed version of nbeerbower/Huihui-Qwen3.5-27B-abliterated-Athanorlite-ORPO with corrected state dict key naming and restored multimodal/MTP weights.

What was fixed

The original model had three issues caused by a PEFT merge bug with VLM architectures:

  1. Triple-nested LM keys — model.language_model.language_model.language_model.X renamed to model.language_model.X
  2. Misplaced visual keys — model.language_model.visual.X moved to model.visual.X, visual weights restored from base model
  3. Missing MTP weights — 15 multi-token prediction keys (~425M params) grafted back from huihui-ai/Huihui-Qwen3.5-27B-abliterated

Final model: 1199 keys (850 LM + 333 visual + 15 MTP + 1 lm_head), matching the base model's key structure exactly.

Training Configuration

Parameter Value
Training Mode ORPO
Base Model huihui-ai/Huihui-Qwen3.5-27B-abliterated
Learning Rate 9e-05
Epochs 1
Batch Size 1
Gradient Accumulation 32
Effective Batch Size 32
Max Sequence Length 2048
Optimizer paged_adamw_8bit
LR Scheduler cosine
Warmup Ratio 0.05
Weight Decay 0.01
Max Grad Norm 0.25
Seed 42
Beta 0.1
Max Prompt Length 1024
LoRA Rank (r) 128
LoRA Alpha 64
LoRA Dropout 0.05
Target Modules up_proj, down_proj, gate_proj, k_proj, q_proj, v_proj, o_proj
Quantization 4-bit (NF4)
GPU NVIDIA A100-SXM4-80GB

Graft Script

Click to expand the Python script used to fix this model
#!/usr/bin/env python3
"""
Graft multimodal (visual + MTP) weights from the base Qwen3.5-27B model
back onto a fine-tuned variant that has broken key naming and missing components.

Problems in nbeerbower/Huihui-Qwen3.5-27B-abliterated-Athanorlite-ORPO:
1. LM keys are triple-nested: model.language_model.language_model.language_model.X
   -> should be model.language_model.X
2. Visual keys are under: model.language_model.visual.X
   -> should be model.visual.X
3. MTP (multi-token prediction) weights are completely missing (15 keys)

Fix: rename LM keys, replace visual weights from base model, graft MTP weights from base.
"""

import json
import os
import shutil
from pathlib import Path
from collections import OrderedDict

from huggingface_hub import snapshot_download, hf_hub_download
from safetensors.torch import load_file, save_file

# Fine-tuned repo with the broken key naming / missing MTP weights to repair.
FINETUNED_REPO = "nbeerbower/Huihui-Qwen3.5-27B-abliterated-Athanorlite-ORPO"
# Donor repo: supplies the correct visual and MTP weights.
BASE_REPO = "huihui-ai/Huihui-Qwen3.5-27B-abliterated"
WORK_DIR = Path(".")
FINETUNED_DIR = WORK_DIR / "finetuned"   # full snapshot of the fine-tuned model
OUTPUT_DIR = WORK_DIR / "output"         # repaired model is written here
BASE_CACHE_DIR = WORK_DIR / "base_shard" # only the base shards we need (visual/MTP)
MAX_SHARD_SIZE = 5 * 1024 * 1024 * 1024  # 5 GB


def download_models():
    """Download the fine-tuned model and the base-model files we need.

    Pulls the full fine-tuned snapshot, then only those base-model shards
    that contain visual or MTP tensors (located via the base index), plus
    the base preprocessor configs when the repo provides them.
    """
    print("Step 1: Downloading models")

    snapshot_download(FINETUNED_REPO, local_dir=str(FINETUNED_DIR))

    BASE_CACHE_DIR.mkdir(parents=True, exist_ok=True)
    # The index's weight_map tells us which shard holds each tensor, so we
    # can avoid downloading the whole (very large) base model.
    for required in ("model.safetensors.index.json", "config.json"):
        hf_hub_download(BASE_REPO, filename=required, local_dir=str(BASE_CACHE_DIR))

    with open(BASE_CACHE_DIR / "model.safetensors.index.json") as fh:
        weight_map = json.load(fh)["weight_map"]

    needed_shards = {
        shard
        for tensor_name, shard in weight_map.items()
        if tensor_name.startswith("model.visual.") or tensor_name.startswith("mtp.")
    }

    for shard_name in sorted(needed_shards):
        print(f"  Downloading base shard: {shard_name}")
        hf_hub_download(BASE_REPO, filename=shard_name, local_dir=str(BASE_CACHE_DIR))

    # Preprocessor configs are optional; best-effort — skip repos lacking them.
    for optional in ("preprocessor_config.json", "video_preprocessor_config.json"):
        try:
            hf_hub_download(BASE_REPO, filename=optional, local_dir=str(BASE_CACHE_DIR))
        except Exception:
            pass


def fix_key(key: str) -> str:
    """Fix a single weight key name.

    Repairs the two key-naming bugs left by the broken PEFT merge:
    triple-nested LM keys and visual keys nested under the language model.
    Keys that match neither pattern pass through unchanged.
    """
    renames = (
        # triple-nested LM prefix -> single prefix
        ("model.language_model.language_model.language_model.",
         "model.language_model."),
        # visual tower misplaced under the LM -> top level
        ("model.language_model.visual.", "model.visual."),
    )
    for bad_prefix, good_prefix in renames:
        if key.startswith(bad_prefix):
            return good_prefix + key[len(bad_prefix):]
    return key


def _collect_base_multimodal():
    """Load visual + MTP tensors from every downloaded base shard.

    Returns (base_visual, base_mtp) dicts of key -> tensor.

    NOTE(review): the original globbed "model.safetensors-*.safetensors",
    which does not match the conventional HF shard naming
    "model-XXXXX-of-YYYYY.safetensors" and could silently load nothing.
    BASE_CACHE_DIR only ever contains base shards and config JSONs, so
    globbing all *.safetensors files is both robust and safe.
    """
    base_visual = {}
    base_mtp = {}
    for shard_path in sorted(BASE_CACHE_DIR.glob("*.safetensors")):
        tensors = load_file(str(shard_path))
        for k, v in tensors.items():
            if k.startswith("model.visual."):
                base_visual[k] = v
            elif k.startswith("mtp."):
                base_mtp[k] = v
        del tensors  # free shard memory before loading the next one
    return base_visual, base_mtp


def _load_fixed_finetuned():
    """Load fine-tuned shards with repaired key names.

    Visual keys are dropped here — they are replaced wholesale by the base
    model's visual weights in process_weights().
    """
    all_tensors = OrderedDict()
    for shard_path in sorted(FINETUNED_DIR.glob("model-*.safetensors")):
        shard = load_file(str(shard_path))
        for key, tensor in shard.items():
            fixed_key = fix_key(key)
            if fixed_key.startswith("model.visual."):
                continue  # replaced by the base model's visual weights
            all_tensors[fixed_key] = tensor
        del shard  # free shard memory before loading the next one
    return all_tensors


def _save_sharded(all_tensors):
    """Write tensors as <= MAX_SHARD_SIZE shards plus an HF-style index.

    Shard count is unknown until all tensors are packed, so shards are
    first saved under PLACEHOLDER names, then renamed once the total is
    known. Keys are tracked per shard so the final weight map is built in
    O(keys) rather than the original O(keys * shards) rescan.
    """
    shard_records = []  # (placeholder_name, [keys]) in save order
    current_shard = OrderedDict()
    current_size = 0

    def _flush():
        """Persist the current shard and start a fresh one."""
        nonlocal current_shard, current_size
        shard_name = f"model-{len(shard_records) + 1:05d}-of-PLACEHOLDER.safetensors"
        save_file(current_shard, str(OUTPUT_DIR / shard_name))
        shard_records.append((shard_name, list(current_shard.keys())))
        current_shard = OrderedDict()
        current_size = 0

    # Sorted key order gives a deterministic shard layout.
    for key in sorted(all_tensors.keys()):
        tensor = all_tensors[key]
        tensor_size = tensor.nelement() * tensor.element_size()
        if current_size + tensor_size > MAX_SHARD_SIZE and current_shard:
            _flush()
        current_shard[key] = tensor
        current_size += tensor_size
    if current_shard:
        _flush()

    # Rename placeholders now that the total shard count is known.
    total_shards = len(shard_records)
    final_weight_map = {}
    for i, (old_name, keys) in enumerate(shard_records, 1):
        new_name = f"model-{i:05d}-of-{total_shards:05d}.safetensors"
        (OUTPUT_DIR / old_name).rename(OUTPUT_DIR / new_name)
        for k in keys:
            final_weight_map[k] = new_name

    total_size = sum(t.nelement() * t.element_size() for t in all_tensors.values())
    index = {"metadata": {"total_size": total_size}, "weight_map": final_weight_map}
    with open(OUTPUT_DIR / "model.safetensors.index.json", "w") as f:
        json.dump(index, f, indent=2, sort_keys=True)


def _copy_configs():
    """Copy tokenizer/config files from the fine-tune, preprocessor configs from the base."""
    for fname in ["config.json", "generation_config.json", "tokenizer.json",
                   "tokenizer_config.json", "chat_template.jinja"]:
        src = FINETUNED_DIR / fname
        if src.exists():
            shutil.copy2(src, OUTPUT_DIR / fname)
    for fname in ["preprocessor_config.json", "video_preprocessor_config.json"]:
        src = BASE_CACHE_DIR / fname
        if src.exists():
            shutil.copy2(src, OUTPUT_DIR / fname)


def process_weights():
    """Rename keys in fine-tuned model, graft visual + MTP from base.

    Pipeline:
      1. Collect visual/MTP tensors from the downloaded base shards.
      2. Load the fine-tuned shards with fix_key()-repaired names,
         dropping the broken visual tensors.
      3. Graft the base visual + MTP tensors over the result.
      4. Re-shard, write the safetensors index, and copy config files.
    """
    print("Step 2: Processing weights")
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    base_visual, base_mtp = _collect_base_multimodal()
    print(f"  Base visual keys: {len(base_visual)}, MTP keys: {len(base_mtp)}")

    all_tensors = _load_fixed_finetuned()
    all_tensors.update(base_visual)
    all_tensors.update(base_mtp)
    print(f"  Total keys: {len(all_tensors)}")

    _save_sharded(all_tensors)
    _copy_configs()


if __name__ == "__main__":
    # Full pipeline: fetch both repos, then fix keys, graft, and re-shard.
    download_models()
    process_weights()
    print(f"Done! Output in: {OUTPUT_DIR}")

Trained with Merlina

Merlina on GitHub

Downloads last month
225
Safetensors
Model size
28B params
Tensor type
BF16
·
Inference Providers NEW
This model isn't deployed by any Inference Provider. 🙋 Ask for provider support

Model tree for nbeerbower/Huihui-Qwen3.5-27B-abliterated-Athanorlite-ORPO-v2

Base model

Qwen/Qwen3.5-27B
Finetuned
(3)
this model
Quantizations
3 models

Dataset used to train nbeerbower/Huihui-Qwen3.5-27B-abliterated-Athanorlite-ORPO-v2