# Huihui-Qwen3.5-27B-abliterated-Athanorlite-ORPO-v2
Fixed version of nbeerbower/Huihui-Qwen3.5-27B-abliterated-Athanorlite-ORPO with corrected state dict key naming and restored multimodal/MTP weights.
## What was fixed
The original model had three issues caused by a PEFT merge bug with VLM architectures:
- Triple-nested LM keys — `model.language_model.language_model.language_model.X` renamed to `model.language_model.X`
- Misplaced visual keys — `model.language_model.visual.X` moved to `model.visual.X`, visual weights restored from the base model
- Missing MTP weights — 15 multi-token prediction keys (~425M params) grafted back from huihui-ai/Huihui-Qwen3.5-27B-abliterated
Final model: 1199 keys (850 LM + 333 visual + 15 MTP + 1 lm_head), matching the base model's key structure exactly.
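These counts can be spot-checked from the published safetensors index. A minimal sketch follows; the repo id assumes this fixed model is published as `nbeerbower/Huihui-Qwen3.5-27B-abliterated-Athanorlite-ORPO-v2`:

```python
import json
from huggingface_hub import hf_hub_download

# Assumed repo id for this fixed model.
REPO = "nbeerbower/Huihui-Qwen3.5-27B-abliterated-Athanorlite-ORPO-v2"

index_path = hf_hub_download(REPO, filename="model.safetensors.index.json")
with open(index_path) as f:
    keys = list(json.load(f)["weight_map"])

lm = sum(k.startswith("model.language_model.") for k in keys)
visual = sum(k.startswith("model.visual.") for k in keys)
mtp = sum(k.startswith("mtp.") for k in keys)
print(len(keys), lm, visual, mtp)  # expected: 1199, 850, 333, 15 (plus lm_head.weight)
```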
## Training Configuration
| Parameter | Value |
|---|---|
| Training Mode | ORPO |
| Base Model | huihui-ai/Huihui-Qwen3.5-27B-abliterated |
| Learning Rate | 9e-05 |
| Epochs | 1 |
| Batch Size | 1 |
| Gradient Accumulation | 32 |
| Effective Batch Size | 32 |
| Max Sequence Length | 2048 |
| Optimizer | paged_adamw_8bit |
| LR Scheduler | cosine |
| Warmup Ratio | 0.05 |
| Weight Decay | 0.01 |
| Max Grad Norm | 0.25 |
| Seed | 42 |
| Beta | 0.1 |
| Max Prompt Length | 1024 |
| LoRA Rank (r) | 128 |
| LoRA Alpha | 64 |
| LoRA Dropout | 0.05 |
| Target Modules | up_proj, down_proj, gate_proj, k_proj, q_proj, v_proj, o_proj |
| Quantization | 4-bit (NF4) |
| GPU | NVIDIA A100-SXM4-80GB |
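The training run itself is not reproduced here, but the hyperparameters above map roughly onto TRL's `ORPOConfig` and PEFT's `LoraConfig` as in the sketch below. This is illustrative only, not the exact training script; `output_dir` and `task_type` are assumptions.

```python
from transformers import BitsAndBytesConfig
from peft import LoraConfig
from trl import ORPOConfig

# 4-bit NF4 quantization used during training.
bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4")

# LoRA adapter configuration.
peft_config = LoraConfig(
    r=128,
    lora_alpha=64,
    lora_dropout=0.05,
    target_modules=["up_proj", "down_proj", "gate_proj",
                    "k_proj", "q_proj", "v_proj", "o_proj"],
    task_type="CAUSAL_LM",  # assumed
)

# ORPO training arguments matching the table above.
orpo_config = ORPOConfig(
    output_dir="orpo-out",  # assumed
    beta=0.1,
    learning_rate=9e-5,
    num_train_epochs=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=32,
    max_length=2048,
    max_prompt_length=1024,
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    weight_decay=0.01,
    max_grad_norm=0.25,
    optim="paged_adamw_8bit",
    seed=42,
)
```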
## Graft Script
<details>
<summary>Click to expand the Python script used to fix this model</summary>

```python
#!/usr/bin/env python3
"""
Graft multimodal (visual + MTP) weights from the base Qwen3.5-27B model
back onto a fine-tuned variant that has broken key naming and missing components.
Problems in nbeerbower/Huihui-Qwen3.5-27B-abliterated-Athanorlite-ORPO:
1. LM keys are triple-nested: model.language_model.language_model.language_model.X
-> should be model.language_model.X
2. Visual keys are under: model.language_model.visual.X
-> should be model.visual.X
3. MTP (multi-token prediction) weights are completely missing (15 keys)
Fix: rename LM keys, replace visual weights from base model, graft MTP weights from base.
"""
import json
import os
import shutil
from pathlib import Path
from collections import OrderedDict
from huggingface_hub import snapshot_download, hf_hub_download
from safetensors.torch import load_file, save_file
FINETUNED_REPO = "nbeerbower/Huihui-Qwen3.5-27B-abliterated-Athanorlite-ORPO"
BASE_REPO = "huihui-ai/Huihui-Qwen3.5-27B-abliterated"
WORK_DIR = Path(".")
FINETUNED_DIR = WORK_DIR / "finetuned"
OUTPUT_DIR = WORK_DIR / "output"
BASE_CACHE_DIR = WORK_DIR / "base_shard"
MAX_SHARD_SIZE = 5 * 1024 * 1024 * 1024 # 5 GB
def download_models():
"""Download the fine-tuned model and base model files."""
print("Step 1: Downloading models")
snapshot_download(FINETUNED_REPO, local_dir=str(FINETUNED_DIR))
BASE_CACHE_DIR.mkdir(parents=True, exist_ok=True)
# Download base model index to find which shards have visual/MTP keys
hf_hub_download(BASE_REPO, filename="model.safetensors.index.json", local_dir=str(BASE_CACHE_DIR))
hf_hub_download(BASE_REPO, filename="config.json", local_dir=str(BASE_CACHE_DIR))
# Parse the index to find all shards containing visual/MTP keys
with open(BASE_CACHE_DIR / "model.safetensors.index.json") as f:
base_index = json.load(f)
needed_shards = set()
for key, shard in base_index["weight_map"].items():
if key.startswith("model.visual.") or key.startswith("mtp."):
needed_shards.add(shard)
for shard_name in sorted(needed_shards):
print(f" Downloading base shard: {shard_name}")
hf_hub_download(BASE_REPO, filename=shard_name, local_dir=str(BASE_CACHE_DIR))
for fname in ["preprocessor_config.json", "video_preprocessor_config.json"]:
try:
hf_hub_download(BASE_REPO, filename=fname, local_dir=str(BASE_CACHE_DIR))
except Exception:
pass
def fix_key(key: str) -> str:
"""Fix a single weight key name."""
if key.startswith("model.language_model.language_model.language_model."):
return key.replace(
"model.language_model.language_model.language_model.",
"model.language_model.", 1,
)
if key.startswith("model.language_model.visual."):
return key.replace("model.language_model.visual.", "model.visual.", 1)
return key
def process_weights():
"""Rename keys in fine-tuned model, graft visual + MTP from base."""
print("Step 2: Processing weights")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
# Collect visual + MTP tensors from all needed base shards
base_visual = {}
base_mtp = {}
    # Match downloaded base shards regardless of exact shard file naming.
    for shard_path in sorted(BASE_CACHE_DIR.glob("model*.safetensors")):
tensors = load_file(str(shard_path))
for k, v in tensors.items():
if k.startswith("model.visual."):
base_visual[k] = v
elif k.startswith("mtp."):
base_mtp[k] = v
del tensors
print(f" Base visual keys: {len(base_visual)}, MTP keys: {len(base_mtp)}")
# Load fine-tuned shards and fix keys
all_tensors = OrderedDict()
for shard_path in sorted(FINETUNED_DIR.glob("model-*.safetensors")):
shard = load_file(str(shard_path))
for key, tensor in shard.items():
fixed_key = fix_key(key)
if fixed_key.startswith("model.visual."):
continue # Replace with base model's visual weights
all_tensors[fixed_key] = tensor
del shard
# Graft visual + MTP from base
all_tensors.update(base_visual)
all_tensors.update(base_mtp)
print(f" Total keys: {len(all_tensors)}")
# Save as sharded safetensors
weight_map = {}
shard_idx = 1
current_shard = OrderedDict()
current_size = 0
shard_files = []
for key in sorted(all_tensors.keys()):
tensor = all_tensors[key]
tensor_size = tensor.nelement() * tensor.element_size()
if current_size + tensor_size > MAX_SHARD_SIZE and current_shard:
shard_name = f"model-{shard_idx:05d}-of-PLACEHOLDER.safetensors"
save_file(current_shard, str(OUTPUT_DIR / shard_name))
shard_files.append(shard_name)
for k in current_shard:
weight_map[k] = shard_name
current_shard = OrderedDict()
current_size = 0
shard_idx += 1
current_shard[key] = tensor
current_size += tensor_size
if current_shard:
shard_name = f"model-{shard_idx:05d}-of-PLACEHOLDER.safetensors"
save_file(current_shard, str(OUTPUT_DIR / shard_name))
shard_files.append(shard_name)
for k in current_shard:
weight_map[k] = shard_name
total_shards = shard_idx
final_weight_map = {}
for i, old_name in enumerate(shard_files, 1):
new_name = f"model-{i:05d}-of-{total_shards:05d}.safetensors"
(OUTPUT_DIR / old_name).rename(OUTPUT_DIR / new_name)
for k, v in weight_map.items():
if v == old_name:
final_weight_map[k] = new_name
total_size = sum(t.nelement() * t.element_size() for t in all_tensors.values())
index = {"metadata": {"total_size": total_size}, "weight_map": final_weight_map}
with open(OUTPUT_DIR / "model.safetensors.index.json", "w") as f:
json.dump(index, f, indent=2, sort_keys=True)
# Copy config files
for fname in ["config.json", "generation_config.json", "tokenizer.json",
"tokenizer_config.json", "chat_template.jinja"]:
src = FINETUNED_DIR / fname
if src.exists():
shutil.copy2(src, OUTPUT_DIR / fname)
for fname in ["preprocessor_config.json", "video_preprocessor_config.json"]:
src = BASE_CACHE_DIR / fname
if src.exists():
shutil.copy2(src, OUTPUT_DIR / fname)
if __name__ == "__main__":
download_models()
process_weights()
print("Done! Output in:", OUTPUT_DIR)