Spaces:
Sleeping
Sleeping
File size: 6,541 Bytes
745f62a | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 | """
Export fine-tuned LoRA adapter to GGUF for Ollama deployment.
Step 1: Unsloth merges LoRA into base model -> saves 16-bit safetensors
Step 2: llama.cpp converts safetensors -> GGUF (with quantization)
Step 3: Register with Ollama
Usage:
python scripts/export_gguf.py
python scripts/export_gguf.py --quant q8_0
"""
import argparse
import os
import subprocess
import sys
# Set at import time, before torch/unsloth are loaded (both are imported
# lazily inside step1_merge).
# NOTE(review): presumably these switch off torch.compile / TorchDynamo for
# the merge step — confirm against the PyTorch documentation.
os.environ["TORCH_COMPILE_DISABLE"] = "1"
os.environ["TORCHDYNAMO_DISABLE"] = "1"
# Ollama executable, resolved under the current user's home directory.
# NOTE(review): looks like the per-user Windows install location — confirm.
OLLAMA_EXE = os.path.expanduser("~/AppData/Local/Programs/Ollama/ollama.exe")
# Default value of the --adapter option; passed to step1_merge.
ADAPTER_PATH = "./models/checkpoints/final"
# Directory step1_merge writes the merged model to and step2_convert_gguf reads.
MERGED_DIR = "./models/merged_16bit"
# Directory that receives the GGUF file(s) and the rewritten Modelfile.
EXPORT_DIR = "./models/exported"
# Modelfile that step3_ollama reads and whose FROM line it rewrites.
MODELFILE_PATH = "./configs/Modelfile"
def step1_merge(adapter_path):
    """Merge the LoRA adapter into its base model and save 16-bit weights.

    The checkpoint at ``adapter_path`` is loaded through Unsloth, merged, and
    written together with its tokenizer into ``MERGED_DIR``.
    """
    rule = "=" * 60
    print(rule)
    print("Step 1: Merging LoRA into base model (Unsloth)...")
    print(rule)

    # Function-scope import: unsloth is only loaded when this step runs.
    from unsloth import FastLanguageModel

    load_options = {
        "model_name": adapter_path,
        "max_seq_length": 4096,
        "load_in_4bit": True,
    }
    model, tokenizer = FastLanguageModel.from_pretrained(**load_options)

    os.makedirs(MERGED_DIR, exist_ok=True)
    model.save_pretrained_merged(MERGED_DIR, tokenizer, save_method="merged_16bit")
    print(f"Merged model saved to {MERGED_DIR}")

    # Drop our references, collect garbage and empty the CUDA cache so GPU
    # memory is released before the llama.cpp conversion runs.
    del model, tokenizer
    import gc
    import torch

    gc.collect()
    torch.cuda.empty_cache()
def _find_llama_quantize():
    """Return the first existing llama-quantize binary path, or None."""
    # Single source of truth for both the pre-build and post-build lookup, so
    # the two searches can no longer drift apart.
    candidates = (
        "./llama.cpp/build/bin/llama-quantize",
        "./llama.cpp/build/bin/Release/llama-quantize",
        "./llama.cpp/build/bin/Release/llama-quantize.exe",
        "./llama.cpp/build/bin/llama-quantize.exe",
    )
    return next((path for path in candidates if os.path.exists(path)), None)


def step2_convert_gguf(quant):
    """Convert the merged safetensors in ``MERGED_DIR`` to GGUF via llama.cpp.

    Args:
        quant: Target type, matched case-insensitively. ``f16`` / ``f32`` are
            produced directly by ``convert_hf_to_gguf.py``; any other value is
            produced by converting to f16 first and then running
            ``llama-quantize`` with the upper-cased type name.

    Returns:
        Path of the resulting GGUF file inside ``EXPORT_DIR``. If
        ``llama-quantize`` cannot be found even after building llama.cpp, the
        intermediate f16 file is kept and its path is returned instead.

    Raises:
        subprocess.CalledProcessError: if git, pip, cmake, the conversion
            script or llama-quantize exits with a non-zero status.
    """
    # Normalize once. Without this, "F16" would miss the direct-conversion
    # branch and, on a case-insensitive filesystem, the quantize input
    # (sakhi-e4b-f16.gguf) and output (sakhi-e4b-F16.gguf) would be the same
    # file, which is then deleted.
    quant = quant.lower()

    print("=" * 60)
    print(f"Step 2: Converting to GGUF ({quant}) via llama.cpp...")
    print("=" * 60)

    convert_script = "./llama.cpp/convert_hf_to_gguf.py"
    if not os.path.exists(convert_script):
        print("Cloning llama.cpp...")
        subprocess.run(
            ["git", "clone", "--depth", "1", "https://github.com/ggml-org/llama.cpp", "./llama.cpp"],
            check=True,
        )
        # Only install gguf package — full requirements.txt has torch version conflicts
        subprocess.run(
            [sys.executable, "-m", "pip", "install", "gguf"],
            check=True,
        )

    os.makedirs(EXPORT_DIR, exist_ok=True)
    gguf_output = os.path.join(EXPORT_DIR, f"sakhi-e4b-{quant}.gguf")

    if quant in ("f16", "f32"):
        # Direct conversion, no quantization needed
        subprocess.run(
            [sys.executable, convert_script, MERGED_DIR, "--outfile", gguf_output, "--outtype", quant],
            check=True,
        )
    else:
        # Convert to f16 first, then quantize
        gguf_f16 = os.path.join(EXPORT_DIR, "sakhi-e4b-f16.gguf")
        print("Converting HF -> GGUF (f16)...")
        subprocess.run(
            [sys.executable, convert_script, MERGED_DIR, "--outfile", gguf_f16, "--outtype", "f16"],
            check=True,
        )

        quantize_bin = _find_llama_quantize()
        if quantize_bin is None:
            print("Building llama.cpp (needs cmake)...")
            subprocess.run(
                ["cmake", "-B", "build", "-DCMAKE_BUILD_TYPE=Release"],
                cwd="./llama.cpp", check=True,
            )
            subprocess.run(
                ["cmake", "--build", "build", "--config", "Release", "-j"],
                cwd="./llama.cpp", check=True,
            )
            # Re-check after build
            quantize_bin = _find_llama_quantize()

        if quantize_bin is None:
            # Best-effort fallback: keep the f16 file rather than aborting.
            print("ERROR: llama-quantize not found after build!")
            print(f"F16 GGUF is at: {gguf_f16}")
            print("You can quantize manually later.")
            gguf_output = gguf_f16
        else:
            quant_type = quant.upper()
            print(f"Quantizing f16 -> {quant_type}...")
            subprocess.run([quantize_bin, gguf_f16, gguf_output, quant_type], check=True)
            # The f16 file was only an intermediate; remove it once quantized.
            os.remove(gguf_f16)

    print(f"GGUF: {gguf_output}")
    return gguf_output
def step3_ollama(gguf_path, model_name):
    """Create an Ollama model named ``model_name`` from a GGUF file.

    Reads the Modelfile at ``MODELFILE_PATH``, points every ``FROM`` line at
    the absolute path of ``gguf_path``, writes the result to
    ``EXPORT_DIR/Modelfile`` and runs ``ollama create`` on it.

    Args:
        gguf_path: Path to the GGUF file to register.
        model_name: Name to give the model in Ollama.

    Raises:
        SystemExit: with status 1 when ``ollama create`` exits non-zero.
        OSError: if the Modelfile cannot be read/written or the Ollama
            executable cannot be started.
    """
    import re
    import shutil

    print("=" * 60)
    print(f"Step 3: Creating Ollama model '{model_name}'...")
    print("=" * 60)

    abs_gguf = os.path.abspath(gguf_path)

    # Explicit UTF-8: the locale default (e.g. cp1252 on Windows) cannot decode
    # non-ASCII Modelfile content such as Devanagari prompts.
    # NOTE(review): assumes the Modelfile is stored as UTF-8 — confirm.
    with open(MODELFILE_PATH, "r", encoding="utf-8") as f:
        modelfile_content = f.read()

    # A callable replacement is inserted literally. A replacement *string*
    # would have its backslashes parsed as escapes, so a Windows path like
    # C:\Users\... raises "re.error: bad escape \U".
    modelfile_content = re.sub(
        r'^FROM\s+.*$',
        lambda _match: f'FROM {abs_gguf}',
        modelfile_content,
        flags=re.MULTILINE,
    )

    os.makedirs(EXPORT_DIR, exist_ok=True)
    updated_modelfile = os.path.join(EXPORT_DIR, "Modelfile")
    with open(updated_modelfile, "w", encoding="utf-8") as f:
        f.write(modelfile_content)

    # Prefer the configured path; if it does not exist, fall back to an
    # `ollama` found on PATH. With neither available the configured path is
    # used and fails exactly as before.
    ollama_exe = OLLAMA_EXE
    if not os.path.exists(ollama_exe):
        ollama_exe = shutil.which("ollama") or OLLAMA_EXE

    # Decode as UTF-8 with replacement so unexpected bytes in the tool's
    # output cannot abort the run with a decode error.
    result = subprocess.run(
        [ollama_exe, "create", model_name, "-f", updated_modelfile],
        capture_output=True, text=True, encoding="utf-8", errors="replace",
    )
    print(result.stdout)
    if result.returncode != 0:
        print(f"ERROR: {result.stderr}")
        sys.exit(1)
    print(f"Done! Test with: ollama run {model_name} \"नमस्ते\"")
def main():
    """Parse the command line and run the merge -> GGUF -> Ollama steps.

    ``--skip-merge`` bypasses step 1 and ``--skip-ollama`` bypasses step 3;
    the GGUF conversion (step 2) always runs.
    """
    cli = argparse.ArgumentParser(description="Export LoRA to GGUF for Ollama")
    cli.add_argument("--quant", default="q4_k_m", help="Quantization (default: q4_k_m)")
    cli.add_argument("--adapter", default=ADAPTER_PATH)
    cli.add_argument("--model-name", default="sakhi")
    cli.add_argument("--skip-merge", action="store_true", help="Skip step 1 (reuse existing merged)")
    cli.add_argument("--skip-ollama", action="store_true", help="Skip step 3")
    opts = cli.parse_args()

    run_merge = not opts.skip_merge
    if run_merge:
        step1_merge(opts.adapter)

    exported_gguf = step2_convert_gguf(opts.quant)

    if opts.skip_ollama:
        # Registration skipped: report where the file is and how to register it.
        print(f"\nGGUF ready at: {exported_gguf}")
        print(f"Run: ollama create {opts.model_name} -f models/exported/Modelfile")
        return

    step3_ollama(exported_gguf, opts.model_name)
# Run the export pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|