Spaces:
Sleeping
Sleeping
| """ | |
Export fine-tuned LoRA adapter to GGUF for Ollama deployment.

Step 1: Unsloth merges LoRA into base model -> saves 16-bit safetensors
Step 2: llama.cpp converts safetensors -> GGUF (with quantization)
Step 3: Register with Ollama

Usage:
    python scripts/export_gguf.py
    python scripts/export_gguf.py --quant q8_0
| """ | |
| import argparse | |
| import os | |
| import subprocess | |
| import sys | |
# Environment flags asking PyTorch to disable torch.compile / TorchDynamo.
# They are set at module import, before step1_merge imports unsloth and torch.
os.environ.update(
    {
        "TORCH_COMPILE_DISABLE": "1",
        "TORCHDYNAMO_DISABLE": "1",
    }
)

# Ollama CLI under the current user's home directory (Windows install layout).
OLLAMA_EXE = os.path.expanduser("~/AppData/Local/Programs/Ollama/ollama.exe")

# Default LoRA adapter checkpoint to export (overridable with --adapter).
ADAPTER_PATH = "./models/checkpoints/final"
# Where step 1 writes the merged 16-bit safetensors model.
MERGED_DIR = "./models/merged_16bit"
# Where step 2 writes GGUF files and step 3 writes the rewritten Modelfile.
EXPORT_DIR = "./models/exported"
# Template Modelfile whose FROM line step 3 rewrites.
MODELFILE_PATH = "./configs/Modelfile"
def step1_merge(adapter_path):
    """Merge the LoRA adapter into its base model and save 16-bit weights.

    The checkpoint at ``adapter_path`` is loaded through Unsloth's
    ``FastLanguageModel`` (4-bit load, 4096-token max sequence length) and
    written to ``MERGED_DIR`` with the ``merged_16bit`` save method.

    Args:
        adapter_path: Path (or model name) of the fine-tuned LoRA checkpoint.
    """
    banner = "=" * 60
    print(banner)
    print("Step 1: Merging LoRA into base model (Unsloth)...")
    print(banner)

    # Imported inside the function, so runs that skip this step never import
    # Unsloth at all.
    from unsloth import FastLanguageModel

    load_options = {
        "model_name": adapter_path,
        "max_seq_length": 4096,
        "load_in_4bit": True,
    }
    merged_model, tok = FastLanguageModel.from_pretrained(**load_options)

    os.makedirs(MERGED_DIR, exist_ok=True)
    merged_model.save_pretrained_merged(MERGED_DIR, tok, save_method="merged_16bit")
    print(f"Merged model saved to {MERGED_DIR}")

    # Drop our references and release cached CUDA memory before the
    # llama.cpp conversion starts.
    del merged_model, tok
    import torch
    import gc

    gc.collect()
    torch.cuda.empty_cache()
def _find_quantize_bin():
    """Return the first existing ``llama-quantize`` binary path, or ``None``.

    Checks ``build/bin`` and ``build/bin/Release`` inside the local llama.cpp
    checkout, with and without the Windows ``.exe`` suffix.
    """
    candidates = (
        "./llama.cpp/build/bin/llama-quantize",
        "./llama.cpp/build/bin/Release/llama-quantize",
        "./llama.cpp/build/bin/Release/llama-quantize.exe",
        "./llama.cpp/build/bin/llama-quantize.exe",
    )
    return next((path for path in candidates if os.path.exists(path)), None)


def step2_convert_gguf(quant):
    """Convert the merged safetensors in ``MERGED_DIR`` to a GGUF file.

    Clones llama.cpp (and pip-installs ``gguf``) when the converter script is
    missing. ``f16``/``f32`` are written directly by the converter; any other
    type is produced by converting to f16 first and then running
    ``llama-quantize``, building llama.cpp with CMake if no binary is found.

    Args:
        quant: Quantization type name, e.g. ``"q4_k_m"``, ``"q8_0"``, ``"f16"``.
            The check for direct-conversion types is case-insensitive.

    Returns:
        Path of the GGUF file produced. If ``llama-quantize`` cannot be found
        even after building, this is the intermediate f16 GGUF.

    Raises:
        subprocess.CalledProcessError: If git, pip, the converter, cmake, or
            llama-quantize exits with a non-zero status.
    """
    print("=" * 60)
    print(f"Step 2: Converting to GGUF ({quant}) via llama.cpp...")
    print("=" * 60)

    convert_script = "./llama.cpp/convert_hf_to_gguf.py"
    if not os.path.exists(convert_script):
        print("Cloning llama.cpp...")
        subprocess.run(
            ["git", "clone", "--depth", "1", "https://github.com/ggml-org/llama.cpp", "./llama.cpp"],
            check=True,
        )
        # Only install gguf package — full requirements.txt has torch version conflicts
        subprocess.run(
            [sys.executable, "-m", "pip", "install", "gguf"],
            check=True,
        )

    os.makedirs(EXPORT_DIR, exist_ok=True)
    gguf_output = os.path.join(EXPORT_DIR, f"sakhi-e4b-{quant}.gguf")

    # Compare case-insensitively: "F16" must not fall into the quantize path,
    # where on a case-insensitive filesystem the output and the intermediate
    # f16 file would be the same file and the final os.remove would delete
    # the result.
    outtype = quant.lower()
    if outtype in ("f16", "f32"):
        # Direct conversion, no quantization needed
        subprocess.run(
            [sys.executable, convert_script, MERGED_DIR, "--outfile", gguf_output, "--outtype", outtype],
            check=True,
        )
        print(f"GGUF: {gguf_output}")
        return gguf_output

    # Convert to f16 first, then quantize
    gguf_f16 = os.path.join(EXPORT_DIR, "sakhi-e4b-f16.gguf")
    print("Converting HF -> GGUF (f16)...")
    subprocess.run(
        [sys.executable, convert_script, MERGED_DIR, "--outfile", gguf_f16, "--outtype", "f16"],
        check=True,
    )

    quantize_bin = _find_quantize_bin()
    if quantize_bin is None:
        print("Building llama.cpp (needs cmake)...")
        subprocess.run(
            ["cmake", "-B", "build", "-DCMAKE_BUILD_TYPE=Release"],
            cwd="./llama.cpp", check=True,
        )
        subprocess.run(
            ["cmake", "--build", "build", "--config", "Release", "-j"],
            cwd="./llama.cpp", check=True,
        )
        # Re-check after build
        quantize_bin = _find_quantize_bin()

    if quantize_bin is None:
        # Best-effort fallback: keep the f16 file so the export is still usable.
        print("ERROR: llama-quantize not found after build!")
        print(f"F16 GGUF is at: {gguf_f16}")
        print("You can quantize manually later.")
        gguf_output = gguf_f16
    else:
        quant_type = quant.upper()
        print(f"Quantizing f16 -> {quant_type}...")
        subprocess.run([quantize_bin, gguf_f16, gguf_output, quant_type], check=True)
        # The intermediate f16 file is no longer needed once quantized.
        os.remove(gguf_f16)

    print(f"GGUF: {gguf_output}")
    return gguf_output
def step3_ollama(gguf_path, model_name):
    """Register the GGUF file with Ollama under ``model_name``.

    Reads the template at ``MODELFILE_PATH``, points every line starting with
    ``FROM`` at the absolute GGUF path, writes the result to
    ``EXPORT_DIR/Modelfile``, and runs ``ollama create`` with it.

    Args:
        gguf_path: Path of the GGUF file to register.
        model_name: Name of the Ollama model to create.

    Raises:
        SystemExit: With status 1 if ``ollama create`` returns non-zero.
    """
    print("=" * 60)
    print(f"Step 3: Creating Ollama model '{model_name}'...")
    print("=" * 60)

    import re
    import shutil

    abs_gguf = os.path.abspath(gguf_path)

    # Explicit UTF-8: the locale default (e.g. cp1252 on Windows) cannot
    # decode non-ASCII Modelfile content such as Devanagari prompts.
    with open(MODELFILE_PATH, "r", encoding="utf-8") as f:
        modelfile_content = f.read()

    # Use a callable replacement so the path is inserted verbatim. A string
    # replacement is scanned for backslash escapes, so a Windows path such as
    # C:\Users\me\models\... raises re.error ("bad escape \U") or is mangled.
    modelfile_content = re.sub(
        r'^FROM\s+.*$',
        lambda _match: f'FROM {abs_gguf}',
        modelfile_content,
        flags=re.MULTILINE,
    )

    # Make sure the target directory exists even when this step is called
    # without step 2 having run in the same process.
    os.makedirs(EXPORT_DIR, exist_ok=True)
    updated_modelfile = os.path.join(EXPORT_DIR, "Modelfile")
    with open(updated_modelfile, "w", encoding="utf-8") as f:
        f.write(modelfile_content)

    # Prefer the configured install path; otherwise use an `ollama` found on
    # PATH. Falling back to OLLAMA_EXE keeps the original failure mode when
    # neither exists.
    ollama_cmd = OLLAMA_EXE
    if not os.path.exists(ollama_cmd):
        ollama_cmd = shutil.which("ollama") or OLLAMA_EXE

    # Decode output as UTF-8 with replacement so progress characters never
    # raise a UnicodeDecodeError under a non-UTF-8 locale.
    result = subprocess.run(
        [ollama_cmd, "create", model_name, "-f", updated_modelfile],
        capture_output=True, text=True, encoding="utf-8", errors="replace",
    )
    print(result.stdout)
    if result.returncode != 0:
        print(f"ERROR: {result.stderr}")
        sys.exit(1)

    print(f"Done! Test with: ollama run {model_name} \"नमस्ते\"")
def main():
    """Parse command-line flags and run the export steps in order.

    Step 1 (merge) is skipped with ``--skip-merge``; step 2 (GGUF conversion)
    always runs; step 3 (Ollama registration) is skipped with
    ``--skip-ollama``, in which case the GGUF path and a manual command are
    printed instead.
    """
    cli = argparse.ArgumentParser(description="Export LoRA to GGUF for Ollama")
    option_specs = (
        ("--quant", {"default": "q4_k_m", "help": "Quantization (default: q4_k_m)"}),
        ("--adapter", {"default": ADAPTER_PATH}),
        ("--model-name", {"default": "sakhi"}),
        ("--skip-merge", {"action": "store_true", "help": "Skip step 1 (reuse existing merged)"}),
        ("--skip-ollama", {"action": "store_true", "help": "Skip step 3"}),
    )
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)
    opts = cli.parse_args()

    if not opts.skip_merge:
        step1_merge(opts.adapter)

    gguf_path = step2_convert_gguf(opts.quant)

    if opts.skip_ollama:
        print(f"\nGGUF ready at: {gguf_path}")
        print(f"Run: ollama create {opts.model_name} -f models/exported/Modelfile")
        return

    step3_ollama(gguf_path, opts.model_name)


if __name__ == "__main__":
    main()