# Source: Hugging Face Spaces - Tushar9802/sakhi (Space status: Sleeping)
# File size: 6,541 bytes, revision 745f62a

"""
Export fine-tuned LoRA adapter to GGUF for Ollama deployment.

Step 1: Unsloth merges LoRA into base model -> saves 16-bit safetensors
Step 2: llama.cpp converts safetensors -> GGUF (with quantization)
Step 3: Register with Ollama

Usage:
  python scripts/export_gguf.py
  python scripts/export_gguf.py --quant q8_0
"""
import argparse
import os
import subprocess
import sys

# Disable torch.compile / TorchDynamo for this process. These are set at module
# import time, i.e. before the function-local imports of unsloth and torch run.
os.environ.update(
    {
        "TORCH_COMPILE_DISABLE": "1",
        "TORCHDYNAMO_DISABLE": "1",
    }
)

# Ollama executable under the user's home directory (a Windows-style
# per-user AppData location; "~" is expanded when the module loads).
OLLAMA_EXE = os.path.expanduser("~/AppData/Local/Programs/Ollama/ollama.exe")

# Default value of the --adapter option: the fine-tuned LoRA checkpoint.
ADAPTER_PATH = "./models/checkpoints/final"

# Where step 1 writes the merged 16-bit model and step 2 reads it from.
MERGED_DIR = "./models/merged_16bit"

# Where the GGUF file(s) and the rewritten Modelfile are written.
EXPORT_DIR = "./models/exported"

# Modelfile whose FROM line is rewritten in step 3.
MODELFILE_PATH = "./configs/Modelfile"


def step1_merge(adapter_path):
    """Merge a LoRA adapter into its base model and save 16-bit weights.

    The adapter at ``adapter_path`` is loaded through Unsloth (4-bit load,
    4096-token max sequence length), merged, and written together with the
    tokenizer to ``MERGED_DIR`` using the ``merged_16bit`` save method.

    Args:
        adapter_path: Path or model name passed to Unsloth as ``model_name``.
    """
    banner = "=" * 60
    print(banner, "Step 1: Merging LoRA into base model (Unsloth)...", banner, sep="\n")

    # Imported here, after the banner, so the heavy import only happens when
    # this step actually runs.
    from unsloth import FastLanguageModel

    load_options = {
        "model_name": adapter_path,
        "max_seq_length": 4096,
        "load_in_4bit": True,
    }
    model, tokenizer = FastLanguageModel.from_pretrained(**load_options)

    os.makedirs(MERGED_DIR, exist_ok=True)
    model.save_pretrained_merged(MERGED_DIR, tokenizer, save_method="merged_16bit")
    print(f"Merged model saved to {MERGED_DIR}")

    # Drop the model references and release cached GPU memory before the
    # llama.cpp conversion starts.
    del model, tokenizer
    import gc
    import torch

    gc.collect()
    torch.cuda.empty_cache()


def _find_llama_quantize():
    """Return the path of a built llama-quantize binary, or None if none exists."""
    candidates = (
        "./llama.cpp/build/bin/llama-quantize",
        "./llama.cpp/build/bin/Release/llama-quantize",
        "./llama.cpp/build/bin/Release/llama-quantize.exe",
        "./llama.cpp/build/bin/llama-quantize.exe",
    )
    return next((path for path in candidates if os.path.exists(path)), None)


def step2_convert_gguf(quant):
    """Convert the merged safetensors in ``MERGED_DIR`` to GGUF via llama.cpp.

    llama.cpp is cloned into ``./llama.cpp`` (and the ``gguf`` package is
    installed) when the conversion script is not present. ``f16`` and ``f32``
    are produced directly by the conversion script; any other type is produced
    by converting to f16 first and then running ``llama-quantize``, building
    llama.cpp with cmake if no binary is found.

    Args:
        quant: Target type such as ``"q4_k_m"``, ``"q8_0"``, ``"f16"``.
            Matching is case-insensitive; the value is lower-cased for file
            names and upper-cased when passed to ``llama-quantize``.

    Returns:
        Path of the resulting GGUF file inside ``EXPORT_DIR``. If
        ``llama-quantize`` cannot be found even after building, the
        intermediate f16 GGUF path is returned instead.

    Raises:
        ValueError: If ``quant`` is empty.
        subprocess.CalledProcessError: If git, pip, the conversion script,
            cmake or llama-quantize exits with a non-zero status.
    """
    # Normalise case so "F16" takes the direct-conversion path and the output
    # name can never collide with the f16 intermediate on case-insensitive
    # filesystems (where the intermediate is deleted after quantization).
    quant = quant.strip().lower()
    if not quant:
        raise ValueError("quant must be a non-empty quantization type")

    print("=" * 60)
    print(f"Step 2: Converting to GGUF ({quant}) via llama.cpp...")
    print("=" * 60)

    convert_script = "./llama.cpp/convert_hf_to_gguf.py"
    if not os.path.exists(convert_script):
        print("Cloning llama.cpp...")
        subprocess.run(
            ["git", "clone", "--depth", "1", "https://github.com/ggml-org/llama.cpp", "./llama.cpp"],
            check=True,
        )
        # Only install gguf package — full requirements.txt has torch version conflicts
        subprocess.run(
            [sys.executable, "-m", "pip", "install", "gguf"],
            check=True,
        )

    os.makedirs(EXPORT_DIR, exist_ok=True)
    gguf_output = os.path.join(EXPORT_DIR, f"sakhi-e4b-{quant}.gguf")

    if quant in ("f16", "f32"):
        # Direct conversion, no quantization needed
        subprocess.run(
            [sys.executable, convert_script, MERGED_DIR, "--outfile", gguf_output, "--outtype", quant],
            check=True,
        )
        print(f"GGUF: {gguf_output}")
        return gguf_output

    # Convert to f16 first, then quantize
    gguf_f16 = os.path.join(EXPORT_DIR, "sakhi-e4b-f16.gguf")
    print("Converting HF -> GGUF (f16)...")
    subprocess.run(
        [sys.executable, convert_script, MERGED_DIR, "--outfile", gguf_f16, "--outtype", "f16"],
        check=True,
    )

    quantize_bin = _find_llama_quantize()
    if quantize_bin is None:
        print("Building llama.cpp (needs cmake)...")
        subprocess.run(
            ["cmake", "-B", "build", "-DCMAKE_BUILD_TYPE=Release"],
            cwd="./llama.cpp", check=True,
        )
        subprocess.run(
            ["cmake", "--build", "build", "--config", "Release", "-j"],
            cwd="./llama.cpp", check=True,
        )
        # Re-check after build
        quantize_bin = _find_llama_quantize()

    if quantize_bin is None:
        # Best effort: keep the f16 file so the user can quantize it by hand.
        print("ERROR: llama-quantize not found after build!")
        print(f"F16 GGUF is at: {gguf_f16}")
        print("You can quantize manually later.")
        gguf_output = gguf_f16
    else:
        quant_type = quant.upper()
        print(f"Quantizing f16 -> {quant_type}...")
        subprocess.run([quantize_bin, gguf_f16, gguf_output, quant_type], check=True)
        # The f16 file was only an intermediate; remove it once quantized.
        os.remove(gguf_f16)

    print(f"GGUF: {gguf_output}")
    return gguf_output


def step3_ollama(gguf_path, model_name):
    """Register a GGUF file with Ollama under ``model_name``.

    Reads the Modelfile at ``MODELFILE_PATH``, points its ``FROM`` line(s) at
    the absolute path of ``gguf_path`` (adding a ``FROM`` line at the top if
    the file has none), writes the result to ``EXPORT_DIR/Modelfile`` and runs
    ``ollama create`` on it.

    Args:
        gguf_path: Path of the GGUF file to register.
        model_name: Name the model gets in Ollama.

    Raises:
        SystemExit: With status 1 if no Ollama executable can be found or
            ``ollama create`` exits with a non-zero status.
    """
    print("=" * 60)
    print(f"Step 3: Creating Ollama model '{model_name}'...")
    print("=" * 60)

    import re
    import shutil

    abs_gguf = os.path.abspath(gguf_path)

    # Explicit UTF-8: the Modelfile may contain non-ASCII prompt text and the
    # platform default encoding (e.g. cp1252 on Windows) would corrupt it.
    with open(MODELFILE_PATH, "r", encoding="utf-8") as f:
        modelfile_content = f.read()

    from_line = f"FROM {abs_gguf}"
    # A callable replacement is used so backslashes in a Windows path are
    # inserted literally instead of being parsed as regex template escapes
    # (which raises "bad escape" for sequences such as \U or \m).
    modelfile_content, replaced = re.subn(
        r'^FROM\s+.*$',
        lambda _match: from_line,
        modelfile_content,
        flags=re.MULTILINE,
    )
    if not replaced:
        # A Modelfile without FROM is unusable; add the line rather than
        # silently leaving the file pointing at nothing.
        modelfile_content = f"{from_line}\n{modelfile_content}"

    os.makedirs(EXPORT_DIR, exist_ok=True)
    updated_modelfile = os.path.join(EXPORT_DIR, "Modelfile")
    with open(updated_modelfile, "w", encoding="utf-8") as f:
        f.write(modelfile_content)

    # Prefer the configured install location, fall back to whatever is on PATH.
    ollama_bin = OLLAMA_EXE if os.path.exists(OLLAMA_EXE) else shutil.which("ollama")
    if ollama_bin is None:
        print(f"ERROR: Ollama executable not found at {OLLAMA_EXE} or on PATH")
        sys.exit(1)

    result = subprocess.run(
        [ollama_bin, "create", model_name, "-f", updated_modelfile],
        capture_output=True, text=True, encoding="utf-8", errors="replace",
    )
    print(result.stdout)
    if result.returncode != 0:
        print(f"ERROR: {result.stderr}")
        sys.exit(1)

    print(f"Done! Test with: ollama run {model_name} \"नमस्ते\"")


def main():
    """Parse command-line options and run the export pipeline.

    Runs the merge step unless ``--skip-merge`` is given, always runs the GGUF
    conversion, and then either registers the result with Ollama or, with
    ``--skip-ollama``, prints where the GGUF is and how to register it.
    """
    cli = argparse.ArgumentParser(description="Export LoRA to GGUF for Ollama")
    cli.add_argument("--quant", default="q4_k_m", help="Quantization (default: q4_k_m)")
    cli.add_argument("--adapter", default=ADAPTER_PATH)
    cli.add_argument("--model-name", default="sakhi")
    cli.add_argument("--skip-merge", action="store_true", help="Skip step 1 (reuse existing merged)")
    cli.add_argument("--skip-ollama", action="store_true", help="Skip step 3")
    options = cli.parse_args()

    # Step 1 is optional so an existing merged model can be reused.
    if not options.skip_merge:
        step1_merge(options.adapter)

    exported = step2_convert_gguf(options.quant)

    if options.skip_ollama:
        # Stop after conversion and tell the user how to register manually.
        print(f"\nGGUF ready at: {exported}")
        print(f"Run: ollama create {options.model_name} -f models/exported/Modelfile")
        return

    step3_ollama(exported, options.model_name)


# Run the export pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()