File size: 6,541 Bytes
745f62a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
"""
Export fine-tuned LoRA adapter to GGUF for Ollama deployment.

Step 1: Unsloth merges LoRA into base model -> saves 16-bit safetensors
Step 2: llama.cpp converts safetensors -> GGUF (with quantization)
Step 3: Register with Ollama

Usage:
  python scripts/export_gguf.py
  python scripts/export_gguf.py --quant q8_0
"""
import argparse
import os
import subprocess
import sys

# Set at module import time, i.e. ahead of the deferred unsloth/torch imports
# inside step1_merge.
# NOTE(review): presumably these switch off torch.compile / TorchDynamo for
# the merge run — confirm against the installed torch/unsloth versions.
os.environ["TORCH_COMPILE_DISABLE"] = "1"
os.environ["TORCHDYNAMO_DISABLE"] = "1"

# Ollama CLI binary, resolved under the current user's home directory
# (the AppData layout suggests a Windows per-user install — TODO confirm).
OLLAMA_EXE = os.path.expanduser("~/AppData/Local/Programs/Ollama/ollama.exe")
# Default value of the --adapter option: where the LoRA adapter is loaded from.
ADAPTER_PATH = "./models/checkpoints/final"
# Step 1 writes the merged 16-bit model here; step 2 reads it from here.
MERGED_DIR = "./models/merged_16bit"
# Output directory for the GGUF file(s) and the rewritten Modelfile.
EXPORT_DIR = "./models/exported"
# Template Modelfile whose FROM line is rewritten in step 3.
MODELFILE_PATH = "./configs/Modelfile"


def step1_merge(adapter_path):
    """Merge the LoRA adapter into its base model and save it as 16-bit.

    The adapter at ``adapter_path`` is loaded through Unsloth's
    ``FastLanguageModel.from_pretrained`` (4-bit load, 4096 max sequence
    length), then written to ``MERGED_DIR`` with
    ``save_method="merged_16bit"``. Afterwards the model objects are dropped
    and the CUDA cache is emptied so the later conversion step starts with
    as much free memory as possible.
    """
    rule = "=" * 60
    print(rule)
    print("Step 1: Merging LoRA into base model (Unsloth)...")
    print(rule)

    # Deferred import: unsloth is only needed (and only loaded) for this step.
    from unsloth import FastLanguageModel

    load_options = {
        "model_name": adapter_path,
        "max_seq_length": 4096,
        "load_in_4bit": True,
    }
    lora_model, lora_tokenizer = FastLanguageModel.from_pretrained(**load_options)

    os.makedirs(MERGED_DIR, exist_ok=True)
    lora_model.save_pretrained_merged(MERGED_DIR, lora_tokenizer, save_method="merged_16bit")
    print(f"Merged model saved to {MERGED_DIR}")

    # Release the model before llama.cpp conversion runs.
    del lora_model, lora_tokenizer
    import gc
    import torch
    gc.collect()
    torch.cuda.empty_cache()


def _find_quantize_bin():
    """Return the first existing llama-quantize binary under ./llama.cpp/build, or None."""
    candidates = (
        "./llama.cpp/build/bin/llama-quantize",
        "./llama.cpp/build/bin/Release/llama-quantize",
        "./llama.cpp/build/bin/Release/llama-quantize.exe",
        "./llama.cpp/build/bin/llama-quantize.exe",
    )
    return next((path for path in candidates if os.path.exists(path)), None)


def step2_convert_gguf(quant):
    """Convert the merged safetensors in MERGED_DIR to a GGUF file via llama.cpp.

    Args:
        quant: Target type, case-insensitive. "f16" and "f32" are written
            directly by ``convert_hf_to_gguf.py``; any other value is
            produced by converting to f16 first and then running
            ``llama-quantize`` with the upper-cased type name.

    Returns:
        Path of the GGUF file that was produced. If ``llama-quantize``
        cannot be found even after building llama.cpp, the intermediate
        f16 GGUF path is returned instead (best-effort fallback).

    Raises:
        subprocess.CalledProcessError: If git, pip, the converter script,
            cmake or llama-quantize exits with a non-zero status.
    """
    # Normalize once: the type is compared against lowercase names and used
    # in file names. Without this, "--quant F16" would take the quantize
    # path and, on a case-insensitive filesystem, read and write the same
    # file before deleting it.
    quant = quant.lower()

    print("=" * 60)
    print(f"Step 2: Converting to GGUF ({quant}) via llama.cpp...")
    print("=" * 60)

    convert_script = "./llama.cpp/convert_hf_to_gguf.py"
    if not os.path.exists(convert_script):
        print("Cloning llama.cpp...")
        subprocess.run(
            ["git", "clone", "--depth", "1", "https://github.com/ggml-org/llama.cpp", "./llama.cpp"],
            check=True,
        )
        # Only install gguf package — full requirements.txt has torch version conflicts
        subprocess.run(
            [sys.executable, "-m", "pip", "install", "gguf"],
            check=True,
        )

    os.makedirs(EXPORT_DIR, exist_ok=True)
    gguf_output = os.path.join(EXPORT_DIR, f"sakhi-e4b-{quant}.gguf")

    if quant in ("f16", "f32"):
        # Direct conversion, no quantization needed
        subprocess.run(
            [sys.executable, convert_script, MERGED_DIR, "--outfile", gguf_output, "--outtype", quant],
            check=True,
        )
    else:
        # Convert to f16 first, then quantize
        gguf_f16 = os.path.join(EXPORT_DIR, "sakhi-e4b-f16.gguf")

        print("Converting HF -> GGUF (f16)...")
        subprocess.run(
            [sys.executable, convert_script, MERGED_DIR, "--outfile", gguf_f16, "--outtype", "f16"],
            check=True,
        )

        quantize_bin = _find_quantize_bin()
        if quantize_bin is None:
            print("Building llama.cpp (needs cmake)...")
            subprocess.run(
                ["cmake", "-B", "build", "-DCMAKE_BUILD_TYPE=Release"],
                cwd="./llama.cpp", check=True,
            )
            subprocess.run(
                ["cmake", "--build", "build", "--config", "Release", "-j"],
                cwd="./llama.cpp", check=True,
            )
            # Re-check after build
            quantize_bin = _find_quantize_bin()

        if quantize_bin is None:
            # Deliberate best-effort: keep the f16 file and carry on with it.
            print("ERROR: llama-quantize not found after build!")
            print(f"F16 GGUF is at: {gguf_f16}")
            print("You can quantize manually later.")
            gguf_output = gguf_f16
        else:
            quant_type = quant.upper()
            print(f"Quantizing f16 -> {quant_type}...")
            subprocess.run([quantize_bin, gguf_f16, gguf_output, quant_type], check=True)
            # The f16 file was only an intermediate; drop it once quantized.
            os.remove(gguf_f16)

    print(f"GGUF: {gguf_output}")
    return gguf_output


def step3_ollama(gguf_path, model_name):
    """Create an Ollama model from a GGUF file.

    Reads the template at ``MODELFILE_PATH``, rewrites every line that
    starts with ``FROM`` to point at the absolute path of ``gguf_path``,
    writes the result to ``EXPORT_DIR/Modelfile`` and runs
    ``ollama create <model_name> -f <that file>``.

    Args:
        gguf_path: Path to the GGUF file the model should be built from.
        model_name: Name to register the model under in Ollama.

    Exits the process with status 1 if no ollama executable can be found
    or if ``ollama create`` returns a non-zero exit code.
    """
    print("=" * 60)
    print(f"Step 3: Creating Ollama model '{model_name}'...")
    print("=" * 60)

    import re
    import shutil

    abs_gguf = os.path.abspath(gguf_path)

    # Explicit UTF-8: the Modelfile may hold non-ASCII prompt text, and the
    # locale default encoding (e.g. cp1252 on Windows) would fail or mangle it.
    with open(MODELFILE_PATH, "r", encoding="utf-8") as f:
        modelfile_content = f.read()

    # Use a callable replacement so the path is inserted literally. With a
    # plain string, backslashes in a Windows path (C:\Users\...) are parsed
    # as template escapes and re.sub raises "bad escape".
    modelfile_content = re.sub(
        r'^FROM\s+.*$',
        lambda _match: f'FROM {abs_gguf}',
        modelfile_content,
        flags=re.MULTILINE,
    )

    # Step 2 normally creates this directory; make sure it exists when this
    # step is reached without it.
    os.makedirs(EXPORT_DIR, exist_ok=True)
    updated_modelfile = os.path.join(EXPORT_DIR, "Modelfile")
    with open(updated_modelfile, "w", encoding="utf-8") as f:
        f.write(modelfile_content)

    # Prefer the configured install location, fall back to whatever is on PATH.
    ollama_bin = OLLAMA_EXE if os.path.exists(OLLAMA_EXE) else shutil.which("ollama")
    if ollama_bin is None:
        print(f"ERROR: ollama executable not found at {OLLAMA_EXE} or on PATH")
        sys.exit(1)

    result = subprocess.run(
        [ollama_bin, "create", model_name, "-f", updated_modelfile],
        capture_output=True, text=True,
        # Decode leniently so unexpected bytes in the tool's output cannot
        # raise UnicodeDecodeError.
        encoding="utf-8", errors="replace",
    )
    print(result.stdout)
    if result.returncode != 0:
        print(f"ERROR: {result.stderr}")
        sys.exit(1)

    print(f"Done! Test with: ollama run {model_name} \"नमस्ते\"")


def main():
    """Parse command-line options and run the export pipeline.

    Runs step 1 (merge) unless ``--skip-merge`` is given, always runs
    step 2 (GGUF conversion), and runs step 3 (Ollama registration) unless
    ``--skip-ollama`` is given, in which case manual instructions are
    printed instead.
    """
    parser = argparse.ArgumentParser(description="Export LoRA to GGUF for Ollama")
    parser.add_argument("--quant", default="q4_k_m", help="Quantization (default: q4_k_m)")
    parser.add_argument("--adapter", default=ADAPTER_PATH)
    parser.add_argument("--model-name", default="sakhi")
    parser.add_argument("--skip-merge", action="store_true", help="Skip step 1 (reuse existing merged)")
    parser.add_argument("--skip-ollama", action="store_true", help="Skip step 3")
    args = parser.parse_args()

    if not args.skip_merge:
        step1_merge(args.adapter)

    gguf_path = step2_convert_gguf(args.quant)

    if not args.skip_ollama:
        step3_ollama(gguf_path, args.model_name)
    else:
        print(f"\nGGUF ready at: {gguf_path}")
        # The rewritten Modelfile in the export directory is only produced by
        # step 3, which was skipped here, so it may be missing or stale.
        # Point the user at the template and the FROM path to set instead.
        print(f"Set the FROM line in {MODELFILE_PATH} to: {os.path.abspath(gguf_path)}")
        print(f"Then run: ollama create {args.model_name} -f {MODELFILE_PATH}")


if __name__ == "__main__":
    main()