"""
Convert fine-tuned LoRA model to GGUF format with Q4_K_M quantization.
"""
|
|
import os
import subprocess

import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer
|
|
| |
| ADAPTER_MODEL = "nathens/qwen-codeforces-sft" |
| BASE_MODEL = "Qwen/Qwen2.5-0.5B" |
| OUTPUT_REPO = "nathens/my-model-gguf" |
| QUANTIZATION = "Q4_K_M" |
|
|
| print(f"π§ Converting model to GGUF") |
| print(f" Base model: {BASE_MODEL}") |
| print(f" Adapter: {ADAPTER_MODEL}") |
| print(f" Output: {OUTPUT_REPO}") |
| print(f" Quantization: {QUANTIZATION}") |
|
|
| |
| print("\nπ¦ Loading base model and tokenizer...") |
| base_model = AutoModelForCausalLM.from_pretrained( |
| BASE_MODEL, |
| dtype=torch.float16, |
| device_map="auto", |
| trust_remote_code=True |
| ) |
| tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True) |
|
|
| |
| print(f"π Loading and merging LoRA adapter from {ADAPTER_MODEL}...") |
| model = PeftModel.from_pretrained(base_model, ADAPTER_MODEL) |
| print("βοΈ Merging adapter weights into base model...") |
| merged_model = model.merge_and_unload() |
|
|
| |
| print("πΎ Saving merged model...") |
| merged_dir = "./merged_model" |
| merged_model.save_pretrained(merged_dir) |
| tokenizer.save_pretrained(merged_dir) |
| print(f"β
Merged model saved to {merged_dir}") |
|
|
| |
| print("\nπ₯ Installing llama.cpp for GGUF conversion...") |
| subprocess.run(["apt-get", "update", "-qq"], check=True) |
| subprocess.run(["apt-get", "install", "-y", "-qq", "git", "build-essential", "cmake"], check=True) |
| subprocess.run(["git", "clone", "https://github.com/ggerganov/llama.cpp.git"], check=True) |
|
|
| |
| nproc_result = subprocess.run(["nproc"], capture_output=True, text=True, check=True) |
| nproc = nproc_result.stdout.strip() |
| print(f"Building llama.cpp with {nproc} cores using CMake...") |
|
|
| os.makedirs("llama.cpp/build", exist_ok=True) |
| subprocess.run(["cmake", "-B", "llama.cpp/build", "-S", "llama.cpp"], check=True) |
| subprocess.run(["cmake", "--build", "llama.cpp/build", "--config", "Release", "-j", nproc], check=True) |
|
|
| |
| print("\nπ Converting to GGUF format...") |
| subprocess.run([ |
| "python3", "llama.cpp/convert_hf_to_gguf.py", |
| merged_dir, |
| "--outfile", "./model-f16.gguf", |
| "--outtype", "f16" |
| ], check=True) |
| print("β
Converted to FP16 GGUF") |
|
|
| |
| print(f"\nβ‘ Quantizing to {QUANTIZATION}...") |
| subprocess.run([ |
| "./llama.cpp/build/bin/llama-quantize", |
| "./model-f16.gguf", |
| f"./model-{QUANTIZATION}.gguf", |
| QUANTIZATION |
| ], check=True) |
| print(f"β
Quantized to {QUANTIZATION}") |
|
|
| |
| print(f"\nπ€ Uploading to {OUTPUT_REPO}...") |
| from huggingface_hub import HfApi |
| api = HfApi() |
|
|
| |
| try: |
| api.create_repo(OUTPUT_REPO, repo_type="model", exist_ok=True) |
| except Exception as e: |
| print(f"Note: {e}") |
|
|
| |
# Push both GGUF artifacts (quantized first, then fp16) to the Hub repo.
gguf_artifacts = (
    f"model-{QUANTIZATION}.gguf",
    "model-f16.gguf",
)
for artifact in gguf_artifacts:
    api.upload_file(
        path_or_fileobj=f"./{artifact}",
        path_in_repo=artifact,
        repo_id=OUTPUT_REPO,
        repo_type="model",
    )
|
|
| |
| for file in ["tokenizer.json", "tokenizer_config.json"]: |
| try: |
| api.upload_file( |
| path_or_fileobj=f"{merged_dir}/{file}", |
| path_in_repo=file, |
| repo_id=OUTPUT_REPO, |
| repo_type="model" |
| ) |
| except Exception: |
| pass |
|
|
| print(f"\nβ
Conversion complete!") |
| print(f"π GGUF model available at: https://huggingface.co/{OUTPUT_REPO}") |
| print(f"\nπ‘ To use with Ollama:") |
| print(f" huggingface-cli download {OUTPUT_REPO} model-{QUANTIZATION}.gguf") |
|
|