#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
CodePilot Model Export - export a fine-tuned model to Ollama / LM Studio
=========================================================================

Usage:
    # Steps 1+2+3 in one shot
    python export_model.py --adapter ~/.codepilot/adapter_20260423 --output ./my-model

    # Merge only (produce the full merged model)
    python export_model.py --adapter ~/.codepilot/adapter_20260423 --output ./merged --merge-only

    # GGUF conversion only (merged model already exists)
    python export_model.py --merged-model ./merged --output ./my-model --quantize q4_k_m

    # Register with Ollama automatically
    python export_model.py --adapter ~/.codepilot/adapter_20260423 --output ./my-model --ollama

    # Push to the HuggingFace Hub
    python export_model.py --adapter ~/.codepilot/adapter_20260423 --output ./my-model --push-to-hub USERNAME/my-model
"""
import argparse
import os
import shutil
import subprocess
import sys
from pathlib import Path

DEFAULT_BASE_MODEL = "Qwen/Qwen2.5-Coder-3B-Instruct"


def step1_merge_adapter(base_model, adapter_path, output_dir):
    """Step 1: merge the LoRA adapter into the base model."""
    print(f"\n{'='*60}")
    print(" Step 1: Merge LoRA adapter")
    print(f"   Base:    {base_model}")
    print(f"   Adapter: {adapter_path}")
    print(f"   Output:  {output_dir}")
    print(f"{'='*60}\n")

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer
    from peft import PeftModel

    print("📥 Loading base model...")
    tokenizer = AutoTokenizer.from_pretrained(base_model)
    model = AutoModelForCausalLM.from_pretrained(
        base_model,
        torch_dtype=torch.float16,
        device_map="cpu",  # merge on CPU to save GPU memory
        trust_remote_code=True,
    )

    print("📥 Loading LoRA adapter...")
    model = PeftModel.from_pretrained(model, adapter_path)

    print("🔄 Merging weights...")
    model = model.merge_and_unload()

    print(f"💾 Saving to {output_dir}...")
    os.makedirs(output_dir, exist_ok=True)
    model.save_pretrained(output_dir, safe_serialization=True)
    tokenizer.save_pretrained(output_dir)

    print(f"✅ Merge complete: {output_dir}")

    # Report total size on disk
    total_size = sum(f.stat().st_size for f in Path(output_dir).rglob("*") if f.is_file())
    print(f"   Size: {total_size / 1024**3:.1f} GB")

    return output_dir


def step2_convert_gguf(merged_dir, output_dir, quantize="q4_k_m"):
    """Step 2: convert the merged model to GGUF."""
    print(f"\n{'='*60}")
    print(" Step 2: Convert to GGUF")
    print(f"   Input:    {merged_dir}")
    print(f"   Output:   {output_dir}")
    print(f"   Quantize: {quantize}")
    print(f"{'='*60}\n")

    # Check whether llama.cpp's conversion tools are on PATH
    convert_script = shutil.which("convert_hf_to_gguf.py")
    quantize_bin = shutil.which("llama-quantize")

    if not convert_script:
        # Look for a local llama.cpp checkout
        llama_cpp_paths = [
            os.path.expanduser("~/llama.cpp"),
            "/opt/llama.cpp",
            os.path.expanduser("~/.local/share/llama.cpp"),
        ]
        for p in llama_cpp_paths:
            if os.path.exists(os.path.join(p, "convert_hf_to_gguf.py")):
                convert_script = os.path.join(p, "convert_hf_to_gguf.py")
                quantize_bin = os.path.join(p, "build", "bin", "llama-quantize")
                break

    if not convert_script:
        print("⚠️ llama.cpp not found. Install it with one of:")
        print()
        print("   # Option 1: pip (easiest)")
        print("   pip install llama-cpp-python")
        print()
        print("   # Option 2: build from source (full toolchain)")
        print("   git clone https://github.com/ggml-org/llama.cpp")
        print("   cd llama.cpp && make -j")
        print()
        print("   # Option 3: Hugging Face's GGUF support")
        print("   pip install transformers[gguf]")
        print()
        # Note: transformers can *load* GGUF checkpoints but does not provide an
        # HF -> GGUF export path, so there is no automatic fallback here.
        print("❌ Automatic conversion requires llama.cpp. Install it and retry.")
        print(f"   Alternatively, use the merged model directly: {merged_dir}")
        return None

    # Convert with llama.cpp
    os.makedirs(output_dir, exist_ok=True)
    fp16_gguf = os.path.join(output_dir, "model-fp16.gguf")
    quant_gguf = os.path.join(output_dir, f"model-{quantize}.gguf")

    # Step 2a: HF -> GGUF (fp16)
    print("🔄 Converting HF -> GGUF (fp16)...")
    result = subprocess.run(
        [sys.executable, convert_script, merged_dir,
         "--outfile", fp16_gguf, "--outtype", "f16"],
        capture_output=True, text=True)
    if result.returncode != 0:
        print(f"❌ Conversion failed:\n{result.stderr[:500]}")
        return None

    # Step 2b: quantize
    if quantize and quantize != "f16" and quantize_bin and os.path.exists(quantize_bin):
        print(f"🔄 Quantizing -> {quantize}...")
        result = subprocess.run(
            [quantize_bin, fp16_gguf, quant_gguf, quantize.upper()],
            capture_output=True, text=True)
        if result.returncode == 0:
            # Remove the fp16 version to save disk space
            os.remove(fp16_gguf)
            gguf_path = quant_gguf
        else:
            print("⚠️ Quantization failed, keeping the fp16 version")
            gguf_path = fp16_gguf
    else:
        gguf_path = fp16_gguf

    size = os.path.getsize(gguf_path) / 1024**3
    print(f"✅ GGUF ready: {gguf_path} ({size:.1f} GB)")
    return gguf_path


def step3_register_ollama(gguf_path, model_name="codepilot"):
    """Step 3: register the GGUF model with Ollama."""
    print(f"\n{'='*60}")
    print(" Step 3: Register with Ollama")
    print(f"{'='*60}\n")

    if not shutil.which("ollama"):
        print("❌ Ollama is not installed")
        print("   Install: curl -fsSL https://ollama.ai/install.sh | sh")
        return

    # Write the Modelfile next to the GGUF
    modelfile_path = os.path.join(os.path.dirname(gguf_path), "Modelfile")
    modelfile_content = f"""FROM {os.path.abspath(gguf_path)}
TEMPLATE \"\"\"{{{{- if .System }}}}<|im_start|>system
{{{{ .System }}}}<|im_end|>
{{{{- end }}}}
<|im_start|>user
{{{{ .Prompt }}}}<|im_end|>
<|im_start|>assistant
\"\"\"
PARAMETER stop "<|im_end|>"
PARAMETER stop "<|endoftext|>"
PARAMETER temperature 0.7
PARAMETER top_p 0.9
PARAMETER num_ctx 4096
SYSTEM \"\"\"You are CodePilot, an expert AI programming assistant. Write clean, efficient, well-documented code.\"\"\"
"""
    Path(modelfile_path).write_text(modelfile_content)
    print(f"📝 Modelfile written: {modelfile_path}")

    # Register with Ollama
    print(f"🔄 ollama create {model_name}...")
    result = subprocess.run(
        ["ollama", "create", model_name, "-f", modelfile_path],
        capture_output=True, text=True)

    if result.returncode == 0:
        print("\n✅ Registered with Ollama!")
        print("\n   Usage:")
        print(f"     ollama run {model_name}")
        print(f"     ollama run {model_name} 'Write a quicksort'")
        print("\n   Use it from CodePilot:")
        print(f"     python codepilot_v4.py --provider ollama --cloud-model {model_name}")
    else:
        print(f"❌ Registration failed:\n{result.stderr[:300]}")
        print("\n   Register manually with:")
        print(f"     ollama create {model_name} -f {modelfile_path}")


def step4_push_to_hub(merged_dir, repo_id):
    """(Optional) push the merged model to the HuggingFace Hub."""
    print(f"\n{'='*60}")
    print(" Step 4: Push to HuggingFace Hub")
    print(f"   Repo: {repo_id}")
    print(f"{'='*60}\n")

    from huggingface_hub import HfApi

    api = HfApi()
    # Make sure the target repo exists before uploading
    api.create_repo(repo_id, repo_type="model", exist_ok=True)
    print("📤 Uploading...")
    api.upload_folder(
        folder_path=merged_dir,
        repo_id=repo_id,
        repo_type="model",
        commit_message="CodePilot fine-tuned model",
    )
    print(f"✅ Uploaded: https://huggingface.co/{repo_id}")
def step_lmstudio(gguf_path):
    """Print instructions for using the GGUF in LM Studio."""
    print(f"\n{'='*60}")
    print(" Using the model in LM Studio")
    print(f"{'='*60}\n")
    print("   1. Open LM Studio")
    print('   2. Select "My Models" in the sidebar')
    print('   3. Click "Import Model"')
    print(f"   4. Choose: {os.path.abspath(gguf_path)}")
    print("   5. Once loaded, the model is ready to use inside LM Studio")
    print("\n   Or copy the GGUF file into LM Studio's model directory:")

    # LM Studio's default model directory (same layout on Windows, macOS, and Linux)
    lm_dir = os.path.expanduser("~/.cache/lm-studio/models")
    dest = os.path.join(lm_dir, "codepilot")
    print(f"     mkdir -p {dest}")
    print(f"     cp {os.path.abspath(gguf_path)} {dest}/")


def main():
    parser = argparse.ArgumentParser(description="Export the model for Ollama / LM Studio")
    parser.add_argument("--base-model", default=DEFAULT_BASE_MODEL, help="Base model")
    parser.add_argument("--adapter", help="Path to the LoRA adapter")
    parser.add_argument("--merged-model", help="Path to an already-merged model (skips Step 1)")
    parser.add_argument("--output", default="./exported_model", help="Output directory")
    parser.add_argument("--quantize", default="q4_k_m",
                        choices=["f16", "q8_0", "q6_k", "q5_k_m", "q4_k_m", "q4_0", "q3_k_m", "q2_k"],
                        help="Quantization level (default: q4_k_m)")
    parser.add_argument("--ollama", action="store_true", help="Register with Ollama automatically")
    parser.add_argument("--ollama-name", default="codepilot", help="Ollama model name")
    parser.add_argument("--merge-only", action="store_true", help="Merge only, skip GGUF conversion")
    parser.add_argument("--push-to-hub", help="Push to the HF Hub (format: username/model-name)")
    args = parser.parse_args()

    print("""
╔════════════════════════════════════════════════════════════╗
║                   CodePilot Model Export                    ║
║          LoRA → Merge → GGUF → Ollama / LM Studio           ║
╚════════════════════════════════════════════════════════════╝
""")

    merged_dir = args.merged_model
    gguf_path = None

    # Step 1: merge
    if not merged_dir:
        if not args.adapter:
            print("❌ Please specify --adapter or --merged-model")
            sys.exit(1)
        merged_dir = os.path.join(args.output, "merged")
        step1_merge_adapter(args.base_model, args.adapter, merged_dir)

    if args.merge_only:
        print(f"\n✅ Merge complete: {merged_dir}")
        return

    # Step 2: GGUF
    gguf_dir = os.path.join(args.output, "gguf")
    gguf_path = step2_convert_gguf(merged_dir, gguf_dir, args.quantize)

    # Step 3: Ollama
    if args.ollama and gguf_path:
        step3_register_ollama(gguf_path, args.ollama_name)

    # LM Studio instructions
    if gguf_path:
        step_lmstudio(gguf_path)

    # Push to the Hub
    if args.push_to_hub:
        step4_push_to_hub(merged_dir, args.push_to_hub)

    print(f"\n{'='*60}")
    print(" 🎉 Export complete!")
    print(f"{'='*60}")
    print(f"   Merged model: {merged_dir}")
    if gguf_path:
        print(f"   GGUF: {gguf_path}")
    print()
    print("   Quantization options:")
    print("     f16    - highest quality, largest (~6GB)")
    print("     q8_0   - near-lossless (~3.5GB)")
    print("     q6_k   - high quality (~2.8GB)")
    print("     q5_k_m - good balance (~2.4GB)")
    print("     q4_k_m - recommended default (~2.0GB) ← best quality/size trade-off")
    print("     q4_0   - smaller (~1.8GB)")
    print("     q3_k_m - very small (~1.5GB)")
    print("     q2_k   - smallest, noticeable quality loss (~1.2GB)")


if __name__ == "__main__":
    main()
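
# Example (a sketch, not executed by this script): once Step 3 has registered the
# model, it can also be queried over Ollama's local REST API instead of the CLI.
# "codepilot" stands for whatever name was passed as --ollama-name.
#
#   import requests
#   resp = requests.post(
#       "http://localhost:11434/api/generate",
#       json={"model": "codepilot", "prompt": "Write a quicksort in Python", "stream": False},
#   )
#   print(resp.json()["response"])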