"""
GGUF Conversion for QMD Query Expansion 4B Model

Loads the base model, applies the SFT adapter, then the GRPO adapter, merges
them into a single model, and converts it to GGUF format for use with
Ollama/llama.cpp/LM Studio.
"""
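# Environment assumptions (inferred from the commands below): a Debian/Ubuntu-style
# container with root access (apt-get), an HF_TOKEN environment variable for pushing
# to the Hugging Face Hub, scratch space under /tmp, and enough GPU/CPU memory to
# load the 4B model in bfloat16.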
|
|
import os
import sys
import subprocess
|
|
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
from huggingface_hub import HfApi, login
|
|
BASE_MODEL = "Qwen/Qwen3-4B"
SFT_MODEL = "tobil/qmd-query-expansion-4B-sft"
GRPO_MODEL = "tobil/qmd-query-expansion-4B-grpo"
OUTPUT_REPO = "tobil/qmd-query-expansion-4B-gguf"
|
|
def run_command(cmd, description):
    """Run a command with error handling."""
    print(f"   {description}...")
    try:
        subprocess.run(cmd, check=True, capture_output=True, text=True)
        return True
    except subprocess.CalledProcessError as e:
        print(f"   ❌ Command failed: {' '.join(cmd)}")
        if e.stderr:
            print(f"   STDERR: {e.stderr[:500]}")
        return False
    except FileNotFoundError:
        print(f"   ❌ Command not found: {cmd[0]}")
        return False
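# run_command returns True/False rather than raising, so each step below can
# decide whether a failure is fatal (the FP16 GGUF conversion) or skippable
# (an individual quantization).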
|
|
|
|
print("🚀 QMD Query Expansion 4B GGUF Conversion")
print("=" * 60)
|
|
print("\n📦 Installing build dependencies...")
subprocess.run(["apt-get", "update", "-qq"], capture_output=True)
subprocess.run(["apt-get", "install", "-y", "-qq", "build-essential", "cmake", "git"], capture_output=True)
print("   ✅ Build tools ready")
|
|
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    print("\n🔑 Logging in to HuggingFace...")
    login(token=hf_token)
    print("   ✅ Logged in")
|
|
print(f"\n🔧 Step 1: Loading base model {BASE_MODEL}...")
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)
print("   ✅ Base model loaded")
|
|
print(f"\n🔧 Step 2: Loading SFT adapter {SFT_MODEL}...")
model = PeftModel.from_pretrained(base_model, SFT_MODEL)
print("   Merging SFT adapter...")
model = model.merge_and_unload()
print("   ✅ SFT merged")
|
|
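# Assumption: the GRPO adapter was trained on top of the SFT-merged weights,
# which is why it is applied and merged after the SFT adapter.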
print(f"\n🔧 Step 3: Loading GRPO adapter {GRPO_MODEL}...")
model = PeftModel.from_pretrained(model, GRPO_MODEL)
print("   Merging GRPO adapter...")
merged_model = model.merge_and_unload()
print("   ✅ GRPO merged - final model ready")
|
|
print("\n📝 Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
print("   ✅ Tokenizer loaded")
|
|
print("\n💾 Step 4: Saving merged model to disk...")
merged_dir = "/tmp/merged_model"
merged_model.save_pretrained(merged_dir, safe_serialization=True)
tokenizer.save_pretrained(merged_dir)
print(f"   ✅ Saved to {merged_dir}")
|
|
print("\n📥 Step 5: Setting up llama.cpp...")
if not os.path.exists("/tmp/llama.cpp"):
    run_command(
        ["git", "clone", "--depth", "1", "https://github.com/ggerganov/llama.cpp.git", "/tmp/llama.cpp"],
        "Cloning llama.cpp"
    )
|
|
subprocess.run([sys.executable, "-m", "pip", "install", "-q", "-r", "/tmp/llama.cpp/requirements.txt"], capture_output=True)
subprocess.run([sys.executable, "-m", "pip", "install", "-q", "sentencepiece", "protobuf"], capture_output=True)
print("   ✅ llama.cpp ready")
|
|
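# llama.cpp's convert_hf_to_gguf.py (from the repo cloned above) produces an
# FP16 GGUF, which then serves as the input for the quantized variants in Step 8.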
print("\n🔄 Step 6: Converting to GGUF format (FP16)...")
gguf_output_dir = "/tmp/gguf_output"
os.makedirs(gguf_output_dir, exist_ok=True)
|
|
model_name = "qmd-query-expansion-4B"
gguf_file = f"{gguf_output_dir}/{model_name}-f16.gguf"
|
|
convert_script = "/tmp/llama.cpp/convert_hf_to_gguf.py"
if not run_command(
    [sys.executable, convert_script, merged_dir, "--outfile", gguf_file, "--outtype", "f16"],
    "Converting to FP16 GGUF"
):
    print("   ❌ Conversion failed!")
    sys.exit(1)
|
|
size_mb = os.path.getsize(gguf_file) / (1024 * 1024)
print(f"   ✅ FP16 GGUF created: {size_mb:.1f} MB")
|
|
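# Build only the llama-quantize target; quantization runs on the CPU, so CUDA
# support is left off to keep the build small and fast.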
print("\n⚙️ Step 7: Building quantize tool...")
os.makedirs("/tmp/llama.cpp/build", exist_ok=True)
|
|
run_command(
    ["cmake", "-B", "/tmp/llama.cpp/build", "-S", "/tmp/llama.cpp", "-DGGML_CUDA=OFF"],
    "Configuring with CMake"
)
run_command(
    ["cmake", "--build", "/tmp/llama.cpp/build", "--target", "llama-quantize", "-j", "4"],
    "Building llama-quantize"
)
|
|
quantize_bin = "/tmp/llama.cpp/build/bin/llama-quantize"
print("   ✅ Quantize tool built")
|
|
print("\n⚙️ Step 8: Creating quantized versions...")
quant_formats = [
    ("Q4_K_M", "4-bit medium (recommended)"),
    ("Q5_K_M", "5-bit medium"),
    ("Q8_0", "8-bit"),
]
|
|
quantized_files = []
for quant_type, description in quant_formats:
    print(f"   Creating {quant_type} ({description})...")
    quant_file = f"{gguf_output_dir}/{model_name}-{quant_type.lower()}.gguf"

    if run_command([quantize_bin, gguf_file, quant_file, quant_type], f"Quantizing to {quant_type}"):
        size_mb = os.path.getsize(quant_file) / (1024 * 1024)
        print(f"   ✅ {quant_type}: {size_mb:.1f} MB")
        quantized_files.append((quant_file, quant_type))
    else:
        print(f"   ⚠️ Skipping {quant_type}")
|
|
print("\n⬆️ Step 9: Uploading to Hugging Face Hub...")
api = HfApi()
|
|
print(f"   Creating repository: {OUTPUT_REPO}")
api.create_repo(repo_id=OUTPUT_REPO, repo_type="model", exist_ok=True)
|
|
print("   Uploading FP16...")
api.upload_file(
    path_or_fileobj=gguf_file,
    path_in_repo=f"{model_name}-f16.gguf",
    repo_id=OUTPUT_REPO,
)
print("   ✅ FP16 uploaded")
|
|
for quant_file, quant_type in quantized_files:
    print(f"   Uploading {quant_type}...")
    api.upload_file(
        path_or_fileobj=quant_file,
        path_in_repo=f"{model_name}-{quant_type.lower()}.gguf",
        repo_id=OUTPUT_REPO,
    )
    print(f"   ✅ {quant_type} uploaded")
|
|
print("\n📝 Creating README...")
readme_content = f"""---
base_model: {BASE_MODEL}
tags:
- gguf
- llama.cpp
- quantized
- query-expansion
- qmd
---

# QMD Query Expansion 4B (GGUF)

GGUF conversion of the QMD Query Expansion model for use with Ollama, llama.cpp, and LM Studio.

## Model Details

- **Base Model:** {BASE_MODEL}
- **SFT Adapter:** {SFT_MODEL}
- **GRPO Adapter:** {GRPO_MODEL}
- **Task:** Query expansion for hybrid search (lex/vec/hyde format)

## Available Quantizations

| File | Quant | Description |
|------|-------|-------------|
| {model_name}-f16.gguf | F16 | Full precision |
| {model_name}-q8_0.gguf | Q8_0 | 8-bit |
| {model_name}-q5_k_m.gguf | Q5_K_M | 5-bit medium |
| {model_name}-q4_k_m.gguf | Q4_K_M | 4-bit medium (recommended) |

## Usage

### With Ollama

```bash
# Download
huggingface-cli download {OUTPUT_REPO} {model_name}-q4_k_m.gguf --local-dir .

# Create Modelfile
echo 'FROM ./{model_name}-q4_k_m.gguf' > Modelfile

# Create and run
ollama create qmd-expand-4b -f Modelfile
ollama run qmd-expand-4b
```
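
### With llama.cpp

A minimal invocation (exact flags may differ between llama.cpp versions); structure the prompt as shown in the Prompt Format section below:

```bash
huggingface-cli download {OUTPUT_REPO} {model_name}-q4_k_m.gguf --local-dir .
llama-cli -m {model_name}-q4_k_m.gguf -p "your prompt here"
```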

### Prompt Format

Use Qwen3 chat format with `/no_think`:

```
<|im_start|>user
/no_think Expand this search query: your query here<|im_end|>
<|im_start|>assistant
```

### Expected Output

```
lex: keyword variation 1
lex: keyword variation 2
vec: natural language reformulation
hyde: Hypothetical document passage answering the query.
```

## License

Apache 2.0 (inherited from Qwen3)
"""
|
|
api.upload_file(
    path_or_fileobj=readme_content.encode(),
    path_in_repo="README.md",
    repo_id=OUTPUT_REPO,
)
print("   ✅ README uploaded")
|
|
print("\n" + "=" * 60)
print("✅ GGUF Conversion Complete!")
print(f"📦 Repository: https://huggingface.co/{OUTPUT_REPO}")
print("=" * 60)
|
|