import json
import os
import argparse
from colorama import init, Fore, Style

# Set up colorama once at import time, before any Fore/Style codes are printed.
init()

def load_json(path):
    """Read a JSON file and return its parsed content, or ``{}`` on failure.

    Args:
        path: Filesystem path of the JSON file to read (decoded as UTF-8).

    Returns:
        The decoded JSON value. An empty dict is returned when the file is
        missing, cannot be read, or does not contain valid JSON. A missing
        file is silent; any other failure prints a warning so that a corrupt
        file is not mistaken for an absent one.
    """
    try:
        with open(path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except FileNotFoundError:
        # A missing file is an expected case for optional configs; stay quiet.
        return {}
    except (OSError, ValueError, RecursionError) as exc:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError;
        # RecursionError can be raised for pathologically nested documents.
        print(f"  Warning: could not read {path}: {exc}")
        return {}

def save_json(path, data):
    """Write ``data`` to ``path`` as UTF-8 JSON indented by two spaces.

    The document is serialized to a string *before* the target file is
    opened. Opening with ``'w'`` truncates the file immediately, so
    serializing first guarantees that a serialization error (e.g. a value
    that is not JSON-encodable) leaves the existing file untouched.

    Args:
        path: Destination file path; overwritten if it already exists.
        data: JSON-serializable object to store.

    Raises:
        TypeError: If ``data`` contains a value ``json`` cannot encode.
        OSError: If the file cannot be opened or written.
    """
    serialized = json.dumps(data, indent=2)
    with open(path, 'w', encoding='utf-8') as f:
        f.write(serialized)

def convert_model(model_path):
    """Rewrite a model folder's metadata from ChatML EOS (32000) to EOS 2.

    The folder is converted only when ``generation_config.json`` reports an
    ``eos_token_id`` of 32000 (for a list value, the first entry is used).
    When it does, the function:

    1. sets ``eos_token_id`` and ``pad_token_id`` to 2 in
       ``generation_config.json``;
    2. sets ``eos_token`` to ``"</s>"`` and removes ``chat_template`` in
       ``tokenizer_config.json`` (if the file exists);
    3. sets ``eos_token`` to ``"</s>"`` in ``special_tokens_map.json``
       (if the file exists).

    Tokenizer files that exist but load as empty or non-object JSON are left
    untouched and reported, rather than being overwritten with a one-key
    document.

    Args:
        model_path: Path of the model folder to inspect.

    Returns:
        None. Progress is reported via ``print``.
    """
    model_name = os.path.basename(model_path)
    print(f"Scanning: {model_name}...")

    gen_path = os.path.join(model_path, "generation_config.json")
    tok_conf_path = os.path.join(model_path, "tokenizer_config.json")

    # 1. Check if it is actually a ChatML/32000 model
    gen_data = load_json(gen_path)
    if not isinstance(gen_data, dict):
        # A non-object JSON document carries no EOS id; treat it as absent.
        gen_data = {}
    current_eos = gen_data.get("eos_token_id")

    # Handle list format (e.g. [32000, 2]); an empty list means "no EOS id".
    if isinstance(current_eos, list):
        current_eos = current_eos[0] if current_eos else None

    if str(current_eos) != "32000":
        # If it's already 2, we skip it silently: already Mistral format.
        if str(current_eos) == "2":
            return
        print(f"  Skipping: EOS ID is {current_eos} (Not 32000)")
        return

    print(f"{Fore.YELLOW}  -> Detected ChatML (EOS: 32000). Converting to Mistral format...{Style.RESET_ALL}")

    # 2. Patch Generation Config
    gen_data["eos_token_id"] = 2
    gen_data["pad_token_id"] = 2  # Usually good practice to align pad/eos for base mistral
    save_json(gen_path, gen_data)
    print("     Fixed generation_config.json (ID: 2)")

    # 3. Patch Tokenizer Config
    if os.path.exists(tok_conf_path):
        tok_data = load_json(tok_conf_path)
        if isinstance(tok_data, dict) and tok_data:
            # Change string to </s>
            tok_data["eos_token"] = "</s>"
            # Remove chat_template if it exists (prevents auto-detection issues later)
            tok_data.pop("chat_template", None)
            save_json(tok_conf_path, tok_data)
            print("     Fixed tokenizer_config.json (Str: </s>)")
        else:
            # The file exists but yielded nothing usable; saving now would
            # replace its real content with a single key.
            print(f"{Fore.RED}     Skipped tokenizer_config.json (unreadable or empty; left untouched){Style.RESET_ALL}")

    # 4. Patch Special Tokens Map
    spec_path = os.path.join(model_path, "special_tokens_map.json")
    if os.path.exists(spec_path):
        spec_data = load_json(spec_path)
        if isinstance(spec_data, dict) and spec_data:
            spec_data["eos_token"] = "</s>"
            save_json(spec_path, spec_data)
            print("     Fixed special_tokens_map.json")
        else:
            # Same safeguard as for tokenizer_config.json above.
            print(f"{Fore.RED}     Skipped special_tokens_map.json (unreadable or empty; left untouched){Style.RESET_ALL}")

    print(f"{Fore.GREEN}  -> Successfully converted {model_name}{Style.RESET_ALL}")

def main():
    """Command-line entry point: convert every model folder below ``base_dir``.

    Parses the single positional ``base_dir`` argument, prints a banner, then
    walks all subdirectories of ``base_dir``. Each subdirectory containing a
    ``config.json`` is handed to ``convert_model``. ``base_dir`` itself is
    not checked, only the folders beneath it.

    Returns:
        None. Prints an error and returns early when ``base_dir`` is not an
        existing directory.
    """
    parser = argparse.ArgumentParser(description="Convert ChatML models (EOS 32000) to Mistral format (EOS 2)")
    parser.add_argument("base_dir", help="Directory containing the model folders (e.g. B:\\7B)")
    args = parser.parse_args()

    print(f"{Fore.CYAN}--- CHATML TO MISTRAL CONVERTER ---{Style.RESET_ALL}")
    print("This script changes metadata only. It allows ChatML models to be merged")
    print("using 'tokenizer: source: base' without errors.\n")

    # isdir rather than exists: a regular file would pass an existence check,
    # after which os.walk silently yields nothing and the scan "succeeds".
    if not os.path.isdir(args.base_dir):
        print(f"Error: Directory {args.base_dir} does not exist.")
        return

    # Walk through all subdirectories
    for root, dirs, _files in os.walk(args.base_dir):
        for name in dirs:
            full_path = os.path.join(root, name)
            # Simple check if it looks like a model folder; convert_model
            # decides whether the folder actually needs converting.
            if os.path.exists(os.path.join(full_path, "config.json")):
                convert_model(full_path)

    print("-" * 60)
    print("Scan complete.")

# Run the converter only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()