# model_tools/chatml_to_mistral.py
# Convert ChatML-tokenizer models (EOS id 32000) to Mistral metadata (EOS id 2).
import json
import os
import argparse
from colorama import init, Fore, Style
init()
def load_json(path):
    """Best-effort JSON reader.

    Returns the parsed object from *path*, or an empty dict when the file
    is missing, unreadable, or not valid JSON (the caller treats all of
    these the same way).
    """
    try:
        with open(path, 'r', encoding='utf-8') as fp:
            parsed = json.load(fp)
    except Exception:
        # Missing file, permission error, malformed JSON: all collapse to {}.
        return {}
    return parsed
def save_json(path, data):
    """Write *data* to *path* as UTF-8 JSON with 2-space indentation."""
    rendered = json.dumps(data, indent=2)
    with open(path, 'w', encoding='utf-8') as out:
        out.write(rendered)
def convert_model(model_path):
    """Convert one model folder from ChatML EOS (id 32000) to Mistral EOS (id 2).

    Patches metadata only — model weights are untouched:
      * generation_config.json: eos_token_id / pad_token_id -> 2
      * tokenizer_config.json:  eos_token -> "</s>", chat_template removed
      * special_tokens_map.json: eos_token -> "</s>"

    Returns True when the folder was converted, False when it was skipped
    (already Mistral format, or not a ChatML/32000 model).
    """
    model_name = os.path.basename(model_path)
    print(f"Scanning: {model_name}...")
    gen_path = os.path.join(model_path, "generation_config.json")
    tok_conf_path = os.path.join(model_path, "tokenizer_config.json")
    # 1. Check if it is actually a ChatML/32000 model
    gen_data = load_json(gen_path)
    current_eos = gen_data.get("eos_token_id")
    # Handle list format (e.g. [32000, 2]); guard the empty list, which
    # previously raised IndexError — treat it as "no EOS configured".
    if isinstance(current_eos, list):
        current_eos = current_eos[0] if current_eos else None
    if str(current_eos) != "32000":
        # If it's already 2, we skip it (unless forced, but let's be safe)
        if str(current_eos) == "2":
            return False  # Already Mistral format
        print(f" Skipping: EOS ID is {current_eos} (Not 32000)")
        return False
    print(f"{Fore.YELLOW} -> Detected ChatML (EOS: 32000). Converting to Mistral format...{Style.RESET_ALL}")
    # 2. Patch Generation Config
    gen_data["eos_token_id"] = 2
    gen_data["pad_token_id"] = 2  # Usually good practice to align pad/eos for base mistral
    save_json(gen_path, gen_data)
    print(f" Fixed generation_config.json (ID: 2)")
    # 3. Patch Tokenizer Config
    if os.path.exists(tok_conf_path):
        tok_data = load_json(tok_conf_path)
        # Change string to </s>
        tok_data["eos_token"] = "</s>"
        # Remove chat_template if it exists (prevents auto-detection issues later)
        if "chat_template" in tok_data:
            del tok_data["chat_template"]
        save_json(tok_conf_path, tok_data)
        print(f" Fixed tokenizer_config.json (Str: </s>)")
    # 4. Patch Special Tokens Map
    spec_path = os.path.join(model_path, "special_tokens_map.json")
    if os.path.exists(spec_path):
        spec_data = load_json(spec_path)
        spec_data["eos_token"] = "</s>"
        save_json(spec_path, spec_data)
        print(f" Fixed special_tokens_map.json")
    print(f"{Fore.GREEN} -> Successfully converted {model_name}{Style.RESET_ALL}")
    return True
def main():
    """CLI entry point: walk base_dir recursively and convert every model folder.

    A directory counts as a model folder when it contains a config.json;
    conversion itself (including the ChatML check) is delegated to
    convert_model().
    """
    parser = argparse.ArgumentParser(description="Convert ChatML models (EOS 32000) to Mistral format (EOS 2)")
    parser.add_argument("base_dir", help="Directory containing the model folders (e.g. B:\\7B)")
    args = parser.parse_args()
    print(f"{Fore.CYAN}--- CHATML TO MISTRAL CONVERTER ---{Style.RESET_ALL}")
    print("This script changes metadata only. It allows ChatML models to be merged")
    print("using 'tokenizer: source: base' without errors.\n")
    if not os.path.exists(args.base_dir):
        print(f"Error: Directory {args.base_dir} does not exist.")
        return
    # Walk through all subdirectories
    count = 0
    for root, dirs, files in os.walk(args.base_dir):
        for name in dirs:
            # We look at every folder, check if it's a model inside convert_model
            full_path = os.path.join(root, name)
            # Simple check if it looks like a model folder
            if os.path.exists(os.path.join(full_path, "config.json")):
                convert_model(full_path)
                count += 1
    print("-" * 60)
    # Report the tally; previously `count` was computed but never shown.
    print(f"Scan complete. Checked {count} model folder(s).")
# Run the converter only when executed as a script, not on import.
if __name__ == "__main__":
    main()