"""
Compare two model variants to see if they have different configurations.

Usage:
    export CBORG_API_KEY=...
    python compare_model_configs.py openai/o:latest openai/o3
"""
| import os |
| import sys |
| from openai import OpenAI |
| import json |
|
|
| def test_model_detailed(client, model_id): |
| """Test a model and return detailed response information.""" |
| try: |
| response = client.chat.completions.create( |
| model=model_id, |
| messages=[{"role": "user", "content": "What is 2+2?"}], |
| max_tokens=10, |
| temperature=1.0, |
| top_p=1.0, |
| ) |
| |
| |
| info = { |
| 'model': response.model, |
| 'id': response.id, |
| 'created': response.created, |
| 'object': response.object, |
| 'system_fingerprint': getattr(response, 'system_fingerprint', None), |
| 'usage': { |
| 'prompt_tokens': response.usage.prompt_tokens, |
| 'completion_tokens': response.usage.completion_tokens, |
| 'total_tokens': response.usage.total_tokens, |
| }, |
| 'response_content': response.choices[0].message.content, |
| 'finish_reason': response.choices[0].finish_reason, |
| } |
| |
| |
| try: |
| info['raw_response'] = str(response) |
| except: |
| pass |
| |
| return info, None |
| except Exception as e: |
| return None, str(e) |
|
|
def main():
    """Query both models with the same probe and report a similarity verdict.

    Expects two model identifiers on the command line and the CBORG_API_KEY
    environment variable. Exits with status 1 on bad arguments, a missing
    API key, or an API error for either model.
    """
    if len(sys.argv) < 3:
        print("Usage: python compare_model_configs.py <model1> <model2>")
        print("Example: python compare_model_configs.py openai/o:latest openai/o3")
        sys.exit(1)

    model1 = sys.argv[1]
    model2 = sys.argv[2]

    api_key = os.environ.get('CBORG_API_KEY')
    if not api_key:
        print("Error: CBORG_API_KEY environment variable not set.")
        sys.exit(1)

    client = OpenAI(
        api_key=api_key,
        base_url="https://api.cborg.lbl.gov"
    )

    print("=" * 100)
    print(f"COMPARING: {model1} vs {model2}")
    print("=" * 100)
    print()

    print(f"Testing {model1}...")
    info1, error1 = test_model_detailed(client, model1)

    if error1:
        print(f"❌ Error: {error1}")
        sys.exit(1)

    print(f"Testing {model2}...")
    info2, error2 = test_model_detailed(client, model2)

    if error2:
        print(f"❌ Error: {error2}")
        sys.exit(1)

    print()
    print("=" * 100)
    print("COMPARISON RESULTS")
    print("=" * 100)
    print()

    print("1. UNDERLYING MODEL:")
    print(f"   {model1:<30} → {info1['model']}")
    print(f"   {model2:<30} → {info2['model']}")
    if info1['model'] == info2['model']:
        print("   ✓ SAME underlying model")
    else:
        print("   ⚠️ DIFFERENT underlying models!")
    print()

    print("2. SYSTEM FINGERPRINT:")
    print(f"   {model1:<30} → {info1['system_fingerprint']}")
    print(f"   {model2:<30} → {info2['system_fingerprint']}")
    if info1['system_fingerprint'] == info2['system_fingerprint']:
        print("   ✓ SAME system fingerprint")
    elif info1['system_fingerprint'] is None or info2['system_fingerprint'] is None:
        print("   ⚠️ System fingerprint not available")
    else:
        print("   ⚠️ DIFFERENT system fingerprints!")
    print()

    print("3. TOKEN USAGE (for same prompt):")
    print(f"   {model1:<30} prompt={info1['usage']['prompt_tokens']}, completion={info1['usage']['completion_tokens']}")
    print(f"   {model2:<30} prompt={info2['usage']['prompt_tokens']}, completion={info2['usage']['completion_tokens']}")
    if info1['usage'] == info2['usage']:
        print("   ✓ IDENTICAL token usage")
    else:
        print("   ⚠️ Different token usage (could indicate different behavior)")
    print()

    print("4. RESPONSE CONTENT:")
    print(f"   {model1}: \"{info1['response_content']}\"")
    print(f"   {model2}: \"{info2['response_content']}\"")
    if info1['response_content'] == info2['response_content']:
        print("   ✓ IDENTICAL responses")
    else:
        print("   ⚠️ Different responses")
    print()

    # Raw payloads are best-effort (see test_model_detailed), so guard each
    # dict independently — the old code only checked info1 and could raise
    # KeyError when info2 lacked 'raw_response'.
    if 'raw_response' in info1:
        print("5. RAW RESPONSE MODEL 1:")
        print(f"   {info1['raw_response'][:500]}")
        print()
    if 'raw_response' in info2:
        print("6. RAW RESPONSE MODEL 2:")
        print(f"   {info2['raw_response'][:500]}")
        print()

    print("=" * 100)
    print("VERDICT:")
    print("=" * 100)

    same_count = 0
    total_count = 4

    if info1['model'] == info2['model']:
        same_count += 1
    # `==` already treats (None, None) as equal, so no extra None check needed.
    if info1['system_fingerprint'] == info2['system_fingerprint']:
        same_count += 1
    if info1['usage'] == info2['usage']:
        same_count += 1
    if info1['response_content'] == info2['response_content']:
        same_count += 1

    print(f"Similarity: {same_count}/{total_count} metrics match")
    print()

    if same_count == total_count:
        print("✓ Models appear to be IDENTICAL")
        print("  → Same underlying model, same configuration")
        print("  → Likely just different aliases for the same deployment")
    elif info1['model'] == info2['model'] and same_count >= 2:
        print("⚠️ Models use the SAME base model but show some differences")
        print("  → Could be due to:")
        print("    - Different deployment instances")
        print("    - Randomness in generation")
        print("    - Different routing/load balancing")
    else:
        print("⚠️ Models appear to be DIFFERENT")
        print("  → Different configurations or versions")

    print()
    print("NOTE: In your dataset, these models have different performance because")
    print("      they represent different experimental runs, not necessarily different")
    print("      model configurations.")
    print("=" * 100)
|
|
# Standard entry-point guard: run the comparison only when executed as a
# script, not when imported as a module.
if __name__ == '__main__':
    main()
|
|