"""
Compare two model variants to see if they have different configurations.

Usage:
    export CBORG_API_KEY=...
    python compare_model_configs.py openai/o:latest openai/o3
"""
| import os |
| import sys |
| from openai import OpenAI |
| import json |
|
|
| def test_model_detailed(client, model_id): |
| """Test a model and return detailed response information.""" |
| try: |
| response = client.chat.completions.create( |
| model=model_id, |
| messages=[{"role": "user", "content": "What is 2+2?"}], |
| max_tokens=10, |
| temperature=1.0, |
| top_p=1.0, |
| ) |
| |
| |
| info = { |
| 'model': response.model, |
| 'id': response.id, |
| 'created': response.created, |
| 'object': response.object, |
| 'system_fingerprint': getattr(response, 'system_fingerprint', None), |
| 'usage': { |
| 'prompt_tokens': response.usage.prompt_tokens, |
| 'completion_tokens': response.usage.completion_tokens, |
| 'total_tokens': response.usage.total_tokens, |
| }, |
| 'response_content': response.choices[0].message.content, |
| 'finish_reason': response.choices[0].finish_reason, |
| } |
| |
| |
| try: |
| info['raw_response'] = str(response) |
| except: |
| pass |
| |
| return info, None |
| except Exception as e: |
| return None, str(e) |
|
|
def main():
    """Query both models with the same probe and report a similarity verdict.

    Expects two model identifiers on the command line and the CBORG_API_KEY
    environment variable. Exits with status 1 on bad arguments, a missing
    API key, or an API error for either model.
    """
    if len(sys.argv) < 3:
        print("Usage: python compare_model_configs.py <model1> <model2>")
        print("Example: python compare_model_configs.py openai/o:latest openai/o3")
        sys.exit(1)

    model1 = sys.argv[1]
    model2 = sys.argv[2]

    api_key = os.environ.get('CBORG_API_KEY')
    if not api_key:
        print("Error: CBORG_API_KEY environment variable not set.")
        sys.exit(1)

    client = OpenAI(
        api_key=api_key,
        base_url="https://api.cborg.lbl.gov"
    )

    print("=" * 100)
    print(f"COMPARING: {model1} vs {model2}")
    print("=" * 100)
    print()

    print(f"Testing {model1}...")
    info1, error1 = test_model_detailed(client, model1)

    if error1:
        print(f"❌ Error: {error1}")
        sys.exit(1)

    print(f"Testing {model2}...")
    info2, error2 = test_model_detailed(client, model2)

    if error2:
        print(f"❌ Error: {error2}")
        sys.exit(1)

    print()
    print("=" * 100)
    print("COMPARISON RESULTS")
    print("=" * 100)
    print()

    print("1. UNDERLYING MODEL:")
    print(f"   {model1:<30} → {info1['model']}")
    print(f"   {model2:<30} → {info2['model']}")
    if info1['model'] == info2['model']:
        print("   ✓ SAME underlying model")
    else:
        print("   ⚠️ DIFFERENT underlying models!")
    print()

    print("2. SYSTEM FINGERPRINT:")
    print(f"   {model1:<30} → {info1['system_fingerprint']}")
    print(f"   {model2:<30} → {info2['system_fingerprint']}")
    if info1['system_fingerprint'] == info2['system_fingerprint']:
        print("   ✓ SAME system fingerprint")
    elif info1['system_fingerprint'] is None or info2['system_fingerprint'] is None:
        print("   ⚠️ System fingerprint not available")
    else:
        print("   ⚠️ DIFFERENT system fingerprints!")
    print()

    print("3. TOKEN USAGE (for same prompt):")
    print(f"   {model1:<30} prompt={info1['usage']['prompt_tokens']}, completion={info1['usage']['completion_tokens']}")
    print(f"   {model2:<30} prompt={info2['usage']['prompt_tokens']}, completion={info2['usage']['completion_tokens']}")
    if info1['usage'] == info2['usage']:
        print("   ✓ IDENTICAL token usage")
    else:
        print("   ⚠️ Different token usage (could indicate different behavior)")
    print()

    print("4. RESPONSE CONTENT:")
    print(f"   {model1}: \"{info1['response_content']}\"")
    print(f"   {model2}: \"{info2['response_content']}\"")
    if info1['response_content'] == info2['response_content']:
        print("   ✓ IDENTICAL responses")
    else:
        print("   ⚠️ Different responses")
    print()

    # Raw payloads are best-effort (see test_model_detailed), so guard each
    # dict independently — the old code only checked info1 and could raise
    # KeyError when info2 lacked 'raw_response'.
    if 'raw_response' in info1:
        print("5. RAW RESPONSE MODEL 1:")
        print(f"   {info1['raw_response'][:500]}")
        print()
    if 'raw_response' in info2:
        print("6. RAW RESPONSE MODEL 2:")
        print(f"   {info2['raw_response'][:500]}")
        print()

    print("=" * 100)
    print("VERDICT:")
    print("=" * 100)

    same_count = 0
    total_count = 4

    if info1['model'] == info2['model']:
        same_count += 1
    # `==` already treats (None, None) as equal, so no extra None check needed.
    if info1['system_fingerprint'] == info2['system_fingerprint']:
        same_count += 1
    if info1['usage'] == info2['usage']:
        same_count += 1
    if info1['response_content'] == info2['response_content']:
        same_count += 1

    print(f"Similarity: {same_count}/{total_count} metrics match")
    print()

    if same_count == total_count:
        print("✓ Models appear to be IDENTICAL")
        print("  → Same underlying model, same configuration")
        print("  → Likely just different aliases for the same deployment")
    elif info1['model'] == info2['model'] and same_count >= 2:
        print("⚠️ Models use the SAME base model but show some differences")
        print("  → Could be due to:")
        print("    - Different deployment instances")
        print("    - Randomness in generation")
        print("    - Different routing/load balancing")
    else:
        print("⚠️ Models appear to be DIFFERENT")
        print("  → Different configurations or versions")

    print()
    print("NOTE: In your dataset, these models have different performance because")
    print("      they represent different experimental runs, not necessarily different")
    print("      model configurations.")
    print("=" * 100)
|
|
# Standard entry-point guard: run the comparison only when executed as a
# script, not when imported as a module.
if __name__ == '__main__':
    main()
|
|