"""
Comprehensive benchmark script for Q-TensorFormer v3.

Runs multi-model comparison against all baselines and produces
a full evaluation report with Pareto frontier analysis.

Usage:
    python scripts/benchmark.py --preset small --epochs 5 --output results/
"""

import sys
import os
import argparse
import json
from pathlib import Path

import torch

# Add project root to path
sys.path.insert(0, str(Path(__file__).parent.parent))

from src.config import ExperimentConfig, ModelConfig, TrainingConfig, PRESETS
from src.models import create_model
from src.baselines import StandardTransformer, DistilledTransformer, PrunedTransformer
from src.data import load_wikitext2, load_synthetic_data
from src.training import Trainer
from src.metrics import (
    evaluate_model,
    compare_models,
    compute_pareto_frontier,
    compute_efficiency_score,
    print_comparison_table,
    rank_trajectory_analysis,
)


def parse_args():
    """Parse the benchmark's command-line options.

    Returns:
        argparse.Namespace with the attributes ``preset``, ``epochs``,
        ``batch_size``, ``seq_len``, ``output``, ``device``, ``synthetic``
        and ``seed``.
    """
    parser = argparse.ArgumentParser(description="Q-TensorFormer Benchmark")
    parser.add_argument("--preset", type=str, default="small",
                        choices=["tiny", "small", "medium"],
                        help="Configuration preset")
    parser.add_argument("--epochs", type=int, default=5,
                        help="Training epochs")
    parser.add_argument("--batch-size", type=int, default=16)
    parser.add_argument("--seq-len", type=int, default=128)
    parser.add_argument("--output", type=str, default="./outputs/benchmark/",
                        help="Output directory")
    parser.add_argument("--device", type=str, default="cpu",
                        help="Device (cpu, cuda)")
    parser.add_argument("--synthetic", action="store_true",
                        help="Use synthetic data (faster)")
    parser.add_argument("--seed", type=int, default=42)
    return parser.parse_args()


def _build_models(config):
    """Instantiate every model under comparison and print its parameter count.

    Args:
        config: Experiment configuration; its ``model`` section supplies the
            vocabulary size, width, head count, depth and maximum sequence
            length used for the dense baselines.

    Returns:
        Dict mapping a display name to the freshly constructed model, in the
        order the models are trained and evaluated.
    """
    models = {}

    # Q-TensorFormer (hybrid)
    models["QTensorFormer"] = create_model(config, "qtensor")
    print(f" QTensorFormer: {models['QTensorFormer'].total_params:,} params")

    # TT-Only (no quantum)
    models["TensorOnly"] = create_model(config, "tensor_only")
    print(f" TensorOnly: {models['TensorOnly'].total_params:,} params")

    # Standard transformer (dense)
    models["StandardTransformer"] = StandardTransformer(
        vocab_size=config.model.vocab_size,
        d_model=config.model.d_model,
        n_heads=config.model.n_heads,
        n_layers=config.model.n_layers,
        max_seq_len=config.model.max_seq_len,
    )
    print(f" StandardTransformer: {models['StandardTransformer'].total_params:,} params")

    # Distilled (smaller dense): half the width, but never below 64
    models["Distilled"] = DistilledTransformer(
        vocab_size=config.model.vocab_size,
        d_model=max(64, config.model.d_model // 2),
        n_heads=config.model.n_heads,
        n_layers=config.model.n_layers,
        max_seq_len=config.model.max_seq_len,
    )
    print(f" Distilled: {models['Distilled'].total_params:,} params")

    return models


def _to_native(value):
    """Return ``value`` as a float if it exposes ``.item()``, else unchanged.

    Objects with an ``item`` attribute (for example float32 scalars) are not
    JSON-serializable, so they are converted with ``float()``.
    """
    return float(value) if hasattr(value, "item") else value


def main():
    """Train, evaluate and compare all benchmark models, then save a report.

    Steps: parse CLI options, seed torch, load the chosen preset and data,
    build the models, train each one with ``Trainer``, evaluate on the test
    loader, print the comparison table / Pareto frontier / efficiency ranking,
    write ``results.json`` into the output directory and print a summary.
    """
    args = parse_args()
    torch.manual_seed(args.seed)

    # Load config and apply the command-line overrides
    config = PRESETS[args.preset]()
    config.training.max_epochs = args.epochs
    config.training.batch_size = args.batch_size
    config.model.max_seq_len = args.seq_len

    print(f"Config: {config.experiment_name}")
    print(f"Model: d_model={config.model.d_model}, "
          f"n_layers={config.model.n_layers}, "
          f"tt_rank={config.model.tt_rank}")

    # Load data
    print("\nLoading data...")
    if args.synthetic:
        train_loader = load_synthetic_data(
            vocab_size=config.model.vocab_size,
            seq_len=args.seq_len,
            n_samples=2000,
            batch_size=args.batch_size,
        )
        val_loader = None
        test_loader = train_loader  # Same for synthetic
        tokenizer = None
    else:
        train_loader, val_loader, test_loader, tokenizer = load_wikitext2(
            seq_len=args.seq_len,
            batch_size=args.batch_size,
        )
        # Must happen before the models are built so they use this vocabulary size
        config.model.vocab_size = tokenizer.vocab_size

    # Create models
    print("\nCreating models...")
    models = _build_models(config)

    # Train all models
    print(f"\n{'='*60}")
    print("Training models...")
    print(f"{'='*60}")

    trained_models = {}
    for name, model in models.items():
        print(f"\n--- Training {name} ---")
        trainer = Trainer(
            model,
            config,
            train_loader=train_loader,
            val_loader=val_loader,
            test_loader=test_loader,
            device=args.device,
            output_dir=f"{args.output}/{name}",
        )
        trainer.train()
        trained_models[name] = model

    # Evaluate
    print(f"\n{'='*60}")
    print("Evaluating models...")
    print(f"{'='*60}")

    results = {}
    for name, model in trained_models.items():
        results[name] = evaluate_model(model, test_loader, args.device)

    # Print comparison
    print_comparison_table(results)

    # Pareto frontier
    pareto = compute_pareto_frontier(results)
    print(f"\nPareto-optimal models: {pareto}")

    # Efficiency ranking
    efficiency = {name: compute_efficiency_score(r) for name, r in results.items()}
    best = max(efficiency, key=efficiency.get)
    print(f"Most efficient: {best} (score={efficiency[best]:.1f})")

    # Save results. The payload is built before the file is opened so that a
    # conversion error cannot leave a truncated results.json behind.
    clean = {
        name: {k: _to_native(v) for k, v in r.items()}
        for name, r in results.items()
    }
    payload = {
        "config": config.experiment_name,
        "results": clean,
        "pareto": pareto,
        "efficiency": {k: float(v) for k, v in efficiency.items()},
        "best": best,
    }
    os.makedirs(args.output, exist_ok=True)
    with open(f"{args.output}/results.json", "w", encoding="utf-8") as f:
        json.dump(payload, f, indent=2)

    print(f"\nResults saved to {args.output}/results.json")

    # Summary
    print(f"\n{'='*60}")
    print("SUMMARY")
    print(f"{'='*60}")
    for name in results:
        ppl = results[name]["test_ppl"]
        params = results[name]["total_params"]
        lat = results[name].get("latency_ms_mean", 0)
        print(f" {name:<25} PPL={ppl:.2f} Params={params:,} Lat={lat:.1f}ms")


if __name__ == "__main__":
    main()