| """ |
| Comprehensive benchmark script for Q-TensorFormer v3. |
| |
| Runs multi-model comparison against all baselines and produces |
| a full evaluation report with Pareto frontier analysis. |
| |
| Usage: |
| python scripts/benchmark.py --preset small --epochs 5 --output results/ |
| """ |

import argparse
import json
import os
import sys
from pathlib import Path

import torch

# Make the repository root importable when running this script directly.
sys.path.insert(0, str(Path(__file__).parent.parent))

from src.config import PRESETS
from src.models import create_model
from src.baselines import StandardTransformer, DistilledTransformer, PrunedTransformer
from src.data import load_wikitext2, load_synthetic_data
from src.training import Trainer
from src.metrics import (
    evaluate_model, compute_pareto_frontier,
    compute_efficiency_score, print_comparison_table,
)


def parse_args():
    parser = argparse.ArgumentParser(description="Q-TensorFormer Benchmark")
    parser.add_argument("--preset", type=str, default="small",
                        choices=["tiny", "small", "medium"],
                        help="Configuration preset")
    parser.add_argument("--epochs", type=int, default=5,
                        help="Training epochs")
    parser.add_argument("--batch-size", type=int, default=16,
                        help="Batch size")
    parser.add_argument("--seq-len", type=int, default=128,
                        help="Sequence length in tokens")
    parser.add_argument("--output", type=str, default="./outputs/benchmark/",
                        help="Output directory")
    parser.add_argument("--device", type=str, default="cpu",
                        help="Device (cpu, cuda)")
    parser.add_argument("--synthetic", action="store_true",
                        help="Use synthetic data (faster)")
    parser.add_argument("--seed", type=int, default=42,
                        help="Random seed")
    return parser.parse_args()


def main():
    args = parse_args()
    torch.manual_seed(args.seed)
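
    # Optional guard: fall back to CPU when CUDA is requested but unavailable,
    # so a long benchmark run fails soft on a CPU-only host.
    if args.device == "cuda" and not torch.cuda.is_available():
        print("CUDA requested but unavailable; falling back to CPU.")
        args.device = "cpu"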

    # Load the preset configuration and apply the CLI overrides.
    config = PRESETS[args.preset]()
    config.training.max_epochs = args.epochs
    config.training.batch_size = args.batch_size
    config.model.max_seq_len = args.seq_len

    print(f"Config: {config.experiment_name}")
    print(f"Model: d_model={config.model.d_model}, "
          f"n_layers={config.model.n_layers}, "
          f"tt_rank={config.model.tt_rank}")

    print("\nLoading data...")
    if args.synthetic:
        train_loader = load_synthetic_data(
            vocab_size=config.model.vocab_size,
            seq_len=args.seq_len,
            n_samples=2000,
            batch_size=args.batch_size,
        )
        # Synthetic mode is a smoke test: there is no held-out split,
        # so evaluation reuses the training stream.
        val_loader = None
        test_loader = train_loader
        tokenizer = None
    else:
        train_loader, val_loader, test_loader, tokenizer = load_wikitext2(
            seq_len=args.seq_len,
            batch_size=args.batch_size,
        )
        # Keep the model's vocabulary in sync with the tokenizer's.
        config.model.vocab_size = tokenizer.vocab_size

    print("\nCreating models...")
    models = {}

    # Q-TensorFormer, the model under evaluation.
    models["QTensorFormer"] = create_model(config, "qtensor")
    print(f" QTensorFormer: {models['QTensorFormer'].total_params:,} params")

    # Ablation: tensorized layers only.
    models["TensorOnly"] = create_model(config, "tensor_only")
    print(f" TensorOnly: {models['TensorOnly'].total_params:,} params")

    # Dense transformer baseline with matched architecture.
    models["StandardTransformer"] = StandardTransformer(
        vocab_size=config.model.vocab_size,
        d_model=config.model.d_model,
        n_heads=config.model.n_heads,
        n_layers=config.model.n_layers,
        max_seq_len=config.model.max_seq_len,
    )
    print(f" StandardTransformer: {models['StandardTransformer'].total_params:,} params")

    # Distilled baseline at half width, floored at 64 dimensions.
    models["Distilled"] = DistilledTransformer(
        vocab_size=config.model.vocab_size,
        d_model=max(64, config.model.d_model // 2),
        n_heads=config.model.n_heads,
        n_layers=config.model.n_layers,
        max_seq_len=config.model.max_seq_len,
    )
    print(f" Distilled: {models['Distilled'].total_params:,} params")

    print(f"\n{'='*60}")
    print("Training models...")
    print(f"{'='*60}")

    # Every model trains with the same config and loaders for a fair comparison.
    trained_models = {}
    for name, model in models.items():
        print(f"\n--- Training {name} ---")
        trainer = Trainer(
            model, config,
            train_loader=train_loader,
            val_loader=val_loader,
            test_loader=test_loader,
            device=args.device,
            output_dir=f"{args.output}/{name}",
        )
        trainer.train()
        trained_models[name] = model

    print(f"\n{'='*60}")
    print("Evaluating models...")
    print(f"{'='*60}")

    results = {}
    for name, model in trained_models.items():
        results[name] = evaluate_model(model, test_loader, args.device)

    print_comparison_table(results)

    # Pareto frontier: models that no other model beats on every metric at once.
    pareto = compute_pareto_frontier(results)
    print(f"\nPareto-optimal models: {pareto}")

    efficiency = {name: compute_efficiency_score(r) for name, r in results.items()}
    best = max(efficiency, key=efficiency.get)
    print(f"Most efficient: {best} (score={efficiency[best]:.1f})")

    os.makedirs(args.output, exist_ok=True)
    with open(f"{args.output}/results.json", "w") as f:
        # Cast tensor/NumPy scalars to plain floats so json can serialize them.
        clean = {}
        for name, r in results.items():
            clean[name] = {k: (float(v) if hasattr(v, "item") else v) for k, v in r.items()}
        json.dump({
            "config": config.experiment_name,
            "results": clean,
            "pareto": pareto,
            "efficiency": {k: float(v) for k, v in efficiency.items()},
            "best": best,
        }, f, indent=2)

    print(f"\nResults saved to {args.output}/results.json")
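
    # Per-model training artifacts live under the Trainer output dirs set above.
    print(f"Per-model training outputs in {args.output}/<model_name>/")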

    print(f"\n{'='*60}")
    print("SUMMARY")
    print(f"{'='*60}")
    for name in results:
        ppl = results[name]["test_ppl"]
        params = results[name]["total_params"]
        lat = results[name].get("latency_ms_mean", 0)
        print(f" {name:<25} PPL={ppl:.2f} Params={params:,} Lat={lat:.1f}ms")


if __name__ == "__main__":
    main()