{
  "ngpus": 32,
  "tokens": 50257,
  "gpt_dir": "assets/gpt2-large",
  "outdir": "../output",
  "training": {
    "batch_size": 512,
    "accum": 1,
    "n_iters": 1000001,
    "snapshot_freq": 50000,
    "log_freq": 50,
    "eval_freq": 100,
    "snapshot_freq_for_preemption": 10000,
    "weight": "standard",
    "snapshot_sampling": true,
    "ema": 0.9999,
    "loss_type": "lambda_DCE"
  },
  "data": {
    "train": "openwebtext",
    "valid": "wikitext103",
    "cache_dir": "data"
  },
  "noise": {
    "type": "loglinear",
    "sigma_min": 0.0001,
    "sigma_max": 20
  },
  "sampling": {
    "predictor": "euler",
    "steps": 128
  },
  "eval": {
    "batch_size": 512,
    "perplexity": true,
    "perplexity_batch_size": 16
  },
  "optim": {
    "weight_decay": 0.03,
    "optimizer": "AdamW",
    "lr": 0.0003,
    "beta1": 0.9,
    "beta2": 0.999,
    "eps": 1e-08,
    "warmup": 2500,
    "grad_clip": 1.0
  },
  "model": {
    "name": "medium_wotsm",
    "type": "ddit_wot",
    "hidden_size": 1024,
    "cond_dim": 128,
    "length": 1024,
    "n_blocks": 24,
    "n_heads": 16,
    "dropout": 0.02,
    "use_checkpoint": false,
    "dtype": "bfloat16"
  }
}