IvanHU committed on
Commit
b2c3288
·
verified ·
1 Parent(s): ec3be61

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. .gitattributes +8 -0
  2. transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/8k-100.sh +65 -0
  3. transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/config.json +33 -0
  4. transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/delta_net_1B.json +29 -0
  5. transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/delta_net_340M.json +26 -0
  6. transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/gated_deltanet_1B.json +22 -0
  7. transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/gated_deltanet_340M.json +22 -0
  8. transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/gdn_6_1_340M.json +50 -0
  9. transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/gla_340M.json +24 -0
  10. transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/gla_7B.json +25 -0
  11. transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/gsa_340M.json +29 -0
  12. transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/hgrn2_340M.json +20 -0
  13. transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/mamba2_1B.json +32 -0
  14. transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/mamba2_340M.json +32 -0
  15. transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/mamba_1B.json +30 -0
  16. transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/mamba_340M.json +30 -0
  17. transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/samba_1B.json +52 -0
  18. transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/sba_340m.json +18 -0
  19. transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/transformer_1B.json +22 -0
  20. transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/transformer_340M.json +18 -0
  21. transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/transformer_7B.json +21 -0
  22. transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/generation_config.json +6 -0
  23. transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/0/error.json +1 -0
  24. transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/0/stderr.log +175 -0
  25. transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/0/stdout.log +0 -0
  26. transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/1/error.json +1 -0
  27. transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/1/stderr.log +174 -0
  28. transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/1/stdout.log +0 -0
  29. transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/2/error.json +1 -0
  30. transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/2/stderr.log +174 -0
  31. transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/2/stdout.log +0 -0
  32. transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/3/error.json +1 -0
  33. transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/3/stderr.log +174 -0
  34. transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/3/stdout.log +0 -0
  35. transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/4/error.json +1 -0
  36. transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/4/stderr.log +174 -0
  37. transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/4/stdout.log +0 -0
  38. transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/5/error.json +1 -0
  39. transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/5/stderr.log +174 -0
  40. transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/5/stdout.log +0 -0
  41. transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/6/error.json +1 -0
  42. transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/6/stderr.log +174 -0
  43. transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/6/stdout.log +0 -0
  44. transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/7/error.json +1 -0
  45. transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/7/stderr.log +174 -0
  46. transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/7/stdout.log +0 -0
  47. transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_45k4njo9/attempt_0/0/stderr.log +25 -0
  48. transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_45k4njo9/attempt_0/0/stdout.log +0 -0
  49. transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_45k4njo9/attempt_0/1/stderr.log +0 -0
  50. transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_45k4njo9/attempt_0/1/stdout.log +0 -0
.gitattributes CHANGED
@@ -65,3 +65,11 @@ mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8
65
  mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_v3h3fbcf/attempt_0/5/stderr.log filter=lfs diff=lfs merge=lfs -text
66
  mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_v3h3fbcf/attempt_0/6/stderr.log filter=lfs diff=lfs merge=lfs -text
67
  mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_v3h3fbcf/attempt_0/7/stderr.log filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
65
  mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_v3h3fbcf/attempt_0/5/stderr.log filter=lfs diff=lfs merge=lfs -text
66
  mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_v3h3fbcf/attempt_0/6/stderr.log filter=lfs diff=lfs merge=lfs -text
67
  mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_v3h3fbcf/attempt_0/7/stderr.log filter=lfs diff=lfs merge=lfs -text
68
+ transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_jqc1xcka/attempt_0/0/stderr.log filter=lfs diff=lfs merge=lfs -text
69
+ transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_jqc1xcka/attempt_0/1/stderr.log filter=lfs diff=lfs merge=lfs -text
70
+ transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_jqc1xcka/attempt_0/2/stderr.log filter=lfs diff=lfs merge=lfs -text
71
+ transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_jqc1xcka/attempt_0/3/stderr.log filter=lfs diff=lfs merge=lfs -text
72
+ transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_jqc1xcka/attempt_0/4/stderr.log filter=lfs diff=lfs merge=lfs -text
73
+ transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_jqc1xcka/attempt_0/5/stderr.log filter=lfs diff=lfs merge=lfs -text
74
+ transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_jqc1xcka/attempt_0/6/stderr.log filter=lfs diff=lfs merge=lfs -text
75
+ transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_jqc1xcka/attempt_0/7/stderr.log filter=lfs diff=lfs merge=lfs -text
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/8k-100.sh ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FLAME_PATH=/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame
2
+ DATASET_ROOT=/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset
3
+ TOKENIZER=/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer
4
+
5
+ cd $FLAME_PATH
6
+ source .venv/bin/activate
7
+
8
+ # =========== train config ===========
9
+ CONFIG=${1:-transformer_340M.json}
10
+ SEQ_LEN=8192
11
+ WARMUP_STEPS=100
12
+ STEPS=95366
13
+ LR=3e-4
14
+ BATCH_SIZE=16
15
+ DECAY_TYPE=linear
16
+ DECAY_RATIO=1
17
+
18
+ NNODE=1
19
+ NGPU=8
20
+ LOG_RANK=0
21
+ # ====================================
22
+
23
+ # if jq command is not found, install it
24
+ if ! command -v jq &> /dev/null; then
25
+ echo "jq could not be found, installing it..."
26
+ sudo yum install -y jq
27
+ fi
28
+
29
+ EXP_NAME=$(basename $CONFIG | sed 's/\.config//')-ctx${SEQ_LEN}-steps${STEPS}-lr${LR}-decay_type${DECAY_TYPE}-decay_ratio${DECAY_RATIO}-bs${BATCH_SIZE}-nn${NNODE}
30
+
31
+ bash train.sh \
32
+ --job.config_file flame/models/fla.toml \
33
+ --job.dump_folder $FLAME_PATH/exp/$EXP_NAME \
34
+ --model.config $FLAME_PATH/configs/$CONFIG \
35
+ --model.tokenizer_path $TOKENIZER \
36
+ --optimizer.name AdamW \
37
+ --optimizer.eps 1e-8 \
38
+ --optimizer.lr $LR \
39
+ --lr_scheduler.warmup_steps $WARMUP_STEPS \
40
+ --lr_scheduler.lr_min 0.01 \
41
+ --lr_scheduler.decay_type $DECAY_TYPE \
42
+ --lr_scheduler.decay_ratio $DECAY_RATIO \
43
+ --training.batch_size $BATCH_SIZE \
44
+ --training.seq_len $SEQ_LEN \
45
+ --training.context_len $SEQ_LEN \
46
+ --training.gradient_accumulation_steps 1 \
47
+ --training.steps $STEPS \
48
+ --training.max_norm 1.0 \
49
+ --training.skip_nan_inf \
50
+ --training.dataset $DATASET_ROOT/fineweb-edu-sample,$DATASET_ROOT/small_repos_20B_sample_merged,$DATASET_ROOT/megamath-web-pro \
51
+ --training.data_probs 0.55,0.3,0.15 \
52
+ --training.dataset_split train,train,train \
53
+ --training.dataset_name default,default,default \
54
+ --training.streaming \
55
+ --training.num_workers 32 \
56
+ --training.prefetch_factor 2 \
57
+ --training.seed 42 \
58
+ --training.compile \
59
+ --checkpoint.interval 8192 \
60
+ --checkpoint.load_step -1 \
61
+ --checkpoint.keep_latest_k 100 \
62
+ --metrics.log_freq 1 \
63
+ --metrics.enable_tensorboard \
64
+ --training.streaming
65
+
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "TransformerForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "bos_token_id": 1,
7
+ "elementwise_affine": true,
8
+ "eos_token_id": 2,
9
+ "fuse_cross_entropy": true,
10
+ "fuse_norm": true,
11
+ "fuse_swiglu": true,
12
+ "hidden_act": "swish",
13
+ "hidden_ratio": 4,
14
+ "hidden_size": 1024,
15
+ "initializer_range": 0.02,
16
+ "intermediate_size": null,
17
+ "max_position_embeddings": 8192,
18
+ "model_type": "transformer",
19
+ "norm_eps": 1e-06,
20
+ "num_heads": 16,
21
+ "num_hidden_layers": 24,
22
+ "num_kv_heads": null,
23
+ "qk_norm": false,
24
+ "qkv_bias": false,
25
+ "rope_theta": 10000.0,
26
+ "tie_word_embeddings": false,
27
+ "torch_dtype": "float32",
28
+ "transformers_version": "4.53.3",
29
+ "use_cache": true,
30
+ "use_l2warp": false,
31
+ "vocab_size": 32000,
32
+ "window_size": null
33
+ }
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/delta_net_1B.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attn": null,
3
+ "attn_mode": "chunk",
4
+ "bos_token_id": 1,
5
+ "conv_size": 4,
6
+ "eos_token_id": 2,
7
+ "expand_k": 1,
8
+ "expand_v": 1,
9
+ "fuse_cross_entropy": true,
10
+ "fuse_norm": true,
11
+ "hidden_act": "swish",
12
+ "hidden_ratio": 4,
13
+ "hidden_size": 2048,
14
+ "initializer_range": 0.02,
15
+ "intermediate_size": null,
16
+ "model_type": "delta_net",
17
+ "norm_eps": 1e-06,
18
+ "num_heads": 16,
19
+ "num_hidden_layers": 24,
20
+ "pad_token_id": 2,
21
+ "qk_activation": "silu",
22
+ "qk_norm": "l2",
23
+ "tie_word_embeddings": false,
24
+ "use_beta": true,
25
+ "use_cache": true,
26
+ "use_gate": false,
27
+ "use_output_norm": true,
28
+ "use_short_conv": true
29
+ }
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/delta_net_340M.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attn_mode": "chunk",
3
+ "bos_token_id": 1,
4
+ "conv_size": 4,
5
+ "eos_token_id": 2,
6
+ "expand_k": 1,
7
+ "expand_v": 1,
8
+ "fuse_cross_entropy": true,
9
+ "hidden_act": "swish",
10
+ "hidden_ratio": 4,
11
+ "hidden_size": 1024,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": null,
14
+ "model_type": "delta_net",
15
+ "norm_eps": 1e-06,
16
+ "num_heads": 8,
17
+ "num_hidden_layers": 24,
18
+ "qk_activation": "silu",
19
+ "qk_norm": "l2",
20
+ "tie_word_embeddings": false,
21
+ "use_beta": true,
22
+ "use_cache": true,
23
+ "use_gate": false,
24
+ "use_output_norm": true,
25
+ "use_short_conv": true
26
+ }
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/gated_deltanet_1B.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attn_mode": "chunk",
3
+ "bos_token_id": 1,
4
+ "conv_size": 4,
5
+ "eos_token_id": 2,
6
+ "expand_v": 2,
7
+ "fuse_cross_entropy": true,
8
+ "head_dim": 256,
9
+ "hidden_act": "swish",
10
+ "hidden_ratio": 4,
11
+ "hidden_size": 2048,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": null,
14
+ "model_type": "gated_deltanet",
15
+ "norm_eps": 1e-06,
16
+ "num_heads": 6,
17
+ "num_hidden_layers": 21,
18
+ "tie_word_embeddings": false,
19
+ "use_cache": true,
20
+ "use_gate": true,
21
+ "use_short_conv": true
22
+ }
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/gated_deltanet_340M.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attn_mode": "chunk",
3
+ "bos_token_id": 1,
4
+ "conv_size": 4,
5
+ "eos_token_id": 2,
6
+ "expand_v": 2,
7
+ "fuse_cross_entropy": true,
8
+ "head_dim": 256,
9
+ "hidden_act": "swish",
10
+ "hidden_ratio": 4,
11
+ "hidden_size": 1024,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": null,
14
+ "model_type": "gated_deltanet",
15
+ "norm_eps": 1e-06,
16
+ "num_heads": 6,
17
+ "num_hidden_layers": 21,
18
+ "tie_word_embeddings": false,
19
+ "use_cache": true,
20
+ "use_gate": true,
21
+ "use_short_conv": true
22
+ }
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/gdn_6_1_340M.json ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "GatedDeltaNetForCausalLM"
4
+ ],
5
+ "attn": {
6
+ "layers": [
7
+ 5,
8
+ 11,
9
+ 17,
10
+ 23
11
+ ],
12
+ "num_heads": 16,
13
+ "num_kv_heads": 8,
14
+ "qkv_bias": false,
15
+ "rope_theta": 160000.0,
16
+ "window_size": null
17
+ },
18
+ "attn_mode": "chunk",
19
+ "bos_token_id": 1,
20
+ "conv_size": 4,
21
+ "eos_token_id": 2,
22
+ "expand_k": 1,
23
+ "expand_v": 1,
24
+ "fuse_cross_entropy": true,
25
+ "fuse_norm": true,
26
+ "fuse_swiglu": true,
27
+ "head_dim": 256,
28
+ "hidden_act": "swish",
29
+ "hidden_ratio": 4,
30
+ "hidden_size": 1024,
31
+ "initializer_range": 0.02,
32
+ "intermediate_size": null,
33
+ "max_position_embeddings": 8192,
34
+ "model_type": "gated_deltanet",
35
+ "norm_eps": 1e-06,
36
+ "norm_first": false,
37
+ "num_heads": 4,
38
+ "num_hidden_layers": 24,
39
+ "qk_activation": "silu",
40
+ "qk_norm": "l2",
41
+ "tie_word_embeddings": false,
42
+ "torch_dtype": "float32",
43
+ "transformers_version": "4.51.3",
44
+ "use_beta": true,
45
+ "use_cache": true,
46
+ "use_gate": true,
47
+ "use_output_norm": true,
48
+ "use_short_conv": true,
49
+ "vocab_size": 32000
50
+ }
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/gla_340M.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attn_mode": "chunk",
3
+ "bos_token_id": 1,
4
+ "clamp_min": null,
5
+ "eos_token_id": 2,
6
+ "expand_k": 0.5,
7
+ "expand_v": 1,
8
+ "fuse_cross_entropy": true,
9
+ "fuse_norm": true,
10
+ "hidden_act": "swish",
11
+ "hidden_ratio": 4,
12
+ "hidden_size": 1024,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": null,
15
+ "model_type": "gla",
16
+ "num_heads": 4,
17
+ "num_hidden_layers": 24,
18
+ "norm_eps": 1e-06,
19
+ "tie_word_embeddings": false,
20
+ "use_cache": true,
21
+ "use_gk": true,
22
+ "use_gv": false,
23
+ "vocab_size": 32000
24
+ }
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/gla_7B.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attn": null,
3
+ "attn_mode": "chunk",
4
+ "bos_token_id": 1,
5
+ "eos_token_id": 2,
6
+ "expand_k": 0.5,
7
+ "expand_v": 1,
8
+ "fuse_cross_entropy": true,
9
+ "fuse_norm": true,
10
+ "hidden_act": "swish",
11
+ "hidden_ratio": 4,
12
+ "hidden_size": 4096,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 11008,
15
+ "model_type": "gla",
16
+ "norm_eps": 1e-06,
17
+ "num_heads": 16,
18
+ "num_hidden_layers": 32,
19
+ "tie_word_embeddings": false,
20
+ "use_cache": true,
21
+ "use_gk": true,
22
+ "use_gv": false,
23
+ "use_output_gate": true,
24
+ "use_short_conv": false
25
+ }
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/gsa_340M.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 1,
3
+ "conv_size": 4,
4
+ "eos_token_id": 2,
5
+ "expand_k": 1,
6
+ "expand_v": 1,
7
+ "elementwise_affine": false,
8
+ "feature_map": "swish",
9
+ "fuse_cross_entropy": true,
10
+ "fuse_norm": true,
11
+ "gate_logit_normalizer": 4,
12
+ "hidden_act": "swish",
13
+ "hidden_ratio": 4,
14
+ "hidden_size": 1024,
15
+ "initializer_range": 0.02,
16
+ "intermediate_size": null,
17
+ "model_type": "gsa",
18
+ "num_heads": 4,
19
+ "num_hidden_layers": 24,
20
+ "num_slots": 64,
21
+ "norm_eps": 1e-06,
22
+ "share_conv_kernel": true,
23
+ "tie_word_embeddings": false,
24
+ "use_cache": true,
25
+ "use_norm": true,
26
+ "use_output_gate": true,
27
+ "use_rope": false,
28
+ "use_short_conv": false
29
+ }
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/hgrn2_340M.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attn_mode": "chunk",
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "expand_ratio": 128,
6
+ "fuse_cross_entropy": true,
7
+ "fuse_norm": true,
8
+ "hidden_act": "swish",
9
+ "hidden_ratio": 4,
10
+ "hidden_size": 1024,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": null,
13
+ "model_type": "hgrn2",
14
+ "num_heads": 8,
15
+ "num_hidden_layers": 24,
16
+ "norm_eps": 1e-06,
17
+ "tie_word_embeddings": false,
18
+ "use_cache": true,
19
+ "vocab_size": 32000
20
+ }
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/mamba2_1B.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 1,
3
+ "chunk_size": 256,
4
+ "conv_kernel": 4,
5
+ "eos_token_id": 2,
6
+ "expand": 2,
7
+ "fuse_cross_entropy": true,
8
+ "fuse_norm": true,
9
+ "head_dim": 64,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 2048,
12
+ "initializer_range": 0.02,
13
+ "norm_eps": 1e-05,
14
+ "model_type": "mamba2",
15
+ "n_groups": 1,
16
+ "num_hidden_layers": 48,
17
+ "pad_token_id": 0,
18
+ "rescale_prenorm_residual": true,
19
+ "residual_in_fp32": true,
20
+ "rms_norm": true,
21
+ "state_size": 128,
22
+ "tie_word_embeddings": false,
23
+ "time_step_floor": 0.0001,
24
+ "time_step_max": 0.1,
25
+ "time_step_min": 0.001,
26
+ "time_step_rank": 128,
27
+ "transformers_version": "4.50.1",
28
+ "use_bias": false,
29
+ "use_cache": true,
30
+ "use_conv_bias": true,
31
+ "vocab_size": 32000
32
+ }
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/mamba2_340M.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 1,
3
+ "chunk_size": 256,
4
+ "conv_kernel": 4,
5
+ "eos_token_id": 2,
6
+ "expand": 2,
7
+ "fuse_cross_entropy": true,
8
+ "fuse_norm": true,
9
+ "head_dim": 64,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 1024,
12
+ "initializer_range": 0.02,
13
+ "norm_eps": 1e-05,
14
+ "model_type": "mamba2",
15
+ "n_groups": 1,
16
+ "num_hidden_layers": 48,
17
+ "pad_token_id": 0,
18
+ "rescale_prenorm_residual": true,
19
+ "residual_in_fp32": true,
20
+ "rms_norm": true,
21
+ "state_size": 128,
22
+ "tie_word_embeddings": false,
23
+ "time_step_floor": 0.0001,
24
+ "time_step_max": 0.1,
25
+ "time_step_min": 0.001,
26
+ "time_step_rank": 128,
27
+ "transformers_version": "4.50.1",
28
+ "use_bias": false,
29
+ "use_cache": true,
30
+ "use_conv_bias": true,
31
+ "vocab_size": 32000
32
+ }
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/mamba_1B.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 1,
3
+ "conv_kernel": 4,
4
+ "eos_token_id": 2,
5
+ "expand": 2,
6
+ "fuse_cross_entropy": true,
7
+ "fuse_norm": true,
8
+ "hidden_act": "silu",
9
+ "hidden_size": 2048,
10
+ "initializer_range": 0.02,
11
+ "model_type": "mamba",
12
+ "norm_eps": 1e-05,
13
+ "num_hidden_layers": 48,
14
+ "pad_token_id": 0,
15
+ "rescale_prenorm_residual": false,
16
+ "residual_in_fp32": false,
17
+ "state_size": 16,
18
+ "tie_word_embeddings": false,
19
+ "time_step_floor": 0.0001,
20
+ "time_step_init_scheme": "random",
21
+ "time_step_max": 0.1,
22
+ "time_step_min": 0.001,
23
+ "time_step_rank": 128,
24
+ "time_step_scale": 1.0,
25
+ "transformers_version": "4.50.1",
26
+ "use_bias": false,
27
+ "use_cache": true,
28
+ "use_conv_bias": true,
29
+ "vocab_size": 32000
30
+ }
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/mamba_340M.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 1,
3
+ "conv_kernel": 4,
4
+ "eos_token_id": 2,
5
+ "expand": 2,
6
+ "fuse_cross_entropy": true,
7
+ "fuse_norm": true,
8
+ "hidden_act": "silu",
9
+ "hidden_size": 1024,
10
+ "initializer_range": 0.02,
11
+ "model_type": "mamba",
12
+ "norm_eps": 1e-05,
13
+ "num_hidden_layers": 48,
14
+ "pad_token_id": 0,
15
+ "rescale_prenorm_residual": false,
16
+ "residual_in_fp32": false,
17
+ "state_size": 16,
18
+ "tie_word_embeddings": false,
19
+ "time_step_floor": 0.0001,
20
+ "time_step_init_scheme": "random",
21
+ "time_step_max": 0.1,
22
+ "time_step_min": 0.001,
23
+ "time_step_rank": 128,
24
+ "time_step_scale": 1.0,
25
+ "transformers_version": "4.50.1",
26
+ "use_bias": false,
27
+ "use_cache": true,
28
+ "use_conv_bias": true,
29
+ "vocab_size": 32000
30
+ }
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/samba_1B.json ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attn": {
3
+ "layers": [
4
+ 1,
5
+ 3,
6
+ 5,
7
+ 7,
8
+ 9,
9
+ 11,
10
+ 13,
11
+ 15,
12
+ 17
13
+ ],
14
+ "num_heads": 18,
15
+ "num_kv_heads": 18,
16
+ "qkv_bias": false,
17
+ "rope_theta": 10000.0,
18
+ "window_size": 2048
19
+ },
20
+ "bos_token_id": 1,
21
+ "conv_kernel": 4,
22
+ "eos_token_id": 2,
23
+ "expand": 2,
24
+ "fuse_cross_entropy": true,
25
+ "fuse_norm": true,
26
+ "fuse_swiglu": true,
27
+ "hidden_act": "swish",
28
+ "hidden_ratio": 4,
29
+ "hidden_size": 2304,
30
+ "initializer_range": 0.02,
31
+ "intermediate_size": 4608,
32
+ "max_position_embeddings": 2048,
33
+ "model_type": "samba",
34
+ "norm_eps": 1e-05,
35
+ "num_hidden_layers": 18,
36
+ "pad_token_id": 0,
37
+ "rescale_prenorm_residual": false,
38
+ "residual_in_fp32": false,
39
+ "state_size": 16,
40
+ "tie_word_embeddings": false,
41
+ "time_step_floor": 0.0001,
42
+ "time_step_init_scheme": "random",
43
+ "time_step_max": 0.1,
44
+ "time_step_min": 0.001,
45
+ "time_step_rank": 144,
46
+ "time_step_scale": 1.0,
47
+ "transformers_version": "4.50.1",
48
+ "use_bias": false,
49
+ "use_cache": true,
50
+ "use_conv_bias": true,
51
+ "vocab_size": 32000
52
+ }
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/sba_340m.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attention_bias": false,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "fuse_cross_entropy": true,
6
+ "fuse_norm": true,
7
+ "hidden_act": "swish",
8
+ "hidden_size": 1024,
9
+ "initializer_range": 0.006,
10
+ "max_position_embeddings": 8192,
11
+ "model_type": "sba",
12
+ "num_heads": 16,
13
+ "num_hidden_layers": 24,
14
+ "norm_eps": 1e-06,
15
+ "tie_word_embeddings": false,
16
+ "use_cache": true,
17
+ "vocab_size": 32000
18
+ }
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/transformer_1B.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 1,
3
+ "elementwise_affine": true,
4
+ "eos_token_id": 2,
5
+ "fuse_cross_entropy": true,
6
+ "fuse_norm": true,
7
+ "fuse_swiglu": true,
8
+ "hidden_act": "swish",
9
+ "hidden_ratio": 4,
10
+ "hidden_size": 2048,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": null,
13
+ "max_position_embeddings": 8192,
14
+ "model_type": "transformer",
15
+ "norm_eps": 1e-06,
16
+ "num_heads": 32,
17
+ "num_hidden_layers": 24,
18
+ "num_kv_heads": null,
19
+ "pad_token_id": 2,
20
+ "rope_theta": 10000.0,
21
+ "tie_word_embeddings": false
22
+ }
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/transformer_340M.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attention_bias": false,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "fuse_cross_entropy": true,
6
+ "fuse_norm": true,
7
+ "hidden_act": "swish",
8
+ "hidden_size": 1024,
9
+ "initializer_range": 0.02,
10
+ "max_position_embeddings": 8192,
11
+ "model_type": "transformer",
12
+ "num_heads": 16,
13
+ "num_hidden_layers": 24,
14
+ "norm_eps": 1e-06,
15
+ "tie_word_embeddings": false,
16
+ "use_cache": true,
17
+ "vocab_size": 32000
18
+ }
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/transformer_7B.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attention_bias": false,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "fuse_cross_entropy": true,
6
+ "fuse_norm": true,
7
+ "hidden_act": "swish",
8
+ "hidden_ratio": 4,
9
+ "hidden_size": 4096,
10
+ "initializer_range": 0.02,
11
+ "intermediate_size": 14336,
12
+ "model_type": "transformer",
13
+ "norm_eps": 1e-06,
14
+ "num_heads": 32,
15
+ "num_hidden_layers": 32,
16
+ "num_kv_heads": 8,
17
+ "rope_theta": 10000.0,
18
+ "tie_word_embeddings": false,
19
+ "use_cache": true,
20
+ "window_size": null
21
+ }
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "transformers_version": "4.53.3"
6
+ }
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/0/error.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"message": {"message": "FileNotFoundError: Couldn't find any data file at /fineweb-edu-sample.", "extraInfo": {"py_callstack": "Traceback (most recent call last):\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py\", line 355, in wrapper\n return f(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py\", line 155, in main\n dataset = build_dataset(\n ^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/data.py\", line 649, in build_dataset\n subset = load_dataset(\n ^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py\", line 1392, in load_dataset\n builder_instance = load_dataset_builder(\n ^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py\", line 1132, in load_dataset_builder\n dataset_module = dataset_module_factory(\n ^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py\", line 1033, in dataset_module_factory\n raise FileNotFoundError(f\"Couldn't find any data file at {relative_to_absolute_path(path)}.\")\nFileNotFoundError: Couldn't find any data file at /fineweb-edu-sample.\n", "timestamp": "1753195112"}}}
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/0/stderr.log ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [titan] 2025-07-22 22:38:30,495 - root - INFO - Starting job: default job
2
+ [titan] 2025-07-22 22:38:30,496 - root - INFO - {
3
+ "activation_checkpoint": {
4
+ "mode": "none",
5
+ "selective_ac_option": "2"
6
+ },
7
+ "activation_offload": {
8
+ "mode": "none"
9
+ },
10
+ "checkpoint": {
11
+ "async_mode": "disabled",
12
+ "create_seed_checkpoint": false,
13
+ "enable_checkpoint": true,
14
+ "exclude_from_loading": [],
15
+ "export_dtype": "float32",
16
+ "folder": "checkpoint",
17
+ "interval": 8192,
18
+ "interval_type": "steps",
19
+ "keep_latest_k": 100,
20
+ "load_step": -1,
21
+ "model_weights_only": false
22
+ },
23
+ "comm": {
24
+ "init_timeout_seconds": 300,
25
+ "trace_buf_size": 20000,
26
+ "train_timeout_seconds": 100
27
+ },
28
+ "experimental": {
29
+ "context_parallel_degree": 1,
30
+ "context_parallel_rotate_method": "allgather",
31
+ "custom_model_path": "",
32
+ "enable_async_tensor_parallel": false,
33
+ "enable_compiled_autograd": false,
34
+ "pipeline_parallel_degree": 1,
35
+ "pipeline_parallel_microbatches": null,
36
+ "pipeline_parallel_schedule": "1F1B",
37
+ "pipeline_parallel_schedule_csv": "",
38
+ "pipeline_parallel_split_points": []
39
+ },
40
+ "fault_tolerance": {
41
+ "enable": false,
42
+ "group_size": 0,
43
+ "min_replica_size": 1,
44
+ "replica_id": 0
45
+ },
46
+ "float8": {
47
+ "enable_fsdp_float8_all_gather": false,
48
+ "force_recompute_fp8_weight_in_bwd": false,
49
+ "precompute_float8_dynamic_scale_for_fsdp": false,
50
+ "recipe_name": null
51
+ },
52
+ "job": {
53
+ "config_file": "flame/models/fla.toml",
54
+ "description": "default job",
55
+ "dump_folder": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1",
56
+ "print_args": true,
57
+ "use_for_integration_test": false
58
+ },
59
+ "lr_scheduler": {
60
+ "decay_ratio": 1.0,
61
+ "decay_type": "linear",
62
+ "lr_min": 0.01,
63
+ "warmup_steps": 100
64
+ },
65
+ "memory_estimation": {
66
+ "disable_fake_mode": false,
67
+ "enabled": false
68
+ },
69
+ "metrics": {
70
+ "disable_color_printing": false,
71
+ "enable_tensorboard": true,
72
+ "enable_wandb": true,
73
+ "log_freq": 1,
74
+ "save_for_all_ranks": false,
75
+ "save_tb_folder": "tb"
76
+ },
77
+ "model": {
78
+ "config": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/configs/transformer_340M.json",
79
+ "converters": [],
80
+ "name": "fla",
81
+ "print_after_conversion": false,
82
+ "tokenizer_path": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer"
83
+ },
84
+ "optimizer": {
85
+ "early_step_in_backward": false,
86
+ "eps": 1e-08,
87
+ "implementation": "fused",
88
+ "lr": 0.0003,
89
+ "name": "AdamW"
90
+ },
91
+ "profiling": {
92
+ "enable_memory_snapshot": false,
93
+ "enable_profiling": true,
94
+ "profile_freq": 512,
95
+ "save_memory_snapshot_folder": "memory_snapshot",
96
+ "save_traces_folder": "profile_trace"
97
+ },
98
+ "training": {
99
+ "batch_size": 16,
100
+ "compile": true,
101
+ "context_len": 8192,
102
+ "data_dir": null,
103
+ "data_files": null,
104
+ "data_parallel_replicate_degree": 1,
105
+ "data_parallel_shard_degree": -1,
106
+ "data_probs": "0.55,0.3,0.15",
107
+ "dataset": "/fineweb-edu-sample,/small_repos_20B_sample_merged,/MegaMath/megamath-web-pro",
108
+ "dataset_name": "default,default,default",
109
+ "dataset_split": "train,train,train",
110
+ "deterministic": false,
111
+ "disable_loss_parallel": false,
112
+ "enable_cpu_offload": false,
113
+ "fsdp_reshard_after_forward": "default",
114
+ "gc_freq": 50,
115
+ "gradient_accumulation_steps": 1,
116
+ "max_norm": 1.0,
117
+ "mixed_precision_param": "bfloat16",
118
+ "mixed_precision_reduce": "float32",
119
+ "num_workers": 32,
120
+ "persistent_workers": false,
121
+ "pin_memory": false,
122
+ "prefetch_factor": 2,
123
+ "seed": 42,
124
+ "seq_len": 8192,
125
+ "skip_nan_inf": true,
126
+ "steps": 95366,
127
+ "streaming": true,
128
+ "tensor_parallel_degree": 1,
129
+ "varlen": false
130
+ }
131
+ }
132
+ [titan] 2025-07-22 22:38:30,496 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
133
+ [titan] 2025-07-22 22:38:30,496 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
134
+ [titan] 2025-07-22 22:38:30,514 - root - INFO - CUDA capacity: NVIDIA H20 with 95.00GiB memory
135
+ [titan] 2025-07-22 22:38:30,602 - root - WARNING - Peak flops undefined for: NVIDIA H20, fallback to A100
136
+ [titan] 2025-07-22 22:38:30,602 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
137
+ [titan] 2025-07-22 22:38:30,602 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
138
+ [titan] 2025-07-22 22:38:31,582 - root - INFO - Loading tokenizer...
139
+ [titan] 2025-07-22 22:38:32,959 - root - INFO - LlamaTokenizerFast(name_or_path='/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer', vocab_size=32000, model_max_length=10000000000, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
140
+ 0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
141
+ 1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
142
+ 2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
143
+ }
144
+ )
145
+ [titan] 2025-07-22 22:38:32,959 - root - INFO - Loading dataset /fineweb-edu-sample,/small_repos_20B_sample_merged,/MegaMath/megamath-web-pro:default,default,default
146
+ `trust_remote_code` is not supported anymore.
147
+ Please check that the Hugging Face dataset '/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
148
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
149
+ [titan] 2025-07-22 22:38:32,959 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
150
+ Please check that the Hugging Face dataset '/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
151
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
152
+ [rank0]: Traceback (most recent call last):
153
+ [rank0]: File "<frozen runpy>", line 198, in _run_module_as_main
154
+ [rank0]: File "<frozen runpy>", line 88, in _run_code
155
+ [rank0]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 615, in <module>
156
+ [rank0]: main(config)
157
+ [rank0]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
158
+ [rank0]: return f(*args, **kwargs)
159
+ [rank0]: ^^^^^^^^^^^^^^^^^^
160
+ [rank0]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 155, in main
161
+ [rank0]: dataset = build_dataset(
162
+ [rank0]: ^^^^^^^^^^^^^^
163
+ [rank0]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/data.py", line 649, in build_dataset
164
+ [rank0]: subset = load_dataset(
165
+ [rank0]: ^^^^^^^^^^^^^
166
+ [rank0]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py", line 1392, in load_dataset
167
+ [rank0]: builder_instance = load_dataset_builder(
168
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^
169
+ [rank0]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py", line 1132, in load_dataset_builder
170
+ [rank0]: dataset_module = dataset_module_factory(
171
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^
172
+ [rank0]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py", line 1033, in dataset_module_factory
173
+ [rank0]: raise FileNotFoundError(f"Couldn't find any data file at {relative_to_absolute_path(path)}.")
174
+ [rank0]: FileNotFoundError: Couldn't find any data file at /fineweb-edu-sample.
175
+ [rank0]:[W722 22:38:33.059900780 ProcessGroupNCCL.cpp:1479] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/0/stdout.log ADDED
File without changes
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/1/error.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"message": {"message": "FileNotFoundError: Couldn't find any data file at /fineweb-edu-sample.", "extraInfo": {"py_callstack": "Traceback (most recent call last):\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py\", line 355, in wrapper\n return f(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py\", line 155, in main\n dataset = build_dataset(\n ^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/data.py\", line 649, in build_dataset\n subset = load_dataset(\n ^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py\", line 1392, in load_dataset\n builder_instance = load_dataset_builder(\n ^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py\", line 1132, in load_dataset_builder\n dataset_module = dataset_module_factory(\n ^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py\", line 1033, in dataset_module_factory\n raise FileNotFoundError(f\"Couldn't find any data file at {relative_to_absolute_path(path)}.\")\nFileNotFoundError: Couldn't find any data file at /fineweb-edu-sample.\n", "timestamp": "1753195112"}}}
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/1/stderr.log ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [titan] 2025-07-22 22:38:30,567 - root - INFO - Starting job: default job
2
+ [titan] 2025-07-22 22:38:30,568 - root - INFO - {
3
+ "activation_checkpoint": {
4
+ "mode": "none",
5
+ "selective_ac_option": "2"
6
+ },
7
+ "activation_offload": {
8
+ "mode": "none"
9
+ },
10
+ "checkpoint": {
11
+ "async_mode": "disabled",
12
+ "create_seed_checkpoint": false,
13
+ "enable_checkpoint": true,
14
+ "exclude_from_loading": [],
15
+ "export_dtype": "float32",
16
+ "folder": "checkpoint",
17
+ "interval": 8192,
18
+ "interval_type": "steps",
19
+ "keep_latest_k": 100,
20
+ "load_step": -1,
21
+ "model_weights_only": false
22
+ },
23
+ "comm": {
24
+ "init_timeout_seconds": 300,
25
+ "trace_buf_size": 20000,
26
+ "train_timeout_seconds": 100
27
+ },
28
+ "experimental": {
29
+ "context_parallel_degree": 1,
30
+ "context_parallel_rotate_method": "allgather",
31
+ "custom_model_path": "",
32
+ "enable_async_tensor_parallel": false,
33
+ "enable_compiled_autograd": false,
34
+ "pipeline_parallel_degree": 1,
35
+ "pipeline_parallel_microbatches": null,
36
+ "pipeline_parallel_schedule": "1F1B",
37
+ "pipeline_parallel_schedule_csv": "",
38
+ "pipeline_parallel_split_points": []
39
+ },
40
+ "fault_tolerance": {
41
+ "enable": false,
42
+ "group_size": 0,
43
+ "min_replica_size": 1,
44
+ "replica_id": 0
45
+ },
46
+ "float8": {
47
+ "enable_fsdp_float8_all_gather": false,
48
+ "force_recompute_fp8_weight_in_bwd": false,
49
+ "precompute_float8_dynamic_scale_for_fsdp": false,
50
+ "recipe_name": null
51
+ },
52
+ "job": {
53
+ "config_file": "flame/models/fla.toml",
54
+ "description": "default job",
55
+ "dump_folder": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1",
56
+ "print_args": true,
57
+ "use_for_integration_test": false
58
+ },
59
+ "lr_scheduler": {
60
+ "decay_ratio": 1.0,
61
+ "decay_type": "linear",
62
+ "lr_min": 0.01,
63
+ "warmup_steps": 100
64
+ },
65
+ "memory_estimation": {
66
+ "disable_fake_mode": false,
67
+ "enabled": false
68
+ },
69
+ "metrics": {
70
+ "disable_color_printing": false,
71
+ "enable_tensorboard": true,
72
+ "enable_wandb": true,
73
+ "log_freq": 1,
74
+ "save_for_all_ranks": false,
75
+ "save_tb_folder": "tb"
76
+ },
77
+ "model": {
78
+ "config": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/configs/transformer_340M.json",
79
+ "converters": [],
80
+ "name": "fla",
81
+ "print_after_conversion": false,
82
+ "tokenizer_path": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer"
83
+ },
84
+ "optimizer": {
85
+ "early_step_in_backward": false,
86
+ "eps": 1e-08,
87
+ "implementation": "fused",
88
+ "lr": 0.0003,
89
+ "name": "AdamW"
90
+ },
91
+ "profiling": {
92
+ "enable_memory_snapshot": false,
93
+ "enable_profiling": true,
94
+ "profile_freq": 512,
95
+ "save_memory_snapshot_folder": "memory_snapshot",
96
+ "save_traces_folder": "profile_trace"
97
+ },
98
+ "training": {
99
+ "batch_size": 16,
100
+ "compile": true,
101
+ "context_len": 8192,
102
+ "data_dir": null,
103
+ "data_files": null,
104
+ "data_parallel_replicate_degree": 1,
105
+ "data_parallel_shard_degree": -1,
106
+ "data_probs": "0.55,0.3,0.15",
107
+ "dataset": "/fineweb-edu-sample,/small_repos_20B_sample_merged,/MegaMath/megamath-web-pro",
108
+ "dataset_name": "default,default,default",
109
+ "dataset_split": "train,train,train",
110
+ "deterministic": false,
111
+ "disable_loss_parallel": false,
112
+ "enable_cpu_offload": false,
113
+ "fsdp_reshard_after_forward": "default",
114
+ "gc_freq": 50,
115
+ "gradient_accumulation_steps": 1,
116
+ "max_norm": 1.0,
117
+ "mixed_precision_param": "bfloat16",
118
+ "mixed_precision_reduce": "float32",
119
+ "num_workers": 32,
120
+ "persistent_workers": false,
121
+ "pin_memory": false,
122
+ "prefetch_factor": 2,
123
+ "seed": 42,
124
+ "seq_len": 8192,
125
+ "skip_nan_inf": true,
126
+ "steps": 95366,
127
+ "streaming": true,
128
+ "tensor_parallel_degree": 1,
129
+ "varlen": false
130
+ }
131
+ }
132
+ [titan] 2025-07-22 22:38:30,568 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
133
+ [titan] 2025-07-22 22:38:31,570 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
134
+ [titan] 2025-07-22 22:38:31,572 - root - INFO - CUDA capacity: NVIDIA H20 with 95.00GiB memory
135
+ [titan] 2025-07-22 22:38:31,659 - root - WARNING - Peak flops undefined for: NVIDIA H20, fallback to A100
136
+ [titan] 2025-07-22 22:38:31,659 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
137
+ [titan] 2025-07-22 22:38:31,659 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
138
+ [titan] 2025-07-22 22:38:31,687 - root - INFO - Loading tokenizer...
139
+ [titan] 2025-07-22 22:38:32,959 - root - INFO - LlamaTokenizerFast(name_or_path='/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer', vocab_size=32000, model_max_length=10000000000, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
140
+ 0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
141
+ 1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
142
+ 2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
143
+ }
144
+ )
145
+ [titan] 2025-07-22 22:38:32,959 - root - INFO - Loading dataset /fineweb-edu-sample,/small_repos_20B_sample_merged,/MegaMath/megamath-web-pro:default,default,default
146
+ `trust_remote_code` is not supported anymore.
147
+ Please check that the Hugging Face dataset '/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
148
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
149
+ [titan] 2025-07-22 22:38:32,959 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
150
+ Please check that the Hugging Face dataset '/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
151
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
152
+ [rank1]: Traceback (most recent call last):
153
+ [rank1]: File "<frozen runpy>", line 198, in _run_module_as_main
154
+ [rank1]: File "<frozen runpy>", line 88, in _run_code
155
+ [rank1]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 615, in <module>
156
+ [rank1]: main(config)
157
+ [rank1]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
158
+ [rank1]: return f(*args, **kwargs)
159
+ [rank1]: ^^^^^^^^^^^^^^^^^^
160
+ [rank1]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 155, in main
161
+ [rank1]: dataset = build_dataset(
162
+ [rank1]: ^^^^^^^^^^^^^^
163
+ [rank1]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/data.py", line 649, in build_dataset
164
+ [rank1]: subset = load_dataset(
165
+ [rank1]: ^^^^^^^^^^^^^
166
+ [rank1]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py", line 1392, in load_dataset
167
+ [rank1]: builder_instance = load_dataset_builder(
168
+ [rank1]: ^^^^^^^^^^^^^^^^^^^^^
169
+ [rank1]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py", line 1132, in load_dataset_builder
170
+ [rank1]: dataset_module = dataset_module_factory(
171
+ [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^
172
+ [rank1]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py", line 1033, in dataset_module_factory
173
+ [rank1]: raise FileNotFoundError(f"Couldn't find any data file at {relative_to_absolute_path(path)}.")
174
+ [rank1]: FileNotFoundError: Couldn't find any data file at /fineweb-edu-sample.
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/1/stdout.log ADDED
File without changes
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/2/error.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"message": {"message": "FileNotFoundError: Couldn't find any data file at /fineweb-edu-sample.", "extraInfo": {"py_callstack": "Traceback (most recent call last):\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py\", line 355, in wrapper\n return f(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py\", line 155, in main\n dataset = build_dataset(\n ^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/data.py\", line 649, in build_dataset\n subset = load_dataset(\n ^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py\", line 1392, in load_dataset\n builder_instance = load_dataset_builder(\n ^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py\", line 1132, in load_dataset_builder\n dataset_module = dataset_module_factory(\n ^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py\", line 1033, in dataset_module_factory\n raise FileNotFoundError(f\"Couldn't find any data file at {relative_to_absolute_path(path)}.\")\nFileNotFoundError: Couldn't find any data file at /fineweb-edu-sample.\n", "timestamp": "1753195112"}}}
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/2/stderr.log ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [titan] 2025-07-22 22:38:30,455 - root - INFO - Starting job: default job
2
+ [titan] 2025-07-22 22:38:30,456 - root - INFO - {
3
+ "activation_checkpoint": {
4
+ "mode": "none",
5
+ "selective_ac_option": "2"
6
+ },
7
+ "activation_offload": {
8
+ "mode": "none"
9
+ },
10
+ "checkpoint": {
11
+ "async_mode": "disabled",
12
+ "create_seed_checkpoint": false,
13
+ "enable_checkpoint": true,
14
+ "exclude_from_loading": [],
15
+ "export_dtype": "float32",
16
+ "folder": "checkpoint",
17
+ "interval": 8192,
18
+ "interval_type": "steps",
19
+ "keep_latest_k": 100,
20
+ "load_step": -1,
21
+ "model_weights_only": false
22
+ },
23
+ "comm": {
24
+ "init_timeout_seconds": 300,
25
+ "trace_buf_size": 20000,
26
+ "train_timeout_seconds": 100
27
+ },
28
+ "experimental": {
29
+ "context_parallel_degree": 1,
30
+ "context_parallel_rotate_method": "allgather",
31
+ "custom_model_path": "",
32
+ "enable_async_tensor_parallel": false,
33
+ "enable_compiled_autograd": false,
34
+ "pipeline_parallel_degree": 1,
35
+ "pipeline_parallel_microbatches": null,
36
+ "pipeline_parallel_schedule": "1F1B",
37
+ "pipeline_parallel_schedule_csv": "",
38
+ "pipeline_parallel_split_points": []
39
+ },
40
+ "fault_tolerance": {
41
+ "enable": false,
42
+ "group_size": 0,
43
+ "min_replica_size": 1,
44
+ "replica_id": 0
45
+ },
46
+ "float8": {
47
+ "enable_fsdp_float8_all_gather": false,
48
+ "force_recompute_fp8_weight_in_bwd": false,
49
+ "precompute_float8_dynamic_scale_for_fsdp": false,
50
+ "recipe_name": null
51
+ },
52
+ "job": {
53
+ "config_file": "flame/models/fla.toml",
54
+ "description": "default job",
55
+ "dump_folder": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1",
56
+ "print_args": true,
57
+ "use_for_integration_test": false
58
+ },
59
+ "lr_scheduler": {
60
+ "decay_ratio": 1.0,
61
+ "decay_type": "linear",
62
+ "lr_min": 0.01,
63
+ "warmup_steps": 100
64
+ },
65
+ "memory_estimation": {
66
+ "disable_fake_mode": false,
67
+ "enabled": false
68
+ },
69
+ "metrics": {
70
+ "disable_color_printing": false,
71
+ "enable_tensorboard": true,
72
+ "enable_wandb": true,
73
+ "log_freq": 1,
74
+ "save_for_all_ranks": false,
75
+ "save_tb_folder": "tb"
76
+ },
77
+ "model": {
78
+ "config": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/configs/transformer_340M.json",
79
+ "converters": [],
80
+ "name": "fla",
81
+ "print_after_conversion": false,
82
+ "tokenizer_path": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer"
83
+ },
84
+ "optimizer": {
85
+ "early_step_in_backward": false,
86
+ "eps": 1e-08,
87
+ "implementation": "fused",
88
+ "lr": 0.0003,
89
+ "name": "AdamW"
90
+ },
91
+ "profiling": {
92
+ "enable_memory_snapshot": false,
93
+ "enable_profiling": true,
94
+ "profile_freq": 512,
95
+ "save_memory_snapshot_folder": "memory_snapshot",
96
+ "save_traces_folder": "profile_trace"
97
+ },
98
+ "training": {
99
+ "batch_size": 16,
100
+ "compile": true,
101
+ "context_len": 8192,
102
+ "data_dir": null,
103
+ "data_files": null,
104
+ "data_parallel_replicate_degree": 1,
105
+ "data_parallel_shard_degree": -1,
106
+ "data_probs": "0.55,0.3,0.15",
107
+ "dataset": "/fineweb-edu-sample,/small_repos_20B_sample_merged,/MegaMath/megamath-web-pro",
108
+ "dataset_name": "default,default,default",
109
+ "dataset_split": "train,train,train",
110
+ "deterministic": false,
111
+ "disable_loss_parallel": false,
112
+ "enable_cpu_offload": false,
113
+ "fsdp_reshard_after_forward": "default",
114
+ "gc_freq": 50,
115
+ "gradient_accumulation_steps": 1,
116
+ "max_norm": 1.0,
117
+ "mixed_precision_param": "bfloat16",
118
+ "mixed_precision_reduce": "float32",
119
+ "num_workers": 32,
120
+ "persistent_workers": false,
121
+ "pin_memory": false,
122
+ "prefetch_factor": 2,
123
+ "seed": 42,
124
+ "seq_len": 8192,
125
+ "skip_nan_inf": true,
126
+ "steps": 95366,
127
+ "streaming": true,
128
+ "tensor_parallel_degree": 1,
129
+ "varlen": false
130
+ }
131
+ }
132
+ [titan] 2025-07-22 22:38:30,457 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
133
+ [titan] 2025-07-22 22:38:31,554 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
134
+ [titan] 2025-07-22 22:38:31,557 - root - INFO - CUDA capacity: NVIDIA H20 with 95.00GiB memory
135
+ [titan] 2025-07-22 22:38:31,609 - root - WARNING - Peak flops undefined for: NVIDIA H20, fallback to A100
136
+ [titan] 2025-07-22 22:38:31,610 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
137
+ [titan] 2025-07-22 22:38:31,610 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
138
+ [titan] 2025-07-22 22:38:31,634 - root - INFO - Loading tokenizer...
139
+ [titan] 2025-07-22 22:38:32,962 - root - INFO - LlamaTokenizerFast(name_or_path='/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer', vocab_size=32000, model_max_length=10000000000, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
140
+ 0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
141
+ 1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
142
+ 2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
143
+ }
144
+ )
145
+ [titan] 2025-07-22 22:38:32,963 - root - INFO - Loading dataset /fineweb-edu-sample,/small_repos_20B_sample_merged,/MegaMath/megamath-web-pro:default,default,default
146
+ `trust_remote_code` is not supported anymore.
147
+ Please check that the Hugging Face dataset '/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
148
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
149
+ [titan] 2025-07-22 22:38:32,963 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
150
+ Please check that the Hugging Face dataset '/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
151
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
152
+ [rank2]: Traceback (most recent call last):
153
+ [rank2]: File "<frozen runpy>", line 198, in _run_module_as_main
154
+ [rank2]: File "<frozen runpy>", line 88, in _run_code
155
+ [rank2]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 615, in <module>
156
+ [rank2]: main(config)
157
+ [rank2]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
158
+ [rank2]: return f(*args, **kwargs)
159
+ [rank2]: ^^^^^^^^^^^^^^^^^^
160
+ [rank2]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 155, in main
161
+ [rank2]: dataset = build_dataset(
162
+ [rank2]: ^^^^^^^^^^^^^^
163
+ [rank2]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/data.py", line 649, in build_dataset
164
+ [rank2]: subset = load_dataset(
165
+ [rank2]: ^^^^^^^^^^^^^
166
+ [rank2]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py", line 1392, in load_dataset
167
+ [rank2]: builder_instance = load_dataset_builder(
168
+ [rank2]: ^^^^^^^^^^^^^^^^^^^^^
169
+ [rank2]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py", line 1132, in load_dataset_builder
170
+ [rank2]: dataset_module = dataset_module_factory(
171
+ [rank2]: ^^^^^^^^^^^^^^^^^^^^^^^
172
+ [rank2]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py", line 1033, in dataset_module_factory
173
+ [rank2]: raise FileNotFoundError(f"Couldn't find any data file at {relative_to_absolute_path(path)}.")
174
+ [rank2]: FileNotFoundError: Couldn't find any data file at /fineweb-edu-sample.
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/2/stdout.log ADDED
File without changes
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/3/error.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"message": {"message": "FileNotFoundError: Couldn't find any data file at /fineweb-edu-sample.", "extraInfo": {"py_callstack": "Traceback (most recent call last):\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py\", line 355, in wrapper\n return f(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py\", line 155, in main\n dataset = build_dataset(\n ^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/data.py\", line 649, in build_dataset\n subset = load_dataset(\n ^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py\", line 1392, in load_dataset\n builder_instance = load_dataset_builder(\n ^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py\", line 1132, in load_dataset_builder\n dataset_module = dataset_module_factory(\n ^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py\", line 1033, in dataset_module_factory\n raise FileNotFoundError(f\"Couldn't find any data file at {relative_to_absolute_path(path)}.\")\nFileNotFoundError: Couldn't find any data file at /fineweb-edu-sample.\n", "timestamp": "1753195112"}}}
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/3/stderr.log ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [titan] 2025-07-22 22:38:30,383 - root - INFO - Starting job: default job
2
+ [titan] 2025-07-22 22:38:30,384 - root - INFO - {
3
+ "activation_checkpoint": {
4
+ "mode": "none",
5
+ "selective_ac_option": "2"
6
+ },
7
+ "activation_offload": {
8
+ "mode": "none"
9
+ },
10
+ "checkpoint": {
11
+ "async_mode": "disabled",
12
+ "create_seed_checkpoint": false,
13
+ "enable_checkpoint": true,
14
+ "exclude_from_loading": [],
15
+ "export_dtype": "float32",
16
+ "folder": "checkpoint",
17
+ "interval": 8192,
18
+ "interval_type": "steps",
19
+ "keep_latest_k": 100,
20
+ "load_step": -1,
21
+ "model_weights_only": false
22
+ },
23
+ "comm": {
24
+ "init_timeout_seconds": 300,
25
+ "trace_buf_size": 20000,
26
+ "train_timeout_seconds": 100
27
+ },
28
+ "experimental": {
29
+ "context_parallel_degree": 1,
30
+ "context_parallel_rotate_method": "allgather",
31
+ "custom_model_path": "",
32
+ "enable_async_tensor_parallel": false,
33
+ "enable_compiled_autograd": false,
34
+ "pipeline_parallel_degree": 1,
35
+ "pipeline_parallel_microbatches": null,
36
+ "pipeline_parallel_schedule": "1F1B",
37
+ "pipeline_parallel_schedule_csv": "",
38
+ "pipeline_parallel_split_points": []
39
+ },
40
+ "fault_tolerance": {
41
+ "enable": false,
42
+ "group_size": 0,
43
+ "min_replica_size": 1,
44
+ "replica_id": 0
45
+ },
46
+ "float8": {
47
+ "enable_fsdp_float8_all_gather": false,
48
+ "force_recompute_fp8_weight_in_bwd": false,
49
+ "precompute_float8_dynamic_scale_for_fsdp": false,
50
+ "recipe_name": null
51
+ },
52
+ "job": {
53
+ "config_file": "flame/models/fla.toml",
54
+ "description": "default job",
55
+ "dump_folder": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1",
56
+ "print_args": true,
57
+ "use_for_integration_test": false
58
+ },
59
+ "lr_scheduler": {
60
+ "decay_ratio": 1.0,
61
+ "decay_type": "linear",
62
+ "lr_min": 0.01,
63
+ "warmup_steps": 100
64
+ },
65
+ "memory_estimation": {
66
+ "disable_fake_mode": false,
67
+ "enabled": false
68
+ },
69
+ "metrics": {
70
+ "disable_color_printing": false,
71
+ "enable_tensorboard": true,
72
+ "enable_wandb": true,
73
+ "log_freq": 1,
74
+ "save_for_all_ranks": false,
75
+ "save_tb_folder": "tb"
76
+ },
77
+ "model": {
78
+ "config": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/configs/transformer_340M.json",
79
+ "converters": [],
80
+ "name": "fla",
81
+ "print_after_conversion": false,
82
+ "tokenizer_path": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer"
83
+ },
84
+ "optimizer": {
85
+ "early_step_in_backward": false,
86
+ "eps": 1e-08,
87
+ "implementation": "fused",
88
+ "lr": 0.0003,
89
+ "name": "AdamW"
90
+ },
91
+ "profiling": {
92
+ "enable_memory_snapshot": false,
93
+ "enable_profiling": true,
94
+ "profile_freq": 512,
95
+ "save_memory_snapshot_folder": "memory_snapshot",
96
+ "save_traces_folder": "profile_trace"
97
+ },
98
+ "training": {
99
+ "batch_size": 16,
100
+ "compile": true,
101
+ "context_len": 8192,
102
+ "data_dir": null,
103
+ "data_files": null,
104
+ "data_parallel_replicate_degree": 1,
105
+ "data_parallel_shard_degree": -1,
106
+ "data_probs": "0.55,0.3,0.15",
107
+ "dataset": "/fineweb-edu-sample,/small_repos_20B_sample_merged,/MegaMath/megamath-web-pro",
108
+ "dataset_name": "default,default,default",
109
+ "dataset_split": "train,train,train",
110
+ "deterministic": false,
111
+ "disable_loss_parallel": false,
112
+ "enable_cpu_offload": false,
113
+ "fsdp_reshard_after_forward": "default",
114
+ "gc_freq": 50,
115
+ "gradient_accumulation_steps": 1,
116
+ "max_norm": 1.0,
117
+ "mixed_precision_param": "bfloat16",
118
+ "mixed_precision_reduce": "float32",
119
+ "num_workers": 32,
120
+ "persistent_workers": false,
121
+ "pin_memory": false,
122
+ "prefetch_factor": 2,
123
+ "seed": 42,
124
+ "seq_len": 8192,
125
+ "skip_nan_inf": true,
126
+ "steps": 95366,
127
+ "streaming": true,
128
+ "tensor_parallel_degree": 1,
129
+ "varlen": false
130
+ }
131
+ }
132
+ [titan] 2025-07-22 22:38:30,385 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
133
+ [titan] 2025-07-22 22:38:31,417 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
134
+ [titan] 2025-07-22 22:38:31,419 - root - INFO - CUDA capacity: NVIDIA H20 with 95.00GiB memory
135
+ [titan] 2025-07-22 22:38:31,456 - root - WARNING - Peak flops undefined for: NVIDIA H20, fallback to A100
136
+ [titan] 2025-07-22 22:38:31,456 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
137
+ [titan] 2025-07-22 22:38:31,457 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
138
+ [titan] 2025-07-22 22:38:31,526 - root - INFO - Loading tokenizer...
139
+ [titan] 2025-07-22 22:38:32,964 - root - INFO - LlamaTokenizerFast(name_or_path='/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer', vocab_size=32000, model_max_length=10000000000, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
140
+ 0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
141
+ 1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
142
+ 2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
143
+ }
144
+ )
145
+ [titan] 2025-07-22 22:38:32,964 - root - INFO - Loading dataset /fineweb-edu-sample,/small_repos_20B_sample_merged,/MegaMath/megamath-web-pro:default,default,default
146
+ `trust_remote_code` is not supported anymore.
147
+ Please check that the Hugging Face dataset '/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
148
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
149
+ [titan] 2025-07-22 22:38:32,964 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
150
+ Please check that the Hugging Face dataset '/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
151
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
152
+ [rank3]: Traceback (most recent call last):
153
+ [rank3]: File "<frozen runpy>", line 198, in _run_module_as_main
154
+ [rank3]: File "<frozen runpy>", line 88, in _run_code
155
+ [rank3]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 615, in <module>
156
+ [rank3]: main(config)
157
+ [rank3]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
158
+ [rank3]: return f(*args, **kwargs)
159
+ [rank3]: ^^^^^^^^^^^^^^^^^^
160
+ [rank3]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 155, in main
161
+ [rank3]: dataset = build_dataset(
162
+ [rank3]: ^^^^^^^^^^^^^^
163
+ [rank3]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/data.py", line 649, in build_dataset
164
+ [rank3]: subset = load_dataset(
165
+ [rank3]: ^^^^^^^^^^^^^
166
+ [rank3]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py", line 1392, in load_dataset
167
+ [rank3]: builder_instance = load_dataset_builder(
168
+ [rank3]: ^^^^^^^^^^^^^^^^^^^^^
169
+ [rank3]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py", line 1132, in load_dataset_builder
170
+ [rank3]: dataset_module = dataset_module_factory(
171
+ [rank3]: ^^^^^^^^^^^^^^^^^^^^^^^
172
+ [rank3]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py", line 1033, in dataset_module_factory
173
+ [rank3]: raise FileNotFoundError(f"Couldn't find any data file at {relative_to_absolute_path(path)}.")
174
+ [rank3]: FileNotFoundError: Couldn't find any data file at /fineweb-edu-sample.
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/3/stdout.log ADDED
File without changes
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/4/error.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"message": {"message": "FileNotFoundError: Couldn't find any data file at /fineweb-edu-sample.", "extraInfo": {"py_callstack": "Traceback (most recent call last):\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py\", line 355, in wrapper\n return f(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py\", line 155, in main\n dataset = build_dataset(\n ^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/data.py\", line 649, in build_dataset\n subset = load_dataset(\n ^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py\", line 1392, in load_dataset\n builder_instance = load_dataset_builder(\n ^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py\", line 1132, in load_dataset_builder\n dataset_module = dataset_module_factory(\n ^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py\", line 1033, in dataset_module_factory\n raise FileNotFoundError(f\"Couldn't find any data file at {relative_to_absolute_path(path)}.\")\nFileNotFoundError: Couldn't find any data file at /fineweb-edu-sample.\n", "timestamp": "1753195112"}}}
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/4/stderr.log ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [titan] 2025-07-22 22:38:30,292 - root - INFO - Starting job: default job
2
+ [titan] 2025-07-22 22:38:30,292 - root - INFO - {
3
+ "activation_checkpoint": {
4
+ "mode": "none",
5
+ "selective_ac_option": "2"
6
+ },
7
+ "activation_offload": {
8
+ "mode": "none"
9
+ },
10
+ "checkpoint": {
11
+ "async_mode": "disabled",
12
+ "create_seed_checkpoint": false,
13
+ "enable_checkpoint": true,
14
+ "exclude_from_loading": [],
15
+ "export_dtype": "float32",
16
+ "folder": "checkpoint",
17
+ "interval": 8192,
18
+ "interval_type": "steps",
19
+ "keep_latest_k": 100,
20
+ "load_step": -1,
21
+ "model_weights_only": false
22
+ },
23
+ "comm": {
24
+ "init_timeout_seconds": 300,
25
+ "trace_buf_size": 20000,
26
+ "train_timeout_seconds": 100
27
+ },
28
+ "experimental": {
29
+ "context_parallel_degree": 1,
30
+ "context_parallel_rotate_method": "allgather",
31
+ "custom_model_path": "",
32
+ "enable_async_tensor_parallel": false,
33
+ "enable_compiled_autograd": false,
34
+ "pipeline_parallel_degree": 1,
35
+ "pipeline_parallel_microbatches": null,
36
+ "pipeline_parallel_schedule": "1F1B",
37
+ "pipeline_parallel_schedule_csv": "",
38
+ "pipeline_parallel_split_points": []
39
+ },
40
+ "fault_tolerance": {
41
+ "enable": false,
42
+ "group_size": 0,
43
+ "min_replica_size": 1,
44
+ "replica_id": 0
45
+ },
46
+ "float8": {
47
+ "enable_fsdp_float8_all_gather": false,
48
+ "force_recompute_fp8_weight_in_bwd": false,
49
+ "precompute_float8_dynamic_scale_for_fsdp": false,
50
+ "recipe_name": null
51
+ },
52
+ "job": {
53
+ "config_file": "flame/models/fla.toml",
54
+ "description": "default job",
55
+ "dump_folder": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1",
56
+ "print_args": true,
57
+ "use_for_integration_test": false
58
+ },
59
+ "lr_scheduler": {
60
+ "decay_ratio": 1.0,
61
+ "decay_type": "linear",
62
+ "lr_min": 0.01,
63
+ "warmup_steps": 100
64
+ },
65
+ "memory_estimation": {
66
+ "disable_fake_mode": false,
67
+ "enabled": false
68
+ },
69
+ "metrics": {
70
+ "disable_color_printing": false,
71
+ "enable_tensorboard": true,
72
+ "enable_wandb": true,
73
+ "log_freq": 1,
74
+ "save_for_all_ranks": false,
75
+ "save_tb_folder": "tb"
76
+ },
77
+ "model": {
78
+ "config": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/configs/transformer_340M.json",
79
+ "converters": [],
80
+ "name": "fla",
81
+ "print_after_conversion": false,
82
+ "tokenizer_path": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer"
83
+ },
84
+ "optimizer": {
85
+ "early_step_in_backward": false,
86
+ "eps": 1e-08,
87
+ "implementation": "fused",
88
+ "lr": 0.0003,
89
+ "name": "AdamW"
90
+ },
91
+ "profiling": {
92
+ "enable_memory_snapshot": false,
93
+ "enable_profiling": true,
94
+ "profile_freq": 512,
95
+ "save_memory_snapshot_folder": "memory_snapshot",
96
+ "save_traces_folder": "profile_trace"
97
+ },
98
+ "training": {
99
+ "batch_size": 16,
100
+ "compile": true,
101
+ "context_len": 8192,
102
+ "data_dir": null,
103
+ "data_files": null,
104
+ "data_parallel_replicate_degree": 1,
105
+ "data_parallel_shard_degree": -1,
106
+ "data_probs": "0.55,0.3,0.15",
107
+ "dataset": "/fineweb-edu-sample,/small_repos_20B_sample_merged,/MegaMath/megamath-web-pro",
108
+ "dataset_name": "default,default,default",
109
+ "dataset_split": "train,train,train",
110
+ "deterministic": false,
111
+ "disable_loss_parallel": false,
112
+ "enable_cpu_offload": false,
113
+ "fsdp_reshard_after_forward": "default",
114
+ "gc_freq": 50,
115
+ "gradient_accumulation_steps": 1,
116
+ "max_norm": 1.0,
117
+ "mixed_precision_param": "bfloat16",
118
+ "mixed_precision_reduce": "float32",
119
+ "num_workers": 32,
120
+ "persistent_workers": false,
121
+ "pin_memory": false,
122
+ "prefetch_factor": 2,
123
+ "seed": 42,
124
+ "seq_len": 8192,
125
+ "skip_nan_inf": true,
126
+ "steps": 95366,
127
+ "streaming": true,
128
+ "tensor_parallel_degree": 1,
129
+ "varlen": false
130
+ }
131
+ }
132
+ [titan] 2025-07-22 22:38:30,293 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
133
+ [titan] 2025-07-22 22:38:31,334 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
134
+ [titan] 2025-07-22 22:38:31,337 - root - INFO - CUDA capacity: NVIDIA H20 with 95.00GiB memory
135
+ [titan] 2025-07-22 22:38:31,401 - root - WARNING - Peak flops undefined for: NVIDIA H20, fallback to A100
136
+ [titan] 2025-07-22 22:38:31,402 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
137
+ [titan] 2025-07-22 22:38:31,402 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
138
+ [titan] 2025-07-22 22:38:31,526 - root - INFO - Loading tokenizer...
139
+ [titan] 2025-07-22 22:38:32,961 - root - INFO - LlamaTokenizerFast(name_or_path='/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer', vocab_size=32000, model_max_length=10000000000, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
140
+ 0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
141
+ 1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
142
+ 2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
143
+ }
144
+ )
145
+ [titan] 2025-07-22 22:38:32,962 - root - INFO - Loading dataset /fineweb-edu-sample,/small_repos_20B_sample_merged,/MegaMath/megamath-web-pro:default,default,default
146
+ `trust_remote_code` is not supported anymore.
147
+ Please check that the Hugging Face dataset '/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
148
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
149
+ [titan] 2025-07-22 22:38:32,962 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
150
+ Please check that the Hugging Face dataset '/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
151
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
152
+ [rank4]: Traceback (most recent call last):
153
+ [rank4]: File "<frozen runpy>", line 198, in _run_module_as_main
154
+ [rank4]: File "<frozen runpy>", line 88, in _run_code
155
+ [rank4]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 615, in <module>
156
+ [rank4]: main(config)
157
+ [rank4]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
158
+ [rank4]: return f(*args, **kwargs)
159
+ [rank4]: ^^^^^^^^^^^^^^^^^^
160
+ [rank4]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 155, in main
161
+ [rank4]: dataset = build_dataset(
162
+ [rank4]: ^^^^^^^^^^^^^^
163
+ [rank4]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/data.py", line 649, in build_dataset
164
+ [rank4]: subset = load_dataset(
165
+ [rank4]: ^^^^^^^^^^^^^
166
+ [rank4]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py", line 1392, in load_dataset
167
+ [rank4]: builder_instance = load_dataset_builder(
168
+ [rank4]: ^^^^^^^^^^^^^^^^^^^^^
169
+ [rank4]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py", line 1132, in load_dataset_builder
170
+ [rank4]: dataset_module = dataset_module_factory(
171
+ [rank4]: ^^^^^^^^^^^^^^^^^^^^^^^
172
+ [rank4]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py", line 1033, in dataset_module_factory
173
+ [rank4]: raise FileNotFoundError(f"Couldn't find any data file at {relative_to_absolute_path(path)}.")
174
+ [rank4]: FileNotFoundError: Couldn't find any data file at /fineweb-edu-sample.
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/4/stdout.log ADDED
File without changes
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/5/error.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"message": {"message": "FileNotFoundError: Couldn't find any data file at /fineweb-edu-sample.", "extraInfo": {"py_callstack": "Traceback (most recent call last):\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py\", line 355, in wrapper\n return f(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py\", line 155, in main\n dataset = build_dataset(\n ^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/data.py\", line 649, in build_dataset\n subset = load_dataset(\n ^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py\", line 1392, in load_dataset\n builder_instance = load_dataset_builder(\n ^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py\", line 1132, in load_dataset_builder\n dataset_module = dataset_module_factory(\n ^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py\", line 1033, in dataset_module_factory\n raise FileNotFoundError(f\"Couldn't find any data file at {relative_to_absolute_path(path)}.\")\nFileNotFoundError: Couldn't find any data file at /fineweb-edu-sample.\n", "timestamp": "1753195112"}}}
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/5/stderr.log ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [titan] 2025-07-22 22:38:30,490 - root - INFO - Starting job: default job
2
+ [titan] 2025-07-22 22:38:30,491 - root - INFO - {
3
+ "activation_checkpoint": {
4
+ "mode": "none",
5
+ "selective_ac_option": "2"
6
+ },
7
+ "activation_offload": {
8
+ "mode": "none"
9
+ },
10
+ "checkpoint": {
11
+ "async_mode": "disabled",
12
+ "create_seed_checkpoint": false,
13
+ "enable_checkpoint": true,
14
+ "exclude_from_loading": [],
15
+ "export_dtype": "float32",
16
+ "folder": "checkpoint",
17
+ "interval": 8192,
18
+ "interval_type": "steps",
19
+ "keep_latest_k": 100,
20
+ "load_step": -1,
21
+ "model_weights_only": false
22
+ },
23
+ "comm": {
24
+ "init_timeout_seconds": 300,
25
+ "trace_buf_size": 20000,
26
+ "train_timeout_seconds": 100
27
+ },
28
+ "experimental": {
29
+ "context_parallel_degree": 1,
30
+ "context_parallel_rotate_method": "allgather",
31
+ "custom_model_path": "",
32
+ "enable_async_tensor_parallel": false,
33
+ "enable_compiled_autograd": false,
34
+ "pipeline_parallel_degree": 1,
35
+ "pipeline_parallel_microbatches": null,
36
+ "pipeline_parallel_schedule": "1F1B",
37
+ "pipeline_parallel_schedule_csv": "",
38
+ "pipeline_parallel_split_points": []
39
+ },
40
+ "fault_tolerance": {
41
+ "enable": false,
42
+ "group_size": 0,
43
+ "min_replica_size": 1,
44
+ "replica_id": 0
45
+ },
46
+ "float8": {
47
+ "enable_fsdp_float8_all_gather": false,
48
+ "force_recompute_fp8_weight_in_bwd": false,
49
+ "precompute_float8_dynamic_scale_for_fsdp": false,
50
+ "recipe_name": null
51
+ },
52
+ "job": {
53
+ "config_file": "flame/models/fla.toml",
54
+ "description": "default job",
55
+ "dump_folder": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1",
56
+ "print_args": true,
57
+ "use_for_integration_test": false
58
+ },
59
+ "lr_scheduler": {
60
+ "decay_ratio": 1.0,
61
+ "decay_type": "linear",
62
+ "lr_min": 0.01,
63
+ "warmup_steps": 100
64
+ },
65
+ "memory_estimation": {
66
+ "disable_fake_mode": false,
67
+ "enabled": false
68
+ },
69
+ "metrics": {
70
+ "disable_color_printing": false,
71
+ "enable_tensorboard": true,
72
+ "enable_wandb": true,
73
+ "log_freq": 1,
74
+ "save_for_all_ranks": false,
75
+ "save_tb_folder": "tb"
76
+ },
77
+ "model": {
78
+ "config": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/configs/transformer_340M.json",
79
+ "converters": [],
80
+ "name": "fla",
81
+ "print_after_conversion": false,
82
+ "tokenizer_path": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer"
83
+ },
84
+ "optimizer": {
85
+ "early_step_in_backward": false,
86
+ "eps": 1e-08,
87
+ "implementation": "fused",
88
+ "lr": 0.0003,
89
+ "name": "AdamW"
90
+ },
91
+ "profiling": {
92
+ "enable_memory_snapshot": false,
93
+ "enable_profiling": true,
94
+ "profile_freq": 512,
95
+ "save_memory_snapshot_folder": "memory_snapshot",
96
+ "save_traces_folder": "profile_trace"
97
+ },
98
+ "training": {
99
+ "batch_size": 16,
100
+ "compile": true,
101
+ "context_len": 8192,
102
+ "data_dir": null,
103
+ "data_files": null,
104
+ "data_parallel_replicate_degree": 1,
105
+ "data_parallel_shard_degree": -1,
106
+ "data_probs": "0.55,0.3,0.15",
107
+ "dataset": "/fineweb-edu-sample,/small_repos_20B_sample_merged,/MegaMath/megamath-web-pro",
108
+ "dataset_name": "default,default,default",
109
+ "dataset_split": "train,train,train",
110
+ "deterministic": false,
111
+ "disable_loss_parallel": false,
112
+ "enable_cpu_offload": false,
113
+ "fsdp_reshard_after_forward": "default",
114
+ "gc_freq": 50,
115
+ "gradient_accumulation_steps": 1,
116
+ "max_norm": 1.0,
117
+ "mixed_precision_param": "bfloat16",
118
+ "mixed_precision_reduce": "float32",
119
+ "num_workers": 32,
120
+ "persistent_workers": false,
121
+ "pin_memory": false,
122
+ "prefetch_factor": 2,
123
+ "seed": 42,
124
+ "seq_len": 8192,
125
+ "skip_nan_inf": true,
126
+ "steps": 95366,
127
+ "streaming": true,
128
+ "tensor_parallel_degree": 1,
129
+ "varlen": false
130
+ }
131
+ }
132
+ [titan] 2025-07-22 22:38:30,492 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
133
+ [titan] 2025-07-22 22:38:31,576 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
134
+ [titan] 2025-07-22 22:38:31,579 - root - INFO - CUDA capacity: NVIDIA H20 with 95.00GiB memory
135
+ [titan] 2025-07-22 22:38:31,668 - root - WARNING - Peak flops undefined for: NVIDIA H20, fallback to A100
136
+ [titan] 2025-07-22 22:38:31,668 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
137
+ [titan] 2025-07-22 22:38:31,669 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
138
+ [titan] 2025-07-22 22:38:31,691 - root - INFO - Loading tokenizer...
139
+ [titan] 2025-07-22 22:38:32,957 - root - INFO - LlamaTokenizerFast(name_or_path='/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer', vocab_size=32000, model_max_length=10000000000, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
140
+ 0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
141
+ 1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
142
+ 2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
143
+ }
144
+ )
145
+ [titan] 2025-07-22 22:38:32,958 - root - INFO - Loading dataset /fineweb-edu-sample,/small_repos_20B_sample_merged,/MegaMath/megamath-web-pro:default,default,default
146
+ `trust_remote_code` is not supported anymore.
147
+ Please check that the Hugging Face dataset '/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
148
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
149
+ [titan] 2025-07-22 22:38:32,958 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
150
+ Please check that the Hugging Face dataset '/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
151
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
152
+ [rank5]: Traceback (most recent call last):
153
+ [rank5]: File "<frozen runpy>", line 198, in _run_module_as_main
154
+ [rank5]: File "<frozen runpy>", line 88, in _run_code
155
+ [rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 615, in <module>
156
+ [rank5]: main(config)
157
+ [rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
158
+ [rank5]: return f(*args, **kwargs)
159
+ [rank5]: ^^^^^^^^^^^^^^^^^^
160
+ [rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 155, in main
161
+ [rank5]: dataset = build_dataset(
162
+ [rank5]: ^^^^^^^^^^^^^^
163
+ [rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/data.py", line 649, in build_dataset
164
+ [rank5]: subset = load_dataset(
165
+ [rank5]: ^^^^^^^^^^^^^
166
+ [rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py", line 1392, in load_dataset
167
+ [rank5]: builder_instance = load_dataset_builder(
168
+ [rank5]: ^^^^^^^^^^^^^^^^^^^^^
169
+ [rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py", line 1132, in load_dataset_builder
170
+ [rank5]: dataset_module = dataset_module_factory(
171
+ [rank5]: ^^^^^^^^^^^^^^^^^^^^^^^
172
+ [rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py", line 1033, in dataset_module_factory
173
+ [rank5]: raise FileNotFoundError(f"Couldn't find any data file at {relative_to_absolute_path(path)}.")
174
+ [rank5]: FileNotFoundError: Couldn't find any data file at /fineweb-edu-sample.
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/5/stdout.log ADDED
File without changes
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/6/error.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"message": {"message": "FileNotFoundError: Couldn't find any data file at /fineweb-edu-sample.", "extraInfo": {"py_callstack": "Traceback (most recent call last):\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py\", line 355, in wrapper\n return f(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py\", line 155, in main\n dataset = build_dataset(\n ^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/data.py\", line 649, in build_dataset\n subset = load_dataset(\n ^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py\", line 1392, in load_dataset\n builder_instance = load_dataset_builder(\n ^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py\", line 1132, in load_dataset_builder\n dataset_module = dataset_module_factory(\n ^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py\", line 1033, in dataset_module_factory\n raise FileNotFoundError(f\"Couldn't find any data file at {relative_to_absolute_path(path)}.\")\nFileNotFoundError: Couldn't find any data file at /fineweb-edu-sample.\n", "timestamp": "1753195112"}}}
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/6/stderr.log ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [titan] 2025-07-22 22:38:30,545 - root - INFO - Starting job: default job
2
+ [titan] 2025-07-22 22:38:30,545 - root - INFO - {
3
+ "activation_checkpoint": {
4
+ "mode": "none",
5
+ "selective_ac_option": "2"
6
+ },
7
+ "activation_offload": {
8
+ "mode": "none"
9
+ },
10
+ "checkpoint": {
11
+ "async_mode": "disabled",
12
+ "create_seed_checkpoint": false,
13
+ "enable_checkpoint": true,
14
+ "exclude_from_loading": [],
15
+ "export_dtype": "float32",
16
+ "folder": "checkpoint",
17
+ "interval": 8192,
18
+ "interval_type": "steps",
19
+ "keep_latest_k": 100,
20
+ "load_step": -1,
21
+ "model_weights_only": false
22
+ },
23
+ "comm": {
24
+ "init_timeout_seconds": 300,
25
+ "trace_buf_size": 20000,
26
+ "train_timeout_seconds": 100
27
+ },
28
+ "experimental": {
29
+ "context_parallel_degree": 1,
30
+ "context_parallel_rotate_method": "allgather",
31
+ "custom_model_path": "",
32
+ "enable_async_tensor_parallel": false,
33
+ "enable_compiled_autograd": false,
34
+ "pipeline_parallel_degree": 1,
35
+ "pipeline_parallel_microbatches": null,
36
+ "pipeline_parallel_schedule": "1F1B",
37
+ "pipeline_parallel_schedule_csv": "",
38
+ "pipeline_parallel_split_points": []
39
+ },
40
+ "fault_tolerance": {
41
+ "enable": false,
42
+ "group_size": 0,
43
+ "min_replica_size": 1,
44
+ "replica_id": 0
45
+ },
46
+ "float8": {
47
+ "enable_fsdp_float8_all_gather": false,
48
+ "force_recompute_fp8_weight_in_bwd": false,
49
+ "precompute_float8_dynamic_scale_for_fsdp": false,
50
+ "recipe_name": null
51
+ },
52
+ "job": {
53
+ "config_file": "flame/models/fla.toml",
54
+ "description": "default job",
55
+ "dump_folder": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1",
56
+ "print_args": true,
57
+ "use_for_integration_test": false
58
+ },
59
+ "lr_scheduler": {
60
+ "decay_ratio": 1.0,
61
+ "decay_type": "linear",
62
+ "lr_min": 0.01,
63
+ "warmup_steps": 100
64
+ },
65
+ "memory_estimation": {
66
+ "disable_fake_mode": false,
67
+ "enabled": false
68
+ },
69
+ "metrics": {
70
+ "disable_color_printing": false,
71
+ "enable_tensorboard": true,
72
+ "enable_wandb": true,
73
+ "log_freq": 1,
74
+ "save_for_all_ranks": false,
75
+ "save_tb_folder": "tb"
76
+ },
77
+ "model": {
78
+ "config": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/configs/transformer_340M.json",
79
+ "converters": [],
80
+ "name": "fla",
81
+ "print_after_conversion": false,
82
+ "tokenizer_path": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer"
83
+ },
84
+ "optimizer": {
85
+ "early_step_in_backward": false,
86
+ "eps": 1e-08,
87
+ "implementation": "fused",
88
+ "lr": 0.0003,
89
+ "name": "AdamW"
90
+ },
91
+ "profiling": {
92
+ "enable_memory_snapshot": false,
93
+ "enable_profiling": true,
94
+ "profile_freq": 512,
95
+ "save_memory_snapshot_folder": "memory_snapshot",
96
+ "save_traces_folder": "profile_trace"
97
+ },
98
+ "training": {
99
+ "batch_size": 16,
100
+ "compile": true,
101
+ "context_len": 8192,
102
+ "data_dir": null,
103
+ "data_files": null,
104
+ "data_parallel_replicate_degree": 1,
105
+ "data_parallel_shard_degree": -1,
106
+ "data_probs": "0.55,0.3,0.15",
107
+ "dataset": "/fineweb-edu-sample,/small_repos_20B_sample_merged,/MegaMath/megamath-web-pro",
108
+ "dataset_name": "default,default,default",
109
+ "dataset_split": "train,train,train",
110
+ "deterministic": false,
111
+ "disable_loss_parallel": false,
112
+ "enable_cpu_offload": false,
113
+ "fsdp_reshard_after_forward": "default",
114
+ "gc_freq": 50,
115
+ "gradient_accumulation_steps": 1,
116
+ "max_norm": 1.0,
117
+ "mixed_precision_param": "bfloat16",
118
+ "mixed_precision_reduce": "float32",
119
+ "num_workers": 32,
120
+ "persistent_workers": false,
121
+ "pin_memory": false,
122
+ "prefetch_factor": 2,
123
+ "seed": 42,
124
+ "seq_len": 8192,
125
+ "skip_nan_inf": true,
126
+ "steps": 95366,
127
+ "streaming": true,
128
+ "tensor_parallel_degree": 1,
129
+ "varlen": false
130
+ }
131
+ }
132
+ [titan] 2025-07-22 22:38:30,546 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
133
+ [titan] 2025-07-22 22:38:31,578 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
134
+ [titan] 2025-07-22 22:38:31,580 - root - INFO - CUDA capacity: NVIDIA H20 with 95.00GiB memory
135
+ [titan] 2025-07-22 22:38:31,669 - root - WARNING - Peak flops undefined for: NVIDIA H20, fallback to A100
136
+ [titan] 2025-07-22 22:38:31,670 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
137
+ [titan] 2025-07-22 22:38:31,670 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
138
+ [titan] 2025-07-22 22:38:31,690 - root - INFO - Loading tokenizer...
139
+ [titan] 2025-07-22 22:38:32,960 - root - INFO - LlamaTokenizerFast(name_or_path='/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer', vocab_size=32000, model_max_length=10000000000, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
140
+ 0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
141
+ 1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
142
+ 2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
143
+ }
144
+ )
145
+ [titan] 2025-07-22 22:38:32,961 - root - INFO - Loading dataset /fineweb-edu-sample,/small_repos_20B_sample_merged,/MegaMath/megamath-web-pro:default,default,default
146
+ `trust_remote_code` is not supported anymore.
147
+ Please check that the Hugging Face dataset '/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
148
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
149
+ [titan] 2025-07-22 22:38:32,961 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
150
+ Please check that the Hugging Face dataset '/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
151
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
152
+ [rank6]: Traceback (most recent call last):
153
+ [rank6]: File "<frozen runpy>", line 198, in _run_module_as_main
154
+ [rank6]: File "<frozen runpy>", line 88, in _run_code
155
+ [rank6]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 615, in <module>
156
+ [rank6]: main(config)
157
+ [rank6]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
158
+ [rank6]: return f(*args, **kwargs)
159
+ [rank6]: ^^^^^^^^^^^^^^^^^^
160
+ [rank6]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 155, in main
161
+ [rank6]: dataset = build_dataset(
162
+ [rank6]: ^^^^^^^^^^^^^^
163
+ [rank6]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/data.py", line 649, in build_dataset
164
+ [rank6]: subset = load_dataset(
165
+ [rank6]: ^^^^^^^^^^^^^
166
+ [rank6]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py", line 1392, in load_dataset
167
+ [rank6]: builder_instance = load_dataset_builder(
168
+ [rank6]: ^^^^^^^^^^^^^^^^^^^^^
169
+ [rank6]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py", line 1132, in load_dataset_builder
170
+ [rank6]: dataset_module = dataset_module_factory(
171
+ [rank6]: ^^^^^^^^^^^^^^^^^^^^^^^
172
+ [rank6]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py", line 1033, in dataset_module_factory
173
+ [rank6]: raise FileNotFoundError(f"Couldn't find any data file at {relative_to_absolute_path(path)}.")
174
+ [rank6]: FileNotFoundError: Couldn't find any data file at /fineweb-edu-sample.
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/6/stdout.log ADDED
File without changes
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/7/error.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"message": {"message": "FileNotFoundError: Couldn't find any data file at /fineweb-edu-sample.", "extraInfo": {"py_callstack": "Traceback (most recent call last):\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py\", line 355, in wrapper\n return f(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py\", line 155, in main\n dataset = build_dataset(\n ^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/data.py\", line 649, in build_dataset\n subset = load_dataset(\n ^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py\", line 1392, in load_dataset\n builder_instance = load_dataset_builder(\n ^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py\", line 1132, in load_dataset_builder\n dataset_module = dataset_module_factory(\n ^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py\", line 1033, in dataset_module_factory\n raise FileNotFoundError(f\"Couldn't find any data file at {relative_to_absolute_path(path)}.\")\nFileNotFoundError: Couldn't find any data file at /fineweb-edu-sample.\n", "timestamp": "1753195112"}}}
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/7/stderr.log ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [titan] 2025-07-22 22:38:30,347 - root - INFO - Starting job: default job
2
+ [titan] 2025-07-22 22:38:30,347 - root - INFO - {
3
+ "activation_checkpoint": {
4
+ "mode": "none",
5
+ "selective_ac_option": "2"
6
+ },
7
+ "activation_offload": {
8
+ "mode": "none"
9
+ },
10
+ "checkpoint": {
11
+ "async_mode": "disabled",
12
+ "create_seed_checkpoint": false,
13
+ "enable_checkpoint": true,
14
+ "exclude_from_loading": [],
15
+ "export_dtype": "float32",
16
+ "folder": "checkpoint",
17
+ "interval": 8192,
18
+ "interval_type": "steps",
19
+ "keep_latest_k": 100,
20
+ "load_step": -1,
21
+ "model_weights_only": false
22
+ },
23
+ "comm": {
24
+ "init_timeout_seconds": 300,
25
+ "trace_buf_size": 20000,
26
+ "train_timeout_seconds": 100
27
+ },
28
+ "experimental": {
29
+ "context_parallel_degree": 1,
30
+ "context_parallel_rotate_method": "allgather",
31
+ "custom_model_path": "",
32
+ "enable_async_tensor_parallel": false,
33
+ "enable_compiled_autograd": false,
34
+ "pipeline_parallel_degree": 1,
35
+ "pipeline_parallel_microbatches": null,
36
+ "pipeline_parallel_schedule": "1F1B",
37
+ "pipeline_parallel_schedule_csv": "",
38
+ "pipeline_parallel_split_points": []
39
+ },
40
+ "fault_tolerance": {
41
+ "enable": false,
42
+ "group_size": 0,
43
+ "min_replica_size": 1,
44
+ "replica_id": 0
45
+ },
46
+ "float8": {
47
+ "enable_fsdp_float8_all_gather": false,
48
+ "force_recompute_fp8_weight_in_bwd": false,
49
+ "precompute_float8_dynamic_scale_for_fsdp": false,
50
+ "recipe_name": null
51
+ },
52
+ "job": {
53
+ "config_file": "flame/models/fla.toml",
54
+ "description": "default job",
55
+ "dump_folder": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1",
56
+ "print_args": true,
57
+ "use_for_integration_test": false
58
+ },
59
+ "lr_scheduler": {
60
+ "decay_ratio": 1.0,
61
+ "decay_type": "linear",
62
+ "lr_min": 0.01,
63
+ "warmup_steps": 100
64
+ },
65
+ "memory_estimation": {
66
+ "disable_fake_mode": false,
67
+ "enabled": false
68
+ },
69
+ "metrics": {
70
+ "disable_color_printing": false,
71
+ "enable_tensorboard": true,
72
+ "enable_wandb": true,
73
+ "log_freq": 1,
74
+ "save_for_all_ranks": false,
75
+ "save_tb_folder": "tb"
76
+ },
77
+ "model": {
78
+ "config": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/configs/transformer_340M.json",
79
+ "converters": [],
80
+ "name": "fla",
81
+ "print_after_conversion": false,
82
+ "tokenizer_path": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer"
83
+ },
84
+ "optimizer": {
85
+ "early_step_in_backward": false,
86
+ "eps": 1e-08,
87
+ "implementation": "fused",
88
+ "lr": 0.0003,
89
+ "name": "AdamW"
90
+ },
91
+ "profiling": {
92
+ "enable_memory_snapshot": false,
93
+ "enable_profiling": true,
94
+ "profile_freq": 512,
95
+ "save_memory_snapshot_folder": "memory_snapshot",
96
+ "save_traces_folder": "profile_trace"
97
+ },
98
+ "training": {
99
+ "batch_size": 16,
100
+ "compile": true,
101
+ "context_len": 8192,
102
+ "data_dir": null,
103
+ "data_files": null,
104
+ "data_parallel_replicate_degree": 1,
105
+ "data_parallel_shard_degree": -1,
106
+ "data_probs": "0.55,0.3,0.15",
107
+ "dataset": "/fineweb-edu-sample,/small_repos_20B_sample_merged,/MegaMath/megamath-web-pro",
108
+ "dataset_name": "default,default,default",
109
+ "dataset_split": "train,train,train",
110
+ "deterministic": false,
111
+ "disable_loss_parallel": false,
112
+ "enable_cpu_offload": false,
113
+ "fsdp_reshard_after_forward": "default",
114
+ "gc_freq": 50,
115
+ "gradient_accumulation_steps": 1,
116
+ "max_norm": 1.0,
117
+ "mixed_precision_param": "bfloat16",
118
+ "mixed_precision_reduce": "float32",
119
+ "num_workers": 32,
120
+ "persistent_workers": false,
121
+ "pin_memory": false,
122
+ "prefetch_factor": 2,
123
+ "seed": 42,
124
+ "seq_len": 8192,
125
+ "skip_nan_inf": true,
126
+ "steps": 95366,
127
+ "streaming": true,
128
+ "tensor_parallel_degree": 1,
129
+ "varlen": false
130
+ }
131
+ }
132
+ [titan] 2025-07-22 22:38:30,348 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
133
+ [titan] 2025-07-22 22:38:31,334 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
134
+ [titan] 2025-07-22 22:38:31,337 - root - INFO - CUDA capacity: NVIDIA H20 with 95.00GiB memory
135
+ [titan] 2025-07-22 22:38:31,402 - root - WARNING - Peak flops undefined for: NVIDIA H20, fallback to A100
136
+ [titan] 2025-07-22 22:38:31,402 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
137
+ [titan] 2025-07-22 22:38:31,403 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
138
+ [titan] 2025-07-22 22:38:31,526 - root - INFO - Loading tokenizer...
139
+ [titan] 2025-07-22 22:38:32,962 - root - INFO - LlamaTokenizerFast(name_or_path='/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer', vocab_size=32000, model_max_length=10000000000, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
140
+ 0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
141
+ 1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
142
+ 2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
143
+ }
144
+ )
145
+ [titan] 2025-07-22 22:38:32,963 - root - INFO - Loading dataset /fineweb-edu-sample,/small_repos_20B_sample_merged,/MegaMath/megamath-web-pro:default,default,default
146
+ `trust_remote_code` is not supported anymore.
147
+ Please check that the Hugging Face dataset '/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
148
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
149
+ [titan] 2025-07-22 22:38:32,963 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
150
+ Please check that the Hugging Face dataset '/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
151
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
152
+ [rank7]: Traceback (most recent call last):
153
+ [rank7]: File "<frozen runpy>", line 198, in _run_module_as_main
154
+ [rank7]: File "<frozen runpy>", line 88, in _run_code
155
+ [rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 615, in <module>
156
+ [rank7]: main(config)
157
+ [rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
158
+ [rank7]: return f(*args, **kwargs)
159
+ [rank7]: ^^^^^^^^^^^^^^^^^^
160
+ [rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 155, in main
161
+ [rank7]: dataset = build_dataset(
162
+ [rank7]: ^^^^^^^^^^^^^^
163
+ [rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/data.py", line 649, in build_dataset
164
+ [rank7]: subset = load_dataset(
165
+ [rank7]: ^^^^^^^^^^^^^
166
+ [rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py", line 1392, in load_dataset
167
+ [rank7]: builder_instance = load_dataset_builder(
168
+ [rank7]: ^^^^^^^^^^^^^^^^^^^^^
169
+ [rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py", line 1132, in load_dataset_builder
170
+ [rank7]: dataset_module = dataset_module_factory(
171
+ [rank7]: ^^^^^^^^^^^^^^^^^^^^^^^
172
+ [rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py", line 1033, in dataset_module_factory
173
+ [rank7]: raise FileNotFoundError(f"Couldn't find any data file at {relative_to_absolute_path(path)}.")
174
+ [rank7]: FileNotFoundError: Couldn't find any data file at /fineweb-edu-sample.
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/7/stdout.log ADDED
File without changes
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_45k4njo9/attempt_0/0/stderr.log ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Traceback (most recent call last):
2
+ File "<frozen runpy>", line 198, in _run_module_as_main
3
+ File "<frozen runpy>", line 88, in _run_code
4
+ File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 12, in <module>
5
+ import fla # noqa
6
+ ^^^^^^^^^^
7
+ File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/__init__.py", line 28, in <module>
8
+ from fla.models import (
9
+ File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/__init__.py", line 3, in <module>
10
+ from fla.models.abc import ABCConfig, ABCForCausalLM, ABCModel
11
+ File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/abc/__init__.py", line 6, in <module>
12
+ from fla.models.abc.modeling_abc import ABCForCausalLM, ABCModel
13
+ File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/abc/modeling_abc.py", line 14, in <module>
14
+ from transformers.modeling_utils import PreTrainedModel
15
+ File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/transformers/modeling_utils.py", line 59, in <module>
16
+ from .integrations.flash_attention import flash_attention_forward
17
+ File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/transformers/integrations/flash_attention.py", line 5, in <module>
18
+ from ..modeling_flash_attention_utils import _flash_attention_forward, flash_attn_supports_top_left_mask
19
+ File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/transformers/modeling_flash_attention_utils.py", line 100, in <module>
20
+ from flash_attn import flash_attn_func as flash_attn_2_func
21
+ File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/flash_attn/__init__.py", line 3, in <module>
22
+ from flash_attn.flash_attn_interface import (
23
+ File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/flash_attn/flash_attn_interface.py", line 15, in <module>
24
+ import flash_attn_2_cuda as flash_attn_gpu
25
+ ImportError: /usr/lib64/libc.so.6: version `GLIBC_2.32' not found (required by /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/flash_attn_2_cuda.cpython-311-x86_64-linux-gnu.so)
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_45k4njo9/attempt_0/0/stdout.log ADDED
File without changes
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_45k4njo9/attempt_0/1/stderr.log ADDED
File without changes
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_45k4njo9/attempt_0/1/stdout.log ADDED
File without changes