Upload folder using huggingface_hub
Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
- .gitattributes +8 -0
- transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/8k-100.sh +65 -0
- transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/config.json +33 -0
- transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/delta_net_1B.json +29 -0
- transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/delta_net_340M.json +26 -0
- transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/gated_deltanet_1B.json +22 -0
- transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/gated_deltanet_340M.json +22 -0
- transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/gdn_6_1_340M.json +50 -0
- transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/gla_340M.json +24 -0
- transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/gla_7B.json +25 -0
- transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/gsa_340M.json +29 -0
- transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/hgrn2_340M.json +20 -0
- transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/mamba2_1B.json +32 -0
- transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/mamba2_340M.json +32 -0
- transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/mamba_1B.json +30 -0
- transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/mamba_340M.json +30 -0
- transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/samba_1B.json +52 -0
- transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/sba_340m.json +18 -0
- transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/transformer_1B.json +22 -0
- transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/transformer_340M.json +18 -0
- transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/transformer_7B.json +21 -0
- transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/generation_config.json +6 -0
- transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/0/error.json +1 -0
- transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/0/stderr.log +175 -0
- transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/0/stdout.log +0 -0
- transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/1/error.json +1 -0
- transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/1/stderr.log +174 -0
- transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/1/stdout.log +0 -0
- transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/2/error.json +1 -0
- transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/2/stderr.log +174 -0
- transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/2/stdout.log +0 -0
- transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/3/error.json +1 -0
- transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/3/stderr.log +174 -0
- transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/3/stdout.log +0 -0
- transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/4/error.json +1 -0
- transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/4/stderr.log +174 -0
- transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/4/stdout.log +0 -0
- transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/5/error.json +1 -0
- transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/5/stderr.log +174 -0
- transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/5/stdout.log +0 -0
- transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/6/error.json +1 -0
- transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/6/stderr.log +174 -0
- transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/6/stdout.log +0 -0
- transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/7/error.json +1 -0
- transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/7/stderr.log +174 -0
- transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/7/stdout.log +0 -0
- transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_45k4njo9/attempt_0/0/stderr.log +25 -0
- transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_45k4njo9/attempt_0/0/stdout.log +0 -0
- transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_45k4njo9/attempt_0/1/stderr.log +0 -0
- transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_45k4njo9/attempt_0/1/stdout.log +0 -0
.gitattributes
CHANGED
|
@@ -65,3 +65,11 @@ mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8
|
|
| 65 |
mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_v3h3fbcf/attempt_0/5/stderr.log filter=lfs diff=lfs merge=lfs -text
|
| 66 |
mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_v3h3fbcf/attempt_0/6/stderr.log filter=lfs diff=lfs merge=lfs -text
|
| 67 |
mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_v3h3fbcf/attempt_0/7/stderr.log filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_v3h3fbcf/attempt_0/5/stderr.log filter=lfs diff=lfs merge=lfs -text
|
| 66 |
mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_v3h3fbcf/attempt_0/6/stderr.log filter=lfs diff=lfs merge=lfs -text
|
| 67 |
mamba2_6_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_v3h3fbcf/attempt_0/7/stderr.log filter=lfs diff=lfs merge=lfs -text
|
| 68 |
+
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_jqc1xcka/attempt_0/0/stderr.log filter=lfs diff=lfs merge=lfs -text
|
| 69 |
+
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_jqc1xcka/attempt_0/1/stderr.log filter=lfs diff=lfs merge=lfs -text
|
| 70 |
+
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_jqc1xcka/attempt_0/2/stderr.log filter=lfs diff=lfs merge=lfs -text
|
| 71 |
+
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_jqc1xcka/attempt_0/3/stderr.log filter=lfs diff=lfs merge=lfs -text
|
| 72 |
+
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_jqc1xcka/attempt_0/4/stderr.log filter=lfs diff=lfs merge=lfs -text
|
| 73 |
+
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_jqc1xcka/attempt_0/5/stderr.log filter=lfs diff=lfs merge=lfs -text
|
| 74 |
+
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_jqc1xcka/attempt_0/6/stderr.log filter=lfs diff=lfs merge=lfs -text
|
| 75 |
+
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_jqc1xcka/attempt_0/7/stderr.log filter=lfs diff=lfs merge=lfs -text
|
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/8k-100.sh
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FLAME_PATH=/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame
|
| 2 |
+
DATASET_ROOT=/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset
|
| 3 |
+
TOKENIZER=/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer
|
| 4 |
+
|
| 5 |
+
cd $FLAME_PATH
|
| 6 |
+
source .venv/bin/activate
|
| 7 |
+
|
| 8 |
+
# =========== train config ===========
|
| 9 |
+
CONFIG=${1:-transformer_340M.json}
|
| 10 |
+
SEQ_LEN=8192
|
| 11 |
+
WARMUP_STEPS=100
|
| 12 |
+
STEPS=95366
|
| 13 |
+
LR=3e-4
|
| 14 |
+
BATCH_SIZE=16
|
| 15 |
+
DECAY_TYPE=linear
|
| 16 |
+
DECAY_RATIO=1
|
| 17 |
+
|
| 18 |
+
NNODE=1
|
| 19 |
+
NGPU=8
|
| 20 |
+
LOG_RANK=0
|
| 21 |
+
# ====================================
|
| 22 |
+
|
| 23 |
+
# if jq command is not found, install it
|
| 24 |
+
if ! command -v jq &> /dev/null; then
|
| 25 |
+
echo "jq could not be found, installing it..."
|
| 26 |
+
sudo yum install -y jq
|
| 27 |
+
fi
|
| 28 |
+
|
| 29 |
+
EXP_NAME=$(basename $CONFIG | sed 's/\.config//')-ctx${SEQ_LEN}-steps${STEPS}-lr${LR}-decay_type${DECAY_TYPE}-decay_ratio${DECAY_RATIO}-bs${BATCH_SIZE}-nn${NNODE}
|
| 30 |
+
|
| 31 |
+
bash train.sh \
|
| 32 |
+
--job.config_file flame/models/fla.toml \
|
| 33 |
+
--job.dump_folder $FLAME_PATH/exp/$EXP_NAME \
|
| 34 |
+
--model.config $FLAME_PATH/configs/$CONFIG \
|
| 35 |
+
--model.tokenizer_path $TOKENIZER \
|
| 36 |
+
--optimizer.name AdamW \
|
| 37 |
+
--optimizer.eps 1e-8 \
|
| 38 |
+
--optimizer.lr $LR \
|
| 39 |
+
--lr_scheduler.warmup_steps $WARMUP_STEPS \
|
| 40 |
+
--lr_scheduler.lr_min 0.01 \
|
| 41 |
+
--lr_scheduler.decay_type $DECAY_TYPE \
|
| 42 |
+
--lr_scheduler.decay_ratio $DECAY_RATIO \
|
| 43 |
+
--training.batch_size $BATCH_SIZE \
|
| 44 |
+
--training.seq_len $SEQ_LEN \
|
| 45 |
+
--training.context_len $SEQ_LEN \
|
| 46 |
+
--training.gradient_accumulation_steps 1 \
|
| 47 |
+
--training.steps $STEPS \
|
| 48 |
+
--training.max_norm 1.0 \
|
| 49 |
+
--training.skip_nan_inf \
|
| 50 |
+
--training.dataset $DATASET_ROOT/fineweb-edu-sample,$DATASET_ROOT/small_repos_20B_sample_merged,$DATASET_ROOT/megamath-web-pro \
|
| 51 |
+
--training.data_probs 0.55,0.3,0.15 \
|
| 52 |
+
--training.dataset_split train,train,train \
|
| 53 |
+
--training.dataset_name default,default,default \
|
| 54 |
+
--training.streaming \
|
| 55 |
+
--training.num_workers 32 \
|
| 56 |
+
--training.prefetch_factor 2 \
|
| 57 |
+
--training.seed 42 \
|
| 58 |
+
--training.compile \
|
| 59 |
+
--checkpoint.interval 8192 \
|
| 60 |
+
--checkpoint.load_step -1 \
|
| 61 |
+
--checkpoint.keep_latest_k 100 \
|
| 62 |
+
--metrics.log_freq 1 \
|
| 63 |
+
--metrics.enable_tensorboard \
|
| 64 |
+
--training.streaming
|
| 65 |
+
|
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/config.json
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"TransformerForCausalLM"
|
| 4 |
+
],
|
| 5 |
+
"attention_bias": false,
|
| 6 |
+
"bos_token_id": 1,
|
| 7 |
+
"elementwise_affine": true,
|
| 8 |
+
"eos_token_id": 2,
|
| 9 |
+
"fuse_cross_entropy": true,
|
| 10 |
+
"fuse_norm": true,
|
| 11 |
+
"fuse_swiglu": true,
|
| 12 |
+
"hidden_act": "swish",
|
| 13 |
+
"hidden_ratio": 4,
|
| 14 |
+
"hidden_size": 1024,
|
| 15 |
+
"initializer_range": 0.02,
|
| 16 |
+
"intermediate_size": null,
|
| 17 |
+
"max_position_embeddings": 8192,
|
| 18 |
+
"model_type": "transformer",
|
| 19 |
+
"norm_eps": 1e-06,
|
| 20 |
+
"num_heads": 16,
|
| 21 |
+
"num_hidden_layers": 24,
|
| 22 |
+
"num_kv_heads": null,
|
| 23 |
+
"qk_norm": false,
|
| 24 |
+
"qkv_bias": false,
|
| 25 |
+
"rope_theta": 10000.0,
|
| 26 |
+
"tie_word_embeddings": false,
|
| 27 |
+
"torch_dtype": "float32",
|
| 28 |
+
"transformers_version": "4.53.3",
|
| 29 |
+
"use_cache": true,
|
| 30 |
+
"use_l2warp": false,
|
| 31 |
+
"vocab_size": 32000,
|
| 32 |
+
"window_size": null
|
| 33 |
+
}
|
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/delta_net_1B.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"attn": null,
|
| 3 |
+
"attn_mode": "chunk",
|
| 4 |
+
"bos_token_id": 1,
|
| 5 |
+
"conv_size": 4,
|
| 6 |
+
"eos_token_id": 2,
|
| 7 |
+
"expand_k": 1,
|
| 8 |
+
"expand_v": 1,
|
| 9 |
+
"fuse_cross_entropy": true,
|
| 10 |
+
"fuse_norm": true,
|
| 11 |
+
"hidden_act": "swish",
|
| 12 |
+
"hidden_ratio": 4,
|
| 13 |
+
"hidden_size": 2048,
|
| 14 |
+
"initializer_range": 0.02,
|
| 15 |
+
"intermediate_size": null,
|
| 16 |
+
"model_type": "delta_net",
|
| 17 |
+
"norm_eps": 1e-06,
|
| 18 |
+
"num_heads": 16,
|
| 19 |
+
"num_hidden_layers": 24,
|
| 20 |
+
"pad_token_id": 2,
|
| 21 |
+
"qk_activation": "silu",
|
| 22 |
+
"qk_norm": "l2",
|
| 23 |
+
"tie_word_embeddings": false,
|
| 24 |
+
"use_beta": true,
|
| 25 |
+
"use_cache": true,
|
| 26 |
+
"use_gate": false,
|
| 27 |
+
"use_output_norm": true,
|
| 28 |
+
"use_short_conv": true
|
| 29 |
+
}
|
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/delta_net_340M.json
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"attn_mode": "chunk",
|
| 3 |
+
"bos_token_id": 1,
|
| 4 |
+
"conv_size": 4,
|
| 5 |
+
"eos_token_id": 2,
|
| 6 |
+
"expand_k": 1,
|
| 7 |
+
"expand_v": 1,
|
| 8 |
+
"fuse_cross_entropy": true,
|
| 9 |
+
"hidden_act": "swish",
|
| 10 |
+
"hidden_ratio": 4,
|
| 11 |
+
"hidden_size": 1024,
|
| 12 |
+
"initializer_range": 0.02,
|
| 13 |
+
"intermediate_size": null,
|
| 14 |
+
"model_type": "delta_net",
|
| 15 |
+
"norm_eps": 1e-06,
|
| 16 |
+
"num_heads": 8,
|
| 17 |
+
"num_hidden_layers": 24,
|
| 18 |
+
"qk_activation": "silu",
|
| 19 |
+
"qk_norm": "l2",
|
| 20 |
+
"tie_word_embeddings": false,
|
| 21 |
+
"use_beta": true,
|
| 22 |
+
"use_cache": true,
|
| 23 |
+
"use_gate": false,
|
| 24 |
+
"use_output_norm": true,
|
| 25 |
+
"use_short_conv": true
|
| 26 |
+
}
|
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/gated_deltanet_1B.json
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"attn_mode": "chunk",
|
| 3 |
+
"bos_token_id": 1,
|
| 4 |
+
"conv_size": 4,
|
| 5 |
+
"eos_token_id": 2,
|
| 6 |
+
"expand_v": 2,
|
| 7 |
+
"fuse_cross_entropy": true,
|
| 8 |
+
"head_dim": 256,
|
| 9 |
+
"hidden_act": "swish",
|
| 10 |
+
"hidden_ratio": 4,
|
| 11 |
+
"hidden_size": 2048,
|
| 12 |
+
"initializer_range": 0.02,
|
| 13 |
+
"intermediate_size": null,
|
| 14 |
+
"model_type": "gated_deltanet",
|
| 15 |
+
"norm_eps": 1e-06,
|
| 16 |
+
"num_heads": 6,
|
| 17 |
+
"num_hidden_layers": 21,
|
| 18 |
+
"tie_word_embeddings": false,
|
| 19 |
+
"use_cache": true,
|
| 20 |
+
"use_gate": true,
|
| 21 |
+
"use_short_conv": true
|
| 22 |
+
}
|
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/gated_deltanet_340M.json
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"attn_mode": "chunk",
|
| 3 |
+
"bos_token_id": 1,
|
| 4 |
+
"conv_size": 4,
|
| 5 |
+
"eos_token_id": 2,
|
| 6 |
+
"expand_v": 2,
|
| 7 |
+
"fuse_cross_entropy": true,
|
| 8 |
+
"head_dim": 256,
|
| 9 |
+
"hidden_act": "swish",
|
| 10 |
+
"hidden_ratio": 4,
|
| 11 |
+
"hidden_size": 1024,
|
| 12 |
+
"initializer_range": 0.02,
|
| 13 |
+
"intermediate_size": null,
|
| 14 |
+
"model_type": "gated_deltanet",
|
| 15 |
+
"norm_eps": 1e-06,
|
| 16 |
+
"num_heads": 6,
|
| 17 |
+
"num_hidden_layers": 21,
|
| 18 |
+
"tie_word_embeddings": false,
|
| 19 |
+
"use_cache": true,
|
| 20 |
+
"use_gate": true,
|
| 21 |
+
"use_short_conv": true
|
| 22 |
+
}
|
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/gdn_6_1_340M.json
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"GatedDeltaNetForCausalLM"
|
| 4 |
+
],
|
| 5 |
+
"attn": {
|
| 6 |
+
"layers": [
|
| 7 |
+
5,
|
| 8 |
+
11,
|
| 9 |
+
17,
|
| 10 |
+
23
|
| 11 |
+
],
|
| 12 |
+
"num_heads": 16,
|
| 13 |
+
"num_kv_heads": 8,
|
| 14 |
+
"qkv_bias": false,
|
| 15 |
+
"rope_theta": 160000.0,
|
| 16 |
+
"window_size": null
|
| 17 |
+
},
|
| 18 |
+
"attn_mode": "chunk",
|
| 19 |
+
"bos_token_id": 1,
|
| 20 |
+
"conv_size": 4,
|
| 21 |
+
"eos_token_id": 2,
|
| 22 |
+
"expand_k": 1,
|
| 23 |
+
"expand_v": 1,
|
| 24 |
+
"fuse_cross_entropy": true,
|
| 25 |
+
"fuse_norm": true,
|
| 26 |
+
"fuse_swiglu": true,
|
| 27 |
+
"head_dim": 256,
|
| 28 |
+
"hidden_act": "swish",
|
| 29 |
+
"hidden_ratio": 4,
|
| 30 |
+
"hidden_size": 1024,
|
| 31 |
+
"initializer_range": 0.02,
|
| 32 |
+
"intermediate_size": null,
|
| 33 |
+
"max_position_embeddings": 8192,
|
| 34 |
+
"model_type": "gated_deltanet",
|
| 35 |
+
"norm_eps": 1e-06,
|
| 36 |
+
"norm_first": false,
|
| 37 |
+
"num_heads": 4,
|
| 38 |
+
"num_hidden_layers": 24,
|
| 39 |
+
"qk_activation": "silu",
|
| 40 |
+
"qk_norm": "l2",
|
| 41 |
+
"tie_word_embeddings": false,
|
| 42 |
+
"torch_dtype": "float32",
|
| 43 |
+
"transformers_version": "4.51.3",
|
| 44 |
+
"use_beta": true,
|
| 45 |
+
"use_cache": true,
|
| 46 |
+
"use_gate": true,
|
| 47 |
+
"use_output_norm": true,
|
| 48 |
+
"use_short_conv": true,
|
| 49 |
+
"vocab_size": 32000
|
| 50 |
+
}
|
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/gla_340M.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"attn_mode": "chunk",
|
| 3 |
+
"bos_token_id": 1,
|
| 4 |
+
"clamp_min": null,
|
| 5 |
+
"eos_token_id": 2,
|
| 6 |
+
"expand_k": 0.5,
|
| 7 |
+
"expand_v": 1,
|
| 8 |
+
"fuse_cross_entropy": true,
|
| 9 |
+
"fuse_norm": true,
|
| 10 |
+
"hidden_act": "swish",
|
| 11 |
+
"hidden_ratio": 4,
|
| 12 |
+
"hidden_size": 1024,
|
| 13 |
+
"initializer_range": 0.02,
|
| 14 |
+
"intermediate_size": null,
|
| 15 |
+
"model_type": "gla",
|
| 16 |
+
"num_heads": 4,
|
| 17 |
+
"num_hidden_layers": 24,
|
| 18 |
+
"norm_eps": 1e-06,
|
| 19 |
+
"tie_word_embeddings": false,
|
| 20 |
+
"use_cache": true,
|
| 21 |
+
"use_gk": true,
|
| 22 |
+
"use_gv": false,
|
| 23 |
+
"vocab_size": 32000
|
| 24 |
+
}
|
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/gla_7B.json
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"attn": null,
|
| 3 |
+
"attn_mode": "chunk",
|
| 4 |
+
"bos_token_id": 1,
|
| 5 |
+
"eos_token_id": 2,
|
| 6 |
+
"expand_k": 0.5,
|
| 7 |
+
"expand_v": 1,
|
| 8 |
+
"fuse_cross_entropy": true,
|
| 9 |
+
"fuse_norm": true,
|
| 10 |
+
"hidden_act": "swish",
|
| 11 |
+
"hidden_ratio": 4,
|
| 12 |
+
"hidden_size": 4096,
|
| 13 |
+
"initializer_range": 0.02,
|
| 14 |
+
"intermediate_size": 11008,
|
| 15 |
+
"model_type": "gla",
|
| 16 |
+
"norm_eps": 1e-06,
|
| 17 |
+
"num_heads": 16,
|
| 18 |
+
"num_hidden_layers": 32,
|
| 19 |
+
"tie_word_embeddings": false,
|
| 20 |
+
"use_cache": true,
|
| 21 |
+
"use_gk": true,
|
| 22 |
+
"use_gv": false,
|
| 23 |
+
"use_output_gate": true,
|
| 24 |
+
"use_short_conv": false
|
| 25 |
+
}
|
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/gsa_340M.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token_id": 1,
|
| 3 |
+
"conv_size": 4,
|
| 4 |
+
"eos_token_id": 2,
|
| 5 |
+
"expand_k": 1,
|
| 6 |
+
"expand_v": 1,
|
| 7 |
+
"elementwise_affine": false,
|
| 8 |
+
"feature_map": "swish",
|
| 9 |
+
"fuse_cross_entropy": true,
|
| 10 |
+
"fuse_norm": true,
|
| 11 |
+
"gate_logit_normalizer": 4,
|
| 12 |
+
"hidden_act": "swish",
|
| 13 |
+
"hidden_ratio": 4,
|
| 14 |
+
"hidden_size": 1024,
|
| 15 |
+
"initializer_range": 0.02,
|
| 16 |
+
"intermediate_size": null,
|
| 17 |
+
"model_type": "gsa",
|
| 18 |
+
"num_heads": 4,
|
| 19 |
+
"num_hidden_layers": 24,
|
| 20 |
+
"num_slots": 64,
|
| 21 |
+
"norm_eps": 1e-06,
|
| 22 |
+
"share_conv_kernel": true,
|
| 23 |
+
"tie_word_embeddings": false,
|
| 24 |
+
"use_cache": true,
|
| 25 |
+
"use_norm": true,
|
| 26 |
+
"use_output_gate": true,
|
| 27 |
+
"use_rope": false,
|
| 28 |
+
"use_short_conv": false
|
| 29 |
+
}
|
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/hgrn2_340M.json
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"attn_mode": "chunk",
|
| 3 |
+
"bos_token_id": 1,
|
| 4 |
+
"eos_token_id": 2,
|
| 5 |
+
"expand_ratio": 128,
|
| 6 |
+
"fuse_cross_entropy": true,
|
| 7 |
+
"fuse_norm": true,
|
| 8 |
+
"hidden_act": "swish",
|
| 9 |
+
"hidden_ratio": 4,
|
| 10 |
+
"hidden_size": 1024,
|
| 11 |
+
"initializer_range": 0.02,
|
| 12 |
+
"intermediate_size": null,
|
| 13 |
+
"model_type": "hgrn2",
|
| 14 |
+
"num_heads": 8,
|
| 15 |
+
"num_hidden_layers": 24,
|
| 16 |
+
"norm_eps": 1e-06,
|
| 17 |
+
"tie_word_embeddings": false,
|
| 18 |
+
"use_cache": true,
|
| 19 |
+
"vocab_size": 32000
|
| 20 |
+
}
|
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/mamba2_1B.json
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token_id": 1,
|
| 3 |
+
"chunk_size": 256,
|
| 4 |
+
"conv_kernel": 4,
|
| 5 |
+
"eos_token_id": 2,
|
| 6 |
+
"expand": 2,
|
| 7 |
+
"fuse_cross_entropy": true,
|
| 8 |
+
"fuse_norm": true,
|
| 9 |
+
"head_dim": 64,
|
| 10 |
+
"hidden_act": "silu",
|
| 11 |
+
"hidden_size": 2048,
|
| 12 |
+
"initializer_range": 0.02,
|
| 13 |
+
"norm_eps": 1e-05,
|
| 14 |
+
"model_type": "mamba2",
|
| 15 |
+
"n_groups": 1,
|
| 16 |
+
"num_hidden_layers": 48,
|
| 17 |
+
"pad_token_id": 0,
|
| 18 |
+
"rescale_prenorm_residual": true,
|
| 19 |
+
"residual_in_fp32": true,
|
| 20 |
+
"rms_norm": true,
|
| 21 |
+
"state_size": 128,
|
| 22 |
+
"tie_word_embeddings": false,
|
| 23 |
+
"time_step_floor": 0.0001,
|
| 24 |
+
"time_step_max": 0.1,
|
| 25 |
+
"time_step_min": 0.001,
|
| 26 |
+
"time_step_rank": 128,
|
| 27 |
+
"transformers_version": "4.50.1",
|
| 28 |
+
"use_bias": false,
|
| 29 |
+
"use_cache": true,
|
| 30 |
+
"use_conv_bias": true,
|
| 31 |
+
"vocab_size": 32000
|
| 32 |
+
}
|
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/mamba2_340M.json
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token_id": 1,
|
| 3 |
+
"chunk_size": 256,
|
| 4 |
+
"conv_kernel": 4,
|
| 5 |
+
"eos_token_id": 2,
|
| 6 |
+
"expand": 2,
|
| 7 |
+
"fuse_cross_entropy": true,
|
| 8 |
+
"fuse_norm": true,
|
| 9 |
+
"head_dim": 64,
|
| 10 |
+
"hidden_act": "silu",
|
| 11 |
+
"hidden_size": 1024,
|
| 12 |
+
"initializer_range": 0.02,
|
| 13 |
+
"norm_eps": 1e-05,
|
| 14 |
+
"model_type": "mamba2",
|
| 15 |
+
"n_groups": 1,
|
| 16 |
+
"num_hidden_layers": 48,
|
| 17 |
+
"pad_token_id": 0,
|
| 18 |
+
"rescale_prenorm_residual": true,
|
| 19 |
+
"residual_in_fp32": true,
|
| 20 |
+
"rms_norm": true,
|
| 21 |
+
"state_size": 128,
|
| 22 |
+
"tie_word_embeddings": false,
|
| 23 |
+
"time_step_floor": 0.0001,
|
| 24 |
+
"time_step_max": 0.1,
|
| 25 |
+
"time_step_min": 0.001,
|
| 26 |
+
"time_step_rank": 128,
|
| 27 |
+
"transformers_version": "4.50.1",
|
| 28 |
+
"use_bias": false,
|
| 29 |
+
"use_cache": true,
|
| 30 |
+
"use_conv_bias": true,
|
| 31 |
+
"vocab_size": 32000
|
| 32 |
+
}
|
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/mamba_1B.json
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token_id": 1,
|
| 3 |
+
"conv_kernel": 4,
|
| 4 |
+
"eos_token_id": 2,
|
| 5 |
+
"expand": 2,
|
| 6 |
+
"fuse_cross_entropy": true,
|
| 7 |
+
"fuse_norm": true,
|
| 8 |
+
"hidden_act": "silu",
|
| 9 |
+
"hidden_size": 2048,
|
| 10 |
+
"initializer_range": 0.02,
|
| 11 |
+
"model_type": "mamba",
|
| 12 |
+
"norm_eps": 1e-05,
|
| 13 |
+
"num_hidden_layers": 48,
|
| 14 |
+
"pad_token_id": 0,
|
| 15 |
+
"rescale_prenorm_residual": false,
|
| 16 |
+
"residual_in_fp32": false,
|
| 17 |
+
"state_size": 16,
|
| 18 |
+
"tie_word_embeddings": false,
|
| 19 |
+
"time_step_floor": 0.0001,
|
| 20 |
+
"time_step_init_scheme": "random",
|
| 21 |
+
"time_step_max": 0.1,
|
| 22 |
+
"time_step_min": 0.001,
|
| 23 |
+
"time_step_rank": 128,
|
| 24 |
+
"time_step_scale": 1.0,
|
| 25 |
+
"transformers_version": "4.50.1",
|
| 26 |
+
"use_bias": false,
|
| 27 |
+
"use_cache": true,
|
| 28 |
+
"use_conv_bias": true,
|
| 29 |
+
"vocab_size": 32000
|
| 30 |
+
}
|
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/mamba_340M.json
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token_id": 1,
|
| 3 |
+
"conv_kernel": 4,
|
| 4 |
+
"eos_token_id": 2,
|
| 5 |
+
"expand": 2,
|
| 6 |
+
"fuse_cross_entropy": true,
|
| 7 |
+
"fuse_norm": true,
|
| 8 |
+
"hidden_act": "silu",
|
| 9 |
+
"hidden_size": 1024,
|
| 10 |
+
"initializer_range": 0.02,
|
| 11 |
+
"model_type": "mamba",
|
| 12 |
+
"norm_eps": 1e-05,
|
| 13 |
+
"num_hidden_layers": 48,
|
| 14 |
+
"pad_token_id": 0,
|
| 15 |
+
"rescale_prenorm_residual": false,
|
| 16 |
+
"residual_in_fp32": false,
|
| 17 |
+
"state_size": 16,
|
| 18 |
+
"tie_word_embeddings": false,
|
| 19 |
+
"time_step_floor": 0.0001,
|
| 20 |
+
"time_step_init_scheme": "random",
|
| 21 |
+
"time_step_max": 0.1,
|
| 22 |
+
"time_step_min": 0.001,
|
| 23 |
+
"time_step_rank": 128,
|
| 24 |
+
"time_step_scale": 1.0,
|
| 25 |
+
"transformers_version": "4.50.1",
|
| 26 |
+
"use_bias": false,
|
| 27 |
+
"use_cache": true,
|
| 28 |
+
"use_conv_bias": true,
|
| 29 |
+
"vocab_size": 32000
|
| 30 |
+
}
|
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/samba_1B.json
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"attn": {
|
| 3 |
+
"layers": [
|
| 4 |
+
1,
|
| 5 |
+
3,
|
| 6 |
+
5,
|
| 7 |
+
7,
|
| 8 |
+
9,
|
| 9 |
+
11,
|
| 10 |
+
13,
|
| 11 |
+
15,
|
| 12 |
+
17
|
| 13 |
+
],
|
| 14 |
+
"num_heads": 18,
|
| 15 |
+
"num_kv_heads": 18,
|
| 16 |
+
"qkv_bias": false,
|
| 17 |
+
"rope_theta": 10000.0,
|
| 18 |
+
"window_size": 2048
|
| 19 |
+
},
|
| 20 |
+
"bos_token_id": 1,
|
| 21 |
+
"conv_kernel": 4,
|
| 22 |
+
"eos_token_id": 2,
|
| 23 |
+
"expand": 2,
|
| 24 |
+
"fuse_cross_entropy": true,
|
| 25 |
+
"fuse_norm": true,
|
| 26 |
+
"fuse_swiglu": true,
|
| 27 |
+
"hidden_act": "swish",
|
| 28 |
+
"hidden_ratio": 4,
|
| 29 |
+
"hidden_size": 2304,
|
| 30 |
+
"initializer_range": 0.02,
|
| 31 |
+
"intermediate_size": 4608,
|
| 32 |
+
"max_position_embeddings": 2048,
|
| 33 |
+
"model_type": "samba",
|
| 34 |
+
"norm_eps": 1e-05,
|
| 35 |
+
"num_hidden_layers": 18,
|
| 36 |
+
"pad_token_id": 0,
|
| 37 |
+
"rescale_prenorm_residual": false,
|
| 38 |
+
"residual_in_fp32": false,
|
| 39 |
+
"state_size": 16,
|
| 40 |
+
"tie_word_embeddings": false,
|
| 41 |
+
"time_step_floor": 0.0001,
|
| 42 |
+
"time_step_init_scheme": "random",
|
| 43 |
+
"time_step_max": 0.1,
|
| 44 |
+
"time_step_min": 0.001,
|
| 45 |
+
"time_step_rank": 144,
|
| 46 |
+
"time_step_scale": 1.0,
|
| 47 |
+
"transformers_version": "4.50.1",
|
| 48 |
+
"use_bias": false,
|
| 49 |
+
"use_cache": true,
|
| 50 |
+
"use_conv_bias": true,
|
| 51 |
+
"vocab_size": 32000
|
| 52 |
+
}
|
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/sba_340m.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"attention_bias": false,
|
| 3 |
+
"bos_token_id": 1,
|
| 4 |
+
"eos_token_id": 2,
|
| 5 |
+
"fuse_cross_entropy": true,
|
| 6 |
+
"fuse_norm": true,
|
| 7 |
+
"hidden_act": "swish",
|
| 8 |
+
"hidden_size": 1024,
|
| 9 |
+
"initializer_range": 0.006,
|
| 10 |
+
"max_position_embeddings": 8192,
|
| 11 |
+
"model_type": "sba",
|
| 12 |
+
"num_heads": 16,
|
| 13 |
+
"num_hidden_layers": 24,
|
| 14 |
+
"norm_eps": 1e-06,
|
| 15 |
+
"tie_word_embeddings": false,
|
| 16 |
+
"use_cache": true,
|
| 17 |
+
"vocab_size": 32000
|
| 18 |
+
}
|
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/transformer_1B.json
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token_id": 1,
|
| 3 |
+
"elementwise_affine": true,
|
| 4 |
+
"eos_token_id": 2,
|
| 5 |
+
"fuse_cross_entropy": true,
|
| 6 |
+
"fuse_norm": true,
|
| 7 |
+
"fuse_swiglu": true,
|
| 8 |
+
"hidden_act": "swish",
|
| 9 |
+
"hidden_ratio": 4,
|
| 10 |
+
"hidden_size": 2048,
|
| 11 |
+
"initializer_range": 0.02,
|
| 12 |
+
"intermediate_size": null,
|
| 13 |
+
"max_position_embeddings": 8192,
|
| 14 |
+
"model_type": "transformer",
|
| 15 |
+
"norm_eps": 1e-06,
|
| 16 |
+
"num_heads": 32,
|
| 17 |
+
"num_hidden_layers": 24,
|
| 18 |
+
"num_kv_heads": null,
|
| 19 |
+
"pad_token_id": 2,
|
| 20 |
+
"rope_theta": 10000.0,
|
| 21 |
+
"tie_word_embeddings": false
|
| 22 |
+
}
|
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/transformer_340M.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"attention_bias": false,
|
| 3 |
+
"bos_token_id": 1,
|
| 4 |
+
"eos_token_id": 2,
|
| 5 |
+
"fuse_cross_entropy": true,
|
| 6 |
+
"fuse_norm": true,
|
| 7 |
+
"hidden_act": "swish",
|
| 8 |
+
"hidden_size": 1024,
|
| 9 |
+
"initializer_range": 0.02,
|
| 10 |
+
"max_position_embeddings": 8192,
|
| 11 |
+
"model_type": "transformer",
|
| 12 |
+
"num_heads": 16,
|
| 13 |
+
"num_hidden_layers": 24,
|
| 14 |
+
"norm_eps": 1e-06,
|
| 15 |
+
"tie_word_embeddings": false,
|
| 16 |
+
"use_cache": true,
|
| 17 |
+
"vocab_size": 32000
|
| 18 |
+
}
|
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/configs/transformer_7B.json
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"attention_bias": false,
|
| 3 |
+
"bos_token_id": 1,
|
| 4 |
+
"eos_token_id": 2,
|
| 5 |
+
"fuse_cross_entropy": true,
|
| 6 |
+
"fuse_norm": true,
|
| 7 |
+
"hidden_act": "swish",
|
| 8 |
+
"hidden_ratio": 4,
|
| 9 |
+
"hidden_size": 4096,
|
| 10 |
+
"initializer_range": 0.02,
|
| 11 |
+
"intermediate_size": 14336,
|
| 12 |
+
"model_type": "transformer",
|
| 13 |
+
"norm_eps": 1e-06,
|
| 14 |
+
"num_heads": 32,
|
| 15 |
+
"num_hidden_layers": 32,
|
| 16 |
+
"num_kv_heads": 8,
|
| 17 |
+
"rope_theta": 10000.0,
|
| 18 |
+
"tie_word_embeddings": false,
|
| 19 |
+
"use_cache": true,
|
| 20 |
+
"window_size": null
|
| 21 |
+
}
|
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/generation_config.json
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_from_model_config": true,
|
| 3 |
+
"bos_token_id": 1,
|
| 4 |
+
"eos_token_id": 2,
|
| 5 |
+
"transformers_version": "4.53.3"
|
| 6 |
+
}
|
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/0/error.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"message": {"message": "FileNotFoundError: Couldn't find any data file at /fineweb-edu-sample.", "extraInfo": {"py_callstack": "Traceback (most recent call last):\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py\", line 355, in wrapper\n return f(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py\", line 155, in main\n dataset = build_dataset(\n ^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/data.py\", line 649, in build_dataset\n subset = load_dataset(\n ^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py\", line 1392, in load_dataset\n builder_instance = load_dataset_builder(\n ^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py\", line 1132, in load_dataset_builder\n dataset_module = dataset_module_factory(\n ^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py\", line 1033, in dataset_module_factory\n raise FileNotFoundError(f\"Couldn't find any data file at {relative_to_absolute_path(path)}.\")\nFileNotFoundError: Couldn't find any data file at /fineweb-edu-sample.\n", "timestamp": "1753195112"}}}
|
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/0/stderr.log
ADDED
|
@@ -0,0 +1,175 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[titan] 2025-07-22 22:38:30,495 - root - INFO - Starting job: default job
|
| 2 |
+
[titan] 2025-07-22 22:38:30,496 - root - INFO - [32m{
|
| 3 |
+
"activation_checkpoint": {
|
| 4 |
+
"mode": "none",
|
| 5 |
+
"selective_ac_option": "2"
|
| 6 |
+
},
|
| 7 |
+
"activation_offload": {
|
| 8 |
+
"mode": "none"
|
| 9 |
+
},
|
| 10 |
+
"checkpoint": {
|
| 11 |
+
"async_mode": "disabled",
|
| 12 |
+
"create_seed_checkpoint": false,
|
| 13 |
+
"enable_checkpoint": true,
|
| 14 |
+
"exclude_from_loading": [],
|
| 15 |
+
"export_dtype": "float32",
|
| 16 |
+
"folder": "checkpoint",
|
| 17 |
+
"interval": 8192,
|
| 18 |
+
"interval_type": "steps",
|
| 19 |
+
"keep_latest_k": 100,
|
| 20 |
+
"load_step": -1,
|
| 21 |
+
"model_weights_only": false
|
| 22 |
+
},
|
| 23 |
+
"comm": {
|
| 24 |
+
"init_timeout_seconds": 300,
|
| 25 |
+
"trace_buf_size": 20000,
|
| 26 |
+
"train_timeout_seconds": 100
|
| 27 |
+
},
|
| 28 |
+
"experimental": {
|
| 29 |
+
"context_parallel_degree": 1,
|
| 30 |
+
"context_parallel_rotate_method": "allgather",
|
| 31 |
+
"custom_model_path": "",
|
| 32 |
+
"enable_async_tensor_parallel": false,
|
| 33 |
+
"enable_compiled_autograd": false,
|
| 34 |
+
"pipeline_parallel_degree": 1,
|
| 35 |
+
"pipeline_parallel_microbatches": null,
|
| 36 |
+
"pipeline_parallel_schedule": "1F1B",
|
| 37 |
+
"pipeline_parallel_schedule_csv": "",
|
| 38 |
+
"pipeline_parallel_split_points": []
|
| 39 |
+
},
|
| 40 |
+
"fault_tolerance": {
|
| 41 |
+
"enable": false,
|
| 42 |
+
"group_size": 0,
|
| 43 |
+
"min_replica_size": 1,
|
| 44 |
+
"replica_id": 0
|
| 45 |
+
},
|
| 46 |
+
"float8": {
|
| 47 |
+
"enable_fsdp_float8_all_gather": false,
|
| 48 |
+
"force_recompute_fp8_weight_in_bwd": false,
|
| 49 |
+
"precompute_float8_dynamic_scale_for_fsdp": false,
|
| 50 |
+
"recipe_name": null
|
| 51 |
+
},
|
| 52 |
+
"job": {
|
| 53 |
+
"config_file": "flame/models/fla.toml",
|
| 54 |
+
"description": "default job",
|
| 55 |
+
"dump_folder": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1",
|
| 56 |
+
"print_args": true,
|
| 57 |
+
"use_for_integration_test": false
|
| 58 |
+
},
|
| 59 |
+
"lr_scheduler": {
|
| 60 |
+
"decay_ratio": 1.0,
|
| 61 |
+
"decay_type": "linear",
|
| 62 |
+
"lr_min": 0.01,
|
| 63 |
+
"warmup_steps": 100
|
| 64 |
+
},
|
| 65 |
+
"memory_estimation": {
|
| 66 |
+
"disable_fake_mode": false,
|
| 67 |
+
"enabled": false
|
| 68 |
+
},
|
| 69 |
+
"metrics": {
|
| 70 |
+
"disable_color_printing": false,
|
| 71 |
+
"enable_tensorboard": true,
|
| 72 |
+
"enable_wandb": true,
|
| 73 |
+
"log_freq": 1,
|
| 74 |
+
"save_for_all_ranks": false,
|
| 75 |
+
"save_tb_folder": "tb"
|
| 76 |
+
},
|
| 77 |
+
"model": {
|
| 78 |
+
"config": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/configs/transformer_340M.json",
|
| 79 |
+
"converters": [],
|
| 80 |
+
"name": "fla",
|
| 81 |
+
"print_after_conversion": false,
|
| 82 |
+
"tokenizer_path": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer"
|
| 83 |
+
},
|
| 84 |
+
"optimizer": {
|
| 85 |
+
"early_step_in_backward": false,
|
| 86 |
+
"eps": 1e-08,
|
| 87 |
+
"implementation": "fused",
|
| 88 |
+
"lr": 0.0003,
|
| 89 |
+
"name": "AdamW"
|
| 90 |
+
},
|
| 91 |
+
"profiling": {
|
| 92 |
+
"enable_memory_snapshot": false,
|
| 93 |
+
"enable_profiling": true,
|
| 94 |
+
"profile_freq": 512,
|
| 95 |
+
"save_memory_snapshot_folder": "memory_snapshot",
|
| 96 |
+
"save_traces_folder": "profile_trace"
|
| 97 |
+
},
|
| 98 |
+
"training": {
|
| 99 |
+
"batch_size": 16,
|
| 100 |
+
"compile": true,
|
| 101 |
+
"context_len": 8192,
|
| 102 |
+
"data_dir": null,
|
| 103 |
+
"data_files": null,
|
| 104 |
+
"data_parallel_replicate_degree": 1,
|
| 105 |
+
"data_parallel_shard_degree": -1,
|
| 106 |
+
"data_probs": "0.55,0.3,0.15",
|
| 107 |
+
"dataset": "/fineweb-edu-sample,/small_repos_20B_sample_merged,/MegaMath/megamath-web-pro",
|
| 108 |
+
"dataset_name": "default,default,default",
|
| 109 |
+
"dataset_split": "train,train,train",
|
| 110 |
+
"deterministic": false,
|
| 111 |
+
"disable_loss_parallel": false,
|
| 112 |
+
"enable_cpu_offload": false,
|
| 113 |
+
"fsdp_reshard_after_forward": "default",
|
| 114 |
+
"gc_freq": 50,
|
| 115 |
+
"gradient_accumulation_steps": 1,
|
| 116 |
+
"max_norm": 1.0,
|
| 117 |
+
"mixed_precision_param": "bfloat16",
|
| 118 |
+
"mixed_precision_reduce": "float32",
|
| 119 |
+
"num_workers": 32,
|
| 120 |
+
"persistent_workers": false,
|
| 121 |
+
"pin_memory": false,
|
| 122 |
+
"prefetch_factor": 2,
|
| 123 |
+
"seed": 42,
|
| 124 |
+
"seq_len": 8192,
|
| 125 |
+
"skip_nan_inf": true,
|
| 126 |
+
"steps": 95366,
|
| 127 |
+
"streaming": true,
|
| 128 |
+
"tensor_parallel_degree": 1,
|
| 129 |
+
"varlen": false
|
| 130 |
+
}
|
| 131 |
+
}[39m
|
| 132 |
+
[titan] 2025-07-22 22:38:30,496 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
|
| 133 |
+
[titan] 2025-07-22 22:38:30,496 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
|
| 134 |
+
[titan] 2025-07-22 22:38:30,514 - root - INFO - CUDA capacity: NVIDIA H20 with 95.00GiB memory
|
| 135 |
+
[titan] 2025-07-22 22:38:30,602 - root - WARNING - Peak flops undefined for: NVIDIA H20, fallback to A100
|
| 136 |
+
[titan] 2025-07-22 22:38:30,602 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
|
| 137 |
+
[titan] 2025-07-22 22:38:30,602 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
|
| 138 |
+
[titan] 2025-07-22 22:38:31,582 - root - INFO - Loading tokenizer...
|
| 139 |
+
[titan] 2025-07-22 22:38:32,959 - root - INFO - LlamaTokenizerFast(name_or_path='/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer', vocab_size=32000, model_max_length=10000000000, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
|
| 140 |
+
0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 141 |
+
1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 142 |
+
2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 143 |
+
}
|
| 144 |
+
)
|
| 145 |
+
[titan] 2025-07-22 22:38:32,959 - root - INFO - Loading dataset /fineweb-edu-sample,/small_repos_20B_sample_merged,/MegaMath/megamath-web-pro:default,default,default
|
| 146 |
+
`trust_remote_code` is not supported anymore.
|
| 147 |
+
Please check that the Hugging Face dataset '/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
|
| 148 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 149 |
+
[titan] 2025-07-22 22:38:32,959 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
|
| 150 |
+
Please check that the Hugging Face dataset '/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
|
| 151 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 152 |
+
[rank0]: Traceback (most recent call last):
|
| 153 |
+
[rank0]: File "<frozen runpy>", line 198, in _run_module_as_main
|
| 154 |
+
[rank0]: File "<frozen runpy>", line 88, in _run_code
|
| 155 |
+
[rank0]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 615, in <module>
|
| 156 |
+
[rank0]: main(config)
|
| 157 |
+
[rank0]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
|
| 158 |
+
[rank0]: return f(*args, **kwargs)
|
| 159 |
+
[rank0]: ^^^^^^^^^^^^^^^^^^
|
| 160 |
+
[rank0]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 155, in main
|
| 161 |
+
[rank0]: dataset = build_dataset(
|
| 162 |
+
[rank0]: ^^^^^^^^^^^^^^
|
| 163 |
+
[rank0]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/data.py", line 649, in build_dataset
|
| 164 |
+
[rank0]: subset = load_dataset(
|
| 165 |
+
[rank0]: ^^^^^^^^^^^^^
|
| 166 |
+
[rank0]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py", line 1392, in load_dataset
|
| 167 |
+
[rank0]: builder_instance = load_dataset_builder(
|
| 168 |
+
[rank0]: ^^^^^^^^^^^^^^^^^^^^^
|
| 169 |
+
[rank0]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py", line 1132, in load_dataset_builder
|
| 170 |
+
[rank0]: dataset_module = dataset_module_factory(
|
| 171 |
+
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^
|
| 172 |
+
[rank0]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py", line 1033, in dataset_module_factory
|
| 173 |
+
[rank0]: raise FileNotFoundError(f"Couldn't find any data file at {relative_to_absolute_path(path)}.")
|
| 174 |
+
[rank0]: FileNotFoundError: Couldn't find any data file at /fineweb-edu-sample.
|
| 175 |
+
[rank0]:[W722 22:38:33.059900780 ProcessGroupNCCL.cpp:1479] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
|
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/0/stdout.log
ADDED
|
File without changes
|
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/1/error.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"message": {"message": "FileNotFoundError: Couldn't find any data file at /fineweb-edu-sample.", "extraInfo": {"py_callstack": "Traceback (most recent call last):\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py\", line 355, in wrapper\n return f(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py\", line 155, in main\n dataset = build_dataset(\n ^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/data.py\", line 649, in build_dataset\n subset = load_dataset(\n ^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py\", line 1392, in load_dataset\n builder_instance = load_dataset_builder(\n ^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py\", line 1132, in load_dataset_builder\n dataset_module = dataset_module_factory(\n ^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py\", line 1033, in dataset_module_factory\n raise FileNotFoundError(f\"Couldn't find any data file at {relative_to_absolute_path(path)}.\")\nFileNotFoundError: Couldn't find any data file at /fineweb-edu-sample.\n", "timestamp": "1753195112"}}}
|
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/1/stderr.log
ADDED
|
@@ -0,0 +1,174 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[titan] 2025-07-22 22:38:30,567 - root - INFO - Starting job: default job
|
| 2 |
+
[titan] 2025-07-22 22:38:30,568 - root - INFO - [32m{
|
| 3 |
+
"activation_checkpoint": {
|
| 4 |
+
"mode": "none",
|
| 5 |
+
"selective_ac_option": "2"
|
| 6 |
+
},
|
| 7 |
+
"activation_offload": {
|
| 8 |
+
"mode": "none"
|
| 9 |
+
},
|
| 10 |
+
"checkpoint": {
|
| 11 |
+
"async_mode": "disabled",
|
| 12 |
+
"create_seed_checkpoint": false,
|
| 13 |
+
"enable_checkpoint": true,
|
| 14 |
+
"exclude_from_loading": [],
|
| 15 |
+
"export_dtype": "float32",
|
| 16 |
+
"folder": "checkpoint",
|
| 17 |
+
"interval": 8192,
|
| 18 |
+
"interval_type": "steps",
|
| 19 |
+
"keep_latest_k": 100,
|
| 20 |
+
"load_step": -1,
|
| 21 |
+
"model_weights_only": false
|
| 22 |
+
},
|
| 23 |
+
"comm": {
|
| 24 |
+
"init_timeout_seconds": 300,
|
| 25 |
+
"trace_buf_size": 20000,
|
| 26 |
+
"train_timeout_seconds": 100
|
| 27 |
+
},
|
| 28 |
+
"experimental": {
|
| 29 |
+
"context_parallel_degree": 1,
|
| 30 |
+
"context_parallel_rotate_method": "allgather",
|
| 31 |
+
"custom_model_path": "",
|
| 32 |
+
"enable_async_tensor_parallel": false,
|
| 33 |
+
"enable_compiled_autograd": false,
|
| 34 |
+
"pipeline_parallel_degree": 1,
|
| 35 |
+
"pipeline_parallel_microbatches": null,
|
| 36 |
+
"pipeline_parallel_schedule": "1F1B",
|
| 37 |
+
"pipeline_parallel_schedule_csv": "",
|
| 38 |
+
"pipeline_parallel_split_points": []
|
| 39 |
+
},
|
| 40 |
+
"fault_tolerance": {
|
| 41 |
+
"enable": false,
|
| 42 |
+
"group_size": 0,
|
| 43 |
+
"min_replica_size": 1,
|
| 44 |
+
"replica_id": 0
|
| 45 |
+
},
|
| 46 |
+
"float8": {
|
| 47 |
+
"enable_fsdp_float8_all_gather": false,
|
| 48 |
+
"force_recompute_fp8_weight_in_bwd": false,
|
| 49 |
+
"precompute_float8_dynamic_scale_for_fsdp": false,
|
| 50 |
+
"recipe_name": null
|
| 51 |
+
},
|
| 52 |
+
"job": {
|
| 53 |
+
"config_file": "flame/models/fla.toml",
|
| 54 |
+
"description": "default job",
|
| 55 |
+
"dump_folder": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1",
|
| 56 |
+
"print_args": true,
|
| 57 |
+
"use_for_integration_test": false
|
| 58 |
+
},
|
| 59 |
+
"lr_scheduler": {
|
| 60 |
+
"decay_ratio": 1.0,
|
| 61 |
+
"decay_type": "linear",
|
| 62 |
+
"lr_min": 0.01,
|
| 63 |
+
"warmup_steps": 100
|
| 64 |
+
},
|
| 65 |
+
"memory_estimation": {
|
| 66 |
+
"disable_fake_mode": false,
|
| 67 |
+
"enabled": false
|
| 68 |
+
},
|
| 69 |
+
"metrics": {
|
| 70 |
+
"disable_color_printing": false,
|
| 71 |
+
"enable_tensorboard": true,
|
| 72 |
+
"enable_wandb": true,
|
| 73 |
+
"log_freq": 1,
|
| 74 |
+
"save_for_all_ranks": false,
|
| 75 |
+
"save_tb_folder": "tb"
|
| 76 |
+
},
|
| 77 |
+
"model": {
|
| 78 |
+
"config": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/configs/transformer_340M.json",
|
| 79 |
+
"converters": [],
|
| 80 |
+
"name": "fla",
|
| 81 |
+
"print_after_conversion": false,
|
| 82 |
+
"tokenizer_path": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer"
|
| 83 |
+
},
|
| 84 |
+
"optimizer": {
|
| 85 |
+
"early_step_in_backward": false,
|
| 86 |
+
"eps": 1e-08,
|
| 87 |
+
"implementation": "fused",
|
| 88 |
+
"lr": 0.0003,
|
| 89 |
+
"name": "AdamW"
|
| 90 |
+
},
|
| 91 |
+
"profiling": {
|
| 92 |
+
"enable_memory_snapshot": false,
|
| 93 |
+
"enable_profiling": true,
|
| 94 |
+
"profile_freq": 512,
|
| 95 |
+
"save_memory_snapshot_folder": "memory_snapshot",
|
| 96 |
+
"save_traces_folder": "profile_trace"
|
| 97 |
+
},
|
| 98 |
+
"training": {
|
| 99 |
+
"batch_size": 16,
|
| 100 |
+
"compile": true,
|
| 101 |
+
"context_len": 8192,
|
| 102 |
+
"data_dir": null,
|
| 103 |
+
"data_files": null,
|
| 104 |
+
"data_parallel_replicate_degree": 1,
|
| 105 |
+
"data_parallel_shard_degree": -1,
|
| 106 |
+
"data_probs": "0.55,0.3,0.15",
|
| 107 |
+
"dataset": "/fineweb-edu-sample,/small_repos_20B_sample_merged,/MegaMath/megamath-web-pro",
|
| 108 |
+
"dataset_name": "default,default,default",
|
| 109 |
+
"dataset_split": "train,train,train",
|
| 110 |
+
"deterministic": false,
|
| 111 |
+
"disable_loss_parallel": false,
|
| 112 |
+
"enable_cpu_offload": false,
|
| 113 |
+
"fsdp_reshard_after_forward": "default",
|
| 114 |
+
"gc_freq": 50,
|
| 115 |
+
"gradient_accumulation_steps": 1,
|
| 116 |
+
"max_norm": 1.0,
|
| 117 |
+
"mixed_precision_param": "bfloat16",
|
| 118 |
+
"mixed_precision_reduce": "float32",
|
| 119 |
+
"num_workers": 32,
|
| 120 |
+
"persistent_workers": false,
|
| 121 |
+
"pin_memory": false,
|
| 122 |
+
"prefetch_factor": 2,
|
| 123 |
+
"seed": 42,
|
| 124 |
+
"seq_len": 8192,
|
| 125 |
+
"skip_nan_inf": true,
|
| 126 |
+
"steps": 95366,
|
| 127 |
+
"streaming": true,
|
| 128 |
+
"tensor_parallel_degree": 1,
|
| 129 |
+
"varlen": false
|
| 130 |
+
}
|
| 131 |
+
}[39m
|
| 132 |
+
[titan] 2025-07-22 22:38:30,568 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
|
| 133 |
+
[titan] 2025-07-22 22:38:31,570 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
|
| 134 |
+
[titan] 2025-07-22 22:38:31,572 - root - INFO - CUDA capacity: NVIDIA H20 with 95.00GiB memory
|
| 135 |
+
[titan] 2025-07-22 22:38:31,659 - root - WARNING - Peak flops undefined for: NVIDIA H20, fallback to A100
|
| 136 |
+
[titan] 2025-07-22 22:38:31,659 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
|
| 137 |
+
[titan] 2025-07-22 22:38:31,659 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
|
| 138 |
+
[titan] 2025-07-22 22:38:31,687 - root - INFO - Loading tokenizer...
|
| 139 |
+
[titan] 2025-07-22 22:38:32,959 - root - INFO - LlamaTokenizerFast(name_or_path='/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer', vocab_size=32000, model_max_length=10000000000, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
|
| 140 |
+
0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 141 |
+
1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 142 |
+
2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 143 |
+
}
|
| 144 |
+
)
|
| 145 |
+
[titan] 2025-07-22 22:38:32,959 - root - INFO - Loading dataset /fineweb-edu-sample,/small_repos_20B_sample_merged,/MegaMath/megamath-web-pro:default,default,default
|
| 146 |
+
`trust_remote_code` is not supported anymore.
|
| 147 |
+
Please check that the Hugging Face dataset '/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
|
| 148 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 149 |
+
[titan] 2025-07-22 22:38:32,959 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
|
| 150 |
+
Please check that the Hugging Face dataset '/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
|
| 151 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 152 |
+
[rank1]: Traceback (most recent call last):
|
| 153 |
+
[rank1]: File "<frozen runpy>", line 198, in _run_module_as_main
|
| 154 |
+
[rank1]: File "<frozen runpy>", line 88, in _run_code
|
| 155 |
+
[rank1]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 615, in <module>
|
| 156 |
+
[rank1]: main(config)
|
| 157 |
+
[rank1]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
|
| 158 |
+
[rank1]: return f(*args, **kwargs)
|
| 159 |
+
[rank1]: ^^^^^^^^^^^^^^^^^^
|
| 160 |
+
[rank1]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 155, in main
|
| 161 |
+
[rank1]: dataset = build_dataset(
|
| 162 |
+
[rank1]: ^^^^^^^^^^^^^^
|
| 163 |
+
[rank1]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/data.py", line 649, in build_dataset
|
| 164 |
+
[rank1]: subset = load_dataset(
|
| 165 |
+
[rank1]: ^^^^^^^^^^^^^
|
| 166 |
+
[rank1]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py", line 1392, in load_dataset
|
| 167 |
+
[rank1]: builder_instance = load_dataset_builder(
|
| 168 |
+
[rank1]: ^^^^^^^^^^^^^^^^^^^^^
|
| 169 |
+
[rank1]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py", line 1132, in load_dataset_builder
|
| 170 |
+
[rank1]: dataset_module = dataset_module_factory(
|
| 171 |
+
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^
|
| 172 |
+
[rank1]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py", line 1033, in dataset_module_factory
|
| 173 |
+
[rank1]: raise FileNotFoundError(f"Couldn't find any data file at {relative_to_absolute_path(path)}.")
|
| 174 |
+
[rank1]: FileNotFoundError: Couldn't find any data file at /fineweb-edu-sample.
|
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/1/stdout.log
ADDED
|
File without changes
|
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/2/error.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"message": {"message": "FileNotFoundError: Couldn't find any data file at /fineweb-edu-sample.", "extraInfo": {"py_callstack": "Traceback (most recent call last):\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py\", line 355, in wrapper\n return f(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py\", line 155, in main\n dataset = build_dataset(\n ^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/data.py\", line 649, in build_dataset\n subset = load_dataset(\n ^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py\", line 1392, in load_dataset\n builder_instance = load_dataset_builder(\n ^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py\", line 1132, in load_dataset_builder\n dataset_module = dataset_module_factory(\n ^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py\", line 1033, in dataset_module_factory\n raise FileNotFoundError(f\"Couldn't find any data file at {relative_to_absolute_path(path)}.\")\nFileNotFoundError: Couldn't find any data file at /fineweb-edu-sample.\n", "timestamp": "1753195112"}}}
|
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/2/stderr.log
ADDED
|
@@ -0,0 +1,174 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[titan] 2025-07-22 22:38:30,455 - root - INFO - Starting job: default job
|
| 2 |
+
[titan] 2025-07-22 22:38:30,456 - root - INFO - [32m{
|
| 3 |
+
"activation_checkpoint": {
|
| 4 |
+
"mode": "none",
|
| 5 |
+
"selective_ac_option": "2"
|
| 6 |
+
},
|
| 7 |
+
"activation_offload": {
|
| 8 |
+
"mode": "none"
|
| 9 |
+
},
|
| 10 |
+
"checkpoint": {
|
| 11 |
+
"async_mode": "disabled",
|
| 12 |
+
"create_seed_checkpoint": false,
|
| 13 |
+
"enable_checkpoint": true,
|
| 14 |
+
"exclude_from_loading": [],
|
| 15 |
+
"export_dtype": "float32",
|
| 16 |
+
"folder": "checkpoint",
|
| 17 |
+
"interval": 8192,
|
| 18 |
+
"interval_type": "steps",
|
| 19 |
+
"keep_latest_k": 100,
|
| 20 |
+
"load_step": -1,
|
| 21 |
+
"model_weights_only": false
|
| 22 |
+
},
|
| 23 |
+
"comm": {
|
| 24 |
+
"init_timeout_seconds": 300,
|
| 25 |
+
"trace_buf_size": 20000,
|
| 26 |
+
"train_timeout_seconds": 100
|
| 27 |
+
},
|
| 28 |
+
"experimental": {
|
| 29 |
+
"context_parallel_degree": 1,
|
| 30 |
+
"context_parallel_rotate_method": "allgather",
|
| 31 |
+
"custom_model_path": "",
|
| 32 |
+
"enable_async_tensor_parallel": false,
|
| 33 |
+
"enable_compiled_autograd": false,
|
| 34 |
+
"pipeline_parallel_degree": 1,
|
| 35 |
+
"pipeline_parallel_microbatches": null,
|
| 36 |
+
"pipeline_parallel_schedule": "1F1B",
|
| 37 |
+
"pipeline_parallel_schedule_csv": "",
|
| 38 |
+
"pipeline_parallel_split_points": []
|
| 39 |
+
},
|
| 40 |
+
"fault_tolerance": {
|
| 41 |
+
"enable": false,
|
| 42 |
+
"group_size": 0,
|
| 43 |
+
"min_replica_size": 1,
|
| 44 |
+
"replica_id": 0
|
| 45 |
+
},
|
| 46 |
+
"float8": {
|
| 47 |
+
"enable_fsdp_float8_all_gather": false,
|
| 48 |
+
"force_recompute_fp8_weight_in_bwd": false,
|
| 49 |
+
"precompute_float8_dynamic_scale_for_fsdp": false,
|
| 50 |
+
"recipe_name": null
|
| 51 |
+
},
|
| 52 |
+
"job": {
|
| 53 |
+
"config_file": "flame/models/fla.toml",
|
| 54 |
+
"description": "default job",
|
| 55 |
+
"dump_folder": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1",
|
| 56 |
+
"print_args": true,
|
| 57 |
+
"use_for_integration_test": false
|
| 58 |
+
},
|
| 59 |
+
"lr_scheduler": {
|
| 60 |
+
"decay_ratio": 1.0,
|
| 61 |
+
"decay_type": "linear",
|
| 62 |
+
"lr_min": 0.01,
|
| 63 |
+
"warmup_steps": 100
|
| 64 |
+
},
|
| 65 |
+
"memory_estimation": {
|
| 66 |
+
"disable_fake_mode": false,
|
| 67 |
+
"enabled": false
|
| 68 |
+
},
|
| 69 |
+
"metrics": {
|
| 70 |
+
"disable_color_printing": false,
|
| 71 |
+
"enable_tensorboard": true,
|
| 72 |
+
"enable_wandb": true,
|
| 73 |
+
"log_freq": 1,
|
| 74 |
+
"save_for_all_ranks": false,
|
| 75 |
+
"save_tb_folder": "tb"
|
| 76 |
+
},
|
| 77 |
+
"model": {
|
| 78 |
+
"config": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/configs/transformer_340M.json",
|
| 79 |
+
"converters": [],
|
| 80 |
+
"name": "fla",
|
| 81 |
+
"print_after_conversion": false,
|
| 82 |
+
"tokenizer_path": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer"
|
| 83 |
+
},
|
| 84 |
+
"optimizer": {
|
| 85 |
+
"early_step_in_backward": false,
|
| 86 |
+
"eps": 1e-08,
|
| 87 |
+
"implementation": "fused",
|
| 88 |
+
"lr": 0.0003,
|
| 89 |
+
"name": "AdamW"
|
| 90 |
+
},
|
| 91 |
+
"profiling": {
|
| 92 |
+
"enable_memory_snapshot": false,
|
| 93 |
+
"enable_profiling": true,
|
| 94 |
+
"profile_freq": 512,
|
| 95 |
+
"save_memory_snapshot_folder": "memory_snapshot",
|
| 96 |
+
"save_traces_folder": "profile_trace"
|
| 97 |
+
},
|
| 98 |
+
"training": {
|
| 99 |
+
"batch_size": 16,
|
| 100 |
+
"compile": true,
|
| 101 |
+
"context_len": 8192,
|
| 102 |
+
"data_dir": null,
|
| 103 |
+
"data_files": null,
|
| 104 |
+
"data_parallel_replicate_degree": 1,
|
| 105 |
+
"data_parallel_shard_degree": -1,
|
| 106 |
+
"data_probs": "0.55,0.3,0.15",
|
| 107 |
+
"dataset": "/fineweb-edu-sample,/small_repos_20B_sample_merged,/MegaMath/megamath-web-pro",
|
| 108 |
+
"dataset_name": "default,default,default",
|
| 109 |
+
"dataset_split": "train,train,train",
|
| 110 |
+
"deterministic": false,
|
| 111 |
+
"disable_loss_parallel": false,
|
| 112 |
+
"enable_cpu_offload": false,
|
| 113 |
+
"fsdp_reshard_after_forward": "default",
|
| 114 |
+
"gc_freq": 50,
|
| 115 |
+
"gradient_accumulation_steps": 1,
|
| 116 |
+
"max_norm": 1.0,
|
| 117 |
+
"mixed_precision_param": "bfloat16",
|
| 118 |
+
"mixed_precision_reduce": "float32",
|
| 119 |
+
"num_workers": 32,
|
| 120 |
+
"persistent_workers": false,
|
| 121 |
+
"pin_memory": false,
|
| 122 |
+
"prefetch_factor": 2,
|
| 123 |
+
"seed": 42,
|
| 124 |
+
"seq_len": 8192,
|
| 125 |
+
"skip_nan_inf": true,
|
| 126 |
+
"steps": 95366,
|
| 127 |
+
"streaming": true,
|
| 128 |
+
"tensor_parallel_degree": 1,
|
| 129 |
+
"varlen": false
|
| 130 |
+
}
|
| 131 |
+
}[39m
|
| 132 |
+
[titan] 2025-07-22 22:38:30,457 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
|
| 133 |
+
[titan] 2025-07-22 22:38:31,554 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
|
| 134 |
+
[titan] 2025-07-22 22:38:31,557 - root - INFO - CUDA capacity: NVIDIA H20 with 95.00GiB memory
|
| 135 |
+
[titan] 2025-07-22 22:38:31,609 - root - WARNING - Peak flops undefined for: NVIDIA H20, fallback to A100
|
| 136 |
+
[titan] 2025-07-22 22:38:31,610 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
|
| 137 |
+
[titan] 2025-07-22 22:38:31,610 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
|
| 138 |
+
[titan] 2025-07-22 22:38:31,634 - root - INFO - Loading tokenizer...
|
| 139 |
+
[titan] 2025-07-22 22:38:32,962 - root - INFO - LlamaTokenizerFast(name_or_path='/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer', vocab_size=32000, model_max_length=10000000000, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
|
| 140 |
+
0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 141 |
+
1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 142 |
+
2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 143 |
+
}
|
| 144 |
+
)
|
| 145 |
+
[titan] 2025-07-22 22:38:32,963 - root - INFO - Loading dataset /fineweb-edu-sample,/small_repos_20B_sample_merged,/MegaMath/megamath-web-pro:default,default,default
|
| 146 |
+
`trust_remote_code` is not supported anymore.
|
| 147 |
+
Please check that the Hugging Face dataset '/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
|
| 148 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 149 |
+
[titan] 2025-07-22 22:38:32,963 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
|
| 150 |
+
Please check that the Hugging Face dataset '/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
|
| 151 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 152 |
+
[rank2]: Traceback (most recent call last):
|
| 153 |
+
[rank2]: File "<frozen runpy>", line 198, in _run_module_as_main
|
| 154 |
+
[rank2]: File "<frozen runpy>", line 88, in _run_code
|
| 155 |
+
[rank2]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 615, in <module>
|
| 156 |
+
[rank2]: main(config)
|
| 157 |
+
[rank2]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
|
| 158 |
+
[rank2]: return f(*args, **kwargs)
|
| 159 |
+
[rank2]: ^^^^^^^^^^^^^^^^^^
|
| 160 |
+
[rank2]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 155, in main
|
| 161 |
+
[rank2]: dataset = build_dataset(
|
| 162 |
+
[rank2]: ^^^^^^^^^^^^^^
|
| 163 |
+
[rank2]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/data.py", line 649, in build_dataset
|
| 164 |
+
[rank2]: subset = load_dataset(
|
| 165 |
+
[rank2]: ^^^^^^^^^^^^^
|
| 166 |
+
[rank2]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py", line 1392, in load_dataset
|
| 167 |
+
[rank2]: builder_instance = load_dataset_builder(
|
| 168 |
+
[rank2]: ^^^^^^^^^^^^^^^^^^^^^
|
| 169 |
+
[rank2]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py", line 1132, in load_dataset_builder
|
| 170 |
+
[rank2]: dataset_module = dataset_module_factory(
|
| 171 |
+
[rank2]: ^^^^^^^^^^^^^^^^^^^^^^^
|
| 172 |
+
[rank2]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py", line 1033, in dataset_module_factory
|
| 173 |
+
[rank2]: raise FileNotFoundError(f"Couldn't find any data file at {relative_to_absolute_path(path)}.")
|
| 174 |
+
[rank2]: FileNotFoundError: Couldn't find any data file at /fineweb-edu-sample.
|
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/2/stdout.log
ADDED
|
File without changes
|
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/3/error.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"message": {"message": "FileNotFoundError: Couldn't find any data file at /fineweb-edu-sample.", "extraInfo": {"py_callstack": "Traceback (most recent call last):\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py\", line 355, in wrapper\n return f(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py\", line 155, in main\n dataset = build_dataset(\n ^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/data.py\", line 649, in build_dataset\n subset = load_dataset(\n ^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py\", line 1392, in load_dataset\n builder_instance = load_dataset_builder(\n ^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py\", line 1132, in load_dataset_builder\n dataset_module = dataset_module_factory(\n ^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py\", line 1033, in dataset_module_factory\n raise FileNotFoundError(f\"Couldn't find any data file at {relative_to_absolute_path(path)}.\")\nFileNotFoundError: Couldn't find any data file at /fineweb-edu-sample.\n", "timestamp": "1753195112"}}}
|
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/3/stderr.log
ADDED
|
@@ -0,0 +1,174 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[titan] 2025-07-22 22:38:30,383 - root - INFO - Starting job: default job
|
| 2 |
+
[titan] 2025-07-22 22:38:30,384 - root - INFO - [32m{
|
| 3 |
+
"activation_checkpoint": {
|
| 4 |
+
"mode": "none",
|
| 5 |
+
"selective_ac_option": "2"
|
| 6 |
+
},
|
| 7 |
+
"activation_offload": {
|
| 8 |
+
"mode": "none"
|
| 9 |
+
},
|
| 10 |
+
"checkpoint": {
|
| 11 |
+
"async_mode": "disabled",
|
| 12 |
+
"create_seed_checkpoint": false,
|
| 13 |
+
"enable_checkpoint": true,
|
| 14 |
+
"exclude_from_loading": [],
|
| 15 |
+
"export_dtype": "float32",
|
| 16 |
+
"folder": "checkpoint",
|
| 17 |
+
"interval": 8192,
|
| 18 |
+
"interval_type": "steps",
|
| 19 |
+
"keep_latest_k": 100,
|
| 20 |
+
"load_step": -1,
|
| 21 |
+
"model_weights_only": false
|
| 22 |
+
},
|
| 23 |
+
"comm": {
|
| 24 |
+
"init_timeout_seconds": 300,
|
| 25 |
+
"trace_buf_size": 20000,
|
| 26 |
+
"train_timeout_seconds": 100
|
| 27 |
+
},
|
| 28 |
+
"experimental": {
|
| 29 |
+
"context_parallel_degree": 1,
|
| 30 |
+
"context_parallel_rotate_method": "allgather",
|
| 31 |
+
"custom_model_path": "",
|
| 32 |
+
"enable_async_tensor_parallel": false,
|
| 33 |
+
"enable_compiled_autograd": false,
|
| 34 |
+
"pipeline_parallel_degree": 1,
|
| 35 |
+
"pipeline_parallel_microbatches": null,
|
| 36 |
+
"pipeline_parallel_schedule": "1F1B",
|
| 37 |
+
"pipeline_parallel_schedule_csv": "",
|
| 38 |
+
"pipeline_parallel_split_points": []
|
| 39 |
+
},
|
| 40 |
+
"fault_tolerance": {
|
| 41 |
+
"enable": false,
|
| 42 |
+
"group_size": 0,
|
| 43 |
+
"min_replica_size": 1,
|
| 44 |
+
"replica_id": 0
|
| 45 |
+
},
|
| 46 |
+
"float8": {
|
| 47 |
+
"enable_fsdp_float8_all_gather": false,
|
| 48 |
+
"force_recompute_fp8_weight_in_bwd": false,
|
| 49 |
+
"precompute_float8_dynamic_scale_for_fsdp": false,
|
| 50 |
+
"recipe_name": null
|
| 51 |
+
},
|
| 52 |
+
"job": {
|
| 53 |
+
"config_file": "flame/models/fla.toml",
|
| 54 |
+
"description": "default job",
|
| 55 |
+
"dump_folder": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1",
|
| 56 |
+
"print_args": true,
|
| 57 |
+
"use_for_integration_test": false
|
| 58 |
+
},
|
| 59 |
+
"lr_scheduler": {
|
| 60 |
+
"decay_ratio": 1.0,
|
| 61 |
+
"decay_type": "linear",
|
| 62 |
+
"lr_min": 0.01,
|
| 63 |
+
"warmup_steps": 100
|
| 64 |
+
},
|
| 65 |
+
"memory_estimation": {
|
| 66 |
+
"disable_fake_mode": false,
|
| 67 |
+
"enabled": false
|
| 68 |
+
},
|
| 69 |
+
"metrics": {
|
| 70 |
+
"disable_color_printing": false,
|
| 71 |
+
"enable_tensorboard": true,
|
| 72 |
+
"enable_wandb": true,
|
| 73 |
+
"log_freq": 1,
|
| 74 |
+
"save_for_all_ranks": false,
|
| 75 |
+
"save_tb_folder": "tb"
|
| 76 |
+
},
|
| 77 |
+
"model": {
|
| 78 |
+
"config": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/configs/transformer_340M.json",
|
| 79 |
+
"converters": [],
|
| 80 |
+
"name": "fla",
|
| 81 |
+
"print_after_conversion": false,
|
| 82 |
+
"tokenizer_path": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer"
|
| 83 |
+
},
|
| 84 |
+
"optimizer": {
|
| 85 |
+
"early_step_in_backward": false,
|
| 86 |
+
"eps": 1e-08,
|
| 87 |
+
"implementation": "fused",
|
| 88 |
+
"lr": 0.0003,
|
| 89 |
+
"name": "AdamW"
|
| 90 |
+
},
|
| 91 |
+
"profiling": {
|
| 92 |
+
"enable_memory_snapshot": false,
|
| 93 |
+
"enable_profiling": true,
|
| 94 |
+
"profile_freq": 512,
|
| 95 |
+
"save_memory_snapshot_folder": "memory_snapshot",
|
| 96 |
+
"save_traces_folder": "profile_trace"
|
| 97 |
+
},
|
| 98 |
+
"training": {
|
| 99 |
+
"batch_size": 16,
|
| 100 |
+
"compile": true,
|
| 101 |
+
"context_len": 8192,
|
| 102 |
+
"data_dir": null,
|
| 103 |
+
"data_files": null,
|
| 104 |
+
"data_parallel_replicate_degree": 1,
|
| 105 |
+
"data_parallel_shard_degree": -1,
|
| 106 |
+
"data_probs": "0.55,0.3,0.15",
|
| 107 |
+
"dataset": "/fineweb-edu-sample,/small_repos_20B_sample_merged,/MegaMath/megamath-web-pro",
|
| 108 |
+
"dataset_name": "default,default,default",
|
| 109 |
+
"dataset_split": "train,train,train",
|
| 110 |
+
"deterministic": false,
|
| 111 |
+
"disable_loss_parallel": false,
|
| 112 |
+
"enable_cpu_offload": false,
|
| 113 |
+
"fsdp_reshard_after_forward": "default",
|
| 114 |
+
"gc_freq": 50,
|
| 115 |
+
"gradient_accumulation_steps": 1,
|
| 116 |
+
"max_norm": 1.0,
|
| 117 |
+
"mixed_precision_param": "bfloat16",
|
| 118 |
+
"mixed_precision_reduce": "float32",
|
| 119 |
+
"num_workers": 32,
|
| 120 |
+
"persistent_workers": false,
|
| 121 |
+
"pin_memory": false,
|
| 122 |
+
"prefetch_factor": 2,
|
| 123 |
+
"seed": 42,
|
| 124 |
+
"seq_len": 8192,
|
| 125 |
+
"skip_nan_inf": true,
|
| 126 |
+
"steps": 95366,
|
| 127 |
+
"streaming": true,
|
| 128 |
+
"tensor_parallel_degree": 1,
|
| 129 |
+
"varlen": false
|
| 130 |
+
}
|
| 131 |
+
}[39m
|
| 132 |
+
[titan] 2025-07-22 22:38:30,385 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
|
| 133 |
+
[titan] 2025-07-22 22:38:31,417 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
|
| 134 |
+
[titan] 2025-07-22 22:38:31,419 - root - INFO - CUDA capacity: NVIDIA H20 with 95.00GiB memory
|
| 135 |
+
[titan] 2025-07-22 22:38:31,456 - root - WARNING - Peak flops undefined for: NVIDIA H20, fallback to A100
|
| 136 |
+
[titan] 2025-07-22 22:38:31,456 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
|
| 137 |
+
[titan] 2025-07-22 22:38:31,457 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
|
| 138 |
+
[titan] 2025-07-22 22:38:31,526 - root - INFO - Loading tokenizer...
|
| 139 |
+
[titan] 2025-07-22 22:38:32,964 - root - INFO - LlamaTokenizerFast(name_or_path='/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer', vocab_size=32000, model_max_length=10000000000, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
|
| 140 |
+
0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 141 |
+
1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 142 |
+
2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 143 |
+
}
|
| 144 |
+
)
|
| 145 |
+
[titan] 2025-07-22 22:38:32,964 - root - INFO - Loading dataset /fineweb-edu-sample,/small_repos_20B_sample_merged,/MegaMath/megamath-web-pro:default,default,default
|
| 146 |
+
`trust_remote_code` is not supported anymore.
|
| 147 |
+
Please check that the Hugging Face dataset '/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
|
| 148 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 149 |
+
[titan] 2025-07-22 22:38:32,964 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
|
| 150 |
+
Please check that the Hugging Face dataset '/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
|
| 151 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 152 |
+
[rank3]: Traceback (most recent call last):
|
| 153 |
+
[rank3]: File "<frozen runpy>", line 198, in _run_module_as_main
|
| 154 |
+
[rank3]: File "<frozen runpy>", line 88, in _run_code
|
| 155 |
+
[rank3]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 615, in <module>
|
| 156 |
+
[rank3]: main(config)
|
| 157 |
+
[rank3]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
|
| 158 |
+
[rank3]: return f(*args, **kwargs)
|
| 159 |
+
[rank3]: ^^^^^^^^^^^^^^^^^^
|
| 160 |
+
[rank3]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 155, in main
|
| 161 |
+
[rank3]: dataset = build_dataset(
|
| 162 |
+
[rank3]: ^^^^^^^^^^^^^^
|
| 163 |
+
[rank3]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/data.py", line 649, in build_dataset
|
| 164 |
+
[rank3]: subset = load_dataset(
|
| 165 |
+
[rank3]: ^^^^^^^^^^^^^
|
| 166 |
+
[rank3]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py", line 1392, in load_dataset
|
| 167 |
+
[rank3]: builder_instance = load_dataset_builder(
|
| 168 |
+
[rank3]: ^^^^^^^^^^^^^^^^^^^^^
|
| 169 |
+
[rank3]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py", line 1132, in load_dataset_builder
|
| 170 |
+
[rank3]: dataset_module = dataset_module_factory(
|
| 171 |
+
[rank3]: ^^^^^^^^^^^^^^^^^^^^^^^
|
| 172 |
+
[rank3]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py", line 1033, in dataset_module_factory
|
| 173 |
+
[rank3]: raise FileNotFoundError(f"Couldn't find any data file at {relative_to_absolute_path(path)}.")
|
| 174 |
+
[rank3]: FileNotFoundError: Couldn't find any data file at /fineweb-edu-sample.
|
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/3/stdout.log
ADDED
|
File without changes
|
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/4/error.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"message": {"message": "FileNotFoundError: Couldn't find any data file at /fineweb-edu-sample.", "extraInfo": {"py_callstack": "Traceback (most recent call last):\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py\", line 355, in wrapper\n return f(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py\", line 155, in main\n dataset = build_dataset(\n ^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/data.py\", line 649, in build_dataset\n subset = load_dataset(\n ^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py\", line 1392, in load_dataset\n builder_instance = load_dataset_builder(\n ^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py\", line 1132, in load_dataset_builder\n dataset_module = dataset_module_factory(\n ^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py\", line 1033, in dataset_module_factory\n raise FileNotFoundError(f\"Couldn't find any data file at {relative_to_absolute_path(path)}.\")\nFileNotFoundError: Couldn't find any data file at /fineweb-edu-sample.\n", "timestamp": "1753195112"}}}
|
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/4/stderr.log
ADDED
|
@@ -0,0 +1,174 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[titan] 2025-07-22 22:38:30,292 - root - INFO - Starting job: default job
|
| 2 |
+
[titan] 2025-07-22 22:38:30,292 - root - INFO - [32m{
|
| 3 |
+
"activation_checkpoint": {
|
| 4 |
+
"mode": "none",
|
| 5 |
+
"selective_ac_option": "2"
|
| 6 |
+
},
|
| 7 |
+
"activation_offload": {
|
| 8 |
+
"mode": "none"
|
| 9 |
+
},
|
| 10 |
+
"checkpoint": {
|
| 11 |
+
"async_mode": "disabled",
|
| 12 |
+
"create_seed_checkpoint": false,
|
| 13 |
+
"enable_checkpoint": true,
|
| 14 |
+
"exclude_from_loading": [],
|
| 15 |
+
"export_dtype": "float32",
|
| 16 |
+
"folder": "checkpoint",
|
| 17 |
+
"interval": 8192,
|
| 18 |
+
"interval_type": "steps",
|
| 19 |
+
"keep_latest_k": 100,
|
| 20 |
+
"load_step": -1,
|
| 21 |
+
"model_weights_only": false
|
| 22 |
+
},
|
| 23 |
+
"comm": {
|
| 24 |
+
"init_timeout_seconds": 300,
|
| 25 |
+
"trace_buf_size": 20000,
|
| 26 |
+
"train_timeout_seconds": 100
|
| 27 |
+
},
|
| 28 |
+
"experimental": {
|
| 29 |
+
"context_parallel_degree": 1,
|
| 30 |
+
"context_parallel_rotate_method": "allgather",
|
| 31 |
+
"custom_model_path": "",
|
| 32 |
+
"enable_async_tensor_parallel": false,
|
| 33 |
+
"enable_compiled_autograd": false,
|
| 34 |
+
"pipeline_parallel_degree": 1,
|
| 35 |
+
"pipeline_parallel_microbatches": null,
|
| 36 |
+
"pipeline_parallel_schedule": "1F1B",
|
| 37 |
+
"pipeline_parallel_schedule_csv": "",
|
| 38 |
+
"pipeline_parallel_split_points": []
|
| 39 |
+
},
|
| 40 |
+
"fault_tolerance": {
|
| 41 |
+
"enable": false,
|
| 42 |
+
"group_size": 0,
|
| 43 |
+
"min_replica_size": 1,
|
| 44 |
+
"replica_id": 0
|
| 45 |
+
},
|
| 46 |
+
"float8": {
|
| 47 |
+
"enable_fsdp_float8_all_gather": false,
|
| 48 |
+
"force_recompute_fp8_weight_in_bwd": false,
|
| 49 |
+
"precompute_float8_dynamic_scale_for_fsdp": false,
|
| 50 |
+
"recipe_name": null
|
| 51 |
+
},
|
| 52 |
+
"job": {
|
| 53 |
+
"config_file": "flame/models/fla.toml",
|
| 54 |
+
"description": "default job",
|
| 55 |
+
"dump_folder": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1",
|
| 56 |
+
"print_args": true,
|
| 57 |
+
"use_for_integration_test": false
|
| 58 |
+
},
|
| 59 |
+
"lr_scheduler": {
|
| 60 |
+
"decay_ratio": 1.0,
|
| 61 |
+
"decay_type": "linear",
|
| 62 |
+
"lr_min": 0.01,
|
| 63 |
+
"warmup_steps": 100
|
| 64 |
+
},
|
| 65 |
+
"memory_estimation": {
|
| 66 |
+
"disable_fake_mode": false,
|
| 67 |
+
"enabled": false
|
| 68 |
+
},
|
| 69 |
+
"metrics": {
|
| 70 |
+
"disable_color_printing": false,
|
| 71 |
+
"enable_tensorboard": true,
|
| 72 |
+
"enable_wandb": true,
|
| 73 |
+
"log_freq": 1,
|
| 74 |
+
"save_for_all_ranks": false,
|
| 75 |
+
"save_tb_folder": "tb"
|
| 76 |
+
},
|
| 77 |
+
"model": {
|
| 78 |
+
"config": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/configs/transformer_340M.json",
|
| 79 |
+
"converters": [],
|
| 80 |
+
"name": "fla",
|
| 81 |
+
"print_after_conversion": false,
|
| 82 |
+
"tokenizer_path": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer"
|
| 83 |
+
},
|
| 84 |
+
"optimizer": {
|
| 85 |
+
"early_step_in_backward": false,
|
| 86 |
+
"eps": 1e-08,
|
| 87 |
+
"implementation": "fused",
|
| 88 |
+
"lr": 0.0003,
|
| 89 |
+
"name": "AdamW"
|
| 90 |
+
},
|
| 91 |
+
"profiling": {
|
| 92 |
+
"enable_memory_snapshot": false,
|
| 93 |
+
"enable_profiling": true,
|
| 94 |
+
"profile_freq": 512,
|
| 95 |
+
"save_memory_snapshot_folder": "memory_snapshot",
|
| 96 |
+
"save_traces_folder": "profile_trace"
|
| 97 |
+
},
|
| 98 |
+
"training": {
|
| 99 |
+
"batch_size": 16,
|
| 100 |
+
"compile": true,
|
| 101 |
+
"context_len": 8192,
|
| 102 |
+
"data_dir": null,
|
| 103 |
+
"data_files": null,
|
| 104 |
+
"data_parallel_replicate_degree": 1,
|
| 105 |
+
"data_parallel_shard_degree": -1,
|
| 106 |
+
"data_probs": "0.55,0.3,0.15",
|
| 107 |
+
"dataset": "/fineweb-edu-sample,/small_repos_20B_sample_merged,/MegaMath/megamath-web-pro",
|
| 108 |
+
"dataset_name": "default,default,default",
|
| 109 |
+
"dataset_split": "train,train,train",
|
| 110 |
+
"deterministic": false,
|
| 111 |
+
"disable_loss_parallel": false,
|
| 112 |
+
"enable_cpu_offload": false,
|
| 113 |
+
"fsdp_reshard_after_forward": "default",
|
| 114 |
+
"gc_freq": 50,
|
| 115 |
+
"gradient_accumulation_steps": 1,
|
| 116 |
+
"max_norm": 1.0,
|
| 117 |
+
"mixed_precision_param": "bfloat16",
|
| 118 |
+
"mixed_precision_reduce": "float32",
|
| 119 |
+
"num_workers": 32,
|
| 120 |
+
"persistent_workers": false,
|
| 121 |
+
"pin_memory": false,
|
| 122 |
+
"prefetch_factor": 2,
|
| 123 |
+
"seed": 42,
|
| 124 |
+
"seq_len": 8192,
|
| 125 |
+
"skip_nan_inf": true,
|
| 126 |
+
"steps": 95366,
|
| 127 |
+
"streaming": true,
|
| 128 |
+
"tensor_parallel_degree": 1,
|
| 129 |
+
"varlen": false
|
| 130 |
+
}
|
| 131 |
+
}[39m
|
| 132 |
+
[titan] 2025-07-22 22:38:30,293 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
|
| 133 |
+
[titan] 2025-07-22 22:38:31,334 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
|
| 134 |
+
[titan] 2025-07-22 22:38:31,337 - root - INFO - CUDA capacity: NVIDIA H20 with 95.00GiB memory
|
| 135 |
+
[titan] 2025-07-22 22:38:31,401 - root - WARNING - Peak flops undefined for: NVIDIA H20, fallback to A100
|
| 136 |
+
[titan] 2025-07-22 22:38:31,402 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
|
| 137 |
+
[titan] 2025-07-22 22:38:31,402 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
|
| 138 |
+
[titan] 2025-07-22 22:38:31,526 - root - INFO - Loading tokenizer...
|
| 139 |
+
[titan] 2025-07-22 22:38:32,961 - root - INFO - LlamaTokenizerFast(name_or_path='/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer', vocab_size=32000, model_max_length=10000000000, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
|
| 140 |
+
0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 141 |
+
1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 142 |
+
2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 143 |
+
}
|
| 144 |
+
)
|
| 145 |
+
[titan] 2025-07-22 22:38:32,962 - root - INFO - Loading dataset /fineweb-edu-sample,/small_repos_20B_sample_merged,/MegaMath/megamath-web-pro:default,default,default
|
| 146 |
+
`trust_remote_code` is not supported anymore.
|
| 147 |
+
Please check that the Hugging Face dataset '/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
|
| 148 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 149 |
+
[titan] 2025-07-22 22:38:32,962 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
|
| 150 |
+
Please check that the Hugging Face dataset '/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
|
| 151 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 152 |
+
[rank4]: Traceback (most recent call last):
|
| 153 |
+
[rank4]: File "<frozen runpy>", line 198, in _run_module_as_main
|
| 154 |
+
[rank4]: File "<frozen runpy>", line 88, in _run_code
|
| 155 |
+
[rank4]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 615, in <module>
|
| 156 |
+
[rank4]: main(config)
|
| 157 |
+
[rank4]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
|
| 158 |
+
[rank4]: return f(*args, **kwargs)
|
| 159 |
+
[rank4]: ^^^^^^^^^^^^^^^^^^
|
| 160 |
+
[rank4]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 155, in main
|
| 161 |
+
[rank4]: dataset = build_dataset(
|
| 162 |
+
[rank4]: ^^^^^^^^^^^^^^
|
| 163 |
+
[rank4]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/data.py", line 649, in build_dataset
|
| 164 |
+
[rank4]: subset = load_dataset(
|
| 165 |
+
[rank4]: ^^^^^^^^^^^^^
|
| 166 |
+
[rank4]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py", line 1392, in load_dataset
|
| 167 |
+
[rank4]: builder_instance = load_dataset_builder(
|
| 168 |
+
[rank4]: ^^^^^^^^^^^^^^^^^^^^^
|
| 169 |
+
[rank4]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py", line 1132, in load_dataset_builder
|
| 170 |
+
[rank4]: dataset_module = dataset_module_factory(
|
| 171 |
+
[rank4]: ^^^^^^^^^^^^^^^^^^^^^^^
|
| 172 |
+
[rank4]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py", line 1033, in dataset_module_factory
|
| 173 |
+
[rank4]: raise FileNotFoundError(f"Couldn't find any data file at {relative_to_absolute_path(path)}.")
|
| 174 |
+
[rank4]: FileNotFoundError: Couldn't find any data file at /fineweb-edu-sample.
|
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/4/stdout.log
ADDED
|
File without changes
|
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/5/error.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"message": {"message": "FileNotFoundError: Couldn't find any data file at /fineweb-edu-sample.", "extraInfo": {"py_callstack": "Traceback (most recent call last):\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py\", line 355, in wrapper\n return f(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py\", line 155, in main\n dataset = build_dataset(\n ^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/data.py\", line 649, in build_dataset\n subset = load_dataset(\n ^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py\", line 1392, in load_dataset\n builder_instance = load_dataset_builder(\n ^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py\", line 1132, in load_dataset_builder\n dataset_module = dataset_module_factory(\n ^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py\", line 1033, in dataset_module_factory\n raise FileNotFoundError(f\"Couldn't find any data file at {relative_to_absolute_path(path)}.\")\nFileNotFoundError: Couldn't find any data file at /fineweb-edu-sample.\n", "timestamp": "1753195112"}}}
|
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/5/stderr.log
ADDED
|
@@ -0,0 +1,174 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[titan] 2025-07-22 22:38:30,490 - root - INFO - Starting job: default job
|
| 2 |
+
[titan] 2025-07-22 22:38:30,491 - root - INFO - [32m{
|
| 3 |
+
"activation_checkpoint": {
|
| 4 |
+
"mode": "none",
|
| 5 |
+
"selective_ac_option": "2"
|
| 6 |
+
},
|
| 7 |
+
"activation_offload": {
|
| 8 |
+
"mode": "none"
|
| 9 |
+
},
|
| 10 |
+
"checkpoint": {
|
| 11 |
+
"async_mode": "disabled",
|
| 12 |
+
"create_seed_checkpoint": false,
|
| 13 |
+
"enable_checkpoint": true,
|
| 14 |
+
"exclude_from_loading": [],
|
| 15 |
+
"export_dtype": "float32",
|
| 16 |
+
"folder": "checkpoint",
|
| 17 |
+
"interval": 8192,
|
| 18 |
+
"interval_type": "steps",
|
| 19 |
+
"keep_latest_k": 100,
|
| 20 |
+
"load_step": -1,
|
| 21 |
+
"model_weights_only": false
|
| 22 |
+
},
|
| 23 |
+
"comm": {
|
| 24 |
+
"init_timeout_seconds": 300,
|
| 25 |
+
"trace_buf_size": 20000,
|
| 26 |
+
"train_timeout_seconds": 100
|
| 27 |
+
},
|
| 28 |
+
"experimental": {
|
| 29 |
+
"context_parallel_degree": 1,
|
| 30 |
+
"context_parallel_rotate_method": "allgather",
|
| 31 |
+
"custom_model_path": "",
|
| 32 |
+
"enable_async_tensor_parallel": false,
|
| 33 |
+
"enable_compiled_autograd": false,
|
| 34 |
+
"pipeline_parallel_degree": 1,
|
| 35 |
+
"pipeline_parallel_microbatches": null,
|
| 36 |
+
"pipeline_parallel_schedule": "1F1B",
|
| 37 |
+
"pipeline_parallel_schedule_csv": "",
|
| 38 |
+
"pipeline_parallel_split_points": []
|
| 39 |
+
},
|
| 40 |
+
"fault_tolerance": {
|
| 41 |
+
"enable": false,
|
| 42 |
+
"group_size": 0,
|
| 43 |
+
"min_replica_size": 1,
|
| 44 |
+
"replica_id": 0
|
| 45 |
+
},
|
| 46 |
+
"float8": {
|
| 47 |
+
"enable_fsdp_float8_all_gather": false,
|
| 48 |
+
"force_recompute_fp8_weight_in_bwd": false,
|
| 49 |
+
"precompute_float8_dynamic_scale_for_fsdp": false,
|
| 50 |
+
"recipe_name": null
|
| 51 |
+
},
|
| 52 |
+
"job": {
|
| 53 |
+
"config_file": "flame/models/fla.toml",
|
| 54 |
+
"description": "default job",
|
| 55 |
+
"dump_folder": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1",
|
| 56 |
+
"print_args": true,
|
| 57 |
+
"use_for_integration_test": false
|
| 58 |
+
},
|
| 59 |
+
"lr_scheduler": {
|
| 60 |
+
"decay_ratio": 1.0,
|
| 61 |
+
"decay_type": "linear",
|
| 62 |
+
"lr_min": 0.01,
|
| 63 |
+
"warmup_steps": 100
|
| 64 |
+
},
|
| 65 |
+
"memory_estimation": {
|
| 66 |
+
"disable_fake_mode": false,
|
| 67 |
+
"enabled": false
|
| 68 |
+
},
|
| 69 |
+
"metrics": {
|
| 70 |
+
"disable_color_printing": false,
|
| 71 |
+
"enable_tensorboard": true,
|
| 72 |
+
"enable_wandb": true,
|
| 73 |
+
"log_freq": 1,
|
| 74 |
+
"save_for_all_ranks": false,
|
| 75 |
+
"save_tb_folder": "tb"
|
| 76 |
+
},
|
| 77 |
+
"model": {
|
| 78 |
+
"config": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/configs/transformer_340M.json",
|
| 79 |
+
"converters": [],
|
| 80 |
+
"name": "fla",
|
| 81 |
+
"print_after_conversion": false,
|
| 82 |
+
"tokenizer_path": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer"
|
| 83 |
+
},
|
| 84 |
+
"optimizer": {
|
| 85 |
+
"early_step_in_backward": false,
|
| 86 |
+
"eps": 1e-08,
|
| 87 |
+
"implementation": "fused",
|
| 88 |
+
"lr": 0.0003,
|
| 89 |
+
"name": "AdamW"
|
| 90 |
+
},
|
| 91 |
+
"profiling": {
|
| 92 |
+
"enable_memory_snapshot": false,
|
| 93 |
+
"enable_profiling": true,
|
| 94 |
+
"profile_freq": 512,
|
| 95 |
+
"save_memory_snapshot_folder": "memory_snapshot",
|
| 96 |
+
"save_traces_folder": "profile_trace"
|
| 97 |
+
},
|
| 98 |
+
"training": {
|
| 99 |
+
"batch_size": 16,
|
| 100 |
+
"compile": true,
|
| 101 |
+
"context_len": 8192,
|
| 102 |
+
"data_dir": null,
|
| 103 |
+
"data_files": null,
|
| 104 |
+
"data_parallel_replicate_degree": 1,
|
| 105 |
+
"data_parallel_shard_degree": -1,
|
| 106 |
+
"data_probs": "0.55,0.3,0.15",
|
| 107 |
+
"dataset": "/fineweb-edu-sample,/small_repos_20B_sample_merged,/MegaMath/megamath-web-pro",
|
| 108 |
+
"dataset_name": "default,default,default",
|
| 109 |
+
"dataset_split": "train,train,train",
|
| 110 |
+
"deterministic": false,
|
| 111 |
+
"disable_loss_parallel": false,
|
| 112 |
+
"enable_cpu_offload": false,
|
| 113 |
+
"fsdp_reshard_after_forward": "default",
|
| 114 |
+
"gc_freq": 50,
|
| 115 |
+
"gradient_accumulation_steps": 1,
|
| 116 |
+
"max_norm": 1.0,
|
| 117 |
+
"mixed_precision_param": "bfloat16",
|
| 118 |
+
"mixed_precision_reduce": "float32",
|
| 119 |
+
"num_workers": 32,
|
| 120 |
+
"persistent_workers": false,
|
| 121 |
+
"pin_memory": false,
|
| 122 |
+
"prefetch_factor": 2,
|
| 123 |
+
"seed": 42,
|
| 124 |
+
"seq_len": 8192,
|
| 125 |
+
"skip_nan_inf": true,
|
| 126 |
+
"steps": 95366,
|
| 127 |
+
"streaming": true,
|
| 128 |
+
"tensor_parallel_degree": 1,
|
| 129 |
+
"varlen": false
|
| 130 |
+
}
|
| 131 |
+
}[39m
|
| 132 |
+
[titan] 2025-07-22 22:38:30,492 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
|
| 133 |
+
[titan] 2025-07-22 22:38:31,576 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
|
| 134 |
+
[titan] 2025-07-22 22:38:31,579 - root - INFO - CUDA capacity: NVIDIA H20 with 95.00GiB memory
|
| 135 |
+
[titan] 2025-07-22 22:38:31,668 - root - WARNING - Peak flops undefined for: NVIDIA H20, fallback to A100
|
| 136 |
+
[titan] 2025-07-22 22:38:31,668 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
|
| 137 |
+
[titan] 2025-07-22 22:38:31,669 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
|
| 138 |
+
[titan] 2025-07-22 22:38:31,691 - root - INFO - Loading tokenizer...
|
| 139 |
+
[titan] 2025-07-22 22:38:32,957 - root - INFO - LlamaTokenizerFast(name_or_path='/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer', vocab_size=32000, model_max_length=10000000000, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
|
| 140 |
+
0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 141 |
+
1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 142 |
+
2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 143 |
+
}
|
| 144 |
+
)
|
| 145 |
+
[titan] 2025-07-22 22:38:32,958 - root - INFO - Loading dataset /fineweb-edu-sample,/small_repos_20B_sample_merged,/MegaMath/megamath-web-pro:default,default,default
|
| 146 |
+
`trust_remote_code` is not supported anymore.
|
| 147 |
+
Please check that the Hugging Face dataset '/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
|
| 148 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 149 |
+
[titan] 2025-07-22 22:38:32,958 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
|
| 150 |
+
Please check that the Hugging Face dataset '/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
|
| 151 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 152 |
+
[rank5]: Traceback (most recent call last):
|
| 153 |
+
[rank5]: File "<frozen runpy>", line 198, in _run_module_as_main
|
| 154 |
+
[rank5]: File "<frozen runpy>", line 88, in _run_code
|
| 155 |
+
[rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 615, in <module>
|
| 156 |
+
[rank5]: main(config)
|
| 157 |
+
[rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
|
| 158 |
+
[rank5]: return f(*args, **kwargs)
|
| 159 |
+
[rank5]: ^^^^^^^^^^^^^^^^^^
|
| 160 |
+
[rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 155, in main
|
| 161 |
+
[rank5]: dataset = build_dataset(
|
| 162 |
+
[rank5]: ^^^^^^^^^^^^^^
|
| 163 |
+
[rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/data.py", line 649, in build_dataset
|
| 164 |
+
[rank5]: subset = load_dataset(
|
| 165 |
+
[rank5]: ^^^^^^^^^^^^^
|
| 166 |
+
[rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py", line 1392, in load_dataset
|
| 167 |
+
[rank5]: builder_instance = load_dataset_builder(
|
| 168 |
+
[rank5]: ^^^^^^^^^^^^^^^^^^^^^
|
| 169 |
+
[rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py", line 1132, in load_dataset_builder
|
| 170 |
+
[rank5]: dataset_module = dataset_module_factory(
|
| 171 |
+
[rank5]: ^^^^^^^^^^^^^^^^^^^^^^^
|
| 172 |
+
[rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py", line 1033, in dataset_module_factory
|
| 173 |
+
[rank5]: raise FileNotFoundError(f"Couldn't find any data file at {relative_to_absolute_path(path)}.")
|
| 174 |
+
[rank5]: FileNotFoundError: Couldn't find any data file at /fineweb-edu-sample.
|
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/5/stdout.log
ADDED
|
File without changes
|
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/6/error.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"message": {"message": "FileNotFoundError: Couldn't find any data file at /fineweb-edu-sample.", "extraInfo": {"py_callstack": "Traceback (most recent call last):\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py\", line 355, in wrapper\n return f(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py\", line 155, in main\n dataset = build_dataset(\n ^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/data.py\", line 649, in build_dataset\n subset = load_dataset(\n ^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py\", line 1392, in load_dataset\n builder_instance = load_dataset_builder(\n ^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py\", line 1132, in load_dataset_builder\n dataset_module = dataset_module_factory(\n ^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py\", line 1033, in dataset_module_factory\n raise FileNotFoundError(f\"Couldn't find any data file at {relative_to_absolute_path(path)}.\")\nFileNotFoundError: Couldn't find any data file at /fineweb-edu-sample.\n", "timestamp": "1753195112"}}}
|
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/6/stderr.log
ADDED
|
@@ -0,0 +1,174 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[titan] 2025-07-22 22:38:30,545 - root - INFO - Starting job: default job
|
| 2 |
+
[titan] 2025-07-22 22:38:30,545 - root - INFO - [32m{
|
| 3 |
+
"activation_checkpoint": {
|
| 4 |
+
"mode": "none",
|
| 5 |
+
"selective_ac_option": "2"
|
| 6 |
+
},
|
| 7 |
+
"activation_offload": {
|
| 8 |
+
"mode": "none"
|
| 9 |
+
},
|
| 10 |
+
"checkpoint": {
|
| 11 |
+
"async_mode": "disabled",
|
| 12 |
+
"create_seed_checkpoint": false,
|
| 13 |
+
"enable_checkpoint": true,
|
| 14 |
+
"exclude_from_loading": [],
|
| 15 |
+
"export_dtype": "float32",
|
| 16 |
+
"folder": "checkpoint",
|
| 17 |
+
"interval": 8192,
|
| 18 |
+
"interval_type": "steps",
|
| 19 |
+
"keep_latest_k": 100,
|
| 20 |
+
"load_step": -1,
|
| 21 |
+
"model_weights_only": false
|
| 22 |
+
},
|
| 23 |
+
"comm": {
|
| 24 |
+
"init_timeout_seconds": 300,
|
| 25 |
+
"trace_buf_size": 20000,
|
| 26 |
+
"train_timeout_seconds": 100
|
| 27 |
+
},
|
| 28 |
+
"experimental": {
|
| 29 |
+
"context_parallel_degree": 1,
|
| 30 |
+
"context_parallel_rotate_method": "allgather",
|
| 31 |
+
"custom_model_path": "",
|
| 32 |
+
"enable_async_tensor_parallel": false,
|
| 33 |
+
"enable_compiled_autograd": false,
|
| 34 |
+
"pipeline_parallel_degree": 1,
|
| 35 |
+
"pipeline_parallel_microbatches": null,
|
| 36 |
+
"pipeline_parallel_schedule": "1F1B",
|
| 37 |
+
"pipeline_parallel_schedule_csv": "",
|
| 38 |
+
"pipeline_parallel_split_points": []
|
| 39 |
+
},
|
| 40 |
+
"fault_tolerance": {
|
| 41 |
+
"enable": false,
|
| 42 |
+
"group_size": 0,
|
| 43 |
+
"min_replica_size": 1,
|
| 44 |
+
"replica_id": 0
|
| 45 |
+
},
|
| 46 |
+
"float8": {
|
| 47 |
+
"enable_fsdp_float8_all_gather": false,
|
| 48 |
+
"force_recompute_fp8_weight_in_bwd": false,
|
| 49 |
+
"precompute_float8_dynamic_scale_for_fsdp": false,
|
| 50 |
+
"recipe_name": null
|
| 51 |
+
},
|
| 52 |
+
"job": {
|
| 53 |
+
"config_file": "flame/models/fla.toml",
|
| 54 |
+
"description": "default job",
|
| 55 |
+
"dump_folder": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1",
|
| 56 |
+
"print_args": true,
|
| 57 |
+
"use_for_integration_test": false
|
| 58 |
+
},
|
| 59 |
+
"lr_scheduler": {
|
| 60 |
+
"decay_ratio": 1.0,
|
| 61 |
+
"decay_type": "linear",
|
| 62 |
+
"lr_min": 0.01,
|
| 63 |
+
"warmup_steps": 100
|
| 64 |
+
},
|
| 65 |
+
"memory_estimation": {
|
| 66 |
+
"disable_fake_mode": false,
|
| 67 |
+
"enabled": false
|
| 68 |
+
},
|
| 69 |
+
"metrics": {
|
| 70 |
+
"disable_color_printing": false,
|
| 71 |
+
"enable_tensorboard": true,
|
| 72 |
+
"enable_wandb": true,
|
| 73 |
+
"log_freq": 1,
|
| 74 |
+
"save_for_all_ranks": false,
|
| 75 |
+
"save_tb_folder": "tb"
|
| 76 |
+
},
|
| 77 |
+
"model": {
|
| 78 |
+
"config": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/configs/transformer_340M.json",
|
| 79 |
+
"converters": [],
|
| 80 |
+
"name": "fla",
|
| 81 |
+
"print_after_conversion": false,
|
| 82 |
+
"tokenizer_path": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer"
|
| 83 |
+
},
|
| 84 |
+
"optimizer": {
|
| 85 |
+
"early_step_in_backward": false,
|
| 86 |
+
"eps": 1e-08,
|
| 87 |
+
"implementation": "fused",
|
| 88 |
+
"lr": 0.0003,
|
| 89 |
+
"name": "AdamW"
|
| 90 |
+
},
|
| 91 |
+
"profiling": {
|
| 92 |
+
"enable_memory_snapshot": false,
|
| 93 |
+
"enable_profiling": true,
|
| 94 |
+
"profile_freq": 512,
|
| 95 |
+
"save_memory_snapshot_folder": "memory_snapshot",
|
| 96 |
+
"save_traces_folder": "profile_trace"
|
| 97 |
+
},
|
| 98 |
+
"training": {
|
| 99 |
+
"batch_size": 16,
|
| 100 |
+
"compile": true,
|
| 101 |
+
"context_len": 8192,
|
| 102 |
+
"data_dir": null,
|
| 103 |
+
"data_files": null,
|
| 104 |
+
"data_parallel_replicate_degree": 1,
|
| 105 |
+
"data_parallel_shard_degree": -1,
|
| 106 |
+
"data_probs": "0.55,0.3,0.15",
|
| 107 |
+
"dataset": "/fineweb-edu-sample,/small_repos_20B_sample_merged,/MegaMath/megamath-web-pro",
|
| 108 |
+
"dataset_name": "default,default,default",
|
| 109 |
+
"dataset_split": "train,train,train",
|
| 110 |
+
"deterministic": false,
|
| 111 |
+
"disable_loss_parallel": false,
|
| 112 |
+
"enable_cpu_offload": false,
|
| 113 |
+
"fsdp_reshard_after_forward": "default",
|
| 114 |
+
"gc_freq": 50,
|
| 115 |
+
"gradient_accumulation_steps": 1,
|
| 116 |
+
"max_norm": 1.0,
|
| 117 |
+
"mixed_precision_param": "bfloat16",
|
| 118 |
+
"mixed_precision_reduce": "float32",
|
| 119 |
+
"num_workers": 32,
|
| 120 |
+
"persistent_workers": false,
|
| 121 |
+
"pin_memory": false,
|
| 122 |
+
"prefetch_factor": 2,
|
| 123 |
+
"seed": 42,
|
| 124 |
+
"seq_len": 8192,
|
| 125 |
+
"skip_nan_inf": true,
|
| 126 |
+
"steps": 95366,
|
| 127 |
+
"streaming": true,
|
| 128 |
+
"tensor_parallel_degree": 1,
|
| 129 |
+
"varlen": false
|
| 130 |
+
}
|
| 131 |
+
}[39m
|
| 132 |
+
[titan] 2025-07-22 22:38:30,546 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
|
| 133 |
+
[titan] 2025-07-22 22:38:31,578 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
|
| 134 |
+
[titan] 2025-07-22 22:38:31,580 - root - INFO - CUDA capacity: NVIDIA H20 with 95.00GiB memory
|
| 135 |
+
[titan] 2025-07-22 22:38:31,669 - root - WARNING - Peak flops undefined for: NVIDIA H20, fallback to A100
|
| 136 |
+
[titan] 2025-07-22 22:38:31,670 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
|
| 137 |
+
[titan] 2025-07-22 22:38:31,670 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
|
| 138 |
+
[titan] 2025-07-22 22:38:31,690 - root - INFO - Loading tokenizer...
|
| 139 |
+
[titan] 2025-07-22 22:38:32,960 - root - INFO - LlamaTokenizerFast(name_or_path='/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer', vocab_size=32000, model_max_length=10000000000, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
|
| 140 |
+
0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 141 |
+
1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 142 |
+
2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 143 |
+
}
|
| 144 |
+
)
|
| 145 |
+
[titan] 2025-07-22 22:38:32,961 - root - INFO - Loading dataset /fineweb-edu-sample,/small_repos_20B_sample_merged,/MegaMath/megamath-web-pro:default,default,default
|
| 146 |
+
`trust_remote_code` is not supported anymore.
|
| 147 |
+
Please check that the Hugging Face dataset '/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
|
| 148 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 149 |
+
[titan] 2025-07-22 22:38:32,961 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
|
| 150 |
+
Please check that the Hugging Face dataset '/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
|
| 151 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 152 |
+
[rank6]: Traceback (most recent call last):
|
| 153 |
+
[rank6]: File "<frozen runpy>", line 198, in _run_module_as_main
|
| 154 |
+
[rank6]: File "<frozen runpy>", line 88, in _run_code
|
| 155 |
+
[rank6]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 615, in <module>
|
| 156 |
+
[rank6]: main(config)
|
| 157 |
+
[rank6]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
|
| 158 |
+
[rank6]: return f(*args, **kwargs)
|
| 159 |
+
[rank6]: ^^^^^^^^^^^^^^^^^^
|
| 160 |
+
[rank6]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 155, in main
|
| 161 |
+
[rank6]: dataset = build_dataset(
|
| 162 |
+
[rank6]: ^^^^^^^^^^^^^^
|
| 163 |
+
[rank6]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/data.py", line 649, in build_dataset
|
| 164 |
+
[rank6]: subset = load_dataset(
|
| 165 |
+
[rank6]: ^^^^^^^^^^^^^
|
| 166 |
+
[rank6]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py", line 1392, in load_dataset
|
| 167 |
+
[rank6]: builder_instance = load_dataset_builder(
|
| 168 |
+
[rank6]: ^^^^^^^^^^^^^^^^^^^^^
|
| 169 |
+
[rank6]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py", line 1132, in load_dataset_builder
|
| 170 |
+
[rank6]: dataset_module = dataset_module_factory(
|
| 171 |
+
[rank6]: ^^^^^^^^^^^^^^^^^^^^^^^
|
| 172 |
+
[rank6]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py", line 1033, in dataset_module_factory
|
| 173 |
+
[rank6]: raise FileNotFoundError(f"Couldn't find any data file at {relative_to_absolute_path(path)}.")
|
| 174 |
+
[rank6]: FileNotFoundError: Couldn't find any data file at /fineweb-edu-sample.
|
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/6/stdout.log
ADDED
|
File without changes
|
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/7/error.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"message": {"message": "FileNotFoundError: Couldn't find any data file at /fineweb-edu-sample.", "extraInfo": {"py_callstack": "Traceback (most recent call last):\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py\", line 355, in wrapper\n return f(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py\", line 155, in main\n dataset = build_dataset(\n ^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/data.py\", line 649, in build_dataset\n subset = load_dataset(\n ^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py\", line 1392, in load_dataset\n builder_instance = load_dataset_builder(\n ^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py\", line 1132, in load_dataset_builder\n dataset_module = dataset_module_factory(\n ^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py\", line 1033, in dataset_module_factory\n raise FileNotFoundError(f\"Couldn't find any data file at {relative_to_absolute_path(path)}.\")\nFileNotFoundError: Couldn't find any data file at /fineweb-edu-sample.\n", "timestamp": "1753195112"}}}
|
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/7/stderr.log
ADDED
|
@@ -0,0 +1,174 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[titan] 2025-07-22 22:38:30,347 - root - INFO - Starting job: default job
|
| 2 |
+
[titan] 2025-07-22 22:38:30,347 - root - INFO - [32m{
|
| 3 |
+
"activation_checkpoint": {
|
| 4 |
+
"mode": "none",
|
| 5 |
+
"selective_ac_option": "2"
|
| 6 |
+
},
|
| 7 |
+
"activation_offload": {
|
| 8 |
+
"mode": "none"
|
| 9 |
+
},
|
| 10 |
+
"checkpoint": {
|
| 11 |
+
"async_mode": "disabled",
|
| 12 |
+
"create_seed_checkpoint": false,
|
| 13 |
+
"enable_checkpoint": true,
|
| 14 |
+
"exclude_from_loading": [],
|
| 15 |
+
"export_dtype": "float32",
|
| 16 |
+
"folder": "checkpoint",
|
| 17 |
+
"interval": 8192,
|
| 18 |
+
"interval_type": "steps",
|
| 19 |
+
"keep_latest_k": 100,
|
| 20 |
+
"load_step": -1,
|
| 21 |
+
"model_weights_only": false
|
| 22 |
+
},
|
| 23 |
+
"comm": {
|
| 24 |
+
"init_timeout_seconds": 300,
|
| 25 |
+
"trace_buf_size": 20000,
|
| 26 |
+
"train_timeout_seconds": 100
|
| 27 |
+
},
|
| 28 |
+
"experimental": {
|
| 29 |
+
"context_parallel_degree": 1,
|
| 30 |
+
"context_parallel_rotate_method": "allgather",
|
| 31 |
+
"custom_model_path": "",
|
| 32 |
+
"enable_async_tensor_parallel": false,
|
| 33 |
+
"enable_compiled_autograd": false,
|
| 34 |
+
"pipeline_parallel_degree": 1,
|
| 35 |
+
"pipeline_parallel_microbatches": null,
|
| 36 |
+
"pipeline_parallel_schedule": "1F1B",
|
| 37 |
+
"pipeline_parallel_schedule_csv": "",
|
| 38 |
+
"pipeline_parallel_split_points": []
|
| 39 |
+
},
|
| 40 |
+
"fault_tolerance": {
|
| 41 |
+
"enable": false,
|
| 42 |
+
"group_size": 0,
|
| 43 |
+
"min_replica_size": 1,
|
| 44 |
+
"replica_id": 0
|
| 45 |
+
},
|
| 46 |
+
"float8": {
|
| 47 |
+
"enable_fsdp_float8_all_gather": false,
|
| 48 |
+
"force_recompute_fp8_weight_in_bwd": false,
|
| 49 |
+
"precompute_float8_dynamic_scale_for_fsdp": false,
|
| 50 |
+
"recipe_name": null
|
| 51 |
+
},
|
| 52 |
+
"job": {
|
| 53 |
+
"config_file": "flame/models/fla.toml",
|
| 54 |
+
"description": "default job",
|
| 55 |
+
"dump_folder": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1",
|
| 56 |
+
"print_args": true,
|
| 57 |
+
"use_for_integration_test": false
|
| 58 |
+
},
|
| 59 |
+
"lr_scheduler": {
|
| 60 |
+
"decay_ratio": 1.0,
|
| 61 |
+
"decay_type": "linear",
|
| 62 |
+
"lr_min": 0.01,
|
| 63 |
+
"warmup_steps": 100
|
| 64 |
+
},
|
| 65 |
+
"memory_estimation": {
|
| 66 |
+
"disable_fake_mode": false,
|
| 67 |
+
"enabled": false
|
| 68 |
+
},
|
| 69 |
+
"metrics": {
|
| 70 |
+
"disable_color_printing": false,
|
| 71 |
+
"enable_tensorboard": true,
|
| 72 |
+
"enable_wandb": true,
|
| 73 |
+
"log_freq": 1,
|
| 74 |
+
"save_for_all_ranks": false,
|
| 75 |
+
"save_tb_folder": "tb"
|
| 76 |
+
},
|
| 77 |
+
"model": {
|
| 78 |
+
"config": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/configs/transformer_340M.json",
|
| 79 |
+
"converters": [],
|
| 80 |
+
"name": "fla",
|
| 81 |
+
"print_after_conversion": false,
|
| 82 |
+
"tokenizer_path": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer"
|
| 83 |
+
},
|
| 84 |
+
"optimizer": {
|
| 85 |
+
"early_step_in_backward": false,
|
| 86 |
+
"eps": 1e-08,
|
| 87 |
+
"implementation": "fused",
|
| 88 |
+
"lr": 0.0003,
|
| 89 |
+
"name": "AdamW"
|
| 90 |
+
},
|
| 91 |
+
"profiling": {
|
| 92 |
+
"enable_memory_snapshot": false,
|
| 93 |
+
"enable_profiling": true,
|
| 94 |
+
"profile_freq": 512,
|
| 95 |
+
"save_memory_snapshot_folder": "memory_snapshot",
|
| 96 |
+
"save_traces_folder": "profile_trace"
|
| 97 |
+
},
|
| 98 |
+
"training": {
|
| 99 |
+
"batch_size": 16,
|
| 100 |
+
"compile": true,
|
| 101 |
+
"context_len": 8192,
|
| 102 |
+
"data_dir": null,
|
| 103 |
+
"data_files": null,
|
| 104 |
+
"data_parallel_replicate_degree": 1,
|
| 105 |
+
"data_parallel_shard_degree": -1,
|
| 106 |
+
"data_probs": "0.55,0.3,0.15",
|
| 107 |
+
"dataset": "/fineweb-edu-sample,/small_repos_20B_sample_merged,/MegaMath/megamath-web-pro",
|
| 108 |
+
"dataset_name": "default,default,default",
|
| 109 |
+
"dataset_split": "train,train,train",
|
| 110 |
+
"deterministic": false,
|
| 111 |
+
"disable_loss_parallel": false,
|
| 112 |
+
"enable_cpu_offload": false,
|
| 113 |
+
"fsdp_reshard_after_forward": "default",
|
| 114 |
+
"gc_freq": 50,
|
| 115 |
+
"gradient_accumulation_steps": 1,
|
| 116 |
+
"max_norm": 1.0,
|
| 117 |
+
"mixed_precision_param": "bfloat16",
|
| 118 |
+
"mixed_precision_reduce": "float32",
|
| 119 |
+
"num_workers": 32,
|
| 120 |
+
"persistent_workers": false,
|
| 121 |
+
"pin_memory": false,
|
| 122 |
+
"prefetch_factor": 2,
|
| 123 |
+
"seed": 42,
|
| 124 |
+
"seq_len": 8192,
|
| 125 |
+
"skip_nan_inf": true,
|
| 126 |
+
"steps": 95366,
|
| 127 |
+
"streaming": true,
|
| 128 |
+
"tensor_parallel_degree": 1,
|
| 129 |
+
"varlen": false
|
| 130 |
+
}
|
| 131 |
+
}[39m
|
| 132 |
+
[titan] 2025-07-22 22:38:30,348 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
|
| 133 |
+
[titan] 2025-07-22 22:38:31,334 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
|
| 134 |
+
[titan] 2025-07-22 22:38:31,337 - root - INFO - CUDA capacity: NVIDIA H20 with 95.00GiB memory
|
| 135 |
+
[titan] 2025-07-22 22:38:31,402 - root - WARNING - Peak flops undefined for: NVIDIA H20, fallback to A100
|
| 136 |
+
[titan] 2025-07-22 22:38:31,402 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
|
| 137 |
+
[titan] 2025-07-22 22:38:31,403 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
|
| 138 |
+
[titan] 2025-07-22 22:38:31,526 - root - INFO - Loading tokenizer...
|
| 139 |
+
[titan] 2025-07-22 22:38:32,962 - root - INFO - LlamaTokenizerFast(name_or_path='/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer', vocab_size=32000, model_max_length=10000000000, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
|
| 140 |
+
0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 141 |
+
1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 142 |
+
2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 143 |
+
}
|
| 144 |
+
)
|
| 145 |
+
[titan] 2025-07-22 22:38:32,963 - root - INFO - Loading dataset /fineweb-edu-sample,/small_repos_20B_sample_merged,/MegaMath/megamath-web-pro:default,default,default
|
| 146 |
+
`trust_remote_code` is not supported anymore.
|
| 147 |
+
Please check that the Hugging Face dataset '/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
|
| 148 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 149 |
+
[titan] 2025-07-22 22:38:32,963 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
|
| 150 |
+
Please check that the Hugging Face dataset '/fineweb-edu-sample' isn't based on a loading script and remove `trust_remote_code`.
|
| 151 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 152 |
+
[rank7]: Traceback (most recent call last):
|
| 153 |
+
[rank7]: File "<frozen runpy>", line 198, in _run_module_as_main
|
| 154 |
+
[rank7]: File "<frozen runpy>", line 88, in _run_code
|
| 155 |
+
[rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 615, in <module>
|
| 156 |
+
[rank7]: main(config)
|
| 157 |
+
[rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
|
| 158 |
+
[rank7]: return f(*args, **kwargs)
|
| 159 |
+
[rank7]: ^^^^^^^^^^^^^^^^^^
|
| 160 |
+
[rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 155, in main
|
| 161 |
+
[rank7]: dataset = build_dataset(
|
| 162 |
+
[rank7]: ^^^^^^^^^^^^^^
|
| 163 |
+
[rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/data.py", line 649, in build_dataset
|
| 164 |
+
[rank7]: subset = load_dataset(
|
| 165 |
+
[rank7]: ^^^^^^^^^^^^^
|
| 166 |
+
[rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py", line 1392, in load_dataset
|
| 167 |
+
[rank7]: builder_instance = load_dataset_builder(
|
| 168 |
+
[rank7]: ^^^^^^^^^^^^^^^^^^^^^
|
| 169 |
+
[rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py", line 1132, in load_dataset_builder
|
| 170 |
+
[rank7]: dataset_module = dataset_module_factory(
|
| 171 |
+
[rank7]: ^^^^^^^^^^^^^^^^^^^^^^^
|
| 172 |
+
[rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/datasets/load.py", line 1033, in dataset_module_factory
|
| 173 |
+
[rank7]: raise FileNotFoundError(f"Couldn't find any data file at {relative_to_absolute_path(path)}.")
|
| 174 |
+
[rank7]: FileNotFoundError: Couldn't find any data file at /fineweb-edu-sample.
|
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_1334zpzr/attempt_0/7/stdout.log
ADDED
|
File without changes
|
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_45k4njo9/attempt_0/0/stderr.log
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Traceback (most recent call last):
|
| 2 |
+
File "<frozen runpy>", line 198, in _run_module_as_main
|
| 3 |
+
File "<frozen runpy>", line 88, in _run_code
|
| 4 |
+
File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 12, in <module>
|
| 5 |
+
import fla # noqa
|
| 6 |
+
^^^^^^^^^^
|
| 7 |
+
File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/__init__.py", line 28, in <module>
|
| 8 |
+
from fla.models import (
|
| 9 |
+
File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/__init__.py", line 3, in <module>
|
| 10 |
+
from fla.models.abc import ABCConfig, ABCForCausalLM, ABCModel
|
| 11 |
+
File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/abc/__init__.py", line 6, in <module>
|
| 12 |
+
from fla.models.abc.modeling_abc import ABCForCausalLM, ABCModel
|
| 13 |
+
File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/abc/modeling_abc.py", line 14, in <module>
|
| 14 |
+
from transformers.modeling_utils import PreTrainedModel
|
| 15 |
+
File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/transformers/modeling_utils.py", line 59, in <module>
|
| 16 |
+
from .integrations.flash_attention import flash_attention_forward
|
| 17 |
+
File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/transformers/integrations/flash_attention.py", line 5, in <module>
|
| 18 |
+
from ..modeling_flash_attention_utils import _flash_attention_forward, flash_attn_supports_top_left_mask
|
| 19 |
+
File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/transformers/modeling_flash_attention_utils.py", line 100, in <module>
|
| 20 |
+
from flash_attn import flash_attn_func as flash_attn_2_func
|
| 21 |
+
File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/flash_attn/__init__.py", line 3, in <module>
|
| 22 |
+
from flash_attn.flash_attn_interface import (
|
| 23 |
+
File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/flash_attn/flash_attn_interface.py", line 15, in <module>
|
| 24 |
+
import flash_attn_2_cuda as flash_attn_gpu
|
| 25 |
+
ImportError: /usr/lib64/libc.so.6: version `GLIBC_2.32' not found (required by /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/flash_attn_2_cuda.cpython-311-x86_64-linux-gnu.so)
|
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_45k4njo9/attempt_0/0/stdout.log
ADDED
|
File without changes
|
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_45k4njo9/attempt_0/1/stderr.log
ADDED
|
File without changes
|
transformer_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs16-nn1/logs/none_45k4njo9/attempt_0/1/stdout.log
ADDED
|
File without changes
|