craffel HF Staff commited on
Commit
8bca8c8
·
verified ·
1 Parent(s): 512dcdb

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -123,3 +123,12 @@ fineweb2_hq_superset_lang_tokenizers/0000100000/__5_0.distcp filter=lfs diff=lfs
123
  fineweb2_hq_superset_lang_tokenizers/0000100000/__6_0.distcp filter=lfs diff=lfs merge=lfs -text
124
  fineweb2_hq_superset_lang_tokenizers/0000100000/__7_0.distcp filter=lfs diff=lfs merge=lfs -text
125
  fineweb2_hq_superset_oracle/metrics.jsonl filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
123
  fineweb2_hq_superset_lang_tokenizers/0000100000/__6_0.distcp filter=lfs diff=lfs merge=lfs -text
124
  fineweb2_hq_superset_lang_tokenizers/0000100000/__7_0.distcp filter=lfs diff=lfs merge=lfs -text
125
  fineweb2_hq_superset_oracle/metrics.jsonl filter=lfs diff=lfs merge=lfs -text
126
+ fineweb2_hq_superset_oracle/0000100000/.metadata filter=lfs diff=lfs merge=lfs -text
127
+ fineweb2_hq_superset_oracle/0000100000/__0_0.distcp filter=lfs diff=lfs merge=lfs -text
128
+ fineweb2_hq_superset_oracle/0000100000/__1_0.distcp filter=lfs diff=lfs merge=lfs -text
129
+ fineweb2_hq_superset_oracle/0000100000/__2_0.distcp filter=lfs diff=lfs merge=lfs -text
130
+ fineweb2_hq_superset_oracle/0000100000/__3_0.distcp filter=lfs diff=lfs merge=lfs -text
131
+ fineweb2_hq_superset_oracle/0000100000/__4_0.distcp filter=lfs diff=lfs merge=lfs -text
132
+ fineweb2_hq_superset_oracle/0000100000/__5_0.distcp filter=lfs diff=lfs merge=lfs -text
133
+ fineweb2_hq_superset_oracle/0000100000/__6_0.distcp filter=lfs diff=lfs merge=lfs -text
134
+ fineweb2_hq_superset_oracle/0000100000/__7_0.distcp filter=lfs diff=lfs merge=lfs -text
fineweb2_hq_superset_oracle/0000100000/.metadata ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:75640aa0d9c45e512bab869611fd76f793f31399447ca9b4ab19751b0e3d1ce8
3
+ size 1148565
fineweb2_hq_superset_oracle/0000100000/__0_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:52dd0910af534ea6c3c981622cd6a064178f084cacf36a797be1d3f72d1a9a2a
3
+ size 2626712784
fineweb2_hq_superset_oracle/0000100000/__1_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ece0ca64f2bda9393bce19390d9c5e84e268718a4aa67540eabc3a832a7f242e
3
+ size 2626751724
fineweb2_hq_superset_oracle/0000100000/__2_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c30a2b78ebacb38cc52f071406ffaff75bfcf3c36857e0776900c45c4d5a22a5
3
+ size 2626751724
fineweb2_hq_superset_oracle/0000100000/__3_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:db3d7a5d005953008e579f8bf0341282918df6dbf0507c426c94ef8f2d2f0034
3
+ size 2626751724
fineweb2_hq_superset_oracle/0000100000/__4_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3729e16d9a6e21ee62a2bfe01fe47b665aba2325982f9d80eb7f30d0e32b9401
3
+ size 2626751724
fineweb2_hq_superset_oracle/0000100000/__5_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fac6daed48331ec2fc05c63df73c5fd1af51e608f9b9b3df03b36e77c00d3f08
3
+ size 2626754000
fineweb2_hq_superset_oracle/0000100000/__6_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:66a02218da3ac7c48d903b75e296185722df9bbfe8d741cc686b21bdc8ad2191
3
+ size 2626754000
fineweb2_hq_superset_oracle/0000100000/__7_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:603d84f565f50d8bbb078a028576d2d8c9d084270371c787638d777045744644
3
+ size 2626565648
fineweb2_hq_superset_oracle/0000100000/params.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"name": "flexitok_superset_fineweb2_hq_oracle", "dump_dir": "/fsx/craffel/lingua_logs/fineweb2_hq_superset_oracle", "seed": 777, "grad_acc_steps": 8, "gc_collect_freq": 1000, "probe_freq": null, "steps": 100000, "data": {"root_dir": "/scratch/craffel/lingua/data/flexitok/", "sources": {"fw_edu": 0.4, "dan_Latn": 0.0216582869670702, "swe_Latn": 0.0216359765418466, "vie_Latn": 0.0197485510268674, "hun_Latn": 0.0247194573562308, "fas_Arab": 0.0205634624231076, "tur_Latn": 0.0235455794841729, "ces_Latn": 0.0248024455266208, "arb_Arab": 0.0234323706569333, "ell_Grek": 0.0233670886888026, "ind_Latn": 0.0269322054593488, "nld_Latn": 0.0277796326621489, "pol_Latn": 0.0294120104572311, "por_Latn": 0.0301413168306825, "ita_Latn": 0.0324056371021865, "jpn_Jpan": 0.03553104151369, "fra_Latn": 0.0381835560678536, "spa_Latn": 0.0387222793083669, "deu_Latn": 0.0419925340453022, "cmn_Hani": 0.0454067521384114, "rus_Cyrl": 0.0500198157431261}, "batch_size": 4, "seq_len": 4096, "n_views": 2, "seed": 42, "add_bos": true, "add_eos": true, "load_async": true, "prefetch_size": 1024, "tokenizer": {"name": "supertokenizer", "path": "meta-llama/Llama-3.2-1B", "tokenizers": [{"name": "huggingface", "path": "flexitok/bpe_arb_Arab_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ces_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_cmn_Hani_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_dan_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_deu_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ell_Grek_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fas_Arab_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fra_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fw_edu_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_hun_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ind_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ita_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_jpn_Jpan_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_nld_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_pol_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_por_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_rus_Cyrl_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_spa_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_swe_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_tur_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_vie_Latn_8000", "load_supermapping": true}], "load_supermapping": false, "dropout": 0.0, "seed": 42, "superset_code_name": "fineweb2_hq", "n_words": 113764}, "routing": {"source_to_tokenizer": {"arb_Arab": "flexitok/bpe_arb_Arab_8000", "ces_Latn": "flexitok/bpe_ces_Latn_8000", "cmn_Hani": "flexitok/bpe_cmn_Hani_8000", "dan_Latn": "flexitok/bpe_dan_Latn_8000", "deu_Latn": "flexitok/bpe_deu_Latn_8000", "ell_Grek": "flexitok/bpe_ell_Grek_8000", "fas_Arab": "flexitok/bpe_fas_Arab_8000", "fra_Latn": "flexitok/bpe_fra_Latn_8000", "fw_edu": "flexitok/bpe_fw_edu_8000", "hun_Latn": "flexitok/bpe_hun_Latn_8000", "ind_Latn": "flexitok/bpe_ind_Latn_8000", "ita_Latn": "flexitok/bpe_ita_Latn_8000", "jpn_Jpan": "flexitok/bpe_jpn_Jpan_8000", "nld_Latn": "flexitok/bpe_nld_Latn_8000", "pol_Latn": "flexitok/bpe_pol_Latn_8000", "por_Latn": "flexitok/bpe_por_Latn_8000", "rus_Cyrl": "flexitok/bpe_rus_Cyrl_8000", "spa_Latn": "flexitok/bpe_spa_Latn_8000", "swe_Latn": "flexitok/bpe_swe_Latn_8000", "tur_Latn": "flexitok/bpe_tur_Latn_8000", "vie_Latn": "flexitok/bpe_vie_Latn_8000"}, "suitable_tokenizer_probability": 1.0}}, "optim": {"lr": 0.001, "weight_decay": 0.1, "epsilon": 1e-08, "beta1": 0.9, "beta2": 0.95, "clip": 1.0, "scheduler": "cosine", "warmup": 2000, "lr_min_ratio": 1e-06, "cycle_length": 1.0, "cosine_theta": 1.0, "annealing_step": 1000, "decay_fraction": 0.1, "exp_factor": 0.5}, "model": {"dim": 2048, "n_layers": 25, "head_dim": null, "n_heads": 16, "n_kv_heads": null, "ffn_dim_multiplier": null, "multiple_of": 256, "norm_eps": 1e-05, "rope_theta": 10000.0, "init_base_std": null, "init_std_factor": "disabled", "max_seqlen": 4096, "seed": 42, "vocab_size": 113764, "weight_tying": false, "sliding_window": null, "use_factorized_embeddings": false, "factorized_embedding_dim": 0}, "distributed": {"dp_shard": 1, "dp_replicate": 8, "tp_size": 1, "selective_activation_checkpointing": false, "compile": true, "fsdp_type": "full_shard", "model_dtype": "bf16", "float8_recipe": null, "float8_filter": "layers\\.[0-9]+\\.", "matmul_allow_tf32": false, "detect_anomaly": false, "compile_cache_size_limit": 8, "spawn_method": "forkserver"}, "env": {"MKL_SERVICE_FORCE_INTEL": "GNU", "OMP_NUM_THREADS": "1", "MKL_NUM_THREADS": "1", "ENABLE_INTRA_NODE_COMM": "1", "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", "NCCL_IB_TIMEOUT": "22", "NCCL_DEBUG": "INFO", "TORCH_NCCL_ASYNC_ERROR_HANDLING": "1"}, "checkpoint": {"dump": {"every": 10000, "keep": -1}, "eval": {"every": 10000, "keep": -1}, "path": "/fsx/craffel/lingua_logs/fineweb2_hq_superset_oracle/checkpoints", "init_ckpt_path": null, "load_init_optimizer_state": false, "save_init_ckpt": false}, "profiling": {"run": true, "trace_folder": "profiling", "mem_warmup": 0, "mem_steps": 4, "profile_warmup": 100, "profile_steps": 4}, "logging": {"freq": 1, "acc_freq": null, "wandb": null}, "async_eval_gpus": 8, "eval": {"harness": {"tasks": ["hellaswag", "piqa", "arc_easy", "arc_challenge", "include_base_44_arabic", "include_base_44_chinese", "include_base_44_german", "include_base_44_greek", "include_base_44_persian", "include_base_44_french", "include_base_44_hungarian", "include_base_44_indonesian", "include_base_44_italian", "include_base_44_japanese", "include_base_44_dutch", "include_base_44_polish", "include_base_44_portuguese", "include_base_44_russian", "include_base_44_spanish", "include_base_44_turkish", "include_base_44_vietnamese", "belebele_arb_Arab", "belebele_ces_Latn", "belebele_zho_Hans", "belebele_dan_Latn", "belebele_deu_Latn", "belebele_ell_Grek", "belebele_pes_Arab", "belebele_fra_Latn", "belebele_hun_Latn", "belebele_ind_Latn", "belebele_ita_Latn", "belebele_jpn_Jpan", "belebele_nld_Latn", "belebele_pol_Latn", "belebele_por_Latn", "belebele_rus_Cyrl", "belebele_spa_Latn", "belebele_swe_Latn", "belebele_tur_Latn", "belebele_vie_Latn", "belebele_eng_Latn", "xnli_ar", "xnli_zh", "xnli_de", "xnli_el", "xnli_en", "xnli_es", "xnli_fr", "xnli_hi", "xnli_ru", "xnli_tr", "xnli_vi"]}, "generator": {"max_tokens": 16384, "dtype": "bf16", "add_bos": false}}}
fineweb2_hq_superset_oracle/0000100000/train_state_00000.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 100000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 306, "it_state": {"it_state": {"root_dir": "/scratch/craffel/lingua/data/flexitok/", "sources": {"fw_edu": 0.4, "dan_Latn": 0.0216582869670702, "swe_Latn": 0.0216359765418466, "vie_Latn": 0.0197485510268674, "hun_Latn": 0.0247194573562308, "fas_Arab": 0.0205634624231076, "tur_Latn": 0.0235455794841729, "ces_Latn": 0.0248024455266208, "arb_Arab": 0.0234323706569333, "ell_Grek": 0.0233670886888026, "ind_Latn": 0.0269322054593488, "nld_Latn": 0.0277796326621489, "pol_Latn": 0.0294120104572311, "por_Latn": 0.0301413168306825, "ita_Latn": 0.0324056371021865, "jpn_Jpan": 0.03553104151369, "fra_Latn": 0.0381835560678536, "spa_Latn": 0.0387222793083669, "deu_Latn": 0.0419925340453022, "cmn_Hani": 0.0454067521384114, "rus_Cyrl": 0.0500198157431261}, "source_to_state": {"fw_edu": {"file_path": "/scratch/craffel/lingua/data/flexitok/fw_edu/fineweb_edu_100bt.chunk.00.jsonl", "position": 25211902871, "block_size": 1, "offset": 0, "current_iter": 0}, "dan_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/dan_Latn/fineweb_2_hq.dan_Latn.chunk.00.jsonl", "position": 1157285505, "block_size": 1, "offset": 0, "current_iter": 0}, "swe_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/swe_Latn/fineweb_2_hq.swe_Latn.chunk.00.jsonl", "position": 1082549827, "block_size": 1, "offset": 0, "current_iter": 0}, "vie_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/vie_Latn/fineweb_2_hq.vie_Latn.chunk.00.jsonl", "position": 1159563797, "block_size": 1, "offset": 0, "current_iter": 0}, "hun_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/hun_Latn/fineweb_2_hq.hun_Latn.chunk.00.jsonl", "position": 1547040565, "block_size": 1, "offset": 0, "current_iter": 0}, "fas_Arab": {"file_path": "/scratch/craffel/lingua/data/flexitok/fas_Arab/fineweb_2_hq.fas_Arab.chunk.00.jsonl", "position": 1299850161, "block_size": 1, "offset": 0, "current_iter": 0}, "tur_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/tur_Latn/fineweb_2_hq.tur_Latn.chunk.00.jsonl", "position": 983443586, "block_size": 1, "offset": 0, "current_iter": 0}, "ces_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/ces_Latn/fineweb_2_hq.ces_Latn.chunk.00.jsonl", "position": 1513283497, "block_size": 1, "offset": 0, "current_iter": 0}, "arb_Arab": {"file_path": "/scratch/craffel/lingua/data/flexitok/arb_Arab/fineweb_2_hq.arb_Arab.chunk.00.jsonl", "position": 1695618305, "block_size": 1, "offset": 0, "current_iter": 0}, "ell_Grek": {"file_path": "/scratch/craffel/lingua/data/flexitok/ell_Grek/fineweb_2_hq.ell_Grek.chunk.00.jsonl", "position": 2108605159, "block_size": 1, "offset": 0, "current_iter": 0}, "ind_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/ind_Latn/fineweb_2_hq.ind_Latn.chunk.00.jsonl", "position": 1500052023, "block_size": 1, "offset": 0, "current_iter": 0}, "nld_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/nld_Latn/fineweb_2_hq.nld_Latn.chunk.00.jsonl", "position": 1195052061, "block_size": 1, "offset": 0, "current_iter": 0}, "pol_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/pol_Latn/fineweb_2_hq.pol_Latn.chunk.00.jsonl", "position": 1255593692, "block_size": 1, "offset": 0, "current_iter": 0}, "por_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/por_Latn/fineweb_2_hq.por_Latn.chunk.00.jsonl", "position": 1303927750, "block_size": 1, "offset": 0, "current_iter": 0}, "ita_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/ita_Latn/fineweb_2_hq.ita_Latn.chunk.00.jsonl", "position": 1456295200, "block_size": 1, "offset": 0, "current_iter": 0}, "jpn_Jpan": {"file_path": "/scratch/craffel/lingua/data/flexitok/jpn_Jpan/fineweb_2_hq.jpn_Jpan.chunk.00.jsonl", "position": 1524541429, "block_size": 1, "offset": 0, "current_iter": 0}, "fra_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/fra_Latn/fineweb_2_hq.fra_Latn.chunk.00.jsonl", "position": 2023611657, "block_size": 1, "offset": 0, "current_iter": 0}, "spa_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/spa_Latn/fineweb_2_hq.spa_Latn.chunk.00.jsonl", "position": 1818279643, "block_size": 1, "offset": 0, "current_iter": 0}, "deu_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/deu_Latn/fineweb_2_hq.deu_Latn.chunk.00.jsonl", "position": 2206479315, "block_size": 1, "offset": 0, "current_iter": 0}, "cmn_Hani": {"file_path": "/scratch/craffel/lingua/data/flexitok/cmn_Hani/fineweb_2_hq.cmn_Hani.chunk.00.jsonl", "position": 2337811221, "block_size": 1, "offset": 0, "current_iter": 0}, "rus_Cyrl": {"file_path": "/scratch/craffel/lingua/data/flexitok/rus_Cyrl/fineweb_2_hq.rus_Cyrl.chunk.00.jsonl", "position": 5503290245, "block_size": 1, "offset": 0, "current_iter": 0}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 217564847851477665956251761912029708799, "inc": 252101603063402394885084957393789173453}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "supertokenizer", "path": "meta-llama/Llama-3.2-1B", "tokenizers": [{"name": "huggingface", "path": "flexitok/bpe_arb_Arab_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ces_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_cmn_Hani_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_dan_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_deu_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ell_Grek_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fas_Arab_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fra_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fw_edu_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_hun_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ind_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ita_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_jpn_Jpan_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_nld_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_pol_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_por_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_rus_Cyrl_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_spa_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_swe_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_tur_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_vie_Latn_8000", "load_supermapping": true}], "dropout": 0.0, "rng_state": {"bit_generator": "PCG64", "state": {"state": 125157940657924324488197213018949239865, "inc": 332724090758049132448979897138935081983}, "has_uint32": 0, "uinteger": 0}, "seed": 42, "superset_code_name": "fineweb2_hq", "n_words": 113764, "routing": {"source_to_tokenizer": {"arb_Arab": "flexitok/bpe_arb_Arab_8000", "ces_Latn": "flexitok/bpe_ces_Latn_8000", "cmn_Hani": "flexitok/bpe_cmn_Hani_8000", "dan_Latn": "flexitok/bpe_dan_Latn_8000", "deu_Latn": "flexitok/bpe_deu_Latn_8000", "ell_Grek": "flexitok/bpe_ell_Grek_8000", "fas_Arab": "flexitok/bpe_fas_Arab_8000", "fra_Latn": "flexitok/bpe_fra_Latn_8000", "fw_edu": "flexitok/bpe_fw_edu_8000", "hun_Latn": "flexitok/bpe_hun_Latn_8000", "ind_Latn": "flexitok/bpe_ind_Latn_8000", "ita_Latn": "flexitok/bpe_ita_Latn_8000", "jpn_Jpan": "flexitok/bpe_jpn_Jpan_8000", "nld_Latn": "flexitok/bpe_nld_Latn_8000", "pol_Latn": "flexitok/bpe_pol_Latn_8000", "por_Latn": "flexitok/bpe_por_Latn_8000", "rus_Cyrl": "flexitok/bpe_rus_Cyrl_8000", "spa_Latn": "flexitok/bpe_spa_Latn_8000", "swe_Latn": "flexitok/bpe_swe_Latn_8000", "tur_Latn": "flexitok/bpe_tur_Latn_8000", "vie_Latn": "flexitok/bpe_vie_Latn_8000"}, "suitable_tokenizer_probability": 1.0}}, "output_seq_len": 4096, "n_views": 2}, "seq_idx": 256, "rng_state": {"bit_generator": "PCG64", "state": {"state": 300569569296678341640414112158566886944, "inc": 257317082376085721142933171929815648017}, "has_uint32": 1, "uinteger": 630457105}, "batch_size": 4, "prefetch_size": 1024}, "scheduler": {"base_lrs": [0.001], "last_epoch": 100000, "verbose": false, "_step_count": 100001, "_get_lr_called_within_step": false, "_last_lr": [0.001], "lr_lambdas": [{}]}}
fineweb2_hq_superset_oracle/0000100000/train_state_00001.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 100000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 779, "it_state": {"it_state": {"root_dir": "/scratch/craffel/lingua/data/flexitok/", "sources": {"fw_edu": 0.4, "dan_Latn": 0.0216582869670702, "swe_Latn": 0.0216359765418466, "vie_Latn": 0.0197485510268674, "hun_Latn": 0.0247194573562308, "fas_Arab": 0.0205634624231076, "tur_Latn": 0.0235455794841729, "ces_Latn": 0.0248024455266208, "arb_Arab": 0.0234323706569333, "ell_Grek": 0.0233670886888026, "ind_Latn": 0.0269322054593488, "nld_Latn": 0.0277796326621489, "pol_Latn": 0.0294120104572311, "por_Latn": 0.0301413168306825, "ita_Latn": 0.0324056371021865, "jpn_Jpan": 0.03553104151369, "fra_Latn": 0.0381835560678536, "spa_Latn": 0.0387222793083669, "deu_Latn": 0.0419925340453022, "cmn_Hani": 0.0454067521384114, "rus_Cyrl": 0.0500198157431261}, "source_to_state": {"fw_edu": {"file_path": "/scratch/craffel/lingua/data/flexitok/fw_edu/fineweb_edu_100bt.chunk.01.jsonl", "position": 25212451290, "block_size": 1, "offset": 0, "current_iter": 0}, "dan_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/dan_Latn/fineweb_2_hq.dan_Latn.chunk.01.jsonl", "position": 1149795785, "block_size": 1, "offset": 0, "current_iter": 0}, "swe_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/swe_Latn/fineweb_2_hq.swe_Latn.chunk.01.jsonl", "position": 1100645218, "block_size": 1, "offset": 0, "current_iter": 0}, "vie_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/vie_Latn/fineweb_2_hq.vie_Latn.chunk.01.jsonl", "position": 1159274118, "block_size": 1, "offset": 0, "current_iter": 0}, "hun_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/hun_Latn/fineweb_2_hq.hun_Latn.chunk.01.jsonl", "position": 1534304523, "block_size": 1, "offset": 0, "current_iter": 0}, "fas_Arab": {"file_path": "/scratch/craffel/lingua/data/flexitok/fas_Arab/fineweb_2_hq.fas_Arab.chunk.01.jsonl", "position": 1299946312, "block_size": 1, "offset": 0, "current_iter": 0}, "tur_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/tur_Latn/fineweb_2_hq.tur_Latn.chunk.01.jsonl", "position": 977019953, "block_size": 1, "offset": 0, "current_iter": 0}, "ces_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/ces_Latn/fineweb_2_hq.ces_Latn.chunk.01.jsonl", "position": 1523023896, "block_size": 1, "offset": 0, "current_iter": 0}, "arb_Arab": {"file_path": "/scratch/craffel/lingua/data/flexitok/arb_Arab/fineweb_2_hq.arb_Arab.chunk.01.jsonl", "position": 1705532703, "block_size": 1, "offset": 0, "current_iter": 0}, "ell_Grek": {"file_path": "/scratch/craffel/lingua/data/flexitok/ell_Grek/fineweb_2_hq.ell_Grek.chunk.01.jsonl", "position": 2121773439, "block_size": 1, "offset": 0, "current_iter": 0}, "ind_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/ind_Latn/fineweb_2_hq.ind_Latn.chunk.01.jsonl", "position": 1493311734, "block_size": 1, "offset": 0, "current_iter": 0}, "nld_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/nld_Latn/fineweb_2_hq.nld_Latn.chunk.01.jsonl", "position": 1192060455, "block_size": 1, "offset": 0, "current_iter": 0}, "pol_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/pol_Latn/fineweb_2_hq.pol_Latn.chunk.01.jsonl", "position": 1260663363, "block_size": 1, "offset": 0, "current_iter": 0}, "por_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/por_Latn/fineweb_2_hq.por_Latn.chunk.01.jsonl", "position": 1311836945, "block_size": 1, "offset": 0, "current_iter": 0}, "ita_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/ita_Latn/fineweb_2_hq.ita_Latn.chunk.01.jsonl", "position": 1455649127, "block_size": 1, "offset": 0, "current_iter": 0}, "jpn_Jpan": {"file_path": "/scratch/craffel/lingua/data/flexitok/jpn_Jpan/fineweb_2_hq.jpn_Jpan.chunk.01.jsonl", "position": 1522559161, "block_size": 1, "offset": 0, "current_iter": 0}, "fra_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/fra_Latn/fineweb_2_hq.fra_Latn.chunk.01.jsonl", "position": 2028114358, "block_size": 1, "offset": 0, "current_iter": 0}, "spa_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/spa_Latn/fineweb_2_hq.spa_Latn.chunk.01.jsonl", "position": 1838681960, "block_size": 1, "offset": 0, "current_iter": 0}, "deu_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/deu_Latn/fineweb_2_hq.deu_Latn.chunk.01.jsonl", "position": 2204928666, "block_size": 1, "offset": 0, "current_iter": 0}, "cmn_Hani": {"file_path": "/scratch/craffel/lingua/data/flexitok/cmn_Hani/fineweb_2_hq.cmn_Hani.chunk.01.jsonl", "position": 2341077515, "block_size": 1, "offset": 0, "current_iter": 0}, "rus_Cyrl": {"file_path": "/scratch/craffel/lingua/data/flexitok/rus_Cyrl/fineweb_2_hq.rus_Cyrl.chunk.01.jsonl", "position": 5458105242, "block_size": 1, "offset": 0, "current_iter": 0}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 170792644739050368700427021677186784648, "inc": 246509925186285949978196491240064802315}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "supertokenizer", "path": "meta-llama/Llama-3.2-1B", "tokenizers": [{"name": "huggingface", "path": "flexitok/bpe_arb_Arab_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ces_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_cmn_Hani_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_dan_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_deu_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ell_Grek_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fas_Arab_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fra_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fw_edu_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_hun_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ind_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ita_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_jpn_Jpan_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_nld_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_pol_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_por_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_rus_Cyrl_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_spa_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_swe_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_tur_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_vie_Latn_8000", "load_supermapping": true}], "dropout": 0.0, "rng_state": {"bit_generator": "PCG64", "state": {"state": 316148842036080281590551602282634283494, "inc": 332724090758049132448979897138935081983}, "has_uint32": 0, "uinteger": 0}, "seed": 42, "superset_code_name": "fineweb2_hq", "n_words": 113764, "routing": {"source_to_tokenizer": {"arb_Arab": "flexitok/bpe_arb_Arab_8000", "ces_Latn": "flexitok/bpe_ces_Latn_8000", "cmn_Hani": "flexitok/bpe_cmn_Hani_8000", "dan_Latn": "flexitok/bpe_dan_Latn_8000", "deu_Latn": "flexitok/bpe_deu_Latn_8000", "ell_Grek": "flexitok/bpe_ell_Grek_8000", "fas_Arab": "flexitok/bpe_fas_Arab_8000", "fra_Latn": "flexitok/bpe_fra_Latn_8000", "fw_edu": "flexitok/bpe_fw_edu_8000", "hun_Latn": "flexitok/bpe_hun_Latn_8000", "ind_Latn": "flexitok/bpe_ind_Latn_8000", "ita_Latn": "flexitok/bpe_ita_Latn_8000", "jpn_Jpan": "flexitok/bpe_jpn_Jpan_8000", "nld_Latn": "flexitok/bpe_nld_Latn_8000", "pol_Latn": "flexitok/bpe_pol_Latn_8000", "por_Latn": "flexitok/bpe_por_Latn_8000", "rus_Cyrl": "flexitok/bpe_rus_Cyrl_8000", "spa_Latn": "flexitok/bpe_spa_Latn_8000", "swe_Latn": "flexitok/bpe_swe_Latn_8000", "tur_Latn": "flexitok/bpe_tur_Latn_8000", "vie_Latn": "flexitok/bpe_vie_Latn_8000"}, "suitable_tokenizer_probability": 1.0}}, "output_seq_len": 4096, "n_views": 2}, "seq_idx": 256, "rng_state": {"bit_generator": "PCG64", "state": {"state": 298609680158430271867266436931522339460, "inc": 173555323965545256606922338259303677603}, "has_uint32": 1, "uinteger": 1182378492}, "batch_size": 4, "prefetch_size": 1024}, "scheduler": {"base_lrs": [0.001], "last_epoch": 100000, "verbose": false, "_step_count": 100001, "_get_lr_called_within_step": false, "_last_lr": [0.001], "lr_lambdas": [{}]}}
fineweb2_hq_superset_oracle/0000100000/train_state_00002.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 100000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 1772, "it_state": {"it_state": {"root_dir": "/scratch/craffel/lingua/data/flexitok/", "sources": {"fw_edu": 0.4, "dan_Latn": 0.0216582869670702, "swe_Latn": 0.0216359765418466, "vie_Latn": 0.0197485510268674, "hun_Latn": 0.0247194573562308, "fas_Arab": 0.0205634624231076, "tur_Latn": 0.0235455794841729, "ces_Latn": 0.0248024455266208, "arb_Arab": 0.0234323706569333, "ell_Grek": 0.0233670886888026, "ind_Latn": 0.0269322054593488, "nld_Latn": 0.0277796326621489, "pol_Latn": 0.0294120104572311, "por_Latn": 0.0301413168306825, "ita_Latn": 0.0324056371021865, "jpn_Jpan": 0.03553104151369, "fra_Latn": 0.0381835560678536, "spa_Latn": 0.0387222793083669, "deu_Latn": 0.0419925340453022, "cmn_Hani": 0.0454067521384114, "rus_Cyrl": 0.0500198157431261}, "source_to_state": {"fw_edu": {"file_path": "/scratch/craffel/lingua/data/flexitok/fw_edu/fineweb_edu_100bt.chunk.02.jsonl", "position": 25226736981, "block_size": 1, "offset": 0, "current_iter": 0}, "dan_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/dan_Latn/fineweb_2_hq.dan_Latn.chunk.02.jsonl", "position": 1159402883, "block_size": 1, "offset": 0, "current_iter": 0}, "swe_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/swe_Latn/fineweb_2_hq.swe_Latn.chunk.02.jsonl", "position": 1081465255, "block_size": 1, "offset": 0, "current_iter": 0}, "vie_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/vie_Latn/fineweb_2_hq.vie_Latn.chunk.02.jsonl", "position": 1158420487, "block_size": 1, "offset": 0, "current_iter": 0}, "hun_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/hun_Latn/fineweb_2_hq.hun_Latn.chunk.02.jsonl", "position": 1537115868, "block_size": 1, "offset": 0, "current_iter": 0}, "fas_Arab": {"file_path": "/scratch/craffel/lingua/data/flexitok/fas_Arab/fineweb_2_hq.fas_Arab.chunk.02.jsonl", "position": 1295792848, "block_size": 1, "offset": 0, "current_iter": 0}, "tur_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/tur_Latn/fineweb_2_hq.tur_Latn.chunk.02.jsonl", "position": 985443351, "block_size": 1, "offset": 0, "current_iter": 0}, "ces_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/ces_Latn/fineweb_2_hq.ces_Latn.chunk.02.jsonl", "position": 1501066721, "block_size": 1, "offset": 0, "current_iter": 0}, "arb_Arab": {"file_path": "/scratch/craffel/lingua/data/flexitok/arb_Arab/fineweb_2_hq.arb_Arab.chunk.02.jsonl", "position": 1712545062, "block_size": 1, "offset": 0, "current_iter": 0}, "ell_Grek": {"file_path": "/scratch/craffel/lingua/data/flexitok/ell_Grek/fineweb_2_hq.ell_Grek.chunk.02.jsonl", "position": 2103755335, "block_size": 1, "offset": 0, "current_iter": 0}, "ind_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/ind_Latn/fineweb_2_hq.ind_Latn.chunk.02.jsonl", "position": 1504338970, "block_size": 1, "offset": 0, "current_iter": 0}, "nld_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/nld_Latn/fineweb_2_hq.nld_Latn.chunk.02.jsonl", "position": 1201293257, "block_size": 1, "offset": 0, "current_iter": 0}, "pol_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/pol_Latn/fineweb_2_hq.pol_Latn.chunk.02.jsonl", "position": 1253809024, "block_size": 1, "offset": 0, "current_iter": 0}, "por_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/por_Latn/fineweb_2_hq.por_Latn.chunk.02.jsonl", "position": 1312479021, "block_size": 1, "offset": 0, "current_iter": 0}, "ita_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/ita_Latn/fineweb_2_hq.ita_Latn.chunk.02.jsonl", "position": 1462587292, "block_size": 1, "offset": 0, "current_iter": 0}, "jpn_Jpan": {"file_path": "/scratch/craffel/lingua/data/flexitok/jpn_Jpan/fineweb_2_hq.jpn_Jpan.chunk.02.jsonl", "position": 1532935641, "block_size": 1, "offset": 0, "current_iter": 0}, "fra_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/fra_Latn/fineweb_2_hq.fra_Latn.chunk.02.jsonl", "position": 2022643799, "block_size": 1, "offset": 0, "current_iter": 0}, "spa_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/spa_Latn/fineweb_2_hq.spa_Latn.chunk.02.jsonl", "position": 1811046814, "block_size": 1, "offset": 0, "current_iter": 0}, "deu_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/deu_Latn/fineweb_2_hq.deu_Latn.chunk.02.jsonl", "position": 2204602971, "block_size": 1, "offset": 0, "current_iter": 0}, "cmn_Hani": {"file_path": "/scratch/craffel/lingua/data/flexitok/cmn_Hani/fineweb_2_hq.cmn_Hani.chunk.02.jsonl", "position": 2339778245, "block_size": 1, "offset": 0, "current_iter": 0}, "rus_Cyrl": {"file_path": "/scratch/craffel/lingua/data/flexitok/rus_Cyrl/fineweb_2_hq.rus_Cyrl.chunk.02.jsonl", "position": 5462380423, "block_size": 1, "offset": 0, "current_iter": 0}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 78406532954139751752527080872347323957, "inc": 234358335530849485425064040311006256713}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "supertokenizer", "path": "meta-llama/Llama-3.2-1B", "tokenizers": [{"name": "huggingface", "path": "flexitok/bpe_arb_Arab_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ces_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_cmn_Hani_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_dan_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_deu_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ell_Grek_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fas_Arab_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fra_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fw_edu_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_hun_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ind_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ita_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_jpn_Jpan_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_nld_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_pol_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_por_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_rus_Cyrl_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_spa_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_swe_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_tur_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_vie_Latn_8000", "load_supermapping": true}], "dropout": 0.0, "rng_state": {"bit_generator": "PCG64", "state": {"state": 219426431185928772972372231830118304479, "inc": 332724090758049132448979897138935081983}, "has_uint32": 0, "uinteger": 0}, "seed": 42, "superset_code_name": "fineweb2_hq", "n_words": 113764, "routing": {"source_to_tokenizer": {"arb_Arab": "flexitok/bpe_arb_Arab_8000", "ces_Latn": "flexitok/bpe_ces_Latn_8000", "cmn_Hani": "flexitok/bpe_cmn_Hani_8000", "dan_Latn": "flexitok/bpe_dan_Latn_8000", "deu_Latn": "flexitok/bpe_deu_Latn_8000", "ell_Grek": "flexitok/bpe_ell_Grek_8000", "fas_Arab": "flexitok/bpe_fas_Arab_8000", "fra_Latn": "flexitok/bpe_fra_Latn_8000", "fw_edu": "flexitok/bpe_fw_edu_8000", "hun_Latn": "flexitok/bpe_hun_Latn_8000", "ind_Latn": "flexitok/bpe_ind_Latn_8000", "ita_Latn": "flexitok/bpe_ita_Latn_8000", "jpn_Jpan": "flexitok/bpe_jpn_Jpan_8000", "nld_Latn": "flexitok/bpe_nld_Latn_8000", "pol_Latn": "flexitok/bpe_pol_Latn_8000", "por_Latn": "flexitok/bpe_por_Latn_8000", "rus_Cyrl": "flexitok/bpe_rus_Cyrl_8000", "spa_Latn": "flexitok/bpe_spa_Latn_8000", "swe_Latn": "flexitok/bpe_swe_Latn_8000", "tur_Latn": "flexitok/bpe_tur_Latn_8000", "vie_Latn": "flexitok/bpe_vie_Latn_8000"}, "suitable_tokenizer_probability": 1.0}}, "output_seq_len": 4096, "n_views": 2}, "seq_idx": 256, "rng_state": {"bit_generator": "PCG64", "state": {"state": 172206642884315098528897268843500314535, "inc": 319170006889470250209362588441616495209}, "has_uint32": 1, "uinteger": 2183176397}, "batch_size": 4, "prefetch_size": 1024}, "scheduler": {"base_lrs": [0.001], "last_epoch": 100000, "verbose": false, "_step_count": 100001, "_get_lr_called_within_step": false, "_last_lr": [0.001], "lr_lambdas": [{}]}}
fineweb2_hq_superset_oracle/0000100000/train_state_00003.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 100000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 245, "it_state": {"it_state": {"root_dir": "/scratch/craffel/lingua/data/flexitok/", "sources": {"fw_edu": 0.4, "dan_Latn": 0.0216582869670702, "swe_Latn": 0.0216359765418466, "vie_Latn": 0.0197485510268674, "hun_Latn": 0.0247194573562308, "fas_Arab": 0.0205634624231076, "tur_Latn": 0.0235455794841729, "ces_Latn": 0.0248024455266208, "arb_Arab": 0.0234323706569333, "ell_Grek": 0.0233670886888026, "ind_Latn": 0.0269322054593488, "nld_Latn": 0.0277796326621489, "pol_Latn": 0.0294120104572311, "por_Latn": 0.0301413168306825, "ita_Latn": 0.0324056371021865, "jpn_Jpan": 0.03553104151369, "fra_Latn": 0.0381835560678536, "spa_Latn": 0.0387222793083669, "deu_Latn": 0.0419925340453022, "cmn_Hani": 0.0454067521384114, "rus_Cyrl": 0.0500198157431261}, "source_to_state": {"fw_edu": {"file_path": "/scratch/craffel/lingua/data/flexitok/fw_edu/fineweb_edu_100bt.chunk.03.jsonl", "position": 25213305359, "block_size": 1, "offset": 0, "current_iter": 0}, "dan_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/dan_Latn/fineweb_2_hq.dan_Latn.chunk.03.jsonl", "position": 1155598630, "block_size": 1, "offset": 0, "current_iter": 0}, "swe_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/swe_Latn/fineweb_2_hq.swe_Latn.chunk.03.jsonl", "position": 1081427832, "block_size": 1, "offset": 0, "current_iter": 0}, "vie_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/vie_Latn/fineweb_2_hq.vie_Latn.chunk.03.jsonl", "position": 1166605369, "block_size": 1, "offset": 0, "current_iter": 0}, "hun_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/hun_Latn/fineweb_2_hq.hun_Latn.chunk.03.jsonl", "position": 1549288983, "block_size": 1, "offset": 0, "current_iter": 0}, "fas_Arab": {"file_path": "/scratch/craffel/lingua/data/flexitok/fas_Arab/fineweb_2_hq.fas_Arab.chunk.03.jsonl", "position": 1291398724, "block_size": 1, "offset": 0, "current_iter": 0}, "tur_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/tur_Latn/fineweb_2_hq.tur_Latn.chunk.03.jsonl", "position": 991420281, "block_size": 1, "offset": 0, "current_iter": 0}, "ces_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/ces_Latn/fineweb_2_hq.ces_Latn.chunk.03.jsonl", "position": 1509358421, "block_size": 1, "offset": 0, "current_iter": 0}, "arb_Arab": {"file_path": "/scratch/craffel/lingua/data/flexitok/arb_Arab/fineweb_2_hq.arb_Arab.chunk.03.jsonl", "position": 1699742167, "block_size": 1, "offset": 0, "current_iter": 0}, "ell_Grek": {"file_path": "/scratch/craffel/lingua/data/flexitok/ell_Grek/fineweb_2_hq.ell_Grek.chunk.03.jsonl", "position": 2124178896, "block_size": 1, "offset": 0, "current_iter": 0}, "ind_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/ind_Latn/fineweb_2_hq.ind_Latn.chunk.03.jsonl", "position": 1497366852, "block_size": 1, "offset": 0, "current_iter": 0}, "nld_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/nld_Latn/fineweb_2_hq.nld_Latn.chunk.03.jsonl", "position": 1205368042, "block_size": 1, "offset": 0, "current_iter": 0}, "pol_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/pol_Latn/fineweb_2_hq.pol_Latn.chunk.03.jsonl", "position": 1257041789, "block_size": 1, "offset": 0, "current_iter": 0}, "por_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/por_Latn/fineweb_2_hq.por_Latn.chunk.03.jsonl", "position": 1302345898, "block_size": 1, "offset": 0, "current_iter": 0}, "ita_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/ita_Latn/fineweb_2_hq.ita_Latn.chunk.03.jsonl", "position": 1463442503, "block_size": 1, "offset": 0, "current_iter": 0}, "jpn_Jpan": {"file_path": "/scratch/craffel/lingua/data/flexitok/jpn_Jpan/fineweb_2_hq.jpn_Jpan.chunk.03.jsonl", "position": 1510723079, "block_size": 1, "offset": 0, "current_iter": 0}, "fra_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/fra_Latn/fineweb_2_hq.fra_Latn.chunk.03.jsonl", "position": 2016174064, "block_size": 1, "offset": 0, "current_iter": 0}, "spa_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/spa_Latn/fineweb_2_hq.spa_Latn.chunk.03.jsonl", "position": 1829906688, "block_size": 1, "offset": 0, "current_iter": 0}, "deu_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/deu_Latn/fineweb_2_hq.deu_Latn.chunk.03.jsonl", "position": 2210604373, "block_size": 1, "offset": 0, "current_iter": 0}, "cmn_Hani": {"file_path": "/scratch/craffel/lingua/data/flexitok/cmn_Hani/fineweb_2_hq.cmn_Hani.chunk.03.jsonl", "position": 2335056374, "block_size": 1, "offset": 0, "current_iter": 0}, "rus_Cyrl": {"file_path": "/scratch/craffel/lingua/data/flexitok/rus_Cyrl/fineweb_2_hq.rus_Cyrl.chunk.03.jsonl", "position": 5494391656, "block_size": 1, "offset": 0, "current_iter": 0}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 65304134056203034814964073875323843608, "inc": 148211758571781046255077612135386035203}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "supertokenizer", "path": "meta-llama/Llama-3.2-1B", "tokenizers": [{"name": "huggingface", "path": "flexitok/bpe_arb_Arab_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ces_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_cmn_Hani_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_dan_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_deu_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ell_Grek_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fas_Arab_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fra_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fw_edu_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_hun_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ind_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ita_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_jpn_Jpan_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_nld_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_pol_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_por_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_rus_Cyrl_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_spa_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_swe_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_tur_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_vie_Latn_8000", "load_supermapping": true}], "dropout": 0.0, "rng_state": {"bit_generator": "PCG64", "state": {"state": 37109397118048674082937746491550082722, "inc": 332724090758049132448979897138935081983}, "has_uint32": 0, "uinteger": 0}, "seed": 42, "superset_code_name": "fineweb2_hq", "n_words": 113764, "routing": {"source_to_tokenizer": {"arb_Arab": "flexitok/bpe_arb_Arab_8000", "ces_Latn": "flexitok/bpe_ces_Latn_8000", "cmn_Hani": "flexitok/bpe_cmn_Hani_8000", "dan_Latn": "flexitok/bpe_dan_Latn_8000", "deu_Latn": "flexitok/bpe_deu_Latn_8000", "ell_Grek": "flexitok/bpe_ell_Grek_8000", "fas_Arab": "flexitok/bpe_fas_Arab_8000", "fra_Latn": "flexitok/bpe_fra_Latn_8000", "fw_edu": "flexitok/bpe_fw_edu_8000", "hun_Latn": "flexitok/bpe_hun_Latn_8000", "ind_Latn": "flexitok/bpe_ind_Latn_8000", "ita_Latn": "flexitok/bpe_ita_Latn_8000", "jpn_Jpan": "flexitok/bpe_jpn_Jpan_8000", "nld_Latn": "flexitok/bpe_nld_Latn_8000", "pol_Latn": "flexitok/bpe_pol_Latn_8000", "por_Latn": "flexitok/bpe_por_Latn_8000", "rus_Cyrl": "flexitok/bpe_rus_Cyrl_8000", "spa_Latn": "flexitok/bpe_spa_Latn_8000", "swe_Latn": "flexitok/bpe_swe_Latn_8000", "tur_Latn": "flexitok/bpe_tur_Latn_8000", "vie_Latn": "flexitok/bpe_vie_Latn_8000"}, "suitable_tokenizer_probability": 1.0}}, "output_seq_len": 4096, "n_views": 2}, "seq_idx": 256, "rng_state": {"bit_generator": "PCG64", "state": {"state": 54691439170624420224489263842165716902, "inc": 115810872492597857501795428972873905393}, "has_uint32": 1, "uinteger": 1237320779}, "batch_size": 4, "prefetch_size": 1024}, "scheduler": {"base_lrs": [0.001], "last_epoch": 100000, "verbose": false, "_step_count": 100001, "_get_lr_called_within_step": false, "_last_lr": [0.001], "lr_lambdas": [{}]}}
fineweb2_hq_superset_oracle/0000100000/train_state_00004.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 100000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 1727, "it_state": {"it_state": {"root_dir": "/scratch/craffel/lingua/data/flexitok/", "sources": {"fw_edu": 0.4, "dan_Latn": 0.0216582869670702, "swe_Latn": 0.0216359765418466, "vie_Latn": 0.0197485510268674, "hun_Latn": 0.0247194573562308, "fas_Arab": 0.0205634624231076, "tur_Latn": 0.0235455794841729, "ces_Latn": 0.0248024455266208, "arb_Arab": 0.0234323706569333, "ell_Grek": 0.0233670886888026, "ind_Latn": 0.0269322054593488, "nld_Latn": 0.0277796326621489, "pol_Latn": 0.0294120104572311, "por_Latn": 0.0301413168306825, "ita_Latn": 0.0324056371021865, "jpn_Jpan": 0.03553104151369, "fra_Latn": 0.0381835560678536, "spa_Latn": 0.0387222793083669, "deu_Latn": 0.0419925340453022, "cmn_Hani": 0.0454067521384114, "rus_Cyrl": 0.0500198157431261}, "source_to_state": {"fw_edu": {"file_path": "/scratch/craffel/lingua/data/flexitok/fw_edu/fineweb_edu_100bt.chunk.04.jsonl", "position": 25204288713, "block_size": 1, "offset": 0, "current_iter": 0}, "dan_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/dan_Latn/fineweb_2_hq.dan_Latn.chunk.04.jsonl", "position": 1161370401, "block_size": 1, "offset": 0, "current_iter": 0}, "swe_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/swe_Latn/fineweb_2_hq.swe_Latn.chunk.04.jsonl", "position": 1081257581, "block_size": 1, "offset": 0, "current_iter": 0}, "vie_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/vie_Latn/fineweb_2_hq.vie_Latn.chunk.04.jsonl", "position": 1161554367, "block_size": 1, "offset": 0, "current_iter": 0}, "hun_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/hun_Latn/fineweb_2_hq.hun_Latn.chunk.04.jsonl", "position": 1544980246, "block_size": 1, "offset": 0, "current_iter": 0}, "fas_Arab": {"file_path": "/scratch/craffel/lingua/data/flexitok/fas_Arab/fineweb_2_hq.fas_Arab.chunk.04.jsonl", "position": 1285839860, "block_size": 1, "offset": 0, "current_iter": 0}, "tur_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/tur_Latn/fineweb_2_hq.tur_Latn.chunk.04.jsonl", "position": 985434926, "block_size": 1, "offset": 0, "current_iter": 0}, "ces_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/ces_Latn/fineweb_2_hq.ces_Latn.chunk.04.jsonl", "position": 1518709313, "block_size": 1, "offset": 0, "current_iter": 0}, "arb_Arab": {"file_path": "/scratch/craffel/lingua/data/flexitok/arb_Arab/fineweb_2_hq.arb_Arab.chunk.04.jsonl", "position": 1697884125, "block_size": 1, "offset": 0, "current_iter": 0}, "ell_Grek": {"file_path": "/scratch/craffel/lingua/data/flexitok/ell_Grek/fineweb_2_hq.ell_Grek.chunk.04.jsonl", "position": 2137165524, "block_size": 1, "offset": 0, "current_iter": 0}, "ind_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/ind_Latn/fineweb_2_hq.ind_Latn.chunk.04.jsonl", "position": 1490968997, "block_size": 1, "offset": 0, "current_iter": 0}, "nld_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/nld_Latn/fineweb_2_hq.nld_Latn.chunk.04.jsonl", "position": 1198129712, "block_size": 1, "offset": 0, "current_iter": 0}, "pol_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/pol_Latn/fineweb_2_hq.pol_Latn.chunk.04.jsonl", "position": 1254404693, "block_size": 1, "offset": 0, "current_iter": 0}, "por_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/por_Latn/fineweb_2_hq.por_Latn.chunk.04.jsonl", "position": 1312986834, "block_size": 1, "offset": 0, "current_iter": 0}, "ita_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/ita_Latn/fineweb_2_hq.ita_Latn.chunk.04.jsonl", "position": 1447992191, "block_size": 1, "offset": 0, "current_iter": 0}, "jpn_Jpan": {"file_path": "/scratch/craffel/lingua/data/flexitok/jpn_Jpan/fineweb_2_hq.jpn_Jpan.chunk.04.jsonl", "position": 1534295431, "block_size": 1, "offset": 0, "current_iter": 0}, "fra_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/fra_Latn/fineweb_2_hq.fra_Latn.chunk.04.jsonl", "position": 2030657097, "block_size": 1, "offset": 0, "current_iter": 0}, "spa_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/spa_Latn/fineweb_2_hq.spa_Latn.chunk.04.jsonl", "position": 1813189536, "block_size": 1, "offset": 0, "current_iter": 0}, "deu_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/deu_Latn/fineweb_2_hq.deu_Latn.chunk.04.jsonl", "position": 2200076956, "block_size": 1, "offset": 0, "current_iter": 0}, "cmn_Hani": {"file_path": "/scratch/craffel/lingua/data/flexitok/cmn_Hani/fineweb_2_hq.cmn_Hani.chunk.04.jsonl", "position": 2338014490, "block_size": 1, "offset": 0, "current_iter": 0}, "rus_Cyrl": {"file_path": "/scratch/craffel/lingua/data/flexitok/rus_Cyrl/fineweb_2_hq.rus_Cyrl.chunk.04.jsonl", "position": 5493716351, "block_size": 1, "offset": 0, "current_iter": 0}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 201106476958641232530589734220828791282, "inc": 186633262021180533256729114674950595327}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "supertokenizer", "path": "meta-llama/Llama-3.2-1B", "tokenizers": [{"name": "huggingface", "path": "flexitok/bpe_arb_Arab_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ces_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_cmn_Hani_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_dan_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_deu_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ell_Grek_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fas_Arab_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fra_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fw_edu_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_hun_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ind_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ita_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_jpn_Jpan_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_nld_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_pol_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_por_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_rus_Cyrl_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_spa_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_swe_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_tur_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_vie_Latn_8000", "load_supermapping": true}], "dropout": 0.0, "rng_state": {"bit_generator": "PCG64", "state": {"state": 86768207423770862006994754676507252918, "inc": 332724090758049132448979897138935081983}, "has_uint32": 0, "uinteger": 0}, "seed": 42, "superset_code_name": "fineweb2_hq", "n_words": 113764, "routing": {"source_to_tokenizer": {"arb_Arab": "flexitok/bpe_arb_Arab_8000", "ces_Latn": "flexitok/bpe_ces_Latn_8000", "cmn_Hani": "flexitok/bpe_cmn_Hani_8000", "dan_Latn": "flexitok/bpe_dan_Latn_8000", "deu_Latn": "flexitok/bpe_deu_Latn_8000", "ell_Grek": "flexitok/bpe_ell_Grek_8000", "fas_Arab": "flexitok/bpe_fas_Arab_8000", "fra_Latn": "flexitok/bpe_fra_Latn_8000", "fw_edu": "flexitok/bpe_fw_edu_8000", "hun_Latn": "flexitok/bpe_hun_Latn_8000", "ind_Latn": "flexitok/bpe_ind_Latn_8000", "ita_Latn": "flexitok/bpe_ita_Latn_8000", "jpn_Jpan": "flexitok/bpe_jpn_Jpan_8000", "nld_Latn": "flexitok/bpe_nld_Latn_8000", "pol_Latn": "flexitok/bpe_pol_Latn_8000", "por_Latn": "flexitok/bpe_por_Latn_8000", "rus_Cyrl": "flexitok/bpe_rus_Cyrl_8000", "spa_Latn": "flexitok/bpe_spa_Latn_8000", "swe_Latn": "flexitok/bpe_swe_Latn_8000", "tur_Latn": "flexitok/bpe_tur_Latn_8000", "vie_Latn": "flexitok/bpe_vie_Latn_8000"}, "suitable_tokenizer_probability": 1.0}}, "output_seq_len": 4096, "n_views": 2}, "seq_idx": 256, "rng_state": {"bit_generator": "PCG64", "state": {"state": 330736315247521707224292038935126153445, "inc": 303111205818808944921858206842105131807}, "has_uint32": 1, "uinteger": 211256137}, "batch_size": 4, "prefetch_size": 1024}, "scheduler": {"base_lrs": [0.001], "last_epoch": 100000, "verbose": false, "_step_count": 100001, "_get_lr_called_within_step": false, "_last_lr": [0.001], "lr_lambdas": [{}]}}
fineweb2_hq_superset_oracle/0000100000/train_state_00005.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 100000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 1, "it_state": {"it_state": {"root_dir": "/scratch/craffel/lingua/data/flexitok/", "sources": {"fw_edu": 0.4, "dan_Latn": 0.0216582869670702, "swe_Latn": 0.0216359765418466, "vie_Latn": 0.0197485510268674, "hun_Latn": 0.0247194573562308, "fas_Arab": 0.0205634624231076, "tur_Latn": 0.0235455794841729, "ces_Latn": 0.0248024455266208, "arb_Arab": 0.0234323706569333, "ell_Grek": 0.0233670886888026, "ind_Latn": 0.0269322054593488, "nld_Latn": 0.0277796326621489, "pol_Latn": 0.0294120104572311, "por_Latn": 0.0301413168306825, "ita_Latn": 0.0324056371021865, "jpn_Jpan": 0.03553104151369, "fra_Latn": 0.0381835560678536, "spa_Latn": 0.0387222793083669, "deu_Latn": 0.0419925340453022, "cmn_Hani": 0.0454067521384114, "rus_Cyrl": 0.0500198157431261}, "source_to_state": {"fw_edu": {"file_path": "/scratch/craffel/lingua/data/flexitok/fw_edu/fineweb_edu_100bt.chunk.05.jsonl", "position": 25235000940, "block_size": 1, "offset": 0, "current_iter": 0}, "dan_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/dan_Latn/fineweb_2_hq.dan_Latn.chunk.05.jsonl", "position": 1151391398, "block_size": 1, "offset": 0, "current_iter": 0}, "swe_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/swe_Latn/fineweb_2_hq.swe_Latn.chunk.05.jsonl", "position": 1082338472, "block_size": 1, "offset": 0, "current_iter": 0}, "vie_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/vie_Latn/fineweb_2_hq.vie_Latn.chunk.05.jsonl", "position": 1167836660, "block_size": 1, "offset": 0, "current_iter": 0}, "hun_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/hun_Latn/fineweb_2_hq.hun_Latn.chunk.05.jsonl", "position": 1533936888, "block_size": 1, "offset": 0, "current_iter": 0}, "fas_Arab": {"file_path": "/scratch/craffel/lingua/data/flexitok/fas_Arab/fineweb_2_hq.fas_Arab.chunk.05.jsonl", "position": 1290699266, "block_size": 1, "offset": 0, "current_iter": 0}, "tur_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/tur_Latn/fineweb_2_hq.tur_Latn.chunk.05.jsonl", "position": 981931221, "block_size": 1, "offset": 0, "current_iter": 0}, "ces_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/ces_Latn/fineweb_2_hq.ces_Latn.chunk.05.jsonl", "position": 1519092011, "block_size": 1, "offset": 0, "current_iter": 0}, "arb_Arab": {"file_path": "/scratch/craffel/lingua/data/flexitok/arb_Arab/fineweb_2_hq.arb_Arab.chunk.05.jsonl", "position": 1702958322, "block_size": 1, "offset": 0, "current_iter": 0}, "ell_Grek": {"file_path": "/scratch/craffel/lingua/data/flexitok/ell_Grek/fineweb_2_hq.ell_Grek.chunk.05.jsonl", "position": 2122352253, "block_size": 1, "offset": 0, "current_iter": 0}, "ind_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/ind_Latn/fineweb_2_hq.ind_Latn.chunk.05.jsonl", "position": 1504513627, "block_size": 1, "offset": 0, "current_iter": 0}, "nld_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/nld_Latn/fineweb_2_hq.nld_Latn.chunk.05.jsonl", "position": 1184191468, "block_size": 1, "offset": 0, "current_iter": 0}, "pol_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/pol_Latn/fineweb_2_hq.pol_Latn.chunk.05.jsonl", "position": 1266204484, "block_size": 1, "offset": 0, "current_iter": 0}, "por_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/por_Latn/fineweb_2_hq.por_Latn.chunk.05.jsonl", "position": 1306913887, "block_size": 1, "offset": 0, "current_iter": 0}, "ita_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/ita_Latn/fineweb_2_hq.ita_Latn.chunk.05.jsonl", "position": 1454401326, "block_size": 1, "offset": 0, "current_iter": 0}, "jpn_Jpan": {"file_path": "/scratch/craffel/lingua/data/flexitok/jpn_Jpan/fineweb_2_hq.jpn_Jpan.chunk.05.jsonl", "position": 1525761175, "block_size": 1, "offset": 0, "current_iter": 0}, "fra_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/fra_Latn/fineweb_2_hq.fra_Latn.chunk.05.jsonl", "position": 2033994816, "block_size": 1, "offset": 0, "current_iter": 0}, "spa_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/spa_Latn/fineweb_2_hq.spa_Latn.chunk.05.jsonl", "position": 1824271361, "block_size": 1, "offset": 0, "current_iter": 0}, "deu_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/deu_Latn/fineweb_2_hq.deu_Latn.chunk.05.jsonl", "position": 2204627465, "block_size": 1, "offset": 0, "current_iter": 0}, "cmn_Hani": {"file_path": "/scratch/craffel/lingua/data/flexitok/cmn_Hani/fineweb_2_hq.cmn_Hani.chunk.05.jsonl", "position": 2330506803, "block_size": 1, "offset": 0, "current_iter": 0}, "rus_Cyrl": {"file_path": "/scratch/craffel/lingua/data/flexitok/rus_Cyrl/fineweb_2_hq.rus_Cyrl.chunk.05.jsonl", "position": 5446616877, "block_size": 1, "offset": 0, "current_iter": 0}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 240371827841764391988259859712252951183, "inc": 329233669073478483697346584247981015037}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "supertokenizer", "path": "meta-llama/Llama-3.2-1B", "tokenizers": [{"name": "huggingface", "path": "flexitok/bpe_arb_Arab_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ces_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_cmn_Hani_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_dan_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_deu_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ell_Grek_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fas_Arab_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fra_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fw_edu_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_hun_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ind_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ita_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_jpn_Jpan_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_nld_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_pol_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_por_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_rus_Cyrl_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_spa_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_swe_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_tur_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_vie_Latn_8000", "load_supermapping": true}], "dropout": 0.0, "rng_state": {"bit_generator": "PCG64", "state": {"state": 148940459418961144841451594188526566633, "inc": 332724090758049132448979897138935081983}, "has_uint32": 0, "uinteger": 0}, "seed": 42, "superset_code_name": "fineweb2_hq", "n_words": 113764, "routing": {"source_to_tokenizer": {"arb_Arab": "flexitok/bpe_arb_Arab_8000", "ces_Latn": "flexitok/bpe_ces_Latn_8000", "cmn_Hani": "flexitok/bpe_cmn_Hani_8000", "dan_Latn": "flexitok/bpe_dan_Latn_8000", "deu_Latn": "flexitok/bpe_deu_Latn_8000", "ell_Grek": "flexitok/bpe_ell_Grek_8000", "fas_Arab": "flexitok/bpe_fas_Arab_8000", "fra_Latn": "flexitok/bpe_fra_Latn_8000", "fw_edu": "flexitok/bpe_fw_edu_8000", "hun_Latn": "flexitok/bpe_hun_Latn_8000", "ind_Latn": "flexitok/bpe_ind_Latn_8000", "ita_Latn": "flexitok/bpe_ita_Latn_8000", "jpn_Jpan": "flexitok/bpe_jpn_Jpan_8000", "nld_Latn": "flexitok/bpe_nld_Latn_8000", "pol_Latn": "flexitok/bpe_pol_Latn_8000", "por_Latn": "flexitok/bpe_por_Latn_8000", "rus_Cyrl": "flexitok/bpe_rus_Cyrl_8000", "spa_Latn": "flexitok/bpe_spa_Latn_8000", "swe_Latn": "flexitok/bpe_swe_Latn_8000", "tur_Latn": "flexitok/bpe_tur_Latn_8000", "vie_Latn": "flexitok/bpe_vie_Latn_8000"}, "suitable_tokenizer_probability": 1.0}}, "output_seq_len": 4096, "n_views": 2}, "seq_idx": 256, "rng_state": {"bit_generator": "PCG64", "state": {"state": 152527936818944850177382785176910810558, "inc": 47382953940698287647753879262736142901}, "has_uint32": 1, "uinteger": 3675784275}, "batch_size": 4, "prefetch_size": 1024}, "scheduler": {"base_lrs": [0.001], "last_epoch": 100000, "verbose": false, "_step_count": 100001, "_get_lr_called_within_step": false, "_last_lr": [0.001], "lr_lambdas": [{}]}}
fineweb2_hq_superset_oracle/0000100000/train_state_00006.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 100000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 3650, "it_state": {"it_state": {"root_dir": "/scratch/craffel/lingua/data/flexitok/", "sources": {"fw_edu": 0.4, "dan_Latn": 0.0216582869670702, "swe_Latn": 0.0216359765418466, "vie_Latn": 0.0197485510268674, "hun_Latn": 0.0247194573562308, "fas_Arab": 0.0205634624231076, "tur_Latn": 0.0235455794841729, "ces_Latn": 0.0248024455266208, "arb_Arab": 0.0234323706569333, "ell_Grek": 0.0233670886888026, "ind_Latn": 0.0269322054593488, "nld_Latn": 0.0277796326621489, "pol_Latn": 0.0294120104572311, "por_Latn": 0.0301413168306825, "ita_Latn": 0.0324056371021865, "jpn_Jpan": 0.03553104151369, "fra_Latn": 0.0381835560678536, "spa_Latn": 0.0387222793083669, "deu_Latn": 0.0419925340453022, "cmn_Hani": 0.0454067521384114, "rus_Cyrl": 0.0500198157431261}, "source_to_state": {"fw_edu": {"file_path": "/scratch/craffel/lingua/data/flexitok/fw_edu/fineweb_edu_100bt.chunk.06.jsonl", "position": 25222209844, "block_size": 1, "offset": 0, "current_iter": 0}, "dan_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/dan_Latn/fineweb_2_hq.dan_Latn.chunk.06.jsonl", "position": 1162441478, "block_size": 1, "offset": 0, "current_iter": 0}, "swe_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/swe_Latn/fineweb_2_hq.swe_Latn.chunk.06.jsonl", "position": 1084297958, "block_size": 1, "offset": 0, "current_iter": 0}, "vie_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/vie_Latn/fineweb_2_hq.vie_Latn.chunk.06.jsonl", "position": 1155002991, "block_size": 1, "offset": 0, "current_iter": 0}, "hun_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/hun_Latn/fineweb_2_hq.hun_Latn.chunk.06.jsonl", "position": 1538966412, "block_size": 1, "offset": 0, "current_iter": 0}, "fas_Arab": {"file_path": "/scratch/craffel/lingua/data/flexitok/fas_Arab/fineweb_2_hq.fas_Arab.chunk.06.jsonl", "position": 1283984050, "block_size": 1, "offset": 0, "current_iter": 0}, "tur_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/tur_Latn/fineweb_2_hq.tur_Latn.chunk.06.jsonl", "position": 991540044, "block_size": 1, "offset": 0, "current_iter": 0}, "ces_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/ces_Latn/fineweb_2_hq.ces_Latn.chunk.06.jsonl", "position": 1521258413, "block_size": 1, "offset": 0, "current_iter": 0}, "arb_Arab": {"file_path": "/scratch/craffel/lingua/data/flexitok/arb_Arab/fineweb_2_hq.arb_Arab.chunk.06.jsonl", "position": 1704141605, "block_size": 1, "offset": 0, "current_iter": 0}, "ell_Grek": {"file_path": "/scratch/craffel/lingua/data/flexitok/ell_Grek/fineweb_2_hq.ell_Grek.chunk.06.jsonl", "position": 2110950484, "block_size": 1, "offset": 0, "current_iter": 0}, "ind_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/ind_Latn/fineweb_2_hq.ind_Latn.chunk.06.jsonl", "position": 1508438475, "block_size": 1, "offset": 0, "current_iter": 0}, "nld_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/nld_Latn/fineweb_2_hq.nld_Latn.chunk.06.jsonl", "position": 1195664149, "block_size": 1, "offset": 0, "current_iter": 0}, "pol_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/pol_Latn/fineweb_2_hq.pol_Latn.chunk.06.jsonl", "position": 1263016506, "block_size": 1, "offset": 0, "current_iter": 0}, "por_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/por_Latn/fineweb_2_hq.por_Latn.chunk.06.jsonl", "position": 1302741609, "block_size": 1, "offset": 0, "current_iter": 0}, "ita_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/ita_Latn/fineweb_2_hq.ita_Latn.chunk.06.jsonl", "position": 1450154481, "block_size": 1, "offset": 0, "current_iter": 0}, "jpn_Jpan": {"file_path": "/scratch/craffel/lingua/data/flexitok/jpn_Jpan/fineweb_2_hq.jpn_Jpan.chunk.06.jsonl", "position": 1519057678, "block_size": 1, "offset": 0, "current_iter": 0}, "fra_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/fra_Latn/fineweb_2_hq.fra_Latn.chunk.06.jsonl", "position": 2025094505, "block_size": 1, "offset": 0, "current_iter": 0}, "spa_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/spa_Latn/fineweb_2_hq.spa_Latn.chunk.06.jsonl", "position": 1818706586, "block_size": 1, "offset": 0, "current_iter": 0}, "deu_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/deu_Latn/fineweb_2_hq.deu_Latn.chunk.06.jsonl", "position": 2207380726, "block_size": 1, "offset": 0, "current_iter": 0}, "cmn_Hani": {"file_path": "/scratch/craffel/lingua/data/flexitok/cmn_Hani/fineweb_2_hq.cmn_Hani.chunk.06.jsonl", "position": 2341202689, "block_size": 1, "offset": 0, "current_iter": 0}, "rus_Cyrl": {"file_path": "/scratch/craffel/lingua/data/flexitok/rus_Cyrl/fineweb_2_hq.rus_Cyrl.chunk.06.jsonl", "position": 5460435575, "block_size": 1, "offset": 0, "current_iter": 0}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 306669038162928386424589020554077522248, "inc": 95963489890761403814531195999220475639}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "supertokenizer", "path": "meta-llama/Llama-3.2-1B", "tokenizers": [{"name": "huggingface", "path": "flexitok/bpe_arb_Arab_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ces_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_cmn_Hani_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_dan_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_deu_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ell_Grek_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fas_Arab_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fra_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fw_edu_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_hun_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ind_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ita_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_jpn_Jpan_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_nld_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_pol_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_por_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_rus_Cyrl_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_spa_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_swe_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_tur_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_vie_Latn_8000", "load_supermapping": true}], "dropout": 0.0, "rng_state": {"bit_generator": "PCG64", "state": {"state": 82502698088204949750787845795365333961, "inc": 332724090758049132448979897138935081983}, "has_uint32": 0, "uinteger": 0}, "seed": 42, "superset_code_name": "fineweb2_hq", "n_words": 113764, "routing": {"source_to_tokenizer": {"arb_Arab": "flexitok/bpe_arb_Arab_8000", "ces_Latn": "flexitok/bpe_ces_Latn_8000", "cmn_Hani": "flexitok/bpe_cmn_Hani_8000", "dan_Latn": "flexitok/bpe_dan_Latn_8000", "deu_Latn": "flexitok/bpe_deu_Latn_8000", "ell_Grek": "flexitok/bpe_ell_Grek_8000", "fas_Arab": "flexitok/bpe_fas_Arab_8000", "fra_Latn": "flexitok/bpe_fra_Latn_8000", "fw_edu": "flexitok/bpe_fw_edu_8000", "hun_Latn": "flexitok/bpe_hun_Latn_8000", "ind_Latn": "flexitok/bpe_ind_Latn_8000", "ita_Latn": "flexitok/bpe_ita_Latn_8000", "jpn_Jpan": "flexitok/bpe_jpn_Jpan_8000", "nld_Latn": "flexitok/bpe_nld_Latn_8000", "pol_Latn": "flexitok/bpe_pol_Latn_8000", "por_Latn": "flexitok/bpe_por_Latn_8000", "rus_Cyrl": "flexitok/bpe_rus_Cyrl_8000", "spa_Latn": "flexitok/bpe_spa_Latn_8000", "swe_Latn": "flexitok/bpe_swe_Latn_8000", "tur_Latn": "flexitok/bpe_tur_Latn_8000", "vie_Latn": "flexitok/bpe_vie_Latn_8000"}, "suitable_tokenizer_probability": 1.0}}, "output_seq_len": 4096, "n_views": 2}, "seq_idx": 256, "rng_state": {"bit_generator": "PCG64", "state": {"state": 131915213571014219328634582687476804074, "inc": 72545526324180839152750112646078969085}, "has_uint32": 0, "uinteger": 2783644522}, "batch_size": 4, "prefetch_size": 1024}, "scheduler": {"base_lrs": [0.001], "last_epoch": 100000, "verbose": false, "_step_count": 100001, "_get_lr_called_within_step": false, "_last_lr": [0.001], "lr_lambdas": [{}]}}
fineweb2_hq_superset_oracle/0000100000/train_state_00007.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 100000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 449, "it_state": {"it_state": {"root_dir": "/scratch/craffel/lingua/data/flexitok/", "sources": {"fw_edu": 0.4, "dan_Latn": 0.0216582869670702, "swe_Latn": 0.0216359765418466, "vie_Latn": 0.0197485510268674, "hun_Latn": 0.0247194573562308, "fas_Arab": 0.0205634624231076, "tur_Latn": 0.0235455794841729, "ces_Latn": 0.0248024455266208, "arb_Arab": 0.0234323706569333, "ell_Grek": 0.0233670886888026, "ind_Latn": 0.0269322054593488, "nld_Latn": 0.0277796326621489, "pol_Latn": 0.0294120104572311, "por_Latn": 0.0301413168306825, "ita_Latn": 0.0324056371021865, "jpn_Jpan": 0.03553104151369, "fra_Latn": 0.0381835560678536, "spa_Latn": 0.0387222793083669, "deu_Latn": 0.0419925340453022, "cmn_Hani": 0.0454067521384114, "rus_Cyrl": 0.0500198157431261}, "source_to_state": {"fw_edu": {"file_path": "/scratch/craffel/lingua/data/flexitok/fw_edu/fineweb_edu_100bt.chunk.07.jsonl", "position": 25216531821, "block_size": 1, "offset": 0, "current_iter": 0}, "dan_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/dan_Latn/fineweb_2_hq.dan_Latn.chunk.07.jsonl", "position": 1168620136, "block_size": 1, "offset": 0, "current_iter": 0}, "swe_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/swe_Latn/fineweb_2_hq.swe_Latn.chunk.07.jsonl", "position": 1088051299, "block_size": 1, "offset": 0, "current_iter": 0}, "vie_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/vie_Latn/fineweb_2_hq.vie_Latn.chunk.07.jsonl", "position": 1161902610, "block_size": 1, "offset": 0, "current_iter": 0}, "hun_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/hun_Latn/fineweb_2_hq.hun_Latn.chunk.07.jsonl", "position": 1539891099, "block_size": 1, "offset": 0, "current_iter": 0}, "fas_Arab": {"file_path": "/scratch/craffel/lingua/data/flexitok/fas_Arab/fineweb_2_hq.fas_Arab.chunk.07.jsonl", "position": 1296226727, "block_size": 1, "offset": 0, "current_iter": 0}, "tur_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/tur_Latn/fineweb_2_hq.tur_Latn.chunk.07.jsonl", "position": 992790130, "block_size": 1, "offset": 0, "current_iter": 0}, "ces_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/ces_Latn/fineweb_2_hq.ces_Latn.chunk.07.jsonl", "position": 1507200229, "block_size": 1, "offset": 0, "current_iter": 0}, "arb_Arab": {"file_path": "/scratch/craffel/lingua/data/flexitok/arb_Arab/fineweb_2_hq.arb_Arab.chunk.07.jsonl", "position": 1701080402, "block_size": 1, "offset": 0, "current_iter": 0}, "ell_Grek": {"file_path": "/scratch/craffel/lingua/data/flexitok/ell_Grek/fineweb_2_hq.ell_Grek.chunk.07.jsonl", "position": 2123817051, "block_size": 1, "offset": 0, "current_iter": 0}, "ind_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/ind_Latn/fineweb_2_hq.ind_Latn.chunk.07.jsonl", "position": 1494283017, "block_size": 1, "offset": 0, "current_iter": 0}, "nld_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/nld_Latn/fineweb_2_hq.nld_Latn.chunk.07.jsonl", "position": 1199168910, "block_size": 1, "offset": 0, "current_iter": 0}, "pol_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/pol_Latn/fineweb_2_hq.pol_Latn.chunk.07.jsonl", "position": 1259619459, "block_size": 1, "offset": 0, "current_iter": 0}, "por_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/por_Latn/fineweb_2_hq.por_Latn.chunk.07.jsonl", "position": 1300246684, "block_size": 1, "offset": 0, "current_iter": 0}, "ita_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/ita_Latn/fineweb_2_hq.ita_Latn.chunk.07.jsonl", "position": 1453806646, "block_size": 1, "offset": 0, "current_iter": 0}, "jpn_Jpan": {"file_path": "/scratch/craffel/lingua/data/flexitok/jpn_Jpan/fineweb_2_hq.jpn_Jpan.chunk.07.jsonl", "position": 1532143375, "block_size": 1, "offset": 0, "current_iter": 0}, "fra_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/fra_Latn/fineweb_2_hq.fra_Latn.chunk.07.jsonl", "position": 2030635375, "block_size": 1, "offset": 0, "current_iter": 0}, "spa_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/spa_Latn/fineweb_2_hq.spa_Latn.chunk.07.jsonl", "position": 1826111443, "block_size": 1, "offset": 0, "current_iter": 0}, "deu_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/deu_Latn/fineweb_2_hq.deu_Latn.chunk.07.jsonl", "position": 2190248128, "block_size": 1, "offset": 0, "current_iter": 0}, "cmn_Hani": {"file_path": "/scratch/craffel/lingua/data/flexitok/cmn_Hani/fineweb_2_hq.cmn_Hani.chunk.07.jsonl", "position": 2336185354, "block_size": 1, "offset": 0, "current_iter": 0}, "rus_Cyrl": {"file_path": "/scratch/craffel/lingua/data/flexitok/rus_Cyrl/fineweb_2_hq.rus_Cyrl.chunk.07.jsonl", "position": 5446849977, "block_size": 1, "offset": 0, "current_iter": 0}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 239927437835081522836668089702901613902, "inc": 53245743019587277358203950863334653629}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "supertokenizer", "path": "meta-llama/Llama-3.2-1B", "tokenizers": [{"name": "huggingface", "path": "flexitok/bpe_arb_Arab_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ces_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_cmn_Hani_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_dan_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_deu_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ell_Grek_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fas_Arab_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fra_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fw_edu_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_hun_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ind_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ita_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_jpn_Jpan_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_nld_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_pol_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_por_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_rus_Cyrl_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_spa_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_swe_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_tur_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_vie_Latn_8000", "load_supermapping": true}], "dropout": 0.0, "rng_state": {"bit_generator": "PCG64", "state": {"state": 63738353674706646305299819800461466782, "inc": 332724090758049132448979897138935081983}, "has_uint32": 0, "uinteger": 0}, "seed": 42, "superset_code_name": "fineweb2_hq", "n_words": 113764, "routing": {"source_to_tokenizer": {"arb_Arab": "flexitok/bpe_arb_Arab_8000", "ces_Latn": "flexitok/bpe_ces_Latn_8000", "cmn_Hani": "flexitok/bpe_cmn_Hani_8000", "dan_Latn": "flexitok/bpe_dan_Latn_8000", "deu_Latn": "flexitok/bpe_deu_Latn_8000", "ell_Grek": "flexitok/bpe_ell_Grek_8000", "fas_Arab": "flexitok/bpe_fas_Arab_8000", "fra_Latn": "flexitok/bpe_fra_Latn_8000", "fw_edu": "flexitok/bpe_fw_edu_8000", "hun_Latn": "flexitok/bpe_hun_Latn_8000", "ind_Latn": "flexitok/bpe_ind_Latn_8000", "ita_Latn": "flexitok/bpe_ita_Latn_8000", "jpn_Jpan": "flexitok/bpe_jpn_Jpan_8000", "nld_Latn": "flexitok/bpe_nld_Latn_8000", "pol_Latn": "flexitok/bpe_pol_Latn_8000", "por_Latn": "flexitok/bpe_por_Latn_8000", "rus_Cyrl": "flexitok/bpe_rus_Cyrl_8000", "spa_Latn": "flexitok/bpe_spa_Latn_8000", "swe_Latn": "flexitok/bpe_swe_Latn_8000", "tur_Latn": "flexitok/bpe_tur_Latn_8000", "vie_Latn": "flexitok/bpe_vie_Latn_8000"}, "suitable_tokenizer_probability": 1.0}}, "output_seq_len": 4096, "n_views": 2}, "seq_idx": 256, "rng_state": {"bit_generator": "PCG64", "state": {"state": 239803483826676955776584746976189400951, "inc": 19761753544780285878460645500694854795}, "has_uint32": 1, "uinteger": 513301027}, "batch_size": 4, "prefetch_size": 1024}, "scheduler": {"base_lrs": [0.001], "last_epoch": 100000, "verbose": false, "_step_count": 100001, "_get_lr_called_within_step": false, "_last_lr": [0.001], "lr_lambdas": [{}]}}