craffel HF Staff commited on
Commit
1b60662
·
verified ·
1 Parent(s): 11c727a

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -133,3 +133,12 @@ fineweb2_hq_superset_oracle/0000100000/__5_0.distcp filter=lfs diff=lfs merge=lf
133
  fineweb2_hq_superset_oracle/0000100000/__6_0.distcp filter=lfs diff=lfs merge=lfs -text
134
  fineweb2_hq_superset_oracle/0000100000/__7_0.distcp filter=lfs diff=lfs merge=lfs -text
135
  fineweb2_hq_superset_oracle_09/metrics.jsonl filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
133
  fineweb2_hq_superset_oracle/0000100000/__6_0.distcp filter=lfs diff=lfs merge=lfs -text
134
  fineweb2_hq_superset_oracle/0000100000/__7_0.distcp filter=lfs diff=lfs merge=lfs -text
135
  fineweb2_hq_superset_oracle_09/metrics.jsonl filter=lfs diff=lfs merge=lfs -text
136
+ fineweb2_hq_superset_oracle_09/0000100000/.metadata filter=lfs diff=lfs merge=lfs -text
137
+ fineweb2_hq_superset_oracle_09/0000100000/__0_0.distcp filter=lfs diff=lfs merge=lfs -text
138
+ fineweb2_hq_superset_oracle_09/0000100000/__1_0.distcp filter=lfs diff=lfs merge=lfs -text
139
+ fineweb2_hq_superset_oracle_09/0000100000/__2_0.distcp filter=lfs diff=lfs merge=lfs -text
140
+ fineweb2_hq_superset_oracle_09/0000100000/__3_0.distcp filter=lfs diff=lfs merge=lfs -text
141
+ fineweb2_hq_superset_oracle_09/0000100000/__4_0.distcp filter=lfs diff=lfs merge=lfs -text
142
+ fineweb2_hq_superset_oracle_09/0000100000/__5_0.distcp filter=lfs diff=lfs merge=lfs -text
143
+ fineweb2_hq_superset_oracle_09/0000100000/__6_0.distcp filter=lfs diff=lfs merge=lfs -text
144
+ fineweb2_hq_superset_oracle_09/0000100000/__7_0.distcp filter=lfs diff=lfs merge=lfs -text
fineweb2_hq_superset_oracle_09/0000100000/.metadata ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:346f350eb162b32ba21ba8efc726ac56c7ae5bd257ca04be591b479e9436cb10
3
+ size 1148568
fineweb2_hq_superset_oracle_09/0000100000/__0_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:45378fb5d4c2bf001d2b97662c9ebfad7b7ded7060e242224b7f4619be07669f
3
+ size 2626712784
fineweb2_hq_superset_oracle_09/0000100000/__1_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:72990d3e30eb15d564c8e2a8f44982d102068b6d74c6f7384bcaffe9ac3f3d25
3
+ size 2626751724
fineweb2_hq_superset_oracle_09/0000100000/__2_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8374f04f7032a08fd0ac862d9df9c8af160f56b3c720ad9cbb32589a0fa3976a
3
+ size 2626751724
fineweb2_hq_superset_oracle_09/0000100000/__3_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:78aa341ac2b409869238bcd1492d92d3350e3324ccd5386ca1f41d21335e6fc8
3
+ size 2626751724
fineweb2_hq_superset_oracle_09/0000100000/__4_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:805a4b608c14de18b966d0ccf8819e6420c7e2a2095ce73ecd0440dd773ed6c3
3
+ size 2626751724
fineweb2_hq_superset_oracle_09/0000100000/__5_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aac65811ec22c3e3a899a62c94ca193faca15f82f40dd8461f8f8af3bba0c1e8
3
+ size 2626754000
fineweb2_hq_superset_oracle_09/0000100000/__6_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:07a9e1c4a78f494ed85c0837487c8ea944c88263da053a1a34a984e9649f432f
3
+ size 2626754000
fineweb2_hq_superset_oracle_09/0000100000/__7_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c79635982338821b2a2ee9c689614fd58904084e095213d1a46de35fb5dcd09a
3
+ size 2626565648
fineweb2_hq_superset_oracle_09/0000100000/params.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"name": "flexitok_superset_fineweb2_hq_oracle_09", "dump_dir": "/fsx/craffel/lingua_logs/fineweb2_hq_superset_oracle_09", "seed": 777, "grad_acc_steps": 8, "gc_collect_freq": 1000, "probe_freq": null, "steps": 100000, "data": {"root_dir": "/scratch/craffel/lingua/data/flexitok/", "sources": {"fw_edu": 0.4, "dan_Latn": 0.0216582869670702, "swe_Latn": 0.0216359765418466, "vie_Latn": 0.0197485510268674, "hun_Latn": 0.0247194573562308, "fas_Arab": 0.0205634624231076, "tur_Latn": 0.0235455794841729, "ces_Latn": 0.0248024455266208, "arb_Arab": 0.0234323706569333, "ell_Grek": 0.0233670886888026, "ind_Latn": 0.0269322054593488, "nld_Latn": 0.0277796326621489, "pol_Latn": 0.0294120104572311, "por_Latn": 0.0301413168306825, "ita_Latn": 0.0324056371021865, "jpn_Jpan": 0.03553104151369, "fra_Latn": 0.0381835560678536, "spa_Latn": 0.0387222793083669, "deu_Latn": 0.0419925340453022, "cmn_Hani": 0.0454067521384114, "rus_Cyrl": 0.0500198157431261}, "batch_size": 4, "seq_len": 4096, "n_views": 2, "seed": 42, "add_bos": true, "add_eos": true, "load_async": true, "prefetch_size": 1024, "tokenizer": {"name": "supertokenizer", "path": "meta-llama/Llama-3.2-1B", "tokenizers": [{"name": "huggingface", "path": "flexitok/bpe_arb_Arab_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ces_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_cmn_Hani_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_dan_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_deu_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ell_Grek_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fas_Arab_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fra_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fw_edu_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_hun_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ind_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ita_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_jpn_Jpan_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_nld_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_pol_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_por_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_rus_Cyrl_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_spa_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_swe_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_tur_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_vie_Latn_8000", "load_supermapping": true}], "load_supermapping": false, "dropout": 0.0, "seed": 42, "superset_code_name": "fineweb2_hq", "n_words": 113764}, "routing": {"source_to_tokenizer": {"arb_Arab": "flexitok/bpe_arb_Arab_8000", "ces_Latn": "flexitok/bpe_ces_Latn_8000", "cmn_Hani": "flexitok/bpe_cmn_Hani_8000", "dan_Latn": "flexitok/bpe_dan_Latn_8000", "deu_Latn": "flexitok/bpe_deu_Latn_8000", "ell_Grek": "flexitok/bpe_ell_Grek_8000", "fas_Arab": "flexitok/bpe_fas_Arab_8000", "fra_Latn": "flexitok/bpe_fra_Latn_8000", "fw_edu": "flexitok/bpe_fw_edu_8000", "hun_Latn": "flexitok/bpe_hun_Latn_8000", "ind_Latn": "flexitok/bpe_ind_Latn_8000", "ita_Latn": "flexitok/bpe_ita_Latn_8000", "jpn_Jpan": "flexitok/bpe_jpn_Jpan_8000", "nld_Latn": "flexitok/bpe_nld_Latn_8000", "pol_Latn": "flexitok/bpe_pol_Latn_8000", "por_Latn": "flexitok/bpe_por_Latn_8000", "rus_Cyrl": "flexitok/bpe_rus_Cyrl_8000", "spa_Latn": "flexitok/bpe_spa_Latn_8000", "swe_Latn": "flexitok/bpe_swe_Latn_8000", "tur_Latn": "flexitok/bpe_tur_Latn_8000", "vie_Latn": "flexitok/bpe_vie_Latn_8000"}, "suitable_tokenizer_probability": 0.9}}, "optim": {"lr": 0.001, "weight_decay": 0.1, "epsilon": 1e-08, "beta1": 0.9, "beta2": 0.95, "clip": 1.0, "scheduler": "cosine", "warmup": 2000, "lr_min_ratio": 1e-06, "cycle_length": 1.0, "cosine_theta": 1.0, "annealing_step": 1000, "decay_fraction": 0.1, "exp_factor": 0.5}, "model": {"dim": 2048, "n_layers": 25, "head_dim": null, "n_heads": 16, "n_kv_heads": null, "ffn_dim_multiplier": null, "multiple_of": 256, "norm_eps": 1e-05, "rope_theta": 10000.0, "init_base_std": null, "init_std_factor": "disabled", "max_seqlen": 4096, "seed": 42, "vocab_size": 113764, "weight_tying": false, "sliding_window": null, "use_factorized_embeddings": false, "factorized_embedding_dim": 0}, "distributed": {"dp_shard": 1, "dp_replicate": 8, "tp_size": 1, "selective_activation_checkpointing": false, "compile": true, "fsdp_type": "full_shard", "model_dtype": "bf16", "float8_recipe": null, "float8_filter": "layers\\.[0-9]+\\.", "matmul_allow_tf32": false, "detect_anomaly": false, "compile_cache_size_limit": 8, "spawn_method": "forkserver"}, "env": {"MKL_SERVICE_FORCE_INTEL": "GNU", "OMP_NUM_THREADS": "1", "MKL_NUM_THREADS": "1", "ENABLE_INTRA_NODE_COMM": "1", "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", "NCCL_IB_TIMEOUT": "22", "NCCL_DEBUG": "INFO", "TORCH_NCCL_ASYNC_ERROR_HANDLING": "1"}, "checkpoint": {"dump": {"every": 10000, "keep": -1}, "eval": {"every": 10000, "keep": -1}, "path": "/fsx/craffel/lingua_logs/fineweb2_hq_superset_oracle_09/checkpoints", "init_ckpt_path": null, "load_init_optimizer_state": false, "save_init_ckpt": false}, "profiling": {"run": true, "trace_folder": "profiling", "mem_warmup": 0, "mem_steps": 4, "profile_warmup": 100, "profile_steps": 4}, "logging": {"freq": 1, "acc_freq": null, "wandb": null}, "async_eval_gpus": 8, "eval": {"harness": {"tasks": ["hellaswag", "piqa", "arc_easy", "arc_challenge", "include_base_44_arabic", "include_base_44_chinese", "include_base_44_german", "include_base_44_greek", "include_base_44_persian", "include_base_44_french", "include_base_44_hungarian", "include_base_44_indonesian", "include_base_44_italian", "include_base_44_japanese", "include_base_44_dutch", "include_base_44_polish", "include_base_44_portuguese", "include_base_44_russian", "include_base_44_spanish", "include_base_44_turkish", "include_base_44_vietnamese", "belebele_arb_Arab", "belebele_ces_Latn", "belebele_zho_Hans", "belebele_dan_Latn", "belebele_deu_Latn", "belebele_ell_Grek", "belebele_pes_Arab", "belebele_fra_Latn", "belebele_hun_Latn", "belebele_ind_Latn", "belebele_ita_Latn", "belebele_jpn_Jpan", "belebele_nld_Latn", "belebele_pol_Latn", "belebele_por_Latn", "belebele_rus_Cyrl", "belebele_spa_Latn", "belebele_swe_Latn", "belebele_tur_Latn", "belebele_vie_Latn", "belebele_eng_Latn", "xnli_ar", "xnli_zh", "xnli_de", "xnli_el", "xnli_en", "xnli_es", "xnli_fr", "xnli_hi", "xnli_ru", "xnli_tr", "xnli_vi"]}, "generator": {"max_tokens": 16384, "dtype": "bf16", "add_bos": false}}}
fineweb2_hq_superset_oracle_09/0000100000/train_state_00000.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 100000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 48, "it_state": {"it_state": {"root_dir": "/scratch/craffel/lingua/data/flexitok/", "sources": {"fw_edu": 0.4, "dan_Latn": 0.0216582869670702, "swe_Latn": 0.0216359765418466, "vie_Latn": 0.0197485510268674, "hun_Latn": 0.0247194573562308, "fas_Arab": 0.0205634624231076, "tur_Latn": 0.0235455794841729, "ces_Latn": 0.0248024455266208, "arb_Arab": 0.0234323706569333, "ell_Grek": 0.0233670886888026, "ind_Latn": 0.0269322054593488, "nld_Latn": 0.0277796326621489, "pol_Latn": 0.0294120104572311, "por_Latn": 0.0301413168306825, "ita_Latn": 0.0324056371021865, "jpn_Jpan": 0.03553104151369, "fra_Latn": 0.0381835560678536, "spa_Latn": 0.0387222793083669, "deu_Latn": 0.0419925340453022, "cmn_Hani": 0.0454067521384114, "rus_Cyrl": 0.0500198157431261}, "source_to_state": {"fw_edu": {"file_path": "/scratch/craffel/lingua/data/flexitok/fw_edu/fineweb_edu_100bt.chunk.00.jsonl", "position": 22483258854, "block_size": 1, "offset": 0, "current_iter": 0}, "dan_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/dan_Latn/fineweb_2_hq.dan_Latn.chunk.00.jsonl", "position": 1032589472, "block_size": 1, "offset": 0, "current_iter": 0}, "swe_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/swe_Latn/fineweb_2_hq.swe_Latn.chunk.00.jsonl", "position": 968439755, "block_size": 1, "offset": 0, "current_iter": 0}, "vie_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/vie_Latn/fineweb_2_hq.vie_Latn.chunk.00.jsonl", "position": 1035189529, "block_size": 1, "offset": 0, "current_iter": 0}, "hun_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/hun_Latn/fineweb_2_hq.hun_Latn.chunk.00.jsonl", "position": 1375887178, "block_size": 1, "offset": 0, "current_iter": 0}, "fas_Arab": {"file_path": "/scratch/craffel/lingua/data/flexitok/fas_Arab/fineweb_2_hq.fas_Arab.chunk.00.jsonl", "position": 1161498055, "block_size": 1, "offset": 0, "current_iter": 0}, "tur_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/tur_Latn/fineweb_2_hq.tur_Latn.chunk.00.jsonl", "position": 879594349, "block_size": 1, "offset": 0, "current_iter": 0}, "ces_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/ces_Latn/fineweb_2_hq.ces_Latn.chunk.00.jsonl", "position": 1347981366, "block_size": 1, "offset": 0, "current_iter": 0}, "arb_Arab": {"file_path": "/scratch/craffel/lingua/data/flexitok/arb_Arab/fineweb_2_hq.arb_Arab.chunk.00.jsonl", "position": 1510509397, "block_size": 1, "offset": 0, "current_iter": 0}, "ell_Grek": {"file_path": "/scratch/craffel/lingua/data/flexitok/ell_Grek/fineweb_2_hq.ell_Grek.chunk.00.jsonl", "position": 1879969852, "block_size": 1, "offset": 0, "current_iter": 0}, "ind_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/ind_Latn/fineweb_2_hq.ind_Latn.chunk.00.jsonl", "position": 1339606685, "block_size": 1, "offset": 0, "current_iter": 0}, "nld_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/nld_Latn/fineweb_2_hq.nld_Latn.chunk.00.jsonl", "position": 1066667428, "block_size": 1, "offset": 0, "current_iter": 0}, "pol_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/pol_Latn/fineweb_2_hq.pol_Latn.chunk.00.jsonl", "position": 1121800644, "block_size": 1, "offset": 0, "current_iter": 0}, "por_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/por_Latn/fineweb_2_hq.por_Latn.chunk.00.jsonl", "position": 1163739268, "block_size": 1, "offset": 0, "current_iter": 0}, "ita_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/ita_Latn/fineweb_2_hq.ita_Latn.chunk.00.jsonl", "position": 1301134107, "block_size": 1, "offset": 0, "current_iter": 0}, "jpn_Jpan": {"file_path": "/scratch/craffel/lingua/data/flexitok/jpn_Jpan/fineweb_2_hq.jpn_Jpan.chunk.00.jsonl", "position": 1362158405, "block_size": 1, "offset": 0, "current_iter": 0}, "fra_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/fra_Latn/fineweb_2_hq.fra_Latn.chunk.00.jsonl", "position": 1806670991, "block_size": 1, "offset": 0, "current_iter": 0}, "spa_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/spa_Latn/fineweb_2_hq.spa_Latn.chunk.00.jsonl", "position": 1621734095, "block_size": 1, "offset": 0, "current_iter": 0}, "deu_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/deu_Latn/fineweb_2_hq.deu_Latn.chunk.00.jsonl", "position": 1970498110, "block_size": 1, "offset": 0, "current_iter": 0}, "cmn_Hani": {"file_path": "/scratch/craffel/lingua/data/flexitok/cmn_Hani/fineweb_2_hq.cmn_Hani.chunk.00.jsonl", "position": 2087037429, "block_size": 1, "offset": 0, "current_iter": 0}, "rus_Cyrl": {"file_path": "/scratch/craffel/lingua/data/flexitok/rus_Cyrl/fineweb_2_hq.rus_Cyrl.chunk.00.jsonl", "position": 4894410491, "block_size": 1, "offset": 0, "current_iter": 0}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 270383003534006626838332511736703048882, "inc": 252101603063402394885084957393789173453}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "supertokenizer", "path": "meta-llama/Llama-3.2-1B", "tokenizers": [{"name": "huggingface", "path": "flexitok/bpe_arb_Arab_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ces_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_cmn_Hani_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_dan_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_deu_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ell_Grek_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fas_Arab_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fra_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fw_edu_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_hun_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ind_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ita_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_jpn_Jpan_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_nld_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_pol_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_por_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_rus_Cyrl_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_spa_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_swe_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_tur_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_vie_Latn_8000", "load_supermapping": true}], "dropout": 0.0, "rng_state": {"bit_generator": "PCG64", "state": {"state": 251936371675889377286400910147764665395, "inc": 332724090758049132448979897138935081983}, "has_uint32": 1, "uinteger": 752264651}, "seed": 42, "superset_code_name": "fineweb2_hq", "n_words": 113764, "routing": {"source_to_tokenizer": {"arb_Arab": "flexitok/bpe_arb_Arab_8000", "ces_Latn": "flexitok/bpe_ces_Latn_8000", "cmn_Hani": "flexitok/bpe_cmn_Hani_8000", "dan_Latn": "flexitok/bpe_dan_Latn_8000", "deu_Latn": "flexitok/bpe_deu_Latn_8000", "ell_Grek": "flexitok/bpe_ell_Grek_8000", "fas_Arab": "flexitok/bpe_fas_Arab_8000", "fra_Latn": "flexitok/bpe_fra_Latn_8000", "fw_edu": "flexitok/bpe_fw_edu_8000", "hun_Latn": "flexitok/bpe_hun_Latn_8000", "ind_Latn": "flexitok/bpe_ind_Latn_8000", "ita_Latn": "flexitok/bpe_ita_Latn_8000", "jpn_Jpan": "flexitok/bpe_jpn_Jpan_8000", "nld_Latn": "flexitok/bpe_nld_Latn_8000", "pol_Latn": "flexitok/bpe_pol_Latn_8000", "por_Latn": "flexitok/bpe_por_Latn_8000", "rus_Cyrl": "flexitok/bpe_rus_Cyrl_8000", "spa_Latn": "flexitok/bpe_spa_Latn_8000", "swe_Latn": "flexitok/bpe_swe_Latn_8000", "tur_Latn": "flexitok/bpe_tur_Latn_8000", "vie_Latn": "flexitok/bpe_vie_Latn_8000"}, "suitable_tokenizer_probability": 0.9}}, "output_seq_len": 4096, "n_views": 2}, "seq_idx": 256, "rng_state": {"bit_generator": "PCG64", "state": {"state": 300569569296678341640414112158566886944, "inc": 257317082376085721142933171929815648017}, "has_uint32": 1, "uinteger": 630457105}, "batch_size": 4, "prefetch_size": 1024}, "scheduler": {"base_lrs": [0.001], "last_epoch": 100000, "verbose": false, "_step_count": 100001, "_get_lr_called_within_step": false, "_last_lr": [0.001], "lr_lambdas": [{}]}}
fineweb2_hq_superset_oracle_09/0000100000/train_state_00001.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 100000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 286, "it_state": {"it_state": {"root_dir": "/scratch/craffel/lingua/data/flexitok/", "sources": {"fw_edu": 0.4, "dan_Latn": 0.0216582869670702, "swe_Latn": 0.0216359765418466, "vie_Latn": 0.0197485510268674, "hun_Latn": 0.0247194573562308, "fas_Arab": 0.0205634624231076, "tur_Latn": 0.0235455794841729, "ces_Latn": 0.0248024455266208, "arb_Arab": 0.0234323706569333, "ell_Grek": 0.0233670886888026, "ind_Latn": 0.0269322054593488, "nld_Latn": 0.0277796326621489, "pol_Latn": 0.0294120104572311, "por_Latn": 0.0301413168306825, "ita_Latn": 0.0324056371021865, "jpn_Jpan": 0.03553104151369, "fra_Latn": 0.0381835560678536, "spa_Latn": 0.0387222793083669, "deu_Latn": 0.0419925340453022, "cmn_Hani": 0.0454067521384114, "rus_Cyrl": 0.0500198157431261}, "source_to_state": {"fw_edu": {"file_path": "/scratch/craffel/lingua/data/flexitok/fw_edu/fineweb_edu_100bt.chunk.01.jsonl", "position": 22463260616, "block_size": 1, "offset": 0, "current_iter": 0}, "dan_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/dan_Latn/fineweb_2_hq.dan_Latn.chunk.01.jsonl", "position": 1024809607, "block_size": 1, "offset": 0, "current_iter": 0}, "swe_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/swe_Latn/fineweb_2_hq.swe_Latn.chunk.01.jsonl", "position": 980973103, "block_size": 1, "offset": 0, "current_iter": 0}, "vie_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/vie_Latn/fineweb_2_hq.vie_Latn.chunk.01.jsonl", "position": 1034937404, "block_size": 1, "offset": 0, "current_iter": 0}, "hun_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/hun_Latn/fineweb_2_hq.hun_Latn.chunk.01.jsonl", "position": 1367664765, "block_size": 1, "offset": 0, "current_iter": 0}, "fas_Arab": {"file_path": "/scratch/craffel/lingua/data/flexitok/fas_Arab/fineweb_2_hq.fas_Arab.chunk.01.jsonl", "position": 1157972458, "block_size": 1, "offset": 0, "current_iter": 0}, "tur_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/tur_Latn/fineweb_2_hq.tur_Latn.chunk.01.jsonl", "position": 871518216, "block_size": 1, "offset": 0, "current_iter": 0}, "ces_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/ces_Latn/fineweb_2_hq.ces_Latn.chunk.01.jsonl", "position": 1356552048, "block_size": 1, "offset": 0, "current_iter": 0}, "arb_Arab": {"file_path": "/scratch/craffel/lingua/data/flexitok/arb_Arab/fineweb_2_hq.arb_Arab.chunk.01.jsonl", "position": 1520499033, "block_size": 1, "offset": 0, "current_iter": 0}, "ell_Grek": {"file_path": "/scratch/craffel/lingua/data/flexitok/ell_Grek/fineweb_2_hq.ell_Grek.chunk.01.jsonl", "position": 1893590202, "block_size": 1, "offset": 0, "current_iter": 0}, "ind_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/ind_Latn/fineweb_2_hq.ind_Latn.chunk.01.jsonl", "position": 1329906118, "block_size": 1, "offset": 0, "current_iter": 0}, "nld_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/nld_Latn/fineweb_2_hq.nld_Latn.chunk.01.jsonl", "position": 1061451000, "block_size": 1, "offset": 0, "current_iter": 0}, "pol_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/pol_Latn/fineweb_2_hq.pol_Latn.chunk.01.jsonl", "position": 1127045544, "block_size": 1, "offset": 0, "current_iter": 0}, "por_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/por_Latn/fineweb_2_hq.por_Latn.chunk.01.jsonl", "position": 1169202157, "block_size": 1, "offset": 0, "current_iter": 0}, "ita_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/ita_Latn/fineweb_2_hq.ita_Latn.chunk.01.jsonl", "position": 1296716654, "block_size": 1, "offset": 0, "current_iter": 0}, "jpn_Jpan": {"file_path": "/scratch/craffel/lingua/data/flexitok/jpn_Jpan/fineweb_2_hq.jpn_Jpan.chunk.01.jsonl", "position": 1356556112, "block_size": 1, "offset": 0, "current_iter": 0}, "fra_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/fra_Latn/fineweb_2_hq.fra_Latn.chunk.01.jsonl", "position": 1810934505, "block_size": 1, "offset": 0, "current_iter": 0}, "spa_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/spa_Latn/fineweb_2_hq.spa_Latn.chunk.01.jsonl", "position": 1634534349, "block_size": 1, "offset": 0, "current_iter": 0}, "deu_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/deu_Latn/fineweb_2_hq.deu_Latn.chunk.01.jsonl", "position": 1964164880, "block_size": 1, "offset": 0, "current_iter": 0}, "cmn_Hani": {"file_path": "/scratch/craffel/lingua/data/flexitok/cmn_Hani/fineweb_2_hq.cmn_Hani.chunk.01.jsonl", "position": 2085222323, "block_size": 1, "offset": 0, "current_iter": 0}, "rus_Cyrl": {"file_path": "/scratch/craffel/lingua/data/flexitok/rus_Cyrl/fineweb_2_hq.rus_Cyrl.chunk.01.jsonl", "position": 4868909585, "block_size": 1, "offset": 0, "current_iter": 0}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 48231011483084912610579028376827026167, "inc": 246509925186285949978196491240064802315}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "supertokenizer", "path": "meta-llama/Llama-3.2-1B", "tokenizers": [{"name": "huggingface", "path": "flexitok/bpe_arb_Arab_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ces_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_cmn_Hani_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_dan_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_deu_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ell_Grek_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fas_Arab_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fra_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fw_edu_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_hun_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ind_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ita_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_jpn_Jpan_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_nld_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_pol_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_por_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_rus_Cyrl_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_spa_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_swe_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_tur_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_vie_Latn_8000", "load_supermapping": true}], "dropout": 0.0, "rng_state": {"bit_generator": "PCG64", "state": {"state": 190356997901590126227442893214424013847, "inc": 332724090758049132448979897138935081983}, "has_uint32": 1, "uinteger": 1297863037}, "seed": 42, "superset_code_name": "fineweb2_hq", "n_words": 113764, "routing": {"source_to_tokenizer": {"arb_Arab": "flexitok/bpe_arb_Arab_8000", "ces_Latn": "flexitok/bpe_ces_Latn_8000", "cmn_Hani": "flexitok/bpe_cmn_Hani_8000", "dan_Latn": "flexitok/bpe_dan_Latn_8000", "deu_Latn": "flexitok/bpe_deu_Latn_8000", "ell_Grek": "flexitok/bpe_ell_Grek_8000", "fas_Arab": "flexitok/bpe_fas_Arab_8000", "fra_Latn": "flexitok/bpe_fra_Latn_8000", "fw_edu": "flexitok/bpe_fw_edu_8000", "hun_Latn": "flexitok/bpe_hun_Latn_8000", "ind_Latn": "flexitok/bpe_ind_Latn_8000", "ita_Latn": "flexitok/bpe_ita_Latn_8000", "jpn_Jpan": "flexitok/bpe_jpn_Jpan_8000", "nld_Latn": "flexitok/bpe_nld_Latn_8000", "pol_Latn": "flexitok/bpe_pol_Latn_8000", "por_Latn": "flexitok/bpe_por_Latn_8000", "rus_Cyrl": "flexitok/bpe_rus_Cyrl_8000", "spa_Latn": "flexitok/bpe_spa_Latn_8000", "swe_Latn": "flexitok/bpe_swe_Latn_8000", "tur_Latn": "flexitok/bpe_tur_Latn_8000", "vie_Latn": "flexitok/bpe_vie_Latn_8000"}, "suitable_tokenizer_probability": 0.9}}, "output_seq_len": 4096, "n_views": 2}, "seq_idx": 256, "rng_state": {"bit_generator": "PCG64", "state": {"state": 298609680158430271867266436931522339460, "inc": 173555323965545256606922338259303677603}, "has_uint32": 1, "uinteger": 1182378492}, "batch_size": 4, "prefetch_size": 1024}, "scheduler": {"base_lrs": [0.001], "last_epoch": 100000, "verbose": false, "_step_count": 100001, "_get_lr_called_within_step": false, "_last_lr": [0.001], "lr_lambdas": [{}]}}
fineweb2_hq_superset_oracle_09/0000100000/train_state_00002.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 100000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 3056, "it_state": {"it_state": {"root_dir": "/scratch/craffel/lingua/data/flexitok/", "sources": {"fw_edu": 0.4, "dan_Latn": 0.0216582869670702, "swe_Latn": 0.0216359765418466, "vie_Latn": 0.0197485510268674, "hun_Latn": 0.0247194573562308, "fas_Arab": 0.0205634624231076, "tur_Latn": 0.0235455794841729, "ces_Latn": 0.0248024455266208, "arb_Arab": 0.0234323706569333, "ell_Grek": 0.0233670886888026, "ind_Latn": 0.0269322054593488, "nld_Latn": 0.0277796326621489, "pol_Latn": 0.0294120104572311, "por_Latn": 0.0301413168306825, "ita_Latn": 0.0324056371021865, "jpn_Jpan": 0.03553104151369, "fra_Latn": 0.0381835560678536, "spa_Latn": 0.0387222793083669, "deu_Latn": 0.0419925340453022, "cmn_Hani": 0.0454067521384114, "rus_Cyrl": 0.0500198157431261}, "source_to_state": {"fw_edu": {"file_path": "/scratch/craffel/lingua/data/flexitok/fw_edu/fineweb_edu_100bt.chunk.02.jsonl", "position": 22489758329, "block_size": 1, "offset": 0, "current_iter": 0}, "dan_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/dan_Latn/fineweb_2_hq.dan_Latn.chunk.02.jsonl", "position": 1032438740, "block_size": 1, "offset": 0, "current_iter": 0}, "swe_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/swe_Latn/fineweb_2_hq.swe_Latn.chunk.02.jsonl", "position": 962793831, "block_size": 1, "offset": 0, "current_iter": 0}, "vie_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/vie_Latn/fineweb_2_hq.vie_Latn.chunk.02.jsonl", "position": 1033892558, "block_size": 1, "offset": 0, "current_iter": 0}, "hun_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/hun_Latn/fineweb_2_hq.hun_Latn.chunk.02.jsonl", "position": 1368629710, "block_size": 1, "offset": 0, "current_iter": 0}, "fas_Arab": {"file_path": "/scratch/craffel/lingua/data/flexitok/fas_Arab/fineweb_2_hq.fas_Arab.chunk.02.jsonl", "position": 1155408207, "block_size": 1, "offset": 0, "current_iter": 0}, "tur_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/tur_Latn/fineweb_2_hq.tur_Latn.chunk.02.jsonl", "position": 877228943, "block_size": 1, "offset": 0, "current_iter": 0}, "ces_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/ces_Latn/fineweb_2_hq.ces_Latn.chunk.02.jsonl", "position": 1338089650, "block_size": 1, "offset": 0, "current_iter": 0}, "arb_Arab": {"file_path": "/scratch/craffel/lingua/data/flexitok/arb_Arab/fineweb_2_hq.arb_Arab.chunk.02.jsonl", "position": 1525184351, "block_size": 1, "offset": 0, "current_iter": 0}, "ell_Grek": {"file_path": "/scratch/craffel/lingua/data/flexitok/ell_Grek/fineweb_2_hq.ell_Grek.chunk.02.jsonl", "position": 1877545332, "block_size": 1, "offset": 0, "current_iter": 0}, "ind_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/ind_Latn/fineweb_2_hq.ind_Latn.chunk.02.jsonl", "position": 1342294407, "block_size": 1, "offset": 0, "current_iter": 0}, "nld_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/nld_Latn/fineweb_2_hq.nld_Latn.chunk.02.jsonl", "position": 1068540363, "block_size": 1, "offset": 0, "current_iter": 0}, "pol_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/pol_Latn/fineweb_2_hq.pol_Latn.chunk.02.jsonl", "position": 1114692888, "block_size": 1, "offset": 0, "current_iter": 0}, "por_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/por_Latn/fineweb_2_hq.por_Latn.chunk.02.jsonl", "position": 1169528599, "block_size": 1, "offset": 0, "current_iter": 0}, "ita_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/ita_Latn/fineweb_2_hq.ita_Latn.chunk.02.jsonl", "position": 1305225985, "block_size": 1, "offset": 0, "current_iter": 0}, "jpn_Jpan": {"file_path": "/scratch/craffel/lingua/data/flexitok/jpn_Jpan/fineweb_2_hq.jpn_Jpan.chunk.02.jsonl", "position": 1367880315, "block_size": 1, "offset": 0, "current_iter": 0}, "fra_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/fra_Latn/fineweb_2_hq.fra_Latn.chunk.02.jsonl", "position": 1802098920, "block_size": 1, "offset": 0, "current_iter": 0}, "spa_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/spa_Latn/fineweb_2_hq.spa_Latn.chunk.02.jsonl", "position": 1617045033, "block_size": 1, "offset": 0, "current_iter": 0}, "deu_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/deu_Latn/fineweb_2_hq.deu_Latn.chunk.02.jsonl", "position": 1970304290, "block_size": 1, "offset": 0, "current_iter": 0}, "cmn_Hani": {"file_path": "/scratch/craffel/lingua/data/flexitok/cmn_Hani/fineweb_2_hq.cmn_Hani.chunk.02.jsonl", "position": 2082333374, "block_size": 1, "offset": 0, "current_iter": 0}, "rus_Cyrl": {"file_path": "/scratch/craffel/lingua/data/flexitok/rus_Cyrl/fineweb_2_hq.rus_Cyrl.chunk.02.jsonl", "position": 4859750632, "block_size": 1, "offset": 0, "current_iter": 0}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 223681783025195812999968259691627346910, "inc": 234358335530849485425064040311006256713}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "supertokenizer", "path": "meta-llama/Llama-3.2-1B", "tokenizers": [{"name": "huggingface", "path": "flexitok/bpe_arb_Arab_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ces_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_cmn_Hani_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_dan_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_deu_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ell_Grek_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fas_Arab_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fra_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fw_edu_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_hun_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ind_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ita_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_jpn_Jpan_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_nld_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_pol_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_por_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_rus_Cyrl_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_spa_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_swe_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_tur_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_vie_Latn_8000", "load_supermapping": true}], "dropout": 0.0, "rng_state": {"bit_generator": "PCG64", "state": {"state": 125782433428769626417896507380568044658, "inc": 332724090758049132448979897138935081983}, "has_uint32": 1, "uinteger": 1885206414}, "seed": 42, "superset_code_name": "fineweb2_hq", "n_words": 113764, "routing": {"source_to_tokenizer": {"arb_Arab": "flexitok/bpe_arb_Arab_8000", "ces_Latn": "flexitok/bpe_ces_Latn_8000", "cmn_Hani": "flexitok/bpe_cmn_Hani_8000", "dan_Latn": "flexitok/bpe_dan_Latn_8000", "deu_Latn": "flexitok/bpe_deu_Latn_8000", "ell_Grek": "flexitok/bpe_ell_Grek_8000", "fas_Arab": "flexitok/bpe_fas_Arab_8000", "fra_Latn": "flexitok/bpe_fra_Latn_8000", "fw_edu": "flexitok/bpe_fw_edu_8000", "hun_Latn": "flexitok/bpe_hun_Latn_8000", "ind_Latn": "flexitok/bpe_ind_Latn_8000", "ita_Latn": "flexitok/bpe_ita_Latn_8000", "jpn_Jpan": "flexitok/bpe_jpn_Jpan_8000", "nld_Latn": "flexitok/bpe_nld_Latn_8000", "pol_Latn": "flexitok/bpe_pol_Latn_8000", "por_Latn": "flexitok/bpe_por_Latn_8000", "rus_Cyrl": "flexitok/bpe_rus_Cyrl_8000", "spa_Latn": "flexitok/bpe_spa_Latn_8000", "swe_Latn": "flexitok/bpe_swe_Latn_8000", "tur_Latn": "flexitok/bpe_tur_Latn_8000", "vie_Latn": "flexitok/bpe_vie_Latn_8000"}, "suitable_tokenizer_probability": 0.9}}, "output_seq_len": 4096, "n_views": 2}, "seq_idx": 256, "rng_state": {"bit_generator": "PCG64", "state": {"state": 172206642884315098528897268843500314535, "inc": 319170006889470250209362588441616495209}, "has_uint32": 1, "uinteger": 2183176397}, "batch_size": 4, "prefetch_size": 1024}, "scheduler": {"base_lrs": [0.001], "last_epoch": 100000, "verbose": false, "_step_count": 100001, "_get_lr_called_within_step": false, "_last_lr": [0.001], "lr_lambdas": [{}]}}
fineweb2_hq_superset_oracle_09/0000100000/train_state_00003.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 100000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 134, "it_state": {"it_state": {"root_dir": "/scratch/craffel/lingua/data/flexitok/", "sources": {"fw_edu": 0.4, "dan_Latn": 0.0216582869670702, "swe_Latn": 0.0216359765418466, "vie_Latn": 0.0197485510268674, "hun_Latn": 0.0247194573562308, "fas_Arab": 0.0205634624231076, "tur_Latn": 0.0235455794841729, "ces_Latn": 0.0248024455266208, "arb_Arab": 0.0234323706569333, "ell_Grek": 0.0233670886888026, "ind_Latn": 0.0269322054593488, "nld_Latn": 0.0277796326621489, "pol_Latn": 0.0294120104572311, "por_Latn": 0.0301413168306825, "ita_Latn": 0.0324056371021865, "jpn_Jpan": 0.03553104151369, "fra_Latn": 0.0381835560678536, "spa_Latn": 0.0387222793083669, "deu_Latn": 0.0419925340453022, "cmn_Hani": 0.0454067521384114, "rus_Cyrl": 0.0500198157431261}, "source_to_state": {"fw_edu": {"file_path": "/scratch/craffel/lingua/data/flexitok/fw_edu/fineweb_edu_100bt.chunk.03.jsonl", "position": 22460239421, "block_size": 1, "offset": 0, "current_iter": 0}, "dan_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/dan_Latn/fineweb_2_hq.dan_Latn.chunk.03.jsonl", "position": 1028785515, "block_size": 1, "offset": 0, "current_iter": 0}, "swe_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/swe_Latn/fineweb_2_hq.swe_Latn.chunk.03.jsonl", "position": 965495272, "block_size": 1, "offset": 0, "current_iter": 0}, "vie_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/vie_Latn/fineweb_2_hq.vie_Latn.chunk.03.jsonl", "position": 1039664947, "block_size": 1, "offset": 0, "current_iter": 0}, "hun_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/hun_Latn/fineweb_2_hq.hun_Latn.chunk.03.jsonl", "position": 1380151232, "block_size": 1, "offset": 0, "current_iter": 0}, "fas_Arab": {"file_path": "/scratch/craffel/lingua/data/flexitok/fas_Arab/fineweb_2_hq.fas_Arab.chunk.03.jsonl", "position": 1149927115, "block_size": 1, "offset": 0, "current_iter": 0}, "tur_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/tur_Latn/fineweb_2_hq.tur_Latn.chunk.03.jsonl", "position": 884412084, "block_size": 1, "offset": 0, "current_iter": 0}, "ces_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/ces_Latn/fineweb_2_hq.ces_Latn.chunk.03.jsonl", "position": 1344152599, "block_size": 1, "offset": 0, "current_iter": 0}, "arb_Arab": {"file_path": "/scratch/craffel/lingua/data/flexitok/arb_Arab/fineweb_2_hq.arb_Arab.chunk.03.jsonl", "position": 1513293707, "block_size": 1, "offset": 0, "current_iter": 0}, "ell_Grek": {"file_path": "/scratch/craffel/lingua/data/flexitok/ell_Grek/fineweb_2_hq.ell_Grek.chunk.03.jsonl", "position": 1882285422, "block_size": 1, "offset": 0, "current_iter": 0}, "ind_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/ind_Latn/fineweb_2_hq.ind_Latn.chunk.03.jsonl", "position": 1334018090, "block_size": 1, "offset": 0, "current_iter": 0}, "nld_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/nld_Latn/fineweb_2_hq.nld_Latn.chunk.03.jsonl", "position": 1074240883, "block_size": 1, "offset": 0, "current_iter": 0}, "pol_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/pol_Latn/fineweb_2_hq.pol_Latn.chunk.03.jsonl", "position": 1122299490, "block_size": 1, "offset": 0, "current_iter": 0}, "por_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/por_Latn/fineweb_2_hq.por_Latn.chunk.03.jsonl", "position": 1159789499, "block_size": 1, "offset": 0, "current_iter": 0}, "ita_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/ita_Latn/fineweb_2_hq.ita_Latn.chunk.03.jsonl", "position": 1303150206, "block_size": 1, "offset": 0, "current_iter": 0}, "jpn_Jpan": {"file_path": "/scratch/craffel/lingua/data/flexitok/jpn_Jpan/fineweb_2_hq.jpn_Jpan.chunk.03.jsonl", "position": 1346770628, "block_size": 1, "offset": 0, "current_iter": 0}, "fra_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/fra_Latn/fineweb_2_hq.fra_Latn.chunk.03.jsonl", "position": 1800318223, "block_size": 1, "offset": 0, "current_iter": 0}, "spa_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/spa_Latn/fineweb_2_hq.spa_Latn.chunk.03.jsonl", "position": 1629346866, "block_size": 1, "offset": 0, "current_iter": 0}, "deu_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/deu_Latn/fineweb_2_hq.deu_Latn.chunk.03.jsonl", "position": 1971381630, "block_size": 1, "offset": 0, "current_iter": 0}, "cmn_Hani": {"file_path": "/scratch/craffel/lingua/data/flexitok/cmn_Hani/fineweb_2_hq.cmn_Hani.chunk.03.jsonl", "position": 2077045731, "block_size": 1, "offset": 0, "current_iter": 0}, "rus_Cyrl": {"file_path": "/scratch/craffel/lingua/data/flexitok/rus_Cyrl/fineweb_2_hq.rus_Cyrl.chunk.03.jsonl", "position": 4907840361, "block_size": 1, "offset": 0, "current_iter": 0}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 135603433442169554720520037598354742005, "inc": 148211758571781046255077612135386035203}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "supertokenizer", "path": "meta-llama/Llama-3.2-1B", "tokenizers": [{"name": "huggingface", "path": "flexitok/bpe_arb_Arab_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ces_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_cmn_Hani_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_dan_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_deu_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ell_Grek_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fas_Arab_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fra_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fw_edu_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_hun_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ind_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ita_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_jpn_Jpan_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_nld_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_pol_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_por_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_rus_Cyrl_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_spa_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_swe_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_tur_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_vie_Latn_8000", "load_supermapping": true}], "dropout": 0.0, "rng_state": {"bit_generator": "PCG64", "state": {"state": 130014772671157124969548468809526624105, "inc": 332724090758049132448979897138935081983}, "has_uint32": 0, "uinteger": 3077772535}, "seed": 42, "superset_code_name": "fineweb2_hq", "n_words": 113764, "routing": {"source_to_tokenizer": {"arb_Arab": "flexitok/bpe_arb_Arab_8000", "ces_Latn": "flexitok/bpe_ces_Latn_8000", "cmn_Hani": "flexitok/bpe_cmn_Hani_8000", "dan_Latn": "flexitok/bpe_dan_Latn_8000", "deu_Latn": "flexitok/bpe_deu_Latn_8000", "ell_Grek": "flexitok/bpe_ell_Grek_8000", "fas_Arab": "flexitok/bpe_fas_Arab_8000", "fra_Latn": "flexitok/bpe_fra_Latn_8000", "fw_edu": "flexitok/bpe_fw_edu_8000", "hun_Latn": "flexitok/bpe_hun_Latn_8000", "ind_Latn": "flexitok/bpe_ind_Latn_8000", "ita_Latn": "flexitok/bpe_ita_Latn_8000", "jpn_Jpan": "flexitok/bpe_jpn_Jpan_8000", "nld_Latn": "flexitok/bpe_nld_Latn_8000", "pol_Latn": "flexitok/bpe_pol_Latn_8000", "por_Latn": "flexitok/bpe_por_Latn_8000", "rus_Cyrl": "flexitok/bpe_rus_Cyrl_8000", "spa_Latn": "flexitok/bpe_spa_Latn_8000", "swe_Latn": "flexitok/bpe_swe_Latn_8000", "tur_Latn": "flexitok/bpe_tur_Latn_8000", "vie_Latn": "flexitok/bpe_vie_Latn_8000"}, "suitable_tokenizer_probability": 0.9}}, "output_seq_len": 4096, "n_views": 2}, "seq_idx": 256, "rng_state": {"bit_generator": "PCG64", "state": {"state": 54691439170624420224489263842165716902, "inc": 115810872492597857501795428972873905393}, "has_uint32": 1, "uinteger": 1237320779}, "batch_size": 4, "prefetch_size": 1024}, "scheduler": {"base_lrs": [0.001], "last_epoch": 100000, "verbose": false, "_step_count": 100001, "_get_lr_called_within_step": false, "_last_lr": [0.001], "lr_lambdas": [{}]}}
fineweb2_hq_superset_oracle_09/0000100000/train_state_00004.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 100000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 4030, "it_state": {"it_state": {"root_dir": "/scratch/craffel/lingua/data/flexitok/", "sources": {"fw_edu": 0.4, "dan_Latn": 0.0216582869670702, "swe_Latn": 0.0216359765418466, "vie_Latn": 0.0197485510268674, "hun_Latn": 0.0247194573562308, "fas_Arab": 0.0205634624231076, "tur_Latn": 0.0235455794841729, "ces_Latn": 0.0248024455266208, "arb_Arab": 0.0234323706569333, "ell_Grek": 0.0233670886888026, "ind_Latn": 0.0269322054593488, "nld_Latn": 0.0277796326621489, "pol_Latn": 0.0294120104572311, "por_Latn": 0.0301413168306825, "ita_Latn": 0.0324056371021865, "jpn_Jpan": 0.03553104151369, "fra_Latn": 0.0381835560678536, "spa_Latn": 0.0387222793083669, "deu_Latn": 0.0419925340453022, "cmn_Hani": 0.0454067521384114, "rus_Cyrl": 0.0500198157431261}, "source_to_state": {"fw_edu": {"file_path": "/scratch/craffel/lingua/data/flexitok/fw_edu/fineweb_edu_100bt.chunk.04.jsonl", "position": 22454130097, "block_size": 1, "offset": 0, "current_iter": 0}, "dan_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/dan_Latn/fineweb_2_hq.dan_Latn.chunk.04.jsonl", "position": 1037680662, "block_size": 1, "offset": 0, "current_iter": 0}, "swe_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/swe_Latn/fineweb_2_hq.swe_Latn.chunk.04.jsonl", "position": 962282857, "block_size": 1, "offset": 0, "current_iter": 0}, "vie_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/vie_Latn/fineweb_2_hq.vie_Latn.chunk.04.jsonl", "position": 1034866261, "block_size": 1, "offset": 0, "current_iter": 0}, "hun_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/hun_Latn/fineweb_2_hq.hun_Latn.chunk.04.jsonl", "position": 1378006373, "block_size": 1, "offset": 0, "current_iter": 0}, "fas_Arab": {"file_path": "/scratch/craffel/lingua/data/flexitok/fas_Arab/fineweb_2_hq.fas_Arab.chunk.04.jsonl", "position": 1146824160, "block_size": 1, "offset": 0, "current_iter": 0}, "tur_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/tur_Latn/fineweb_2_hq.tur_Latn.chunk.04.jsonl", "position": 876319976, "block_size": 1, "offset": 0, "current_iter": 0}, "ces_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/ces_Latn/fineweb_2_hq.ces_Latn.chunk.04.jsonl", "position": 1356451538, "block_size": 1, "offset": 0, "current_iter": 0}, "arb_Arab": {"file_path": "/scratch/craffel/lingua/data/flexitok/arb_Arab/fineweb_2_hq.arb_Arab.chunk.04.jsonl", "position": 1509891976, "block_size": 1, "offset": 0, "current_iter": 0}, "ell_Grek": {"file_path": "/scratch/craffel/lingua/data/flexitok/ell_Grek/fineweb_2_hq.ell_Grek.chunk.04.jsonl", "position": 1903537509, "block_size": 1, "offset": 0, "current_iter": 0}, "ind_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/ind_Latn/fineweb_2_hq.ind_Latn.chunk.04.jsonl", "position": 1328836066, "block_size": 1, "offset": 0, "current_iter": 0}, "nld_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/nld_Latn/fineweb_2_hq.nld_Latn.chunk.04.jsonl", "position": 1070067886, "block_size": 1, "offset": 0, "current_iter": 0}, "pol_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/pol_Latn/fineweb_2_hq.pol_Latn.chunk.04.jsonl", "position": 1121096086, "block_size": 1, "offset": 0, "current_iter": 0}, "por_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/por_Latn/fineweb_2_hq.por_Latn.chunk.04.jsonl", "position": 1170428896, "block_size": 1, "offset": 0, "current_iter": 0}, "ita_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/ita_Latn/fineweb_2_hq.ita_Latn.chunk.04.jsonl", "position": 1290679721, "block_size": 1, "offset": 0, "current_iter": 0}, "jpn_Jpan": {"file_path": "/scratch/craffel/lingua/data/flexitok/jpn_Jpan/fineweb_2_hq.jpn_Jpan.chunk.04.jsonl", "position": 1369129876, "block_size": 1, "offset": 0, "current_iter": 0}, "fra_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/fra_Latn/fineweb_2_hq.fra_Latn.chunk.04.jsonl", "position": 1811956980, "block_size": 1, "offset": 0, "current_iter": 0}, "spa_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/spa_Latn/fineweb_2_hq.spa_Latn.chunk.04.jsonl", "position": 1615708118, "block_size": 1, "offset": 0, "current_iter": 0}, "deu_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/deu_Latn/fineweb_2_hq.deu_Latn.chunk.04.jsonl", "position": 1954048067, "block_size": 1, "offset": 0, "current_iter": 0}, "cmn_Hani": {"file_path": "/scratch/craffel/lingua/data/flexitok/cmn_Hani/fineweb_2_hq.cmn_Hani.chunk.04.jsonl", "position": 2082005666, "block_size": 1, "offset": 0, "current_iter": 0}, "rus_Cyrl": {"file_path": "/scratch/craffel/lingua/data/flexitok/rus_Cyrl/fineweb_2_hq.rus_Cyrl.chunk.04.jsonl", "position": 4890660613, "block_size": 1, "offset": 0, "current_iter": 0}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 38273289384812320998922801536110352119, "inc": 186633262021180533256729114674950595327}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "supertokenizer", "path": "meta-llama/Llama-3.2-1B", "tokenizers": [{"name": "huggingface", "path": "flexitok/bpe_arb_Arab_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ces_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_cmn_Hani_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_dan_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_deu_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ell_Grek_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fas_Arab_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fra_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fw_edu_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_hun_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ind_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ita_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_jpn_Jpan_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_nld_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_pol_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_por_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_rus_Cyrl_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_spa_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_swe_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_tur_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_vie_Latn_8000", "load_supermapping": true}], "dropout": 0.0, "rng_state": {"bit_generator": "PCG64", "state": {"state": 306058781670887813596132903282708372391, "inc": 332724090758049132448979897138935081983}, "has_uint32": 1, "uinteger": 133836247}, "seed": 42, "superset_code_name": "fineweb2_hq", "n_words": 113764, "routing": {"source_to_tokenizer": {"arb_Arab": "flexitok/bpe_arb_Arab_8000", "ces_Latn": "flexitok/bpe_ces_Latn_8000", "cmn_Hani": "flexitok/bpe_cmn_Hani_8000", "dan_Latn": "flexitok/bpe_dan_Latn_8000", "deu_Latn": "flexitok/bpe_deu_Latn_8000", "ell_Grek": "flexitok/bpe_ell_Grek_8000", "fas_Arab": "flexitok/bpe_fas_Arab_8000", "fra_Latn": "flexitok/bpe_fra_Latn_8000", "fw_edu": "flexitok/bpe_fw_edu_8000", "hun_Latn": "flexitok/bpe_hun_Latn_8000", "ind_Latn": "flexitok/bpe_ind_Latn_8000", "ita_Latn": "flexitok/bpe_ita_Latn_8000", "jpn_Jpan": "flexitok/bpe_jpn_Jpan_8000", "nld_Latn": "flexitok/bpe_nld_Latn_8000", "pol_Latn": "flexitok/bpe_pol_Latn_8000", "por_Latn": "flexitok/bpe_por_Latn_8000", "rus_Cyrl": "flexitok/bpe_rus_Cyrl_8000", "spa_Latn": "flexitok/bpe_spa_Latn_8000", "swe_Latn": "flexitok/bpe_swe_Latn_8000", "tur_Latn": "flexitok/bpe_tur_Latn_8000", "vie_Latn": "flexitok/bpe_vie_Latn_8000"}, "suitable_tokenizer_probability": 0.9}}, "output_seq_len": 4096, "n_views": 2}, "seq_idx": 256, "rng_state": {"bit_generator": "PCG64", "state": {"state": 330736315247521707224292038935126153445, "inc": 303111205818808944921858206842105131807}, "has_uint32": 1, "uinteger": 211256137}, "batch_size": 4, "prefetch_size": 1024}, "scheduler": {"base_lrs": [0.001], "last_epoch": 100000, "verbose": false, "_step_count": 100001, "_get_lr_called_within_step": false, "_last_lr": [0.001], "lr_lambdas": [{}]}}
fineweb2_hq_superset_oracle_09/0000100000/train_state_00005.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 100000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 64724, "it_state": {"it_state": {"root_dir": "/scratch/craffel/lingua/data/flexitok/", "sources": {"fw_edu": 0.4, "dan_Latn": 0.0216582869670702, "swe_Latn": 0.0216359765418466, "vie_Latn": 0.0197485510268674, "hun_Latn": 0.0247194573562308, "fas_Arab": 0.0205634624231076, "tur_Latn": 0.0235455794841729, "ces_Latn": 0.0248024455266208, "arb_Arab": 0.0234323706569333, "ell_Grek": 0.0233670886888026, "ind_Latn": 0.0269322054593488, "nld_Latn": 0.0277796326621489, "pol_Latn": 0.0294120104572311, "por_Latn": 0.0301413168306825, "ita_Latn": 0.0324056371021865, "jpn_Jpan": 0.03553104151369, "fra_Latn": 0.0381835560678536, "spa_Latn": 0.0387222793083669, "deu_Latn": 0.0419925340453022, "cmn_Hani": 0.0454067521384114, "rus_Cyrl": 0.0500198157431261}, "source_to_state": {"fw_edu": {"file_path": "/scratch/craffel/lingua/data/flexitok/fw_edu/fineweb_edu_100bt.chunk.05.jsonl", "position": 22502227336, "block_size": 1, "offset": 0, "current_iter": 0}, "dan_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/dan_Latn/fineweb_2_hq.dan_Latn.chunk.05.jsonl", "position": 1026179119, "block_size": 1, "offset": 0, "current_iter": 0}, "swe_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/swe_Latn/fineweb_2_hq.swe_Latn.chunk.05.jsonl", "position": 966051120, "block_size": 1, "offset": 0, "current_iter": 0}, "vie_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/vie_Latn/fineweb_2_hq.vie_Latn.chunk.05.jsonl", "position": 1038899295, "block_size": 1, "offset": 0, "current_iter": 0}, "hun_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/hun_Latn/fineweb_2_hq.hun_Latn.chunk.05.jsonl", "position": 1366788463, "block_size": 1, "offset": 0, "current_iter": 0}, "fas_Arab": {"file_path": "/scratch/craffel/lingua/data/flexitok/fas_Arab/fineweb_2_hq.fas_Arab.chunk.05.jsonl", "position": 1149953495, "block_size": 1, "offset": 0, "current_iter": 0}, "tur_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/tur_Latn/fineweb_2_hq.tur_Latn.chunk.05.jsonl", "position": 876851028, "block_size": 1, "offset": 0, "current_iter": 0}, "ces_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/ces_Latn/fineweb_2_hq.ces_Latn.chunk.05.jsonl", "position": 1353427404, "block_size": 1, "offset": 0, "current_iter": 0}, "arb_Arab": {"file_path": "/scratch/craffel/lingua/data/flexitok/arb_Arab/fineweb_2_hq.arb_Arab.chunk.05.jsonl", "position": 1520716847, "block_size": 1, "offset": 0, "current_iter": 0}, "ell_Grek": {"file_path": "/scratch/craffel/lingua/data/flexitok/ell_Grek/fineweb_2_hq.ell_Grek.chunk.05.jsonl", "position": 1896660790, "block_size": 1, "offset": 0, "current_iter": 0}, "ind_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/ind_Latn/fineweb_2_hq.ind_Latn.chunk.05.jsonl", "position": 1339462124, "block_size": 1, "offset": 0, "current_iter": 0}, "nld_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/nld_Latn/fineweb_2_hq.nld_Latn.chunk.05.jsonl", "position": 1054789955, "block_size": 1, "offset": 0, "current_iter": 0}, "pol_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/pol_Latn/fineweb_2_hq.pol_Latn.chunk.05.jsonl", "position": 1128866039, "block_size": 1, "offset": 0, "current_iter": 0}, "por_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/por_Latn/fineweb_2_hq.por_Latn.chunk.05.jsonl", "position": 1164143159, "block_size": 1, "offset": 0, "current_iter": 0}, "ita_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/ita_Latn/fineweb_2_hq.ita_Latn.chunk.05.jsonl", "position": 1296438541, "block_size": 1, "offset": 0, "current_iter": 0}, "jpn_Jpan": {"file_path": "/scratch/craffel/lingua/data/flexitok/jpn_Jpan/fineweb_2_hq.jpn_Jpan.chunk.05.jsonl", "position": 1360068469, "block_size": 1, "offset": 0, "current_iter": 0}, "fra_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/fra_Latn/fineweb_2_hq.fra_Latn.chunk.05.jsonl", "position": 1811322940, "block_size": 1, "offset": 0, "current_iter": 0}, "spa_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/spa_Latn/fineweb_2_hq.spa_Latn.chunk.05.jsonl", "position": 1628673552, "block_size": 1, "offset": 0, "current_iter": 0}, "deu_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/deu_Latn/fineweb_2_hq.deu_Latn.chunk.05.jsonl", "position": 1965163877, "block_size": 1, "offset": 0, "current_iter": 0}, "cmn_Hani": {"file_path": "/scratch/craffel/lingua/data/flexitok/cmn_Hani/fineweb_2_hq.cmn_Hani.chunk.05.jsonl", "position": 2080186368, "block_size": 1, "offset": 0, "current_iter": 0}, "rus_Cyrl": {"file_path": "/scratch/craffel/lingua/data/flexitok/rus_Cyrl/fineweb_2_hq.rus_Cyrl.chunk.05.jsonl", "position": 4853310943, "block_size": 1, "offset": 0, "current_iter": 0}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 232872261194737025455778400523083265256, "inc": 329233669073478483697346584247981015037}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "supertokenizer", "path": "meta-llama/Llama-3.2-1B", "tokenizers": [{"name": "huggingface", "path": "flexitok/bpe_arb_Arab_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ces_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_cmn_Hani_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_dan_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_deu_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ell_Grek_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fas_Arab_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fra_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fw_edu_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_hun_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ind_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ita_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_jpn_Jpan_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_nld_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_pol_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_por_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_rus_Cyrl_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_spa_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_swe_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_tur_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_vie_Latn_8000", "load_supermapping": true}], "dropout": 0.0, "rng_state": {"bit_generator": "PCG64", "state": {"state": 303010857546518459934908507501533175092, "inc": 332724090758049132448979897138935081983}, "has_uint32": 0, "uinteger": 960666977}, "seed": 42, "superset_code_name": "fineweb2_hq", "n_words": 113764, "routing": {"source_to_tokenizer": {"arb_Arab": "flexitok/bpe_arb_Arab_8000", "ces_Latn": "flexitok/bpe_ces_Latn_8000", "cmn_Hani": "flexitok/bpe_cmn_Hani_8000", "dan_Latn": "flexitok/bpe_dan_Latn_8000", "deu_Latn": "flexitok/bpe_deu_Latn_8000", "ell_Grek": "flexitok/bpe_ell_Grek_8000", "fas_Arab": "flexitok/bpe_fas_Arab_8000", "fra_Latn": "flexitok/bpe_fra_Latn_8000", "fw_edu": "flexitok/bpe_fw_edu_8000", "hun_Latn": "flexitok/bpe_hun_Latn_8000", "ind_Latn": "flexitok/bpe_ind_Latn_8000", "ita_Latn": "flexitok/bpe_ita_Latn_8000", "jpn_Jpan": "flexitok/bpe_jpn_Jpan_8000", "nld_Latn": "flexitok/bpe_nld_Latn_8000", "pol_Latn": "flexitok/bpe_pol_Latn_8000", "por_Latn": "flexitok/bpe_por_Latn_8000", "rus_Cyrl": "flexitok/bpe_rus_Cyrl_8000", "spa_Latn": "flexitok/bpe_spa_Latn_8000", "swe_Latn": "flexitok/bpe_swe_Latn_8000", "tur_Latn": "flexitok/bpe_tur_Latn_8000", "vie_Latn": "flexitok/bpe_vie_Latn_8000"}, "suitable_tokenizer_probability": 0.9}}, "output_seq_len": 4096, "n_views": 2}, "seq_idx": 256, "rng_state": {"bit_generator": "PCG64", "state": {"state": 152527936818944850177382785176910810558, "inc": 47382953940698287647753879262736142901}, "has_uint32": 1, "uinteger": 3675784275}, "batch_size": 4, "prefetch_size": 1024}, "scheduler": {"base_lrs": [0.001], "last_epoch": 100000, "verbose": false, "_step_count": 100001, "_get_lr_called_within_step": false, "_last_lr": [0.001], "lr_lambdas": [{}]}}
fineweb2_hq_superset_oracle_09/0000100000/train_state_00006.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 100000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 284, "it_state": {"it_state": {"root_dir": "/scratch/craffel/lingua/data/flexitok/", "sources": {"fw_edu": 0.4, "dan_Latn": 0.0216582869670702, "swe_Latn": 0.0216359765418466, "vie_Latn": 0.0197485510268674, "hun_Latn": 0.0247194573562308, "fas_Arab": 0.0205634624231076, "tur_Latn": 0.0235455794841729, "ces_Latn": 0.0248024455266208, "arb_Arab": 0.0234323706569333, "ell_Grek": 0.0233670886888026, "ind_Latn": 0.0269322054593488, "nld_Latn": 0.0277796326621489, "pol_Latn": 0.0294120104572311, "por_Latn": 0.0301413168306825, "ita_Latn": 0.0324056371021865, "jpn_Jpan": 0.03553104151369, "fra_Latn": 0.0381835560678536, "spa_Latn": 0.0387222793083669, "deu_Latn": 0.0419925340453022, "cmn_Hani": 0.0454067521384114, "rus_Cyrl": 0.0500198157431261}, "source_to_state": {"fw_edu": {"file_path": "/scratch/craffel/lingua/data/flexitok/fw_edu/fineweb_edu_100bt.chunk.06.jsonl", "position": 22472519887, "block_size": 1, "offset": 0, "current_iter": 0}, "dan_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/dan_Latn/fineweb_2_hq.dan_Latn.chunk.06.jsonl", "position": 1037864320, "block_size": 1, "offset": 0, "current_iter": 0}, "swe_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/swe_Latn/fineweb_2_hq.swe_Latn.chunk.06.jsonl", "position": 968007387, "block_size": 1, "offset": 0, "current_iter": 0}, "vie_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/vie_Latn/fineweb_2_hq.vie_Latn.chunk.06.jsonl", "position": 1030499197, "block_size": 1, "offset": 0, "current_iter": 0}, "hun_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/hun_Latn/fineweb_2_hq.hun_Latn.chunk.06.jsonl", "position": 1371171271, "block_size": 1, "offset": 0, "current_iter": 0}, "fas_Arab": {"file_path": "/scratch/craffel/lingua/data/flexitok/fas_Arab/fineweb_2_hq.fas_Arab.chunk.06.jsonl", "position": 1145733952, "block_size": 1, "offset": 0, "current_iter": 0}, "tur_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/tur_Latn/fineweb_2_hq.tur_Latn.chunk.06.jsonl", "position": 886646497, "block_size": 1, "offset": 0, "current_iter": 0}, "ces_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/ces_Latn/fineweb_2_hq.ces_Latn.chunk.06.jsonl", "position": 1353439706, "block_size": 1, "offset": 0, "current_iter": 0}, "arb_Arab": {"file_path": "/scratch/craffel/lingua/data/flexitok/arb_Arab/fineweb_2_hq.arb_Arab.chunk.06.jsonl", "position": 1519100634, "block_size": 1, "offset": 0, "current_iter": 0}, "ell_Grek": {"file_path": "/scratch/craffel/lingua/data/flexitok/ell_Grek/fineweb_2_hq.ell_Grek.chunk.06.jsonl", "position": 1876412740, "block_size": 1, "offset": 0, "current_iter": 0}, "ind_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/ind_Latn/fineweb_2_hq.ind_Latn.chunk.06.jsonl", "position": 1342380358, "block_size": 1, "offset": 0, "current_iter": 0}, "nld_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/nld_Latn/fineweb_2_hq.nld_Latn.chunk.06.jsonl", "position": 1063301845, "block_size": 1, "offset": 0, "current_iter": 0}, "pol_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/pol_Latn/fineweb_2_hq.pol_Latn.chunk.06.jsonl", "position": 1122850018, "block_size": 1, "offset": 0, "current_iter": 0}, "por_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/por_Latn/fineweb_2_hq.por_Latn.chunk.06.jsonl", "position": 1163074482, "block_size": 1, "offset": 0, "current_iter": 0}, "ita_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/ita_Latn/fineweb_2_hq.ita_Latn.chunk.06.jsonl", "position": 1291137561, "block_size": 1, "offset": 0, "current_iter": 0}, "jpn_Jpan": {"file_path": "/scratch/craffel/lingua/data/flexitok/jpn_Jpan/fineweb_2_hq.jpn_Jpan.chunk.06.jsonl", "position": 1353223694, "block_size": 1, "offset": 0, "current_iter": 0}, "fra_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/fra_Latn/fineweb_2_hq.fra_Latn.chunk.06.jsonl", "position": 1803609874, "block_size": 1, "offset": 0, "current_iter": 0}, "spa_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/spa_Latn/fineweb_2_hq.spa_Latn.chunk.06.jsonl", "position": 1619869819, "block_size": 1, "offset": 0, "current_iter": 0}, "deu_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/deu_Latn/fineweb_2_hq.deu_Latn.chunk.06.jsonl", "position": 1967831068, "block_size": 1, "offset": 0, "current_iter": 0}, "cmn_Hani": {"file_path": "/scratch/craffel/lingua/data/flexitok/cmn_Hani/fineweb_2_hq.cmn_Hani.chunk.06.jsonl", "position": 2084508946, "block_size": 1, "offset": 0, "current_iter": 0}, "rus_Cyrl": {"file_path": "/scratch/craffel/lingua/data/flexitok/rus_Cyrl/fineweb_2_hq.rus_Cyrl.chunk.06.jsonl", "position": 4852330889, "block_size": 1, "offset": 0, "current_iter": 0}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 182159864847985040360867520349984811671, "inc": 95963489890761403814531195999220475639}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "supertokenizer", "path": "meta-llama/Llama-3.2-1B", "tokenizers": [{"name": "huggingface", "path": "flexitok/bpe_arb_Arab_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ces_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_cmn_Hani_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_dan_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_deu_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ell_Grek_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fas_Arab_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fra_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fw_edu_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_hun_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ind_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ita_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_jpn_Jpan_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_nld_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_pol_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_por_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_rus_Cyrl_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_spa_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_swe_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_tur_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_vie_Latn_8000", "load_supermapping": true}], "dropout": 0.0, "rng_state": {"bit_generator": "PCG64", "state": {"state": 218289611282521592203553418248435858916, "inc": 332724090758049132448979897138935081983}, "has_uint32": 1, "uinteger": 446954685}, "seed": 42, "superset_code_name": "fineweb2_hq", "n_words": 113764, "routing": {"source_to_tokenizer": {"arb_Arab": "flexitok/bpe_arb_Arab_8000", "ces_Latn": "flexitok/bpe_ces_Latn_8000", "cmn_Hani": "flexitok/bpe_cmn_Hani_8000", "dan_Latn": "flexitok/bpe_dan_Latn_8000", "deu_Latn": "flexitok/bpe_deu_Latn_8000", "ell_Grek": "flexitok/bpe_ell_Grek_8000", "fas_Arab": "flexitok/bpe_fas_Arab_8000", "fra_Latn": "flexitok/bpe_fra_Latn_8000", "fw_edu": "flexitok/bpe_fw_edu_8000", "hun_Latn": "flexitok/bpe_hun_Latn_8000", "ind_Latn": "flexitok/bpe_ind_Latn_8000", "ita_Latn": "flexitok/bpe_ita_Latn_8000", "jpn_Jpan": "flexitok/bpe_jpn_Jpan_8000", "nld_Latn": "flexitok/bpe_nld_Latn_8000", "pol_Latn": "flexitok/bpe_pol_Latn_8000", "por_Latn": "flexitok/bpe_por_Latn_8000", "rus_Cyrl": "flexitok/bpe_rus_Cyrl_8000", "spa_Latn": "flexitok/bpe_spa_Latn_8000", "swe_Latn": "flexitok/bpe_swe_Latn_8000", "tur_Latn": "flexitok/bpe_tur_Latn_8000", "vie_Latn": "flexitok/bpe_vie_Latn_8000"}, "suitable_tokenizer_probability": 0.9}}, "output_seq_len": 4096, "n_views": 2}, "seq_idx": 256, "rng_state": {"bit_generator": "PCG64", "state": {"state": 131915213571014219328634582687476804074, "inc": 72545526324180839152750112646078969085}, "has_uint32": 0, "uinteger": 2783644522}, "batch_size": 4, "prefetch_size": 1024}, "scheduler": {"base_lrs": [0.001], "last_epoch": 100000, "verbose": false, "_step_count": 100001, "_get_lr_called_within_step": false, "_last_lr": [0.001], "lr_lambdas": [{}]}}
fineweb2_hq_superset_oracle_09/0000100000/train_state_00007.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 100000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 643, "it_state": {"it_state": {"root_dir": "/scratch/craffel/lingua/data/flexitok/", "sources": {"fw_edu": 0.4, "dan_Latn": 0.0216582869670702, "swe_Latn": 0.0216359765418466, "vie_Latn": 0.0197485510268674, "hun_Latn": 0.0247194573562308, "fas_Arab": 0.0205634624231076, "tur_Latn": 0.0235455794841729, "ces_Latn": 0.0248024455266208, "arb_Arab": 0.0234323706569333, "ell_Grek": 0.0233670886888026, "ind_Latn": 0.0269322054593488, "nld_Latn": 0.0277796326621489, "pol_Latn": 0.0294120104572311, "por_Latn": 0.0301413168306825, "ita_Latn": 0.0324056371021865, "jpn_Jpan": 0.03553104151369, "fra_Latn": 0.0381835560678536, "spa_Latn": 0.0387222793083669, "deu_Latn": 0.0419925340453022, "cmn_Hani": 0.0454067521384114, "rus_Cyrl": 0.0500198157431261}, "source_to_state": {"fw_edu": {"file_path": "/scratch/craffel/lingua/data/flexitok/fw_edu/fineweb_edu_100bt.chunk.07.jsonl", "position": 22468184014, "block_size": 1, "offset": 0, "current_iter": 0}, "dan_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/dan_Latn/fineweb_2_hq.dan_Latn.chunk.07.jsonl", "position": 1038767004, "block_size": 1, "offset": 0, "current_iter": 0}, "swe_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/swe_Latn/fineweb_2_hq.swe_Latn.chunk.07.jsonl", "position": 970838564, "block_size": 1, "offset": 0, "current_iter": 0}, "vie_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/vie_Latn/fineweb_2_hq.vie_Latn.chunk.07.jsonl", "position": 1036554054, "block_size": 1, "offset": 0, "current_iter": 0}, "hun_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/hun_Latn/fineweb_2_hq.hun_Latn.chunk.07.jsonl", "position": 1370159513, "block_size": 1, "offset": 0, "current_iter": 0}, "fas_Arab": {"file_path": "/scratch/craffel/lingua/data/flexitok/fas_Arab/fineweb_2_hq.fas_Arab.chunk.07.jsonl", "position": 1159863083, "block_size": 1, "offset": 0, "current_iter": 0}, "tur_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/tur_Latn/fineweb_2_hq.tur_Latn.chunk.07.jsonl", "position": 887028896, "block_size": 1, "offset": 0, "current_iter": 0}, "ces_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/ces_Latn/fineweb_2_hq.ces_Latn.chunk.07.jsonl", "position": 1345383343, "block_size": 1, "offset": 0, "current_iter": 0}, "arb_Arab": {"file_path": "/scratch/craffel/lingua/data/flexitok/arb_Arab/fineweb_2_hq.arb_Arab.chunk.07.jsonl", "position": 1521522386, "block_size": 1, "offset": 0, "current_iter": 0}, "ell_Grek": {"file_path": "/scratch/craffel/lingua/data/flexitok/ell_Grek/fineweb_2_hq.ell_Grek.chunk.07.jsonl", "position": 1891502545, "block_size": 1, "offset": 0, "current_iter": 0}, "ind_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/ind_Latn/fineweb_2_hq.ind_Latn.chunk.07.jsonl", "position": 1333060603, "block_size": 1, "offset": 0, "current_iter": 0}, "nld_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/nld_Latn/fineweb_2_hq.nld_Latn.chunk.07.jsonl", "position": 1067900524, "block_size": 1, "offset": 0, "current_iter": 0}, "pol_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/pol_Latn/fineweb_2_hq.pol_Latn.chunk.07.jsonl", "position": 1125319846, "block_size": 1, "offset": 0, "current_iter": 0}, "por_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/por_Latn/fineweb_2_hq.por_Latn.chunk.07.jsonl", "position": 1157314599, "block_size": 1, "offset": 0, "current_iter": 0}, "ita_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/ita_Latn/fineweb_2_hq.ita_Latn.chunk.07.jsonl", "position": 1295791776, "block_size": 1, "offset": 0, "current_iter": 0}, "jpn_Jpan": {"file_path": "/scratch/craffel/lingua/data/flexitok/jpn_Jpan/fineweb_2_hq.jpn_Jpan.chunk.07.jsonl", "position": 1368083289, "block_size": 1, "offset": 0, "current_iter": 0}, "fra_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/fra_Latn/fineweb_2_hq.fra_Latn.chunk.07.jsonl", "position": 1811993632, "block_size": 1, "offset": 0, "current_iter": 0}, "spa_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/spa_Latn/fineweb_2_hq.spa_Latn.chunk.07.jsonl", "position": 1631466906, "block_size": 1, "offset": 0, "current_iter": 0}, "deu_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/deu_Latn/fineweb_2_hq.deu_Latn.chunk.07.jsonl", "position": 1953770138, "block_size": 1, "offset": 0, "current_iter": 0}, "cmn_Hani": {"file_path": "/scratch/craffel/lingua/data/flexitok/cmn_Hani/fineweb_2_hq.cmn_Hani.chunk.07.jsonl", "position": 2084656019, "block_size": 1, "offset": 0, "current_iter": 0}, "rus_Cyrl": {"file_path": "/scratch/craffel/lingua/data/flexitok/rus_Cyrl/fineweb_2_hq.rus_Cyrl.chunk.07.jsonl", "position": 4848663064, "block_size": 1, "offset": 0, "current_iter": 0}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 189811832799242350825626138056339974820, "inc": 53245743019587277358203950863334653629}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "supertokenizer", "path": "meta-llama/Llama-3.2-1B", "tokenizers": [{"name": "huggingface", "path": "flexitok/bpe_arb_Arab_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ces_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_cmn_Hani_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_dan_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_deu_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ell_Grek_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fas_Arab_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fra_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fw_edu_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_hun_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ind_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ita_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_jpn_Jpan_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_nld_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_pol_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_por_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_rus_Cyrl_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_spa_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_swe_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_tur_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_vie_Latn_8000", "load_supermapping": true}], "dropout": 0.0, "rng_state": {"bit_generator": "PCG64", "state": {"state": 331378542064310518769400339080276923579, "inc": 332724090758049132448979897138935081983}, "has_uint32": 1, "uinteger": 2793454212}, "seed": 42, "superset_code_name": "fineweb2_hq", "n_words": 113764, "routing": {"source_to_tokenizer": {"arb_Arab": "flexitok/bpe_arb_Arab_8000", "ces_Latn": "flexitok/bpe_ces_Latn_8000", "cmn_Hani": "flexitok/bpe_cmn_Hani_8000", "dan_Latn": "flexitok/bpe_dan_Latn_8000", "deu_Latn": "flexitok/bpe_deu_Latn_8000", "ell_Grek": "flexitok/bpe_ell_Grek_8000", "fas_Arab": "flexitok/bpe_fas_Arab_8000", "fra_Latn": "flexitok/bpe_fra_Latn_8000", "fw_edu": "flexitok/bpe_fw_edu_8000", "hun_Latn": "flexitok/bpe_hun_Latn_8000", "ind_Latn": "flexitok/bpe_ind_Latn_8000", "ita_Latn": "flexitok/bpe_ita_Latn_8000", "jpn_Jpan": "flexitok/bpe_jpn_Jpan_8000", "nld_Latn": "flexitok/bpe_nld_Latn_8000", "pol_Latn": "flexitok/bpe_pol_Latn_8000", "por_Latn": "flexitok/bpe_por_Latn_8000", "rus_Cyrl": "flexitok/bpe_rus_Cyrl_8000", "spa_Latn": "flexitok/bpe_spa_Latn_8000", "swe_Latn": "flexitok/bpe_swe_Latn_8000", "tur_Latn": "flexitok/bpe_tur_Latn_8000", "vie_Latn": "flexitok/bpe_vie_Latn_8000"}, "suitable_tokenizer_probability": 0.9}}, "output_seq_len": 4096, "n_views": 2}, "seq_idx": 256, "rng_state": {"bit_generator": "PCG64", "state": {"state": 239803483826676955776584746976189400951, "inc": 19761753544780285878460645500694854795}, "has_uint32": 1, "uinteger": 513301027}, "batch_size": 4, "prefetch_size": 1024}, "scheduler": {"base_lrs": [0.001], "last_epoch": 100000, "verbose": false, "_step_count": 100001, "_get_lr_called_within_step": false, "_last_lr": [0.001], "lr_lambdas": [{}]}}