File size: 165,866 Bytes
f734e14 | 1 | {"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.12.12","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"nvidiaTeslaT4","dataSources":[],"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":true}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"%%writefile install.sh\npip install -q transformers datasets tokenizers evaluate accelerate lm-eval","metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","trusted":true,"execution":{"iopub.status.busy":"2026-04-26T08:37:26.466055Z","iopub.execute_input":"2026-04-26T08:37:26.466325Z","iopub.status.idle":"2026-04-26T08:37:26.477305Z","shell.execute_reply.started":"2026-04-26T08:37:26.466291Z","shell.execute_reply":"2026-04-26T08:37:26.476356Z"}},"outputs":[{"name":"stdout","text":"Writing install.sh\n","output_type":"stream"}],"execution_count":1},{"cell_type":"code","source":"!bash install.sh","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-26T08:38:08.522844Z","iopub.execute_input":"2026-04-26T08:38:08.523643Z","iopub.status.idle":"2026-04-26T08:38:24.439891Z","shell.execute_reply.started":"2026-04-26T08:38:08.523613Z","shell.execute_reply":"2026-04-26T08:38:24.439193Z"}},"outputs":[{"name":"stdout","text":"\u001b[2K \u001b[90mββββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m56.4/56.4 kB\u001b[0m \u001b[31m2.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n\u001b[2K \u001b[90mββββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m84.1/84.1 kB\u001b[0m \u001b[31m7.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n\u001b[2K \u001b[90mββββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m8.7/8.7 MB\u001b[0m \u001b[31m99.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m00:01\u001b[0m\n\u001b[2K \u001b[90mββββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m100.8/100.8 kB\u001b[0m \u001b[31m9.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n\u001b[2K \u001b[90mββββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m91.1/91.1 kB\u001b[0m \u001b[31m7.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n\u001b[?25h Building wheel for sqlitedict (setup.py) ... \u001b[?25l\u001b[?25hdone\n Building wheel for word2number (setup.py) ... \u001b[?25l\u001b[?25hdone\n","output_type":"stream"}],"execution_count":2},{"cell_type":"code","source":"%%writefile train_tokenizer.py\nfrom tokenizers import ByteLevelBPETokenizer\nfrom datasets import load_dataset\nimport os\nfrom transformers import PreTrainedTokenizerFast\n\nprint(\"Loading dataset for tokenizer training...\")\ndataset = load_dataset(\"HuggingFaceFW/fineweb-edu\", name=\"sample-10BT\", split='train', streaming=True)\n\ndef get_training_corpus():\n for i, example in enumerate(dataset):\n yield example[\"text\"]\n if i >= 50_000:\n break\n\nprint(\"Training tokenizer (4096 vocab)...\")\ntokenizer = ByteLevelBPETokenizer()\ntokenizer.train_from_iterator(\n get_training_corpus(),\n vocab_size=4096,\n min_frequency=2,\n special_tokens=[\"<|endoftext|>\"]\n)\n\nfast_tokenizer = PreTrainedTokenizerFast(\n tokenizer_object=tokenizer,\n bos_token=\"<|endoftext|>\",\n eos_token=\"<|endoftext|>\",\n unk_token=\"<|endoftext|>\",\n pad_token=\"<|endoftext|>\"\n)\n\nos.makedirs(\"spark_tokenizer\", exist_ok=True)\nfast_tokenizer.save_pretrained(\"spark_tokenizer\")\nprint(\"Tokenizer saved in 'spark_tokenizer'!\")","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-26T08:38:35.756835Z","iopub.execute_input":"2026-04-26T08:38:35.757137Z","iopub.status.idle":"2026-04-26T08:38:35.763094Z","shell.execute_reply.started":"2026-04-26T08:38:35.757110Z","shell.execute_reply":"2026-04-26T08:38:35.762397Z"}},"outputs":[{"name":"stdout","text":"Writing train_tokenizer.py\n","output_type":"stream"}],"execution_count":3},{"cell_type":"code","source":"!python3 train_tokenizer.py","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-26T08:38:37.912612Z","iopub.execute_input":"2026-04-26T08:38:37.913465Z","iopub.status.idle":"2026-04-26T08:39:53.269430Z","shell.execute_reply.started":"2026-04-26T08:38:37.913416Z","shell.execute_reply":"2026-04-26T08:39:53.268694Z"}},"outputs":[{"name":"stdout","text":"Lade Datensatz fΓΌr Tokenizer-Training...\nREADME.md: 26.4kB [00:00, 51.7MB/s]\nWarning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.\nResolving data files: 100%|ββββββββββββββ| 2410/2410 [00:00<00:00, 30662.35it/s]\nTrainiere Tokenizer (4096 Vocab)...\n\u001b[2K[00:00:00] Tokenize words ββββββββββββββββββ 585540 / 585540[00:00:00] Tokenize words ββββββββββββββββββ 0 / 0\n\u001b[2K[00:00:00] Count pairs ββββββββββββββββββ 585540 / 585540\n\u001b[2K[00:00:03] Compute merges ββββββββββββββββββ 3839 / 3839\nTokenizer in 'spark_tokenizer' gespeichert!\n","output_type":"stream"}],"execution_count":4},{"cell_type":"code","source":"%%writefile prep_data.py\nfrom datasets import load_dataset\nfrom transformers import PreTrainedTokenizerFast\nimport os\n\ntokenizer = PreTrainedTokenizerFast.from_pretrained(\"spark_tokenizer\")\nblock_size = 512\n\nprint(\"Loading FineWeb-Edu (500000 Samples)...\")\ndataset = load_dataset(\"HuggingFaceFW/fineweb-edu\", name=\"sample-10BT\", split='train[:500000]')\n\ndef tokenize_function(examples):\n return tokenizer(examples[\"text\"])\n\nprint(\"Tokenizing dataset...\")\ncols_to_remove = dataset.column_names\ntokenized_datasets = dataset.map(\n tokenize_function, \n batched=True, \n num_proc=4, \n remove_columns=cols_to_remove\n)\n\ndef group_texts(examples):\n concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}\n total_length = len(concatenated_examples[list(examples.keys())[0]])\n total_length = (total_length // block_size) * block_size\n result = {\n k: [t[i : i + block_size] for i in range(0, total_length, block_size)]\n for k, t in concatenated_examples.items()\n }\n result[\"labels\"] = result[\"input_ids\"].copy()\n return result\n\nprint(\"Grouping into blocks (512)...\")\nlm_datasets = tokenized_datasets.map(group_texts, batched=True, num_proc=4)\n\ntotal_tokens = len(lm_datasets) * block_size\ntokens_in_mrd = total_tokens / 1_000_000_000\n\nprint(\"-\" * 40)\nprint(f\"β
Token Count: {tokens_in_mrd:.6f} Mrd. ({total_tokens:,} Tokens)\")\nprint(\"-\" * 40)\n\nlm_datasets.save_to_disk(\"spark_v4_data\")\nprint(\"Training data saved in 'spark_v4_data' successfully!\")","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-26T08:40:06.565748Z","iopub.execute_input":"2026-04-26T08:40:06.566389Z","iopub.status.idle":"2026-04-26T08:40:06.572644Z","shell.execute_reply.started":"2026-04-26T08:40:06.566354Z","shell.execute_reply":"2026-04-26T08:40:06.571875Z"}},"outputs":[{"name":"stdout","text":"Writing prep_data.py\n","output_type":"stream"}],"execution_count":5},{"cell_type":"code","source":"!python3 prep_data.py","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-26T08:40:09.124594Z","iopub.execute_input":"2026-04-26T08:40:09.125422Z","iopub.status.idle":"2026-04-26T09:27:31.711360Z","shell.execute_reply.started":"2026-04-26T08:40:09.125383Z","shell.execute_reply":"2026-04-26T09:27:31.710512Z"}},"outputs":[{"name":"stdout","text":"Lade FineWeb-Edu (500.000 Beispiele fΓΌr den Data-Scaling-Effekt!)...\nWarning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.\nResolving data files: 100%|ββββββββββββββ| 2410/2410 [00:00<00:00, 30659.56it/s]\nsample/10BT/000_00000.parquet: 22%|ββ | 469M/2.15G [00:08<00:31, 53.4MB/s]\nsample/10BT/001_00000.parquet: 37%|βββ | 805M/2.15G [00:08<00:14, 94.3MB/s]\nsample/10BT/002_00000.parquet: 37%|βββ | 805M/2.15G [00:08<00:13, 96.5MB/s]\nsample/10BT/003_00000.parquet: 41%|ββββ | 872M/2.15G [00:08<00:13, 97.2MB/s]\nsample/10BT/004_00000.parquet: 31%|βββ | 670M/2.15G [00:08<00:19, 75.4MB/s]\nsample/10BT/005_00000.parquet: 22%|ββ | 469M/2.15G [00:08<00:31, 53.3MB/s]\nsample/10BT/006_00000.parquet: 78%|βββββββ | 1.68G/2.15G [00:14<00:04, 116MB/s]\nsample/10BT/007_00000.parquet: 36%|βββ | 768M/2.15G [00:09<00:16, 84.9MB/s]\nsample/10BT/008_00000.parquet: 48%|ββββ | 1.02G/2.15G [00:08<00:09, 114MB/s]\nsample/10BT/009_00000.parquet: 28%|βββ | 604M/2.15G [00:09<00:23, 65.0MB/s]\nsample/10BT/010_00000.parquet: 36%|βββ | 768M/2.15G [00:09<00:16, 84.2MB/s]\nsample/10BT/011_00000.parquet: 41%|ββββ | 872M/2.15G [00:08<00:13, 97.1MB/s]\nsample/10BT/012_00000.parquet: 44%|ββββ | 939M/2.15G [00:09<00:11, 103MB/s]\nsample/10BT/013_00000.parquet: 50%|βββββ | 268M/541M [00:03<00:03, 85.1MB/s]\nGenerating train split: 100%|β| 9672101/9672101 [04:37<00:00, 34909.16 examples/\nTokenisiere Datensatz (Multiprozessing aktiviert)...\nMap (num_proc=4): 100%|βββββββββ| 500000/500000 [10:19<00:00, 806.64 examples/s]\nGruppiere in BlΓΆcke (512)...\nMap (num_proc=4): 100%|βββββββββ| 500000/500000 [29:47<00:00, 279.74 examples/s]\n----------------------------------------\nβ
Token Count: 0.693131 Mrd. (693,130,752 Tokens)\n----------------------------------------\nSaving the dataset (19/19 shards): 100%|β| 1353771/1353771 [00:18<00:00, 71702.0\nTrainingsdaten in 'spark_v4_data' gespeichert!\n","output_type":"stream"}],"execution_count":6},{"cell_type":"code","source":"%%writefile train_model.py\nfrom transformers import LlamaConfig, LlamaForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling\nfrom transformers import PreTrainedTokenizerFast\nfrom datasets import load_from_disk\nimport torch\n\ntokenizer = PreTrainedTokenizerFast.from_pretrained(\"spark_tokenizer\")\ndataset = load_from_disk(\"spark_v4_data\")\n\nsplit_dataset = dataset.train_test_split(test_size=0.05, seed=42)\ntrain_dataset = split_dataset[\"train\"]\neval_dataset = split_dataset[\"test\"]\n\nconfig = LlamaConfig(\n vocab_size=len(tokenizer),\n hidden_size=256,\n intermediate_size=512,\n num_hidden_layers=6,\n num_attention_heads=8,\n max_position_embeddings=512,\n tie_word_embeddings=True, \n bos_token_id=tokenizer.bos_token_id,\n eos_token_id=tokenizer.eos_token_id,\n)\n\nmodel = LlamaForCausalLM(config)\nprint(f\"Model parameters: {model.num_parameters() / 1e6:.2f}M\")\n\ntraining_args = TrainingArguments(\n output_dir=\"./spark_v4_out\",\n eval_strategy=\"steps\",\n eval_steps=1500,\n logging_steps=100,\n save_steps=3000,\n learning_rate=1e-3,\n weight_decay=0.1,\n per_device_train_batch_size=128,\n per_device_eval_batch_size=128,\n max_steps=15000,\n lr_scheduler_type=\"cosine\",\n warmup_steps=1000,\n fp16=True, \n report_to=\"none\",\n optim=\"adamw_torch_fused\",\n dataloader_num_workers=4,\n dataloader_pin_memory=True,\n)\n\ndata_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)\n\ntrainer = Trainer(\n model=model,\n args=training_args,\n train_dataset=train_dataset,\n eval_dataset=eval_dataset,\n data_collator=data_collator,\n)\n\nprint(\"Starting v4 training...\")\ntrainer.train()\n\nmodel.save_pretrained(\"./spark_v4_final\")\ntokenizer.save_pretrained(\"./spark_v4_final\")\nprint(\"Training finished and v4 model saved successfully!\")","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-26T09:30:37.346192Z","iopub.execute_input":"2026-04-26T09:30:37.346497Z","iopub.status.idle":"2026-04-26T09:30:37.353238Z","shell.execute_reply.started":"2026-04-26T09:30:37.346471Z","shell.execute_reply":"2026-04-26T09:30:37.352371Z"}},"outputs":[{"name":"stdout","text":"Overwriting train_model.py\n","output_type":"stream"}],"execution_count":9},{"cell_type":"code","source":"!python3 train_model.py","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-26T09:30:39.118288Z","iopub.execute_input":"2026-04-26T09:30:39.119024Z"}},"outputs":[{"name":"stdout","text":"Loading dataset from disk: 100%|ββββββββββββββββ| 19/19 [00:01<00:00, 10.23it/s]\nModel parameters: 4.98M\nStarting v4 training...\n{'loss': '7.792', 'grad_norm': '0.7288', 'learning_rate': '9.9e-05', 'epoch': '0.0199'}\n{'loss': '6.79', 'grad_norm': '0.6882', 'learning_rate': '0.000199', 'epoch': '0.03981'}\n{'loss': '5.921', 'grad_norm': '1.194', 'learning_rate': '0.000299', 'epoch': '0.05971'}\n{'loss': '5.244', 'grad_norm': '1.192', 'learning_rate': '0.000399', 'epoch': '0.07962'}\n{'loss': '4.753', 'grad_norm': '1.035', 'learning_rate': '0.000499', 'epoch': '0.09952'}\n{'loss': '4.447', 'grad_norm': '0.9472', 'learning_rate': '0.000599', 'epoch': '0.1194'}\n{'loss': '4.254', 'grad_norm': '0.7994', 'learning_rate': '0.000699', 'epoch': '0.1393'}\n{'loss': '4.118', 'grad_norm': '1.023', 'learning_rate': '0.000799', 'epoch': '0.1592'}\n{'loss': '4.027', 'grad_norm': '0.6585', 'learning_rate': '0.000899', 'epoch': '0.1791'}\n{'loss': '3.937', 'grad_norm': '0.5435', 'learning_rate': '0.000999', 'epoch': '0.199'}\n{'loss': '3.845', 'grad_norm': '0.7014', 'learning_rate': '0.0009999', 'epoch': '0.2189'}\n{'loss': '3.757', 'grad_norm': '0.6224', 'learning_rate': '0.0009995', 'epoch': '0.2389'}\n{'loss': '3.698', 'grad_norm': '0.608', 'learning_rate': '0.0009989', 'epoch': '0.2588'}\n{'loss': '3.656', 'grad_norm': '0.5573', 'learning_rate': '0.000998', 'epoch': '0.2787'}\n{'loss': '3.621', 'grad_norm': '0.5656', 'learning_rate': '0.0009969', 'epoch': '0.2986'}\n 10%|ββββ | 1500/15000 [31:27<4:43:51, 1.26s/it]\n 0%| | 0/265 [00:00<?, ?it/s]\u001b[A\n 1%|β | 2/265 [00:00<00:58, 4.46it/s]\u001b[A\n 1%|β | 3/265 [00:00<01:23, 3.12it/s]\u001b[A\n 2%|β | 4/265 [00:01<01:39, 2.63it/s]\u001b[A\n 2%|β | 5/265 [00:01<01:45, 2.47it/s]\u001b[A\n 2%|β | 6/265 [00:02<01:49, 2.37it/s]\u001b[A\n 3%|ββ | 7/265 [00:02<01:52, 2.29it/s]\u001b[A\n 3%|ββ | 8/265 [00:03<01:53, 2.25it/s]\u001b[A\n 3%|ββ | 9/265 [00:03<01:54, 2.23it/s]\u001b[A\n 4%|ββ | 10/265 [00:04<01:55, 2.21it/s]\u001b[A\n 4%|ββ | 11/265 [00:04<01:55, 2.20it/s]\u001b[A\n 5%|ββ | 12/265 [00:05<01:56, 2.18it/s]\u001b[A\n 5%|ββ | 13/265 [00:05<01:55, 2.18it/s]\u001b[A\n 5%|βββ | 14/265 [00:05<01:56, 2.16it/s]\u001b[A\n 6%|βββ | 15/265 [00:06<01:55, 2.16it/s]\u001b[A\n 6%|βββ | 16/265 [00:06<01:55, 2.16it/s]\u001b[A\n 6%|βββ | 17/265 [00:07<01:54, 2.16it/s]\u001b[A\n 7%|βββ | 18/265 [00:07<01:54, 2.15it/s]\u001b[A\n 7%|βββ | 19/265 [00:08<01:53, 2.16it/s]\u001b[A\n 8%|ββββ | 20/265 [00:08<01:54, 2.15it/s]\u001b[A\n 8%|ββββ | 21/265 [00:09<01:53, 2.16it/s]\u001b[A\n 8%|ββββ | 22/265 [00:09<01:52, 2.16it/s]\u001b[A\n 9%|ββββ | 23/265 [00:10<01:51, 2.17it/s]\u001b[A\n 9%|ββββ | 24/265 [00:10<01:51, 2.17it/s]\u001b[A\n 9%|ββββ | 25/265 [00:11<01:51, 2.16it/s]\u001b[A\n 10%|ββββ | 26/265 [00:11<01:50, 2.16it/s]\u001b[A\n 10%|βββββ | 27/265 [00:12<01:50, 2.16it/s]\u001b[A\n 11%|βββββ | 28/265 [00:12<01:48, 2.17it/s]\u001b[A\n 11%|βββββ | 29/265 [00:12<01:48, 2.17it/s]\u001b[A\n 11%|βββββ | 30/265 [00:13<01:48, 2.16it/s]\u001b[A\n 12%|βββββ | 31/265 [00:13<01:48, 2.15it/s]\u001b[A\n 12%|βββββ | 32/265 [00:14<01:47, 2.16it/s]\u001b[A\n 12%|ββββββ | 33/265 [00:14<01:47, 2.16it/s]\u001b[A\n 13%|ββββββ | 34/265 [00:15<01:46, 2.16it/s]\u001b[A\n 13%|ββββββ | 35/265 [00:15<01:46, 2.17it/s]\u001b[A\n 14%|ββββββ | 36/265 [00:16<01:45, 2.17it/s]\u001b[A\n 14%|ββββββ | 37/265 [00:16<01:45, 2.17it/s]\u001b[A\n 14%|ββββββ | 38/265 [00:17<01:44, 2.17it/s]\u001b[A\n 15%|βββββββ | 39/265 [00:17<01:43, 2.18it/s]\u001b[A\n 15%|βββββββ | 40/265 [00:18<01:43, 2.17it/s]\u001b[A\n 15%|βββββββ | 41/265 [00:18<01:43, 2.16it/s]\u001b[A\n 16%|βββββββ | 42/265 [00:18<01:43, 2.15it/s]\u001b[A\n 16%|βββββββ | 43/265 [00:19<01:43, 2.15it/s]\u001b[A\n 17%|βββββββ | 44/265 [00:19<01:42, 2.15it/s]\u001b[A\n 17%|ββββββββ | 45/265 [00:20<01:41, 2.16it/s]\u001b[A\n 17%|ββββββββ | 46/265 [00:20<01:41, 2.16it/s]\u001b[A\n 18%|ββββββββ | 47/265 [00:21<01:40, 2.17it/s]\u001b[A\n 18%|ββββββββ | 48/265 [00:21<01:39, 2.17it/s]\u001b[A\n 18%|ββββββββ | 49/265 [00:22<01:39, 2.16it/s]\u001b[A\n 19%|ββββββββ | 50/265 [00:22<01:39, 2.15it/s]\u001b[A\n 19%|ββββββββ | 51/265 [00:23<01:39, 2.16it/s]\u001b[A\n 20%|βββββββββ | 52/265 [00:23<01:38, 2.16it/s]\u001b[A\n 20%|βββββββββ | 53/265 [00:24<01:38, 2.16it/s]\u001b[A\n 20%|βββββββββ | 54/265 [00:24<01:38, 2.15it/s]\u001b[A\n 21%|βββββββββ | 55/265 [00:24<01:37, 2.16it/s]\u001b[A\n 21%|βββββββββ | 56/265 [00:25<01:36, 2.17it/s]\u001b[A\n 22%|βββββββββ | 57/265 [00:25<01:36, 2.16it/s]\u001b[A\n 22%|ββββββββββ | 58/265 [00:26<01:35, 2.16it/s]\u001b[A\n 22%|ββββββββββ | 59/265 [00:26<01:35, 2.16it/s]\u001b[A\n 23%|ββββββββββ | 60/265 [00:27<01:34, 2.16it/s]\u001b[A\n 23%|ββββββββββ | 61/265 [00:27<01:33, 2.17it/s]\u001b[A\n 23%|ββββββββββ | 62/265 [00:28<01:33, 2.16it/s]\u001b[A\n 24%|ββββββββββ | 63/265 [00:28<01:33, 2.16it/s]\u001b[A\n 24%|βββββββββββ | 64/265 [00:29<01:32, 2.16it/s]\u001b[A\n 25%|βββββββββββ | 65/265 [00:29<01:32, 2.16it/s]\u001b[A\n 25%|βββββββββββ | 66/265 [00:30<01:32, 2.15it/s]\u001b[A\n 25%|βββββββββββ | 67/265 [00:30<01:31, 2.16it/s]\u001b[A\n 26%|βββββββββββ | 68/265 [00:30<01:31, 2.16it/s]\u001b[A\n 26%|βββββββββββ | 69/265 [00:31<01:30, 2.17it/s]\u001b[A\n 26%|βββββββββββ | 70/265 [00:31<01:30, 2.16it/s]\u001b[A\n 27%|ββββββββββββ | 71/265 [00:32<01:30, 2.16it/s]\u001b[A\n 27%|ββββββββββββ | 72/265 [00:32<01:29, 2.16it/s]\u001b[A\n 28%|ββββββββββββ | 73/265 [00:33<01:28, 2.16it/s]\u001b[A\n 28%|ββββββββββββ | 74/265 [00:33<01:28, 2.15it/s]\u001b[A\n 28%|ββββββββββββ | 75/265 [00:34<01:28, 2.14it/s]\u001b[A\n 29%|ββββββββββββ | 76/265 [00:34<01:27, 2.16it/s]\u001b[A\n 29%|βββββββββββββ | 77/265 [00:35<01:27, 2.16it/s]\u001b[A\n 29%|βββββββββββββ | 78/265 [00:35<01:26, 2.16it/s]\u001b[A\n 30%|βββββββββββββ | 79/265 [00:36<01:26, 2.15it/s]\u001b[A\n 30%|βββββββββββββ | 80/265 [00:36<01:25, 2.16it/s]\u001b[A\n 31%|βββββββββββββ | 81/265 [00:37<01:25, 2.15it/s]\u001b[A\n 31%|βββββββββββββ | 82/265 [00:37<01:24, 2.17it/s]\u001b[A\n 31%|ββββββββββββββ | 83/265 [00:37<01:24, 2.15it/s]\u001b[A\n 32%|ββββββββββββββ | 84/265 [00:38<01:23, 2.16it/s]\u001b[A\n 32%|ββββββββββββββ | 85/265 [00:38<01:23, 2.16it/s]\u001b[A\n 32%|ββββββββββββββ | 86/265 [00:39<01:22, 2.16it/s]\u001b[A\n 33%|ββββββββββββββ | 87/265 [00:39<01:22, 2.15it/s]\u001b[A\n 33%|ββββββββββββββ | 88/265 [00:40<01:22, 2.16it/s]\u001b[A\n 34%|ββββββββββββββ | 89/265 [00:40<01:21, 2.16it/s]\u001b[A\n 34%|βββββββββββββββ | 90/265 [00:41<01:20, 2.16it/s]\u001b[A\n 34%|βββββββββββββββ | 91/265 [00:41<01:20, 2.16it/s]\u001b[A\n 35%|βββββββββββββββ | 92/265 [00:42<01:19, 2.17it/s]\u001b[A\n 35%|βββββββββββββββ | 93/265 [00:42<01:19, 2.16it/s]\u001b[A\n 35%|βββββββββββββββ | 94/265 [00:43<01:19, 2.15it/s]\u001b[A\n 36%|βββββββββββββββ | 95/265 [00:43<01:18, 2.16it/s]\u001b[A\n 36%|ββββββββββββββββ | 96/265 [00:43<01:18, 2.16it/s]\u001b[A\n 37%|ββββββββββββββββ | 97/265 [00:44<01:17, 2.15it/s]\u001b[A\n 37%|ββββββββββββββββ | 98/265 [00:44<01:17, 2.14it/s]\u001b[A\n 37%|ββββββββββββββββ | 99/265 [00:45<01:17, 2.15it/s]\u001b[A\n 38%|ββββββββββββββββ | 100/265 [00:45<01:17, 2.14it/s]\u001b[A\n 38%|ββββββββββββββββ | 101/265 [00:46<01:16, 2.15it/s]\u001b[A\n 38%|ββββββββββββββββ | 102/265 [00:46<01:15, 2.16it/s]\u001b[A\n 39%|ββββββββββββββββ | 103/265 [00:47<01:14, 2.16it/s]\u001b[A\n 39%|ββββββββββββββββ | 104/265 [00:47<01:14, 2.16it/s]\u001b[A\n 40%|βββββββββββββββββ | 105/265 [00:48<01:14, 2.16it/s]\u001b[A\n 40%|βββββββββββββββββ | 106/265 [00:48<01:13, 2.16it/s]\u001b[A\n 40%|βββββββββββββββββ | 107/265 [00:49<01:13, 2.16it/s]\u001b[A\n 41%|βββββββββββββββββ | 108/265 [00:49<01:12, 2.15it/s]\u001b[A\n 41%|βββββββββββββββββ | 109/265 [00:50<01:12, 2.14it/s]\u001b[A\n 42%|βββββββββββββββββ | 110/265 [00:50<01:12, 2.14it/s]\u001b[A\n 42%|ββββββββββββββββββ | 111/265 [00:50<01:11, 2.15it/s]\u001b[A\n 42%|ββββββββββββββββββ | 112/265 [00:51<01:11, 2.15it/s]\u001b[A\n 43%|ββββββββββββββββββ | 113/265 [00:51<01:10, 2.16it/s]\u001b[A\n 43%|ββββββββββββββββββ | 114/265 [00:52<01:09, 2.16it/s]\u001b[A\n 43%|ββββββββββββββββββ | 115/265 [00:52<01:09, 2.16it/s]\u001b[A\n 44%|ββββββββββββββββββ | 116/265 [00:53<01:09, 2.15it/s]\u001b[A\n 44%|ββββββββββββββββββ | 117/265 [00:53<01:08, 2.16it/s]\u001b[A\n 45%|βββββββββββββββββββ | 118/265 [00:54<01:08, 2.16it/s]\u001b[A\n 45%|βββββββββββββββββββ | 119/265 [00:54<01:08, 2.14it/s]\u001b[A\n 45%|βββββββββββββββββββ | 120/265 [00:55<01:07, 2.16it/s]\u001b[A\n 46%|βββββββββββββββββββ | 121/265 [00:55<01:06, 2.15it/s]\u001b[A\n 46%|βββββββββββββββββββ | 122/265 [00:56<01:06, 2.15it/s]\u001b[A\n 46%|βββββββββββββββββββ | 123/265 [00:56<01:05, 2.16it/s]\u001b[A\n 47%|ββββββββββββββββββββ | 124/265 [00:56<01:05, 2.17it/s]\u001b[A\n 47%|ββββββββββββββββββββ | 125/265 [00:57<01:04, 2.16it/s]\u001b[A\n 48%|ββββββββββββββββββββ | 126/265 [00:57<01:04, 2.16it/s]\u001b[A\n 48%|ββββββββββββββββββββ | 127/265 [00:58<01:03, 2.16it/s]\u001b[A\n 48%|ββββββββββββββββββββ | 128/265 [00:58<01:03, 2.17it/s]\u001b[A\n 49%|ββββββββββββββββββββ | 129/265 [00:59<01:02, 2.17it/s]\u001b[A\n 49%|ββββββββββββββββββββ | 130/265 [00:59<01:02, 2.16it/s]\u001b[A\n 49%|βββββββββββββββββββββ | 131/265 [01:00<01:01, 2.16it/s]\u001b[A\n 50%|βββββββββββββββββββββ | 132/265 [01:00<01:01, 2.17it/s]\u001b[A\n 50%|βββββββββββββββββββββ | 133/265 [01:01<01:00, 2.17it/s]\u001b[A\n 51%|βββββββββββββββββββββ | 134/265 [01:01<01:00, 2.16it/s]\u001b[A\n 51%|βββββββββββββββββββββ | 135/265 [01:02<01:00, 2.17it/s]\u001b[A\n 51%|βββββββββββββββββββββ | 136/265 [01:02<00:59, 2.17it/s]\u001b[A\n 52%|ββββββββββββββββββββββ | 137/265 [01:02<00:58, 2.17it/s]\u001b[A\n 52%|ββββββββββββββββββββββ | 138/265 [01:03<00:58, 2.16it/s]\u001b[A\n 52%|ββββββββββββββββββββββ | 139/265 [01:03<00:58, 2.17it/s]\u001b[A\n 53%|ββββββββββββββββββββββ | 140/265 [01:04<00:57, 2.17it/s]\u001b[A\n 53%|ββββββββββββββββββββββ | 141/265 [01:04<00:57, 2.17it/s]\u001b[A\n 54%|ββββββββββββββββββββββ | 142/265 [01:05<00:56, 2.17it/s]\u001b[A\n 54%|ββββββββββββββββββββββ | 143/265 [01:05<00:56, 2.16it/s]\u001b[A\n 54%|βββββββββββββββββββββββ | 144/265 [01:06<00:56, 2.16it/s]\u001b[A\n 55%|βββββββββββββββββββββββ | 145/265 [01:06<00:55, 2.16it/s]\u001b[A\n 55%|βββββββββββββββββββββββ | 146/265 [01:07<00:54, 2.17it/s]\u001b[A\n 55%|βββββββββββββββββββββββ | 147/265 [01:07<00:54, 2.17it/s]\u001b[A\n 56%|βββββββββββββββββββββββ | 148/265 [01:08<00:54, 2.16it/s]\u001b[A\n 56%|βββββββββββββββββββββββ | 149/265 [01:08<00:53, 2.16it/s]\u001b[A\n 57%|ββββββββββββββββββββββββ | 150/265 [01:08<00:53, 2.15it/s]\u001b[A\n 57%|ββββββββββββββββββββββββ | 151/265 [01:09<00:53, 2.15it/s]\u001b[A\n 57%|ββββββββββββββββββββββββ | 152/265 [01:09<00:52, 2.15it/s]\u001b[A\n 58%|ββββββββββββββββββββββββ | 153/265 [01:10<00:51, 2.16it/s]\u001b[A\n 58%|ββββββββββββββββββββββββ | 154/265 [01:10<00:51, 2.14it/s]\u001b[A\n 58%|ββββββββββββββββββββββββ | 155/265 [01:11<00:51, 2.15it/s]\u001b[A\n 59%|βββββββββββββββββββββββββ | 156/265 [01:11<00:50, 2.15it/s]\u001b[A\n 59%|βββββββββββββββββββββββββ | 157/265 [01:12<00:50, 2.15it/s]\u001b[A\n 60%|βββββββββββββββββββββββββ | 158/265 [01:12<00:49, 2.15it/s]\u001b[A\n 60%|βββββββββββββββββββββββββ | 159/265 [01:13<00:49, 2.16it/s]\u001b[A\n 60%|βββββββββββββββββββββββββ | 160/265 [01:13<00:48, 2.15it/s]\u001b[A\n 61%|βββββββββββββββββββββββββ | 161/265 [01:14<00:47, 2.17it/s]\u001b[A\n 61%|βββββββββββββββββββββββββ | 162/265 [01:14<00:47, 2.15it/s]\u001b[A\n 62%|ββββββββββββββββββββββββββ | 163/265 [01:15<00:47, 2.16it/s]\u001b[A\n 62%|ββββββββββββββββββββββββββ | 164/265 [01:15<00:47, 2.15it/s]\u001b[A\n 62%|ββββββββββββββββββββββββββ | 165/265 [01:15<00:46, 2.16it/s]\u001b[A\n 63%|ββββββββββββββββββββββββββ | 166/265 [01:16<00:46, 2.15it/s]\u001b[A\n 63%|ββββββββββββββββββββββββββ | 167/265 [01:16<00:45, 2.14it/s]\u001b[A\n 63%|ββββββββββββββββββββββββββ | 168/265 [01:17<00:44, 2.16it/s]\u001b[A\n 64%|βββββββββββββββββββββββββββ | 169/265 [01:17<00:44, 2.16it/s]\u001b[A\n 64%|βββββββββββββββββββββββββββ | 170/265 [01:18<00:44, 2.15it/s]\u001b[A\n 65%|βββββββββββββββββββββββββββ | 171/265 [01:18<00:44, 2.13it/s]\u001b[A\n 65%|βββββββββββββββββββββββββββ | 172/265 [01:19<00:43, 2.16it/s]\u001b[A\n 65%|βββββββββββββββββββββββββββ | 173/265 [01:19<00:42, 2.16it/s]\u001b[A\n 66%|βββββββββββββββββββββββββββ | 174/265 [01:20<00:42, 2.15it/s]\u001b[A\n 66%|βββββββββββββββββββββββββββ | 175/265 [01:20<00:41, 2.16it/s]\u001b[A\n 66%|ββββββββββββββββββββββββββββ | 176/265 [01:21<00:41, 2.17it/s]\u001b[A\n 67%|ββββββββββββββββββββββββββββ | 177/265 [01:21<00:40, 2.17it/s]\u001b[A\n 67%|ββββββββββββββββββββββββββββ | 178/265 [01:21<00:40, 2.16it/s]\u001b[A\n 68%|ββββββββββββββββββββββββββββ | 179/265 [01:22<00:39, 2.16it/s]\u001b[A\n 68%|ββββββββββββββββββββββββββββ | 180/265 [01:22<00:39, 2.15it/s]\u001b[A\n 68%|ββββββββββββββββββββββββββββ | 181/265 [01:23<00:38, 2.16it/s]\u001b[A\n 69%|βββββββββββββββββββββββββββββ | 182/265 [01:23<00:38, 2.16it/s]\u001b[A\n 69%|βββββββββββββββββββββββββββββ | 183/265 [01:24<00:38, 2.15it/s]\u001b[A\n 69%|βββββββββββββββββββββββββββββ | 184/265 [01:24<00:37, 2.17it/s]\u001b[A\n 70%|βββββββββββββββββββββββββββββ | 185/265 [01:25<00:37, 2.16it/s]\u001b[A\n 70%|βββββββββββββββββββββββββββββ | 186/265 [01:25<00:36, 2.16it/s]\u001b[A\n 71%|βββββββββββββββββββββββββββββ | 187/265 [01:26<00:36, 2.16it/s]\u001b[A\n 71%|βββββββββββββββββββββββββββββ | 188/265 [01:26<00:35, 2.16it/s]\u001b[A\n 71%|ββββββββββββββββββββββββββββββ | 189/265 [01:27<00:35, 2.16it/s]\u001b[A\n 72%|ββββββββββββββββββββββββββββββ | 190/265 [01:27<00:34, 2.16it/s]\u001b[A\n 72%|ββββββββββββββββββββββββββββββ | 191/265 [01:27<00:34, 2.17it/s]\u001b[A\n 72%|ββββββββββββββββββββββββββββββ | 192/265 [01:28<00:33, 2.16it/s]\u001b[A\n 73%|ββββββββββββββββββββββββββββββ | 193/265 [01:28<00:33, 2.16it/s]\u001b[A\n 73%|ββββββββββββββββββββββββββββββ | 194/265 [01:29<00:32, 2.16it/s]\u001b[A\n 74%|βββββββββββββββββββββββββββββββ | 195/265 [01:29<00:32, 2.16it/s]\u001b[A\n 74%|βββββββββββββββββββββββββββββββ | 196/265 [01:30<00:31, 2.17it/s]\u001b[A\n 74%|βββββββββββββββββββββββββββββββ | 197/265 [01:30<00:31, 2.17it/s]\u001b[A\n 75%|βββββββββββββββββββββββββββββββ | 198/265 [01:31<00:30, 2.17it/s]\u001b[A\n 75%|βββββββββββββββββββββββββββββββ | 199/265 [01:31<00:30, 2.16it/s]\u001b[A\n 75%|βββββββββββββββββββββββββββββββ | 200/265 [01:32<00:30, 2.16it/s]\u001b[A\n 76%|βββββββββββββββββββββββββββββββ | 201/265 [01:32<00:29, 2.15it/s]\u001b[A\n 76%|ββββββββββββββββββββββββββββββββ | 202/265 [01:33<00:29, 2.16it/s]\u001b[A\n 77%|ββββββββββββββββββββββββββββββββ | 203/265 [01:33<00:28, 2.15it/s]\u001b[A\n 77%|ββββββββββββββββββββββββββββββββ | 204/265 [01:34<00:28, 2.17it/s]\u001b[A\n 77%|ββββββββββββββββββββββββββββββββ | 205/265 [01:34<00:27, 2.17it/s]\u001b[A\n 78%|ββββββββββββββββββββββββββββββββ | 206/265 [01:34<00:27, 2.16it/s]\u001b[A\n 78%|ββββββββββββββββββββββββββββββββ | 207/265 [01:35<00:26, 2.17it/s]\u001b[A\n 78%|βββββββββββββββββββββββββββββββββ | 208/265 [01:35<00:26, 2.17it/s]\u001b[A\n 79%|βββββββββββββββββββββββββββββββββ | 209/265 [01:36<00:25, 2.17it/s]\u001b[A\n 79%|βββββββββββββββββββββββββββββββββ | 210/265 [01:36<00:25, 2.17it/s]\u001b[A\n 80%|βββββββββββββββββββββββββββββββββ | 211/265 [01:37<00:24, 2.17it/s]\u001b[A\n 80%|βββββββββββββββββββββββββββββββββ | 212/265 [01:37<00:24, 2.16it/s]\u001b[A\n 80%|βββββββββββββββββββββββββββββββββ | 213/265 [01:38<00:24, 2.16it/s]\u001b[A\n 81%|βββββββββββββββββββββββββββββββββ | 214/265 [01:38<00:23, 2.16it/s]\u001b[A\n 81%|ββββββββββββββββββββββββββββββββββ | 215/265 [01:39<00:23, 2.16it/s]\u001b[A\n 82%|ββββββββββββββββββββββββββββββββββ | 216/265 [01:39<00:22, 2.16it/s]\u001b[A\n 82%|ββββββββββββββββββββββββββββββββββ | 217/265 [01:40<00:22, 2.17it/s]\u001b[A\n 82%|ββββββββββββββββββββββββββββββββββ | 218/265 [01:40<00:21, 2.15it/s]\u001b[A\n 83%|ββββββββββββββββββββββββββββββββββ | 219/265 [01:40<00:21, 2.15it/s]\u001b[A\n 83%|ββββββββββββββββββββββββββββββββββ | 220/265 [01:41<00:20, 2.16it/s]\u001b[A\n 83%|βββββββββββββββββββββββββββββββββββ | 221/265 [01:41<00:20, 2.16it/s]\u001b[A\n 84%|βββββββββββββββββββββββββββββββββββ | 222/265 [01:42<00:20, 2.14it/s]\u001b[A\n 84%|βββββββββββββββββββββββββββββββββββ | 223/265 [01:42<00:19, 2.15it/s]\u001b[A\n 85%|βββββββββββββββββββββββββββββββββββ | 224/265 [01:43<00:19, 2.15it/s]\u001b[A\n 85%|βββββββββββββββββββββββββββββββββββ | 225/265 [01:43<00:18, 2.15it/s]\u001b[A\n 85%|βββββββββββββββββββββββββββββββββββ | 226/265 [01:44<00:18, 2.15it/s]\u001b[A\n 86%|βββββββββββββββββββββββββββββββββββ | 227/265 [01:44<00:17, 2.15it/s]\u001b[A\n 86%|ββββββββββββββββββββββββββββββββββββ | 228/265 [01:45<00:17, 2.15it/s]\u001b[A\n 86%|ββββββββββββββββββββββββββββββββββββ | 229/265 [01:45<00:16, 2.14it/s]\u001b[A\n 87%|ββββββββββββββββββββββββββββββββββββ | 230/265 [01:46<00:16, 2.14it/s]\u001b[A\n 87%|ββββββββββββββββββββββββββββββββββββ | 231/265 [01:46<00:15, 2.15it/s]\u001b[A\n 88%|ββββββββββββββββββββββββββββββββββββ | 232/265 [01:46<00:15, 2.15it/s]\u001b[A\n 88%|ββββββββββββββββββββββββββββββββββββ | 233/265 [01:47<00:14, 2.15it/s]\u001b[A\n 88%|βββββββββββββββββββββββββββββββββββββ | 234/265 [01:47<00:14, 2.14it/s]\u001b[A\n 89%|βββββββββββββββββββββββββββββββββββββ | 235/265 [01:48<00:13, 2.16it/s]\u001b[A\n 89%|βββββββββββββββββββββββββββββββββββββ | 236/265 [01:48<00:13, 2.16it/s]\u001b[A\n 89%|βββββββββββββββββββββββββββββββββββββ | 237/265 [01:49<00:12, 2.16it/s]\u001b[A\n 90%|βββββββββββββββββββββββββββββββββββββ | 238/265 [01:49<00:12, 2.16it/s]\u001b[A\n 90%|βββββββββββββββββββββββββββββββββββββ | 239/265 [01:50<00:12, 2.15it/s]\u001b[A\n 91%|ββββββββββββββββββββββββββββββββββββββ | 240/265 [01:50<00:11, 2.16it/s]\u001b[A\n 91%|ββββββββββββββββββββββββββββββββββββββ | 241/265 [01:51<00:11, 2.15it/s]\u001b[A\n 91%|ββββββββββββββββββββββββββββββββββββββ | 242/265 [01:51<00:10, 2.15it/s]\u001b[A\n 92%|ββββββββββββββββββββββββββββββββββββββ | 243/265 [01:52<00:10, 2.16it/s]\u001b[A\n 92%|ββββββββββββββββββββββββββββββββββββββ | 244/265 [01:52<00:09, 2.15it/s]\u001b[A\n 92%|ββββββββββββββββββββββββββββββββββββββ | 245/265 [01:53<00:09, 2.16it/s]\u001b[A\n 93%|ββββββββββββββββββββββββββββββββββββββ | 246/265 [01:53<00:08, 2.16it/s]\u001b[A\n 93%|βββββββββββββββββββββββββββββββββββββββ | 247/265 [01:53<00:08, 2.17it/s]\u001b[A\n 94%|βββββββββββββββββββββββββββββββββββββββ | 248/265 [01:54<00:07, 2.16it/s]\u001b[A\n 94%|βββββββββββββββββββββββββββββββββββββββ | 249/265 [01:54<00:07, 2.16it/s]\u001b[A\n 94%|βββββββββββββββββββββββββββββββββββββββ | 250/265 [01:55<00:06, 2.15it/s]\u001b[A\n 95%|βββββββββββββββββββββββββββββββββββββββ | 251/265 [01:55<00:06, 2.15it/s]\u001b[A\n 95%|βββββββββββββββββββββββββββββββββββββββ | 252/265 [01:56<00:06, 2.16it/s]\u001b[A\n 95%|ββββββββββββββββββββββββββββββββββββββββ | 253/265 [01:56<00:05, 2.17it/s]\u001b[A\n 96%|ββββββββββββββββββββββββββββββββββββββββ | 254/265 [01:57<00:05, 2.16it/s]\u001b[A\n 96%|ββββββββββββββββββββββββββββββββββββββββ | 255/265 [01:57<00:04, 2.15it/s]\u001b[A\n 97%|ββββββββββββββββββββββββββββββββββββββββ | 256/265 [01:58<00:04, 2.16it/s]\u001b[A\n 97%|ββββββββββββββββββββββββββββββββββββββββ | 257/265 [01:58<00:03, 2.17it/s]\u001b[A\n 97%|ββββββββββββββββββββββββββββββββββββββββ | 258/265 [01:59<00:03, 2.17it/s]\u001b[A\n 98%|ββββββββββββββββββββββββββββββββββββββββ | 259/265 [01:59<00:02, 2.17it/s]\u001b[A\n 98%|βββββββββββββββββββββββββββββββββββββββββ| 260/265 [01:59<00:02, 2.16it/s]\u001b[A\n 98%|βββββββββββββββββββββββββββββββββββββββββ| 261/265 [02:00<00:01, 2.17it/s]\u001b[A\n 99%|βββββββββββββββββββββββββββββββββββββββββ| 262/265 [02:00<00:01, 2.17it/s]\u001b[A\n 99%|βββββββββββββββββββββββββββββββββββββββββ| 263/265 [02:01<00:00, 2.17it/s]\u001b[A\n100%|βββββββββββββββββββββββββββββββββββββββββ| 264/265 [02:01<00:00, 2.17it/s]\u001b[A\n \u001b[A\n\u001b[A{'eval_loss': '3.606', 'eval_runtime': '124', 'eval_samples_per_second': '545.7', 'eval_steps_per_second': '2.136', 'epoch': '0.2986'}\n 10%|ββββ | 1500/15000 [33:31<4:43:51, 1.26s/it]\n100%|βββββββββββββββββββββββββββββββββββββββββ| 265/265 [02:02<00:00, 1.87it/s]\u001b[A\n{'loss': '3.591', 'grad_norm': '0.5037', 'learning_rate': '0.0009955', 'epoch': '0.3185'}\n{'loss': '3.562', 'grad_norm': '0.4682', 'learning_rate': '0.0009939', 'epoch': '0.3384'}\n{'loss': '3.54', 'grad_norm': '0.5063', 'learning_rate': '0.000992', 'epoch': '0.3583'}\n{'loss': '3.516', 'grad_norm': '0.5929', 'learning_rate': '0.0009899', 'epoch': '0.3782'}\n{'loss': '3.496', 'grad_norm': '0.565', 'learning_rate': '0.0009875', 'epoch': '0.3981'}\n{'loss': '3.479', 'grad_norm': '0.486', 'learning_rate': '0.0009849', 'epoch': '0.418'}\n{'loss': '3.464', 'grad_norm': '0.5579', 'learning_rate': '0.000982', 'epoch': '0.4379'}\n{'loss': '3.452', 'grad_norm': '0.4419', 'learning_rate': '0.0009789', 'epoch': '0.4578'}\n{'loss': '3.44', 'grad_norm': '0.5291', 'learning_rate': '0.0009756', 'epoch': '0.4777'}\n{'loss': '3.432', 'grad_norm': '0.463', 'learning_rate': '0.000972', 'epoch': '0.4976'}\n{'loss': '3.417', 'grad_norm': '0.4299', 'learning_rate': '0.0009682', 'epoch': '0.5175'}\n{'loss': '3.409', 'grad_norm': '0.4442', 'learning_rate': '0.0009641', 'epoch': '0.5374'}\n{'loss': '3.401', 'grad_norm': '0.4859', 'learning_rate': '0.0009598', 'epoch': '0.5573'}\n{'loss': '3.391', 'grad_norm': '0.4831', 'learning_rate': '0.0009553', 'epoch': '0.5772'}\n{'loss': '3.383', 'grad_norm': '0.4141', 'learning_rate': '0.0009505', 'epoch': '0.5971'}\n 20%|βββββββ | 3000/15000 [1:05:03<4:12:24, 1.26s/it]\n 0%| | 0/265 [00:00<?, ?it/s]\u001b[A\n 1%|β | 2/265 [00:00<01:01, 4.30it/s]\u001b[A\n 1%|β | 3/265 [00:00<01:26, 3.03it/s]\u001b[A\n 2%|β | 4/265 [00:01<01:38, 2.65it/s]\u001b[A\n 2%|β | 5/265 [00:01<01:45, 2.46it/s]\u001b[A\n 2%|β | 6/265 [00:02<01:50, 2.35it/s]\u001b[A\n 3%|ββ | 7/265 [00:02<01:53, 2.28it/s]\u001b[A\n 3%|ββ | 8/265 [00:03<01:54, 2.25it/s]\u001b[A\n 3%|ββ | 9/265 [00:03<01:55, 2.22it/s]\u001b[A\n 4%|ββ | 10/265 [00:04<01:56, 2.19it/s]\u001b[A\n 4%|ββ | 11/265 [00:04<01:56, 2.19it/s]\u001b[A\n 5%|ββ | 12/265 [00:05<01:55, 2.19it/s]\u001b[A\n 5%|ββ | 13/265 [00:05<01:55, 2.18it/s]\u001b[A\n 5%|βββ | 14/265 [00:06<01:55, 2.17it/s]\u001b[A\n 6%|βββ | 15/265 [00:06<01:54, 2.18it/s]\u001b[A\n 6%|βββ | 16/265 [00:06<01:54, 2.18it/s]\u001b[A\n 6%|βββ | 17/265 [00:07<01:54, 2.17it/s]\u001b[A\n 7%|βββ | 18/265 [00:07<01:53, 2.17it/s]\u001b[A\n 7%|βββ | 19/265 [00:08<01:52, 2.18it/s]\u001b[A\n 8%|ββββ | 20/265 [00:08<01:52, 2.17it/s]\u001b[A\n 8%|ββββ | 21/265 [00:09<01:52, 2.17it/s]\u001b[A\n 8%|ββββ | 22/265 [00:09<01:51, 2.17it/s]\u001b[A\n 9%|ββββ | 23/265 [00:10<01:51, 2.17it/s]\u001b[A\n 9%|ββββ | 24/265 [00:10<01:51, 2.17it/s]\u001b[A\n 9%|ββββ | 25/265 [00:11<01:50, 2.18it/s]\u001b[A\n 10%|ββββ | 26/265 [00:11<01:49, 2.18it/s]\u001b[A\n 10%|βββββ | 27/265 [00:11<01:49, 2.17it/s]\u001b[A\n 11%|βββββ | 28/265 [00:12<01:49, 2.17it/s]\u001b[A\n 11%|βββββ | 29/265 [00:12<01:48, 2.17it/s]\u001b[A\n 11%|βββββ | 30/265 [00:13<01:47, 2.18it/s]\u001b[A\n 12%|βββββ | 31/265 [00:13<01:47, 2.17it/s]\u001b[A\n 12%|βββββ | 32/265 [00:14<01:47, 2.17it/s]\u001b[A\n 12%|ββββββ | 33/265 [00:14<01:46, 2.17it/s]\u001b[A\n 13%|ββββββ | 34/265 [00:15<01:46, 2.17it/s]\u001b[A\n 13%|ββββββ | 35/265 [00:15<01:46, 2.16it/s]\u001b[A\n 14%|ββββββ | 36/265 [00:16<01:45, 2.17it/s]\u001b[A\n 14%|ββββββ | 37/265 [00:16<01:45, 2.17it/s]\u001b[A\n 14%|ββββββ | 38/265 [00:17<01:45, 2.16it/s]\u001b[A\n 15%|βββββββ | 39/265 [00:17<01:44, 2.15it/s]\u001b[A\n 15%|βββββββ | 40/265 [00:17<01:43, 2.17it/s]\u001b[A\n 15%|βββββββ | 41/265 [00:18<01:43, 2.17it/s]\u001b[A\n 16%|βββββββ | 42/265 [00:18<01:43, 2.16it/s]\u001b[A\n 16%|βββββββ | 43/265 [00:19<01:43, 2.15it/s]\u001b[A\n 17%|βββββββ | 44/265 [00:19<01:42, 2.17it/s]\u001b[A\n 17%|ββββββββ | 45/265 [00:20<01:41, 2.17it/s]\u001b[A\n 17%|ββββββββ | 46/265 [00:20<01:41, 2.16it/s]\u001b[A\n 18%|ββββββββ | 47/265 [00:21<01:41, 2.16it/s]\u001b[A\n 18%|ββββββββ | 48/265 [00:21<01:40, 2.16it/s]\u001b[A\n 18%|ββββββββ | 49/265 [00:22<01:39, 2.17it/s]\u001b[A\n 19%|ββββββββ | 50/265 [00:22<01:39, 2.15it/s]\u001b[A\n 19%|ββββββββ | 51/265 [00:23<01:39, 2.15it/s]\u001b[A\n 20%|βββββββββ | 52/265 [00:23<01:38, 2.16it/s]\u001b[A\n 20%|βββββββββ | 53/265 [00:24<01:37, 2.16it/s]\u001b[A\n 20%|βββββββββ | 54/265 [00:24<01:37, 2.15it/s]\u001b[A\n 21%|βββββββββ | 55/265 [00:24<01:37, 2.16it/s]\u001b[A\n 21%|βββββββββ | 56/265 [00:25<01:36, 2.16it/s]\u001b[A\n 22%|βββββββββ | 57/265 [00:25<01:35, 2.17it/s]\u001b[A\n 22%|ββββββββββ | 58/265 [00:26<01:35, 2.17it/s]\u001b[A\n 22%|ββββββββββ | 59/265 [00:26<01:35, 2.15it/s]\u001b[A\n 23%|ββββββββββ | 60/265 [00:27<01:35, 2.15it/s]\u001b[A\n 23%|ββββββββββ | 61/265 [00:27<01:34, 2.16it/s]\u001b[A\n 23%|ββββββββββ | 62/265 [00:28<01:34, 2.16it/s]\u001b[A\n 24%|ββββββββββ | 63/265 [00:28<01:34, 2.14it/s]\u001b[A\n 24%|βββββββββββ | 64/265 [00:29<01:33, 2.16it/s]\u001b[A\n 25%|βββββββββββ | 65/265 [00:29<01:32, 2.16it/s]\u001b[A\n 25%|βββββββββββ | 66/265 [00:30<01:32, 2.15it/s]\u001b[A\n 25%|βββββββββββ | 67/265 [00:30<01:31, 2.15it/s]\u001b[A\n 26%|βββββββββββ | 68/265 [00:30<01:31, 2.16it/s]\u001b[A\n 26%|βββββββββββ | 69/265 [00:31<01:30, 2.16it/s]\u001b[A\n 26%|βββββββββββ | 70/265 [00:31<01:29, 2.17it/s]\u001b[A\n 27%|ββββββββββββ | 71/265 [00:32<01:30, 2.15it/s]\u001b[A\n 27%|ββββββββββββ | 72/265 [00:32<01:29, 2.17it/s]\u001b[A\n 28%|ββββββββββββ | 73/265 [00:33<01:29, 2.15it/s]\u001b[A\n 28%|ββββββββββββ | 74/265 [00:33<01:28, 2.16it/s]\u001b[A\n 28%|ββββββββββββ | 75/265 [00:34<01:28, 2.16it/s]\u001b[A\n 29%|ββββββββββββ | 76/265 [00:34<01:27, 2.16it/s]\u001b[A\n 29%|βββββββββββββ | 77/265 [00:35<01:27, 2.15it/s]\u001b[A\n 29%|βββββββββββββ | 78/265 [00:35<01:27, 2.15it/s]\u001b[A\n 30%|βββββββββββββ | 79/265 [00:36<01:26, 2.16it/s]\u001b[A\n 30%|βββββββββββββ | 80/265 [00:36<01:25, 2.16it/s]\u001b[A\n 31%|βββββββββββββ | 81/265 [00:36<01:25, 2.16it/s]\u001b[A\n 31%|βββββββββββββ | 82/265 [00:37<01:24, 2.16it/s]\u001b[A\n 31%|ββββββββββββββ | 83/265 [00:37<01:23, 2.17it/s]\u001b[A\n 32%|ββββββββββββββ | 84/265 [00:38<01:23, 2.16it/s]\u001b[A\n 32%|ββββββββββββββ | 85/265 [00:38<01:23, 2.17it/s]\u001b[A\n 32%|ββββββββββββββ | 86/265 [00:39<01:22, 2.17it/s]\u001b[A\n 33%|ββββββββββββββ | 87/265 [00:39<01:22, 2.17it/s]\u001b[A\n 33%|ββββββββββββββ | 88/265 [00:40<01:22, 2.15it/s]\u001b[A\n 34%|ββββββββββββββ | 89/265 [00:40<01:21, 2.15it/s]\u001b[A\n 34%|βββββββββββββββ | 90/265 [00:41<01:21, 2.15it/s]\u001b[A\n 34%|βββββββββββββββ | 91/265 [00:41<01:20, 2.16it/s]\u001b[A\n 35%|βββββββββββββββ | 92/265 [00:42<01:20, 2.15it/s]\u001b[A\n 35%|βββββββββββββββ | 93/265 [00:42<01:19, 2.15it/s]\u001b[A\n 35%|βββββββββββββββ | 94/265 [00:43<01:19, 2.15it/s]\u001b[A\n 36%|βββββββββββββββ | 95/265 [00:43<01:19, 2.15it/s]\u001b[A\n 36%|ββββββββββββββββ | 96/265 [00:43<01:18, 2.15it/s]\u001b[A\n 37%|ββββββββββββββββ | 97/265 [00:44<01:17, 2.15it/s]\u001b[A\n 37%|ββββββββββββββββ | 98/265 [00:44<01:17, 2.16it/s]\u001b[A\n 37%|ββββββββββββββββ | 99/265 [00:45<01:16, 2.16it/s]\u001b[A\n 38%|ββββββββββββββββ | 100/265 [00:45<01:16, 2.15it/s]\u001b[A\n 38%|ββββββββββββββββ | 101/265 [00:46<01:15, 2.16it/s]\u001b[A\n 38%|ββββββββββββββββ | 102/265 [00:46<01:15, 2.16it/s]\u001b[A\n 39%|ββββββββββββββββ | 103/265 [00:47<01:15, 2.16it/s]\u001b[A\n 39%|ββββββββββββββββ | 104/265 [00:47<01:15, 2.15it/s]\u001b[A\n 40%|βββββββββββββββββ | 105/265 [00:48<01:14, 2.15it/s]\u001b[A\n 40%|βββββββββββββββββ | 106/265 [00:48<01:13, 2.15it/s]\u001b[A\n 40%|βββββββββββββββββ | 107/265 [00:49<01:13, 2.15it/s]\u001b[A\n 41%|βββββββββββββββββ | 108/265 [00:49<01:12, 2.16it/s]\u001b[A\n 41%|βββββββββββββββββ | 109/265 [00:49<01:12, 2.16it/s]\u001b[A\n 42%|βββββββββββββββββ | 110/265 [00:50<01:12, 2.14it/s]\u001b[A\n 42%|ββββββββββββββββββ | 111/265 [00:50<01:11, 2.15it/s]\u001b[A\n 42%|ββββββββββββββββββ | 112/265 [00:51<01:11, 2.15it/s]\u001b[A\n 43%|ββββββββββββββββββ | 113/265 [00:51<01:10, 2.16it/s]\u001b[A\n 43%|ββββββββββββββββββ | 114/265 [00:52<01:09, 2.16it/s]\u001b[A\n 43%|ββββββββββββββββββ | 115/265 [00:52<01:09, 2.16it/s]\u001b[A\n 44%|ββββββββββββββββββ | 116/265 [00:53<01:08, 2.16it/s]\u001b[A\n 44%|ββββββββββββββββββ | 117/265 [00:53<01:08, 2.16it/s]\u001b[A\n 45%|βββββββββββββββββββ | 118/265 [00:54<01:07, 2.16it/s]\u001b[A\n 45%|βββββββββββββββββββ | 119/265 [00:54<01:07, 2.15it/s]\u001b[A\n 45%|βββββββββββββββββββ | 120/265 [00:55<01:07, 2.16it/s]\u001b[A\n 46%|βββββββββββββββββββ | 121/265 [00:55<01:06, 2.16it/s]\u001b[A\n 46%|βββββββββββββββββββ | 122/265 [00:55<01:06, 2.16it/s]\u001b[A\n 46%|βββββββββββββββββββ | 123/265 [00:56<01:05, 2.16it/s]\u001b[A\n 47%|ββββββββββββββββββββ | 124/265 [00:56<01:05, 2.15it/s]\u001b[A\n 47%|ββββββββββββββββββββ | 125/265 [00:57<01:04, 2.16it/s]\u001b[A\n 48%|ββββββββββββββββββββ | 126/265 [00:57<01:04, 2.15it/s]\u001b[A\n 48%|ββββββββββββββββββββ | 127/265 [00:58<01:04, 2.15it/s]\u001b[A\n 48%|ββββββββββββββββββββ | 128/265 [00:58<01:03, 2.15it/s]\u001b[A\n 49%|ββββββββββββββββββββ | 129/265 [00:59<01:03, 2.15it/s]\u001b[A\n 49%|ββββββββββββββββββββ | 130/265 [00:59<01:02, 2.15it/s]\u001b[A\n 49%|βββββββββββββββββββββ | 131/265 [01:00<01:02, 2.15it/s]\u001b[A\n 50%|βββββββββββββββββββββ | 132/265 [01:00<01:01, 2.15it/s]\u001b[A\n 50%|βββββββββββββββββββββ | 133/265 [01:01<01:01, 2.15it/s]\u001b[A\n 51%|βββββββββββββββββββββ | 134/265 [01:01<01:00, 2.15it/s]\u001b[A\n 51%|βββββββββββββββββββββ | 135/265 [01:02<01:00, 2.14it/s]\u001b[A\n 51%|βββββββββββββββββββββ | 136/265 [01:02<00:59, 2.15it/s]\u001b[A\n 52%|ββββββββββββββββββββββ | 137/265 [01:02<00:59, 2.15it/s]\u001b[A\n 52%|ββββββββββββββββββββββ | 138/265 [01:03<00:59, 2.15it/s]\u001b[A\n 52%|ββββββββββββββββββββββ | 139/265 [01:03<00:58, 2.16it/s]\u001b[A\n 53%|ββββββββββββββββββββββ | 140/265 [01:04<00:58, 2.15it/s]\u001b[A\n 53%|ββββββββββββββββββββββ | 141/265 [01:04<00:57, 2.16it/s]\u001b[A\n 54%|ββββββββββββββββββββββ | 142/265 [01:05<00:56, 2.16it/s]\u001b[A\n 54%|ββββββββββββββββββββββ | 143/265 [01:05<00:56, 2.16it/s]\u001b[A\n 54%|βββββββββββββββββββββββ | 144/265 [01:06<00:56, 2.15it/s]\u001b[A\n 55%|βββββββββββββββββββββββ | 145/265 [01:06<00:55, 2.16it/s]\u001b[A\n 55%|βββββββββββββββββββββββ | 146/265 [01:07<00:54, 2.17it/s]\u001b[A\n 55%|βββββββββββββββββββββββ | 147/265 [01:07<00:54, 2.16it/s]\u001b[A\n 56%|βββββββββββββββββββββββ | 148/265 [01:08<00:54, 2.16it/s]\u001b[A\n 56%|βββββββββββββββββββββββ | 149/265 [01:08<00:53, 2.16it/s]\u001b[A\n 57%|ββββββββββββββββββββββββ | 150/265 [01:08<00:53, 2.16it/s]\u001b[A\n 57%|ββββββββββββββββββββββββ | 151/265 [01:09<00:52, 2.16it/s]\u001b[A\n 57%|ββββββββββββββββββββββββ | 152/265 [01:09<00:52, 2.17it/s]\u001b[A\n 58%|ββββββββββββββββββββββββ | 153/265 [01:10<00:51, 2.17it/s]\u001b[A\n 58%|ββββββββββββββββββββββββ | 154/265 [01:10<00:51, 2.16it/s]\u001b[A\n 58%|ββββββββββββββββββββββββ | 155/265 [01:11<00:50, 2.16it/s]\u001b[A\n 59%|βββββββββββββββββββββββββ | 156/265 [01:11<00:50, 2.16it/s]\u001b[A\n 59%|βββββββββββββββββββββββββ | 157/265 [01:12<00:49, 2.16it/s]\u001b[A\n 60%|βββββββββββββββββββββββββ | 158/265 [01:12<00:49, 2.15it/s]\u001b[A\n 60%|βββββββββββββββββββββββββ | 159/265 [01:13<00:49, 2.16it/s]\u001b[A\n 60%|βββββββββββββββββββββββββ | 160/265 [01:13<00:48, 2.16it/s]\u001b[A\n 61%|βββββββββββββββββββββββββ | 161/265 [01:14<00:48, 2.17it/s]\u001b[A\n 61%|βββββββββββββββββββββββββ | 162/265 [01:14<00:47, 2.16it/s]\u001b[A\n 62%|ββββββββββββββββββββββββββ | 163/265 [01:15<00:46, 2.17it/s]\u001b[A\n 62%|ββββββββββββββββββββββββββ | 164/265 [01:15<00:46, 2.16it/s]\u001b[A\n 62%|ββββββββββββββββββββββββββ | 165/265 [01:15<00:46, 2.17it/s]\u001b[A\n 63%|ββββββββββββββββββββββββββ | 166/265 [01:16<00:45, 2.15it/s]\u001b[A\n 63%|ββββββββββββββββββββββββββ | 167/265 [01:16<00:45, 2.16it/s]\u001b[A\n 63%|ββββββββββββββββββββββββββ | 168/265 [01:17<00:44, 2.16it/s]\u001b[A\n 64%|βββββββββββββββββββββββββββ | 169/265 [01:17<00:44, 2.17it/s]\u001b[A\n 64%|βββββββββββββββββββββββββββ | 170/265 [01:18<00:43, 2.16it/s]\u001b[A\n 65%|βββββββββββββββββββββββββββ | 171/265 [01:18<00:43, 2.16it/s]\u001b[A\n 65%|βββββββββββββββββββββββββββ | 172/265 [01:19<00:43, 2.15it/s]\u001b[A\n 65%|βββββββββββββββββββββββββββ | 173/265 [01:19<00:42, 2.15it/s]\u001b[A\n 66%|βββββββββββββββββββββββββββ | 174/265 [01:20<00:42, 2.15it/s]\u001b[A\n 66%|βββββββββββββββββββββββββββ | 175/265 [01:20<00:41, 2.15it/s]\u001b[A\n 66%|ββββββββββββββββββββββββββββ | 176/265 [01:21<00:41, 2.15it/s]\u001b[A\n 67%|ββββββββββββββββββββββββββββ | 177/265 [01:21<00:40, 2.15it/s]\u001b[A\n 67%|ββββββββββββββββββββββββββββ | 178/265 [01:21<00:40, 2.16it/s]\u001b[A\n 68%|ββββββββββββββββββββββββββββ | 179/265 [01:22<00:39, 2.16it/s]\u001b[A\n 68%|ββββββββββββββββββββββββββββ | 180/265 [01:22<00:39, 2.16it/s]\u001b[A\n 68%|ββββββββββββββββββββββββββββ | 181/265 [01:23<00:39, 2.15it/s]\u001b[A\n 69%|βββββββββββββββββββββββββββββ | 182/265 [01:23<00:38, 2.15it/s]\u001b[A\n 69%|βββββββββββββββββββββββββββββ | 183/265 [01:24<00:38, 2.15it/s]\u001b[A\n 69%|βββββββββββββββββββββββββββββ | 184/265 [01:24<00:37, 2.15it/s]\u001b[A\n 70%|βββββββββββββββββββββββββββββ | 185/265 [01:25<00:37, 2.15it/s]\u001b[A\n 70%|βββββββββββββββββββββββββββββ | 186/265 [01:25<00:36, 2.15it/s]\u001b[A\n 71%|βββββββββββββββββββββββββββββ | 187/265 [01:26<00:36, 2.14it/s]\u001b[A\n 71%|βββββββββββββββββββββββββββββ | 188/265 [01:26<00:35, 2.16it/s]\u001b[A\n 71%|ββββββββββββββββββββββββββββββ | 189/265 [01:27<00:35, 2.15it/s]\u001b[A\n 72%|ββββββββββββββββββββββββββββββ | 190/265 [01:27<00:34, 2.15it/s]\u001b[A\n 72%|ββββββββββββββββββββββββββββββ | 191/265 [01:28<00:34, 2.14it/s]\u001b[A\n 72%|ββββββββββββββββββββββββββββββ | 192/265 [01:28<00:34, 2.15it/s]\u001b[A\n 73%|ββββββββββββββββββββββββββββββ | 193/265 [01:28<00:33, 2.15it/s]\u001b[A\n 73%|ββββββββββββββββββββββββββββββ | 194/265 [01:29<00:32, 2.16it/s]\u001b[A\n 74%|βββββββββββββββββββββββββββββββ | 195/265 [01:29<00:32, 2.15it/s]\u001b[A\n 74%|βββββββββββββββββββββββββββββββ | 196/265 [01:30<00:32, 2.15it/s]\u001b[A\n 74%|βββββββββββββββββββββββββββββββ | 197/265 [01:30<00:31, 2.15it/s]\u001b[A\n 75%|βββββββββββββββββββββββββββββββ | 198/265 [01:31<00:31, 2.16it/s]\u001b[A\n 75%|βββββββββββββββββββββββββββββββ | 199/265 [01:31<00:30, 2.16it/s]\u001b[A\n 75%|βββββββββββββββββββββββββββββββ | 200/265 [01:32<00:29, 2.17it/s]\u001b[A\n 76%|βββββββββββββββββββββββββββββββ | 201/265 [01:32<00:29, 2.15it/s]\u001b[A\n 76%|ββββββββββββββββββββββββββββββββ | 202/265 [01:33<00:29, 2.15it/s]\u001b[A\n 77%|ββββββββββββββββββββββββββββββββ | 203/265 [01:33<00:28, 2.16it/s]\u001b[A\n 77%|ββββββββββββββββββββββββββββββββ | 204/265 [01:34<00:28, 2.16it/s]\u001b[A\n 77%|ββββββββββββββββββββββββββββββββ | 205/265 [01:34<00:27, 2.16it/s]\u001b[A\n 78%|ββββββββββββββββββββββββββββββββ | 206/265 [01:34<00:27, 2.16it/s]\u001b[A\n 78%|ββββββββββββββββββββββββββββββββ | 207/265 [01:35<00:26, 2.16it/s]\u001b[A\n 78%|βββββββββββββββββββββββββββββββββ | 208/265 [01:35<00:26, 2.17it/s]\u001b[A\n 79%|βββββββββββββββββββββββββββββββββ | 209/265 [01:36<00:25, 2.16it/s]\u001b[A\n 79%|βββββββββββββββββββββββββββββββββ | 210/265 [01:36<00:25, 2.16it/s]\u001b[A\n 80%|βββββββββββββββββββββββββββββββββ | 211/265 [01:37<00:25, 2.16it/s]\u001b[A\n 80%|βββββββββββββββββββββββββββββββββ | 212/265 [01:37<00:24, 2.16it/s]\u001b[A\n 80%|βββββββββββββββββββββββββββββββββ | 213/265 [01:38<00:24, 2.16it/s]\u001b[A\n 81%|βββββββββββββββββββββββββββββββββ | 214/265 [01:38<00:23, 2.16it/s]\u001b[A\n 81%|ββββββββββββββββββββββββββββββββββ | 215/265 [01:39<00:23, 2.17it/s]\u001b[A\n 82%|ββββββββββββββββββββββββββββββββββ | 216/265 [01:39<00:22, 2.15it/s]\u001b[A\n 82%|ββββββββββββββββββββββββββββββββββ | 217/265 [01:40<00:22, 2.15it/s]\u001b[A\n 82%|ββββββββββββββββββββββββββββββββββ | 218/265 [01:40<00:21, 2.15it/s]\u001b[A\n 83%|ββββββββββββββββββββββββββββββββββ | 219/265 [01:40<00:21, 2.15it/s]\u001b[A\n 83%|ββββββββββββββββββββββββββββββββββ | 220/265 [01:41<00:20, 2.15it/s]\u001b[A\n 83%|βββββββββββββββββββββββββββββββββββ | 221/265 [01:41<00:20, 2.15it/s]\u001b[A\n 84%|βββββββββββββββββββββββββββββββββββ | 222/265 [01:42<00:20, 2.14it/s]\u001b[A\n 84%|βββββββββββββββββββββββββββββββββββ | 223/265 [01:42<00:19, 2.15it/s]\u001b[A\n 85%|βββββββββββββββββββββββββββββββββββ | 224/265 [01:43<00:19, 2.15it/s]\u001b[A\n 85%|βββββββββββββββββββββββββββββββββββ | 225/265 [01:43<00:18, 2.16it/s]\u001b[A\n 85%|βββββββββββββββββββββββββββββββββββ | 226/265 [01:44<00:18, 2.16it/s]\u001b[A\n 86%|βββββββββββββββββββββββββββββββββββ | 227/265 [01:44<00:17, 2.14it/s]\u001b[A\n 86%|ββββββββββββββββββββββββββββββββββββ | 228/265 [01:45<00:17, 2.15it/s]\u001b[A\n 86%|ββββββββββββββββββββββββββββββββββββ | 229/265 [01:45<00:16, 2.16it/s]\u001b[A\n 87%|ββββββββββββββββββββββββββββββββββββ | 230/265 [01:46<00:16, 2.15it/s]\u001b[A\n 87%|ββββββββββββββββββββββββββββββββββββ | 231/265 [01:46<00:15, 2.15it/s]\u001b[A\n 88%|ββββββββββββββββββββββββββββββββββββ | 232/265 [01:47<00:15, 2.15it/s]\u001b[A\n 88%|ββββββββββββββββββββββββββββββββββββ | 233/265 [01:47<00:14, 2.15it/s]\u001b[A\n 88%|βββββββββββββββββββββββββββββββββββββ | 234/265 [01:47<00:14, 2.15it/s]\u001b[A\n 89%|βββββββββββββββββββββββββββββββββββββ | 235/265 [01:48<00:13, 2.15it/s]\u001b[A\n 89%|βββββββββββββββββββββββββββββββββββββ | 236/265 [01:48<00:13, 2.16it/s]\u001b[A\n 89%|βββββββββββββββββββββββββββββββββββββ | 237/265 [01:49<00:12, 2.17it/s]\u001b[A\n 90%|βββββββββββββββββββββββββββββββββββββ | 238/265 [01:49<00:12, 2.17it/s]\u001b[A\n 90%|βββββββββββββββββββββββββββββββββββββ | 239/265 [01:50<00:12, 2.15it/s]\u001b[A\n 91%|ββββββββββββββββββββββββββββββββββββββ | 240/265 [01:50<00:11, 2.15it/s]\u001b[A\n 91%|ββββββββββββββββββββββββββββββββββββββ | 241/265 [01:51<00:11, 2.16it/s]\u001b[A\n 91%|ββββββββββββββββββββββββββββββββββββββ | 242/265 [01:51<00:10, 2.16it/s]\u001b[A\n 92%|ββββββββββββββββββββββββββββββββββββββ | 243/265 [01:52<00:10, 2.15it/s]\u001b[A\n 92%|ββββββββββββββββββββββββββββββββββββββ | 244/265 [01:52<00:09, 2.15it/s]\u001b[A\n 92%|ββββββββββββββββββββββββββββββββββββββ | 245/265 [01:53<00:09, 2.15it/s]\u001b[A\n 93%|ββββββββββββββββββββββββββββββββββββββ | 246/265 [01:53<00:08, 2.15it/s]\u001b[A\n 93%|βββββββββββββββββββββββββββββββββββββββ | 247/265 [01:53<00:08, 2.15it/s]\u001b[A\n 94%|βββββββββββββββββββββββββββββββββββββββ | 248/265 [01:54<00:07, 2.14it/s]\u001b[A\n 94%|βββββββββββββββββββββββββββββββββββββββ | 249/265 [01:54<00:07, 2.14it/s]\u001b[A\n 94%|βββββββββββββββββββββββββββββββββββββββ | 250/265 [01:55<00:06, 2.15it/s]\u001b[A\n 95%|βββββββββββββββββββββββββββββββββββββββ | 251/265 [01:55<00:06, 2.15it/s]\u001b[A\n 95%|βββββββββββββββββββββββββββββββββββββββ | 252/265 [01:56<00:06, 2.14it/s]\u001b[A\n 95%|ββββββββββββββββββββββββββββββββββββββββ | 253/265 [01:56<00:05, 2.15it/s]\u001b[A\n 96%|ββββββββββββββββββββββββββββββββββββββββ | 254/265 [01:57<00:05, 2.14it/s]\u001b[A\n 96%|ββββββββββββββββββββββββββββββββββββββββ | 255/265 [01:57<00:04, 2.16it/s]\u001b[A\n 97%|ββββββββββββββββββββββββββββββββββββββββ | 256/265 [01:58<00:04, 2.15it/s]\u001b[A\n 97%|ββββββββββββββββββββββββββββββββββββββββ | 257/265 [01:58<00:03, 2.17it/s]\u001b[A\n 97%|ββββββββββββββββββββββββββββββββββββββββ | 258/265 [01:59<00:03, 2.17it/s]\u001b[A\n 98%|ββββββββββββββββββββββββββββββββββββββββ | 259/265 [01:59<00:02, 2.17it/s]\u001b[A\n 98%|βββββββββββββββββββββββββββββββββββββββββ| 260/265 [02:00<00:02, 2.16it/s]\u001b[A\n 98%|βββββββββββββββββββββββββββββββββββββββββ| 261/265 [02:00<00:01, 2.16it/s]\u001b[A\n 99%|βββββββββββββββββββββββββββββββββββββββββ| 262/265 [02:00<00:01, 2.17it/s]\u001b[A\n 99%|βββββββββββββββββββββββββββββββββββββββββ| 263/265 [02:01<00:00, 2.18it/s]\u001b[A\n100%|βββββββββββββββββββββββββββββββββββββββββ| 264/265 [02:01<00:00, 2.17it/s]\u001b[A\n \u001b[A\n\u001b[A{'eval_loss': '3.379', 'eval_runtime': '124.1', 'eval_samples_per_second': '545.5', 'eval_steps_per_second': '2.136', 'epoch': '0.5971'}\n 20%|βββββββ | 3000/15000 [1:07:08<4:12:24, 1.26s/it]\n100%|βββββββββββββββββββββββββββββββββββββββββ| 265/265 [02:02<00:00, 1.84it/s]\u001b[A\n \u001b[A\nWriting model shards: 100%|βββββββββββββββββββββββ| 1/1 [00:00<00:00, 23.03it/s]\u001b[A\n{'loss': '3.379', 'grad_norm': '0.4565', 'learning_rate': '0.0009456', 'epoch': '0.617'}\n{'loss': '3.368', 'grad_norm': '0.4533', 'learning_rate': '0.0009404', 'epoch': '0.6369'}\n{'loss': '3.362', 'grad_norm': '0.4706', 'learning_rate': '0.0009349', 'epoch': '0.6568'}\n{'loss': '3.356', 'grad_norm': '0.4549', 'learning_rate': '0.0009293', 'epoch': '0.6768'}\n{'loss': '3.351', 'grad_norm': '0.4293', 'learning_rate': '0.0009234', 'epoch': '0.6967'}\n{'loss': '3.346', 'grad_norm': '0.4953', 'learning_rate': '0.0009173', 'epoch': '0.7166'}\n{'loss': '3.337', 'grad_norm': '0.5019', 'learning_rate': '0.0009111', 'epoch': '0.7365'}\n{'loss': '3.335', 'grad_norm': '0.4378', 'learning_rate': '0.0009046', 'epoch': '0.7564'}\n{'loss': '3.327', 'grad_norm': '0.4642', 'learning_rate': '0.0008979', 'epoch': '0.7763'}\n{'loss': '3.323', 'grad_norm': '0.415', 'learning_rate': '0.000891', 'epoch': '0.7962'}\n{'loss': '3.322', 'grad_norm': '0.3981', 'learning_rate': '0.0008839', 'epoch': '0.8161'}\n{'loss': '3.316', 'grad_norm': '0.4608', 'learning_rate': '0.0008766', 'epoch': '0.836'}\n{'loss': '3.312', 'grad_norm': '0.4348', 'learning_rate': '0.0008691', 'epoch': '0.8559'}\n{'loss': '3.306', 'grad_norm': '0.3941', 'learning_rate': '0.0008615', 'epoch': '0.8758'}\n{'loss': '3.301', 'grad_norm': '0.4383', 'learning_rate': '0.0008536', 'epoch': '0.8957'}\n 30%|βββββββββββ | 4500/15000 [1:38:41<3:40:41, 1.26s/it]\n 0%| | 0/265 [00:00<?, ?it/s]\u001b[A\n 1%|β | 2/265 [00:00<00:58, 4.46it/s]\u001b[A\n 1%|β | 3/265 [00:00<01:24, 3.10it/s]\u001b[A\n 2%|β | 4/265 [00:01<01:37, 2.68it/s]\u001b[A\n 2%|β | 5/265 [00:01<01:45, 2.47it/s]\u001b[A\n 2%|β | 6/265 [00:02<01:49, 2.36it/s]\u001b[A\n 3%|ββ | 7/265 [00:02<01:52, 2.29it/s]\u001b[A\n 3%|ββ | 8/265 [00:03<01:54, 2.24it/s]\u001b[A\n 3%|ββ | 9/265 [00:03<01:55, 2.22it/s]\u001b[A\n 4%|ββ | 10/265 [00:04<01:55, 2.21it/s]\u001b[A\n 4%|ββ | 11/265 [00:04<01:55, 2.19it/s]\u001b[A\n 5%|ββ | 12/265 [00:05<01:55, 2.19it/s]\u001b[A\n 5%|ββ | 13/265 [00:05<01:55, 2.18it/s]\u001b[A\n 5%|βββ | 14/265 [00:05<01:54, 2.19it/s]\u001b[A\n 6%|βββ | 15/265 [00:06<01:55, 2.17it/s]\u001b[A\n 6%|βββ | 16/265 [00:06<01:54, 2.17it/s]\u001b[A\n 6%|βββ | 17/265 [00:07<01:53, 2.18it/s]\u001b[A\n 7%|βββ | 18/265 [00:07<01:53, 2.17it/s]\u001b[A\n 7%|βββ | 19/265 [00:08<01:53, 2.17it/s]\u001b[A\n 8%|ββββ | 20/265 [00:08<01:52, 2.17it/s]\u001b[A\n 8%|ββββ | 21/265 [00:09<01:52, 2.17it/s]\u001b[A\n 8%|ββββ | 22/265 [00:09<01:51, 2.18it/s]\u001b[A\n 9%|ββββ | 23/265 [00:10<01:51, 2.18it/s]\u001b[A\n 9%|ββββ | 24/265 [00:10<01:50, 2.18it/s]\u001b[A\n 9%|ββββ | 25/265 [00:11<01:50, 2.17it/s]\u001b[A\n 10%|ββββ | 26/265 [00:11<01:49, 2.17it/s]\u001b[A\n 10%|βββββ | 27/265 [00:11<01:49, 2.18it/s]\u001b[A\n 11%|βββββ | 28/265 [00:12<01:49, 2.17it/s]\u001b[A\n 11%|βββββ | 29/265 [00:12<01:48, 2.17it/s]\u001b[A\n 11%|βββββ | 30/265 [00:13<01:47, 2.18it/s]\u001b[A\n 12%|βββββ | 31/265 [00:13<01:48, 2.16it/s]\u001b[A\n 12%|βββββ | 32/265 [00:14<01:48, 2.16it/s]\u001b[A\n 12%|ββββββ | 33/265 [00:14<01:47, 2.16it/s]\u001b[A\n 13%|ββββββ | 34/265 [00:15<01:46, 2.17it/s]\u001b[A\n 13%|ββββββ | 35/265 [00:15<01:45, 2.17it/s]\u001b[A\n 14%|ββββββ | 36/265 [00:16<01:45, 2.17it/s]\u001b[A\n 14%|ββββββ | 37/265 [00:16<01:45, 2.17it/s]\u001b[A\n 14%|ββββββ | 38/265 [00:17<01:44, 2.17it/s]\u001b[A\n 15%|βββββββ | 39/265 [00:17<01:44, 2.17it/s]\u001b[A\n 15%|βββββββ | 40/265 [00:17<01:43, 2.17it/s]\u001b[A\n 15%|βββββββ | 41/265 [00:18<01:44, 2.15it/s]\u001b[A\n 16%|βββββββ | 42/265 [00:18<01:42, 2.17it/s]\u001b[A\n 16%|βββββββ | 43/265 [00:19<01:42, 2.16it/s]\u001b[A\n 17%|βββββββ | 44/265 [00:19<01:41, 2.17it/s]\u001b[A\n 17%|ββββββββ | 45/265 [00:20<01:41, 2.17it/s]\u001b[A\n 17%|ββββββββ | 46/265 [00:20<01:41, 2.15it/s]\u001b[A\n 18%|ββββββββ | 47/265 [00:21<01:40, 2.17it/s]\u001b[A\n 18%|ββββββββ | 48/265 [00:21<01:40, 2.16it/s]\u001b[A\n 18%|ββββββββ | 49/265 [00:22<01:40, 2.16it/s]\u001b[A\n 19%|ββββββββ | 50/265 [00:22<01:39, 2.17it/s]\u001b[A\n 19%|ββββββββ | 51/265 [00:23<01:38, 2.17it/s]\u001b[A\n 20%|βββββββββ | 52/265 [00:23<01:38, 2.17it/s]\u001b[A\n 20%|βββββββββ | 53/265 [00:23<01:38, 2.16it/s]\u001b[A\n 20%|βββββββββ | 54/265 [00:24<01:38, 2.15it/s]\u001b[A\n 21%|βββββββββ | 55/265 [00:24<01:37, 2.15it/s]\u001b[A\n 21%|βββββββββ | 56/265 [00:25<01:36, 2.16it/s]\u001b[A\n 22%|βββββββββ | 57/265 [00:25<01:35, 2.17it/s]\u001b[A\n 22%|ββββββββββ | 58/265 [00:26<01:35, 2.17it/s]\u001b[A\n 22%|ββββββββββ | 59/265 [00:26<01:34, 2.17it/s]\u001b[A\n 23%|ββββββββββ | 60/265 [00:27<01:34, 2.16it/s]\u001b[A\n 23%|ββββββββββ | 61/265 [00:27<01:34, 2.16it/s]\u001b[A\n 23%|ββββββββββ | 62/265 [00:28<01:33, 2.17it/s]\u001b[A\n 24%|ββββββββββ | 63/265 [00:28<01:33, 2.16it/s]\u001b[A\n 24%|βββββββββββ | 64/265 [00:29<01:32, 2.17it/s]\u001b[A\n 25%|βββββββββββ | 65/265 [00:29<01:32, 2.16it/s]\u001b[A\n 25%|βββββββββββ | 66/265 [00:29<01:32, 2.16it/s]\u001b[A\n 25%|βββββββββββ | 67/265 [00:30<01:31, 2.15it/s]\u001b[A\n 26%|βββββββββββ | 68/265 [00:30<01:31, 2.15it/s]\u001b[A\n 26%|βββββββββββ | 69/265 [00:31<01:30, 2.16it/s]\u001b[A\n 26%|βββββββββββ | 70/265 [00:31<01:30, 2.16it/s]\u001b[A\n 27%|ββββββββββββ | 71/265 [00:32<01:29, 2.16it/s]\u001b[A\n 27%|ββββββββββββ | 72/265 [00:32<01:28, 2.17it/s]\u001b[A\n 28%|ββββββββββββ | 73/265 [00:33<01:28, 2.17it/s]\u001b[A\n 28%|ββββββββββββ | 74/265 [00:33<01:28, 2.16it/s]\u001b[A\n 28%|ββββββββββββ | 75/265 [00:34<01:28, 2.16it/s]\u001b[A\n 29%|ββββββββββββ | 76/265 [00:34<01:27, 2.16it/s]\u001b[A\n 29%|βββββββββββββ | 77/265 [00:35<01:26, 2.18it/s]\u001b[A\n 29%|βββββββββββββ | 78/265 [00:35<01:26, 2.17it/s]\u001b[A\n 30%|βββββββββββββ | 79/265 [00:35<01:26, 2.16it/s]\u001b[A\n 30%|βββββββββββββ | 80/265 [00:36<01:25, 2.17it/s]\u001b[A\n 31%|βββββββββββββ | 81/265 [00:36<01:24, 2.17it/s]\u001b[A\n 31%|βββββββββββββ | 82/265 [00:37<01:24, 2.17it/s]\u001b[A\n 31%|ββββββββββββββ | 83/265 [00:37<01:23, 2.17it/s]\u001b[A\n 32%|ββββββββββββββ | 84/265 [00:38<01:23, 2.17it/s]\u001b[A\n 32%|ββββββββββββββ | 85/265 [00:38<01:23, 2.16it/s]\u001b[A\n 32%|ββββββββββββββ | 86/265 [00:39<01:22, 2.16it/s]\u001b[A\n 33%|ββββββββββββββ | 87/265 [00:39<01:22, 2.16it/s]\u001b[A\n 33%|ββββββββββββββ | 88/265 [00:40<01:21, 2.17it/s]\u001b[A\n 34%|ββββββββββββββ | 89/265 [00:40<01:21, 2.15it/s]\u001b[A\n 34%|βββββββββββββββ | 90/265 [00:41<01:21, 2.16it/s]\u001b[A\n 34%|βββββββββββββββ | 91/265 [00:41<01:20, 2.16it/s]\u001b[A\n 35%|βββββββββββββββ | 92/265 [00:42<01:20, 2.16it/s]\u001b[A\n 35%|βββββββββββββββ | 93/265 [00:42<01:19, 2.15it/s]\u001b[A\n 35%|βββββββββββββββ | 94/265 [00:42<01:19, 2.14it/s]\u001b[A\n 36%|βββββββββββββββ | 95/265 [00:43<01:18, 2.15it/s]\u001b[A\n 36%|ββββββββββββββββ | 96/265 [00:43<01:18, 2.16it/s]\u001b[A\n 37%|ββββββββββββββββ | 97/265 [00:44<01:17, 2.15it/s]\u001b[A\n 37%|ββββββββββββββββ | 98/265 [00:44<01:17, 2.15it/s]\u001b[A\n 37%|ββββββββββββββββ | 99/265 [00:45<01:16, 2.16it/s]\u001b[A\n 38%|ββββββββββββββββ | 100/265 [00:45<01:16, 2.17it/s]\u001b[A\n 38%|ββββββββββββββββ | 101/265 [00:46<01:15, 2.16it/s]\u001b[A\n 38%|ββββββββββββββββ | 102/265 [00:46<01:15, 2.15it/s]\u001b[A\n 39%|ββββββββββββββββ | 103/265 [00:47<01:15, 2.15it/s]\u001b[A\n 39%|ββββββββββββββββ | 104/265 [00:47<01:15, 2.15it/s]\u001b[A\n 40%|βββββββββββββββββ | 105/265 [00:48<01:14, 2.15it/s]\u001b[A\n 40%|βββββββββββββββββ | 106/265 [00:48<01:14, 2.15it/s]\u001b[A\n 40%|βββββββββββββββββ | 107/265 [00:48<01:13, 2.15it/s]\u001b[A\n 41%|βββββββββββββββββ | 108/265 [00:49<01:12, 2.16it/s]\u001b[A\n 41%|βββββββββββββββββ | 109/265 [00:49<01:12, 2.16it/s]\u001b[A\n 42%|βββββββββββββββββ | 110/265 [00:50<01:11, 2.16it/s]\u001b[A\n 42%|ββββββββββββββββββ | 111/265 [00:50<01:11, 2.16it/s]\u001b[A\n 42%|ββββββββββββββββββ | 112/265 [00:51<01:10, 2.16it/s]\u001b[A\n 43%|ββββββββββββββββββ | 113/265 [00:51<01:10, 2.16it/s]\u001b[A\n 43%|ββββββββββββββββββ | 114/265 [00:52<01:10, 2.16it/s]\u001b[A\n 43%|ββββββββββββββββββ | 115/265 [00:52<01:09, 2.15it/s]\u001b[A\n 44%|ββββββββββββββββββ | 116/265 [00:53<01:08, 2.16it/s]\u001b[A\n 44%|ββββββββββββββββββ | 117/265 [00:53<01:08, 2.16it/s]\u001b[A\n 45%|βββββββββββββββββββ | 118/265 [00:54<01:08, 2.15it/s]\u001b[A\n 45%|βββββββββββββββββββ | 119/265 [00:54<01:07, 2.15it/s]\u001b[A\n 45%|βββββββββββββββββββ | 120/265 [00:55<01:07, 2.16it/s]\u001b[A\n 46%|βββββββββββββββββββ | 121/265 [00:55<01:06, 2.16it/s]\u001b[A\n 46%|βββββββββββββββββββ | 122/265 [00:55<01:06, 2.14it/s]\u001b[A\n 46%|βββββββββββββββββββ | 123/265 [00:56<01:06, 2.15it/s]\u001b[A\n 47%|ββββββββββββββββββββ | 124/265 [00:56<01:05, 2.15it/s]\u001b[A\n 47%|ββββββββββββββββββββ | 125/265 [00:57<01:05, 2.15it/s]\u001b[A\n 48%|ββββββββββββββββββββ | 126/265 [00:57<01:04, 2.15it/s]\u001b[A\n 48%|ββββββββββββββββββββ | 127/265 [00:58<01:04, 2.15it/s]\u001b[A\n 48%|ββββββββββββββββββββ | 128/265 [00:58<01:03, 2.16it/s]\u001b[A\n 49%|ββββββββββββββββββββ | 129/265 [00:59<01:02, 2.17it/s]\u001b[A\n 49%|ββββββββββββββββββββ | 130/265 [00:59<01:02, 2.16it/s]\u001b[A\n 49%|βββββββββββββββββββββ | 131/265 [01:00<01:01, 2.16it/s]\u001b[A\n 50%|βββββββββββββββββββββ | 132/265 [01:00<01:01, 2.15it/s]\u001b[A\n 50%|βββββββββββββββββββββ | 133/265 [01:01<01:01, 2.16it/s]\u001b[A\n 51%|βββββββββββββββββββββ | 134/265 [01:01<01:01, 2.14it/s]\u001b[A\n 51%|βββββββββββββββββββββ | 135/265 [01:01<01:00, 2.15it/s]\u001b[A\n 51%|βββββββββββββββββββββ | 136/265 [01:02<00:59, 2.15it/s]\u001b[A\n 52%|ββββββββββββββββββββββ | 137/265 [01:02<00:59, 2.16it/s]\u001b[A\n 52%|ββββββββββββββββββββββ | 138/265 [01:03<00:58, 2.16it/s]\u001b[A\n 52%|ββββββββββββββββββββββ | 139/265 [01:03<00:58, 2.15it/s]\u001b[A\n 53%|ββββββββββββββββββββββ | 140/265 [01:04<00:57, 2.16it/s]\u001b[A\n 53%|ββββββββββββββββββββββ | 141/265 [01:04<00:57, 2.16it/s]\u001b[A\n 54%|ββββββββββββββββββββββ | 142/265 [01:05<00:57, 2.16it/s]\u001b[A\n 54%|ββββββββββββββββββββββ | 143/265 [01:05<00:56, 2.16it/s]\u001b[A\n 54%|βββββββββββββββββββββββ | 144/265 [01:06<00:56, 2.15it/s]\u001b[A\n 55%|βββββββββββββββββββββββ | 145/265 [01:06<00:55, 2.16it/s]\u001b[A\n 55%|βββββββββββββββββββββββ | 146/265 [01:07<00:55, 2.16it/s]\u001b[A\n 55%|βββββββββββββββββββββββ | 147/265 [01:07<00:54, 2.15it/s]\u001b[A\n 56%|βββββββββββββββββββββββ | 148/265 [01:07<00:54, 2.16it/s]\u001b[A\n 56%|βββββββββββββββββββββββ | 149/265 [01:08<00:53, 2.16it/s]\u001b[A\n 57%|ββββββββββββββββββββββββ | 150/265 [01:08<00:53, 2.17it/s]\u001b[A\n 57%|ββββββββββββββββββββββββ | 151/265 [01:09<00:52, 2.17it/s]\u001b[A\n 57%|ββββββββββββββββββββββββ | 152/265 [01:09<00:52, 2.16it/s]\u001b[A\n 58%|ββββββββββββββββββββββββ | 153/265 [01:10<00:51, 2.16it/s]\u001b[A\n 58%|ββββββββββββββββββββββββ | 154/265 [01:10<00:51, 2.16it/s]\u001b[A\n 58%|ββββββββββββββββββββββββ | 155/265 [01:11<00:50, 2.16it/s]\u001b[A\n 59%|βββββββββββββββββββββββββ | 156/265 [01:11<00:50, 2.17it/s]\u001b[A\n 59%|βββββββββββββββββββββββββ | 157/265 [01:12<00:49, 2.16it/s]\u001b[A\n 60%|βββββββββββββββββββββββββ | 158/265 [01:12<00:49, 2.17it/s]\u001b[A\n 60%|βββββββββββββββββββββββββ | 159/265 [01:13<00:48, 2.17it/s]\u001b[A\n 60%|βββββββββββββββββββββββββ | 160/265 [01:13<00:48, 2.17it/s]\u001b[A\n 61%|βββββββββββββββββββββββββ | 161/265 [01:13<00:48, 2.17it/s]\u001b[A\n 61%|βββββββββββββββββββββββββ | 162/265 [01:14<00:48, 2.14it/s]\u001b[A\n 62%|ββββββββββββββββββββββββββ | 163/265 [01:14<00:47, 2.15it/s]\u001b[A\n 62%|ββββββββββββββββββββββββββ | 164/265 [01:15<00:47, 2.15it/s]\u001b[A\n 62%|ββββββββββββββββββββββββββ | 165/265 [01:15<00:46, 2.15it/s]\u001b[A\n 63%|ββββββββββββββββββββββββββ | 166/265 [01:16<00:45, 2.16it/s]\u001b[A\n 63%|ββββββββββββββββββββββββββ | 167/265 [01:16<00:45, 2.15it/s]\u001b[A\n 63%|ββββββββββββββββββββββββββ | 168/265 [01:17<00:45, 2.15it/s]\u001b[A\n 64%|βββββββββββββββββββββββββββ | 169/265 [01:17<00:44, 2.14it/s]\u001b[A\n 64%|βββββββββββββββββββββββββββ | 170/265 [01:18<00:44, 2.15it/s]\u001b[A\n 65%|βββββββββββββββββββββββββββ | 171/265 [01:18<00:43, 2.14it/s]\u001b[A\n 65%|βββββββββββββββββββββββββββ | 172/265 [01:19<00:43, 2.16it/s]\u001b[A\n 65%|βββββββββββββββββββββββββββ | 173/265 [01:19<00:42, 2.15it/s]\u001b[A\n 66%|βββββββββββββββββββββββββββ | 174/265 [01:20<00:42, 2.16it/s]\u001b[A\n 66%|βββββββββββββββββββββββββββ | 175/265 [01:20<00:41, 2.15it/s]\u001b[A\n 66%|ββββββββββββββββββββββββββββ | 176/265 [01:20<00:41, 2.16it/s]\u001b[A\n 67%|ββββββββββββββββββββββββββββ | 177/265 [01:21<00:40, 2.16it/s]\u001b[A\n 67%|ββββββββββββββββββββββββββββ | 178/265 [01:21<00:40, 2.17it/s]\u001b[A\n 68%|ββββββββββββββββββββββββββββ | 179/265 [01:22<00:39, 2.17it/s]\u001b[A\n 68%|ββββββββββββββββββββββββββββ | 180/265 [01:22<00:39, 2.17it/s]\u001b[A\n 68%|ββββββββββββββββββββββββββββ | 181/265 [01:23<00:38, 2.17it/s]\u001b[A\n 69%|βββββββββββββββββββββββββββββ | 182/265 [01:23<00:38, 2.17it/s]\u001b[A\n 69%|βββββββββββββββββββββββββββββ | 183/265 [01:24<00:37, 2.17it/s]\u001b[A\n 69%|βββββββββββββββββββββββββββββ | 184/265 [01:24<00:37, 2.16it/s]\u001b[A\n 70%|βββββββββββββββββββββββββββββ | 185/265 [01:25<00:37, 2.16it/s]\u001b[A\n 70%|βββββββββββββββββββββββββββββ | 186/265 [01:25<00:36, 2.16it/s]\u001b[A\n 71%|βββββββββββββββββββββββββββββ | 187/265 [01:26<00:36, 2.15it/s]\u001b[A\n 71%|βββββββββββββββββββββββββββββ | 188/265 [01:26<00:35, 2.16it/s]\u001b[A\n 71%|ββββββββββββββββββββββββββββββ | 189/265 [01:26<00:35, 2.16it/s]\u001b[A\n 72%|ββββββββββββββββββββββββββββββ | 190/265 [01:27<00:34, 2.16it/s]\u001b[A\n 72%|ββββββββββββββββββββββββββββββ | 191/265 [01:27<00:34, 2.15it/s]\u001b[A\n 72%|ββββββββββββββββββββββββββββββ | 192/265 [01:28<00:33, 2.16it/s]\u001b[A\n 73%|ββββββββββββββββββββββββββββββ | 193/265 [01:28<00:33, 2.16it/s]\u001b[A\n 73%|ββββββββββββββββββββββββββββββ | 194/265 [01:29<00:32, 2.16it/s]\u001b[A\n 74%|βββββββββββββββββββββββββββββββ | 195/265 [01:29<00:32, 2.16it/s]\u001b[A\n 74%|βββββββββββββββββββββββββββββββ | 196/265 [01:30<00:31, 2.17it/s]\u001b[A\n 74%|βββββββββββββββββββββββββββββββ | 197/265 [01:30<00:31, 2.16it/s]\u001b[A\n 75%|βββββββββββββββββββββββββββββββ | 198/265 [01:31<00:31, 2.16it/s]\u001b[A\n 75%|βββββββββββββββββββββββββββββββ | 199/265 [01:31<00:30, 2.16it/s]\u001b[A\n 75%|βββββββββββββββββββββββββββββββ | 200/265 [01:32<00:30, 2.17it/s]\u001b[A\n 76%|βββββββββββββββββββββββββββββββ | 201/265 [01:32<00:29, 2.16it/s]\u001b[A\n 76%|ββββββββββββββββββββββββββββββββ | 202/265 [01:32<00:29, 2.16it/s]\u001b[A\n 77%|ββββββββββββββββββββββββββββββββ | 203/265 [01:33<00:28, 2.17it/s]\u001b[A\n 77%|ββββββββββββββββββββββββββββββββ | 204/265 [01:33<00:28, 2.17it/s]\u001b[A\n 77%|ββββββββββββββββββββββββββββββββ | 205/265 [01:34<00:27, 2.16it/s]\u001b[A\n 78%|ββββββββββββββββββββββββββββββββ | 206/265 [01:34<00:27, 2.15it/s]\u001b[A\n 78%|ββββββββββββββββββββββββββββββββ | 207/265 [01:35<00:26, 2.16it/s]\u001b[A\n 78%|βββββββββββββββββββββββββββββββββ | 208/265 [01:35<00:26, 2.16it/s]\u001b[A\n 79%|βββββββββββββββββββββββββββββββββ | 209/265 [01:36<00:26, 2.15it/s]\u001b[A\n 79%|βββββββββββββββββββββββββββββββββ | 210/265 [01:36<00:25, 2.16it/s]\u001b[A\n 80%|βββββββββββββββββββββββββββββββββ | 211/265 [01:37<00:24, 2.17it/s]\u001b[A\n 80%|βββββββββββββββββββββββββββββββββ | 212/265 [01:37<00:24, 2.16it/s]\u001b[A\n 80%|βββββββββββββββββββββββββββββββββ | 213/265 [01:38<00:24, 2.16it/s]\u001b[A\n 81%|βββββββββββββββββββββββββββββββββ | 214/265 [01:38<00:23, 2.16it/s]\u001b[A\n 81%|ββββββββββββββββββββββββββββββββββ | 215/265 [01:39<00:23, 2.16it/s]\u001b[A\n 82%|ββββββββββββββββββββββββββββββββββ | 216/265 [01:39<00:22, 2.17it/s]\u001b[A\n 82%|ββββββββββββββββββββββββββββββββββ | 217/265 [01:39<00:22, 2.17it/s]\u001b[A\n 82%|ββββββββββββββββββββββββββββββββββ | 218/265 [01:40<00:21, 2.16it/s]\u001b[A\n 83%|ββββββββββββββββββββββββββββββββββ | 219/265 [01:40<00:21, 2.16it/s]\u001b[A\n 83%|ββββββββββββββββββββββββββββββββββ | 220/265 [01:41<00:20, 2.17it/s]\u001b[A\n 83%|βββββββββββββββββββββββββββββββββββ | 221/265 [01:41<00:20, 2.17it/s]\u001b[A\n 84%|βββββββββββββββββββββββββββββββββββ | 222/265 [01:42<00:19, 2.17it/s]\u001b[A\n 84%|βββββββββββββββββββββββββββββββββββ | 223/265 [01:42<00:19, 2.16it/s]\u001b[A\n 85%|βββββββββββββββββββββββββββββββββββ | 224/265 [01:43<00:18, 2.16it/s]\u001b[A\n 85%|βββββββββββββββββββββββββββββββββββ | 225/265 [01:43<00:18, 2.16it/s]\u001b[A\n 85%|βββββββββββββββββββββββββββββββββββ | 226/265 [01:44<00:18, 2.16it/s]\u001b[A\n 86%|βββββββββββββββββββββββββββββββββββ | 227/265 [01:44<00:17, 2.16it/s]\u001b[A\n 86%|ββββββββββββββββββββββββββββββββββββ | 228/265 [01:45<00:17, 2.17it/s]\u001b[A\n 86%|ββββββββββββββββββββββββββββββββββββ | 229/265 [01:45<00:16, 2.17it/s]\u001b[A\n 87%|ββββββββββββββββββββββββββββββββββββ | 230/265 [01:45<00:16, 2.17it/s]\u001b[A\n 87%|ββββββββββββββββββββββββββββββββββββ | 231/265 [01:46<00:15, 2.17it/s]\u001b[A\n 88%|ββββββββββββββββββββββββββββββββββββ | 232/265 [01:46<00:15, 2.17it/s]\u001b[A\n 88%|ββββββββββββββββββββββββββββββββββββ | 233/265 [01:47<00:14, 2.17it/s]\u001b[A\n 88%|βββββββββββββββββββββββββββββββββββββ | 234/265 [01:47<00:14, 2.17it/s]\u001b[A\n 89%|βββββββββββββββββββββββββββββββββββββ | 235/265 [01:48<00:13, 2.16it/s]\u001b[A\n 89%|βββββββββββββββββββββββββββββββββββββ | 236/265 [01:48<00:13, 2.17it/s]\u001b[A\n 89%|βββββββββββββββββββββββββββββββββββββ | 237/265 [01:49<00:12, 2.17it/s]\u001b[A\n 90%|βββββββββββββββββββββββββββββββββββββ | 238/265 [01:49<00:12, 2.17it/s]\u001b[A\n 90%|βββββββββββββββββββββββββββββββββββββ | 239/265 [01:50<00:12, 2.16it/s]\u001b[A\n 91%|ββββββββββββββββββββββββββββββββββββββ | 240/265 [01:50<00:11, 2.16it/s]\u001b[A\n 91%|ββββββββββββββββββββββββββββββββββββββ | 241/265 [01:51<00:11, 2.16it/s]\u001b[A\n 91%|ββββββββββββββββββββββββββββββββββββββ | 242/265 [01:51<00:10, 2.17it/s]\u001b[A\n 92%|ββββββββββββββββββββββββββββββββββββββ | 243/265 [01:51<00:10, 2.16it/s]\u001b[A\n 92%|ββββββββββββββββββββββββββββββββββββββ | 244/265 [01:52<00:09, 2.16it/s]\u001b[A\n 92%|ββββββββββββββββββββββββββββββββββββββ | 245/265 [01:52<00:09, 2.17it/s]\u001b[A\n 93%|ββββββββββββββββββββββββββββββββββββββ | 246/265 [01:53<00:08, 2.16it/s]\u001b[A\n 93%|βββββββββββββββββββββββββββββββββββββββ | 247/265 [01:53<00:08, 2.15it/s]\u001b[A\n 94%|βββββββββββββββββββββββββββββββββββββββ | 248/265 [01:54<00:07, 2.14it/s]\u001b[A\n 94%|βββββββββββββββββββββββββββββββββββββββ | 249/265 [01:54<00:07, 2.14it/s]\u001b[A\n 94%|βββββββββββββββββββββββββββββββββββββββ | 250/265 [01:55<00:06, 2.16it/s]\u001b[A\n 95%|βββββββββββββββββββββββββββββββββββββββ | 251/265 [01:55<00:06, 2.15it/s]\u001b[A\n 95%|βββββββββββββββββββββββββββββββββββββββ | 252/265 [01:56<00:06, 2.16it/s]\u001b[A\n 95%|ββββββββββββββββββββββββββββββββββββββββ | 253/265 [01:56<00:05, 2.16it/s]\u001b[A\n 96%|ββββββββββββββββββββββββββββββββββββββββ | 254/265 [01:57<00:05, 2.16it/s]\u001b[A\n 96%|ββββββββββββββββββββββββββββββββββββββββ | 255/265 [01:57<00:04, 2.16it/s]\u001b[A\n 97%|ββββββββββββββββββββββββββββββββββββββββ | 256/265 [01:57<00:04, 2.16it/s]\u001b[A\n 97%|ββββββββββββββββββββββββββββββββββββββββ | 257/265 [01:58<00:03, 2.16it/s]\u001b[A\n 97%|ββββββββββββββββββββββββββββββββββββββββ | 258/265 [01:58<00:03, 2.17it/s]\u001b[A\n 98%|ββββββββββββββββββββββββββββββββββββββββ | 259/265 [01:59<00:02, 2.17it/s]\u001b[A\n 98%|βββββββββββββββββββββββββββββββββββββββββ| 260/265 [01:59<00:02, 2.17it/s]\u001b[A\n 98%|βββββββββββββββββββββββββββββββββββββββββ| 261/265 [02:00<00:01, 2.18it/s]\u001b[A\n 99%|βββββββββββββββββββββββββββββββββββββββββ| 262/265 [02:00<00:01, 2.17it/s]\u001b[A\n 99%|βββββββββββββββββββββββββββββββββββββββββ| 263/265 [02:01<00:00, 2.18it/s]\u001b[A\n100%|βββββββββββββββββββββββββββββββββββββββββ| 264/265 [02:01<00:00, 2.18it/s]\u001b[A\n \u001b[A\n\u001b[A{'eval_loss': '3.302', 'eval_runtime': '123.8', 'eval_samples_per_second': '547', 'eval_steps_per_second': '2.141', 'epoch': '0.8957'}\n 30%|βββββββββββ | 4500/15000 [1:40:45<3:40:41, 1.26s/it]\n100%|βββββββββββββββββββββββββββββββββββββββββ| 265/265 [02:02<00:00, 1.96it/s]\u001b[A\n{'loss': '3.299', 'grad_norm': '0.4956', 'learning_rate': '0.0008456', 'epoch': '0.9156'}\n{'loss': '3.295', 'grad_norm': '0.4151', 'learning_rate': '0.0008374', 'epoch': '0.9355'}\n{'loss': '3.288', 'grad_norm': '0.3972', 'learning_rate': '0.0008291', 'epoch': '0.9554'}\n{'loss': '3.286', 'grad_norm': '0.4891', 'learning_rate': '0.0008205', 'epoch': '0.9753'}\n{'loss': '3.281', 'grad_norm': '0.4219', 'learning_rate': '0.0008118', 'epoch': '0.9952'}\n{'loss': '3.279', 'grad_norm': '0.4423', 'learning_rate': '0.000803', 'epoch': '1.015'}\n{'loss': '3.274', 'grad_norm': '0.451', 'learning_rate': '0.000794', 'epoch': '1.035'}\n{'loss': '3.265', 'grad_norm': '0.4244', 'learning_rate': '0.0007848', 'epoch': '1.055'}\n{'loss': '3.268', 'grad_norm': '0.4712', 'learning_rate': '0.0007755', 'epoch': '1.075'}\n{'loss': '3.266', 'grad_norm': '0.4572', 'learning_rate': '0.0007661', 'epoch': '1.095'}\n{'loss': '3.259', 'grad_norm': '0.4602', 'learning_rate': '0.0007565', 'epoch': '1.115'}\n{'loss': '3.26', 'grad_norm': '0.482', 'learning_rate': '0.0007469', 'epoch': '1.135'}\n{'loss': '3.256', 'grad_norm': '0.3824', 'learning_rate': '0.000737', 'epoch': '1.154'}\n{'loss': '3.251', 'grad_norm': '0.4026', 'learning_rate': '0.0007271', 'epoch': '1.174'}\n{'loss': '3.254', 'grad_norm': '0.4204', 'learning_rate': '0.000717', 'epoch': '1.194'}\n 40%|ββββββββββββββ | 6000/15000 [2:12:21<3:09:36, 1.26s/it]\n 0%| | 0/265 [00:00<?, ?it/s]\u001b[A\n 1%|β | 2/265 [00:00<01:00, 4.34it/s]\u001b[A\n 1%|β | 3/265 [00:00<01:25, 3.07it/s]\u001b[A\n 2%|β | 4/265 [00:01<01:38, 2.65it/s]\u001b[A\n 2%|β | 5/265 [00:01<01:45, 2.47it/s]\u001b[A\n 2%|β | 6/265 [00:02<01:50, 2.34it/s]\u001b[A\n 3%|ββ | 7/265 [00:02<01:53, 2.28it/s]\u001b[A\n 3%|ββ | 8/265 [00:03<01:53, 2.26it/s]\u001b[A\n 3%|ββ | 9/265 [00:03<01:55, 2.22it/s]\u001b[A\n 4%|ββ | 10/265 [00:04<01:55, 2.21it/s]\u001b[A\n 4%|ββ | 11/265 [00:04<01:55, 2.20it/s]\u001b[A\n 5%|ββ | 12/265 [00:05<01:55, 2.18it/s]\u001b[A\n 5%|ββ | 13/265 [00:05<01:55, 2.18it/s]\u001b[A\n 5%|βββ | 14/265 [00:05<01:55, 2.18it/s]\u001b[A\n 6%|βββ | 15/265 [00:06<01:54, 2.18it/s]\u001b[A\n 6%|βββ | 16/265 [00:06<01:54, 2.17it/s]\u001b[A\n 6%|βββ | 17/265 [00:07<01:54, 2.18it/s]\u001b[A\n 7%|βββ | 18/265 [00:07<01:53, 2.18it/s]\u001b[A\n 7%|βββ | 19/265 [00:08<01:53, 2.17it/s]\u001b[A\n 8%|ββββ | 20/265 [00:08<01:52, 2.18it/s]\u001b[A\n 8%|ββββ | 21/265 [00:09<01:51, 2.18it/s]\u001b[A\n 8%|ββββ | 22/265 [00:09<01:51, 2.18it/s]\u001b[A\n 9%|ββββ | 23/265 [00:10<01:51, 2.17it/s]\u001b[A\n 9%|ββββ | 24/265 [00:10<01:50, 2.18it/s]\u001b[A\n 9%|ββββ | 25/265 [00:11<01:50, 2.17it/s]\u001b[A\n 10%|ββββ | 26/265 [00:11<01:50, 2.17it/s]\u001b[A\n 10%|βββββ | 27/265 [00:11<01:49, 2.17it/s]\u001b[A\n 11%|βββββ | 28/265 [00:12<01:48, 2.17it/s]\u001b[A\n 11%|βββββ | 29/265 [00:12<01:48, 2.17it/s]\u001b[A\n 11%|βββββ | 30/265 [00:13<01:48, 2.17it/s]\u001b[A\n 12%|βββββ | 31/265 [00:13<01:47, 2.18it/s]\u001b[A\n 12%|βββββ | 32/265 [00:14<01:47, 2.16it/s]\u001b[A\n 12%|ββββββ | 33/265 [00:14<01:47, 2.16it/s]\u001b[A\n 13%|ββββββ | 34/265 [00:15<01:46, 2.17it/s]\u001b[A\n 13%|ββββββ | 35/265 [00:15<01:46, 2.17it/s]\u001b[A\n 14%|ββββββ | 36/265 [00:16<01:46, 2.16it/s]\u001b[A\n 14%|ββββββ | 37/265 [00:16<01:45, 2.15it/s]\u001b[A\n 14%|ββββββ | 38/265 [00:17<01:45, 2.15it/s]\u001b[A\n 15%|βββββββ | 39/265 [00:17<01:44, 2.16it/s]\u001b[A\n 15%|βββββββ | 40/265 [00:17<01:44, 2.15it/s]\u001b[A\n 15%|βββββββ | 41/265 [00:18<01:43, 2.17it/s]\u001b[A\n 16%|βββββββ | 42/265 [00:18<01:42, 2.17it/s]\u001b[A\n 16%|βββββββ | 43/265 [00:19<01:42, 2.16it/s]\u001b[A\n 17%|βββββββ | 44/265 [00:19<01:42, 2.16it/s]\u001b[A\n 17%|ββββββββ | 45/265 [00:20<01:41, 2.17it/s]\u001b[A\n 17%|ββββββββ | 46/265 [00:20<01:40, 2.18it/s]\u001b[A\n 18%|ββββββββ | 47/265 [00:21<01:40, 2.16it/s]\u001b[A\n 18%|ββββββββ | 48/265 [00:21<01:40, 2.15it/s]\u001b[A\n 18%|ββββββββ | 49/265 [00:22<01:39, 2.16it/s]\u001b[A\n 19%|ββββββββ | 50/265 [00:22<01:39, 2.17it/s]\u001b[A\n 19%|ββββββββ | 51/265 [00:23<01:39, 2.15it/s]\u001b[A\n 20%|βββββββββ | 52/265 [00:23<01:38, 2.16it/s]\u001b[A\n 20%|βββββββββ | 53/265 [00:23<01:37, 2.17it/s]\u001b[A\n 20%|βββββββββ | 54/265 [00:24<01:37, 2.16it/s]\u001b[A\n 21%|βββββββββ | 55/265 [00:24<01:37, 2.15it/s]\u001b[A\n 21%|βββββββββ | 56/265 [00:25<01:36, 2.16it/s]\u001b[A\n 22%|βββββββββ | 57/265 [00:25<01:36, 2.16it/s]\u001b[A\n 22%|ββββββββββ | 58/265 [00:26<01:35, 2.16it/s]\u001b[A\n 22%|ββββββββββ | 59/265 [00:26<01:35, 2.16it/s]\u001b[A\n 23%|ββββββββββ | 60/265 [00:27<01:35, 2.16it/s]\u001b[A\n 23%|ββββββββββ | 61/265 [00:27<01:34, 2.16it/s]\u001b[A\n 23%|ββββββββββ | 62/265 [00:28<01:34, 2.15it/s]\u001b[A\n 24%|ββββββββββ | 63/265 [00:28<01:34, 2.15it/s]\u001b[A\n 24%|βββββββββββ | 64/265 [00:29<01:33, 2.16it/s]\u001b[A\n 25%|βββββββββββ | 65/265 [00:29<01:32, 2.16it/s]\u001b[A\n 25%|βββββββββββ | 66/265 [00:30<01:32, 2.15it/s]\u001b[A\n 25%|βββββββββββ | 67/265 [00:30<01:32, 2.14it/s]\u001b[A\n 26%|βββββββββββ | 68/265 [00:30<01:31, 2.16it/s]\u001b[A\n 26%|βββββββββββ | 69/265 [00:31<01:30, 2.16it/s]\u001b[A\n 26%|βββββββββββ | 70/265 [00:31<01:30, 2.15it/s]\u001b[A\n 27%|ββββββββββββ | 71/265 [00:32<01:29, 2.16it/s]\u001b[A\n 27%|ββββββββββββ | 72/265 [00:32<01:29, 2.17it/s]\u001b[A\n 28%|ββββββββββββ | 73/265 [00:33<01:28, 2.16it/s]\u001b[A\n 28%|ββββββββββββ | 74/265 [00:33<01:28, 2.16it/s]\u001b[A\n 28%|ββββββββββββ | 75/265 [00:34<01:27, 2.16it/s]\u001b[A\n 29%|ββββββββββββ | 76/265 [00:34<01:27, 2.16it/s]\u001b[A\n 29%|βββββββββββββ | 77/265 [00:35<01:27, 2.16it/s]\u001b[A\n 29%|βββββββββββββ | 78/265 [00:35<01:27, 2.14it/s]\u001b[A\n 30%|βββββββββββββ | 79/265 [00:36<01:26, 2.15it/s]\u001b[A\n 30%|βββββββββββββ | 80/265 [00:36<01:26, 2.14it/s]\u001b[A\n 31%|βββββββββββββ | 81/265 [00:36<01:25, 2.15it/s]\u001b[A\n 31%|βββββββββββββ | 82/265 [00:37<01:24, 2.15it/s]\u001b[A\n 31%|ββββββββββββββ | 83/265 [00:37<01:24, 2.15it/s]\u001b[A\n 32%|ββββββββββββββ | 84/265 [00:38<01:24, 2.15it/s]\u001b[A\n 32%|ββββββββββββββ | 85/265 [00:38<01:23, 2.15it/s]\u001b[A\n 32%|ββββββββββββββ | 86/265 [00:39<01:23, 2.14it/s]\u001b[A\n 33%|ββββββββββββββ | 87/265 [00:39<01:22, 2.15it/s]\u001b[A\n 33%|ββββββββββββββ | 88/265 [00:40<01:22, 2.15it/s]\u001b[A\n 34%|ββββββββββββββ | 89/265 [00:40<01:22, 2.15it/s]\u001b[A\n 34%|βββββββββββββββ | 90/265 [00:41<01:21, 2.15it/s]\u001b[A\n 34%|βββββββββββββββ | 91/265 [00:41<01:20, 2.15it/s]\u001b[A\n 35%|βββββββββββββββ | 92/265 [00:42<01:20, 2.16it/s]\u001b[A\n 35%|βββββββββββββββ | 93/265 [00:42<01:20, 2.15it/s]\u001b[A\n 35%|βββββββββββββββ | 94/265 [00:43<01:19, 2.14it/s]\u001b[A\n 36%|βββββββββββββββ | 95/265 [00:43<01:18, 2.16it/s]\u001b[A\n 36%|ββββββββββββββββ | 96/265 [00:43<01:18, 2.16it/s]\u001b[A\n 37%|ββββββββββββββββ | 97/265 [00:44<01:18, 2.15it/s]\u001b[A\n 37%|ββββββββββββββββ | 98/265 [00:44<01:17, 2.15it/s]\u001b[A\n 37%|ββββββββββββββββ | 99/265 [00:45<01:17, 2.16it/s]\u001b[A\n 38%|ββββββββββββββββ | 100/265 [00:45<01:17, 2.14it/s]\u001b[A\n 38%|ββββββββββββββββ | 101/265 [00:46<01:16, 2.15it/s]\u001b[A\n 38%|ββββββββββββββββ | 102/265 [00:46<01:15, 2.15it/s]\u001b[A\n 39%|ββββββββββββββββ | 103/265 [00:47<01:15, 2.14it/s]\u001b[A\n 39%|ββββββββββββββββ | 104/265 [00:47<01:14, 2.15it/s]\u001b[A\n 40%|βββββββββββββββββ | 105/265 [00:48<01:14, 2.15it/s]\u001b[A\n 40%|βββββββββββββββββ | 106/265 [00:48<01:14, 2.15it/s]\u001b[A\n 40%|βββββββββββββββββ | 107/265 [00:49<01:13, 2.15it/s]\u001b[A\n 41%|βββββββββββββββββ | 108/265 [00:49<01:12, 2.15it/s]\u001b[A\n 41%|βββββββββββββββββ | 109/265 [00:50<01:12, 2.16it/s]\u001b[A\n 42%|βββββββββββββββββ | 110/265 [00:50<01:11, 2.16it/s]\u001b[A\n 42%|ββββββββββββββββββ | 111/265 [00:50<01:11, 2.15it/s]\u001b[A\n 42%|ββββββββββββββββββ | 112/265 [00:51<01:11, 2.15it/s]\u001b[A\n 43%|ββββββββββββββββββ | 113/265 [00:51<01:10, 2.15it/s]\u001b[A\n 43%|ββββββββββββββββββ | 114/265 [00:52<01:10, 2.15it/s]\u001b[A\n 43%|ββββββββββββββββββ | 115/265 [00:52<01:09, 2.15it/s]\u001b[A\n 44%|ββββββββββββββββββ | 116/265 [00:53<01:09, 2.15it/s]\u001b[A\n 44%|ββββββββββββββββββ | 117/265 [00:53<01:08, 2.16it/s]\u001b[A\n 45%|βββββββββββββββββββ | 118/265 [00:54<01:07, 2.17it/s]\u001b[A\n 45%|βββββββββββββββββββ | 119/265 [00:54<01:07, 2.16it/s]\u001b[A\n 45%|βββββββββββββββββββ | 120/265 [00:55<01:07, 2.16it/s]\u001b[A\n 46%|βββββββββββββββββββ | 121/265 [00:55<01:06, 2.16it/s]\u001b[A\n 46%|βββββββββββββββββββ | 122/265 [00:56<01:06, 2.16it/s]\u001b[A\n 46%|βββββββββββββββββββ | 123/265 [00:56<01:06, 2.15it/s]\u001b[A\n 47%|ββββββββββββββββββββ | 124/265 [00:56<01:05, 2.14it/s]\u001b[A\n 47%|ββββββββββββββββββββ | 125/265 [00:57<01:05, 2.15it/s]\u001b[A\n 48%|ββββββββββββββββββββ | 126/265 [00:57<01:04, 2.15it/s]\u001b[A\n 48%|ββββββββββββββββββββ | 127/265 [00:58<01:04, 2.15it/s]\u001b[A\n 48%|ββββββββββββββββββββ | 128/265 [00:58<01:03, 2.15it/s]\u001b[A\n 49%|ββββββββββββββββββββ | 129/265 [00:59<01:03, 2.15it/s]\u001b[A\n 49%|ββββββββββββββββββββ | 130/265 [00:59<01:02, 2.16it/s]\u001b[A\n 49%|βββββββββββββββββββββ | 131/265 [01:00<01:02, 2.15it/s]\u001b[A\n 50%|βββββββββββββββββββββ | 132/265 [01:00<01:01, 2.16it/s]\u001b[A\n 50%|βββββββββββββββββββββ | 133/265 [01:01<01:01, 2.16it/s]\u001b[A\n 51%|βββββββββββββββββββββ | 134/265 [01:01<01:00, 2.17it/s]\u001b[A\n 51%|βββββββββββββββββββββ | 135/265 [01:02<01:00, 2.16it/s]\u001b[A\n 51%|βββββββββββββββββββββ | 136/265 [01:02<01:00, 2.15it/s]\u001b[A\n 52%|ββββββββββββββββββββββ | 137/265 [01:03<00:59, 2.16it/s]\u001b[A\n 52%|ββββββββββββββββββββββ | 138/265 [01:03<00:58, 2.16it/s]\u001b[A\n 52%|ββββββββββββββββββββββ | 139/265 [01:03<00:58, 2.16it/s]\u001b[A\n 53%|ββββββββββββββββββββββ | 140/265 [01:04<00:58, 2.15it/s]\u001b[A\n 53%|ββββββββββββββββββββββ | 141/265 [01:04<00:57, 2.16it/s]\u001b[A\n 54%|ββββββββββββββββββββββ | 142/265 [01:05<00:56, 2.16it/s]\u001b[A\n 54%|ββββββββββββββββββββββ | 143/265 [01:05<00:56, 2.16it/s]\u001b[A\n 54%|βββββββββββββββββββββββ | 144/265 [01:06<00:55, 2.17it/s]\u001b[A\n 55%|βββββββββββββββββββββββ | 145/265 [01:06<00:55, 2.16it/s]\u001b[A\n 55%|βββββββββββββββββββββββ | 146/265 [01:07<00:55, 2.16it/s]\u001b[A\n 55%|βββββββββββββββββββββββ | 147/265 [01:07<00:54, 2.16it/s]\u001b[A\n 56%|βββββββββββββββββββββββ | 148/265 [01:08<00:54, 2.16it/s]\u001b[A\n 56%|βββββββββββββββββββββββ | 149/265 [01:08<00:53, 2.16it/s]\u001b[A\n 57%|ββββββββββββββββββββββββ | 150/265 [01:09<00:52, 2.17it/s]\u001b[A\n 57%|ββββββββββββββββββββββββ | 151/265 [01:09<00:52, 2.17it/s]\u001b[A\n 57%|ββββββββββββββββββββββββ | 152/265 [01:09<00:52, 2.15it/s]\u001b[A\n 58%|ββββββββββββββββββββββββ | 153/265 [01:10<00:51, 2.16it/s]\u001b[A\n 58%|ββββββββββββββββββββββββ | 154/265 [01:10<00:51, 2.15it/s]\u001b[A\n 58%|ββββββββββββββββββββββββ | 155/265 [01:11<00:51, 2.16it/s]\u001b[A\n 59%|βββββββββββββββββββββββββ | 156/265 [01:11<00:50, 2.16it/s]\u001b[A\n 59%|βββββββββββββββββββββββββ | 157/265 [01:12<00:50, 2.16it/s]\u001b[A\n 60%|βββββββββββββββββββββββββ | 158/265 [01:12<00:49, 2.17it/s]\u001b[A\n 60%|βββββββββββββββββββββββββ | 159/265 [01:13<00:48, 2.16it/s]\u001b[A\n 60%|βββββββββββββββββββββββββ | 160/265 [01:13<00:48, 2.15it/s]\u001b[A\n 61%|βββββββββββββββββββββββββ | 161/265 [01:14<00:48, 2.16it/s]\u001b[A\n 61%|βββββββββββββββββββββββββ | 162/265 [01:14<00:47, 2.16it/s]\u001b[A\n 62%|ββββββββββββββββββββββββββ | 163/265 [01:15<00:47, 2.17it/s]\u001b[A\n 62%|ββββββββββββββββββββββββββ | 164/265 [01:15<00:46, 2.16it/s]\u001b[A\n 62%|ββββββββββββββββββββββββββ | 165/265 [01:15<00:46, 2.16it/s]\u001b[A\n 63%|ββββββββββββββββββββββββββ | 166/265 [01:16<00:45, 2.17it/s]\u001b[A\n 63%|ββββββββββββββββββββββββββ | 167/265 [01:16<00:45, 2.16it/s]\u001b[A\n 63%|ββββββββββββββββββββββββββ | 168/265 [01:17<00:45, 2.15it/s]\u001b[A\n 64%|βββββββββββββββββββββββββββ | 169/265 [01:17<00:44, 2.15it/s]\u001b[A\n 64%|βββββββββββββββββββββββββββ | 170/265 [01:18<00:43, 2.16it/s]\u001b[A\n 65%|βββββββββββββββββββββββββββ | 171/265 [01:18<00:43, 2.16it/s]\u001b[A\n 65%|βββββββββββββββββββββββββββ | 172/265 [01:19<00:42, 2.17it/s]\u001b[A\n 65%|βββββββββββββββββββββββββββ | 173/265 [01:19<00:42, 2.17it/s]\u001b[A\n 66%|βββββββββββββββββββββββββββ | 174/265 [01:20<00:42, 2.16it/s]\u001b[A\n 66%|βββββββββββββββββββββββββββ | 175/265 [01:20<00:41, 2.16it/s]\u001b[A\n 66%|ββββββββββββββββββββββββββββ | 176/265 [01:21<00:40, 2.17it/s]\u001b[A\n 67%|ββββββββββββββββββββββββββββ | 177/265 [01:21<00:40, 2.17it/s]\u001b[A\n 67%|ββββββββββββββββββββββββββββ | 178/265 [01:21<00:40, 2.16it/s]\u001b[A\n 68%|ββββββββββββββββββββββββββββ | 179/265 [01:22<00:39, 2.16it/s]\u001b[A\n 68%|ββββββββββββββββββββββββββββ | 180/265 [01:22<00:39, 2.16it/s]\u001b[A\n 68%|ββββββββββββββββββββββββββββ | 181/265 [01:23<00:38, 2.17it/s]\u001b[A\n 69%|βββββββββββββββββββββββββββββ | 182/265 [01:23<00:38, 2.17it/s]\u001b[A\n 69%|βββββββββββββββββββββββββββββ | 183/265 [01:24<00:37, 2.17it/s]\u001b[A\n 69%|βββββββββββββββββββββββββββββ | 184/265 [01:24<00:37, 2.17it/s]\u001b[A\n 70%|βββββββββββββββββββββββββββββ | 185/265 [01:25<00:37, 2.16it/s]\u001b[A\n 70%|βββββββββββββββββββββββββββββ | 186/265 [01:25<00:36, 2.17it/s]\u001b[A\n 71%|βββββββββββββββββββββββββββββ | 187/265 [01:26<00:35, 2.17it/s]\u001b[A\n 71%|βββββββββββββββββββββββββββββ | 188/265 [01:26<00:35, 2.17it/s]\u001b[A\n 71%|ββββββββββββββββββββββββββββββ | 189/265 [01:27<00:35, 2.16it/s]\u001b[A\n 72%|ββββββββββββββββββββββββββββββ | 190/265 [01:27<00:34, 2.16it/s]\u001b[A\n 72%|ββββββββββββββββββββββββββββββ | 191/265 [01:28<00:34, 2.15it/s]\u001b[A\n 72%|ββββββββββββββββββββββββββββββ | 192/265 [01:28<00:33, 2.15it/s]\u001b[A\n 73%|ββββββββββββββββββββββββββββββ | 193/265 [01:28<00:33, 2.15it/s]\u001b[A\n 73%|ββββββββββββββββββββββββββββββ | 194/265 [01:29<00:33, 2.15it/s]\u001b[A\n 74%|βββββββββββββββββββββββββββββββ | 195/265 [01:29<00:32, 2.15it/s]\u001b[A\n 74%|βββββββββββββββββββββββββββββββ | 196/265 [01:30<00:32, 2.15it/s]\u001b[A\n 74%|βββββββββββββββββββββββββββββββ | 197/265 [01:30<00:31, 2.15it/s]\u001b[A\n 75%|βββββββββββββββββββββββββββββββ | 198/265 [01:31<00:31, 2.14it/s]\u001b[A\n 75%|βββββββββββββββββββββββββββββββ | 199/265 [01:31<00:30, 2.15it/s]\u001b[A\n 75%|βββββββββββββββββββββββββββββββ | 200/265 [01:32<00:30, 2.15it/s]\u001b[A\n 76%|βββββββββββββββββββββββββββββββ | 201/265 [01:32<00:29, 2.16it/s]\u001b[A\n 76%|ββββββββββββββββββββββββββββββββ | 202/265 [01:33<00:29, 2.16it/s]\u001b[A\n 77%|ββββββββββββββββββββββββββββββββ | 203/265 [01:33<00:28, 2.16it/s]\u001b[A\n 77%|ββββββββββββββββββββββββββββββββ | 204/265 [01:34<00:28, 2.16it/s]\u001b[A\n 77%|ββββββββββββββββββββββββββββββββ | 205/265 [01:34<00:27, 2.15it/s]\u001b[A\n 78%|ββββββββββββββββββββββββββββββββ | 206/265 [01:34<00:27, 2.14it/s]\u001b[A\n 78%|ββββββββββββββββββββββββββββββββ | 207/265 [01:35<00:27, 2.15it/s]\u001b[A\n 78%|βββββββββββββββββββββββββββββββββ | 208/265 [01:35<00:26, 2.16it/s]\u001b[A\n 79%|βββββββββββββββββββββββββββββββββ | 209/265 [01:36<00:26, 2.15it/s]\u001b[A\n 79%|βββββββββββββββββββββββββββββββββ | 210/265 [01:36<00:25, 2.16it/s]\u001b[A\n 80%|βββββββββββββββββββββββββββββββββ | 211/265 [01:37<00:25, 2.15it/s]\u001b[A\n 80%|βββββββββββββββββββββββββββββββββ | 212/265 [01:37<00:24, 2.16it/s]\u001b[A\n 80%|βββββββββββββββββββββββββββββββββ | 213/265 [01:38<00:23, 2.17it/s]\u001b[A\n 81%|βββββββββββββββββββββββββββββββββ | 214/265 [01:38<00:23, 2.17it/s]\u001b[A\n 81%|ββββββββββββββββββββββββββββββββββ | 215/265 [01:39<00:23, 2.17it/s]\u001b[A\n 82%|ββββββββββββββββββββββββββββββββββ | 216/265 [01:39<00:22, 2.17it/s]\u001b[A\n 82%|ββββββββββββββββββββββββββββββββββ | 217/265 [01:40<00:22, 2.15it/s]\u001b[A\n 82%|ββββββββββββββββββββββββββββββββββ | 218/265 [01:40<00:21, 2.17it/s]\u001b[A\n 83%|ββββββββββββββββββββββββββββββββββ | 219/265 [01:40<00:21, 2.16it/s]\u001b[A\n 83%|ββββββββββββββββββββββββββββββββββ | 220/265 [01:41<00:20, 2.16it/s]\u001b[A\n 83%|βββββββββββββββββββββββββββββββββββ | 221/265 [01:41<00:20, 2.16it/s]\u001b[A\n 84%|βββββββββββββββββββββββββββββββββββ | 222/265 [01:42<00:19, 2.15it/s]\u001b[A\n 84%|βββββββββββββββββββββββββββββββββββ | 223/265 [01:42<00:19, 2.16it/s]\u001b[A\n 85%|βββββββββββββββββββββββββββββββββββ | 224/265 [01:43<00:18, 2.17it/s]\u001b[A\n 85%|βββββββββββββββββββββββββββββββββββ | 225/265 [01:43<00:18, 2.17it/s]\u001b[A\n 85%|βββββββββββββββββββββββββββββββββββ | 226/265 [01:44<00:18, 2.16it/s]\u001b[A\n 86%|βββββββββββββββββββββββββββββββββββ | 227/265 [01:44<00:17, 2.15it/s]\u001b[A\n 86%|ββββββββββββββββββββββββββββββββββββ | 228/265 [01:45<00:17, 2.16it/s]\u001b[A\n 86%|ββββββββββββββββββββββββββββββββββββ | 229/265 [01:45<00:16, 2.16it/s]\u001b[A\n 87%|ββββββββββββββββββββββββββββββββββββ | 230/265 [01:46<00:16, 2.15it/s]\u001b[A\n 87%|ββββββββββββββββββββββββββββββββββββ | 231/265 [01:46<00:15, 2.16it/s]\u001b[A\n 88%|ββββββββββββββββββββββββββββββββββββ | 232/265 [01:46<00:15, 2.17it/s]\u001b[A\n 88%|ββββββββββββββββββββββββββββββββββββ | 233/265 [01:47<00:14, 2.17it/s]\u001b[A\n 88%|βββββββββββββββββββββββββββββββββββββ | 234/265 [01:47<00:14, 2.15it/s]\u001b[A\n 89%|βββββββββββββββββββββββββββββββββββββ | 235/265 [01:48<00:13, 2.16it/s]\u001b[A\n 89%|βββββββββββββββββββββββββββββββββββββ | 236/265 [01:48<00:13, 2.16it/s]\u001b[A\n 89%|βββββββββββββββββββββββββββββββββββββ | 237/265 [01:49<00:12, 2.16it/s]\u001b[A\n 90%|βββββββββββββββββββββββββββββββββββββ | 238/265 [01:49<00:12, 2.16it/s]\u001b[A\n 90%|βββββββββββββββββββββββββββββββββββββ | 239/265 [01:50<00:12, 2.15it/s]\u001b[A\n 91%|ββββββββββββββββββββββββββββββββββββββ | 240/265 [01:50<00:11, 2.16it/s]\u001b[A\n 91%|ββββββββββββββββββββββββββββββββββββββ | 241/265 [01:51<00:11, 2.17it/s]\u001b[A\n 91%|ββββββββββββββββββββββββββββββββββββββ | 242/265 [01:51<00:10, 2.16it/s]\u001b[A\n 92%|ββββββββββββββββββββββββββββββββββββββ | 243/265 [01:52<00:10, 2.16it/s]\u001b[A\n 92%|ββββββββββββββββββββββββββββββββββββββ | 244/265 [01:52<00:09, 2.16it/s]\u001b[A\n 92%|ββββββββββββββββββββββββββββββββββββββ | 245/265 [01:53<00:09, 2.15it/s]\u001b[A\n 93%|ββββββββββββββββββββββββββββββββββββββ | 246/265 [01:53<00:08, 2.16it/s]\u001b[A\n 93%|βββββββββββββββββββββββββββββββββββββββ | 247/265 [01:53<00:08, 2.15it/s]\u001b[A\n 94%|βββββββββββββββββββββββββββββββββββββββ | 248/265 [01:54<00:07, 2.15it/s]\u001b[A\n 94%|βββββββββββββββββββββββββββββββββββββββ | 249/265 [01:54<00:07, 2.14it/s]\u001b[A\n 94%|βββββββββββββββββββββββββββββββββββββββ | 250/265 [01:55<00:06, 2.15it/s]\u001b[A\n 95%|βββββββββββββββββββββββββββββββββββββββ | 251/265 [01:55<00:06, 2.15it/s]\u001b[A\n 95%|βββββββββββββββββββββββββββββββββββββββ | 252/265 [01:56<00:06, 2.15it/s]\u001b[A\n 95%|ββββββββββββββββββββββββββββββββββββββββ | 253/265 [01:56<00:05, 2.14it/s]\u001b[A\n 96%|ββββββββββββββββββββββββββββββββββββββββ | 254/265 [01:57<00:05, 2.14it/s]\u001b[A\n 96%|ββββββββββββββββββββββββββββββββββββββββ | 255/265 [01:57<00:04, 2.15it/s]\u001b[A\n 97%|ββββββββββββββββββββββββββββββββββββββββ | 256/265 [01:58<00:04, 2.15it/s]\u001b[A\n 97%|ββββββββββββββββββββββββββββββββββββββββ | 257/265 [01:58<00:03, 2.16it/s]\u001b[A\n 97%|ββββββββββββββββββββββββββββββββββββββββ | 258/265 [01:59<00:03, 2.16it/s]\u001b[A\n 98%|ββββββββββββββββββββββββββββββββββββββββ | 259/265 [01:59<00:02, 2.17it/s]\u001b[A\n 98%|βββββββββββββββββββββββββββββββββββββββββ| 260/265 [01:59<00:02, 2.17it/s]\u001b[A\n 98%|βββββββββββββββββββββββββββββββββββββββββ| 261/265 [02:00<00:01, 2.16it/s]\u001b[A\n 99%|βββββββββββββββββββββββββββββββββββββββββ| 262/265 [02:00<00:01, 2.17it/s]\u001b[A\n 99%|βββββββββββββββββββββββββββββββββββββββββ| 263/265 [02:01<00:00, 2.17it/s]\u001b[A\n100%|βββββββββββββββββββββββββββββββββββββββββ| 264/265 [02:01<00:00, 2.17it/s]\u001b[A\n \u001b[A\n\u001b[A{'eval_loss': '3.253', 'eval_runtime': '124', 'eval_samples_per_second': '545.9', 'eval_steps_per_second': '2.137', 'epoch': '1.194'}\n 40%|ββββββββββββββ | 6000/15000 [2:14:25<3:09:36, 1.26s/it]\n100%|βββββββββββββββββββββββββββββββββββββββββ| 265/265 [02:02<00:00, 1.95it/s]\u001b[A\n \u001b[A\nWriting model shards: 100%|βββββββββββββββββββββββ| 1/1 [00:00<00:00, 22.53it/s]\u001b[A\n{'loss': '3.249', 'grad_norm': '0.4128', 'learning_rate': '0.0007069', 'epoch': '1.214'}\n{'loss': '3.245', 'grad_norm': '0.4653', 'learning_rate': '0.0006966', 'epoch': '1.234'}\n{'loss': '3.244', 'grad_norm': '0.4505', 'learning_rate': '0.0006863', 'epoch': '1.254'}\n{'loss': '3.237', 'grad_norm': '0.4407', 'learning_rate': '0.0006758', 'epoch': '1.274'}\n{'loss': '3.237', 'grad_norm': '0.4491', 'learning_rate': '0.0006652', 'epoch': '1.294'}\n 44%|βββββββββββββββ | 6527/15000 [2:25:30<2:58:44, 1.27s/it]","output_type":"stream"}],"execution_count":null},{"cell_type":"code","source":"%%writefile finalize_model.py\nimport torch\nimport os\nfrom transformers import AutoModelForCausalLM, AutoTokenizer\n\npath = \"./spark_v4_final\"\nsave_path = \"./spark_v4_fp16_final\"\n\nprint(\"Loading model for finalization...\")\nmodel = AutoModelForCausalLM.from_pretrained(path)\ntokenizer = AutoTokenizer.from_pretrained(path)\n\nprint(\"Converting to FP16...\")\nmodel.half() \n\nprint(f\"Saving final model to {save_path}...\")\nmodel.save_pretrained(save_path)\ntokenizer.save_pretrained(save_path)\n\nprint(\"Spark v4 ready!\")","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-23T14:23:28.372118Z","iopub.execute_input":"2026-04-23T14:23:28.373269Z","iopub.status.idle":"2026-04-23T14:23:28.379592Z","shell.execute_reply.started":"2026-04-23T14:23:28.373199Z","shell.execute_reply":"2026-04-23T14:23:28.378821Z"}},"outputs":[],"execution_count":null},{"cell_type":"code","source":"!python3 finalize_model.py","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-23T14:23:31.994449Z","iopub.execute_input":"2026-04-23T14:23:31.995089Z","iopub.status.idle":"2026-04-23T14:23:41.768206Z","shell.execute_reply.started":"2026-04-23T14:23:31.995059Z","shell.execute_reply":"2026-04-23T14:23:41.767476Z"}},"outputs":[],"execution_count":null},{"cell_type":"code","source":"%%writefile inference.py\nfrom transformers import AutoModelForCausalLM, AutoTokenizer\nimport torch\n\npath = \"spark_v4_fp16_final\"\ntokenizer = AutoTokenizer.from_pretrained(path)\nmodel = AutoModelForCausalLM.from_pretrained(path).to(\"cuda\")\n\nprompts = [\n \"Artificial Intelligence is\",\n \"The main concept of physics is\",\n \"In the year 1969, \"\n]\n\nfor prompt in prompts:\n inputs = tokenizer(prompt, return_tensors=\"pt\").to(\"cuda\")\n outputs = model.generate(\n **inputs, \n max_new_tokens=200, \n do_sample=True, \n top_k=25, \n temperature=0.8,\n pad_token_id=tokenizer.eos_token_id\n )\n print(f\"PROMPT: {prompt}\")\n print(f\"OUTPUT: {tokenizer.decode(outputs[0], skip_special_tokens=True)}\\n{'-'*40}\")","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-23T14:23:46.940223Z","iopub.execute_input":"2026-04-23T14:23:46.941423Z","iopub.status.idle":"2026-04-23T14:23:46.946858Z","shell.execute_reply.started":"2026-04-23T14:23:46.941344Z","shell.execute_reply":"2026-04-23T14:23:46.946186Z"}},"outputs":[],"execution_count":null},{"cell_type":"code","source":"!python3 inference.py","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-23T14:23:48.647731Z","iopub.execute_input":"2026-04-23T14:23:48.648497Z","iopub.status.idle":"2026-04-23T14:24:04.207269Z","shell.execute_reply.started":"2026-04-23T14:23:48.648463Z","shell.execute_reply":"2026-04-23T14:24:04.206465Z"}},"outputs":[],"execution_count":null},{"cell_type":"code","source":"!pip install lm_eval","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-23T14:24:12.714573Z","iopub.execute_input":"2026-04-23T14:24:12.715446Z","iopub.status.idle":"2026-04-23T14:24:16.904135Z","shell.execute_reply.started":"2026-04-23T14:24:12.715364Z","shell.execute_reply":"2026-04-23T14:24:16.903448Z"}},"outputs":[],"execution_count":null},{"cell_type":"code","source":"%%writefile eval.sh\nlm_eval --model hf \\\n --model_args pretrained=./spark_v4_fp16_final \\\n --tasks piqa,lambada_openai,hellaswag \\\n --device cuda:0 \\\n --batch_size 16","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-23T14:24:18.317926Z","iopub.execute_input":"2026-04-23T14:24:18.318575Z","iopub.status.idle":"2026-04-23T14:24:18.323771Z","shell.execute_reply.started":"2026-04-23T14:24:18.318542Z","shell.execute_reply":"2026-04-23T14:24:18.323034Z"}},"outputs":[],"execution_count":null},{"cell_type":"code","source":"!bash eval.sh","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-23T14:24:19.996556Z","iopub.execute_input":"2026-04-23T14:24:19.997059Z","iopub.status.idle":"2026-04-23T14:26:54.305094Z","shell.execute_reply.started":"2026-04-23T14:24:19.997027Z","shell.execute_reply":"2026-04-23T14:26:54.304135Z"}},"outputs":[],"execution_count":null}]} |