Image-Text-to-Text
Transformers
Safetensors
qwen3_5_moe
llama-factory
full
Generated from Trainer
conversational
Instructions to use BakeLab/Kallisti-35B-A3B with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use BakeLab/Kallisti-35B-A3B with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("image-text-to-text", model="BakeLab/Kallisti-35B-A3B")
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"},
            {"type": "text", "text": "What animal is on the candy?"}
        ]
    },
]
pipe(text=messages)
```

```python
# Load model directly
from transformers import AutoProcessor, AutoModelForImageTextToText

processor = AutoProcessor.from_pretrained("BakeLab/Kallisti-35B-A3B")
model = AutoModelForImageTextToText.from_pretrained("BakeLab/Kallisti-35B-A3B")
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"},
            {"type": "text", "text": "What animal is on the candy?"}
        ]
    },
]
inputs = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
print(processor.decode(outputs[0][inputs["input_ids"].shape[-1]:]))
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use BakeLab/Kallisti-35B-A3B with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "BakeLab/Kallisti-35B-A3B"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "BakeLab/Kallisti-35B-A3B",
    "messages": [
      {
        "role": "user",
        "content": [
          {
            "type": "text",
            "text": "Describe this image in one sentence."
          },
          {
            "type": "image_url",
            "image_url": {
              "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
            }
          }
        ]
      }
    ]
  }'
```
Use Docker
docker model run hf.co/BakeLab/Kallisti-35B-A3B
- SGLang
How to use BakeLab/Kallisti-35B-A3B with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "BakeLab/Kallisti-35B-A3B" \
  --host 0.0.0.0 \
  --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "BakeLab/Kallisti-35B-A3B",
    "messages": [
      {
        "role": "user",
        "content": [
          {
            "type": "text",
            "text": "Describe this image in one sentence."
          },
          {
            "type": "image_url",
            "image_url": {
              "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
            }
          }
        ]
      }
    ]
  }'
```
Use Docker images
```bash
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "BakeLab/Kallisti-35B-A3B" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "BakeLab/Kallisti-35B-A3B",
    "messages": [
      {
        "role": "user",
        "content": [
          {
            "type": "text",
            "text": "Describe this image in one sentence."
          },
          {
            "type": "image_url",
            "image_url": {
              "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
            }
          }
        ]
      }
    ]
  }'
```
- Docker Model Runner
How to use BakeLab/Kallisti-35B-A3B with Docker Model Runner:
docker model run hf.co/BakeLab/Kallisti-35B-A3B
| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.0, | |
| "eval_steps": 500, | |
| "global_step": 159, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.018867924528301886, | |
| "grad_norm": 15.087599797333977, | |
| "learning_rate": 0.0, | |
| "loss": 1.3308970928192139, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.03773584905660377, | |
| "grad_norm": 13.905749218469701, | |
| "learning_rate": 1e-07, | |
| "loss": 1.285962700843811, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.05660377358490566, | |
| "grad_norm": 12.056842971309038, | |
| "learning_rate": 2e-07, | |
| "loss": 1.3014495372772217, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.07547169811320754, | |
| "grad_norm": 13.244790508016711, | |
| "learning_rate": 3e-07, | |
| "loss": 1.306698203086853, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.09433962264150944, | |
| "grad_norm": 13.343404572220207, | |
| "learning_rate": 4e-07, | |
| "loss": 1.316192388534546, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.11320754716981132, | |
| "grad_norm": 11.45228080585811, | |
| "learning_rate": 5e-07, | |
| "loss": 1.3045374155044556, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.1320754716981132, | |
| "grad_norm": 14.318691524925551, | |
| "learning_rate": 6e-07, | |
| "loss": 1.3311705589294434, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.1509433962264151, | |
| "grad_norm": 12.050953431316902, | |
| "learning_rate": 7e-07, | |
| "loss": 1.2908077239990234, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.16981132075471697, | |
| "grad_norm": 10.724349690276135, | |
| "learning_rate": 8e-07, | |
| "loss": 1.3058435916900635, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.18867924528301888, | |
| "grad_norm": 9.473154789049158, | |
| "learning_rate": 9e-07, | |
| "loss": 1.2856130599975586, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.20754716981132076, | |
| "grad_norm": 6.5491980764342275, | |
| "learning_rate": 1e-06, | |
| "loss": 1.2199636697769165, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.22641509433962265, | |
| "grad_norm": 6.462810113541478, | |
| "learning_rate": 9.99888864929809e-07, | |
| "loss": 1.1673463582992554, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.24528301886792453, | |
| "grad_norm": 6.9296822460672045, | |
| "learning_rate": 9.995555091232516e-07, | |
| "loss": 1.1699671745300293, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.2641509433962264, | |
| "grad_norm": 6.0515568106146596, | |
| "learning_rate": 9.990000807704114e-07, | |
| "loss": 1.1814613342285156, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.2830188679245283, | |
| "grad_norm": 4.743020637878028, | |
| "learning_rate": 9.982228267815643e-07, | |
| "loss": 1.0652694702148438, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.3018867924528302, | |
| "grad_norm": 4.526630266791274, | |
| "learning_rate": 9.972240926774166e-07, | |
| "loss": 1.0635337829589844, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.32075471698113206, | |
| "grad_norm": 4.609514753545406, | |
| "learning_rate": 9.96004322435508e-07, | |
| "loss": 1.0902111530303955, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.33962264150943394, | |
| "grad_norm": 4.34097054100359, | |
| "learning_rate": 9.945640582928437e-07, | |
| "loss": 1.06702721118927, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.3584905660377358, | |
| "grad_norm": 3.864434007517437, | |
| "learning_rate": 9.9290394050485e-07, | |
| "loss": 1.0476477146148682, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.37735849056603776, | |
| "grad_norm": 3.8969527857656656, | |
| "learning_rate": 9.91024707060755e-07, | |
| "loss": 1.0617330074310303, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.39622641509433965, | |
| "grad_norm": 3.911924948199517, | |
| "learning_rate": 9.889271933555212e-07, | |
| "loss": 1.07832932472229, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.41509433962264153, | |
| "grad_norm": 3.833567236811609, | |
| "learning_rate": 9.8661233181848e-07, | |
| "loss": 1.0324124097824097, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.4339622641509434, | |
| "grad_norm": 3.6955004487569396, | |
| "learning_rate": 9.840811514988293e-07, | |
| "loss": 0.9815853834152222, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.4528301886792453, | |
| "grad_norm": 4.023404125176107, | |
| "learning_rate": 9.813347776081788e-07, | |
| "loss": 1.0266845226287842, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.4716981132075472, | |
| "grad_norm": 3.712461139695743, | |
| "learning_rate": 9.78374431020349e-07, | |
| "loss": 1.0085935592651367, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.49056603773584906, | |
| "grad_norm": 3.7543864084874596, | |
| "learning_rate": 9.752014277286431e-07, | |
| "loss": 0.9968965649604797, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.5094339622641509, | |
| "grad_norm": 3.8046734306564467, | |
| "learning_rate": 9.718171782608353e-07, | |
| "loss": 0.9803509712219238, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.5283018867924528, | |
| "grad_norm": 3.6105782650336433, | |
| "learning_rate": 9.682231870521345e-07, | |
| "loss": 0.9759021997451782, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.5471698113207547, | |
| "grad_norm": 3.3896428780092753, | |
| "learning_rate": 9.644210517764013e-07, | |
| "loss": 0.9812103509902954, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.5660377358490566, | |
| "grad_norm": 3.118079780719029, | |
| "learning_rate": 9.60412462635919e-07, | |
| "loss": 0.9091012477874756, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.5849056603773585, | |
| "grad_norm": 3.3662986364845, | |
| "learning_rate": 9.561992016100291e-07, | |
| "loss": 0.9503388404846191, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.6037735849056604, | |
| "grad_norm": 2.9779547004368196, | |
| "learning_rate": 9.517831416629716e-07, | |
| "loss": 0.9247981309890747, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.6226415094339622, | |
| "grad_norm": 3.468415170701323, | |
| "learning_rate": 9.471662459112745e-07, | |
| "loss": 0.9473499655723572, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.6415094339622641, | |
| "grad_norm": 2.8573918489427688, | |
| "learning_rate": 9.423505667510723e-07, | |
| "loss": 0.9340516328811646, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.660377358490566, | |
| "grad_norm": 2.949529150108781, | |
| "learning_rate": 9.373382449457303e-07, | |
| "loss": 0.9248940348625183, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.6792452830188679, | |
| "grad_norm": 2.9658340262784697, | |
| "learning_rate": 9.321315086741915e-07, | |
| "loss": 0.9420664310455322, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.6981132075471698, | |
| "grad_norm": 3.019712899281778, | |
| "learning_rate": 9.267326725404598e-07, | |
| "loss": 0.9231287240982056, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.7169811320754716, | |
| "grad_norm": 2.827563138085356, | |
| "learning_rate": 9.21144136544666e-07, | |
| "loss": 0.9293084740638733, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.7358490566037735, | |
| "grad_norm": 3.126960585054511, | |
| "learning_rate": 9.153683850161705e-07, | |
| "loss": 0.9372609853744507, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.7547169811320755, | |
| "grad_norm": 2.7757572634358456, | |
| "learning_rate": 9.094079855091797e-07, | |
| "loss": 0.9204014539718628, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.7735849056603774, | |
| "grad_norm": 2.86268897243828, | |
| "learning_rate": 9.032655876613635e-07, | |
| "loss": 0.9143469333648682, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.7924528301886793, | |
| "grad_norm": 2.899411491265449, | |
| "learning_rate": 8.96943922015986e-07, | |
| "loss": 0.901626467704773, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.8113207547169812, | |
| "grad_norm": 3.0296165470958494, | |
| "learning_rate": 8.90445798808068e-07, | |
| "loss": 0.9193109273910522, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.8301886792452831, | |
| "grad_norm": 2.832066082274235, | |
| "learning_rate": 8.837741067151249e-07, | |
| "loss": 0.9078618288040161, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.8490566037735849, | |
| "grad_norm": 2.9792386000035083, | |
| "learning_rate": 8.769318115730328e-07, | |
| "loss": 0.9032235145568848, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.8679245283018868, | |
| "grad_norm": 2.8570785041355373, | |
| "learning_rate": 8.699219550575952e-07, | |
| "loss": 0.8799638152122498, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.8867924528301887, | |
| "grad_norm": 2.8898604537645185, | |
| "learning_rate": 8.627476533323956e-07, | |
| "loss": 0.9072629809379578, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.9056603773584906, | |
| "grad_norm": 2.819489131324746, | |
| "learning_rate": 8.554120956635374e-07, | |
| "loss": 0.879642128944397, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.9245283018867925, | |
| "grad_norm": 2.884576949261456, | |
| "learning_rate": 8.479185430018858e-07, | |
| "loss": 0.9129672050476074, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.9433962264150944, | |
| "grad_norm": 2.8206974490824663, | |
| "learning_rate": 8.402703265334454e-07, | |
| "loss": 0.9072036147117615, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.9622641509433962, | |
| "grad_norm": 2.8666837714043414, | |
| "learning_rate": 8.324708461985124e-07, | |
| "loss": 0.8936312198638916, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.9811320754716981, | |
| "grad_norm": 2.75278105425475, | |
| "learning_rate": 8.245235691802643e-07, | |
| "loss": 0.886029839515686, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 2.9063116637756807, | |
| "learning_rate": 8.164320283634585e-07, | |
| "loss": 0.886949360370636, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 1.0188679245283019, | |
| "grad_norm": 2.8027377644406104, | |
| "learning_rate": 8.081998207639212e-07, | |
| "loss": 0.8734487891197205, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 1.0377358490566038, | |
| "grad_norm": 2.975237594360833, | |
| "learning_rate": 7.998306059295302e-07, | |
| "loss": 0.8541756868362427, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 1.0566037735849056, | |
| "grad_norm": 2.7212092257296785, | |
| "learning_rate": 7.913281043133977e-07, | |
| "loss": 0.855162501335144, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 1.0754716981132075, | |
| "grad_norm": 4.004522306787069, | |
| "learning_rate": 7.826960956199794e-07, | |
| "loss": 0.8469276428222656, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 1.0943396226415094, | |
| "grad_norm": 2.789521379215554, | |
| "learning_rate": 7.739384171248434e-07, | |
| "loss": 0.8612252473831177, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 1.1132075471698113, | |
| "grad_norm": 3.0001618191920008, | |
| "learning_rate": 7.650589619688468e-07, | |
| "loss": 0.8504967093467712, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 1.1320754716981132, | |
| "grad_norm": 2.803340918384437, | |
| "learning_rate": 7.560616774274774e-07, | |
| "loss": 0.8487892150878906, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 1.150943396226415, | |
| "grad_norm": 2.7872996717171112, | |
| "learning_rate": 7.469505631561317e-07, | |
| "loss": 0.8430064916610718, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 1.169811320754717, | |
| "grad_norm": 2.767338948376076, | |
| "learning_rate": 7.377296694121058e-07, | |
| "loss": 0.834577202796936, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 1.1886792452830188, | |
| "grad_norm": 2.7744551402453883, | |
| "learning_rate": 7.284030952540936e-07, | |
| "loss": 0.8389214277267456, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 1.2075471698113207, | |
| "grad_norm": 2.94391173341089, | |
| "learning_rate": 7.189749867199898e-07, | |
| "loss": 0.8442764282226562, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 1.2264150943396226, | |
| "grad_norm": 2.9244734720758285, | |
| "learning_rate": 7.094495349838092e-07, | |
| "loss": 0.802047848701477, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 1.2452830188679245, | |
| "grad_norm": 2.997891576167027, | |
| "learning_rate": 6.998309744925411e-07, | |
| "loss": 0.8562427163124084, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 1.2641509433962264, | |
| "grad_norm": 2.7454101056544618, | |
| "learning_rate": 6.901235810837667e-07, | |
| "loss": 0.8214827179908752, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 1.2830188679245282, | |
| "grad_norm": 2.9952605769764853, | |
| "learning_rate": 6.803316700848778e-07, | |
| "loss": 0.7995479702949524, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 1.3018867924528301, | |
| "grad_norm": 2.86683247629566, | |
| "learning_rate": 6.704595943947385e-07, | |
| "loss": 0.8077808022499084, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 1.320754716981132, | |
| "grad_norm": 2.7702979738330322, | |
| "learning_rate": 6.605117425486481e-07, | |
| "loss": 0.8417398929595947, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 1.3396226415094339, | |
| "grad_norm": 2.725158428984504, | |
| "learning_rate": 6.504925367674594e-07, | |
| "loss": 0.8494030833244324, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 1.3584905660377358, | |
| "grad_norm": 2.8106277256279255, | |
| "learning_rate": 6.40406430991723e-07, | |
| "loss": 0.8620424866676331, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 1.3773584905660377, | |
| "grad_norm": 2.818628329932316, | |
| "learning_rate": 6.302579089017327e-07, | |
| "loss": 0.8398749232292175, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 1.3962264150943398, | |
| "grad_norm": 2.745904001646307, | |
| "learning_rate": 6.200514819243475e-07, | |
| "loss": 0.8420323133468628, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 1.4150943396226414, | |
| "grad_norm": 2.7850840819985416, | |
| "learning_rate": 6.097916872274814e-07, | |
| "loss": 0.8359158635139465, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 1.4339622641509435, | |
| "grad_norm": 2.793048578545994, | |
| "learning_rate": 5.994830857031499e-07, | |
| "loss": 0.8336814641952515, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 1.4528301886792452, | |
| "grad_norm": 2.8505241824701826, | |
| "learning_rate": 5.891302599399684e-07, | |
| "loss": 0.7930982112884521, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 1.4716981132075473, | |
| "grad_norm": 2.6769256052426615, | |
| "learning_rate": 5.78737812186009e-07, | |
| "loss": 0.8192281723022461, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 1.490566037735849, | |
| "grad_norm": 2.7762595596745916, | |
| "learning_rate": 5.683103623029134e-07, | |
| "loss": 0.8389377593994141, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 1.509433962264151, | |
| "grad_norm": 2.8899154085340166, | |
| "learning_rate": 5.578525457121806e-07, | |
| "loss": 0.8256187438964844, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 1.5283018867924527, | |
| "grad_norm": 2.7720983651750917, | |
| "learning_rate": 5.473690113345342e-07, | |
| "loss": 0.8473238945007324, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 1.5471698113207548, | |
| "grad_norm": 2.8065774463241495, | |
| "learning_rate": 5.368644195232895e-07, | |
| "loss": 0.8165145516395569, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 1.5660377358490565, | |
| "grad_norm": 2.9614754969968016, | |
| "learning_rate": 5.263434399926398e-07, | |
| "loss": 0.8529609441757202, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 1.5849056603773586, | |
| "grad_norm": 2.90447128441676, | |
| "learning_rate": 5.158107497417794e-07, | |
| "loss": 0.8249980211257935, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 1.6037735849056602, | |
| "grad_norm": 2.7563670691746767, | |
| "learning_rate": 5.052710309757898e-07, | |
| "loss": 0.7900608777999878, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 1.6226415094339623, | |
| "grad_norm": 2.781624786647774, | |
| "learning_rate": 4.947289690242102e-07, | |
| "loss": 0.7917711734771729, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 1.641509433962264, | |
| "grad_norm": 2.8227831992064165, | |
| "learning_rate": 4.841892502582205e-07, | |
| "loss": 0.8228881359100342, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 1.6603773584905661, | |
| "grad_norm": 3.0626612203128687, | |
| "learning_rate": 4.736565600073602e-07, | |
| "loss": 0.8176588416099548, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 1.6792452830188678, | |
| "grad_norm": 2.7691999193756316, | |
| "learning_rate": 4.6313558047671047e-07, | |
| "loss": 0.8315557837486267, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 1.6981132075471699, | |
| "grad_norm": 2.9603416787137276, | |
| "learning_rate": 4.5263098866546586e-07, | |
| "loss": 0.8079712390899658, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 1.7169811320754715, | |
| "grad_norm": 2.7648310195075023, | |
| "learning_rate": 4.421474542878194e-07, | |
| "loss": 0.7854694128036499, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 1.7358490566037736, | |
| "grad_norm": 2.9565749840190736, | |
| "learning_rate": 4.316896376970866e-07, | |
| "loss": 0.8382487297058105, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 1.7547169811320755, | |
| "grad_norm": 2.904524931485949, | |
| "learning_rate": 4.2126218781399114e-07, | |
| "loss": 0.8337287902832031, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 1.7735849056603774, | |
| "grad_norm": 2.9419686201700794, | |
| "learning_rate": 4.1086974006003154e-07, | |
| "loss": 0.8450314402580261, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 1.7924528301886793, | |
| "grad_norm": 2.738066358519684, | |
| "learning_rate": 4.0051691429685023e-07, | |
| "loss": 0.7846765518188477, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 1.8113207547169812, | |
| "grad_norm": 2.7276079074380895, | |
| "learning_rate": 3.902083127725186e-07, | |
| "loss": 0.814504861831665, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 1.830188679245283, | |
| "grad_norm": 2.8093937971147835, | |
| "learning_rate": 3.799485180756525e-07, | |
| "loss": 0.8011671304702759, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 1.849056603773585, | |
| "grad_norm": 2.842796846086812, | |
| "learning_rate": 3.697420910982672e-07, | |
| "loss": 0.8165295124053955, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 1.8679245283018868, | |
| "grad_norm": 2.8189503982268977, | |
| "learning_rate": 3.5959356900827687e-07, | |
| "loss": 0.8199301958084106, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 1.8867924528301887, | |
| "grad_norm": 2.910644604198592, | |
| "learning_rate": 3.4950746323254063e-07, | |
| "loss": 0.8019869327545166, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 1.9056603773584906, | |
| "grad_norm": 2.863904675767849, | |
| "learning_rate": 3.394882574513519e-07, | |
| "loss": 0.8060827255249023, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 1.9245283018867925, | |
| "grad_norm": 2.8904123754351723, | |
| "learning_rate": 3.295404056052616e-07, | |
| "loss": 0.8078351020812988, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 1.9433962264150944, | |
| "grad_norm": 2.8850916542883778, | |
| "learning_rate": 3.1966832991512225e-07, | |
| "loss": 0.8068495988845825, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 1.9622641509433962, | |
| "grad_norm": 2.9528533111592865, | |
| "learning_rate": 3.0987641891623315e-07, | |
| "loss": 0.8184278011322021, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 1.9811320754716981, | |
| "grad_norm": 2.869159446180868, | |
| "learning_rate": 3.0016902550745895e-07, | |
| "loss": 0.8299746513366699, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 2.778568933671074, | |
| "learning_rate": 2.9055046501619083e-07, | |
| "loss": 0.785747766494751, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 2.018867924528302, | |
| "grad_norm": 2.9408610818195062, | |
| "learning_rate": 2.810250132800103e-07, | |
| "loss": 0.7670397758483887, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 2.0377358490566038, | |
| "grad_norm": 2.6257935800346694, | |
| "learning_rate": 2.715969047459066e-07, | |
| "loss": 0.7878092527389526, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 2.056603773584906, | |
| "grad_norm": 3.058449053263793, | |
| "learning_rate": 2.6227033058789403e-07, | |
| "loss": 0.7904379367828369, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 2.0754716981132075, | |
| "grad_norm": 2.88973427193669, | |
| "learning_rate": 2.5304943684386825e-07, | |
| "loss": 0.8011707067489624, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 2.0943396226415096, | |
| "grad_norm": 2.723021754211135, | |
| "learning_rate": 2.439383225725225e-07, | |
| "loss": 0.7658779621124268, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 2.1132075471698113, | |
| "grad_norm": 2.787460559434829, | |
| "learning_rate": 2.3494103803115318e-07, | |
| "loss": 0.7720337510108948, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 2.1320754716981134, | |
| "grad_norm": 2.7422069166294802, | |
| "learning_rate": 2.2606158287515658e-07, | |
| "loss": 0.7842212915420532, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 2.150943396226415, | |
| "grad_norm": 3.381034950183202, | |
| "learning_rate": 2.1730390438002056e-07, | |
| "loss": 0.7690730094909668, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 2.169811320754717, | |
| "grad_norm": 2.7764924352985663, | |
| "learning_rate": 2.0867189568660236e-07, | |
| "loss": 0.7737655639648438, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 2.188679245283019, | |
| "grad_norm": 2.8245587551592264, | |
| "learning_rate": 2.0016939407046986e-07, | |
| "loss": 0.7852470278739929, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 2.207547169811321, | |
| "grad_norm": 3.429004827616326, | |
| "learning_rate": 1.9180017923607883e-07, | |
| "loss": 0.7893455624580383, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 2.2264150943396226, | |
| "grad_norm": 3.1969648790899408, | |
| "learning_rate": 1.835679716365417e-07, | |
| "loss": 0.7634609937667847, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 2.2452830188679247, | |
| "grad_norm": 2.70318214433158, | |
| "learning_rate": 1.7547643081973578e-07, | |
| "loss": 0.7859703898429871, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 2.2641509433962264, | |
| "grad_norm": 2.961996890522788, | |
| "learning_rate": 1.6752915380148768e-07, | |
| "loss": 0.7709099650382996, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 2.2830188679245285, | |
| "grad_norm": 2.8177889556978095, | |
| "learning_rate": 1.5972967346655448e-07, | |
| "loss": 0.7789061069488525, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 2.30188679245283, | |
| "grad_norm": 3.320024417308839, | |
| "learning_rate": 1.5208145699811415e-07, | |
| "loss": 0.7862054705619812, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 2.3207547169811322, | |
| "grad_norm": 2.8631784669698415, | |
| "learning_rate": 1.4458790433646263e-07, | |
| "loss": 0.7816888689994812, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 2.339622641509434, | |
| "grad_norm": 2.902161614336072, | |
| "learning_rate": 1.3725234666760427e-07, | |
| "loss": 0.7391059398651123, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 2.358490566037736, | |
| "grad_norm": 2.882470659827849, | |
| "learning_rate": 1.3007804494240476e-07, | |
| "loss": 0.7627633810043335, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 2.3773584905660377, | |
| "grad_norm": 2.8433427591245284, | |
| "learning_rate": 1.2306818842696715e-07, | |
| "loss": 0.7769066095352173, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 2.3962264150943398, | |
| "grad_norm": 2.8617729260756573, | |
| "learning_rate": 1.1622589328487503e-07, | |
| "loss": 0.7934216856956482, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 2.4150943396226414, | |
| "grad_norm": 2.8509595069990823, | |
| "learning_rate": 1.0955420119193198e-07, | |
| "loss": 0.7673547863960266, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 2.4339622641509435, | |
| "grad_norm": 2.874293982355328, | |
| "learning_rate": 1.03056077984014e-07, | |
| "loss": 0.7849991917610168, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 2.452830188679245, | |
| "grad_norm": 3.0937215388279, | |
| "learning_rate": 9.673441233863661e-08, | |
| "loss": 0.7473263740539551, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 2.4716981132075473, | |
| "grad_norm": 2.9292035796935054, | |
| "learning_rate": 9.059201449082043e-08, | |
| "loss": 0.784021258354187, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 2.490566037735849, | |
| "grad_norm": 2.810444173384006, | |
| "learning_rate": 8.463161498382949e-08, | |
| "loss": 0.7882828712463379, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 2.509433962264151, | |
| "grad_norm": 2.829313317652292, | |
| "learning_rate": 7.885586345533396e-08, | |
| "loss": 0.7572199702262878, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 2.5283018867924527, | |
| "grad_norm": 2.6656369607187567, | |
| "learning_rate": 7.326732745954e-08, | |
| "loss": 0.7826784253120422, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 2.547169811320755, | |
| "grad_norm": 2.7036355808226173, | |
| "learning_rate": 6.786849132580841e-08, | |
| "loss": 0.7726486325263977, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 2.5660377358490565, | |
| "grad_norm": 2.805033772692598, | |
| "learning_rate": 6.266175505426957e-08, | |
| "loss": 0.7736940383911133, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 2.5849056603773586, | |
| "grad_norm": 2.8181269221147396, | |
| "learning_rate": 5.7649433248927794e-08, | |
| "loss": 0.7888213396072388, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 2.6037735849056602, | |
| "grad_norm": 2.9760303324315256, | |
| "learning_rate": 5.283375408872537e-08, | |
| "loss": 0.7611340284347534, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 2.6226415094339623, | |
| "grad_norm": 2.828152013200315, | |
| "learning_rate": 4.821685833702849e-08, | |
| "loss": 0.779454231262207, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 2.641509433962264, | |
| "grad_norm": 2.8581322420761786, | |
| "learning_rate": 4.3800798389970863e-08, | |
| "loss": 0.769560694694519, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 2.660377358490566, | |
| "grad_norm": 2.8125888801619103, | |
| "learning_rate": 3.958753736408105e-08, | |
| "loss": 0.7890896797180176, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 2.6792452830188678, | |
| "grad_norm": 2.757727954638762, | |
| "learning_rate": 3.557894822359864e-08, | |
| "loss": 0.7476776838302612, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 2.69811320754717, | |
| "grad_norm": 2.802525331124496, | |
| "learning_rate": 3.1776812947865384e-08, | |
| "loss": 0.7551087737083435, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 2.7169811320754715, | |
| "grad_norm": 3.172109709327269, | |
| "learning_rate": 2.818282173916453e-08, | |
| "loss": 0.7675119638442993, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 2.7358490566037736, | |
| "grad_norm": 2.836017838014085, | |
| "learning_rate": 2.4798572271356843e-08, | |
| "loss": 0.7670686841011047, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 2.7547169811320753, | |
| "grad_norm": 2.9198667506437905, | |
| "learning_rate": 2.162556897965101e-08, | |
| "loss": 0.7993500828742981, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 2.7735849056603774, | |
| "grad_norm": 2.795471164301072, | |
| "learning_rate": 1.8665222391821166e-08, | |
| "loss": 0.7754116654396057, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 2.7924528301886795, | |
| "grad_norm": 2.7725526525432787, | |
| "learning_rate": 1.5918848501170644e-08, | |
| "loss": 0.7710179090499878, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 2.811320754716981, | |
| "grad_norm": 2.784214561225124, | |
| "learning_rate": 1.3387668181519818e-08, | |
| "loss": 0.7384580969810486, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 2.830188679245283, | |
| "grad_norm": 2.8847249743481833, | |
| "learning_rate": 1.1072806644478738e-08, | |
| "loss": 0.7740883827209473, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 2.849056603773585, | |
| "grad_norm": 2.8315645307075945, | |
| "learning_rate": 8.975292939244927e-09, | |
| "loss": 0.7919697165489197, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 2.867924528301887, | |
| "grad_norm": 2.9085892225722034, | |
| "learning_rate": 7.096059495149853e-09, | |
| "loss": 0.781722903251648, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 2.8867924528301887, | |
| "grad_norm": 2.7506543384708224, | |
| "learning_rate": 5.435941707156388e-09, | |
| "loss": 0.7471998929977417, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 2.9056603773584904, | |
| "grad_norm": 2.8426972222396136, | |
| "learning_rate": 3.995677564492039e-09, | |
| "loss": 0.7751771807670593, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 2.9245283018867925, | |
| "grad_norm": 2.844363880881091, | |
| "learning_rate": 2.7759073225832597e-09, | |
| "loss": 0.7668254375457764, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 2.9433962264150946, | |
| "grad_norm": 3.278094344932399, | |
| "learning_rate": 1.7771732184357901e-09, | |
| "loss": 0.7961957454681396, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 2.9622641509433962, | |
| "grad_norm": 2.9897635623753955, | |
| "learning_rate": 9.999192295886971e-10, | |
| "loss": 0.7848834991455078, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 2.981132075471698, | |
| "grad_norm": 2.748244107712091, | |
| "learning_rate": 4.4449087674847117e-10, | |
| "loss": 0.777495801448822, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 2.9554977361208974, | |
| "learning_rate": 1.1113507019094858e-10, | |
| "loss": 0.7618961334228516, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "step": 159, | |
| "total_flos": 23335512768512.0, | |
| "train_loss": 0.8809327138294963, | |
| "train_runtime": 1440.1859, | |
| "train_samples_per_second": 3.485, | |
| "train_steps_per_second": 0.11 | |
| } | |
| ], | |
| "logging_steps": 1.0, | |
| "max_steps": 159, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 999999, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 23335512768512.0, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |