diff --git a/.huggingface/.gitignore b/.huggingface/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..f59ec20aabf5842d237244ece8c81ab184faeac1 --- /dev/null +++ b/.huggingface/.gitignore @@ -0,0 +1 @@ +* \ No newline at end of file diff --git a/.huggingface/download/.gitattributes.lock b/.huggingface/download/.gitattributes.lock new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.huggingface/download/.gitattributes.metadata b/.huggingface/download/.gitattributes.metadata new file mode 100644 index 0000000000000000000000000000000000000000..55beb53813a29a69dd9f420fc6376033207fc924 --- /dev/null +++ b/.huggingface/download/.gitattributes.metadata @@ -0,0 +1,3 @@ +19db2245b0f4215505bf254e382fa9b91e552b82 +52373fe24473b1aa44333d318f578ae6bf04b49b +1761226115.0468304 diff --git a/.huggingface/download/README.md.lock b/.huggingface/download/README.md.lock new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.huggingface/download/README.md.metadata b/.huggingface/download/README.md.metadata new file mode 100644 index 0000000000000000000000000000000000000000..75413624e4ebe981e2f0479ac4bd12c66dfe0dbd --- /dev/null +++ b/.huggingface/download/README.md.metadata @@ -0,0 +1,3 @@ +19db2245b0f4215505bf254e382fa9b91e552b82 +5ca349317e49179a7f2d45da0d2f1661fb10eedc +1761226115.0058203 diff --git a/.huggingface/download/action_tokenizer.py.lock b/.huggingface/download/action_tokenizer.py.lock new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.huggingface/download/action_tokenizer.py.metadata b/.huggingface/download/action_tokenizer.py.metadata new file mode 100644 index 0000000000000000000000000000000000000000..2b43dcb7282c1a50c2d3073c4937643f6ed1b820 --- /dev/null +++ b/.huggingface/download/action_tokenizer.py.metadata @@ -0,0 +1,3 @@ +19db2245b0f4215505bf254e382fa9b91e552b82 +88140abeaf417a3a3f2e352b4dc61e1561b0d85c +1761226115.0607412 diff --git a/.huggingface/download/adapter_config.json.lock b/.huggingface/download/adapter_config.json.lock new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.huggingface/download/adapter_config.json.metadata b/.huggingface/download/adapter_config.json.metadata new file mode 100644 index 0000000000000000000000000000000000000000..b702fe230c232e7fbd1f27cb5309e510ce28a915 --- /dev/null +++ b/.huggingface/download/adapter_config.json.metadata @@ -0,0 +1,3 @@ +19db2245b0f4215505bf254e382fa9b91e552b82 +400e0801a3d6524f28c5cc76f53a586cd7f892bd +1761226115.0597107 diff --git a/.huggingface/download/adapter_model.safetensors.lock b/.huggingface/download/adapter_model.safetensors.lock new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.huggingface/download/adapter_model.safetensors.metadata b/.huggingface/download/adapter_model.safetensors.metadata new file mode 100644 index 0000000000000000000000000000000000000000..c9934bd93ff416fa651ad197e76bbe2f31229948 --- /dev/null +++ b/.huggingface/download/adapter_model.safetensors.metadata @@ -0,0 +1,3 @@ +19db2245b0f4215505bf254e382fa9b91e552b82 +321e8bf39040ecb5cc098d9c1a5d949863b8984e7284834c6ed1284cfae9330a +1761226121.0416002 diff --git a/.huggingface/download/global_step20000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt.lock 
b/.huggingface/download/global_step20000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt.lock new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.huggingface/download/global_step20000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt.metadata b/.huggingface/download/global_step20000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt.metadata new file mode 100644 index 0000000000000000000000000000000000000000..437fcdc8886b749c609016db2589afdaf4bafc28 --- /dev/null +++ b/.huggingface/download/global_step20000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt.metadata @@ -0,0 +1,3 @@ +19db2245b0f4215505bf254e382fa9b91e552b82 +ba2aede6b1df77b71c4892ad88bda9f33409895b5e59a6bdd649f141fc52771b +1761226116.8750958 diff --git a/.huggingface/download/global_step20000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt.lock b/.huggingface/download/global_step20000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt.lock new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.huggingface/download/global_step20000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt.metadata b/.huggingface/download/global_step20000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt.metadata new file mode 100644 index 0000000000000000000000000000000000000000..5e7ad469dafba17ef459d23d18138cd4268bc37a --- /dev/null +++ b/.huggingface/download/global_step20000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt.metadata @@ -0,0 +1,3 @@ +19db2245b0f4215505bf254e382fa9b91e552b82 +4c18c2d01d4ba549d29734b5e4dd0f9746384bc14b9ac06efe309101be15f637 +1761226116.6164677 diff --git a/.huggingface/download/global_step20000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt.lock b/.huggingface/download/global_step20000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt.lock new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.huggingface/download/global_step20000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt.metadata b/.huggingface/download/global_step20000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt.metadata new file mode 100644 index 0000000000000000000000000000000000000000..e4fc4633a0dd81d8f6e3bb9af71007c43f5f95ce --- /dev/null +++ b/.huggingface/download/global_step20000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt.metadata @@ -0,0 +1,3 @@ +19db2245b0f4215505bf254e382fa9b91e552b82 +37be0911da3ad4fe5511128313655e786b3214cc756103aae74384fdcc3df55d +1761226116.6099718 diff --git a/.huggingface/download/global_step20000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt.lock b/.huggingface/download/global_step20000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt.lock new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.huggingface/download/global_step20000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt.metadata b/.huggingface/download/global_step20000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt.metadata new file mode 100644 index 0000000000000000000000000000000000000000..e8515f563766c6ba48897782e664b02d3e4991c5 --- /dev/null +++ b/.huggingface/download/global_step20000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt.metadata @@ -0,0 +1,3 @@ +19db2245b0f4215505bf254e382fa9b91e552b82 +a025c115f1da140a7a883998b2c4cb3a55e7df9801aed9212e54075f91f4b3a9 +1761226117.5304518 diff --git a/.huggingface/download/global_step20000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt.lock 
b/.huggingface/download/global_step20000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt.lock new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.huggingface/download/global_step20000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt.metadata b/.huggingface/download/global_step20000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt.metadata new file mode 100644 index 0000000000000000000000000000000000000000..93b5145789cbacb8aab582b894704f780d7a7d19 --- /dev/null +++ b/.huggingface/download/global_step20000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt.metadata @@ -0,0 +1,3 @@ +19db2245b0f4215505bf254e382fa9b91e552b82 +f9cfd19e10f2d22f9c014b8f0073154c5600f949e933ad4cd4020b9002c5206f +1761226116.8223166 diff --git a/.huggingface/download/global_step20000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt.lock b/.huggingface/download/global_step20000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt.lock new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.huggingface/download/global_step20000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt.metadata b/.huggingface/download/global_step20000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt.metadata new file mode 100644 index 0000000000000000000000000000000000000000..e4ee814f0500b097907b52ed68a50e3608da0b11 --- /dev/null +++ b/.huggingface/download/global_step20000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt.metadata @@ -0,0 +1,3 @@ +19db2245b0f4215505bf254e382fa9b91e552b82 +a1ef3bbe9d768d8c27d3f67e0cb7487d7f9fd2235b0526ac43e1d6498ea236ee +1761226117.273343 diff --git a/.huggingface/download/global_step20000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt.lock b/.huggingface/download/global_step20000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt.lock new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.huggingface/download/global_step20000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt.metadata b/.huggingface/download/global_step20000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt.metadata new file mode 100644 index 0000000000000000000000000000000000000000..df0d95a2b2a90fbf4e0d61fcc11354549cbeabfd --- /dev/null +++ b/.huggingface/download/global_step20000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt.metadata @@ -0,0 +1,3 @@ +19db2245b0f4215505bf254e382fa9b91e552b82 +0052e1b05a49523bdf2c059acd156a32c843d943978d98f58e6829fc31e01f7b +1761226116.8142471 diff --git a/.huggingface/download/global_step20000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt.lock b/.huggingface/download/global_step20000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt.lock new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.huggingface/download/global_step20000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt.metadata b/.huggingface/download/global_step20000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt.metadata new file mode 100644 index 0000000000000000000000000000000000000000..71e80a869cbe96555c32cef3c0e5b7cc22531804 --- /dev/null +++ b/.huggingface/download/global_step20000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt.metadata @@ -0,0 +1,3 @@ +19db2245b0f4215505bf254e382fa9b91e552b82 +050596d6b020de84b3013a4a2c09a804a90f4c35efb24e6cfa1d6a7fde2dce31 +1761226118.6946356 diff --git a/.huggingface/download/global_step20000/mp_rank_00_model_states.pt.lock 
b/.huggingface/download/global_step20000/mp_rank_00_model_states.pt.lock new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.huggingface/download/global_step20000/mp_rank_00_model_states.pt.metadata b/.huggingface/download/global_step20000/mp_rank_00_model_states.pt.metadata new file mode 100644 index 0000000000000000000000000000000000000000..f2d584a793b531d030da489c19bc0d1f5ccbbf58 --- /dev/null +++ b/.huggingface/download/global_step20000/mp_rank_00_model_states.pt.metadata @@ -0,0 +1,3 @@ +19db2245b0f4215505bf254e382fa9b91e552b82 +72997e653930faaf3c066d4f82a1834386f75086babb84d66fe65311e1385935 +1761226122.2875574 diff --git a/.huggingface/download/latest.lock b/.huggingface/download/latest.lock new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.huggingface/download/latest.metadata b/.huggingface/download/latest.metadata new file mode 100644 index 0000000000000000000000000000000000000000..6a851a0fe5e387c05bc1a97b85ee13f57cde9662 --- /dev/null +++ b/.huggingface/download/latest.metadata @@ -0,0 +1,3 @@ +19db2245b0f4215505bf254e382fa9b91e552b82 +50908603509898f37e005b455ca2e7cad40a4bb0 +1761226117.1420465 diff --git a/.huggingface/download/preprocessor_config.json.lock b/.huggingface/download/preprocessor_config.json.lock new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.huggingface/download/preprocessor_config.json.metadata b/.huggingface/download/preprocessor_config.json.metadata new file mode 100644 index 0000000000000000000000000000000000000000..dbcde13dd01b8627f5a3fca7bd7b90f484da68dc --- /dev/null +++ b/.huggingface/download/preprocessor_config.json.metadata @@ -0,0 +1,3 @@ +19db2245b0f4215505bf254e382fa9b91e552b82 +bdfe56961f826723a5a96d8efbc71f29951c2462 +1761226117.1195855 diff --git a/.huggingface/download/processing_spatialvla.py.lock b/.huggingface/download/processing_spatialvla.py.lock new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.huggingface/download/processing_spatialvla.py.metadata b/.huggingface/download/processing_spatialvla.py.metadata new file mode 100644 index 0000000000000000000000000000000000000000..dfda84eac09eb24b372a6457a69f1edd1f050a3c --- /dev/null +++ b/.huggingface/download/processing_spatialvla.py.metadata @@ -0,0 +1,3 @@ +19db2245b0f4215505bf254e382fa9b91e552b82 +6a6fc5e5d6eac345d2d2cdb9f0e0c19501a7425d +1761226117.172361 diff --git a/.huggingface/download/processor_config.json.lock b/.huggingface/download/processor_config.json.lock new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.huggingface/download/processor_config.json.metadata b/.huggingface/download/processor_config.json.metadata new file mode 100644 index 0000000000000000000000000000000000000000..aa51afbe3107800f9126432feaa243e40ca8ab52 --- /dev/null +++ b/.huggingface/download/processor_config.json.metadata @@ -0,0 +1,3 @@ +19db2245b0f4215505bf254e382fa9b91e552b82 +a24e37c6594120e551d9a294c9ed2cf6f1a9a0eb +1761226117.4233541 diff --git a/.huggingface/download/rng_state_0.pth.lock b/.huggingface/download/rng_state_0.pth.lock new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.huggingface/download/rng_state_0.pth.metadata 
b/.huggingface/download/rng_state_0.pth.metadata new file mode 100644 index 0000000000000000000000000000000000000000..2a253197d269cc62cf156299ec083c6ea1b4f828 --- /dev/null +++ b/.huggingface/download/rng_state_0.pth.metadata @@ -0,0 +1,3 @@ +19db2245b0f4215505bf254e382fa9b91e552b82 +0f53daabe594c8b32637070472cd1c25820ccde6527abe961448850eebac6137 +1761226117.4378443 diff --git a/.huggingface/download/rng_state_1.pth.lock b/.huggingface/download/rng_state_1.pth.lock new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.huggingface/download/rng_state_1.pth.metadata b/.huggingface/download/rng_state_1.pth.metadata new file mode 100644 index 0000000000000000000000000000000000000000..e6fa38ea344927fbc8cff8fc40f3f6c208182b56 --- /dev/null +++ b/.huggingface/download/rng_state_1.pth.metadata @@ -0,0 +1,3 @@ +19db2245b0f4215505bf254e382fa9b91e552b82 +2df1c88932d17c7a1aa846ee90a7eba5ecd5c61717310ec4ff7d4d63d39755a9 +1761226117.4504387 diff --git a/.huggingface/download/rng_state_2.pth.lock b/.huggingface/download/rng_state_2.pth.lock new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.huggingface/download/rng_state_2.pth.metadata b/.huggingface/download/rng_state_2.pth.metadata new file mode 100644 index 0000000000000000000000000000000000000000..556616b2a4b8b17b6fb7e46a8768432248501fdc --- /dev/null +++ b/.huggingface/download/rng_state_2.pth.metadata @@ -0,0 +1,3 @@ +19db2245b0f4215505bf254e382fa9b91e552b82 +d6d56d63613d02d9abc855988cac3009ddba2d488f312885fdcdac70798145e4 +1761226117.5601354 diff --git a/.huggingface/download/rng_state_3.pth.lock b/.huggingface/download/rng_state_3.pth.lock new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.huggingface/download/rng_state_3.pth.metadata b/.huggingface/download/rng_state_3.pth.metadata new file mode 100644 index 0000000000000000000000000000000000000000..62bd61f6624ad0dbf5a254d97b3d234ba1efe40a --- /dev/null +++ b/.huggingface/download/rng_state_3.pth.metadata @@ -0,0 +1,3 @@ +19db2245b0f4215505bf254e382fa9b91e552b82 +25a53d531491a2d7bd9f21491b781454462391cec88b48c156683c9854a01eff +1761226117.6945634 diff --git a/.huggingface/download/rng_state_4.pth.lock b/.huggingface/download/rng_state_4.pth.lock new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.huggingface/download/rng_state_4.pth.metadata b/.huggingface/download/rng_state_4.pth.metadata new file mode 100644 index 0000000000000000000000000000000000000000..0468b8e1e86631f4ddbab285202d55fb610cf33a --- /dev/null +++ b/.huggingface/download/rng_state_4.pth.metadata @@ -0,0 +1,3 @@ +19db2245b0f4215505bf254e382fa9b91e552b82 +9eaa5b76e4ba5e23b79468dbbd7ea5cdad23037532b651f51ccd4a4b5a96fc9f +1761226117.701255 diff --git a/.huggingface/download/rng_state_5.pth.lock b/.huggingface/download/rng_state_5.pth.lock new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.huggingface/download/rng_state_5.pth.metadata b/.huggingface/download/rng_state_5.pth.metadata new file mode 100644 index 0000000000000000000000000000000000000000..d7ca9277c335316ada44d224ea589e97a6b4d0e4 --- /dev/null +++ b/.huggingface/download/rng_state_5.pth.metadata @@ -0,0 +1,3 @@ +19db2245b0f4215505bf254e382fa9b91e552b82 +1d2c95071adce2d21e49f781fe594986c0f9cd732f90345b72bc280cd22bd651 
+1761226117.737589 diff --git a/.huggingface/download/rng_state_6.pth.lock b/.huggingface/download/rng_state_6.pth.lock new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.huggingface/download/rng_state_6.pth.metadata b/.huggingface/download/rng_state_6.pth.metadata new file mode 100644 index 0000000000000000000000000000000000000000..f61a682e03126f06b0218c87cb234901f4cc8bf9 --- /dev/null +++ b/.huggingface/download/rng_state_6.pth.metadata @@ -0,0 +1,3 @@ +19db2245b0f4215505bf254e382fa9b91e552b82 +be4c045403f43fce73abae229b7b9ca51e4953c5c028684c7a1445e99294532f +1761226117.8160088 diff --git a/.huggingface/download/rng_state_7.pth.lock b/.huggingface/download/rng_state_7.pth.lock new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.huggingface/download/rng_state_7.pth.metadata b/.huggingface/download/rng_state_7.pth.metadata new file mode 100644 index 0000000000000000000000000000000000000000..bf3e0c74a88c9d9c448673e65e33ab34e27c2c2a --- /dev/null +++ b/.huggingface/download/rng_state_7.pth.metadata @@ -0,0 +1,3 @@ +19db2245b0f4215505bf254e382fa9b91e552b82 +e746515df1af11066084c97a0b2d90c0978b135c67129f890fb8f9062eb02089 +1761226117.8445039 diff --git a/.huggingface/download/special_tokens_map.json.lock b/.huggingface/download/special_tokens_map.json.lock new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.huggingface/download/special_tokens_map.json.metadata b/.huggingface/download/special_tokens_map.json.metadata new file mode 100644 index 0000000000000000000000000000000000000000..5a7ed825091bf312431e8e0b6bbbb0d3abae1bb6 --- /dev/null +++ b/.huggingface/download/special_tokens_map.json.metadata @@ -0,0 +1,3 @@ +19db2245b0f4215505bf254e382fa9b91e552b82 +f48cea75e45286da1529fb88a24e851bc1a3c882 +1761226117.9869068 diff --git a/.huggingface/download/tokenizer.json.lock b/.huggingface/download/tokenizer.json.lock new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.huggingface/download/tokenizer.json.metadata b/.huggingface/download/tokenizer.json.metadata new file mode 100644 index 0000000000000000000000000000000000000000..166779d268f4eb3f6d037006ba39cdcd27b0b74f --- /dev/null +++ b/.huggingface/download/tokenizer.json.metadata @@ -0,0 +1,3 @@ +19db2245b0f4215505bf254e382fa9b91e552b82 +2523a63c898ebf0a32c7282a2e459ef2c950a846c5f3172305089e4149b6b6c3 +1761226118.589334 diff --git a/.huggingface/download/tokenizer_config.json.lock b/.huggingface/download/tokenizer_config.json.lock new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.huggingface/download/tokenizer_config.json.metadata b/.huggingface/download/tokenizer_config.json.metadata new file mode 100644 index 0000000000000000000000000000000000000000..40eb0ee69313cd8442aa2cd825bbae5d1d938ed6 --- /dev/null +++ b/.huggingface/download/tokenizer_config.json.metadata @@ -0,0 +1,3 @@ +19db2245b0f4215505bf254e382fa9b91e552b82 +333b43dab08444106e5cebd5b4fe47fd60b7cdfb +1761226118.0585616 diff --git a/.huggingface/download/trainer_state.json.lock b/.huggingface/download/trainer_state.json.lock new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.huggingface/download/trainer_state.json.metadata 
b/.huggingface/download/trainer_state.json.metadata new file mode 100644 index 0000000000000000000000000000000000000000..ad6f2fee46ad09e3ee31c99a60a4c8efaed77e66 --- /dev/null +++ b/.huggingface/download/trainer_state.json.metadata @@ -0,0 +1,3 @@ +19db2245b0f4215505bf254e382fa9b91e552b82 +79e2436feabf2bd2578dc0110b3ae7de93bb5e5a +1761226118.2284403 diff --git a/.huggingface/download/training_args.bin.lock b/.huggingface/download/training_args.bin.lock new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.huggingface/download/training_args.bin.metadata b/.huggingface/download/training_args.bin.metadata new file mode 100644 index 0000000000000000000000000000000000000000..c44d2dcda4bb5d2d31971befc4734983226c578a --- /dev/null +++ b/.huggingface/download/training_args.bin.metadata @@ -0,0 +1,3 @@ +19db2245b0f4215505bf254e382fa9b91e552b82 +35012f067ff71727beaf7acafc3fd20a1696ffc176b4ae4d484dea60668c5017 +1761226118.1031542 diff --git a/.huggingface/download/zero_to_fp32.py.lock b/.huggingface/download/zero_to_fp32.py.lock new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.huggingface/download/zero_to_fp32.py.metadata b/.huggingface/download/zero_to_fp32.py.metadata new file mode 100644 index 0000000000000000000000000000000000000000..6358721c08bbc0026fc8a37055a6d8b71ff5affe --- /dev/null +++ b/.huggingface/download/zero_to_fp32.py.metadata @@ -0,0 +1,3 @@ +19db2245b0f4215505bf254e382fa9b91e552b82 +e69ecd9acb5a235ffbf927091051106d902b3d39 +1761226118.2845747 diff --git a/cache/hub/models--szang18--spatialvla-pickcube-20k/refs/main b/cache/hub/models--szang18--spatialvla-pickcube-20k/refs/main new file mode 100644 index 0000000000000000000000000000000000000000..df143c47e9e12b91e13cf8ec39c03bce70081955 --- /dev/null +++ b/cache/hub/models--szang18--spatialvla-pickcube-20k/refs/main @@ -0,0 +1 @@ +19db2245b0f4215505bf254e382fa9b91e552b82 \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000000000000000000000000000000000000..e98a20676884398ebbe355e73c05adab427323fc --- /dev/null +++ b/config.json @@ -0,0 +1,320 @@ +{ + "_name_or_path": "IPEC-COMMUNITY/spatialvla-4b-224-pt", + "_vocab_size": 265347, + "action_token_begin_idx": 257153, + "architectures": [ + "SpatialVLAForConditionalGeneration" + ], + "auto_map": { + "AutoConfig": "IPEC-COMMUNITY/spatialvla-4b-224-pt--configuration_spatialvla.SpatialVLAConfig", + "AutoModel": "IPEC-COMMUNITY/spatialvla-4b-224-pt--modeling_spatialvla.SpatialVLAForConditionalGeneration" + }, + "bos_token_id": 2, + "ego3d_patch_reso": 2, + "eos_token_id": 1, + "hidden_size": 2048, + "image_token_index": 257152, + "model_type": "spatialvla", + "n_freqs": 8, + "num_hidden_layers": 26, + "pad_token_id": 0, + "projection_dim": 2304, + "spatial_token_num": 8194, + "text_config": { + "_attn_implementation_autoset": true, + "architectures": [ + "Gemma2ForCausalLM" + ], + "eos_token_id": [ + 1, + 107 + ], + "hidden_act": "gelu_pytorch_tanh", + "hidden_size": 2304, + "intermediate_size": 9216, + "model_type": "gemma2", + "num_hidden_layers": 26, + "num_image_tokens": 256, + "num_key_value_heads": 4, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "vocab_size": 265347 + }, + "torch_dtype": "bfloat16", + "transformers_version": "4.47.0", + "use_spatial_token": true, + "use_vision_zoe": true, + "vision_config": { + "hidden_size": 1152, + "intermediate_size": 4304, + "model_type": 
"siglip_vision_model", + "num_attention_heads": 16, + "num_hidden_layers": 27, + "num_image_tokens": 256, + "num_positions": 256, + "patch_size": 14, + "projection_dim": 2304, + "torch_dtype": "bfloat16", + "vision_use_head": false + }, + "vision_zoe_config": { + "_attn_implementation_autoset": true, + "_name_or_path": "Intel/zoedepth-nyu-kitti", + "add_cross_attention": false, + "add_projection": false, + "architectures": [ + "ZoeDepthForDepthEstimation" + ], + "attractor_alpha": 1000, + "attractor_gamma": 2, + "attractor_kind": "mean", + "backbone": null, + "backbone_config": { + "_attn_implementation_autoset": false, + "_name_or_path": "", + "add_cross_attention": false, + "add_fpn": false, + "architectures": null, + "attention_probs_dropout_prob": 0.0, + "auxiliary_channels": 256, + "auxiliary_concat_input": false, + "auxiliary_loss_weight": 0.4, + "auxiliary_num_convs": 1, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": null, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "drop_path_rate": 0.1, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.0, + "hidden_size": 1024, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_size": 384, + "initializer_range": 0.02, + "intermediate_size": 4096, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "layer_norm_eps": 1e-12, + "layer_scale_init_value": 0.1, + "length_penalty": 1.0, + "max_length": 20, + "min_length": 0, + "model_type": "beit", + "no_repeat_ngram_size": 0, + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_channels": 3, + "num_hidden_layers": 24, + "num_return_sequences": 1, + "out_features": [ + "stage6", + "stage12", + "stage18", + "stage24" + ], + "out_indices": [ + 6, + 12, + 18, + 24 + ], + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "patch_size": 16, + "pool_scales": [ + 1, + 2, + 3, + 6 + ], + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "reshape_hidden_states": false, + "return_dict": true, + "return_dict_in_generate": false, + "semantic_loss_ignore_index": 255, + "sep_token_id": null, + "stage_names": [ + "stem", + "stage1", + "stage2", + "stage3", + "stage4", + "stage5", + "stage6", + "stage7", + "stage8", + "stage9", + "stage10", + "stage11", + "stage12", + "stage13", + "stage14", + "stage15", + "stage16", + "stage17", + "stage18", + "stage19", + "stage20", + "stage21", + "stage22", + "stage23", + "stage24" + ], + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": null, + "torchscript": false, + "typical_p": 1.0, + "use_absolute_position_embeddings": false, + "use_auxiliary_head": true, + "use_bfloat16": false, + "use_mask_token": false, + "use_mean_pooling": true, + "use_relative_position_bias": true, + "use_shared_relative_position_bias": false, + "vocab_size": 8192 + }, + "backbone_hidden_size": 1024, + "bad_words_ids": null, + 
"batch_norm_eps": 1e-05, + "begin_suppress_tokens": null, + "bin_centers_type": "softplus", + "bin_configurations": [ + { + "max_depth": 10.0, + "min_depth": 0.001, + "n_bins": 64, + "name": "nyu" + }, + { + "max_depth": 80.0, + "min_depth": 0.001, + "n_bins": 64, + "name": "kitti" + } + ], + "bin_embedding_dim": 128, + "bos_token_id": null, + "bottleneck_features": 256, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "fusion_hidden_size": 256, + "head_in_index": -1, + "hidden_act": "gelu", + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_temp": 50.0, + "min_length": 0, + "min_temp": 0.0212, + "model_type": "zoedepth", + "neck_hidden_sizes": [ + 256, + 512, + 1024, + 1024 + ], + "no_repeat_ngram_size": 0, + "num_attractors": [ + 16, + 8, + 4, + 1 + ], + "num_beam_groups": 1, + "num_beams": 1, + "num_patch_transformer_layers": 4, + "num_relative_features": 32, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "patch_transformer_hidden_size": 128, + "patch_transformer_intermediate_size": 1024, + "patch_transformer_num_attention_heads": 4, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "readout_type": "project", + "reassemble_factors": [ + 4, + 2, + 1, + 0.5 + ], + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "bfloat16", + "torchscript": false, + "typical_p": 1.0, + "use_batch_norm_in_fusion_residual": false, + "use_bfloat16": false, + "use_bias_in_fusion_residual": null, + "use_pretrained_backbone": false + } +} diff --git a/configuration_spatialvla.py b/configuration_spatialvla.py new file mode 100644 index 0000000000000000000000000000000000000000..7a55383947587f03c420aab1e924663f24125884 --- /dev/null +++ b/configuration_spatialvla.py @@ -0,0 +1,119 @@ +# coding=utf-8 +# Copyright 2024 Microsoft Research & University of Wisconsin-Madison and the HuggingFace Inc. team. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import warnings + +from transformers.configuration_utils import PretrainedConfig +from transformers.utils import logging +from transformers import CONFIG_MAPPING, AutoConfig + +logger = logging.get_logger(__name__) + +class SpatialVLAConfig(PretrainedConfig): + model_type = "spatialvla" + sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig, "vision_zoe_config": AutoConfig} + + def __init__( + self, + vision_config=None, + text_config=None, + ignore_index=-100, + image_token_index=256000, + vocab_size=257152, + projection_dim=2048, + hidden_size=2048, + vision_zoe_config=None, + action_token_begin_idx=None, + spatial_token_num=259, + use_spatial_token=False, + ego3d_patch_reso=4, + n_freqs=8, + use_vision_zoe=True, + **kwargs, + ): + self._ignore_index = ignore_index + self.image_token_index = image_token_index + self._vocab_size = vocab_size + self.projection_dim = projection_dim + self.hidden_size = hidden_size + self.vision_config = vision_config + self.is_encoder_decoder = False + + if isinstance(self.vision_config, dict): + vision_config["model_type"] = ( + vision_config["model_type"] if "model_type" in vision_config else "siglip_vision_model" + ) + self.vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config) + elif vision_config is None: + self.vision_config = CONFIG_MAPPING["siglip_vision_model"]( + intermediate_size=4096, + hidden_size=1152, + patch_size=14, + image_size=224, + num_hidden_layers=27, + num_attention_heads=16, + vocab_size=257152, + vision_use_head=False, + ) + + self.text_config = text_config + if isinstance(self.text_config, dict): + text_config["model_type"] = text_config["model_type"] if "model_type" in text_config else "gemma2" + self.text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) + elif text_config is None: + self.text_config = CONFIG_MAPPING["gemma2"]( + hidden_size=2048, + num_hidden_layers=18, + intermediate_size=16384, + num_attention_heads=8, + num_key_value_heads=1, + is_encoder_decoder=False, + vocab_size=vocab_size, + ) + self.text_config.num_image_tokens = (self.vision_config.image_size // self.vision_config.patch_size) ** 2 + self.vision_config.projection_dim = projection_dim + + # vision zoe config + self.vision_zoe_config = vision_zoe_config + if isinstance(self.vision_zoe_config, dict): + vision_zoe_config["model_type"] = vision_zoe_config["model_type"] if "model_type" in vision_zoe_config else "zoedepth" + self.vision_zoe_config = CONFIG_MAPPING[vision_zoe_config["model_type"]](**vision_zoe_config) + else: + pass + + # additional attributes + self.action_token_begin_idx = action_token_begin_idx + self.spatial_token_num = spatial_token_num + self.use_spatial_token = use_spatial_token + self.ego3d_patch_reso = ego3d_patch_reso + self.n_freqs = n_freqs + self.use_vision_zoe = use_vision_zoe + + super().__init__(**kwargs) + + @property + def ignore_index(self): + warnings.warn( + "The `ignore_index` attribute is deprecated and will be removed in v4.47.", + FutureWarning, + ) + return self._ignore_index + + @ignore_index.setter + def ignore_index(self, value): + self._ignore_index = value + + def to_dict(self): + output = super().to_dict() + output.pop("_ignore_index", None) + return output \ No newline at end of file diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..32bcf65c293e9a7290f043dc94fbc069fe647339 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,8 @@ +{ + "_from_model_config": true, + 
"bos_token_id": 2, + "cache_implementation": "hybrid", + "eos_token_id": 1, + "pad_token_id": 0, + "transformers_version": "4.47.0" +} diff --git a/model-00001-of-00002.safetensors b/model-00001-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5c7fd4a8b4e2f9cf5791c0573858540b8ceb72b2 --- /dev/null +++ b/model-00001-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0dcbc9042db787c285269136be463bda8401f8f77667e40097754572725aad86 +size 4969426016 diff --git a/model-00002-of-00002.safetensors b/model-00002-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a7f054a08a6fe550c1b9034f229e86680fae90cf --- /dev/null +++ b/model-00002-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90dd6cddb4bb82e0e9449402ada333adb5bca5f65f3379b4d901321a71bed735 +size 3086476734 diff --git a/model.safetensors.index.json b/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..f4f5689202723216dc6721e5724515d397da95e9 --- /dev/null +++ b/model.safetensors.index.json @@ -0,0 +1,1389 @@ +{ + "metadata": { + "total_size": 8055709462 + }, + "weight_map": { + "language_model.lm_head.weight": "model-00002-of-00002.safetensors", + "language_model.model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.0.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.0.pre_feedforward_layernorm.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.1.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.1.pre_feedforward_layernorm.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + 
"language_model.model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.10.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.10.pre_feedforward_layernorm.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.11.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.11.pre_feedforward_layernorm.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.12.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.12.pre_feedforward_layernorm.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.13.mlp.up_proj.weight": 
"model-00001-of-00002.safetensors", + "language_model.model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.13.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.13.pre_feedforward_layernorm.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.14.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.14.pre_feedforward_layernorm.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.15.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.15.pre_feedforward_layernorm.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.16.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.16.pre_feedforward_layernorm.weight": 
"model-00001-of-00002.safetensors", + "language_model.model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.17.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.17.pre_feedforward_layernorm.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.18.input_layernorm.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.18.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.18.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.18.post_feedforward_layernorm.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.18.pre_feedforward_layernorm.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.19.input_layernorm.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.19.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.19.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.19.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.19.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.19.post_feedforward_layernorm.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.19.pre_feedforward_layernorm.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.19.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.19.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.19.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + 
"language_model.model.layers.19.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.2.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.2.pre_feedforward_layernorm.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.20.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.20.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.20.post_feedforward_layernorm.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.20.pre_feedforward_layernorm.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.20.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.20.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.20.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.20.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.21.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.21.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.21.post_feedforward_layernorm.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.21.pre_feedforward_layernorm.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.22.mlp.gate_proj.weight": 
"model-00002-of-00002.safetensors", + "language_model.model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.22.post_feedforward_layernorm.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.22.pre_feedforward_layernorm.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.23.post_feedforward_layernorm.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.23.pre_feedforward_layernorm.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.24.post_feedforward_layernorm.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.24.pre_feedforward_layernorm.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.25.post_feedforward_layernorm.weight": "model-00002-of-00002.safetensors", + 
"language_model.model.layers.25.pre_feedforward_layernorm.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "language_model.model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.3.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.3.pre_feedforward_layernorm.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.4.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.4.pre_feedforward_layernorm.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.5.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.5.pre_feedforward_layernorm.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.5.self_attn.q_proj.weight": 
"model-00001-of-00002.safetensors", + "language_model.model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.6.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.6.pre_feedforward_layernorm.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.7.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.7.pre_feedforward_layernorm.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.8.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.8.pre_feedforward_layernorm.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.9.mlp.gate_proj.weight": 
"model-00001-of-00002.safetensors", + "language_model.model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.9.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.9.pre_feedforward_layernorm.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "language_model.model.norm.weight": "model-00002-of-00002.safetensors", + "multi_modal_projector.linear.bias": "model-00001-of-00002.safetensors", + "multi_modal_projector.linear.weight": "model-00001-of-00002.safetensors", + "position_embedding_3d.position_embedding_head.0.bias": "model-00002-of-00002.safetensors", + "position_embedding_3d.position_embedding_head.0.weight": "model-00002-of-00002.safetensors", + "position_embedding_3d.position_embedding_head.1.bias": "model-00002-of-00002.safetensors", + "position_embedding_3d.position_embedding_head.1.weight": "model-00002-of-00002.safetensors", + "position_embedding_3d.position_embedding_head.3.bias": "model-00002-of-00002.safetensors", + "position_embedding_3d.position_embedding_head.3.weight": "model-00002-of-00002.safetensors", + "spatial_embed_tokens.weight": "model-00002-of-00002.safetensors", + "vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.embeddings.position_embedding.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + 
"vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00001-of-00002.safetensors", + 
"vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": 
"model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": 
"model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": 
"model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00001-of-00002.safetensors", + 
"vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + 
"vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": 
"model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + 
"vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": 
"model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": 
"model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": 
"model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": 
"model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.post_layernorm.bias": "model-00001-of-00002.safetensors", + "vision_tower.vision_model.post_layernorm.weight": "model-00001-of-00002.safetensors", + "vision_zoe_model.backbone.embeddings.cls_token": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.embeddings.patch_embeddings.projection.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.embeddings.patch_embeddings.projection.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.0.attention.attention.key.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.0.attention.attention.query.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.0.attention.attention.query.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.0.attention.attention.relative_position_bias.relative_position_bias_table": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.0.attention.attention.value.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.0.attention.attention.value.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.0.attention.output.dense.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.0.attention.output.dense.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.0.intermediate.dense.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.0.intermediate.dense.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.0.lambda_1": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.0.lambda_2": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.0.layernorm_after.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.0.layernorm_after.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.0.layernorm_before.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.0.layernorm_before.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.0.output.dense.bias": "model-00002-of-00002.safetensors", 
+ "vision_zoe_model.backbone.encoder.layer.0.output.dense.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.1.attention.attention.key.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.1.attention.attention.query.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.1.attention.attention.query.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.1.attention.attention.relative_position_bias.relative_position_bias_table": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.1.attention.attention.value.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.1.attention.attention.value.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.1.attention.output.dense.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.1.attention.output.dense.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.1.intermediate.dense.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.1.intermediate.dense.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.1.lambda_1": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.1.lambda_2": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.1.layernorm_after.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.1.layernorm_after.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.1.layernorm_before.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.1.layernorm_before.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.1.output.dense.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.1.output.dense.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.10.attention.attention.key.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.10.attention.attention.query.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.10.attention.attention.query.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.10.attention.attention.relative_position_bias.relative_position_bias_table": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.10.attention.attention.value.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.10.attention.attention.value.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.10.attention.output.dense.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.10.attention.output.dense.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.10.intermediate.dense.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.10.intermediate.dense.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.10.lambda_1": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.10.lambda_2": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.10.layernorm_after.bias": "model-00002-of-00002.safetensors", + 
"vision_zoe_model.backbone.encoder.layer.10.layernorm_after.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.10.layernorm_before.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.10.layernorm_before.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.10.output.dense.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.10.output.dense.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.11.attention.attention.key.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.11.attention.attention.query.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.11.attention.attention.query.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.11.attention.attention.relative_position_bias.relative_position_bias_table": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.11.attention.attention.value.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.11.attention.attention.value.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.11.attention.output.dense.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.11.attention.output.dense.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.11.intermediate.dense.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.11.intermediate.dense.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.11.lambda_1": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.11.lambda_2": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.11.layernorm_after.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.11.layernorm_after.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.11.layernorm_before.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.11.layernorm_before.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.11.output.dense.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.11.output.dense.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.12.attention.attention.key.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.12.attention.attention.query.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.12.attention.attention.query.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.12.attention.attention.relative_position_bias.relative_position_bias_table": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.12.attention.attention.value.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.12.attention.attention.value.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.12.attention.output.dense.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.12.attention.output.dense.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.12.intermediate.dense.bias": 
"model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.12.intermediate.dense.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.12.lambda_1": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.12.lambda_2": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.12.layernorm_after.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.12.layernorm_after.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.12.layernorm_before.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.12.layernorm_before.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.12.output.dense.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.12.output.dense.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.13.attention.attention.key.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.13.attention.attention.query.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.13.attention.attention.query.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.13.attention.attention.relative_position_bias.relative_position_bias_table": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.13.attention.attention.value.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.13.attention.attention.value.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.13.attention.output.dense.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.13.attention.output.dense.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.13.intermediate.dense.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.13.intermediate.dense.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.13.lambda_1": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.13.lambda_2": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.13.layernorm_after.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.13.layernorm_after.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.13.layernorm_before.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.13.layernorm_before.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.13.output.dense.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.13.output.dense.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.14.attention.attention.key.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.14.attention.attention.query.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.14.attention.attention.query.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.14.attention.attention.relative_position_bias.relative_position_bias_table": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.14.attention.attention.value.bias": 
"model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.14.attention.attention.value.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.14.attention.output.dense.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.14.attention.output.dense.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.14.intermediate.dense.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.14.intermediate.dense.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.14.lambda_1": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.14.lambda_2": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.14.layernorm_after.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.14.layernorm_after.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.14.layernorm_before.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.14.layernorm_before.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.14.output.dense.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.14.output.dense.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.15.attention.attention.key.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.15.attention.attention.query.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.15.attention.attention.query.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.15.attention.attention.relative_position_bias.relative_position_bias_table": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.15.attention.attention.value.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.15.attention.attention.value.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.15.attention.output.dense.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.15.attention.output.dense.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.15.intermediate.dense.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.15.intermediate.dense.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.15.lambda_1": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.15.lambda_2": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.15.layernorm_after.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.15.layernorm_after.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.15.layernorm_before.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.15.layernorm_before.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.15.output.dense.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.15.output.dense.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.16.attention.attention.key.weight": "model-00002-of-00002.safetensors", + 
"vision_zoe_model.backbone.encoder.layer.16.attention.attention.query.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.16.attention.attention.query.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.16.attention.attention.relative_position_bias.relative_position_bias_table": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.16.attention.attention.value.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.16.attention.attention.value.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.16.attention.output.dense.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.16.attention.output.dense.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.16.intermediate.dense.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.16.intermediate.dense.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.16.lambda_1": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.16.lambda_2": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.16.layernorm_after.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.16.layernorm_after.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.16.layernorm_before.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.16.layernorm_before.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.16.output.dense.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.16.output.dense.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.17.attention.attention.key.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.17.attention.attention.query.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.17.attention.attention.query.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.17.attention.attention.relative_position_bias.relative_position_bias_table": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.17.attention.attention.value.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.17.attention.attention.value.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.17.attention.output.dense.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.17.attention.output.dense.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.17.intermediate.dense.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.17.intermediate.dense.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.17.lambda_1": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.17.lambda_2": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.17.layernorm_after.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.17.layernorm_after.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.17.layernorm_before.bias": 
"model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.17.layernorm_before.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.17.output.dense.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.17.output.dense.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.18.attention.attention.key.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.18.attention.attention.query.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.18.attention.attention.query.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.18.attention.attention.relative_position_bias.relative_position_bias_table": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.18.attention.attention.value.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.18.attention.attention.value.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.18.attention.output.dense.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.18.attention.output.dense.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.18.intermediate.dense.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.18.intermediate.dense.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.18.lambda_1": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.18.lambda_2": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.18.layernorm_after.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.18.layernorm_after.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.18.layernorm_before.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.18.layernorm_before.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.18.output.dense.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.18.output.dense.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.19.attention.attention.key.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.19.attention.attention.query.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.19.attention.attention.query.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.19.attention.attention.relative_position_bias.relative_position_bias_table": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.19.attention.attention.value.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.19.attention.attention.value.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.19.attention.output.dense.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.19.attention.output.dense.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.19.intermediate.dense.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.19.intermediate.dense.weight": "model-00002-of-00002.safetensors", + 
"vision_zoe_model.backbone.encoder.layer.19.lambda_1": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.19.lambda_2": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.19.layernorm_after.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.19.layernorm_after.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.19.layernorm_before.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.19.layernorm_before.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.19.output.dense.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.19.output.dense.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.2.attention.attention.key.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.2.attention.attention.query.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.2.attention.attention.query.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.2.attention.attention.relative_position_bias.relative_position_bias_table": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.2.attention.attention.value.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.2.attention.attention.value.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.2.attention.output.dense.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.2.attention.output.dense.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.2.intermediate.dense.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.2.intermediate.dense.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.2.lambda_1": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.2.lambda_2": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.2.layernorm_after.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.2.layernorm_after.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.2.layernorm_before.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.2.layernorm_before.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.2.output.dense.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.2.output.dense.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.20.attention.attention.key.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.20.attention.attention.query.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.20.attention.attention.query.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.20.attention.attention.relative_position_bias.relative_position_bias_table": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.20.attention.attention.value.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.20.attention.attention.value.weight": "model-00002-of-00002.safetensors", + 
"vision_zoe_model.backbone.encoder.layer.20.attention.output.dense.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.20.attention.output.dense.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.20.intermediate.dense.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.20.intermediate.dense.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.20.lambda_1": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.20.lambda_2": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.20.layernorm_after.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.20.layernorm_after.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.20.layernorm_before.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.20.layernorm_before.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.20.output.dense.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.20.output.dense.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.21.attention.attention.key.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.21.attention.attention.query.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.21.attention.attention.query.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.21.attention.attention.relative_position_bias.relative_position_bias_table": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.21.attention.attention.value.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.21.attention.attention.value.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.21.attention.output.dense.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.21.attention.output.dense.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.21.intermediate.dense.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.21.intermediate.dense.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.21.lambda_1": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.21.lambda_2": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.21.layernorm_after.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.21.layernorm_after.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.21.layernorm_before.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.21.layernorm_before.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.21.output.dense.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.21.output.dense.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.22.attention.attention.key.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.22.attention.attention.query.bias": "model-00002-of-00002.safetensors", + 
"vision_zoe_model.backbone.encoder.layer.22.attention.attention.query.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.22.attention.attention.relative_position_bias.relative_position_bias_table": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.22.attention.attention.value.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.22.attention.attention.value.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.22.attention.output.dense.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.22.attention.output.dense.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.22.intermediate.dense.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.22.intermediate.dense.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.22.lambda_1": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.22.lambda_2": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.22.layernorm_after.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.22.layernorm_after.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.22.layernorm_before.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.22.layernorm_before.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.22.output.dense.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.22.output.dense.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.23.attention.attention.key.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.23.attention.attention.query.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.23.attention.attention.query.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.23.attention.attention.relative_position_bias.relative_position_bias_table": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.23.attention.attention.value.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.23.attention.attention.value.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.23.attention.output.dense.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.23.attention.output.dense.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.23.intermediate.dense.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.23.intermediate.dense.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.23.lambda_1": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.23.lambda_2": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.23.layernorm_after.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.23.layernorm_after.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.23.layernorm_before.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.23.layernorm_before.weight": 
"model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.23.output.dense.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.23.output.dense.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.3.attention.attention.key.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.3.attention.attention.query.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.3.attention.attention.query.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.3.attention.attention.relative_position_bias.relative_position_bias_table": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.3.attention.attention.value.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.3.attention.attention.value.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.3.attention.output.dense.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.3.attention.output.dense.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.3.intermediate.dense.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.3.intermediate.dense.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.3.lambda_1": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.3.lambda_2": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.3.layernorm_after.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.3.layernorm_after.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.3.layernorm_before.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.3.layernorm_before.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.3.output.dense.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.3.output.dense.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.4.attention.attention.key.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.4.attention.attention.query.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.4.attention.attention.query.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.4.attention.attention.relative_position_bias.relative_position_bias_table": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.4.attention.attention.value.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.4.attention.attention.value.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.4.attention.output.dense.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.4.attention.output.dense.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.4.intermediate.dense.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.4.intermediate.dense.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.4.lambda_1": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.4.lambda_2": 
"model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.4.layernorm_after.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.4.layernorm_after.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.4.layernorm_before.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.4.layernorm_before.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.4.output.dense.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.4.output.dense.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.5.attention.attention.key.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.5.attention.attention.query.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.5.attention.attention.query.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.5.attention.attention.relative_position_bias.relative_position_bias_table": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.5.attention.attention.value.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.5.attention.attention.value.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.5.attention.output.dense.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.5.attention.output.dense.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.5.intermediate.dense.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.5.intermediate.dense.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.5.lambda_1": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.5.lambda_2": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.5.layernorm_after.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.5.layernorm_after.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.5.layernorm_before.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.5.layernorm_before.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.5.output.dense.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.5.output.dense.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.6.attention.attention.key.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.6.attention.attention.query.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.6.attention.attention.query.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.6.attention.attention.relative_position_bias.relative_position_bias_table": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.6.attention.attention.value.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.6.attention.attention.value.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.6.attention.output.dense.bias": "model-00002-of-00002.safetensors", + 
"vision_zoe_model.backbone.encoder.layer.6.attention.output.dense.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.6.intermediate.dense.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.6.intermediate.dense.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.6.lambda_1": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.6.lambda_2": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.6.layernorm_after.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.6.layernorm_after.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.6.layernorm_before.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.6.layernorm_before.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.6.output.dense.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.6.output.dense.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.7.attention.attention.key.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.7.attention.attention.query.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.7.attention.attention.query.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.7.attention.attention.relative_position_bias.relative_position_bias_table": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.7.attention.attention.value.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.7.attention.attention.value.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.7.attention.output.dense.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.7.attention.output.dense.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.7.intermediate.dense.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.7.intermediate.dense.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.7.lambda_1": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.7.lambda_2": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.7.layernorm_after.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.7.layernorm_after.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.7.layernorm_before.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.7.layernorm_before.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.7.output.dense.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.7.output.dense.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.8.attention.attention.key.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.8.attention.attention.query.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.8.attention.attention.query.weight": "model-00002-of-00002.safetensors", + 
"vision_zoe_model.backbone.encoder.layer.8.attention.attention.relative_position_bias.relative_position_bias_table": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.8.attention.attention.value.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.8.attention.attention.value.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.8.attention.output.dense.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.8.attention.output.dense.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.8.intermediate.dense.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.8.intermediate.dense.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.8.lambda_1": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.8.lambda_2": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.8.layernorm_after.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.8.layernorm_after.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.8.layernorm_before.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.8.layernorm_before.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.8.output.dense.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.8.output.dense.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.9.attention.attention.key.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.9.attention.attention.query.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.9.attention.attention.query.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.9.attention.attention.relative_position_bias.relative_position_bias_table": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.9.attention.attention.value.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.9.attention.attention.value.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.9.attention.output.dense.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.9.attention.output.dense.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.9.intermediate.dense.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.9.intermediate.dense.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.9.lambda_1": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.9.lambda_2": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.9.layernorm_after.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.9.layernorm_after.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.9.layernorm_before.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.9.layernorm_before.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.backbone.encoder.layer.9.output.dense.bias": "model-00002-of-00002.safetensors", + 
"vision_zoe_model.backbone.encoder.layer.9.output.dense.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.attractors.kitti.0.conv1.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.attractors.kitti.0.conv1.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.attractors.kitti.0.conv2.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.attractors.kitti.0.conv2.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.attractors.kitti.1.conv1.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.attractors.kitti.1.conv1.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.attractors.kitti.1.conv2.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.attractors.kitti.1.conv2.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.attractors.kitti.2.conv1.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.attractors.kitti.2.conv1.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.attractors.kitti.2.conv2.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.attractors.kitti.2.conv2.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.attractors.kitti.3.conv1.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.attractors.kitti.3.conv1.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.attractors.kitti.3.conv2.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.attractors.kitti.3.conv2.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.attractors.nyu.0.conv1.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.attractors.nyu.0.conv1.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.attractors.nyu.0.conv2.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.attractors.nyu.0.conv2.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.attractors.nyu.1.conv1.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.attractors.nyu.1.conv1.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.attractors.nyu.1.conv2.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.attractors.nyu.1.conv2.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.attractors.nyu.2.conv1.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.attractors.nyu.2.conv1.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.attractors.nyu.2.conv2.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.attractors.nyu.2.conv2.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.attractors.nyu.3.conv1.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.attractors.nyu.3.conv1.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.attractors.nyu.3.conv2.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.attractors.nyu.3.conv2.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.conditional_log_binomial.kitti.mlp.0.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.conditional_log_binomial.kitti.mlp.0.weight": "model-00002-of-00002.safetensors", + 
"vision_zoe_model.metric_head.conditional_log_binomial.kitti.mlp.2.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.conditional_log_binomial.kitti.mlp.2.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.conditional_log_binomial.nyu.mlp.0.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.conditional_log_binomial.nyu.mlp.0.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.conditional_log_binomial.nyu.mlp.2.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.conditional_log_binomial.nyu.mlp.2.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.conv2.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.conv2.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.mlp_classifier.linear1.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.mlp_classifier.linear1.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.mlp_classifier.linear2.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.mlp_classifier.linear2.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.patch_transformer.embedding_convPxP.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.patch_transformer.embedding_convPxP.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.patch_transformer.transformer_encoder.0.linear1.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.patch_transformer.transformer_encoder.0.linear1.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.patch_transformer.transformer_encoder.0.linear2.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.patch_transformer.transformer_encoder.0.linear2.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.patch_transformer.transformer_encoder.0.norm1.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.patch_transformer.transformer_encoder.0.norm1.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.patch_transformer.transformer_encoder.0.norm2.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.patch_transformer.transformer_encoder.0.norm2.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.patch_transformer.transformer_encoder.0.self_attn.key.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.patch_transformer.transformer_encoder.0.self_attn.key.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.patch_transformer.transformer_encoder.0.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.patch_transformer.transformer_encoder.0.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.patch_transformer.transformer_encoder.0.self_attn.query.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.patch_transformer.transformer_encoder.0.self_attn.query.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.patch_transformer.transformer_encoder.0.self_attn.value.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.patch_transformer.transformer_encoder.0.self_attn.value.weight": "model-00002-of-00002.safetensors", + 
"vision_zoe_model.metric_head.patch_transformer.transformer_encoder.1.linear1.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.patch_transformer.transformer_encoder.1.linear1.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.patch_transformer.transformer_encoder.1.linear2.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.patch_transformer.transformer_encoder.1.linear2.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.patch_transformer.transformer_encoder.1.norm1.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.patch_transformer.transformer_encoder.1.norm1.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.patch_transformer.transformer_encoder.1.norm2.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.patch_transformer.transformer_encoder.1.norm2.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.patch_transformer.transformer_encoder.1.self_attn.key.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.patch_transformer.transformer_encoder.1.self_attn.key.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.patch_transformer.transformer_encoder.1.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.patch_transformer.transformer_encoder.1.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.patch_transformer.transformer_encoder.1.self_attn.query.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.patch_transformer.transformer_encoder.1.self_attn.query.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.patch_transformer.transformer_encoder.1.self_attn.value.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.patch_transformer.transformer_encoder.1.self_attn.value.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.patch_transformer.transformer_encoder.2.linear1.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.patch_transformer.transformer_encoder.2.linear1.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.patch_transformer.transformer_encoder.2.linear2.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.patch_transformer.transformer_encoder.2.linear2.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.patch_transformer.transformer_encoder.2.norm1.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.patch_transformer.transformer_encoder.2.norm1.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.patch_transformer.transformer_encoder.2.norm2.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.patch_transformer.transformer_encoder.2.norm2.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.patch_transformer.transformer_encoder.2.self_attn.key.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.patch_transformer.transformer_encoder.2.self_attn.key.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.patch_transformer.transformer_encoder.2.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.patch_transformer.transformer_encoder.2.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", 
+ "vision_zoe_model.metric_head.patch_transformer.transformer_encoder.2.self_attn.query.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.patch_transformer.transformer_encoder.2.self_attn.query.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.patch_transformer.transformer_encoder.2.self_attn.value.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.patch_transformer.transformer_encoder.2.self_attn.value.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.patch_transformer.transformer_encoder.3.linear1.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.patch_transformer.transformer_encoder.3.linear1.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.patch_transformer.transformer_encoder.3.linear2.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.patch_transformer.transformer_encoder.3.linear2.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.patch_transformer.transformer_encoder.3.norm1.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.patch_transformer.transformer_encoder.3.norm1.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.patch_transformer.transformer_encoder.3.norm2.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.patch_transformer.transformer_encoder.3.norm2.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.patch_transformer.transformer_encoder.3.self_attn.key.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.patch_transformer.transformer_encoder.3.self_attn.key.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.patch_transformer.transformer_encoder.3.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.patch_transformer.transformer_encoder.3.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.patch_transformer.transformer_encoder.3.self_attn.query.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.patch_transformer.transformer_encoder.3.self_attn.query.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.patch_transformer.transformer_encoder.3.self_attn.value.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.patch_transformer.transformer_encoder.3.self_attn.value.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.projectors.0.conv1.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.projectors.0.conv1.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.projectors.0.conv2.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.projectors.0.conv2.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.projectors.1.conv1.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.projectors.1.conv1.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.projectors.1.conv2.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.projectors.1.conv2.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.projectors.2.conv1.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.projectors.2.conv1.weight": "model-00002-of-00002.safetensors", + 
"vision_zoe_model.metric_head.projectors.2.conv2.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.projectors.2.conv2.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.projectors.3.conv1.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.projectors.3.conv1.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.projectors.3.conv2.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.projectors.3.conv2.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.seed_bin_regressors.kitti.conv1.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.seed_bin_regressors.kitti.conv1.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.seed_bin_regressors.kitti.conv2.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.seed_bin_regressors.kitti.conv2.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.seed_bin_regressors.nyu.conv1.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.seed_bin_regressors.nyu.conv1.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.seed_bin_regressors.nyu.conv2.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.seed_bin_regressors.nyu.conv2.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.seed_projector.conv1.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.seed_projector.conv1.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.seed_projector.conv2.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.metric_head.seed_projector.conv2.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.neck.convs.0.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.neck.convs.1.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.neck.convs.2.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.neck.convs.3.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.neck.fusion_stage.layers.0.projection.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.neck.fusion_stage.layers.0.projection.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.neck.fusion_stage.layers.0.residual_layer1.convolution1.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.neck.fusion_stage.layers.0.residual_layer1.convolution1.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.neck.fusion_stage.layers.0.residual_layer1.convolution2.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.neck.fusion_stage.layers.0.residual_layer1.convolution2.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.neck.fusion_stage.layers.0.residual_layer2.convolution1.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.neck.fusion_stage.layers.0.residual_layer2.convolution1.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.neck.fusion_stage.layers.0.residual_layer2.convolution2.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.neck.fusion_stage.layers.0.residual_layer2.convolution2.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.neck.fusion_stage.layers.1.projection.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.neck.fusion_stage.layers.1.projection.weight": "model-00002-of-00002.safetensors", + 
"vision_zoe_model.neck.fusion_stage.layers.1.residual_layer1.convolution1.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.neck.fusion_stage.layers.1.residual_layer1.convolution1.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.neck.fusion_stage.layers.1.residual_layer1.convolution2.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.neck.fusion_stage.layers.1.residual_layer1.convolution2.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.neck.fusion_stage.layers.1.residual_layer2.convolution1.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.neck.fusion_stage.layers.1.residual_layer2.convolution1.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.neck.fusion_stage.layers.1.residual_layer2.convolution2.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.neck.fusion_stage.layers.1.residual_layer2.convolution2.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.neck.fusion_stage.layers.2.projection.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.neck.fusion_stage.layers.2.projection.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.neck.fusion_stage.layers.2.residual_layer1.convolution1.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.neck.fusion_stage.layers.2.residual_layer1.convolution1.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.neck.fusion_stage.layers.2.residual_layer1.convolution2.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.neck.fusion_stage.layers.2.residual_layer1.convolution2.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.neck.fusion_stage.layers.2.residual_layer2.convolution1.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.neck.fusion_stage.layers.2.residual_layer2.convolution1.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.neck.fusion_stage.layers.2.residual_layer2.convolution2.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.neck.fusion_stage.layers.2.residual_layer2.convolution2.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.neck.fusion_stage.layers.3.projection.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.neck.fusion_stage.layers.3.projection.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.neck.fusion_stage.layers.3.residual_layer1.convolution1.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.neck.fusion_stage.layers.3.residual_layer1.convolution1.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.neck.fusion_stage.layers.3.residual_layer1.convolution2.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.neck.fusion_stage.layers.3.residual_layer1.convolution2.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.neck.fusion_stage.layers.3.residual_layer2.convolution1.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.neck.fusion_stage.layers.3.residual_layer2.convolution1.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.neck.fusion_stage.layers.3.residual_layer2.convolution2.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.neck.fusion_stage.layers.3.residual_layer2.convolution2.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.neck.reassemble_stage.layers.0.projection.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.neck.reassemble_stage.layers.0.projection.weight": "model-00002-of-00002.safetensors", + 
"vision_zoe_model.neck.reassemble_stage.layers.0.resize.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.neck.reassemble_stage.layers.0.resize.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.neck.reassemble_stage.layers.1.projection.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.neck.reassemble_stage.layers.1.projection.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.neck.reassemble_stage.layers.1.resize.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.neck.reassemble_stage.layers.1.resize.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.neck.reassemble_stage.layers.2.projection.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.neck.reassemble_stage.layers.2.projection.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.neck.reassemble_stage.layers.3.projection.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.neck.reassemble_stage.layers.3.projection.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.neck.reassemble_stage.layers.3.resize.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.neck.reassemble_stage.layers.3.resize.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.neck.reassemble_stage.readout_projects.0.0.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.neck.reassemble_stage.readout_projects.0.0.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.neck.reassemble_stage.readout_projects.1.0.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.neck.reassemble_stage.readout_projects.1.0.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.neck.reassemble_stage.readout_projects.2.0.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.neck.reassemble_stage.readout_projects.2.0.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.neck.reassemble_stage.readout_projects.3.0.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.neck.reassemble_stage.readout_projects.3.0.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.relative_head.conv1.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.relative_head.conv1.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.relative_head.conv2.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.relative_head.conv2.weight": "model-00002-of-00002.safetensors", + "vision_zoe_model.relative_head.conv3.bias": "model-00002-of-00002.safetensors", + "vision_zoe_model.relative_head.conv3.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/modeling_gemma2.py b/modeling_gemma2.py new file mode 100644 index 0000000000000000000000000000000000000000..f069c8faebaaf1b9d76101d2496c19335bb3678b --- /dev/null +++ b/modeling_gemma2.py @@ -0,0 +1,1283 @@ +# custom gemma2 to support flash_attention_2, +# source from https://github.com/huggingface/transformers/blob/v4.47.0/src/transformers/models/gemma2/modeling_gemma2.py +# coding=utf-8 +# Copyright 2024 Google Inc. HuggingFace Inc. team. All rights reserved. +# +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn as nn + +from transformers.activations import ACT2FN +from transformers.cache_utils import Cache, HybridCache +from transformers.generation import GenerationMixin +from transformers.modeling_outputs import ( + BaseModelOutputWithPast, + CausalLMOutputWithPast, + SequenceClassifierOutputWithPast, + TokenClassifierOutput, +) +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import ( + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_flash_attn_2_available, + is_flash_attn_greater_or_equal, + is_torch_greater_or_equal, + logging, + replace_return_docstrings, + is_flash_attn_greater_or_equal_2_10, +) +from transformers import Gemma2Config + + +if is_flash_attn_2_available(): + from transformers.modeling_flash_attention_utils import _flash_attention_forward + +if is_torch_greater_or_equal("2.5"): + from torch.nn.attention.flex_attention import flex_attention + +logger = logging.get_logger(__name__) + + +_CHECKPOINT_FOR_DOC = "google/gemma2-7b" +_CONFIG_FOR_DOC = "Gemma2Config" + + +class Gemma2RMSNorm(nn.Module): + def __init__(self, dim: int, eps: float = 1e-6): + super().__init__() + self.eps = eps + self.weight = nn.Parameter(torch.zeros(dim)) + + def _norm(self, x): + return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) + + def forward(self, x): + output = self._norm(x.float()) + # Llama does x.to(float16) * w whilst Gemma2 is (x * w).to(float16) + # See https://github.com/huggingface/transformers/pull/29402 + output = output * (1.0 + self.weight.float()) + return output.type_as(x) + + def extra_repr(self): + return f"{tuple(self.weight.shape)}, eps={self.eps}" + + +class Gemma2MLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + self.act_fn = ACT2FN[config.hidden_activation] + + def forward(self, x): + return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + + +class Gemma2RotaryEmbedding(nn.Module): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + super().__init__() + + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float() / self.dim)) + self.register_buffer("inv_freq", tensor=inv_freq, persistent=False) + + @torch.no_grad() + def forward(self, x, position_ids, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + self.inv_freq.to(x.device) + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) + position_ids_expanded = position_ids[:, None, :].float() + # Force float32 since bfloat16 loses precision on long contexts + # See https://github.com/huggingface/transformers/pull/29285 + device_type = x.device.type + device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): + freqs = (inv_freq_expanded.float() @ 
position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() + sin = emb.sin() + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`, *optional*): + Deprecated and unused. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). 
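For example (illustrative): with n_rep=2, an input of shape (1, 2, 3, 4), i.e. (batch, num_key_value_heads, seqlen, head_dim), is expanded to (1, 4, 3, 4) by repeating each key/value head twice along dim=1.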
The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +def eager_attention_forward( + config: Gemma2Config, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + mask: Optional[torch.Tensor], + **_kwargs, +) -> Tuple[torch.Tensor, torch.Tensor]: + key_states = repeat_kv(key, config.num_key_value_groups) + value_states = repeat_kv(value, config.num_key_value_groups) + + attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * config.scaling + + if config.attn_logit_softcapping is not None: + attn_weights = attn_weights / config.attn_logit_softcapping + attn_weights = torch.tanh(attn_weights) + attn_weights = attn_weights * config.attn_logit_softcapping + if mask is not None: # no matter the length, we just slice it + causal_mask = mask[:, :, :, : key_states.shape[-2]] + attn_weights = attn_weights + causal_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=config.attention_dropout, training=config.training) + attn_output = torch.matmul(attn_weights, value_states) + attn_output = attn_output.transpose(1, 2).contiguous() + return attn_output, attn_weights + + +def flash_attention_forward( + config: Gemma2Config, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + mask: Optional[torch.Tensor], + target_dtype: torch.dtype = torch.float16, + **_kwargs, +) -> Tuple[torch.Tensor, None]: + # NOTE: None mask cause un defined https://github.com/huggingface/transformers/blob/c8c8dffbe45ebef0a8dba4a51024e5e5e498596b/src/transformers/models/gemma2/modeling_gemma2.py#L211 + seq_len = query.shape[2] + if mask is not None: + query = query[:, :, :seq_len] + value = value[:, :, :seq_len] + + # TODO: These transpose are quite inefficient but Flash Attention requires the layout + # [batch_size, sequence_length, num_heads, head_dim]. 
We would need to refactor rotary embedding + query_states = query.transpose(1, 2) + key_states = key.transpose(1, 2) + value_states = value.transpose(1, 2) + + dropout_rate = config.attention_dropout if config.training else 0.0 + + input_dtype = query_states.dtype + if input_dtype == torch.float32: + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + + attn_output = _flash_attention_forward( + query_states, + key_states, + value_states, + mask, + seq_len, + dropout=dropout_rate, + softmax_scale=config.scaling, + is_causal=config.is_causal, + sliding_window=config.sliding_window, + use_top_left_mask=config._flash_attn_uses_top_left_mask, + softcap=config.attn_logit_softcapping if is_flash_attn_greater_or_equal("2.6.0") else None, + ) + + return attn_output, None + + +def flex_attention_forward( + config: Gemma2Config, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + mask: Optional[torch.Tensor], + output_attentions: bool = False, + **_kwargs, +) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + def tanh_softcap(score, b, h, q_idx, kv_idx): + soft_cap = config.attn_logit_softcapping + score = soft_cap * torch.tanh(score / soft_cap) + if mask is not None: + return score + mask[b][0][q_idx][kv_idx] + return score + + attn_output = flex_attention( + query, + key, + value, + score_mod=tanh_softcap, + enable_gqa=True, + scale=config.scaling, + return_lse=output_attentions, + ) + if not output_attentions: + attn_weights = None + else: + attn_output, attn_weights = attn_output + + attn_output = attn_output.transpose(1, 2).contiguous() + return attn_output, attn_weights + + +def sdpa_attention_forward( + config: Gemma2Config, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + mask: Optional[torch.Tensor], + **_kwargs, +) -> Tuple[torch.Tensor, None]: + key = repeat_kv(key, config.num_key_value_groups) + value = repeat_kv(value, config.num_key_value_groups) + + causal_mask = mask + if mask is not None: + causal_mask = causal_mask[:, :, :, : key.shape[-2]] + + # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, + # Reference: https://github.com/pytorch/pytorch/issues/112577. + if query.device.type == "cuda" and causal_mask is not None: + query = query.contiguous() + key = key.contiguous() + value = value.contiguous() + + # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment + # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling. 
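    # Note on the dispatch below (illustrative summary, not part of the upstream source): torch SDPA's
    # built-in causal masking (is_causal=True) is only requested when no explicit mask was provided;
    # whenever a 4D additive causal_mask is passed, causality is already encoded in the mask values,
    # so is_causal stays False and the mask is forwarded through attn_mask instead.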
+ is_causal = True if causal_mask is None and query.shape[1] > 1 else False + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query, + key, + value, + attn_mask=causal_mask, + dropout_p=config.attention_dropout if config.training else 0.0, + is_causal=is_causal, + scale=config.scaling, + ) + attn_output = attn_output.transpose(1, 2).contiguous() + return attn_output, None + + +GEMMA2_ATTENTION_FUNCTION = { + "flash_attention_2": flash_attention_forward, + "flex_attention": flex_attention_forward, + "eager": eager_attention_forward, + "sdpa": sdpa_attention_forward, +} + + +class Gemma2Attention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: Gemma2Config, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + + self.attention_dropout = config.attention_dropout + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = config.head_dim + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.rope_theta = config.rope_theta + self.is_causal = True + self.scaling = config.query_pre_attn_scalar**-0.5 + self.sliding_window = config.sliding_window if not bool(layer_idx % 2) else None + self.attn_logit_softcapping = config.attn_logit_softcapping + if self.hidden_size % self.num_heads != 0: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." + ) + + self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias) + self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias) + self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias) + self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias) + self.rotary_emb = Gemma2RotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + + # NOTE: gemma2 do not include _flash_attn_uses_top_left_mask for flash attention + self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + cos, sin = self.rotary_emb(value_states, position_ids) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + if past_key_value is not None: + # sin and cos are specific to RoPE models; cache_position needed for the 
static cache + cache_kwargs = { + "sin": sin, + "cos": cos, + "sliding_window": self.sliding_window, + "cache_position": cache_position, + } + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + if output_attentions and self.config._attn_implementation in ["sdpa", "flash_attention_2"]: + logger.warning_once("Setting `attention_type` to `flex_attention` because `output_attentions=True`") + attention_type = "flex_attention" + else: + attention_type = self.config._attn_implementation + + attn_output, attn_weights = GEMMA2_ATTENTION_FUNCTION[attention_type]( + self, query_states, key_states, value_states, attention_mask, output_attentions=output_attentions + ) + + attn_output = attn_output.reshape(bsz, q_len, -1).contiguous() + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +class Gemma2FlashAttention2(Gemma2Attention): + def __init__(self, config: Gemma2Config, layer_idx: Optional[int] = None): + super().__init__(config, layer_idx) + self.config._attn_implementation = "flash_attention_2" + logger.warning_once( + "The `Gemma2FlashAttention2` class is deprecated in favor of simply modifying the `config._attn_implementation`" + "attribute of the `GemmaAttention` class! It will be removed in v4.48" + ) + + +class Gemma2SdpaAttention(Gemma2Attention): + def __init__(self, config: Gemma2Config, layer_idx: Optional[int] = None): + super().__init__(config, layer_idx) + self.config._attn_implementation = "sdpa" + logger.warning_once( + "The `Gemma2FlashAttention2` class is deprecated in favor of simply modifying the `config._attn_implementation`" + "attribute of the `GemmaAttention` class! It will be removed in v4.48" + ) + + +class Gemma2DecoderLayer(nn.Module): + def __init__(self, config: Gemma2Config, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + self.config = config + self.is_sliding = not bool(layer_idx % 2) + self.self_attn = Gemma2Attention(config=config, layer_idx=layer_idx) + self.mlp = Gemma2MLP(config) + self.input_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + self.pre_feedforward_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_feedforward_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.sliding_window = config.sliding_window + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + if self.is_sliding and attention_mask is not None: # efficient SDPA and no padding + # Flash-attn is a 2D tensor + if self.config._attn_implementation == "flash_attention_2": + if past_key_value is not None: # when decoding + attention_mask = attention_mask[:, -self.sliding_window :] + else: + min_dtype = torch.finfo(hidden_states.dtype).min + sliding_window_mask = torch.tril( + torch.ones_like(attention_mask, dtype=torch.bool), diagonal=-self.sliding_window + ) + attention_mask = torch.where(sliding_window_mask, min_dtype, attention_mask) + if attention_mask.shape[-1] <= 1: # when 
decoding + attention_mask = attention_mask[:, :, :, -self.sliding_window :] + + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + ) + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.pre_feedforward_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = self.post_feedforward_layernorm(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +GEMMA2_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`Gemma2Config`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +@add_start_docstrings( + "The bare Gemma2 Model outputting raw hidden-states without any specific head on top.", + GEMMA2_START_DOCSTRING, +) +class Gemma2PreTrainedModel(PreTrainedModel): + config_class = Gemma2Config + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["Gemma2DecoderLayer"] + _skip_keys_device_placement = ["past_key_values"] + _supports_flash_attn_2 = True + _supports_sdpa = True + _supports_cache_class = True + _supports_quantized_cache = False + _supports_static_cache = True + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + @classmethod + def _check_and_enable_sdpa(cls, config, hard_check_only: bool = False): + """ + Overloads `PreTrainedModel._check_and_enable_sdpa` so as to DISABLE torch SDPA by default on Gemma2 models. + SDPA reduces the model performance on Gemma2 because of the logits softcapping. + """ + config = super()._check_and_enable_sdpa(config, hard_check_only=hard_check_only) + + # if using the default path -> swap sdpa by eager + if not hard_check_only and config._attn_implementation == "sdpa": + config._attn_implementation = "eager" + + return config + + +GEMMA2_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. 
+ + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): + Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` + returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. + + Two formats are allowed: + - a [`~cache_utils.Cache`] instance, see our + [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache); + - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy + cache format. + + The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the + legacy cache format will be returned. + + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't + have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` + of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. 
+ return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`, + this tensor is not affected by padding. It is used to update the cache in the correct position and to infer + the complete sequence length. +""" + + +@add_start_docstrings( + "The bare Gemma2 Model outputting raw hidden-states without any specific head on top.", + GEMMA2_START_DOCSTRING, +) +class Gemma2Model(Gemma2PreTrainedModel): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Gemma2DecoderLayer`] + + Args: + config: Gemma2Config + """ + + def __init__(self, config: Gemma2Config): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = nn.ModuleList( + [Gemma2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.norm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + self.gradient_checkpointing = False + if getattr(config, "pretraining_tp", 1) != 1: + logger.warn("`pretraining_tp` is deprecated, please use `model.tensor_parallel` instead.") + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + @add_start_docstrings_to_model_forward(GEMMA2_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[HybridCache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if self.gradient_checkpointing and self.training and use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`." 
+ ) + use_cache = False + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + if use_cache and past_key_values is None and not self.training: + batch_size, seq_len, _ = inputs_embeds.shape + past_key_values = HybridCache( + self.config, + batch_size=batch_size, + max_cache_len=seq_len, + device=self.device, + dtype=inputs_embeds.dtype, + ) + + if cache_position is None: + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + cache_position = torch.arange( + past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device + ) + + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + + causal_mask = self._update_causal_mask( + attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions + ) + + # embed positions + hidden_states = inputs_embeds + + # normalized + # Gemma2 downcasts the below to float16, causing sqrt(3072)=55.4256 to become 55.5 + # See https://github.com/huggingface/transformers/pull/29402 + normalizer = torch.tensor(self.config.hidden_size**0.5, dtype=hidden_states.dtype) + hidden_states = hidden_states * normalizer + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + + for decoder_layer in self.layers[: self.config.num_hidden_layers]: + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + hidden_states, + causal_mask, + position_ids, + past_key_values, + output_attentions, + use_cache, + cache_position, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=causal_mask, + position_ids=position_ids, + past_key_value=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = past_key_values if use_cache else None + + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + @torch.no_grad() + def _update_causal_mask( + self, + attention_mask: torch.Tensor, + input_tensor: torch.Tensor, + cache_position: torch.Tensor, + past_key_values: HybridCache, + output_attentions: bool, + ): + # Flash Attention currently doesn't support static cache but Gemma2 work only with static cache. + # So we will pass in attention mask as is in any case, not only when ther's padding. Then we'll use its shape + # to cut out keys/values trailing 0 used in static cache. This workaround should be compile compatible + # as it doesn't cause dynamic control issues. 
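        # Rough shape sketch (illustrative): the flash_attention_2 path below returns the raw 2D padding mask,
        #   attention_mask: (batch_size, kv_length) with 1 = attend and 0 = padding,
        # while the remaining paths build a 4D additive mask,
        #   causal_mask: (batch_size, 1, sequence_length, target_length) holding 0.0 for visible positions
        #   and the dtype minimum for masked ones.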
+ if self.config._attn_implementation == "flash_attention_2": + return attention_mask + + dtype, device = input_tensor.dtype, input_tensor.device + sequence_length = input_tensor.shape[1] + if isinstance(past_key_values, HybridCache): + target_length = past_key_values.get_max_cache_shape() + else: + target_length = attention_mask.shape[-1] if attention_mask is not None else input_tensor.shape[1] + + # In case the provided `attention` mask is 2D, we generate a causal mask here (4D). + causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position( + attention_mask, + sequence_length=sequence_length, + target_length=target_length, + dtype=dtype, + device=device, + cache_position=cache_position, + batch_size=input_tensor.shape[0], + ) + return causal_mask + + @staticmethod + def _prepare_4d_causal_attention_mask_with_cache_position( + attention_mask: torch.Tensor, + sequence_length: int, + target_length: int, + dtype: torch.dtype, + device: torch.device, + cache_position: torch.Tensor, + batch_size: int, + **kwargs, + ): + """ + Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape + `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing. + + Args: + attention_mask (`torch.Tensor`): + A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape + `(batch_size, 1, query_length, key_value_length)`. + sequence_length (`int`): + The sequence length being processed. + target_length (`int`): + The target length: when generating with static cache, the mask should be as long as the static cache, + to account for the 0 padding, the part of the cache that is not filled yet. + dtype (`torch.dtype`): + The dtype to use for the 4D attention mask. + device (`torch.device`): + The device to plcae the 4D attention mask on. + cache_position (`torch.Tensor`): + Indices depicting the position of the input sequence tokens in the sequence. + batch_size (`torch.Tensor`): + Batch size. + """ + if attention_mask is not None and attention_mask.dim() == 4: + # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing. 
+ causal_mask = attention_mask + else: + min_dtype = torch.finfo(dtype).min + causal_mask = torch.full( + (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device + ) + if sequence_length != 1: + causal_mask = torch.triu(causal_mask, diagonal=1) + causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) + causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) + if attention_mask is not None: + causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + mask_length = attention_mask.shape[-1] + padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :] + padding_mask = padding_mask == 0 + causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( + padding_mask, min_dtype + ) + + return causal_mask + + +class Gemma2ForCausalLM(Gemma2PreTrainedModel, GenerationMixin): + _tied_weights_keys = ["lm_head.weight"] + _tp_plan = {"lm_head": "colwise_rep"} + + def __init__(self, config): + super().__init__(config) + self.model = Gemma2Model(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.model = decoder + + def get_decoder(self): + return self.model + + @add_start_docstrings_to_model_forward(GEMMA2_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[HybridCache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + num_logits_to_keep: int = 0, + **loss_kwargs, + ) -> Union[Tuple, CausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + num_logits_to_keep (`int`, *optional*): + Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all + `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that + token can save memory, which becomes pretty significant for long sequences or large vocabulary size. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, GemmaForCausalLM + + >>> model = GemmaForCausalLM.from_pretrained("google/gemma-2-9b") + >>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b") + + >>> prompt = "What is your favorite condiment?" 
+ >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "What is your favorite condiment?" + ```""" + + if self.training and self.config._attn_implementation != "eager": + logger.warning_once( + "It is strongly recommended to train Gemma2 models with the `eager` attention implementation " + f"instead of `{self.config._attn_implementation}`. Use `eager` with `AutoModelForCausalLM.from_pretrained('', attn_implementation='eager')`." + ) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + ) + + hidden_states = outputs[0] + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]) + if self.config.final_logit_softcapping is not None: + logits = logits / self.config.final_logit_softcapping + logits = torch.tanh(logits) + logits = logits * self.config.final_logit_softcapping + + loss = None + if labels is not None: + loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + attention_mask=None, + inputs_embeds=None, + cache_position=None, + position_ids=None, + use_cache=True, + num_logits_to_keep=None, + **kwargs, + ): + # Overwritten: has a special cache type, `HybridCache` + + # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens + # Exception 1: when passing input_embeds, input_ids may be missing entries + # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here + if past_key_values is not None: + if inputs_embeds is not None: # Exception 1 + input_ids = input_ids[:, -cache_position.shape[0] :] + elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2) + input_ids = input_ids[:, cache_position] + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -input_ids.shape[1] :] + # This `clone` call is needed to avoid recapturing cuda graphs with `torch.compile`'s + # `mode="reduce-overhead`, as otherwise the input `position_ids` would have various stride + # 
during the decoding. Here, simply using `.contiguous()` is not sufficient as in the + # batch size = 1 case, `position_ids` is already contiguous but with varying stride + # which retriggers a capture. + position_ids = position_ids.clone(memory_format=torch.contiguous_format) + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and cache_position[0] == 0: + model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None} + else: + # The clone here is for the same reason as for `position_ids`. + model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None} + + if ( + isinstance(past_key_values, HybridCache) + and attention_mask.ndim == 2 + and not self.config._attn_implementation == "flash_attention_2" + ): + if model_inputs["inputs_embeds"] is not None: + batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape + device = model_inputs["inputs_embeds"].device + else: + batch_size, sequence_length = model_inputs["input_ids"].shape + device = model_inputs["input_ids"].device + + attention_mask = self.model._prepare_4d_causal_attention_mask_with_cache_position( + attention_mask, + sequence_length=sequence_length, + target_length=past_key_values.get_max_cache_shape(), + dtype=self.lm_head.weight.dtype, + device=device, + cache_position=cache_position, + batch_size=batch_size, + ) + + if num_logits_to_keep is not None: + model_inputs["num_logits_to_keep"] = num_logits_to_keep + + model_inputs.update( + { + "position_ids": position_ids, + "cache_position": cache_position, + "past_key_values": past_key_values, + "use_cache": use_cache, + "attention_mask": attention_mask, + } + ) + return model_inputs + + +@add_start_docstrings( + """ + The Gemma2 Model transformer with a sequence classification head on top (linear layer). + + [`Gemma2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models + (e.g. GPT-2) do. + + Since it does classification on the last token, it requires to know the position of the last token. If a + `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If + no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the + padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in + each row of the batch). 
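    For instance (illustrative), with `pad_token_id=0` and `input_ids = [[7, 8, 0, 0]]`, position 1 holds the last non-padding token, so the logits at that position are the ones used for classification.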
+ """, + GEMMA2_START_DOCSTRING, +) +class Gemma2ForSequenceClassification(Gemma2PreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.model = Gemma2Model(config) + self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + @add_start_docstrings_to_model_forward(GEMMA2_INPUTS_DOCSTRING) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + logits = self.score(hidden_states) + + if input_ids is not None: + batch_size = input_ids.shape[0] + else: + batch_size = inputs_embeds.shape[0] + + if self.config.pad_token_id is None and batch_size != 1: + raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") + if self.config.pad_token_id is None: + sequence_lengths = -1 + else: + if input_ids is not None: + # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility + sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 + sequence_lengths = sequence_lengths % input_ids.shape[-1] + sequence_lengths = sequence_lengths.to(logits.device) + else: + sequence_lengths = -1 + + pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] + + loss = None + if labels is not None: + loss = self.loss_function(logits=logits, labels=labels, pooled_logits=pooled_logits, config=self.config) + + if not return_dict: + output = (pooled_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + +@add_start_docstrings( + """ + The Gemma2 Model transformer with a token classification head on top (a linear layer on top of the hidden-states + output) e.g. for Named-Entity-Recognition (NER) tasks. 
+ """, + GEMMA2_START_DOCSTRING, +) +class Gemma2ForTokenClassification(Gemma2PreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.model = Gemma2Model(config) + if getattr(config, "classifier_dropout", None) is not None: + classifier_dropout = config.classifier_dropout + elif getattr(config, "hidden_dropout", None) is not None: + classifier_dropout = config.hidden_dropout + else: + classifier_dropout = 0.1 + self.dropout = nn.Dropout(classifier_dropout) + self.score = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + @add_start_docstrings_to_model_forward(GEMMA2_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, TokenClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = outputs[0] + sequence_output = self.dropout(sequence_output) + logits = self.score(sequence_output) + + loss = None + if labels is not None: + loss = self.loss_function(logits, labels, self.config) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/modeling_spatialvla.py b/modeling_spatialvla.py new file mode 100644 index 0000000000000000000000000000000000000000..657983dceae8ff82db8deaaaada7887fc89c60b6 --- /dev/null +++ b/modeling_spatialvla.py @@ -0,0 +1,526 @@ +# coding=utf-8 +# Copyright 2024 the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +from dataclasses import dataclass +from typing import List, Optional, Tuple, Union + +import os +import torch +import torch.utils.checkpoint +from torch import nn +from torch.linalg import inv +import torchvision.transforms.functional as TF +import torch.nn.functional as F +from transformers.cache_utils import Cache, HybridCache, StaticCache +from transformers.generation import GenerationMixin +from transformers.modeling_utils import PreTrainedModel, PretrainedConfig +from transformers.utils import ( + ModelOutput, + logging, +) +from .configuration_spatialvla import SpatialVLAConfig +from .modeling_gemma2 import Gemma2ForCausalLM +from transformers import AutoModel, ZoeDepthForDepthEstimation + +SIGLIP_MEAN, SIGLIP_STD = (0.5, 0.5, 0.5), (0.5, 0.5, 0.5) +ZOE_MEAN, ZOE_STD = (0.5, 0.5, 0.5), (0.5, 0.5, 0.5) + +logger = logging.get_logger(__name__) + +class Ego3DPositionEmbeddingMLP(nn.Module): + """Absolute pos embedding, learned. + https://github.com/kwea123/nerf_pl/blob/52aeb387da64a9ad9a0f914ea9b049ffc598b20c/models/nerf.py#L4 + """ + + def __init__(self, in_channels=3, num_pos_feats=768, n_freqs=8, logscale=True): + super(Ego3DPositionEmbeddingMLP, self).__init__() + self.n_freqs = n_freqs + self.freq_out_channels = in_channels * (2 * n_freqs + 1) + if logscale: + freq_bands = 2 ** torch.linspace(0, n_freqs - 1, n_freqs) + else: + freq_bands = torch.linspace(1, 2 ** (n_freqs - 1), n_freqs) + + center = torch.tensor([0., 0., 2.]).repeat(in_channels // 3) + self.register_buffer("freq_bands", freq_bands, persistent=False) + self.register_buffer("center", center, persistent=False) + + self.position_embedding_head = nn.Sequential( + nn.Linear(self.freq_out_channels, num_pos_feats), + nn.LayerNorm(num_pos_feats), + nn.ReLU(), + nn.Linear(num_pos_feats, num_pos_feats), + ) + self._reset_parameters() + + def _reset_parameters(self): + """init with small weights to maintain stable training.""" + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p, gain=0.01) + + @torch.no_grad() + def frequency_encoding(self, xyz): + """ + Embeds x to (x, sin(2^k x), cos(2^k x), ...) + Different from the paper, "x" is also in the output + See https://github.com/bmild/nerf/issues/12 + x \in [-2, 2] + y \in [-2, 2] + z \in [0., 4] + Inputs: + x: (b n m) + Outputs: + out: (b n o) + """ + xyz_n = ((xyz - self.center) / 2.0).to(self.freq_bands.dtype) + xyz_feq = xyz_n.unsqueeze(-1) * self.freq_bands # (b n m 1) + sin_xyz, cos_xyz = torch.sin(xyz_feq), torch.cos(xyz_feq) # (b n m nf) + encoding = torch.cat([xyz_n.unsqueeze(-1), sin_xyz, cos_xyz], -1).reshape(*xyz.shape[:2], -1) + return encoding + + def forward(self, xyz): + """Forward pass, xyz is (B, N, 3or6), output (B, N, F).""" + freq_encoding = self.frequency_encoding(xyz) + position_embedding = self.position_embedding_head(freq_encoding) + return position_embedding + +def process_zoe(pixel_values, pad_mode="reflect", output_size=(384, 512)): + """https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/models/zoedepth/image_processing_zoedepth.py""" + # h, w = images.shape[-2:] + # pad + ph, pw = 31, 31 # int((h / 2)**0.5 * 3), int((w / 2)**0.5 * 3) # 32, 31 + images = F.pad(pixel_values, (pw, pw, ph, ph), mode=pad_mode) + # resize + size = (384, 384) # get_resize_output_image_size + images = F.interpolate(images, size=size, mode="bicubic", align_corners=True) + # zoe: padding -> resize -> nomalize. 
we follow `nomalize -> padding -> resize` from siglip + images = TF.normalize(images, mean=ZOE_MEAN, std=ZOE_STD) + return images, ph, pw + +@dataclass +class SpatialVLACausalLMOutputWithPast(ModelOutput): + loss: Optional[torch.FloatTensor] = None + logits: torch.FloatTensor = None + past_key_values: Optional[Union[List[torch.FloatTensor], Cache]] = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + image_hidden_states: Optional[torch.FloatTensor] = None + +class SpatialVLAMultiModalProjector(nn.Module): + def __init__(self, config: SpatialVLAConfig): + super().__init__() + self.linear = nn.Linear(config.vision_config.hidden_size, config.vision_config.projection_dim, bias=True) + + def forward(self, image_features): + hidden_states = self.linear(image_features) + return hidden_states + +class SpatialVLAPreTrainedModel(PreTrainedModel): + config_class = SpatialVLAConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["SpatialVLAMultiModalProjector", "ZoeDepthForDepthEstimation", "Ego3DPositionEmbeddingMLP"] + _skip_keys_device_placement = "past_key_values" + _supports_cache_class = True + _supports_quantized_cache = True + _supports_static_cache = True + _supports_cache_class = True + _supports_flash_attn_2 = True + _supports_sdpa = True + + def _init_weights(self, module): + std = ( + self.config.initializer_range + if hasattr(self.config, "initializer_range") + else self.config.text_config.initializer_range + ) + + if hasattr(module, "class_embedding"): + module.class_embedding.data.normal_(mean=0.0, std=std) + + if isinstance(module, (nn.Linear, nn.Conv2d)): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + +class SpatialVLAForConditionalGeneration(SpatialVLAPreTrainedModel, GenerationMixin): + def __init__(self, config: SpatialVLAConfig, vision_model=None, vision_zoe_model=None, projector_model=None, language_model=None): + super().__init__(config) + + self.vision_tower = vision_model or AutoModel.from_config(config=config.vision_config) + self.multi_modal_projector = projector_model or SpatialVLAMultiModalProjector(config) + self.vocab_size = config.text_config.vocab_size + if language_model is None: + language_model = Gemma2ForCausalLM(config=config.text_config) + if language_model._tied_weights_keys is not None: + self._tied_weights_keys = [f"language_model.{k}" for k in language_model._tied_weights_keys] + self.language_model = language_model + + if config.use_vision_zoe: + self.vision_zoe_model = vision_zoe_model or ZoeDepthForDepthEstimation(config.vision_zoe_config) + self.position_embedding_3d = Ego3DPositionEmbeddingMLP( + config.ego3d_patch_reso**2 * 3, num_pos_feats=config.vision_config.hidden_size, n_freqs=config.n_freqs + ) + # register buffer + patch_size, reso, image_size = config.vision_config.patch_size, config.ego3d_patch_reso, config.vision_config.image_size + y, x = torch.meshgrid(torch.arange(0, image_size, patch_size // reso), torch.arange(0, image_size, patch_size // reso), indexing="ij") # (h//sp w//sp) + y, x = y + patch_size / reso / 2, x + patch_size / reso / 2 + uv_h = torch.stack([x, y, torch.ones_like(x)], dim=0).reshape(3, -1) # (3 hw) + self.register_buffer("uv_h", uv_h, persistent=False) + + # shared spatial 
+class SpatialVLAForConditionalGeneration(SpatialVLAPreTrainedModel, GenerationMixin):
+    def __init__(self, config: SpatialVLAConfig, vision_model=None, vision_zoe_model=None, projector_model=None, language_model=None):
+        super().__init__(config)
+
+        self.vision_tower = vision_model or AutoModel.from_config(config=config.vision_config)
+        self.multi_modal_projector = projector_model or SpatialVLAMultiModalProjector(config)
+        self.vocab_size = config.text_config.vocab_size
+        if language_model is None:
+            language_model = Gemma2ForCausalLM(config=config.text_config)
+        if language_model._tied_weights_keys is not None:
+            self._tied_weights_keys = [f"language_model.{k}" for k in language_model._tied_weights_keys]
+        self.language_model = language_model
+
+        if config.use_vision_zoe:
+            self.vision_zoe_model = vision_zoe_model or ZoeDepthForDepthEstimation(config.vision_zoe_config)
+            self.position_embedding_3d = Ego3DPositionEmbeddingMLP(
+                config.ego3d_patch_reso**2 * 3, num_pos_feats=config.vision_config.hidden_size, n_freqs=config.n_freqs
+            )
+            # register buffer
+            patch_size, reso, image_size = config.vision_config.patch_size, config.ego3d_patch_reso, config.vision_config.image_size
+            y, x = torch.meshgrid(torch.arange(0, image_size, patch_size // reso), torch.arange(0, image_size, patch_size // reso), indexing="ij")  # (h//sp w//sp)
+            y, x = y + patch_size / reso / 2, x + patch_size / reso / 2
+            uv_h = torch.stack([x, y, torch.ones_like(x)], dim=0).reshape(3, -1)  # (3 hw)
+            self.register_buffer("uv_h", uv_h, persistent=False)
+
+        # shared embeddings for the spatial action tokens
+        if config.use_spatial_token:
+            self.spatial_embed_tokens = nn.Embedding(self.config.spatial_token_num, config.text_config.hidden_size)
+        else:
+            self.spatial_embed_tokens = None
+        self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1
+
+
+    def backproject_patch(self, K: torch.Tensor, depth: torch.Tensor, patch_size=14, reso=2) -> torch.Tensor:
+        """
+        Backproject depth map to 3D points in camera coordinate.
+        Args:
+            K: camera intrinsic matrix (b 3 3)
+            depth: depth map (b 1 h w)
+            patch_size: patch size for siglip
+            reso: reso^2 -> sample points in each patch
+        patch sz = 14 ......
+        ┌────────┬────────┐
+        │  ─  ─  │  ─  ─  │
+        │ points │        ├─ ─ ─
+        │  ─  ─  │  ─  ─  │
+        ├────────┼────────┤
+        │  ─  ─  │  ─  ─  │
+        │        │        │
+        │  ─  ─  │  ─  ─  │
+        └────────┴────────┘
+        reso=2───►points=4
+          │
+          │
+        """
+        b, c, h, w = depth.shape
+        hp, wp = h // patch_size, w // patch_size
+        sub_hp = sub_wp = reso
+        patch_depth = F.interpolate(depth, size=(hp * reso, wp * reso), mode="area").reshape(b, c, -1)
+        p_cam = (inv(K.float()) @ self.uv_h.float()) * patch_depth  # (b 3 3) @ (3 hw) -> (b 3 hw) * (b 1 hw) -> (b 3 hw)
+        patch_p_cam = p_cam.reshape(b, 3, hp, sub_hp, wp, sub_wp).permute(0, 2, 4, 3, 5, 1).reshape(b, hp * wp, -1)
+        return patch_p_cam
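An editorial illustration (not from the file) of the pinhole back-projection used by backproject_patch, written out for a single pixel; the intrinsics, pixel coordinate and depth below are made-up numbers:

import torch
from torch.linalg import inv

K = torch.tensor([[600.0, 0.0, 112.0],
                  [0.0, 600.0, 112.0],
                  [0.0, 0.0, 1.0]])          # camera intrinsics (fx, fy, cx, cy)
uv_h = torch.tensor([150.0, 80.0, 1.0])      # homogeneous pixel coordinate [u, v, 1]
d = torch.tensor(1.5)                        # metric depth at that pixel

xyz = (inv(K) @ uv_h) * d                    # 3D point in camera coordinates
# xyz[2] equals d, and xyz[0], xyz[1] recover (u - cx) * d / fx and (v - cy) * d / fy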
+    def get_input_embeddings(self):
+        return self.language_model.get_input_embeddings()
+
+    def set_input_embeddings(self, value):
+        self.language_model.set_input_embeddings(value)
+
+    def get_output_embeddings(self):
+        return self.language_model.get_output_embeddings()
+
+    def set_output_embeddings(self, new_embeddings):
+        self.language_model.set_output_embeddings(new_embeddings)
+
+    def set_decoder(self, decoder):
+        self.language_model.set_decoder(decoder)
+
+    def get_decoder(self):
+        return self.language_model.get_decoder()
+
+    def tie_weights(self):
+        return self.language_model.tie_weights()
+
+    def resize_token_embeddings(
+        self,
+        new_num_tokens: Optional[int] = None,
+        pad_to_multiple_of: Optional[int] = None,
+        mean_resizing: bool = True,
+    ) -> nn.Embedding:
+        model_embeds = self.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of, mean_resizing)
+        vocab_size = model_embeds.weight.shape[0]
+        self.config.text_config.vocab_size = self.vocab_size = self.config._vocab_size = vocab_size
+        self.tie_weights()
+        return model_embeds
+
+    def _update_causal_mask(
+        self,
+        attention_mask,
+        token_type_ids,
+        past_key_values,
+        cache_position,
+        input_ids=None,
+        inputs_embeds=None,
+        is_training: bool = False,
+    ):
+        if self.config.text_config._attn_implementation == "flash_attention_2":
+            if attention_mask is not None and 0.0 in attention_mask:
+                return attention_mask
+            return None
+
+        using_static_cache = isinstance(past_key_values, StaticCache)
+        min_dtype = torch.finfo(self.dtype).min
+        inputs_lead_dim = input_ids.shape[0] if input_ids is not None else inputs_embeds.shape[0]
+        sequence_length = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
+        if using_static_cache:
+            target_length = past_key_values.get_max_cache_shape()
+        elif isinstance(past_key_values, HybridCache):
+            target_length = past_key_values.get_max_cache_shape()
+        else:
+            target_length = (
+                attention_mask.shape[-1]
+                if isinstance(attention_mask, torch.Tensor)
+                else cache_position[0] + sequence_length + 1
+            )
+
+        if attention_mask is not None and attention_mask.dim() == 4:
+            return attention_mask
+
+        causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=self.dtype, device=cache_position.device)
+        if sequence_length != 1:
+            if is_training: causal_mask = torch.triu(causal_mask, diagonal=1)
+            else: causal_mask[:, :sequence_length] = 0.0
+
+        causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
+        causal_mask = causal_mask[None, None, :, :].expand(inputs_lead_dim, 1, -1, -1)
+        if attention_mask is not None:
+            causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
+            mask_length = attention_mask.shape[-1]
+            padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(causal_mask.device)
+            padding_mask = padding_mask == 0
+            causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(padding_mask, min_dtype)
+            if is_training:
+                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(token_type_ids[:, None, None, :].to(causal_mask.device) == 0, 0)
+        return causal_mask
+
+    def get_image_features(self, pixel_values: torch.FloatTensor, intrinsic: torch.FloatTensor):
+        siglip_pixel_values = TF.normalize(pixel_values, mean=SIGLIP_MEAN, std=SIGLIP_STD)
+        image_outputs = self.vision_tower(siglip_pixel_values)
+
+        # ego3d position encoding
+        if self.config.use_vision_zoe:
+            zoe_pixel_values, ph, pw = process_zoe(pixel_values, pad_mode="reflect")
+            with torch.no_grad():
+                pvh, pvw = pixel_values.shape[-2:]
+                depth = self.vision_zoe_model(pixel_values=zoe_pixel_values).predicted_depth
+                depth = F.interpolate(
+                    depth.unsqueeze(1),
+                    size=(pvh + 2 * ph, pvw + 2 * pw),
+                    mode="bicubic",
+                    align_corners=True,
+                )[..., ph:-ph, pw:-pw]
+            xyz = self.backproject_patch(
+                intrinsic, depth, patch_size=self.config.vision_config.patch_size, reso=self.config.ego3d_patch_reso
+            )  # (b, n, 3*4)
+            pos_embed_3d = self.position_embedding_3d(xyz)
+            selected_image_feature = image_outputs.last_hidden_state + pos_embed_3d
+        else:
+            selected_image_feature = image_outputs.last_hidden_state
+        image_features = self.multi_modal_projector(selected_image_feature)
+        image_features = image_features / (self.config.text_config.hidden_size**0.5)
+        return image_features
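A brief editorial note on shapes (not from the file), assuming a 224x224 input with patch_size=14 and ego3d_patch_reso=2; the exact numbers depend on the config:

# pixel_values: (B, 3, 224, 224), intrinsic: (B, 3, 3)
#   SigLIP last_hidden_state:        (B, 256, vision_hidden)   # (224 / 14)^2 = 256 patches
#   ZoeDepth depth (after crop):     (B, 1, 224, 224)
#   backproject_patch output:        (B, 256, 12)               # reso^2 = 4 points x 3 coords per patch
#   position_embedding_3d output:    (B, 256, vision_hidden)    # added to the SigLIP features
#   projector output:                (B, 256, projection_dim), then divided by sqrt(text hidden_size)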
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        pixel_values: torch.FloatTensor = None,
+        actions: Optional[torch.FloatTensor] = None,
+        intrinsic: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Union[List[torch.FloatTensor], Cache]] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        num_logits_to_keep: int = 0,
+    ) -> Union[Tuple, SpatialVLACausalLMOutputWithPast]:
+
+        output_attentions = output_attentions or self.config.output_attentions
+        output_hidden_states = output_hidden_states or self.config.output_hidden_states
+        return_dict = return_dict or self.config.use_return_dict
+
+        is_training = token_type_ids is not None and labels is not None
+
+        if inputs_embeds is None: inputs_embeds = self.get_input_embeddings()(input_ids).clone()  # clone to avoid issues with gradient checkpointing
+
+        if self.config.use_spatial_token:
+            spatial_selected = (input_ids >= self.config.action_token_begin_idx) & (input_ids < self.config.action_token_begin_idx + self.config.spatial_token_num)
+            inputs_embeds[spatial_selected] = inputs_embeds[spatial_selected] * 0.0 + self.spatial_embed_tokens(input_ids[spatial_selected] - self.config.action_token_begin_idx)
+
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device)
+
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0) + 1  # Paligemma positions are 1-indexed
+
+        # merge image features into the text embeddings
+        if pixel_values is not None:
+            image_features = self.get_image_features(pixel_values, intrinsic)
+            special_image_mask = (input_ids == self.config.image_token_index).unsqueeze(-1)
+            special_image_mask = special_image_mask.expand_as(inputs_embeds).to(inputs_embeds.device)
+            if inputs_embeds[special_image_mask].numel() != image_features.numel():
+                image_tokens_in_text = torch.sum(input_ids == self.config.image_token_index)
+                raise ValueError(
+                    f"Number of images does not match number of special image tokens in the input text. "
+                    f"Got {image_tokens_in_text} image tokens in the text but {image_features.shape[0] * image_features.shape[1]} "
+                    "tokens from image embeddings."
+                )
+            image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
+            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
+
+        # mask out pad-token-ids in labels for BC
+        if labels is not None and self.pad_token_id in labels:
+            logger.warning_once(
+                "`labels` contains `pad_token_id` which will be masked with `config.ignore_index`. "
+                "You have to mask out `pad_token_id` when preparing `labels`, this behavior will be removed in v.4.46."
+            )
+            labels = torch.where(input_ids == self.pad_token_id, self.config.ignore_index, labels)
+
+        causal_mask = self._update_causal_mask(
+            attention_mask, token_type_ids, past_key_values, cache_position, input_ids, inputs_embeds, is_training
+        )
+        outputs = self.language_model(
+            attention_mask=causal_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            cache_position=cache_position,
+            num_logits_to_keep=num_logits_to_keep,
+        )
+
+        logits = outputs.logits
+        loss = None
+        if labels is not None:
+            logits = logits.float()
+            shift_logits = logits[..., :-1, :]
+            shift_labels = labels[..., 1:]
+            if attention_mask is not None:
+                shift_attention_mask = attention_mask[:, -shift_logits.shape[1] :].to(logits.device)
+                shift_logits = shift_logits[shift_attention_mask.to(logits.device) != 0].contiguous()
+                shift_labels = shift_labels[shift_attention_mask.to(shift_labels.device) != 0].contiguous()
+            else:
+                shift_logits = shift_logits.contiguous()
+                shift_labels = shift_labels.contiguous()
+            loss_fct = nn.CrossEntropyLoss()
+
+            flat_logits = shift_logits.view(-1, self.config.text_config.vocab_size)
+            flat_labels = shift_labels.view(-1).to(shift_logits.device)
+            loss = loss_fct(flat_logits, flat_labels)
+        if not return_dict:
+            output = (logits,) + outputs[1:]
+            return (loss,) + output if loss is not None else output
+
+        return SpatialVLACausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            image_hidden_states=image_features if pixel_values is not None else None,
+        )
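An editorial sketch (not from the file) of how the dedicated spatial-token table overrides the LM embeddings at the start of forward(); the sizes below are made up for illustration:

import torch
from torch import nn

hidden, vocab, begin, num = 16, 300, 256, 32        # hypothetical sizes
embed_tokens = nn.Embedding(vocab, hidden)           # stand-in for the Gemma2 input embeddings
spatial_embed_tokens = nn.Embedding(num, hidden)     # stand-in for self.spatial_embed_tokens

input_ids = torch.randint(0, vocab, (2, 10))
inputs_embeds = embed_tokens(input_ids).clone()

# token ids in [begin, begin + num) are re-embedded from the spatial table
spatial_selected = (input_ids >= begin) & (input_ids < begin + num)
inputs_embeds[spatial_selected] = spatial_embed_tokens(input_ids[spatial_selected] - begin)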
+    # AR inference
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        past_key_values=None,
+        inputs_embeds=None,
+        cache_position=None,
+        position_ids=None,
+        pixel_values=None,
+        intrinsic=None,
+        attention_mask=None,
+        token_type_ids=None,
+        use_cache=True,
+        num_logits_to_keep=None,
+        labels=None,
+        **kwargs,
+    ):
+        model_inputs = self.language_model.prepare_inputs_for_generation(
+            input_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            cache_position=cache_position,
+            use_cache=use_cache,
+            num_logits_to_keep=num_logits_to_keep,
+            token_type_ids=token_type_ids,
+            **kwargs,
+        )
+        if model_inputs.get("position_ids") is not None:
+            model_inputs["position_ids"] += 1
+        if cache_position[0] == 0:
+            model_inputs["pixel_values"] = pixel_values
+        is_training = token_type_ids is not None and labels is not None
+        if cache_position[0] == 0 and isinstance(past_key_values, HybridCache):
+            causal_mask = self._update_causal_mask(attention_mask, token_type_ids, past_key_values, cache_position, input_ids, inputs_embeds, is_training)
+            model_inputs["attention_mask"] = causal_mask
+        model_inputs["intrinsic"] = intrinsic
+        return model_inputs
+
+    @torch.no_grad()
+    def predict_action(
+        self,
+        model_inputs,
+    ) -> torch.Tensor:
+        model_inputs = model_inputs.to(torch.bfloat16).to(self.device)
+        input_len = model_inputs["input_ids"].shape[-1]
+        generation_outputs = self.generate(**model_inputs, max_new_tokens=256, do_sample=False)
+        return generation_outputs[:, input_len:]
+
+    @classmethod
+    def from_pretrained(
+        cls,
+        pretrained_model_name_or_path: Optional[Union[str, os.PathLike]],
+        *model_args,
+        config: Optional[Union[PretrainedConfig, str, os.PathLike]] = None,
+        cache_dir: Optional[Union[str, os.PathLike]] = None,
+        ignore_mismatched_sizes: bool = False,
+        force_download: bool = False,
+        local_files_only: bool = False,
+        token: Optional[Union[str, bool]] = None,
+        revision: str = "main",
+        use_safetensors: Optional[bool] = None,
+        weights_only: bool = True,
+        **kwargs,
+    ):
+        model = super().from_pretrained(
+            pretrained_model_name_or_path,
+            *model_args,
+            config=config,
+            cache_dir=cache_dir,
+            ignore_mismatched_sizes=ignore_mismatched_sizes,
+            force_download=force_download,
+            local_files_only=local_files_only,
+            token=token,
+            revision=revision,
+            use_safetensors=use_safetensors,
+            weights_only=weights_only,
+            **kwargs,
+        )
+        if model.config.use_spatial_token:
+            model.language_model.model.embed_tokens.weight.data[-model.config.spatial_token_num:] = model.spatial_embed_tokens.weight.data
+        return model
\ No newline at end of file
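To round off the file, an editorial usage sketch (not part of the upload). It assumes the repo ships a matching SpatialVLA processor that accepts images and text; the checkpoint id is a placeholder, and decoding the generated tokens back to continuous actions is left to the repo's processor / action tokenizer:

import torch
from PIL import Image
from transformers import AutoModel, AutoProcessor

name = "IPEC-COMMUNITY/spatialvla-4b-224-pt"  # placeholder repo id
processor = AutoProcessor.from_pretrained(name, trust_remote_code=True)
model = AutoModel.from_pretrained(name, torch_dtype=torch.bfloat16, trust_remote_code=True).eval()

image = Image.open("example.png").convert("RGB")
prompt = "What action should the robot take to pick up the cup?"
inputs = processor(images=[image], text=prompt, return_tensors="pt")

# predict_action casts the inputs to bfloat16, runs greedy decoding and strips the prompt tokens
action_tokens = model.predict_action(inputs)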