Add files using upload-large-folder tool
Browse files- .claude/settings.local.json +17 -0
- ICL/.claude/settings.local.json +32 -0
- ICL/DAPO/verl-recipe/.github/workflows/pre-commit.yml +37 -0
- ICL/DAPO/verl-recipe/dapo/config/dapo_megatron_trainer.yaml +28 -0
- ICL/DAPO/verl-recipe/entropy/reward_score/entropy_math/math_normalize.py +192 -0
- ICL/DAPO/verl-recipe/fault_recover/agent_loop/fault_recover_agent_loop.py +137 -0
- ICL/DAPO/verl-recipe/spo/agent_loop/spo_agent_loop.py +155 -0
- ICL/DAPO/verl-recipe/spo/agent_loop/spo_tool_agent_loop.py +414 -0
- ICL/DAPO/verl-recipe/sppo/config/sppo_trainer.yaml +38 -0
- ICL/EVAL_GUIDE.md +47 -0
- ICL/LV/dataset_inspect.tree.txt +456 -0
- ICL/RL_DAPO/__init__.py +1 -0
- ICL/SFT_new/README.md +389 -0
- ICL/SFT_new/convert_and_eval.sh +87 -0
- ICL/SFT_new/ds_zero2.json +37 -0
- ICL/SFT_new/ds_zero3.json +28 -0
- ICL/SFT_new/eval.py +961 -0
- ICL/SFT_new/launch_wrapper.py +13 -0
- ICL/SFT_new/rebuild_and_train.sh +86 -0
- ICL/SFT_new/run_eval.sh +74 -0
- ICL/SFT_new/run_single_node.sh +49 -0
- ICL/SFT_new/submit_northjob.sh +38 -0
- ICL/SFT_new/train.py +659 -0
- ICL/build_embeddings.py +370 -0
- ICL/build_index.py +506 -0
- ICL/build_sft.py +466 -0
- ICL/dataset_inspect.tree.txt +456 -0
- ICL/eval_icl.py +524 -0
- ICL/extract_images.py +231 -0
- ICL/merge_captions.py +70 -0
- ICL/sft_model/epoch3_step1406_fp32/chat_template.json +3 -0
- ICL/sft_model/epoch3_step1406_fp32/config.json +62 -0
- ICL/sft_model/epoch3_step1406_fp32/generation_config.json +14 -0
- ICL/sft_model/epoch3_step1406_fp32/merges.txt +0 -0
- ICL/sft_model/epoch3_step1406_fp32/model.safetensors.index.json +757 -0
- ICL/sft_model/epoch3_step1406_fp32/preprocessor_config.json +21 -0
- ICL/sft_model/epoch3_step1406_fp32/tokenizer.json +0 -0
- ICL/sft_model/epoch3_step1406_fp32/tokenizer_config.json +239 -0
- ICL/sft_model/epoch3_step1406_fp32/video_preprocessor_config.json +21 -0
- ICL/sft_model/epoch3_step1406_fp32/vocab.json +0 -0
- ICL/sft_model/zero_to_fp32.py +760 -0
- RL_dataset/.gitattributes +89 -0
- RL_dataset/.msc +0 -0
- RL_dataset/.mv +1 -0
- RL_dataset/INFOSEEK_DOWNLOAD.md +337 -0
- RL_dataset/README.md +171 -0
- RL_dataset/dataset_infos.json +1 -0
- RL_dataset/download_oven_hf_mirror.sh +189 -0
- RL_dataset/download_scienceqa_hf.sh +135 -0
- download_hf.py +49 -0
.claude/settings.local.json
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"permissions": {
|
| 3 |
+
"allow": [
|
| 4 |
+
"Bash(find /workspace/xiaobin/ICL/SFT_new/output/emb_cache/ -type f -name *.json)",
|
| 5 |
+
"Bash(find /workspace/xiaobin/ICL/SFT_new/output -type f -name *.json)",
|
| 6 |
+
"Bash(find /workspace/xiaobin/ICL -type f -name *.json)",
|
| 7 |
+
"Bash(find /workspace/xiaobin/ICL/SFT_new/output/emb_cache -name *.json)",
|
| 8 |
+
"Bash(find /workspace/xiaobin -path */medlab/*vllm_thread* -o -path */medlab/*vllm*)",
|
| 9 |
+
"Bash(find /workspace/xiaobin/ICL -path */emb_cache/*.json)",
|
| 10 |
+
"Bash(python -c \"import py_compile; py_compile.compile\\(''build_sft.py'', doraise=True\\); print\\(''OK''\\)\")",
|
| 11 |
+
"Bash(python -c \"import py_compile; py_compile.compile\\(''generate_captions.py'', doraise=True\\); print\\(''OK''\\)\")",
|
| 12 |
+
"Bash(find /workspace/xiaobin -type f -name *.py)",
|
| 13 |
+
"Bash(python:*)",
|
| 14 |
+
"Bash(find /workspace -path */NorthServe/* -maxdepth 3)"
|
| 15 |
+
]
|
| 16 |
+
}
|
| 17 |
+
}
|
ICL/.claude/settings.local.json
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"permissions": {
|
| 3 |
+
"allow": [
|
| 4 |
+
"Bash(python3 -c \"import sys,json; line=sys.stdin.readline\\(\\); d=json.loads\\(line\\); print\\(list\\(d.keys\\(\\)\\)\\); [print\\(f''''{k}: {type\\(d[k]\\).__name__}, len={len\\(str\\(d[k]\\)\\)}''''\\) for k in d.keys\\(\\)]\")",
|
| 5 |
+
"Bash(python3:*)",
|
| 6 |
+
"Bash(find /workspace/xiaobin/ICL -name *sft*.jsonl -o -name output -type d)",
|
| 7 |
+
"Bash(find /workspace/xiaobin/ICL -name *.jsonl)",
|
| 8 |
+
"Bash(wc:*)",
|
| 9 |
+
"Bash(grep -r \"model_path\\\\|model-path\" /workspace/xiaobin/ICL/SFT_new/*.py)",
|
| 10 |
+
"Bash(grep -r Qwen /workspace/xiaobin/ICL/SFT_new/*.py)",
|
| 11 |
+
"Bash(grep -l embedding /workspace/xiaobin/ICL/SFT/*.py)",
|
| 12 |
+
"Bash(du -sh /workspace/xiaobin/dataset/*)",
|
| 13 |
+
"Bash(lscpu)",
|
| 14 |
+
"Bash(/workspace/miniconda3/envs/sft/bin/python -c \"import torch; print\\('torch:', torch.__version__\\); print\\('CXX11_ABI:', torch._C._GLIBCXX_USE_CXX11_ABI\\)\")",
|
| 15 |
+
"Bash(find /workspace/xiaobin/ICL -maxdepth 3 -name *eval* -o -name *inference* -o -name *test*)",
|
| 16 |
+
"Bash(ls /workspace/xiaobin/ICL/sft_model/final/*.py)",
|
| 17 |
+
"Bash(ls /workspace/xiaobin/ICL/sft_model/final/mp_rank*)",
|
| 18 |
+
"Bash(ls /workspace/xiaobin/ICL/sft_model/final/*.json)",
|
| 19 |
+
"Bash(ls /workspace/xiaobin/ICL/sft_model/final/*tag*)",
|
| 20 |
+
"Bash(pip show:*)",
|
| 21 |
+
"Bash(conda run:*)",
|
| 22 |
+
"Read(//workspace/xiaobin/dataset/sft/all/**)",
|
| 23 |
+
"Bash(find /workspace/xiaobin/ICL -type f -name *eval* -o -name *test* -o -name *infer* -o -name *benchmark* -o -name *generate* -o -name *predict*)",
|
| 24 |
+
"Bash(find /workspace/xiaobin/ICL -type f \\\\\\(-name *.jsonl -o -name *.json \\\\\\))",
|
| 25 |
+
"Bash(grep -E \"\\\\.\\(py|sh\\)$\")",
|
| 26 |
+
"Bash(find /workspace/xiaobin/ICL -type f -name *.jsonl)",
|
| 27 |
+
"Read(//workspace/xiaobin/dataset/sft/**)",
|
| 28 |
+
"Read(//workspace/xiaobin/dataset/**)",
|
| 29 |
+
"Bash(python3 -c \"import json; d=json.load\\(open\\(''/workspace/xiaobin/dataset/detail/captioning/coco/train/captions.json''\\)\\); print\\(''keys:'', list\\(d.keys\\(\\)\\)\\); items=d[''items'']; k=list\\(items.keys\\(\\)\\)[0]; print\\(k, ''->'', items[k][:100]\\)\")"
|
| 30 |
+
]
|
| 31 |
+
}
|
| 32 |
+
}
|
ICL/DAPO/verl-recipe/.github/workflows/pre-commit.yml
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# c.f. https://github.com/pre-commit/action?tab=readme-ov-file#using-this-action
|
| 2 |
+
name: pre-commit
|
| 3 |
+
|
| 4 |
+
# No need to avoid / cancel lightweight pre-commit jobs
|
| 5 |
+
on:
|
| 6 |
+
schedule:
|
| 7 |
+
- cron: "0 0 * * 0"
|
| 8 |
+
pull_request:
|
| 9 |
+
push:
|
| 10 |
+
branches:
|
| 11 |
+
- main
|
| 12 |
+
- v0.*
|
| 13 |
+
# Allow manual triggering
|
| 14 |
+
workflow_dispatch:
|
| 15 |
+
|
| 16 |
+
# Declare permissions just read content.
|
| 17 |
+
permissions:
|
| 18 |
+
contents: read
|
| 19 |
+
|
| 20 |
+
jobs:
|
| 21 |
+
pre-commit:
|
| 22 |
+
runs-on: ubuntu-latest
|
| 23 |
+
strategy:
|
| 24 |
+
matrix:
|
| 25 |
+
python-version: ["3.12"]
|
| 26 |
+
steps:
|
| 27 |
+
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
| 28 |
+
- name: Set up Python ${{ matrix.python-version }}
|
| 29 |
+
uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
|
| 30 |
+
with:
|
| 31 |
+
python-version: ${{ matrix.python-version }}
|
| 32 |
+
- name: Set ruff --output-format=github
|
| 33 |
+
run: |
|
| 34 |
+
sed -i 's/--output-format=full/--output-format=github/' .pre-commit-config.yaml
|
| 35 |
+
git add .pre-commit-config.yaml
|
| 36 |
+
# Check "--all-files" by default
|
| 37 |
+
- uses: pre-commit/action@v3.0.1
|
ICL/DAPO/verl-recipe/dapo/config/dapo_megatron_trainer.yaml
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
hydra:
|
| 2 |
+
searchpath:
|
| 3 |
+
- file://verl/trainer/config
|
| 4 |
+
|
| 5 |
+
defaults:
|
| 6 |
+
- ppo_megatron_trainer
|
| 7 |
+
- _self_
|
| 8 |
+
|
| 9 |
+
data:
|
| 10 |
+
gen_batch_size: ${data.train_batch_size}
|
| 11 |
+
|
| 12 |
+
reward_model:
|
| 13 |
+
reward_manager: dapo
|
| 14 |
+
overlong_buffer:
|
| 15 |
+
enable: False # We try to avoid forgetting to set enable
|
| 16 |
+
len: 0
|
| 17 |
+
penalty_factor: 0.0
|
| 18 |
+
log: False
|
| 19 |
+
|
| 20 |
+
algorithm:
|
| 21 |
+
filter_groups:
|
| 22 |
+
_target_: verl.trainer.config.FilterGroupsConfig
|
| 23 |
+
enable: False # We try to avoid forgetting to set enable
|
| 24 |
+
metric: null # acc / score / seq_reward / seq_final_reward / ...
|
| 25 |
+
max_num_gen_batches: 0 # Non-positive values mean no upper limit
|
| 26 |
+
|
| 27 |
+
trainer:
|
| 28 |
+
project_name: verl-dapo
|
ICL/DAPO/verl-recipe/entropy/reward_score/entropy_math/math_normalize.py
ADDED
|
@@ -0,0 +1,192 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2024 PRIME team and/or its affiliates
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
# Copyright (c) 2021 Dan Hendrycks
|
| 16 |
+
#
|
| 17 |
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 18 |
+
# of this software and associated documentation files (the "Software"), to deal
|
| 19 |
+
# in the Software without restriction, including without limitation the rights
|
| 20 |
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 21 |
+
# copies of the Software, and to permit persons to whom the Software is
|
| 22 |
+
# furnished to do so, subject to the following conditions:
|
| 23 |
+
#
|
| 24 |
+
# The above copyright notice and this permission notice shall be included in all
|
| 25 |
+
# copies or substantial portions of the Software.
|
| 26 |
+
#
|
| 27 |
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 28 |
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 29 |
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 30 |
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 31 |
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 32 |
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
| 33 |
+
# SOFTWARE.
|
| 34 |
+
"""
|
| 35 |
+
This logic is largely copied from the Hendrycks' MATH release (math_equivalence).
|
| 36 |
+
|
| 37 |
+
From: https://github.com/openai/prm800k/blob/main/prm800k/grading/math_normalize.py
|
| 38 |
+
"""
|
| 39 |
+
|
| 40 |
+
import re
|
| 41 |
+
from typing import Optional
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def normalize_answer(answer: Optional[str]) -> Optional[str]:
|
| 45 |
+
if answer is None:
|
| 46 |
+
return None
|
| 47 |
+
answer = answer.strip()
|
| 48 |
+
try:
|
| 49 |
+
# Remove enclosing `\text{}`.
|
| 50 |
+
m = re.search(r"^\\text\{(?P<text>.+?)\}$", answer)
|
| 51 |
+
if m is not None:
|
| 52 |
+
answer = m.group("text").strip()
|
| 53 |
+
return _strip_string(answer)
|
| 54 |
+
except Exception:
|
| 55 |
+
return answer
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def _fix_fracs(string):
|
| 59 |
+
substrs = string.split("\\frac")
|
| 60 |
+
new_str = substrs[0]
|
| 61 |
+
if len(substrs) > 1:
|
| 62 |
+
substrs = substrs[1:]
|
| 63 |
+
for substr in substrs:
|
| 64 |
+
new_str += "\\frac"
|
| 65 |
+
if substr[0] == "{":
|
| 66 |
+
new_str += substr
|
| 67 |
+
else:
|
| 68 |
+
try:
|
| 69 |
+
assert len(substr) >= 2
|
| 70 |
+
except Exception:
|
| 71 |
+
return string
|
| 72 |
+
a = substr[0]
|
| 73 |
+
b = substr[1]
|
| 74 |
+
if b != "{":
|
| 75 |
+
if len(substr) > 2:
|
| 76 |
+
post_substr = substr[2:]
|
| 77 |
+
new_str += "{" + a + "}{" + b + "}" + post_substr
|
| 78 |
+
else:
|
| 79 |
+
new_str += "{" + a + "}{" + b + "}"
|
| 80 |
+
else:
|
| 81 |
+
if len(substr) > 2:
|
| 82 |
+
post_substr = substr[2:]
|
| 83 |
+
new_str += "{" + a + "}" + b + post_substr
|
| 84 |
+
else:
|
| 85 |
+
new_str += "{" + a + "}" + b
|
| 86 |
+
string = new_str
|
| 87 |
+
return string
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
def _fix_a_slash_b(string):
|
| 91 |
+
if len(string.split("/")) != 2:
|
| 92 |
+
return string
|
| 93 |
+
a = string.split("/")[0]
|
| 94 |
+
b = string.split("/")[1]
|
| 95 |
+
try:
|
| 96 |
+
a = int(a)
|
| 97 |
+
b = int(b)
|
| 98 |
+
assert string == "{}/{}".format(a, b)
|
| 99 |
+
new_string = "\\frac{" + str(a) + "}{" + str(b) + "}"
|
| 100 |
+
return new_string
|
| 101 |
+
except Exception:
|
| 102 |
+
return string
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
def _remove_right_units(string):
|
| 106 |
+
# "\\text{ " only ever occurs (at least in the val set) when describing units
|
| 107 |
+
if "\\text{ " in string:
|
| 108 |
+
splits = string.split("\\text{ ")
|
| 109 |
+
assert len(splits) == 2
|
| 110 |
+
return splits[0]
|
| 111 |
+
else:
|
| 112 |
+
return string
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
def _fix_sqrt(string):
|
| 116 |
+
if "\\sqrt" not in string:
|
| 117 |
+
return string
|
| 118 |
+
splits = string.split("\\sqrt")
|
| 119 |
+
new_string = splits[0]
|
| 120 |
+
for split in splits[1:]:
|
| 121 |
+
if split[0] != "{":
|
| 122 |
+
a = split[0]
|
| 123 |
+
new_substr = "\\sqrt{" + a + "}" + split[1:]
|
| 124 |
+
else:
|
| 125 |
+
new_substr = "\\sqrt" + split
|
| 126 |
+
new_string += new_substr
|
| 127 |
+
return new_string
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
def _strip_string(string):
|
| 131 |
+
# linebreaks
|
| 132 |
+
string = string.replace("\n", "")
|
| 133 |
+
|
| 134 |
+
# remove inverse spaces
|
| 135 |
+
string = string.replace("\\!", "")
|
| 136 |
+
|
| 137 |
+
# replace \\ with \
|
| 138 |
+
string = string.replace("\\\\", "\\")
|
| 139 |
+
|
| 140 |
+
# replace tfrac and dfrac with frac
|
| 141 |
+
string = string.replace("tfrac", "frac")
|
| 142 |
+
string = string.replace("dfrac", "frac")
|
| 143 |
+
|
| 144 |
+
# remove \left and \right
|
| 145 |
+
string = string.replace("\\left", "")
|
| 146 |
+
string = string.replace("\\right", "")
|
| 147 |
+
|
| 148 |
+
# Remove circ (degrees)
|
| 149 |
+
string = string.replace("^{\\circ}", "")
|
| 150 |
+
string = string.replace("^\\circ", "")
|
| 151 |
+
|
| 152 |
+
# remove dollar signs
|
| 153 |
+
string = string.replace("\\$", "")
|
| 154 |
+
|
| 155 |
+
# remove units (on the right)
|
| 156 |
+
string = _remove_right_units(string)
|
| 157 |
+
|
| 158 |
+
# remove percentage
|
| 159 |
+
string = string.replace("\\\\%", "")
|
| 160 |
+
string = string.replace("\\%", "")
|
| 161 |
+
|
| 162 |
+
# " 0." equivalent to " ." and "{0." equivalent to "{." Alternatively, add "0" if "." is the start of the string
|
| 163 |
+
string = string.replace(" .", " 0.")
|
| 164 |
+
string = string.replace("{.", "{0.")
|
| 165 |
+
# if empty, return empty string
|
| 166 |
+
if len(string) == 0:
|
| 167 |
+
return string
|
| 168 |
+
if string[0] == ".":
|
| 169 |
+
string = "0" + string
|
| 170 |
+
|
| 171 |
+
# to consider: get rid of e.g. "k = " or "q = " at beginning
|
| 172 |
+
if len(string.split("=")) == 2 and len(string.split("=")[0]) <= 2:
|
| 173 |
+
string = string.split("=")[1]
|
| 174 |
+
|
| 175 |
+
# fix sqrt3 --> sqrt{3}
|
| 176 |
+
string = _fix_sqrt(string)
|
| 177 |
+
|
| 178 |
+
# remove spaces
|
| 179 |
+
string = string.replace(" ", "")
|
| 180 |
+
|
| 181 |
+
# \frac1b or \frac12 --> \frac{1}{b} and \frac{1}{2}, etc. Even works with \frac1{72} (but not \frac{72}1).
|
| 182 |
+
# Also does a/b --> \\frac{a}{b}
|
| 183 |
+
string = _fix_fracs(string)
|
| 184 |
+
|
| 185 |
+
# manually change 0.5 --> \frac{1}{2}
|
| 186 |
+
if string == "0.5":
|
| 187 |
+
string = "\\frac{1}{2}"
|
| 188 |
+
|
| 189 |
+
# NOTE: X/Y changed to \frac{X}{Y} in dataset, but in simple cases fix in case the model output is X/Y
|
| 190 |
+
string = _fix_a_slash_b(string)
|
| 191 |
+
|
| 192 |
+
return string
|
ICL/DAPO/verl-recipe/fault_recover/agent_loop/fault_recover_agent_loop.py
ADDED
|
@@ -0,0 +1,137 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2024 Bytedance Ltd. and/or its affiliates
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
import logging
|
| 15 |
+
import os
|
| 16 |
+
from typing import Any, Optional
|
| 17 |
+
from uuid import uuid4
|
| 18 |
+
|
| 19 |
+
import ray
|
| 20 |
+
from omegaconf import DictConfig
|
| 21 |
+
|
| 22 |
+
from verl.experimental.agent_loop.agent_loop import AgentLoopManager, AgentLoopWorker, AsyncLLMServerManager
|
| 23 |
+
from verl.single_controller.ray.base import RayResourcePool, RayWorkerGroup
|
| 24 |
+
from verl.utils.rollout_trace import rollout_trace_op
|
| 25 |
+
from verl.workers.rollout.replica import TokenOutput
|
| 26 |
+
|
| 27 |
+
logger = logging.getLogger(__file__)
|
| 28 |
+
logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN"))
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
class FaultRecoverAsyncLLMServerManager(AsyncLLMServerManager):
|
| 32 |
+
"""
|
| 33 |
+
A class to manage multiple OpenAI compatible LLM servers. This class provides
|
| 34 |
+
- Load balance: least requests load balancing
|
| 35 |
+
- Sticky session: send multi-turn chat completions to same server for automatic prefix caching
|
| 36 |
+
"""
|
| 37 |
+
|
| 38 |
+
@rollout_trace_op
|
| 39 |
+
async def generate(
|
| 40 |
+
self,
|
| 41 |
+
request_id,
|
| 42 |
+
*,
|
| 43 |
+
prompt_ids: list[int],
|
| 44 |
+
sampling_params: dict[str, Any],
|
| 45 |
+
image_data: Optional[list[Any]] = None,
|
| 46 |
+
video_data: Optional[list[Any]] = None,
|
| 47 |
+
global_id: int = None,
|
| 48 |
+
) -> TokenOutput:
|
| 49 |
+
"""Generate tokens from prompt ids.
|
| 50 |
+
|
| 51 |
+
Args:
|
| 52 |
+
request_id (str): request id for sticky session.
|
| 53 |
+
prompt_ids (List[int]): List of prompt token ids.
|
| 54 |
+
sampling_params (Dict[str, Any]): Sampling parameters for the chat completion.
|
| 55 |
+
global_id: Global batch id of req.
|
| 56 |
+
|
| 57 |
+
Returns:
|
| 58 |
+
TokenOutput: token output
|
| 59 |
+
"""
|
| 60 |
+
server = self._choose_server(request_id)
|
| 61 |
+
new_request_id = uuid4().hex
|
| 62 |
+
tokens_queue = None
|
| 63 |
+
if global_id is not None:
|
| 64 |
+
from recipe.fault_recover.fault_manager import get_tokens_queue
|
| 65 |
+
|
| 66 |
+
tokens_queue = get_tokens_queue()
|
| 67 |
+
|
| 68 |
+
if tokens_queue is not None:
|
| 69 |
+
await tokens_queue.put.remote((new_request_id, global_id))
|
| 70 |
+
|
| 71 |
+
output = await server.generate.remote(
|
| 72 |
+
request_id=new_request_id, # use new request_id for each turn
|
| 73 |
+
prompt_ids=prompt_ids,
|
| 74 |
+
sampling_params=sampling_params,
|
| 75 |
+
image_data=image_data,
|
| 76 |
+
video_data=video_data,
|
| 77 |
+
)
|
| 78 |
+
|
| 79 |
+
if tokens_queue is not None:
|
| 80 |
+
await tokens_queue.put.remote(
|
| 81 |
+
{
|
| 82 |
+
new_request_id: {
|
| 83 |
+
"log_probs": output.log_probs,
|
| 84 |
+
"routed_experts": output.routed_experts,
|
| 85 |
+
"num_preempted": output.num_preempted,
|
| 86 |
+
}
|
| 87 |
+
}
|
| 88 |
+
)
|
| 89 |
+
|
| 90 |
+
return output
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
class FaultRecoverAgentLoopWorker(AgentLoopWorker):
|
| 94 |
+
"""Agent loop worker takes a batch of messages and run each message in an agent loop."""
|
| 95 |
+
|
| 96 |
+
def __init__(
|
| 97 |
+
self,
|
| 98 |
+
config: DictConfig,
|
| 99 |
+
server_handles: list[ray.actor.ActorHandle],
|
| 100 |
+
reward_loop_worker_handles: list[ray.actor.ActorHandle] = None,
|
| 101 |
+
):
|
| 102 |
+
super().__init__(config, server_handles, reward_loop_worker_handles)
|
| 103 |
+
self.server_manager = FaultRecoverAsyncLLMServerManager(config, server_handles)
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
class FaultRecoverAgentLoopManager(AgentLoopManager):
|
| 107 |
+
"""Agent loop manager that manages a group of agent loop workers."""
|
| 108 |
+
|
| 109 |
+
def __init__(
|
| 110 |
+
self,
|
| 111 |
+
config: DictConfig,
|
| 112 |
+
worker_group: RayWorkerGroup = None,
|
| 113 |
+
rollout_resource_pool: RayResourcePool = None,
|
| 114 |
+
reward_loop_worker_handles: list[ray.actor.ActorHandle] = None,
|
| 115 |
+
):
|
| 116 |
+
"""Initialize agent loop manager.
|
| 117 |
+
|
| 118 |
+
Args:
|
| 119 |
+
config (DictConfig): trainer config.
|
| 120 |
+
worker_group (RayWorkerGroup): ActorRolloutRef worker group for hybrid mode; None for standalone mode.
|
| 121 |
+
rollout_resource_pool (RayResourcePool): Resource pool for actor rollout (Colocate or Standalone mode).
|
| 122 |
+
reward_loop_worker_handles (List[ray.actor.ActorHandle]): Actor handles for streaming reward computation.
|
| 123 |
+
"""
|
| 124 |
+
self.config = config
|
| 125 |
+
self.worker_group = worker_group
|
| 126 |
+
self.reward_loop_worker_handles = reward_loop_worker_handles
|
| 127 |
+
|
| 128 |
+
# for recipe to change
|
| 129 |
+
if not hasattr(self, "rollout_replica_class"):
|
| 130 |
+
from recipe.fault_recover.vllm_rollout.vllm_async_server import FaultRecovervLLMReplica
|
| 131 |
+
|
| 132 |
+
self.rollout_replica_class = FaultRecovervLLMReplica
|
| 133 |
+
if not hasattr(self, "agent_loop_workers_class"):
|
| 134 |
+
self.agent_loop_workers_class = ray.remote(FaultRecoverAgentLoopWorker)
|
| 135 |
+
|
| 136 |
+
self._initialize_llm_servers(rollout_resource_pool)
|
| 137 |
+
self._init_agent_loop_workers()
|
ICL/DAPO/verl-recipe/spo/agent_loop/spo_agent_loop.py
ADDED
|
@@ -0,0 +1,155 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2024 Bytedance Ltd. and/or its affiliates
|
| 2 |
+
# Modifications Copyright 2025 SPO authors
|
| 3 |
+
#
|
| 4 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 5 |
+
# you may not use this file except in compliance with the License.
|
| 6 |
+
# You may obtain a copy of the License at
|
| 7 |
+
#
|
| 8 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 9 |
+
#
|
| 10 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 11 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 12 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 13 |
+
# See the License for the specific language governing permissions and
|
| 14 |
+
# limitations under the License.
|
| 15 |
+
"""
|
| 16 |
+
SPO Agent Loop - Extends base agent loop with code generation support.
|
| 17 |
+
|
| 18 |
+
This module inherits from verl.experimental.agent_loop and only overrides
|
| 19 |
+
the generate_sequences method to add SPO-specific stop tokens for code generation.
|
| 20 |
+
"""
|
| 21 |
+
|
| 22 |
+
import asyncio
|
| 23 |
+
|
| 24 |
+
import numpy as np
|
| 25 |
+
import ray
|
| 26 |
+
|
| 27 |
+
from verl import DataProto
|
| 28 |
+
|
| 29 |
+
# Re-export all base classes for backward compatibility
|
| 30 |
+
from verl.experimental.agent_loop.agent_loop import AgentLoopManager, get_trajectory_info
|
| 31 |
+
from verl.experimental.agent_loop.agent_loop import (
|
| 32 |
+
AgentLoopWorkerBase as BaseAgentLoopWorkerBase,
|
| 33 |
+
)
|
| 34 |
+
from verl.utils.transferqueue_utils import tqbridge
|
| 35 |
+
|
| 36 |
+
__all__ = [
|
| 37 |
+
"AgentLoopWorkerBase",
|
| 38 |
+
"SPOAgentLoopWorker",
|
| 39 |
+
"SPOAgentLoopManager",
|
| 40 |
+
]
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
class AgentLoopWorkerBase(BaseAgentLoopWorkerBase):
|
| 44 |
+
"""SPO-specific agent loop worker with code generation stop tokens.
|
| 45 |
+
|
| 46 |
+
Inherits all functionality from base AgentLoopWorkerBase and only overrides
|
| 47 |
+
the generate_sequences method to add SPO-specific parameters:
|
| 48 |
+
- stop="</code>" for code block termination
|
| 49 |
+
- include_stop_str_in_output=True to include the stop token
|
| 50 |
+
"""
|
| 51 |
+
|
| 52 |
+
@tqbridge()
|
| 53 |
+
async def generate_sequences(self, batch: DataProto) -> DataProto:
|
| 54 |
+
"""Generate sequences from agent loop with SPO-specific stop tokens.
|
| 55 |
+
|
| 56 |
+
Override: Adds stop="</code>" and include_stop_str_in_output=True
|
| 57 |
+
to sampling_params for SPO code generation use case.
|
| 58 |
+
|
| 59 |
+
Args:
|
| 60 |
+
batch (DataProto): Input batch.
|
| 61 |
+
|
| 62 |
+
Returns:
|
| 63 |
+
DataProto: Output batch.
|
| 64 |
+
- prompts: [bsz, prompt_length], prompt token ids from dataset.
|
| 65 |
+
- responses: [bsz, response_length], output token ids include response tokens
|
| 66 |
+
from LLM generation and observation tokens from tool_calls.
|
| 67 |
+
- response_mask: [bsz, response_length], 1 for LLM generated tokens, 0 for observation/padding tokens.
|
| 68 |
+
- input_ids: [bsz, prompt_length + response_length], whole sequence token ids, including prompt tokens
|
| 69 |
+
and response tokens.
|
| 70 |
+
- attention_mask: [bsz, prompt_length + response_length], 0 for padding tokens, 1 for other tokens.
|
| 71 |
+
- position_ids: [bsz, prompt_length + response_length], incremental position ids.
|
| 72 |
+
|
| 73 |
+
For multi-turn conversations:
|
| 74 |
+
responses: |<- LLM generation ->|<- tool_calls ->|<- LLM generation ->|<- padding ->|
|
| 75 |
+
response_mask: | 1, 1, 1, ..., 1, 1 | 0, 0, .., 0, 0 | 1, 1, 1, ..., 1, 1 | 0, 0, ..., 0|
|
| 76 |
+
"""
|
| 77 |
+
config = self.config.actor_rollout_ref.rollout
|
| 78 |
+
|
| 79 |
+
# SPO-specific: Add stop tokens for code generation
|
| 80 |
+
sampling_params = dict(
|
| 81 |
+
temperature=config.temperature,
|
| 82 |
+
top_p=config.top_p,
|
| 83 |
+
repetition_penalty=1.0,
|
| 84 |
+
logprobs=config.calculate_log_probs,
|
| 85 |
+
stop="</code>", # SPO-SPECIFIC
|
| 86 |
+
include_stop_str_in_output=True, # SPO-SPECIFIC
|
| 87 |
+
)
|
| 88 |
+
|
| 89 |
+
# override sampling params for validation
|
| 90 |
+
if batch.meta_info.get("validate", False):
|
| 91 |
+
sampling_params["top_p"] = config.val_kwargs.top_p
|
| 92 |
+
sampling_params["temperature"] = config.val_kwargs.temperature
|
| 93 |
+
|
| 94 |
+
# by default, we assume it's a single turn agent
|
| 95 |
+
if "agent_name" not in batch.non_tensor_batch:
|
| 96 |
+
default_agent_loop = config.agent.default_agent_loop
|
| 97 |
+
batch.non_tensor_batch["agent_name"] = np.array([default_agent_loop] * len(batch), dtype=object)
|
| 98 |
+
|
| 99 |
+
if "index" in batch.non_tensor_batch:
|
| 100 |
+
index = batch.non_tensor_batch["index"]
|
| 101 |
+
else:
|
| 102 |
+
index = np.arange(len(batch))
|
| 103 |
+
|
| 104 |
+
trajectory_info = await get_trajectory_info(
|
| 105 |
+
batch.meta_info.get("global_steps", -1), index.tolist(), batch.meta_info.get("validate", False)
|
| 106 |
+
)
|
| 107 |
+
|
| 108 |
+
tasks = []
|
| 109 |
+
for i in range(len(batch)):
|
| 110 |
+
kwargs = {k: v[i] for k, v in batch.non_tensor_batch.items()}
|
| 111 |
+
tasks.append(asyncio.create_task(self._run_agent_loop(sampling_params, trajectory_info[i], **kwargs)))
|
| 112 |
+
outputs = await asyncio.gather(*tasks)
|
| 113 |
+
|
| 114 |
+
output = self._postprocess(outputs)
|
| 115 |
+
return output
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
@ray.remote
|
| 119 |
+
class SPOAgentLoopWorker(AgentLoopWorkerBase):
|
| 120 |
+
"""SPO Agent Loop Worker as a Ray remote actor.
|
| 121 |
+
|
| 122 |
+
This is a Ray remote actor wrapper around AgentLoopWorkerBase,
|
| 123 |
+
enabling distributed execution with SPO-specific stop tokens.
|
| 124 |
+
"""
|
| 125 |
+
|
| 126 |
+
def __init__(self, config, server_handles, reward_router_address=None):
|
| 127 |
+
"""Initialize SPO Agent Loop Worker.
|
| 128 |
+
|
| 129 |
+
Args:
|
| 130 |
+
config: trainer config.
|
| 131 |
+
server_handles: OpenAI compatible LLM server actor handles.
|
| 132 |
+
reward_router_address: reward router address.
|
| 133 |
+
"""
|
| 134 |
+
super().__init__(config, server_handles, reward_router_address)
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
class SPOAgentLoopManager(AgentLoopManager):
    """Agent loop manager specialized for SPO.

    All functionality comes from :class:`AgentLoopManager`; the only change
    is that workers are spawned as :class:`SPOAgentLoopWorker`, which carries
    the code-generation stop tokens.
    """

    def __init__(self, config, worker_group=None, rm_wg=None):
        """Construct the manager.

        Args:
            config: trainer config.
            worker_group: ActorRolloutRef worker group for hybrid mode; None for standalone mode.
            rm_wg: Reward model worker group.
        """
        # The base constructor spawns the workers, so the worker class must
        # be swapped in before delegating to it.
        self.agent_loop_workers_class = SPOAgentLoopWorker
        super().__init__(config, worker_group, rm_wg)
|
ICL/DAPO/verl-recipe/spo/agent_loop/spo_tool_agent_loop.py
ADDED
|
@@ -0,0 +1,414 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2025 Bytedance Ltd. and/or its affiliates
|
| 2 |
+
# Modifications Copyright 2025 SPO authors
|
| 3 |
+
#
|
| 4 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 5 |
+
# you may not use this file except in compliance with the License.
|
| 6 |
+
# You may obtain a copy of the License at
|
| 7 |
+
#
|
| 8 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 9 |
+
#
|
| 10 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 11 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 12 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 13 |
+
# See the License for the specific language governing permissions and
|
| 14 |
+
# limitations under the License.
|
| 15 |
+
import asyncio
|
| 16 |
+
import copy
|
| 17 |
+
import logging
|
| 18 |
+
import os
|
| 19 |
+
from typing import Any, Optional
|
| 20 |
+
from uuid import uuid4
|
| 21 |
+
|
| 22 |
+
from verl.experimental.agent_loop.agent_loop import (
|
| 23 |
+
AgentLoopBase,
|
| 24 |
+
AgentLoopOutput,
|
| 25 |
+
register,
|
| 26 |
+
)
|
| 27 |
+
from verl.experimental.agent_loop.tool_agent_loop import AgentState
|
| 28 |
+
from verl.interactions.base import BaseInteraction
|
| 29 |
+
from verl.interactions.utils.interaction_registry import (
|
| 30 |
+
initialize_interactions_from_config,
|
| 31 |
+
)
|
| 32 |
+
from verl.tools.schemas import ToolResponse
|
| 33 |
+
from verl.tools.utils.tool_registry import initialize_tools_from_config
|
| 34 |
+
from verl.utils.profiler import simple_timer
|
| 35 |
+
from verl.utils.rollout_trace import rollout_trace_op
|
| 36 |
+
|
| 37 |
+
logger = logging.getLogger(__file__)
|
| 38 |
+
logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN"))
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
class AgentData:
    """Container for all mutable state of a single agent-loop rollout."""

    def __init__(
        self,
        messages: list[dict[str, Any]],
        image_data: Any,
        metrics: dict[str, Any],
        request_id: str,
        tools_kwargs: dict[str, Any],
        interaction: Optional[BaseInteraction] = None,
        interaction_kwargs: Optional[dict[str, Any]] = None,
    ):
        # Inputs supplied by the caller.
        self.messages = messages
        self.image_data = image_data
        self.metrics = metrics
        self.request_id = request_id
        self.tools_kwargs = tools_kwargs
        self.interaction = interaction
        self.interaction_kwargs = interaction_kwargs or {}

        # Rollout state accumulated across turns.
        self.prompt_ids: list[int] = []       # full token stream: prompt + all turns
        self.response_ids: list[int] = []     # tokens of the most recent model turn
        self.response_mask: list[int] = []    # 1 = model-generated token, 0 = injected (tool/user)
        self.response_logprobs: list[float] = []
        self.turn_scores: list[float] = []    # per-turn rewards from interactions
        self.tool_rewards: list[float] = []
        self.user_turns = 0
        self.assistant_turns = 0

        # Raw Python code strings extracted from <code> tags, pending execution.
        self.tool_calls: list[str] = []
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
@register("spo_tool_agent")
|
| 77 |
+
class SPOToolAgentLoop(AgentLoopBase):
|
| 78 |
+
@classmethod
|
| 79 |
+
def init_class(cls, config, tokenizer, processor, **kwargs):
|
| 80 |
+
if cls._class_initialized:
|
| 81 |
+
return
|
| 82 |
+
cls._class_initialized = True
|
| 83 |
+
print("Performing class-level ToolAgentLoop initialization")
|
| 84 |
+
|
| 85 |
+
# Initialize tools from config file
|
| 86 |
+
cls.tokenizer = tokenizer
|
| 87 |
+
cls.processor = processor
|
| 88 |
+
cls.max_user_turns = config.actor_rollout_ref.rollout.multi_turn.max_user_turns
|
| 89 |
+
cls.max_assistant_turns = config.actor_rollout_ref.rollout.multi_turn.max_assistant_turns
|
| 90 |
+
cls.max_parallel_calls = config.actor_rollout_ref.rollout.multi_turn.max_parallel_calls
|
| 91 |
+
cls.max_tool_response_length = config.actor_rollout_ref.rollout.multi_turn.max_tool_response_length
|
| 92 |
+
cls.tool_response_truncate_side = config.actor_rollout_ref.rollout.multi_turn.tool_response_truncate_side
|
| 93 |
+
tool_config_path = config.actor_rollout_ref.rollout.multi_turn.tool_config_path
|
| 94 |
+
tool_list = initialize_tools_from_config(tool_config_path) if tool_config_path else []
|
| 95 |
+
cls.tools = {tool.name: tool for tool in tool_list}
|
| 96 |
+
cls.tool_schemas = [tool.tool_schema.model_dump(exclude_unset=True, exclude_none=True) for tool in tool_list]
|
| 97 |
+
print(f"Initialized tools: {cls.tools}")
|
| 98 |
+
|
| 99 |
+
cls.apply_chat_template_kwargs = config.data.get("apply_chat_template_kwargs", {})
|
| 100 |
+
cls.prompt_length = config.actor_rollout_ref.rollout.prompt_length
|
| 101 |
+
cls.response_length = config.actor_rollout_ref.rollout.response_length
|
| 102 |
+
cls.system_prompt = tokenizer.apply_chat_template(
|
| 103 |
+
[{}], add_generation_prompt=False, tokenize=True, **cls.apply_chat_template_kwargs
|
| 104 |
+
)
|
| 105 |
+
# Initialize interactions from config file
|
| 106 |
+
cls.interaction_config_file = config.actor_rollout_ref.rollout.multi_turn.interaction_config_path
|
| 107 |
+
if cls.interaction_config_file:
|
| 108 |
+
cls.interaction_map: dict[str, BaseInteraction] = cls._initialize_interactions(cls.interaction_config_file)
|
| 109 |
+
|
| 110 |
+
@rollout_trace_op
|
| 111 |
+
async def run(self, sampling_params: dict[str, Any], **kwargs) -> AgentLoopOutput:
|
| 112 |
+
messages = list(kwargs["raw_prompt"])
|
| 113 |
+
image_data = copy.deepcopy(kwargs.get("multi_modal_data", {}).get("image", None))
|
| 114 |
+
metrics = {}
|
| 115 |
+
request_id = uuid4().hex
|
| 116 |
+
tools_kwargs = kwargs.get("tools_kwargs", {})
|
| 117 |
+
|
| 118 |
+
# Initialize interaction if needed
|
| 119 |
+
interaction = None
|
| 120 |
+
interaction_kwargs = {}
|
| 121 |
+
if self.interaction_config_file:
|
| 122 |
+
interaction_kwargs = kwargs["extra_info"]["interaction_kwargs"]
|
| 123 |
+
if "name" not in interaction_kwargs:
|
| 124 |
+
raise ValueError("'name' key is required in interaction_kwargs")
|
| 125 |
+
interaction_name = interaction_kwargs["name"]
|
| 126 |
+
if interaction_name not in self.interaction_map:
|
| 127 |
+
raise ValueError(
|
| 128 |
+
f"Interaction '{interaction_name}' not found in interaction_map. Available interactions: "
|
| 129 |
+
f"{list(self.interaction_map.keys())}"
|
| 130 |
+
)
|
| 131 |
+
interaction = self.interaction_map[interaction_name]
|
| 132 |
+
await interaction.start_interaction(request_id, **interaction_kwargs)
|
| 133 |
+
# Create AgentData instance to encapsulate all state
|
| 134 |
+
agent_data = AgentData(
|
| 135 |
+
messages=messages,
|
| 136 |
+
image_data=image_data,
|
| 137 |
+
metrics=metrics,
|
| 138 |
+
request_id=request_id,
|
| 139 |
+
tools_kwargs=tools_kwargs,
|
| 140 |
+
interaction=interaction,
|
| 141 |
+
interaction_kwargs=interaction_kwargs,
|
| 142 |
+
)
|
| 143 |
+
|
| 144 |
+
# State machine loop
|
| 145 |
+
state = AgentState.PENDING
|
| 146 |
+
while state != AgentState.TERMINATED:
|
| 147 |
+
if state == AgentState.PENDING:
|
| 148 |
+
state = await self._handle_pending_state(agent_data, sampling_params)
|
| 149 |
+
elif state == AgentState.GENERATING:
|
| 150 |
+
state = await self._handle_generating_state(agent_data, sampling_params)
|
| 151 |
+
elif state == AgentState.PROCESSING_TOOLS:
|
| 152 |
+
state = await self._handle_processing_tools_state(agent_data)
|
| 153 |
+
elif state == AgentState.INTERACTING:
|
| 154 |
+
state = await self._handle_interacting_state(agent_data)
|
| 155 |
+
else:
|
| 156 |
+
logger.error(f"Invalid state: {state}")
|
| 157 |
+
state = AgentState.TERMINATED
|
| 158 |
+
|
| 159 |
+
# Finalize output
|
| 160 |
+
response_ids = agent_data.prompt_ids[-len(agent_data.response_mask) :]
|
| 161 |
+
prompt_ids = agent_data.prompt_ids[: len(agent_data.prompt_ids) - len(agent_data.response_mask)]
|
| 162 |
+
multi_modal_data = {"image": agent_data.image_data} if agent_data.image_data is not None else {}
|
| 163 |
+
output = AgentLoopOutput(
|
| 164 |
+
prompt_ids=prompt_ids,
|
| 165 |
+
response_ids=response_ids[: self.response_length],
|
| 166 |
+
response_mask=agent_data.response_mask[: self.response_length],
|
| 167 |
+
multi_modal_data=multi_modal_data,
|
| 168 |
+
response_logprobs=agent_data.response_logprobs[: self.response_length]
|
| 169 |
+
if agent_data.response_logprobs
|
| 170 |
+
else None,
|
| 171 |
+
num_turns=agent_data.user_turns + agent_data.assistant_turns + 1,
|
| 172 |
+
metrics=agent_data.metrics,
|
| 173 |
+
extra_fields={},
|
| 174 |
+
)
|
| 175 |
+
output.extra_fields.update({"turn_scores": agent_data.turn_scores, "tool_rewards": agent_data.tool_rewards})
|
| 176 |
+
return output
|
| 177 |
+
|
| 178 |
+
def _extract_code_blocks(self, response_ids: list[int]) -> list[str]:
|
| 179 |
+
"""Extract Python code from <code>...</code> tags in response.
|
| 180 |
+
|
| 181 |
+
Args:
|
| 182 |
+
response_ids: Token IDs from model response
|
| 183 |
+
|
| 184 |
+
Returns:
|
| 185 |
+
List of cleaned Python code strings
|
| 186 |
+
"""
|
| 187 |
+
import re
|
| 188 |
+
|
| 189 |
+
# Decode token IDs to text
|
| 190 |
+
response_text = self.tokenizer.decode(response_ids, skip_special_tokens=False)
|
| 191 |
+
|
| 192 |
+
# Extract all code blocks between <code> and </code> tags
|
| 193 |
+
pattern = r"<code>(.*?)</code>"
|
| 194 |
+
matches = re.findall(pattern, response_text, re.DOTALL)
|
| 195 |
+
|
| 196 |
+
# Clean each code block (remove markdown fences, strip whitespace)
|
| 197 |
+
cleaned_codes = []
|
| 198 |
+
for match in matches:
|
| 199 |
+
# Remove markdown code fences if present
|
| 200 |
+
cleaned = re.sub(r"^```(?:python)?\s*\n?", "", match.strip())
|
| 201 |
+
cleaned = re.sub(r"\n?```\s*$", "", cleaned)
|
| 202 |
+
cleaned_codes.append(cleaned.strip())
|
| 203 |
+
|
| 204 |
+
return cleaned_codes
|
| 205 |
+
|
| 206 |
+
async def _handle_pending_state(self, agent_data: AgentData, sampling_params: dict[str, Any]) -> AgentState:
|
| 207 |
+
"""Handle the pending state: prepare the prompt and start generation."""
|
| 208 |
+
problem = agent_data.messages[0]["content"]
|
| 209 |
+
user_prompt = (
|
| 210 |
+
"Solve the following problem step by step. "
|
| 211 |
+
"You now have the ability to selectively write executable Python code to enhance your reasoning process. "
|
| 212 |
+
"The Python code will be executed by an external sandbox, and the output "
|
| 213 |
+
"(wrapped in `<interpreter>output_str</interpreter>`)"
|
| 214 |
+
" can be returned to aid your reasoning and help you arrive at the final answer. "
|
| 215 |
+
"The Python code should be complete scripts, including necessary imports. "
|
| 216 |
+
"Important: The sandbox is stateless and non-interactive; thus, prior imports, definitions, "
|
| 217 |
+
"and state do not persist between executions and cannot be referenced.\n"
|
| 218 |
+
"Each code snippet is wrapped with `<code>\n```python\ncode snippet\n```\n</code>`.\n"
|
| 219 |
+
)
|
| 220 |
+
user_prompt += "*user question:*\n"
|
| 221 |
+
user_prompt += problem
|
| 222 |
+
messages = [{"role": "user", "content": user_prompt}]
|
| 223 |
+
agent_data.prompt_ids = await self.loop.run_in_executor(
|
| 224 |
+
None,
|
| 225 |
+
lambda: self.tokenizer.apply_chat_template(
|
| 226 |
+
messages, add_generation_prompt=True, tokenize=True, **self.apply_chat_template_kwargs
|
| 227 |
+
),
|
| 228 |
+
)
|
| 229 |
+
|
| 230 |
+
return AgentState.GENERATING
|
| 231 |
+
|
| 232 |
+
async def _handle_generating_state(
|
| 233 |
+
self, agent_data: AgentData, sampling_params: dict[str, Any], ignore_termination: bool = False
|
| 234 |
+
) -> AgentState:
|
| 235 |
+
"""Handle the generating state: generate model response and check for tool calls."""
|
| 236 |
+
add_messages: list[dict[str, Any]] = []
|
| 237 |
+
|
| 238 |
+
with simple_timer("generate_sequences", agent_data.metrics):
|
| 239 |
+
output = await self.server_manager.generate(
|
| 240 |
+
request_id=agent_data.request_id,
|
| 241 |
+
prompt_ids=agent_data.prompt_ids,
|
| 242 |
+
sampling_params=sampling_params,
|
| 243 |
+
image_data=agent_data.image_data,
|
| 244 |
+
)
|
| 245 |
+
|
| 246 |
+
agent_data.assistant_turns += 1
|
| 247 |
+
agent_data.response_ids = output.token_ids
|
| 248 |
+
agent_data.prompt_ids += agent_data.response_ids
|
| 249 |
+
agent_data.response_mask += [1] * len(agent_data.response_ids)
|
| 250 |
+
if output.log_probs:
|
| 251 |
+
agent_data.response_logprobs += output.log_probs
|
| 252 |
+
|
| 253 |
+
# Check termination conditions
|
| 254 |
+
if not ignore_termination and len(agent_data.response_mask) >= self.response_length:
|
| 255 |
+
return AgentState.TERMINATED
|
| 256 |
+
if self.max_assistant_turns and agent_data.assistant_turns >= self.max_assistant_turns:
|
| 257 |
+
return AgentState.TERMINATED
|
| 258 |
+
if self.max_user_turns and agent_data.user_turns >= self.max_user_turns:
|
| 259 |
+
return AgentState.TERMINATED
|
| 260 |
+
|
| 261 |
+
# Extract code blocks from <code> tags
|
| 262 |
+
agent_data.tool_calls = self._extract_code_blocks(agent_data.response_ids)
|
| 263 |
+
|
| 264 |
+
# Handle interaction if needed
|
| 265 |
+
if self.interaction_config_file:
|
| 266 |
+
assistant_message = await self.loop.run_in_executor(
|
| 267 |
+
None, lambda: self.tokenizer.decode(agent_data.response_ids, skip_special_tokens=True)
|
| 268 |
+
)
|
| 269 |
+
add_messages.append({"role": "assistant", "content": assistant_message})
|
| 270 |
+
agent_data.messages.extend(add_messages)
|
| 271 |
+
|
| 272 |
+
# Determine next state
|
| 273 |
+
if agent_data.tool_calls:
|
| 274 |
+
return AgentState.PROCESSING_TOOLS
|
| 275 |
+
elif self.interaction_config_file:
|
| 276 |
+
return AgentState.INTERACTING
|
| 277 |
+
else:
|
| 278 |
+
return AgentState.TERMINATED
|
| 279 |
+
|
| 280 |
+
async def _handle_processing_tools_state(self, agent_data: AgentData) -> AgentState:
|
| 281 |
+
"""Handle the processing tools state: execute tool calls and prepare tool responses."""
|
| 282 |
+
tasks = []
|
| 283 |
+
tool_call_names = []
|
| 284 |
+
for tool_call in agent_data.tool_calls[: self.max_parallel_calls]:
|
| 285 |
+
tasks.append(self._call_tool(tool_call, agent_data.tools_kwargs))
|
| 286 |
+
tool_call_names.append("code_interpreter")
|
| 287 |
+
|
| 288 |
+
with simple_timer("tool_calls", agent_data.metrics):
|
| 289 |
+
responses = await asyncio.gather(*tasks)
|
| 290 |
+
|
| 291 |
+
response_ids = await self.loop.run_in_executor(
|
| 292 |
+
None, lambda: self.tokenizer.encode(responses[0].text or "", add_special_tokens=False)
|
| 293 |
+
)
|
| 294 |
+
|
| 295 |
+
if len(agent_data.response_mask) + len(response_ids) >= self.response_length:
|
| 296 |
+
return AgentState.TERMINATED
|
| 297 |
+
# Update prompt_ids and response_mask
|
| 298 |
+
agent_data.prompt_ids += response_ids
|
| 299 |
+
agent_data.response_mask += [0] * len(response_ids)
|
| 300 |
+
if agent_data.response_logprobs:
|
| 301 |
+
agent_data.response_logprobs += [0.0] * len(response_ids)
|
| 302 |
+
agent_data.user_turns += 1
|
| 303 |
+
# Change agent_data.request_id to avoid caching issues
|
| 304 |
+
agent_data.request_id = uuid4().hex
|
| 305 |
+
return AgentState.GENERATING
|
| 306 |
+
|
| 307 |
+
async def _handle_interacting_state(self, agent_data: AgentData) -> AgentState:
|
| 308 |
+
"""Handle the interacting state: get user input from interaction."""
|
| 309 |
+
(
|
| 310 |
+
should_terminate_sequence,
|
| 311 |
+
interaction_responses,
|
| 312 |
+
reward,
|
| 313 |
+
metrics,
|
| 314 |
+
) = await agent_data.interaction.generate_response(
|
| 315 |
+
agent_data.request_id, agent_data.messages, **agent_data.interaction_kwargs
|
| 316 |
+
)
|
| 317 |
+
agent_data.user_turns += 1
|
| 318 |
+
|
| 319 |
+
add_messages: list[dict[str, Any]] = [{"role": "user", "content": interaction_responses}]
|
| 320 |
+
agent_data.messages.extend(add_messages)
|
| 321 |
+
|
| 322 |
+
if reward is not None:
|
| 323 |
+
agent_data.turn_scores.append(reward)
|
| 324 |
+
|
| 325 |
+
# Update prompt with user responses (similar to _handle_processing_tools_state)
|
| 326 |
+
if self.processor is not None:
|
| 327 |
+
raw_user_response = await self.loop.run_in_executor(
|
| 328 |
+
None,
|
| 329 |
+
lambda: self.processor.apply_chat_template(
|
| 330 |
+
add_messages,
|
| 331 |
+
add_generation_prompt=True,
|
| 332 |
+
tokenize=False,
|
| 333 |
+
**self.apply_chat_template_kwargs,
|
| 334 |
+
),
|
| 335 |
+
)
|
| 336 |
+
model_inputs = self.processor(text=[raw_user_response], images=None, return_tensors="pt")
|
| 337 |
+
response_ids = model_inputs.pop("input_ids").squeeze(0).tolist()
|
| 338 |
+
else:
|
| 339 |
+
response_ids = await self.loop.run_in_executor(
|
| 340 |
+
None,
|
| 341 |
+
lambda: self.tokenizer.apply_chat_template(add_messages, add_generation_prompt=True, tokenize=True),
|
| 342 |
+
)
|
| 343 |
+
response_ids = response_ids[len(self.system_prompt) :]
|
| 344 |
+
|
| 345 |
+
# Update prompt_ids and response_mask
|
| 346 |
+
agent_data.prompt_ids += response_ids
|
| 347 |
+
agent_data.response_mask += [0] * len(response_ids)
|
| 348 |
+
if agent_data.response_logprobs:
|
| 349 |
+
agent_data.response_logprobs += [0.0] * len(response_ids)
|
| 350 |
+
|
| 351 |
+
# double check prompt
|
| 352 |
+
# Check termination condition
|
| 353 |
+
if should_terminate_sequence:
|
| 354 |
+
return AgentState.TERMINATED
|
| 355 |
+
else:
|
| 356 |
+
return AgentState.GENERATING
|
| 357 |
+
|
| 358 |
+
async def _call_tool(self, tool_call: str, tools_kwargs: dict[str, Any]) -> tuple[ToolResponse, float, dict]:
|
| 359 |
+
"""Call tool and return tool response."""
|
| 360 |
+
tool, instance_id = None, None
|
| 361 |
+
try:
|
| 362 |
+
tool = self.tools["code_interpreter"]
|
| 363 |
+
instance_id, _ = await tool.create(create_kwargs={})
|
| 364 |
+
|
| 365 |
+
tool_execution_response, _, _ = await tool.execute(instance_id, tool_call)
|
| 366 |
+
except Exception as e:
|
| 367 |
+
logger.warning(f"Error when executing tool: {e}")
|
| 368 |
+
return (
|
| 369 |
+
ToolResponse(
|
| 370 |
+
text=f"Error when executing tool: {e}",
|
| 371 |
+
),
|
| 372 |
+
0.0,
|
| 373 |
+
{},
|
| 374 |
+
)
|
| 375 |
+
finally:
|
| 376 |
+
if tool and instance_id:
|
| 377 |
+
await tool.release(instance_id)
|
| 378 |
+
|
| 379 |
+
tool_response_text = tool_execution_response.text
|
| 380 |
+
if tool_response_text and len(tool_response_text) > self.max_tool_response_length:
|
| 381 |
+
if self.tool_response_truncate_side == "left":
|
| 382 |
+
tool_response_text = tool_response_text[: self.max_tool_response_length] + "...(truncated)"
|
| 383 |
+
elif self.tool_response_truncate_side == "right":
|
| 384 |
+
tool_response_text = "(truncated)..." + tool_response_text[-self.max_tool_response_length :]
|
| 385 |
+
else:
|
| 386 |
+
length = self.max_tool_response_length // 2
|
| 387 |
+
tool_response_text = tool_response_text[:length] + "...(truncated)..." + tool_response_text[-length:]
|
| 388 |
+
|
| 389 |
+
tool_response_text = f"<interpreter>\n{tool_response_text}\n</interpreter>\n\n"
|
| 390 |
+
|
| 391 |
+
# Create ToolResponse from tool execution result
|
| 392 |
+
tool_response_kwargs = {"text": tool_response_text}
|
| 393 |
+
|
| 394 |
+
# Add multimedia data if present
|
| 395 |
+
for attr_name in ["image", "video"]:
|
| 396 |
+
if hasattr(tool_execution_response, attr_name):
|
| 397 |
+
attr_value = getattr(tool_execution_response, attr_name)
|
| 398 |
+
if attr_value is not None:
|
| 399 |
+
tool_response_kwargs[attr_name] = attr_value
|
| 400 |
+
|
| 401 |
+
return ToolResponse(**tool_response_kwargs)
|
| 402 |
+
|
| 403 |
+
@classmethod
|
| 404 |
+
def _initialize_interactions(cls, interaction_config_file):
|
| 405 |
+
"""Initialize interactions from configuration.
|
| 406 |
+
Returns:
|
| 407 |
+
dict[str, BaseInteraction]: A dictionary mapping interaction names to interaction instances.
|
| 408 |
+
"""
|
| 409 |
+
if interaction_config_file is None:
|
| 410 |
+
return {}
|
| 411 |
+
|
| 412 |
+
interaction_map = initialize_interactions_from_config(interaction_config_file)
|
| 413 |
+
logger.info(f"Initialize interactions from configuration: interaction_map: {list(interaction_map.keys())}")
|
| 414 |
+
return interaction_map
|
ICL/DAPO/verl-recipe/sppo/config/sppo_trainer.yaml
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# the sppo config will override default ppo_trainer.yaml
|
| 2 |
+
|
| 3 |
+
hydra:
|
| 4 |
+
searchpath:
|
| 5 |
+
- file://verl/trainer/config
|
| 6 |
+
|
| 7 |
+
defaults:
|
| 8 |
+
- ppo_trainer
|
| 9 |
+
- _self_
|
| 10 |
+
|
| 11 |
+
actor_rollout_ref:
|
| 12 |
+
actor:
|
| 13 |
+
_target_: recipe.sppo.config.SPPOActorConfig
|
| 14 |
+
|
| 15 |
+
# sppo_eta is an additional hyperparameter for SPPO, not available in
|
| 16 |
+
# verl core. specifying _target_ with SPPOActorConfig is needed to
|
| 17 |
+
# extend verl ActorConfig with custom fields.
|
| 18 |
+
# additionally, it is also possible to use the `extra` field natively supported
|
| 19 |
+
# by all verl core dataclasses, without having to define SPPOActorConfig
|
| 20 |
+
# extra:
|
| 21 |
+
# sppo_eta: 1.0
|
| 22 |
+
sppo_eta: 1.0
|
| 23 |
+
|
| 24 |
+
optim:
|
| 25 |
+
lr_warmup_steps: 15
|
| 26 |
+
rollout:
|
| 27 |
+
name: sglang
|
| 28 |
+
tensor_model_parallel_size: 2
|
| 29 |
+
gpu_memory_utilization: 0.5
|
| 30 |
+
val_kwargs:
|
| 31 |
+
n: 2 # 2 will trigger validation, 1 will bypass
|
| 32 |
+
|
| 33 |
+
algorithm:
|
| 34 |
+
adv_estimator: null
|
| 35 |
+
sppo_eta: 1.0
|
| 36 |
+
|
| 37 |
+
trainer:
|
| 38 |
+
log_val_generations: 0
|
ICL/EVAL_GUIDE.md
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ICL 模型评测步骤
|
| 2 |
+
|
| 3 |
+
## Step 1: 合并 DeepSpeed checkpoint(safetensors 格式)
|
| 4 |
+
|
| 5 |
+
```bash
|
| 6 |
+
cd /workspace/xiaobin/ICL
|
| 7 |
+
|
| 8 |
+
python3 sft_model/zero_to_fp32.py \
|
| 9 |
+
sft_model \
|
| 10 |
+
sft_model/merged_hf \
|
| 11 |
+
--safe_serialization
|
| 12 |
+
```
|
| 13 |
+
|
| 14 |
+
## Step 2: 复制 tokenizer 和 config(注意不要复制 model.safetensors.index.json)
|
| 15 |
+
|
| 16 |
+
```bash
|
| 17 |
+
cp /workspace/models/Qwen3-VL-8B-Instruct/config.json sft_model/merged_hf/
|
| 18 |
+
cp /workspace/models/Qwen3-VL-8B-Instruct/generation_config.json sft_model/merged_hf/
|
| 19 |
+
cp /workspace/models/Qwen3-VL-8B-Instruct/preprocessor_config.json sft_model/merged_hf/
|
| 20 |
+
cp /workspace/models/Qwen3-VL-8B-Instruct/chat_template.json sft_model/merged_hf/ 2>/dev/null
|
| 21 |
+
cp /workspace/models/Qwen3-VL-8B-Instruct/tokenizer* sft_model/merged_hf/
|
| 22 |
+
cp /workspace/models/Qwen3-VL-8B-Instruct/merges.txt sft_model/merged_hf/
|
| 23 |
+
cp /workspace/models/Qwen3-VL-8B-Instruct/vocab.json sft_model/merged_hf/
|
| 24 |
+
```
|
| 25 |
+
|
| 26 |
+
## Step 3: 跑评测
|
| 27 |
+
|
| 28 |
+
单卡:
|
| 29 |
+
|
| 30 |
+
```bash
|
| 31 |
+
python3 eval_icl.py \
|
| 32 |
+
--model-path sft_model/merged_hf \
|
| 33 |
+
--all-categories \
|
| 34 |
+
--num-samples 100 \
|
| 35 |
+
--max-rounds 4 \
|
| 36 |
+
--device cuda:0
|
| 37 |
+
```
|
| 38 |
+
|
| 39 |
+
多卡 (8 GPU):
|
| 40 |
+
|
| 41 |
+
```bash
|
| 42 |
+
torchrun --nproc_per_node=8 eval_icl.py \
|
| 43 |
+
--model-path sft_model/merged_hf \
|
| 44 |
+
--all-categories \
|
| 45 |
+
--num-samples 100 \
|
| 46 |
+
--max-rounds 4
|
| 47 |
+
```
|
ICL/LV/dataset_inspect.tree.txt
ADDED
|
@@ -0,0 +1,456 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
M3IT/
|
| 2 |
+
.git/
|
| 3 |
+
data/
|
| 4 |
+
.gitattributes (2.8KB)
|
| 5 |
+
.gitignore (29.0B)
|
| 6 |
+
M3IT.py (54.5KB)
|
| 7 |
+
README.md (18.3KB)
|
| 8 |
+
branches/
|
| 9 |
+
hooks/
|
| 10 |
+
info/
|
| 11 |
+
lfs/
|
| 12 |
+
logs/
|
| 13 |
+
objects/
|
| 14 |
+
refs/
|
| 15 |
+
FETCH_HEAD (110.0B)
|
| 16 |
+
HEAD (21.0B)
|
| 17 |
+
config (339.0B)
|
| 18 |
+
description (73.0B)
|
| 19 |
+
packed-refs (112.0B)
|
| 20 |
+
refs/
|
| 21 |
+
HEAD (189.0B)
|
| 22 |
+
heads/
|
| 23 |
+
remotes/
|
| 24 |
+
main (189.0B)
|
| 25 |
+
heads/
|
| 26 |
+
remotes/
|
| 27 |
+
tags/
|
| 28 |
+
origin/
|
| 29 |
+
HEAD (30.0B)
|
| 30 |
+
main (41.0B)
|
| 31 |
+
info/
|
| 32 |
+
pack/
|
| 33 |
+
pack-ee3e40a1a23ec17affa3b8afb61dc14bdffb229c.idx (38.9KB)
|
| 34 |
+
pack-ee3e40a1a23ec17affa3b8afb61dc14bdffb229c.pack (195.5KB)
|
| 35 |
+
applypatch-msg.sample (478.0B)
|
| 36 |
+
commit-msg.sample (896.0B)
|
| 37 |
+
fsmonitor-watchman.sample (4.5KB)
|
| 38 |
+
post-checkout (280.0B)
|
| 39 |
+
post-commit (276.0B)
|
| 40 |
+
post-merge (274.0B)
|
| 41 |
+
post-update.sample (189.0B)
|
| 42 |
+
pre-applypatch.sample (424.0B)
|
| 43 |
+
pre-commit.sample (1.6KB)
|
| 44 |
+
pre-merge-commit.sample (416.0B)
|
| 45 |
+
pre-push (270.0B)
|
| 46 |
+
pre-push.sample (1.3KB)
|
| 47 |
+
pre-rebase.sample (4.8KB)
|
| 48 |
+
pre-receive.sample (544.0B)
|
| 49 |
+
prepare-commit-msg.sample (1.5KB)
|
| 50 |
+
push-to-checkout.sample (2.7KB)
|
| 51 |
+
update.sample (3.6KB)
|
| 52 |
+
incomplete/
|
| 53 |
+
logs/
|
| 54 |
+
objects/
|
| 55 |
+
tmp/
|
| 56 |
+
0152398d9443f2d300adc9e6099a773c66303d4e2e085812cd502cb36da7a0c73483193049 (0.0B)
|
| 57 |
+
0152398d9443f2d300adc9e6099a773c66303d4e2e085812cd502cb36da7a0c7763208216 (0.0B)
|
| 58 |
+
0152398d9443f2d300adc9e6099a773c66303d4e2e085812cd502cb36da7a0c789921672 (2.5MB)
|
| 59 |
+
0968a4438d46277583968011563e959e130feaee66f51bb2d66dbd7e8c979f8c.part (0.0B)
|
| 60 |
+
1f77f56225e10edca84be06b6e0d796c579cbf1d4884aee46da564438ad1ba9b1484563810 (437.0KB)
|
| 61 |
+
1f77f56225e10edca84be06b6e0d796c579cbf1d4884aee46da564438ad1ba9b3850099655 (326.7KB)
|
| 62 |
+
1f77f56225e10edca84be06b6e0d796c579cbf1d4884aee46da564438ad1ba9b3898577811 (4.1MB)
|
| 63 |
+
220d32d087b6b29d1c5aaa49324d32b32ae1c19f42e9800f40f24d3a695c2a8d1743027097 (0.0B)
|
| 64 |
+
220d32d087b6b29d1c5aaa49324d32b32ae1c19f42e9800f40f24d3a695c2a8d3014727128 (0.0B)
|
| 65 |
+
220d32d087b6b29d1c5aaa49324d32b32ae1c19f42e9800f40f24d3a695c2a8d71894927 (62.6KB)
|
| 66 |
+
24f014bb5bc7b1fa7d9183dd65fd4b43c0c49aafd6af01bb91ae3a0e7e65502b2818819757 (49.3MB)
|
| 67 |
+
3da69649bfbc671710f38c2c2f7c6aaecb8f8544de3446866054bf927257c9332854861486 (158.6KB)
|
| 68 |
+
3da69649bfbc671710f38c2c2f7c6aaecb8f8544de3446866054bf927257c9334214717938 (0.0B)
|
| 69 |
+
3da69649bfbc671710f38c2c2f7c6aaecb8f8544de3446866054bf927257c933593947826 (0.0B)
|
| 70 |
+
45e8c51ed0df8edb1ae51d2012b3f7d6cd9cc84addf41e6f9f9adb0f625d41033126870057 (259.2MB)
|
| 71 |
+
4a80559730d917177e4d13246da0ce23ca318735b29d519d0448bea5579b1a771450117433 (154.4MB)
|
| 72 |
+
4fda2aa4918e5dec847935db6d46e9bebc570a173bd4201c5f48e60a3f73813a1530155941 (1.1MB)
|
| 73 |
+
4fda2aa4918e5dec847935db6d46e9bebc570a173bd4201c5f48e60a3f73813a2738070238 (0.0B)
|
| 74 |
+
4fda2aa4918e5dec847935db6d46e9bebc570a173bd4201c5f48e60a3f73813a2828099128 (0.0B)
|
| 75 |
+
52a445f8a26cd898e64129e7f1d4bfa6d7203311442068684f5344fc73407310.part (0.0B)
|
| 76 |
+
6728a8fb7bad0bad3a2a27669232cb9ae66461c635172f1f7958c80a28e09fa32607733000 (150.2MB)
|
| 77 |
+
6bb6c9f17e77eab7d88e4a4501c38cb31a6cf792fe77e3b75d511b964a5667df2998182268 (91.8MB)
|
| 78 |
+
8cb15647ff6bbac322142fea1a38599c523f73acb3614ddb7d12e6a1975a79dc1986657385 (0.0B)
|
| 79 |
+
8cb15647ff6bbac322142fea1a38599c523f73acb3614ddb7d12e6a1975a79dc2743098052 (0.0B)
|
| 80 |
+
8cb15647ff6bbac322142fea1a38599c523f73acb3614ddb7d12e6a1975a79dc4193739161 (0.0B)
|
| 81 |
+
9919274ad6bc88e37235a4c7245d05e357e404ef3352a90a1ba0594e694893c01114223911 (0.0B)
|
| 82 |
+
9919274ad6bc88e37235a4c7245d05e357e404ef3352a90a1ba0594e694893c03545613611 (0.0B)
|
| 83 |
+
9919274ad6bc88e37235a4c7245d05e357e404ef3352a90a1ba0594e694893c0559090370 (2.8MB)
|
| 84 |
+
9cdf4d1a6972db893c8db1a4f2be0d1ec0362ba22a44542402b336760029c87253830692 (88.0MB)
|
| 85 |
+
b6aed90c79d180c5346994f8e7d0657b3d8a9aab002c057503736b4013a2096b.part (0.0B)
|
| 86 |
+
ba47b9680dc949322877399218d1f210a057249803bc70addfb9528152e4b1662004000729 (218.5MB)
|
| 87 |
+
ca49e0b3f3400f38519a1103b2a567db32c9fa990a7395b1024b94454601479b.part (0.0B)
|
| 88 |
+
d66a5b3267a7935b8ff272bcc166a8f43a8d66fb89c59503d536ac87661a02022501429466 (0.0B)
|
| 89 |
+
d66a5b3267a7935b8ff272bcc166a8f43a8d66fb89c59503d536ac87661a020230475132 (0.0B)
|
| 90 |
+
d66a5b3267a7935b8ff272bcc166a8f43a8d66fb89c59503d536ac87661a0202373225118 (62.5KB)
|
| 91 |
+
e5a3eb3e2d0c47d6f014e294ef7398bf26375920c8d2af80fd65e255396dcc78.part (0.0B)
|
| 92 |
+
f19cacf3a9f9a57abdcafc4a6d242aa9c6fa48188ad0a394b1a2558cb8ab4dc5372340294 (199.2MB)
|
| 93 |
+
20251021T152133.441099492.log (1.4KB)
|
| 94 |
+
01/
|
| 95 |
+
02/
|
| 96 |
+
03/
|
| 97 |
+
05/
|
| 98 |
+
06/
|
| 99 |
+
07/
|
| 100 |
+
09/
|
| 101 |
+
0b/
|
| 102 |
+
0f/
|
| 103 |
+
10/
|
| 104 |
+
12/
|
| 105 |
+
15/
|
| 106 |
+
16/
|
| 107 |
+
19/
|
| 108 |
+
1d/
|
| 109 |
+
1e/
|
| 110 |
+
1f/
|
| 111 |
+
21/
|
| 112 |
+
22/
|
| 113 |
+
23/
|
| 114 |
+
24/
|
| 115 |
+
2a/
|
| 116 |
+
2b/
|
| 117 |
+
2c/
|
| 118 |
+
2d/
|
| 119 |
+
2f/
|
| 120 |
+
30/
|
| 121 |
+
32/
|
| 122 |
+
34/
|
| 123 |
+
37/
|
| 124 |
+
3b/
|
| 125 |
+
3d/
|
| 126 |
+
44/
|
| 127 |
+
45/
|
| 128 |
+
4a/
|
| 129 |
+
4f/
|
| 130 |
+
50/
|
| 131 |
+
52/
|
| 132 |
+
54/
|
| 133 |
+
56/
|
| 134 |
+
58/
|
| 135 |
+
5a/
|
| 136 |
+
5b/
|
| 137 |
+
60/
|
| 138 |
+
61/
|
| 139 |
+
64/
|
| 140 |
+
65/
|
| 141 |
+
67/
|
| 142 |
+
68/
|
| 143 |
+
69/
|
| 144 |
+
6b/
|
| 145 |
+
6d/
|
| 146 |
+
6e/
|
| 147 |
+
70/
|
| 148 |
+
75/
|
| 149 |
+
76/
|
| 150 |
+
7b/
|
| 151 |
+
7c/
|
| 152 |
+
80/
|
| 153 |
+
87/
|
| 154 |
+
88/
|
| 155 |
+
89/
|
| 156 |
+
8b/
|
| 157 |
+
8c/
|
| 158 |
+
90/
|
| 159 |
+
91/
|
| 160 |
+
93/
|
| 161 |
+
99/
|
| 162 |
+
9a/
|
| 163 |
+
9b/
|
| 164 |
+
9c/
|
| 165 |
+
9e/
|
| 166 |
+
9f/
|
| 167 |
+
a0/
|
| 168 |
+
a5/
|
| 169 |
+
a9/
|
| 170 |
+
ac/
|
| 171 |
+
ae/
|
| 172 |
+
b1/
|
| 173 |
+
b3/
|
| 174 |
+
b4/
|
| 175 |
+
b6/
|
| 176 |
+
ba/
|
| 177 |
+
bb/
|
| 178 |
+
bc/
|
| 179 |
+
bd/
|
| 180 |
+
be/
|
| 181 |
+
c0/
|
| 182 |
+
c1/
|
| 183 |
+
c2/
|
| 184 |
+
c4/
|
| 185 |
+
c6/
|
| 186 |
+
c7/
|
| 187 |
+
c8/
|
| 188 |
+
ca/
|
| 189 |
+
cb/
|
| 190 |
+
d6/
|
| 191 |
+
d9/
|
| 192 |
+
dd/
|
| 193 |
+
e2/
|
| 194 |
+
e5/
|
| 195 |
+
e7/
|
| 196 |
+
e8/
|
| 197 |
+
e9/
|
| 198 |
+
ee/
|
| 199 |
+
ef/
|
| 200 |
+
f1/
|
| 201 |
+
f3/
|
| 202 |
+
f4/
|
| 203 |
+
f5/
|
| 204 |
+
f6/
|
| 205 |
+
f7/
|
| 206 |
+
f8/
|
| 207 |
+
f9/
|
| 208 |
+
fc/
|
| 209 |
+
exclude (240.0B)
|
| 210 |
+
captioning/
|
| 211 |
+
classification/
|
| 212 |
+
generation/
|
| 213 |
+
reasoning/
|
| 214 |
+
vqa/
|
| 215 |
+
chinesefoodnet-10/
|
| 216 |
+
coco-goi/
|
| 217 |
+
coco-text/
|
| 218 |
+
imagenet/
|
| 219 |
+
iqa/
|
| 220 |
+
itm/
|
| 221 |
+
mocheg/
|
| 222 |
+
refcoco/
|
| 223 |
+
snli-ve/
|
| 224 |
+
ss/
|
| 225 |
+
vsr/
|
| 226 |
+
winoground/
|
| 227 |
+
.gitattributes (141.0B)
|
| 228 |
+
README.md (211.0B)
|
| 229 |
+
instructions.json (1.4KB)
|
| 230 |
+
labels.json (9.0KB)
|
| 231 |
+
test.jsonl (223.5MB)
|
| 232 |
+
train.jsonl (238.9MB)
|
| 233 |
+
val.jsonl (227.6MB)
|
| 234 |
+
README.md (31.0B)
|
| 235 |
+
esnlive_test.jsonl (743.0MB)
|
| 236 |
+
esnlive_train.jsonl (1000.8MB)
|
| 237 |
+
esnlive_val.jsonl (717.9MB)
|
| 238 |
+
instructions.json (1.9KB)
|
| 239 |
+
test_2023-10-09.jsonl (2.9GB)
|
| 240 |
+
train_2023-10-09.jsonl (3.9GB)
|
| 241 |
+
instructions.json (825.0B)
|
| 242 |
+
mapping.txt (30.9KB)
|
| 243 |
+
test_2023-10-08.jsonl (10.6GB)
|
| 244 |
+
train.jsonl (1.5GB)
|
| 245 |
+
train_2023-10-08.jsonl (5.9GB)
|
| 246 |
+
val.jsonl (2.6GB)
|
| 247 |
+
instructions.json (907.0B)
|
| 248 |
+
test.jsonl (330.4MB)
|
| 249 |
+
test_2023-10-09.jsonl (1.3GB)
|
| 250 |
+
train.jsonl (1.9GB)
|
| 251 |
+
train_2023-10-08.jsonl (7.8GB)
|
| 252 |
+
val.jsonl (330.8MB)
|
| 253 |
+
instructions.json (773.0B)
|
| 254 |
+
test.jsonl (730.0MB)
|
| 255 |
+
test_2023-10-09.jsonl (2.9GB)
|
| 256 |
+
train.jsonl (4.3GB)
|
| 257 |
+
train_2023-10-08.jsonl (17.1GB)
|
| 258 |
+
val.jsonl (730.2MB)
|
| 259 |
+
instructions.json (1.4KB)
|
| 260 |
+
test_2023-10-09.jsonl (553.7MB)
|
| 261 |
+
train_2023-10-09.jsonl (1.9GB)
|
| 262 |
+
vsr_test.jsonl (137.7MB)
|
| 263 |
+
vsr_train.jsonl (483.3MB)
|
| 264 |
+
vsr_val.jsonl (68.8MB)
|
| 265 |
+
instructions.json (774.0B)
|
| 266 |
+
test_2023-10-10.jsonl (7.6GB)
|
| 267 |
+
train.jsonl (8.2GB)
|
| 268 |
+
train_2023-10-08.jsonl (32.8GB)
|
| 269 |
+
val.jsonl (1.9GB)
|
| 270 |
+
instructions.json (733.0B)
|
| 271 |
+
test_2023-10-07.jsonl (279.1MB)
|
| 272 |
+
train.jsonl (2.0GB)
|
| 273 |
+
train_2023-10-06.jsonl (4.1GB)
|
| 274 |
+
val.jsonl (138.9MB)
|
| 275 |
+
instructions.json (2.0KB)
|
| 276 |
+
winoground_test.jsonl (245.5MB)
|
| 277 |
+
instructions.json (1.3KB)
|
| 278 |
+
test.jsonl (122.9MB)
|
| 279 |
+
instructions.json (1.0KB)
|
| 280 |
+
mocheg_test.jsonl (60.3MB)
|
| 281 |
+
mocheg_train.jsonl (631.7MB)
|
| 282 |
+
mocheg_val.jsonl (28.2MB)
|
| 283 |
+
test_2023-10-08.jsonl (242.5MB)
|
| 284 |
+
train_2023-10-08.jsonl (2.5GB)
|
| 285 |
+
instructions.json (1.5KB)
|
| 286 |
+
test.jsonl (701.9MB)
|
| 287 |
+
test_2023-10-08.jsonl (2.7GB)
|
| 288 |
+
train.jsonl (3.9GB)
|
| 289 |
+
train_2023-10-08.jsonl (15.6GB)
|
| 290 |
+
val.jsonl (667.7MB)
|
| 291 |
+
clevr/
|
| 292 |
+
nlvr/
|
| 293 |
+
science_qa/
|
| 294 |
+
vcr/
|
| 295 |
+
visual_mrc/
|
| 296 |
+
instructions.json (2.5KB)
|
| 297 |
+
science_qa_test.jsonl (174.0MB)
|
| 298 |
+
science_qa_train.jsonl (531.3MB)
|
| 299 |
+
science_qa_validation.jsonl (176.4MB)
|
| 300 |
+
instructions.json (976.0B)
|
| 301 |
+
train.jsonl (5.6GB)
|
| 302 |
+
train_2023-10-07.jsonl (11.1GB)
|
| 303 |
+
val.jsonl (379.6MB)
|
| 304 |
+
val_2023-10-07.jsonl (760.4MB)
|
| 305 |
+
instructions.json (911.0B)
|
| 306 |
+
test.jsonl (1.2GB)
|
| 307 |
+
train.jsonl (3.9GB)
|
| 308 |
+
val.jsonl (266.9MB)
|
| 309 |
+
instructions.json (1.3KB)
|
| 310 |
+
test.jsonl (909.3MB)
|
| 311 |
+
train.jsonl (4.3GB)
|
| 312 |
+
val.jsonl (992.9MB)
|
| 313 |
+
instructions.json (1.2KB)
|
| 314 |
+
test.jsonl (489.0MB)
|
| 315 |
+
train.jsonl (7.9GB)
|
| 316 |
+
val.jsonl (533.3MB)
|
| 317 |
+
mmchat/
|
| 318 |
+
multi30k/
|
| 319 |
+
vist/
|
| 320 |
+
visual_dialog/
|
| 321 |
+
instructions.json (818.0B)
|
| 322 |
+
test.jsonl (65.2MB)
|
| 323 |
+
test_2023-10-10.jsonl (262.2MB)
|
| 324 |
+
train.jsonl (3.2GB)
|
| 325 |
+
train_2023-10-09.jsonl (13.0GB)
|
| 326 |
+
val.jsonl (66.0MB)
|
| 327 |
+
instructions.json (1.2KB)
|
| 328 |
+
test.jsonl (610.6MB)
|
| 329 |
+
train.jsonl (4.4GB)
|
| 330 |
+
val.jsonl (301.1MB)
|
| 331 |
+
instructions.json (809.0B)
|
| 332 |
+
test.jsonl (2.3GB)
|
| 333 |
+
train.jsonl (6.2GB)
|
| 334 |
+
train_new.jsonl (6.2GB)
|
| 335 |
+
validation.jsonl (2.0GB)
|
| 336 |
+
instructions.json (1.0KB)
|
| 337 |
+
test.jsonl (14.0GB)
|
| 338 |
+
train.jsonl (15.4GB)
|
| 339 |
+
val.jsonl (13.0GB)
|
| 340 |
+
a-okvqa/
|
| 341 |
+
activitynet-qa/
|
| 342 |
+
docvqa/
|
| 343 |
+
fm-iqa/
|
| 344 |
+
gqa/
|
| 345 |
+
ivqa/
|
| 346 |
+
msrvtt-qa/
|
| 347 |
+
msvd-qa/
|
| 348 |
+
ocr-vqa/
|
| 349 |
+
okvqa/
|
| 350 |
+
shapes/
|
| 351 |
+
st-vqa/
|
| 352 |
+
text-vqa/
|
| 353 |
+
viquae/
|
| 354 |
+
vqav2/
|
| 355 |
+
instruction.json (905.0B)
|
| 356 |
+
train.jsonl (533.5MB)
|
| 357 |
+
train_new.jsonl (533.5MB)
|
| 358 |
+
validation.jsonl (228.3MB)
|
| 359 |
+
instructions.json (1.9KB)
|
| 360 |
+
train.jsonl (1.2GB)
|
| 361 |
+
train_v2.jsonl (1.2GB)
|
| 362 |
+
val.jsonl (77.7MB)
|
| 363 |
+
val_v2.jsonl (78.2MB)
|
| 364 |
+
instruction.json (905.0B)
|
| 365 |
+
test.jsonl (713.3MB)
|
| 366 |
+
train.jsonl (3.3GB)
|
| 367 |
+
validation_new.jsonl (529.5MB)
|
| 368 |
+
instruction.json (772.0B)
|
| 369 |
+
train.jsonl (1.5GB)
|
| 370 |
+
validation.jsonl (260.3MB)
|
| 371 |
+
instruction.json (853.0B)
|
| 372 |
+
test.jsonl (229.4MB)
|
| 373 |
+
train.jsonl (1.4GB)
|
| 374 |
+
README.md (288.0B)
|
| 375 |
+
instructions.json (1.2KB)
|
| 376 |
+
test.jsonl (132.4MB)
|
| 377 |
+
train.jsonl (343.1MB)
|
| 378 |
+
val.jsonl (60.9MB)
|
| 379 |
+
instructions.json (853.0B)
|
| 380 |
+
train.jsonl (1.9GB)
|
| 381 |
+
val.jsonl (1.9GB)
|
| 382 |
+
instructions.json (1.7KB)
|
| 383 |
+
train.jsonl (7.2GB)
|
| 384 |
+
val.jsonl (976.6MB)
|
| 385 |
+
instructions.json (1.5KB)
|
| 386 |
+
test.jsonl (1.4MB)
|
| 387 |
+
test_2023-10-08.jsonl (7.0MB)
|
| 388 |
+
train.large.jsonl (18.3MB)
|
| 389 |
+
train_2023-10-08.jsonl (92.6MB)
|
| 390 |
+
val.jsonl (1.4MB)
|
| 391 |
+
README.md (334.0B)
|
| 392 |
+
instructions.json (1.0KB)
|
| 393 |
+
test.jsonl (500.8MB)
|
| 394 |
+
train.jsonl (1.5GB)
|
| 395 |
+
val.jsonl (485.4MB)
|
| 396 |
+
README.md (434.0B)
|
| 397 |
+
instructions.json (1.0KB)
|
| 398 |
+
test.jsonl (348.1MB)
|
| 399 |
+
train.jsonl (757.5MB)
|
| 400 |
+
val.jsonl (58.0MB)
|
| 401 |
+
.gitattributes (141.0B)
|
| 402 |
+
README.md (332.0B)
|
| 403 |
+
instructions.json (1.4KB)
|
| 404 |
+
test.jsonl (474.7MB)
|
| 405 |
+
train.jsonl (2.1GB)
|
| 406 |
+
val.jsonl (1.1GB)
|
| 407 |
+
instructions.json (1.2KB)
|
| 408 |
+
train.jsonl (594.8MB)
|
| 409 |
+
train_v2.jsonl (596.3MB)
|
| 410 |
+
val.jsonl (334.3MB)
|
| 411 |
+
val_v2.jsonl (335.2MB)
|
| 412 |
+
instructions.json (802.0B)
|
| 413 |
+
para_train.jsonl (10.5GB)
|
| 414 |
+
para_val.jsonl (4.8GB)
|
| 415 |
+
train.jsonl (10.5GB)
|
| 416 |
+
val.jsonl (4.8GB)
|
| 417 |
+
instructions.json (1.2KB)
|
| 418 |
+
test.jsonl (122.5MB)
|
| 419 |
+
test_v2.jsonl (120.9MB)
|
| 420 |
+
train.jsonl (110.1MB)
|
| 421 |
+
train_v2.jsonl (110.2MB)
|
| 422 |
+
validation.jsonl (125.5MB)
|
| 423 |
+
validation_v2.jsonl (125.6MB)
|
| 424 |
+
coco/
|
| 425 |
+
coco-cn/
|
| 426 |
+
flickr8k-cn/
|
| 427 |
+
image_paragraph_captioning/
|
| 428 |
+
msrvtt/
|
| 429 |
+
textcap/
|
| 430 |
+
.gitattributes (141.0B)
|
| 431 |
+
README.md (490.0B)
|
| 432 |
+
instructions.json (1010.0B)
|
| 433 |
+
test.jsonl (117.1MB)
|
| 434 |
+
train.jsonl (231.1MB)
|
| 435 |
+
val.jsonl (116.9MB)
|
| 436 |
+
instructions.json (541.0B)
|
| 437 |
+
test.jsonl (49.4MB)
|
| 438 |
+
train.jsonl (300.0MB)
|
| 439 |
+
val.jsonl (49.9MB)
|
| 440 |
+
instructions.json (790.0B)
|
| 441 |
+
test.jsonl (66.4MB)
|
| 442 |
+
train.jsonl (1.2GB)
|
| 443 |
+
val.jsonl (65.0MB)
|
| 444 |
+
image_paragraph_captioning_test.jsonl (120.7MB)
|
| 445 |
+
image_paragraph_captioning_train.jsonl (701.2MB)
|
| 446 |
+
image_paragraph_captioning_val.jsonl (118.0MB)
|
| 447 |
+
instruction.json (1.4KB)
|
| 448 |
+
README.md (73.0B)
|
| 449 |
+
create_dataset.py (5.5KB)
|
| 450 |
+
instructions.json (882.0B)
|
| 451 |
+
test.jsonl (333.1MB)
|
| 452 |
+
train.jsonl (7.4GB)
|
| 453 |
+
val.jsonl (333.4MB)
|
| 454 |
+
instructions.json (1.1KB)
|
| 455 |
+
train.jsonl (5.7GB)
|
| 456 |
+
val.jsonl (851.3MB)
|
ICL/RL_DAPO/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
|
ICL/SFT_new/README.md
ADDED
|
@@ -0,0 +1,389 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Qwen3-VL-8B Single-Step Decision SFT
|
| 2 |
+
|
| 3 |
+
## 项目结构
|
| 4 |
+
|
| 5 |
+
```
|
| 6 |
+
SFT_new/
|
| 7 |
+
├── build_sft.py # 数据构造 (SigLIP2 相似度选 shots, 单步决策格式)
|
| 8 |
+
├── generate_captions.py # VLM 批量 caption 生成 (替代短答案作为检索描述)
|
| 9 |
+
├── train.py # 训练主脚本 (DeepSpeed + Flash Attention 2)
|
| 10 |
+
├── ds_zero2.json # DeepSpeed ZeRO-2 配置 (推荐, 速度快)
|
| 11 |
+
├── ds_zero3.json # DeepSpeed ZeRO-3 配置 (备用, 更省显存)
|
| 12 |
+
├── run_single_node.sh # 单机启动脚本 (debug)
|
| 13 |
+
├── run_multi_node.sh # 多机训练入口 (每个 node 执行)
|
| 14 |
+
├── submit_northjob.sh # northjob 集群提交 (64卡)
|
| 15 |
+
├── launch_wrapper.py # northjob → bash 桥接
|
| 16 |
+
└── README.md # 本文件
|
| 17 |
+
```
|
| 18 |
+
|
| 19 |
+
---
|
| 20 |
+
|
| 21 |
+
## 整体 Pipeline
|
| 22 |
+
|
| 23 |
+
```
|
| 24 |
+
原始数据集 (jsonl + 图片)
|
| 25 |
+
│
|
| 26 |
+
▼
|
| 27 |
+
┌─────────────────┐
|
| 28 |
+
│ build_sft.py │ --build-cache ← 只跑一次, GPU
|
| 29 |
+
│ SigLIP2 编码 │ 生成 emb_cache/
|
| 30 |
+
└────────┬────────┘
|
| 31 |
+
│
|
| 32 |
+
▼
|
| 33 |
+
┌──────────────────────┐
|
| 34 |
+
│ generate_captions.py │ VLM API 批量生成 ← 只跑一次, 无需 GPU
|
| 35 |
+
│ 生成 caption_cache/ │ (vLLM 部署的 Qwen3-VL)
|
| 36 |
+
└────────┬─────────────┘
|
| 37 |
+
│
|
| 38 |
+
▼
|
| 39 |
+
┌─────────────────┐
|
| 40 |
+
│ build_sft.py │ 构造 SFT 数据 ← CPU, 可多进程并行
|
| 41 |
+
│ 读取 emb_cache │ 读取 caption_cache
|
| 42 |
+
│ + caption_cache │ 输出 sft.jsonl
|
| 43 |
+
└────────┬────────┘
|
| 44 |
+
│
|
| 45 |
+
▼
|
| 46 |
+
┌─────────────────┐
|
| 47 |
+
│ train.py │ DeepSpeed 训练
|
| 48 |
+
└─────────────────┘
|
| 49 |
+
```
|
| 50 |
+
|
| 51 |
+
---
|
| 52 |
+
|
| 53 |
+
## 1. 配环境
|
| 54 |
+
|
| 55 |
+
```bash
|
| 56 |
+
# 创建 conda 环境
|
| 57 |
+
conda create -n sft python=3.11 -y
|
| 58 |
+
conda activate sft
|
| 59 |
+
|
| 60 |
+
# PyTorch 2.4 + CUDA 12 (匹配 flash-attn whl)
|
| 61 |
+
pip install torch==2.4.0 torchvision==0.19.0 --index-url https://download.pytorch.org/whl/cu124
|
| 62 |
+
|
| 63 |
+
# Flash Attention 2 (本地 whl, 先试 TRUE 版, 不行换 FALSE 版)
|
| 64 |
+
pip install /workspace/flash_attn-2.8.3+cu12torch2.4cxx11abiTRUE-cp311-cp311-linux_x86_64.whl
|
| 65 |
+
# 如果报 CXX11 ABI 不匹配:
|
| 66 |
+
# pip install /workspace/flash_attn-2.8.3+cu12torch2.4cxx11abiFALSE-cp311-cp311-linux_x86_64.whl
|
| 67 |
+
|
| 68 |
+
# 核心依赖
|
| 69 |
+
pip install transformers>=4.57.0
|
| 70 |
+
pip install accelerate>=1.13.0
|
| 71 |
+
pip install peft>=0.18.0
|
| 72 |
+
pip install deepspeed>=0.16.0
|
| 73 |
+
pip install qwen-vl-utils
|
| 74 |
+
pip install tqdm pillow
|
| 75 |
+
pip install openai # generate_captions.py 需要
|
| 76 |
+
|
| 77 |
+
# 验证安装
|
| 78 |
+
python -c "
|
| 79 |
+
import torch, transformers, deepspeed, flash_attn, peft
|
| 80 |
+
print(f'torch: {torch.__version__}')
|
| 81 |
+
print(f'transformers: {transformers.__version__}')
|
| 82 |
+
print(f'deepspeed: {deepspeed.__version__}')
|
| 83 |
+
print(f'flash_attn: {flash_attn.__version__}')
|
| 84 |
+
print(f'peft: {peft.__version__}')
|
| 85 |
+
print(f'CUDA: {torch.cuda.is_available()}, {torch.cuda.get_device_name(0)}')
|
| 86 |
+
from transformers import Qwen3VLForConditionalGeneration
|
| 87 |
+
print('Qwen3VL: OK')
|
| 88 |
+
"
|
| 89 |
+
```
|
| 90 |
+
|
| 91 |
+
**注意**: flash-attn whl 是针对 torch 2.4 编译的, 所以 PyTorch 必须装 2.4.x 版本.
|
| 92 |
+
|
| 93 |
+
---
|
| 94 |
+
|
| 95 |
+
## 2. 构造数据
|
| 96 |
+
|
| 97 |
+
### 2.1 构建 SigLIP embedding 缓存 (只跑一次, GPU)
|
| 98 |
+
|
| 99 |
+
```bash
|
| 100 |
+
conda activate sft
|
| 101 |
+
|
| 102 |
+
python /workspace/xiaobin/ICL/SFT_new/build_sft.py \
|
| 103 |
+
--build-cache \
|
| 104 |
+
--data-root /path/to/your/dataset \
|
| 105 |
+
--output-dir /workspace/xiaobin/ICL/SFT_new/output \
|
| 106 |
+
--siglip-model /workspace/models/siglip2-so400m-patch16-naflex \
|
| 107 |
+
--device cuda:0 \
|
| 108 |
+
--categories vqa,captioning,classification,reasoning
|
| 109 |
+
```
|
| 110 |
+
|
| 111 |
+
缓存保存在 `output/emb_cache/` 下, JSON 格式 (float16 base64), 可跨环境复用.
|
| 112 |
+
|
| 113 |
+
### 2.2 生成 VLM Caption (只跑一次, 调 API 无需本地 GPU)
|
| 114 |
+
|
| 115 |
+
**为什么需要这一步**: 很多 VQA 数据集的 answer 是短答案 ("yes", "3", "cab"), 不适合做语义检索的 query 描述. 用 VLM 给每张 pool 图片生成描述性 caption, 作为 `<RET>` 输出的 Description 和 context shot 的 Caption, 质量远好于原始 answer.
|
| 116 |
+
|
| 117 |
+
#### 启动 vLLM 服务 (NorthServe)
|
| 118 |
+
|
| 119 |
+
```bash
|
| 120 |
+
# 启动 Qwen3-VL-8B 推理服务(8 副本,每副本 1 卡)
|
| 121 |
+
HOME=/root /workspace/nex-agi/NorthServe/northserve launch \
|
| 122 |
+
--model-name qwen3vl8b-caption \
|
| 123 |
+
--served-model-name Qwen3-VL-8B-Instruct \
|
| 124 |
+
--namespace bg-agentic-coding \
|
| 125 |
+
--model-path /i_workspace/models/Qwen3-VL-8B-Instruct \
|
| 126 |
+
--volumes "i-xinsiyang-y4zy0sik0a:/i_workspace" \
|
| 127 |
+
--replicas 32 \
|
| 128 |
+
--gpus-per-pod 1 \
|
| 129 |
+
--pods-per-job 1 \
|
| 130 |
+
--profile generation \
|
| 131 |
+
--backend vllm \
|
| 132 |
+
--priority-class-name higher-priority-job \
|
| 133 |
+
--extra-cmds "--trust-remote-code --max-model-len 4096 --max-num-seqs 128" \
|
| 134 |
+
-y
|
| 135 |
+
|
| 136 |
+
# 验证(所有模型共用 http://10.51.6.110/v1,模型名在请求体里指定)
|
| 137 |
+
curl http://10.51.6.110/v1/models
|
| 138 |
+
```
|
| 139 |
+
|
| 140 |
+
#### 生成 caption (emb_cache 对齐版)
|
| 141 |
+
|
| 142 |
+
```bash
|
| 143 |
+
python /workspace/xiaobin/ICL/SFT_new/generate_captions.py \
|
| 144 |
+
--api-base http://10.51.6.110/v1 \
|
| 145 |
+
--model Qwen3-VL-8B-Instruct \
|
| 146 |
+
--emb-cache-dir /workspace/xiaobin/ICL/SFT_new/output/emb_cache \
|
| 147 |
+
--output-dir /workspace/xiaobin/ICL/SFT_new/output/caption_cache \
|
| 148 |
+
--num-workers 128 \
|
| 149 |
+
--prompt "Describe this image in one or two sentences. Focus on the main objects, their attributes, and spatial relationships."
|
| 150 |
+
```
|
| 151 |
+
|
| 152 |
+
#### 生成 caption (全量图片版, 按 split 分开保存)
|
| 153 |
+
|
| 154 |
+
```bash
|
| 155 |
+
# 全量跑 (~200 万张图)
|
| 156 |
+
python /workspace/xiaobin/ICL/SFT_new/generate_captions_all.py \
|
| 157 |
+
--api-base http://10.51.6.110/v1 \
|
| 158 |
+
--model Qwen3-VL-8B-Instruct \
|
| 159 |
+
--num-workers 128
|
| 160 |
+
|
| 161 |
+
# 只跑某个 category
|
| 162 |
+
python /workspace/xiaobin/ICL/SFT_new/generate_captions_all.py \
|
| 163 |
+
--api-base http://10.51.6.110/v1 \
|
| 164 |
+
--model Qwen3-VL-8B-Instruct \
|
| 165 |
+
--categories vqa \
|
| 166 |
+
--num-workers 128
|
| 167 |
+
```
|
| 168 |
+
|
| 169 |
+
输出到 `/workspace/xiaobin/dataset/detail/{category}/{dataset}/{split}/captions.json`
|
| 170 |
+
|
| 171 |
+
#### 停止服务
|
| 172 |
+
|
| 173 |
+
```bash
|
| 174 |
+
HOME=/root /workspace/nex-agi/NorthServe/northserve stop \
|
| 175 |
+
--model-name qwen3vl8b-caption
|
| 176 |
+
```
|
| 177 |
+
|
| 178 |
+
**关键特性**:
|
| 179 |
+
- **断点续传**: 已完成的文件自动跳过, 部分完成的只处理缺失图片
|
| 180 |
+
- **定期存盘**: 每 500 张自动保存 (防崩溃丢数据), `--save-every` 可调
|
| 181 |
+
- **并发请求**: `--num-workers 128`, 8 副本理论上限 1024, 不报错就往大了开
|
| 182 |
+
|
| 183 |
+
### 2.3 构建 SFT 数据集 (CPU, 不需要 GPU, 可多进程并行)
|
| 184 |
+
|
| 185 |
+
```bash
|
| 186 |
+
# 单进程
|
| 187 |
+
python /workspace/xiaobin/ICL/SFT_new/build_sft.py \
|
| 188 |
+
--data-root /path/to/your/dataset \
|
| 189 |
+
--output-dir /workspace/xiaobin/ICL/SFT_new/output \
|
| 190 |
+
--caption-cache-dir /workspace/xiaobin/ICL/SFT_new/output/caption_cache \
|
| 191 |
+
--samples-per-cat 20000 \
|
| 192 |
+
--max-shots 3 \
|
| 193 |
+
--answer-at-weights 3,3,2,1
|
| 194 |
+
|
| 195 |
+
# 多进程并行 (4 shards)
|
| 196 |
+
for i in 0 1 2 3; do
|
| 197 |
+
python /workspace/xiaobin/ICL/SFT_new/build_sft.py \
|
| 198 |
+
--data-root /path/to/your/dataset \
|
| 199 |
+
--output-dir /workspace/xiaobin/ICL/SFT_new/output \
|
| 200 |
+
--caption-cache-dir /workspace/xiaobin/ICL/SFT_new/output/caption_cache \
|
| 201 |
+
--shard-id $i --num-shards 4 &
|
| 202 |
+
done
|
| 203 |
+
wait
|
| 204 |
+
|
| 205 |
+
# 合并
|
| 206 |
+
python /workspace/xiaobin/ICL/SFT_new/build_sft.py \
|
| 207 |
+
--data-root /path/to/your/dataset \
|
| 208 |
+
--output-dir /workspace/xiaobin/ICL/SFT_new/output \
|
| 209 |
+
--merge --shuffle
|
| 210 |
+
```
|
| 211 |
+
|
| 212 |
+
**注意**: `--caption-cache-dir` 不传或目录不存在时行为和之前完全一致(用原始 answer)。正式训练前务必先跑 `generate_captions.py` 生成完整的 caption cache。
|
| 213 |
+
|
| 214 |
+
最终数据: `output/all/sft.jsonl`
|
| 215 |
+
|
| 216 |
+
**生成数据中的描述字段变化**:
|
| 217 |
+
```
|
| 218 |
+
# 之前 (用原始 answer, 短答案质量差)
|
| 219 |
+
{"from": "gpt", "value": "<RET>\nDescription: yes"}
|
| 220 |
+
{"from": "human", "value": "...<image>\nCaption: yes..."}
|
| 221 |
+
|
| 222 |
+
# 现在 (用 VLM 生成的描述, 适合语义检索)
|
| 223 |
+
{"from": "gpt", "value": "<RET>\nDescription: A woman cutting a large white cake in a kitchen."}
|
| 224 |
+
{"from": "human", "value": "...<image>\nCaption: A woman cutting a large white cake in a kitchen...."}
|
| 225 |
+
```
|
| 226 |
+
|
| 227 |
+
---
|
| 228 |
+
|
| 229 |
+
## 3. 训练
|
| 230 |
+
|
| 231 |
+
### 3.1 单机 debug (1 node x 8 H100)
|
| 232 |
+
|
| 233 |
+
```bash
|
| 234 |
+
conda activate sft
|
| 235 |
+
|
| 236 |
+
bash /workspace/xiaobin/ICL/SFT_new/run_single_node.sh \
|
| 237 |
+
/workspace/xiaobin/ICL/SFT_new/output/all/sft.jsonl \
|
| 238 |
+
8
|
| 239 |
+
```
|
| 240 |
+
|
| 241 |
+
可改 GPU 数快速 debug:
|
| 242 |
+
```bash
|
| 243 |
+
# 用 2 卡 debug
|
| 244 |
+
bash /workspace/xiaobin/ICL/SFT_new/run_single_node.sh /path/to/sft.jsonl 2
|
| 245 |
+
```
|
| 246 |
+
|
| 247 |
+
### 3.2 多机训练 (8 nodes x 8 GPUs = 64 H100)
|
| 248 |
+
|
| 249 |
+
**方式 A: northjob 提交 (推荐)**
|
| 250 |
+
|
| 251 |
+
先修改 `submit_northjob.sh` 里的 k8s 参数 (queue/namespace/pvc-name 改成你自己的), 然后:
|
| 252 |
+
|
| 253 |
+
```bash
|
| 254 |
+
bash /workspace/xiaobin/ICL/SFT_new/submit_northjob.sh 64 # 64卡
|
| 255 |
+
bash /workspace/xiaobin/ICL/SFT_new/submit_northjob.sh 32 # 32卡
|
| 256 |
+
```
|
| 257 |
+
|
| 258 |
+
**方式 B: 手动 torchrun (每个 node 上跑)**
|
| 259 |
+
|
| 260 |
+
```bash
|
| 261 |
+
# 在每个 node 上执行, 修改 --node_rank=0/1/2/.../7
|
| 262 |
+
torchrun \
|
| 263 |
+
--nproc_per_node=8 \
|
| 264 |
+
--nnodes=8 \
|
| 265 |
+
--node_rank=${NODE_RANK} \
|
| 266 |
+
--master_addr=${MASTER_ADDR} \
|
| 267 |
+
--master_port=29500 \
|
| 268 |
+
/workspace/xiaobin/ICL/SFT_new/train.py \
|
| 269 |
+
--model-path /workspace/models/Qwen3-VL-8B-Instruct \
|
| 270 |
+
--data-path /workspace/xiaobin/ICL/SFT_new/output/all/sft.jsonl \
|
| 271 |
+
--output-dir /workspace/xiaobin/ICL/SFT_new/output/qwen3vl_sft_64gpu \
|
| 272 |
+
--deepspeed /workspace/xiaobin/ICL/SFT_new/ds_zero2.json \
|
| 273 |
+
--num-epochs 3 \
|
| 274 |
+
--batch-size 1 \
|
| 275 |
+
--gradient-accumulation-steps 2 \
|
| 276 |
+
--learning-rate 2e-5
|
| 277 |
+
```
|
| 278 |
+
|
| 279 |
+
---
|
| 280 |
+
|
| 281 |
+
## 4. 训练策略说明
|
| 282 |
+
|
| 283 |
+
| 配置 | 单机 8 GPU (debug) | 64 GPU (正式) |
|
| 284 |
+
|------|-------------------|---------------|
|
| 285 |
+
| 并行 | DeepSpeed ZeRO-2 | DeepSpeed ZeRO-2 |
|
| 286 |
+
| micro_batch/GPU | 1 | 1 |
|
| 287 |
+
| grad_accum | 8 | 2 |
|
| 288 |
+
| **global_batch** | **64** | **128** |
|
| 289 |
+
| LR | 1e-5 | 2e-5 |
|
| 290 |
+
| Epochs | 3 | 3 |
|
| 291 |
+
| max_length | 4096 | 4096 |
|
| 292 |
+
| 精度 | BF16 | BF16 |
|
| 293 |
+
| Attention | Flash Attention 2 | Flash Attention 2 |
|
| 294 |
+
| Gradient ckpt | yes | yes |
|
| 295 |
+
| 训��方式 | Full fine-tuning | Full fine-tuning |
|
| 296 |
+
|
| 297 |
+
**为什么 ZeRO-2**: 8B 模型 BF16 约 16GB, H100 80GB 绰绰有余, ZeRO-2 比 ZeRO-3 快 30-40%.
|
| 298 |
+
|
| 299 |
+
**为什么 Full FT**: 任务需要学 `<RET>/<ANS>` 新 token + 新决策能力, LoRA 对 embedding 层学习有限. 加 `--use-lora` 可切换.
|
| 300 |
+
|
| 301 |
+
**Loss**: 只在 assistant turn 内容上计算, user turn 全部 mask (-100).
|
| 302 |
+
|
| 303 |
+
---
|
| 304 |
+
|
| 305 |
+
## 5. 关键参数调整
|
| 306 |
+
|
| 307 |
+
```bash
|
| 308 |
+
# 如果显存不够 → 降 max_pixels 或切 ZeRO-3
|
| 309 |
+
--max-pixels $((512*28*28)) # 减少图片分辨率
|
| 310 |
+
--deepspeed ds_zero3.json # 切 ZeRO-3
|
| 311 |
+
|
| 312 |
+
# 如果想用 LoRA (省显存, 快, 但效果可能差一点)
|
| 313 |
+
--use-lora --lora-rank 64 --lora-alpha 128
|
| 314 |
+
|
| 315 |
+
# 调整 n-shot 分布 (answer_at_weights)
|
| 316 |
+
--answer-at-weights 3,3,2,1 # 偏向少 shot (默认)
|
| 317 |
+
--answer-at-weights 1,1,1,1 # 均匀分布
|
| 318 |
+
--answer-at-weights 1,2,3,3 # 偏向多 shot
|
| 319 |
+
```
|
| 320 |
+
|
| 321 |
+
---
|
| 322 |
+
|
| 323 |
+
## 6. 输出目录结构
|
| 324 |
+
|
| 325 |
+
```
|
| 326 |
+
output/
|
| 327 |
+
├── emb_cache/ # SigLIP2 embedding 缓存
|
| 328 |
+
│ ├── vqa_vqav2.json
|
| 329 |
+
│ ├── vqa_okvqa.json
|
| 330 |
+
│ └── ...
|
| 331 |
+
├── caption_cache/ # VLM 生成的 caption 缓存
|
| 332 |
+
│ ├── vqa_vqav2.json
|
| 333 |
+
│ ├── vqa_okvqa.json
|
| 334 |
+
│ └── ...
|
| 335 |
+
├── vqa/
|
| 336 |
+
│ ├── sft.part00.jsonl # 分片
|
| 337 |
+
│ └── sft.jsonl # 合并后
|
| 338 |
+
├── captioning/
|
| 339 |
+
│ └── ...
|
| 340 |
+
├── classification/
|
| 341 |
+
│ └── ...
|
| 342 |
+
├── reasoning/
|
| 343 |
+
│ └── ...
|
| 344 |
+
└── all/
|
| 345 |
+
└── sft.jsonl # 全部合并 + shuffle, 训练用这个
|
| 346 |
+
```
|
| 347 |
+
|
| 348 |
+
---
|
| 349 |
+
|
| 350 |
+
## 7. 快速验证 (小规模测试)
|
| 351 |
+
|
| 352 |
+
```bash
|
| 353 |
+
# Step 1: 建 embedding cache
|
| 354 |
+
python build_sft.py --build-cache --data-root /path/to/data \
|
| 355 |
+
--categories vqa --device cuda:0
|
| 356 |
+
|
| 357 |
+
# Step 2: 生成 VLM caption (先小规模测试)
|
| 358 |
+
python generate_captions.py \
|
| 359 |
+
--api-base http://10.51.6.110/v1 \
|
| 360 |
+
--model Qwen3-VL-8B-Instruct \
|
| 361 |
+
--emb-cache-dir ./output/emb_cache \
|
| 362 |
+
--output-dir ./output/caption_cache \
|
| 363 |
+
--num-workers 128 --save-every 50
|
| 364 |
+
|
| 365 |
+
# Step 3: 检查 caption 质量
|
| 366 |
+
python -c "
|
| 367 |
+
import json
|
| 368 |
+
d = json.load(open('./output/caption_cache/vqa_vqav2.json'))
|
| 369 |
+
for k, v in list(d['items'].items())[:10]:
|
| 370 |
+
print(f'{k}\n → {v}\n')
|
| 371 |
+
"
|
| 372 |
+
|
| 373 |
+
# Step 4: 构造 SFT 数据 (100 条快速测试)
|
| 374 |
+
python build_sft.py --data-root /path/to/data \
|
| 375 |
+
--caption-cache-dir ./output/caption_cache \
|
| 376 |
+
--categories vqa --samples-per-cat 100
|
| 377 |
+
|
| 378 |
+
# Step 5: 检查生成结果
|
| 379 |
+
python -c "
|
| 380 |
+
import json
|
| 381 |
+
with open('./output/vqa/sft.part00.jsonl') as f:
|
| 382 |
+
for i, line in enumerate(f):
|
| 383 |
+
if i >= 5: break
|
| 384 |
+
r = json.loads(line)
|
| 385 |
+
for c in r['conversations']:
|
| 386 |
+
print(f'[{c[\"from\"]}] {c[\"value\"][:120]}')
|
| 387 |
+
print('---')
|
| 388 |
+
"
|
| 389 |
+
```
|
ICL/SFT_new/convert_and_eval.sh
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
# =============================================================================
|
| 3 |
+
# DeepSpeed ZeRO checkpoint -> HuggingFace 格式转换 + 跑评测
|
| 4 |
+
#
|
| 5 |
+
# 用法:
|
| 6 |
+
# bash convert_and_eval.sh # 转换 epoch3_step1406,8卡评测
|
| 7 |
+
# bash convert_and_eval.sh final # 转换 final checkpoint
|
| 8 |
+
# bash convert_and_eval.sh epoch2_step937 # 转换指定 checkpoint
|
| 9 |
+
# NUM_GPUS=4 bash convert_and_eval.sh # 4卡评测
|
| 10 |
+
# SKIP_EVAL=1 bash convert_and_eval.sh # 只转换不评测
|
| 11 |
+
# =============================================================================
|
| 12 |
+
set -euo pipefail
|
| 13 |
+
|
| 14 |
+
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
|
| 15 |
+
|
| 16 |
+
# ---- 参数 ----
|
| 17 |
+
CKPT_TAG="${1:-epoch3_step1406}"
|
| 18 |
+
CKPT_DIR="/workspace/xiaobin/ICL/sft_model"
|
| 19 |
+
BASE_MODEL="/workspace/models/Qwen3-VL-8B-Instruct"
|
| 20 |
+
OUTPUT_DIR="${CKPT_DIR}/${CKPT_TAG}_fp32"
|
| 21 |
+
NUM_GPUS="${NUM_GPUS:-8}"
|
| 22 |
+
BATCH_SIZE="${BATCH_SIZE:-32}"
|
| 23 |
+
SKIP_EVAL="${SKIP_EVAL:-0}"
|
| 24 |
+
|
| 25 |
+
echo "============================================"
|
| 26 |
+
echo " Checkpoint: ${CKPT_TAG}"
|
| 27 |
+
echo " Source: ${CKPT_DIR}/${CKPT_TAG}"
|
| 28 |
+
echo " Output: ${OUTPUT_DIR}"
|
| 29 |
+
echo " Base model: ${BASE_MODEL}"
|
| 30 |
+
echo "============================================"
|
| 31 |
+
|
| 32 |
+
# ---- Step 1: 检查源 checkpoint 存在 ----
|
| 33 |
+
if [ ! -d "${CKPT_DIR}/${CKPT_TAG}" ]; then
|
| 34 |
+
echo "[ERROR] Checkpoint not found: ${CKPT_DIR}/${CKPT_TAG}"
|
| 35 |
+
echo "Available checkpoints:"
|
| 36 |
+
ls -d "${CKPT_DIR}"/epoch* "${CKPT_DIR}"/final 2>/dev/null || echo " (none)"
|
| 37 |
+
exit 1
|
| 38 |
+
fi
|
| 39 |
+
|
| 40 |
+
# ---- Step 2: 转换 DeepSpeed ZeRO -> fp32 ----
|
| 41 |
+
if [ -d "${OUTPUT_DIR}" ] && [ "$(ls -A "${OUTPUT_DIR}" 2>/dev/null)" ]; then
|
| 42 |
+
echo "[SKIP] ${OUTPUT_DIR} already exists, skipping conversion."
|
| 43 |
+
echo " Delete it if you want to re-convert."
|
| 44 |
+
else
|
| 45 |
+
echo "[1/3] Converting DeepSpeed ZeRO checkpoint to fp32..."
|
| 46 |
+
mkdir -p "${OUTPUT_DIR}"
|
| 47 |
+
python3 "${CKPT_DIR}/zero_to_fp32.py" \
|
| 48 |
+
"${CKPT_DIR}" \
|
| 49 |
+
"${OUTPUT_DIR}" \
|
| 50 |
+
--tag "${CKPT_TAG}" \
|
| 51 |
+
--safe_serialization
|
| 52 |
+
echo "Done."
|
| 53 |
+
fi
|
| 54 |
+
|
| 55 |
+
# ---- Step 3: 拷贝 config / tokenizer ----
|
| 56 |
+
echo "[2/3] Copying config & tokenizer from base model..."
|
| 57 |
+
FILES_TO_COPY=(
|
| 58 |
+
config.json
|
| 59 |
+
tokenizer.json
|
| 60 |
+
tokenizer_config.json
|
| 61 |
+
generation_config.json
|
| 62 |
+
preprocessor_config.json
|
| 63 |
+
video_preprocessor_config.json
|
| 64 |
+
special_tokens_map.json
|
| 65 |
+
chat_template.json
|
| 66 |
+
merges.txt
|
| 67 |
+
vocab.json
|
| 68 |
+
)
|
| 69 |
+
copied=0
|
| 70 |
+
for f in "${FILES_TO_COPY[@]}"; do
|
| 71 |
+
if [ -f "${BASE_MODEL}/${f}" ] && [ ! -f "${OUTPUT_DIR}/${f}" ]; then
|
| 72 |
+
cp "${BASE_MODEL}/${f}" "${OUTPUT_DIR}/"
|
| 73 |
+
copied=$((copied + 1))
|
| 74 |
+
fi
|
| 75 |
+
done
|
| 76 |
+
echo "Copied ${copied} files. Model ready at: ${OUTPUT_DIR}"
|
| 77 |
+
|
| 78 |
+
# ---- Step 4: 跑评测 ----
|
| 79 |
+
if [ "${SKIP_EVAL}" = "1" ]; then
|
| 80 |
+
echo "[3/3] SKIP_EVAL=1, skipping evaluation."
|
| 81 |
+
echo "To run eval manually:"
|
| 82 |
+
echo " MODEL_PATH=${OUTPUT_DIR} BATCH_SIZE=${BATCH_SIZE} bash ${SCRIPT_DIR}/run_eval.sh ${NUM_GPUS}"
|
| 83 |
+
exit 0
|
| 84 |
+
fi
|
| 85 |
+
|
| 86 |
+
echo "[3/3] Running evaluation (${NUM_GPUS} GPUs, batch_size=${BATCH_SIZE})..."
|
| 87 |
+
MODEL_PATH="${OUTPUT_DIR}" BATCH_SIZE="${BATCH_SIZE}" bash "${SCRIPT_DIR}/run_eval.sh" "${NUM_GPUS}"
|
ICL/SFT_new/ds_zero2.json
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bf16": {
|
| 3 |
+
"enabled": true
|
| 4 |
+
},
|
| 5 |
+
"zero_optimization": {
|
| 6 |
+
"stage": 2,
|
| 7 |
+
"overlap_comm": true,
|
| 8 |
+
"contiguous_gradients": true,
|
| 9 |
+
"reduce_scatter": true,
|
| 10 |
+
"reduce_bucket_size": 5e8,
|
| 11 |
+
"allgather_bucket_size": 5e8
|
| 12 |
+
},
|
| 13 |
+
"optimizer": {
|
| 14 |
+
"type": "AdamW",
|
| 15 |
+
"params": {
|
| 16 |
+
"lr": 1e-6,
|
| 17 |
+
"betas": [0.9, 0.999],
|
| 18 |
+
"eps": 1e-8,
|
| 19 |
+
"weight_decay": 0.1
|
| 20 |
+
}
|
| 21 |
+
},
|
| 22 |
+
"scheduler": {
|
| 23 |
+
"type": "WarmupDecayLR",
|
| 24 |
+
"params": {
|
| 25 |
+
"warmup_min_lr": 0,
|
| 26 |
+
"warmup_max_lr": 1e-6,
|
| 27 |
+
"warmup_num_steps": 50,
|
| 28 |
+
"total_num_steps": 950
|
| 29 |
+
}
|
| 30 |
+
},
|
| 31 |
+
"gradient_accumulation_steps": 4,
|
| 32 |
+
"gradient_clipping": 1.0,
|
| 33 |
+
"train_batch_size": 64,
|
| 34 |
+
"train_micro_batch_size_per_gpu": 2,
|
| 35 |
+
"wall_clock_breakdown": false,
|
| 36 |
+
"steps_per_print": 50
|
| 37 |
+
}
|
ICL/SFT_new/ds_zero3.json
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bf16": {
|
| 3 |
+
"enabled": true
|
| 4 |
+
},
|
| 5 |
+
"zero_optimization": {
|
| 6 |
+
"stage": 3,
|
| 7 |
+
"overlap_comm": true,
|
| 8 |
+
"contiguous_gradients": true,
|
| 9 |
+
"reduce_bucket_size": 5e8,
|
| 10 |
+
"stage3_prefetch_bucket_size": 5e8,
|
| 11 |
+
"stage3_param_persistence_threshold": 1e6,
|
| 12 |
+
"stage3_gather_16bit_weights_on_model_save": true
|
| 13 |
+
},
|
| 14 |
+
"optimizer": {
|
| 15 |
+
"type": "AdamW",
|
| 16 |
+
"params": {
|
| 17 |
+
"lr": 1e-5,
|
| 18 |
+
"betas": [0.9, 0.999],
|
| 19 |
+
"eps": 1e-8,
|
| 20 |
+
"weight_decay": 0.1
|
| 21 |
+
}
|
| 22 |
+
},
|
| 23 |
+
"gradient_accumulation_steps": 4,
|
| 24 |
+
"gradient_clipping": 1.0,
|
| 25 |
+
"train_micro_batch_size_per_gpu": 2,
|
| 26 |
+
"wall_clock_breakdown": false,
|
| 27 |
+
"steps_per_print": 50
|
| 28 |
+
}
|
ICL/SFT_new/eval.py
ADDED
|
@@ -0,0 +1,961 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
"""
|
| 4 |
+
ICL 多轮推理评测脚本:模拟 RET/ANS 决策循环,验证 SFT 模型效果。
|
| 5 |
+
|
| 6 |
+
流程:
|
| 7 |
+
1. 从 source index 的 val split 加载原始记录(与训练集无重叠)
|
| 8 |
+
2. 给模型 query_image + question(0-shot)
|
| 9 |
+
3. 模型输出 <RET> → 从预计算 top5 取下一张 shot + caption,追加 context,再问
|
| 10 |
+
4. 模型输出 <ANS> → 提取答案,结束
|
| 11 |
+
5. 最多 max_rounds 轮(防止死循环 RET)
|
| 12 |
+
|
| 13 |
+
多卡策略:
|
| 14 |
+
- 每张 GPU 加载一份模型,按 dataset 粒度分配任务
|
| 15 |
+
- 只有 rank 0 打印进度日志(其他 rank 静默)
|
| 16 |
+
- 最后 rank 0 汇总并写出有序 JSON log
|
| 17 |
+
|
| 18 |
+
用法:
|
| 19 |
+
# 单卡 (debug)
|
| 20 |
+
python3 eval.py \\
|
| 21 |
+
--model-path /workspace/xiaobin/ICL/sft_model/merged_hf \\
|
| 22 |
+
--category vqa --dataset vqav2 --split val \\
|
| 23 |
+
--num-samples 20 --device cuda:0
|
| 24 |
+
|
| 25 |
+
# 多卡
|
| 26 |
+
torchrun --nproc_per_node=8 eval.py \\
|
| 27 |
+
--model-path /workspace/xiaobin/ICL/sft_model/merged_hf \\
|
| 28 |
+
--all-categories --split val --num-samples 200
|
| 29 |
+
"""
|
| 30 |
+
|
| 31 |
+
import argparse
|
| 32 |
+
import json
|
| 33 |
+
import math
|
| 34 |
+
import os
|
| 35 |
+
import random
|
| 36 |
+
import re
|
| 37 |
+
import sys
|
| 38 |
+
import time
|
| 39 |
+
from collections import defaultdict
|
| 40 |
+
from pathlib import Path
|
| 41 |
+
from typing import Dict, List, Optional, Tuple
|
| 42 |
+
|
| 43 |
+
import torch
|
| 44 |
+
import torch.distributed as dist
|
| 45 |
+
|
| 46 |
+
# Bypass the transformers safety check on torch.load for torch<2.6 (CVE-2025-32434).
# Patch modeling_utils.load_state_dict before transformers is imported elsewhere.
import transformers.utils.import_utils as _tu
if hasattr(_tu, "check_torch_load_is_safe"):
    _tu.check_torch_load_is_safe = lambda: None
import transformers.modeling_utils as _mu
if hasattr(_mu, "check_torch_load_is_safe"):
    _mu.check_torch_load_is_safe = lambda: None
# Also patch the load_state_dict function that performs the check internally.
_orig_load_state_dict = getattr(_mu, "load_state_dict", None)
if _orig_load_state_dict is not None:
    import functools
    @functools.wraps(_orig_load_state_dict)
    def _patched_load_state_dict(checkpoint_file, **kwargs):
        # Use torch.load directly, skipping the safety check.
        # NOTE(security): weights_only=False deserializes arbitrary pickle data —
        # only load checkpoints from trusted sources.
        return torch.load(checkpoint_file, map_location="cpu", weights_only=False)
    _mu.load_state_dict = _patched_load_state_dict
|
| 63 |
+
|
| 64 |
+
from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
|
| 65 |
+
from qwen_vl_utils import process_vision_info
|
| 66 |
+
from tqdm import tqdm
|
| 67 |
+
|
| 68 |
+
# ---------------------------------------------------------------------------
|
| 69 |
+
# 默认路径
|
| 70 |
+
# ---------------------------------------------------------------------------
|
| 71 |
+
INDEX_ROOT = "/workspace/xiaobin/dataset/index"
|
| 72 |
+
EMBEDDINGS_DIR = "/workspace/xiaobin/dataset/embeddings"
|
| 73 |
+
CAPTION_CACHE_DIR = "/workspace/xiaobin/dataset/caption_cache"
|
| 74 |
+
|
| 75 |
+
# ---------------------------------------------------------------------------
|
| 76 |
+
# 分布式工具
|
| 77 |
+
# ---------------------------------------------------------------------------
|
| 78 |
+
|
| 79 |
+
def setup_distributed():
    """Initialize the distributed environment; return (rank, world_size, device)."""
    env = os.environ
    if "RANK" not in env or "WORLD_SIZE" not in env:
        # Single-process fallback: no process group; caller chooses the device.
        return 0, 1, None
    rank = int(env["RANK"])
    world_size = int(env["WORLD_SIZE"])
    local_rank = int(env.get("LOCAL_RANK", rank))
    dist.init_process_group("nccl")
    torch.cuda.set_device(local_rank)
    return rank, world_size, f"cuda:{local_rank}"
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
def gather_results(local_results: List[Dict], rank: int, world_size: int) -> List[Dict]:
    """Collect every rank's result list onto rank 0; other ranks get []."""
    if world_size == 1:
        return local_results

    dev = f"cuda:{rank}"
    payload = json.dumps(local_results, ensure_ascii=False).encode("utf-8")

    # Exchange byte lengths so each rank can pad to a common buffer size.
    my_len = torch.tensor([len(payload)], dtype=torch.long, device=dev)
    lengths = [torch.zeros(1, dtype=torch.long, device=dev) for _ in range(world_size)]
    dist.all_gather(lengths, my_len)
    pad_to = max(l.item() for l in lengths)

    # NUL-pad the JSON bytes and gather the raw buffers from all ranks.
    buf = torch.ByteTensor(list(payload + b"\x00" * (pad_to - len(payload)))).cuda(rank)
    buffers = [torch.zeros(pad_to, dtype=torch.uint8, device=dev) for _ in range(world_size)]
    dist.all_gather(buffers, buf)

    if rank != 0:
        return []
    merged: List[Dict] = []
    for b, l in zip(buffers, lengths):
        chunk = bytes(b[: l.item()].cpu().tolist())
        merged.extend(json.loads(chunk.decode("utf-8")))
    return merged
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
def log(msg: str, rank: int = 0, force: bool = False):
    """Print *msg* only on rank 0, or on any rank when force=True."""
    if rank != 0 and not force:
        return
    print(msg, flush=True)
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
# ---------------------------------------------------------------------------
|
| 127 |
+
# 数据加载
|
| 128 |
+
# ---------------------------------------------------------------------------
|
| 129 |
+
|
| 130 |
+
def load_records(cat: str, ds: str, split: str, limit: int = 0) -> List[Dict]:
    """Load records for one split from the index root.

    Keeps only rows that carry both "image" and "answer"; when limit > 0,
    stops after that many kept rows.
    """
    path = os.path.join(INDEX_ROOT, cat, ds, f"{split}.jsonl")
    if not os.path.exists(path):
        return []
    kept: List[Dict] = []
    with open(path, "r", encoding="utf-8") as fh:
        for raw in fh:
            raw = raw.strip()
            if not raw:
                continue
            rec = json.loads(raw)
            if not (rec.get("image") and rec.get("answer")):
                continue
            kept.append(rec)
            if limit and len(kept) >= limit:
                break
    return kept
|
| 147 |
+
|
| 148 |
+
|
| 149 |
+
def load_top5(cat: str, ds: str) -> Dict[str, List[str]]:
    """Load the precomputed top-5 neighbor map for one dataset ({} if absent)."""
    p = Path(EMBEDDINGS_DIR) / f"{cat}_{ds}_top5.json"
    if not p.exists():
        return {}
    return json.loads(p.read_text(encoding="utf-8"))
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
def load_caption_cache(cat: str, ds: str) -> Dict[str, str]:
    """Load cached captions for one dataset.

    Accepts either a flat {image: caption} mapping or a wrapper object with
    an "items" key; anything else yields {}.
    """
    p = Path(CAPTION_CACHE_DIR) / f"{cat}_{ds}.json"
    if not p.exists():
        return {}
    data = json.loads(p.read_text(encoding="utf-8"))
    if not isinstance(data, dict):
        return {}
    return data["items"] if "items" in data else data
|
| 166 |
+
|
| 167 |
+
|
| 168 |
+
def load_instructions(cat: str, ds: str) -> List[str]:
    """Load prompt instructions for one dataset, with a generic fallback.

    instructions.json may be a bare list or a dict keyed by one of
    "instructions"/"instruction"/"prompts".
    """
    fallback = ["Look at the image and answer the question."]
    path = os.path.join(INDEX_ROOT, cat, ds, "instructions.json")
    if not os.path.exists(path):
        return fallback
    with open(path, "r", encoding="utf-8") as fh:
        data = json.load(fh)
    if isinstance(data, list):
        return [str(item).strip() for item in data if str(item).strip()]
    if isinstance(data, dict):
        for key in ("instructions", "instruction", "prompts"):
            entries = data.get(key)
            if isinstance(entries, list):
                return [str(item).strip() for item in entries if str(item).strip()]
    return fallback
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
def discover_datasets(categories: List[str]) -> List[Tuple[str, str]]:
    """Enumerate (category, dataset) directory pairs under INDEX_ROOT.

    When *categories* is non-empty, only those categories are scanned.
    """
    found: List[Tuple[str, str]] = []
    for cat in sorted(os.listdir(INDEX_ROOT)):
        if categories and cat not in categories:
            continue
        cat_dir = os.path.join(INDEX_ROOT, cat)
        if not os.path.isdir(cat_dir):
            continue
        found.extend(
            (cat, ds)
            for ds in sorted(os.listdir(cat_dir))
            if os.path.isdir(os.path.join(cat_dir, ds))
        )
    return found
|
| 196 |
+
|
| 197 |
+
|
| 198 |
+
# ---------------------------------------------------------------------------
|
| 199 |
+
# 模型加载
|
| 200 |
+
# ---------------------------------------------------------------------------
|
| 201 |
+
|
| 202 |
+
def load_model(model_path: str, device: str):
    """Load processor + Qwen3-VL model and return (model, processor, ret_id, ans_id).

    ret_id/ans_id are the token ids of "<RET>" and "<ANS>" after the special
    tokens are added to the tokenizer.
    """
    from transformers import AutoConfig

    processor = AutoProcessor.from_pretrained(
        model_path,
        trust_remote_code=True,
        min_pixels=256 * 28 * 28,
        max_pixels=1280 * 28 * 28,
    )

    # Add the special tokens to the tokenizer first, so its vocab_size lines
    # up with the checkpoint.
    special_tokens = ["<RET>", "<ANS>", "</ANS>", "<RETQ>", "</RETQ>"]
    processor.tokenizer.add_tokens(special_tokens, special_tokens=True)
    # Decoder-only models must be left-padded for batched generation.
    processor.tokenizer.padding_side = "left"
    target_vocab_size = len(processor.tokenizer)

    # Critical: set the config's vocab_size to the checkpoint's actual size,
    # otherwise ignore_mismatched_sizes would leave embed_tokens/lm_head
    # randomly initialized!
    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
    print(f"[load_model] text_config.vocab_size={config.text_config.vocab_size}, target={target_vocab_size}")
    config.text_config.vocab_size = target_vocab_size

    model = Qwen3VLForConditionalGeneration.from_pretrained(
        model_path,
        config=config,
        trust_remote_code=True,
        torch_dtype=torch.bfloat16,
        attn_implementation="sdpa",
        device_map=device,
    )

    model.eval()

    ret_id = processor.tokenizer.convert_tokens_to_ids("<RET>")
    ans_id = processor.tokenizer.convert_tokens_to_ids("<ANS>")
    return model, processor, ret_id, ans_id
|
| 239 |
+
|
| 240 |
+
|
| 241 |
+
# ---------------------------------------------------------------------------
|
| 242 |
+
# 推理核心
|
| 243 |
+
# ---------------------------------------------------------------------------
|
| 244 |
+
|
| 245 |
+
def build_messages(
    instruction: str,
    query_image: str,
    question: Optional[str],
    shots: List[Dict],
    min_pixels: int = 256 * 28 * 28,
    max_pixels: int = 1280 * 28 * 28,
) -> List[Dict]:
    """Build the Qwen3-VL chat messages for one query plus retrieved shots.

    Layout: [instruction?] query image [question?] (shot image [caption?])* "Action:".
    """
    def image_entry(path: str) -> Dict:
        # Every image carries the same pixel-budget hints.
        return {
            "type": "image",
            "image": f"file://{path}",
            "min_pixels": min_pixels,
            "max_pixels": max_pixels,
        }

    content: List[Dict] = []
    if instruction:
        content.append({"type": "text", "text": instruction})

    content.append(image_entry(query_image))

    if question:
        content.append({"type": "text", "text": f"Question: {question}"})

    for shot in shots:
        content.append(image_entry(shot["image"]))
        if shot.get("caption"):
            content.append({"type": "text", "text": f"Caption: {shot['caption']}"})

    content.append({"type": "text", "text": "Action:"})
    return [{"role": "user", "content": content}]
|
| 281 |
+
|
| 282 |
+
|
| 283 |
+
@torch.no_grad()
def generate_action(model, processor, messages: List[Dict], max_new_tokens: int = 256) -> str:
    """Greedy single-sample inference (used as the per-sample fallback)."""
    prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # Vision preprocessing is best-effort: text-only messages are still valid.
    try:
        vision_inputs, _ = process_vision_info(messages)
    except Exception:
        vision_inputs = None

    batch = processor(
        text=[prompt],
        images=vision_inputs if vision_inputs else None,
        return_tensors="pt",
        padding=False,
        truncation=False,
    )

    target = next(model.parameters()).device
    batch = {k: (v.to(target) if hasattr(v, "to") else v) for k, v in batch.items()}

    out = model.generate(
        **batch,
        max_new_tokens=max_new_tokens,
        do_sample=False,
        temperature=None,
        top_p=None,
    )

    # Strip the prompt tokens; keep special tokens so <RET>/<ANS> survive.
    prompt_len = batch["input_ids"].shape[1]
    return processor.tokenizer.decode(out[0][prompt_len:], skip_special_tokens=False)
|
| 316 |
+
|
| 317 |
+
|
| 318 |
+
@torch.no_grad()
def generate_action_batch(
    model, processor, messages_list: List[List[Dict]],
    max_new_tokens: int = 256, batch_size: int = 4,
    pbar=None,
) -> List[str]:
    """Batched greedy inference over many message lists, chunked by batch_size.

    Returns one decoded string per input message list, in order. The progress
    bar (if given) only gets its postfix refreshed here; counting completed
    samples is the caller's job.
    """
    all_results = []
    device = next(model.parameters()).device

    for start in range(0, len(messages_list), batch_size):
        batch_msgs = messages_list[start : start + batch_size]

        texts = []
        all_images_nested = []  # nested list: [[sample0 imgs], [sample1 imgs], ...]
        has_any_image = False
        for msgs in batch_msgs:
            texts.append(processor.apply_chat_template(
                msgs, tokenize=False, add_generation_prompt=True
            ))
            # Vision preprocessing is best-effort; a failure yields an empty slot
            # so text and image lists stay aligned per sample.
            try:
                imgs, _ = process_vision_info(msgs)
                if imgs:
                    all_images_nested.append(imgs)
                    has_any_image = True
                else:
                    all_images_nested.append([])
            except Exception:
                all_images_nested.append([])

        inputs = processor(
            text=texts,
            images=all_images_nested if has_any_image else None,
            return_tensors="pt",
            padding=True,
            truncation=False,
        )

        inputs = {k: v.to(device) if hasattr(v, "to") else v for k, v in inputs.items()}

        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            temperature=None,
            top_p=None,
        )

        # Decode each sample (with left padding, every row shares the same
        # padded input length, so one offset works for the whole batch).
        input_len = inputs["input_ids"].shape[1]
        for i in range(len(batch_msgs)):
            generated = outputs[i][input_len:]
            text = processor.tokenizer.decode(generated, skip_special_tokens=False)
            all_results.append(text)

        # Refresh the progress bar postfix after each chunk.
        if pbar is not None:
            pbar.set_postfix_str(f"batch {start // batch_size + 1}/{math.ceil(len(messages_list) / batch_size)}")

    return all_results
|
| 378 |
+
|
| 379 |
+
|
| 380 |
+
def parse_action(text: str) -> Tuple[str, str]:
    """Parse model output into an (action, content) pair.

    action is "ret" (fetch another shot; content is the retrieval description),
    "ans" (final answer; content is the answer text), or "unknown" when the
    output starts with neither tag (content is the stripped raw text).
    """
    STOP = ("<|im_end|>", "</s>", "<|endoftext|>")

    def scrub(s: str) -> str:
        # Remove generation stop tokens anywhere in the string.
        for marker in STOP:
            s = s.replace(marker, "").strip()
        return s

    text = text.strip()

    if text.startswith("<RET>"):
        desc = text[5:].strip()
        if desc.startswith("Description:"):
            desc = desc[len("Description:"):].strip()
        return "ret", scrub(desc)

    if text.startswith("<ANS>"):
        body = text[5:]
        closing = body.find("</ANS>")
        # A closing tag bounds the answer; otherwise scrub stop tokens.
        body = body[:closing] if closing != -1 else scrub(body)
        return "ans", body.strip()

    return "unknown", text
|
| 403 |
+
|
| 404 |
+
|
| 405 |
+
def run_icl_loop(
    model,
    processor,
    record: Dict,
    instruction: str,
    top5: Dict[str, List[str]],
    caption_cache: Dict[str, str],
    max_rounds: int = 4,
) -> Dict:
    """Run the multi-round RET/ANS loop for a single record (single-sample fallback).

    Each round asks the model for an action. <ANS> terminates with an answer;
    <RET> appends the next unused precomputed top-5 neighbor as a shot and
    retries; anything else terminates as "unknown_action". The loop also ends
    when the neighbor list is exhausted ("no_more_shots") or after max_rounds.
    """
    query_image = record["image"]
    question = record.get("question", "")
    gt_answer = record.get("answer", "")

    shots = []
    used_images = {query_image}
    rounds = []
    candidates = top5.get(query_image, [])

    for round_idx in range(max_rounds):
        messages = build_messages(instruction, query_image, question, shots)
        raw_output = generate_action(model, processor, messages)
        action, content = parse_action(raw_output)

        # Keep a truncated trace of every round for the output log.
        rounds.append({
            "round": round_idx,
            "action": action,
            "content": content,
            "raw": raw_output[:300],
        })

        if action == "ans":
            return {
                "image": query_image,
                "question": question,
                "gt_answer": gt_answer,
                "final_answer": content,
                "num_rounds": round_idx + 1,
                "terminated_by": "ans",
                "rounds": rounds,
            }

        if action == "ret":
            # Pick the first top-5 neighbor not used yet.
            next_image = None
            for c in candidates:
                if c not in used_images:
                    next_image = c
                    break

            if next_image is None:
                return {
                    "image": query_image,
                    "question": question,
                    "gt_answer": gt_answer,
                    "final_answer": None,
                    "num_rounds": round_idx + 1,
                    "terminated_by": "no_more_shots",
                    "rounds": rounds,
                }

            # Prefer the cached caption; fall back to the model's own description.
            cap = caption_cache.get(next_image, content)
            shots.append({"image": next_image, "caption": cap})
            used_images.add(next_image)
        else:
            return {
                "image": query_image,
                "question": question,
                "gt_answer": gt_answer,
                "final_answer": content,
                "num_rounds": round_idx + 1,
                "terminated_by": "unknown_action",
                "rounds": rounds,
            }

    return {
        "image": query_image,
        "question": question,
        "gt_answer": gt_answer,
        "final_answer": None,
        "num_rounds": max_rounds,
        "terminated_by": "max_rounds",
        "rounds": rounds,
    }
|
| 488 |
+
|
| 489 |
+
|
| 490 |
+
def run_icl_batch(
    model, processor,
    records: List[Dict],
    instructions: List[str],
    top5: Dict[str, List[str]],
    caption_cache: Dict[str, str],
    max_rounds: int = 4,
    batch_size: int = 4,
    rank: int = 0,
    ds_label: str = "",
) -> List[Dict]:
    """Round-parallel batched ICL inference over a list of records.

    Round 0: batch-infer every sample.
    Round 1: samples that answered <RET> get a shot appended, then batch again.
    ...until every sample is done or max_rounds is reached.

    Returns one result dict per input record, in order.
    """
    # Fixed seed so instruction assignment is reproducible across runs/ranks.
    rng = random.Random(42)

    # Initialize per-sample state.
    states = []
    for rec in records:
        states.append({
            "record": rec,
            "instruction": rng.choice(instructions),
            "query_image": rec["image"],
            "question": rec.get("question", ""),
            "gt_answer": rec.get("answer", ""),
            "shots": [],
            "used_images": {rec["image"]},
            "candidates": top5.get(rec["image"], []),
            "rounds": [],
            "done": False,
            "result": None,
        })

    total = len(states)
    pbar = tqdm(total=total, desc=f" {ds_label}", unit="done",
                disable=(rank != 0))

    for round_idx in range(max_rounds):
        # Collect samples that are still running.
        active = [(i, s) for i, s in enumerate(states) if not s["done"]]
        if not active:
            break

        n_active = len(active)
        pbar.set_postfix(round=round_idx, active=n_active)

        # Build chat messages for every active sample.
        messages_list = []
        active_indices = []
        for i, s in active:
            msgs = build_messages(
                s["instruction"], s["query_image"], s["question"], s["shots"]
            )
            messages_list.append(msgs)
            active_indices.append(i)

        # Batched inference.
        try:
            raw_outputs = generate_action_batch(
                model, processor, messages_list,
                batch_size=batch_size,
                pbar=pbar,
            )
        except Exception as e:
            # If the batched call fails, fall back to per-sample inference.
            log(f" [WARN] Batch failed at round {round_idx}, falling back to single: {e}", rank)
            raw_outputs = []
            for msgs in messages_list:
                try:
                    raw_outputs.append(generate_action(model, processor, msgs))
                except Exception:
                    raw_outputs.append("")

        # Parse outputs and update per-sample state.
        newly_done = 0
        for idx_in_batch, global_idx in enumerate(active_indices):
            s = states[global_idx]
            raw = raw_outputs[idx_in_batch]
            action, content = parse_action(raw)

            s["rounds"].append({
                "round": round_idx,
                "action": action,
                "content": content,
                "raw": raw[:300],
            })

            if action == "ans":
                s["done"] = True
                s["result"] = {
                    "image": s["query_image"],
                    "question": s["question"],
                    "gt_answer": s["gt_answer"],
                    "final_answer": content,
                    "num_rounds": round_idx + 1,
                    "terminated_by": "ans",
                    "rounds": s["rounds"],
                }
                newly_done += 1
            elif action == "ret":
                # Pick the first unused precomputed neighbor as the next shot.
                next_image = None
                for c in s["candidates"]:
                    if c not in s["used_images"]:
                        next_image = c
                        break

                if next_image is None:
                    s["done"] = True
                    s["result"] = {
                        "image": s["query_image"],
                        "question": s["question"],
                        "gt_answer": s["gt_answer"],
                        "final_answer": None,
                        "num_rounds": round_idx + 1,
                        "terminated_by": "no_more_shots",
                        "rounds": s["rounds"],
                    }
                    newly_done += 1
                else:
                    # Cached caption preferred; model's own description as fallback.
                    cap = caption_cache.get(next_image, content)
                    s["shots"].append({"image": next_image, "caption": cap})
                    s["used_images"].add(next_image)
            else:
                s["done"] = True
                s["result"] = {
                    "image": s["query_image"],
                    "question": s["question"],
                    "gt_answer": s["gt_answer"],
                    "final_answer": content,
                    "num_rounds": round_idx + 1,
                    "terminated_by": "unknown_action",
                    "rounds": s["rounds"],
                }
                newly_done += 1

        pbar.update(newly_done)

        n_active = sum(1 for s in states if not s["done"])
        if rank == 0:
            pbar.set_postfix(round=round_idx, active=n_active)

    # Samples still unfinished hit the round cap.
    for s in states:
        if not s["done"]:
            s["result"] = {
                "image": s["query_image"],
                "question": s["question"],
                "gt_answer": s["gt_answer"],
                "final_answer": None,
                "num_rounds": max_rounds,
                "terminated_by": "max_rounds",
                "rounds": s["rounds"],
            }
            pbar.update(1)

    pbar.close()
    return [s["result"] for s in states]
|
| 650 |
+
|
| 651 |
+
|
| 652 |
+
# ---------------------------------------------------------------------------
|
| 653 |
+
# 答案质量指标
|
| 654 |
+
# ---------------------------------------------------------------------------
|
| 655 |
+
|
| 656 |
+
def normalize_answer(s: str) -> str:
    """Normalize an answer for comparison: lowercase, strip punctuation, collapse spaces."""
    lowered = s.lower().strip()
    # Keep only word characters and whitespace.
    depunct = re.sub(r"[^\w\s]", "", lowered)
    # Collapse any run of whitespace into a single space.
    return " ".join(depunct.split())
|
| 664 |
+
|
| 665 |
+
|
| 666 |
+
def compute_metrics(results: List[Dict]) -> Dict:
    """Compute answer-quality metrics over per-sample result dicts.

    Returns exact-match / contains / answer-rate percentages plus the
    shot-count distribution and average shot count.
    """
    answered = [r for r in results if r.get("final_answer") is not None]
    if not answered:
        # No sample produced an answer: only the three core metrics, all zero.
        return {"exact_match": 0.0, "contains_gt": 0.0, "answer_rate": 0.0}

    em_hits = 0
    contains_hits = 0
    for rec in answered:
        pred = normalize_answer(rec["final_answer"])
        gold = normalize_answer(rec["gt_answer"])
        if pred == gold:
            em_hits += 1
        # Substring containment in either direction counts as a soft match.
        if gold in pred or pred in gold:
            contains_hits += 1

    n_total = len(results)
    n_answered = len(answered)

    return {
        "exact_match": em_hits / n_answered * 100 if n_answered else 0.0,
        "contains_gt": contains_hits / n_answered * 100 if n_answered else 0.0,
        "answer_rate": n_answered / n_total * 100 if n_total else 0.0,
        "shot_distribution": compute_shot_distribution(results),
        "avg_shots": compute_avg_shots(results),
    }
|
| 694 |
+
|
| 695 |
+
|
| 696 |
+
def compute_shot_distribution(results: List[Dict]) -> Dict[str, int]:
    """Histogram of shot counts per sample, keyed like "2-shot"."""
    counts = defaultdict(int)
    for rec in results:
        # A sample that terminated with <ANS> spent its last round answering,
        # so it retrieved one shot fewer than its round count.
        if rec.get("terminated_by") == "ans":
            shots = rec["num_rounds"] - 1
        else:
            shots = rec["num_rounds"]
        counts[f"{shots}-shot"] += 1
    return dict(sorted(counts.items()))
|
| 706 |
+
|
| 707 |
+
|
| 708 |
+
def compute_avg_shots(results: List[Dict]) -> float:
    """Mean number of retrieved shots per sample; 0.0 for an empty list."""
    if not results:
        return 0.0
    # Samples that ended with <ANS> used their final round to answer,
    # not to retrieve, hence the -1.
    total = sum(
        r["num_rounds"] - 1 if r.get("terminated_by") == "ans" else r["num_rounds"]
        for r in results
    )
    return total / len(results)
|
| 718 |
+
|
| 719 |
+
|
| 720 |
+
# ---------------------------------------------------------------------------
|
| 721 |
+
# 统计输出
|
| 722 |
+
# ---------------------------------------------------------------------------
|
| 723 |
+
|
| 724 |
+
def print_stats(results: List[Dict], cat: str = "", ds: str = ""):
    """Print a human-readable statistics report for a group of results.

    Args:
        results: per-sample result dicts produced by the evaluation loop
            (expects keys ``terminated_by``, ``num_rounds``, ``rounds``,
            ``final_answer``).
        cat: optional category label for the report header.
        ds: optional dataset label for the report header.
    """
    prefix = f"[{cat}/{ds}]" if ds else f"[{cat}]" if cat else "[ALL]"
    n = len(results)
    if n == 0:
        print(f"{prefix} 无结果")
        return

    # Termination reasons
    term_counts = defaultdict(int)
    for r in results:
        term_counts[r["terminated_by"]] += 1

    # Per-round action distribution (how many RET vs ANS at each round index)
    round_actions = defaultdict(lambda: defaultdict(int))
    for r in results:
        for rd in r["rounds"]:
            round_actions[rd["round"]][rd["action"]] += 1

    avg_rounds = sum(r["num_rounds"] for r in results) / n

    # Answer quality
    metrics = compute_metrics(results)

    print(f"\n{'=' * 64}")
    print(f"{prefix} 共 {n} 条样本")
    print(f" 平均轮次: {avg_rounds:.2f}")
    print(f" 终止原因:")
    for k, v in sorted(term_counts.items()):
        print(f" {k}: {v} ({v / n * 100:.1f}%)")

    print(f" 每轮 RET/ANS 分布:")
    for rd_idx in sorted(round_actions.keys()):
        actions = round_actions[rd_idx]
        total = sum(actions.values())
        parts = [f"{a}={c}({c / total * 100:.0f}%)" for a, c in sorted(actions.items())]
        print(f" Round {rd_idx}: {' | '.join(parts)} (共 {total} 条)")

    # Shot-count statistics (num_rounds - 1 = shots retrieved before answering)
    shot_counts = defaultdict(int)
    for r in results:
        if r["terminated_by"] == "ans":
            n_shots = r["num_rounds"] - 1  # RET count == shots held when answering
        else:
            n_shots = r["num_rounds"]  # never answered: every round was a RET
        shot_counts[n_shots] += 1

    print(f" Shot 数量分布 (回答时已有的 shot 数):")
    for k in sorted(shot_counts.keys()):
        v = shot_counts[k]
        # ASCII-art bar scaled so 100% of samples == 40 characters.
        bar = "█" * int(v / n * 40)
        print(f" {k}-shot: {v:4d} ({v / n * 100:5.1f}%) {bar}")
    avg_shots = sum(k * v for k, v in shot_counts.items()) / n
    print(f" 平均 shot 数: {avg_shots:.2f}")

    answered = [r for r in results if r["final_answer"] is not None]
    print(f" 产出答案: {len(answered)}/{n} ({metrics['answer_rate']:.1f}%)")
    if answered:
        print(f" 答案质量 (仅 ans 样本):")
        print(f" Exact Match: {metrics['exact_match']:.1f}%")
        print(f" Contains GT: {metrics['contains_gt']:.1f}%")
    print(f"{'=' * 64}")
|
| 785 |
+
|
| 786 |
+
|
| 787 |
+
# ---------------------------------------------------------------------------
|
| 788 |
+
# Main
|
| 789 |
+
# ---------------------------------------------------------------------------
|
| 790 |
+
|
| 791 |
+
def main():
    """Entry point: parse args, shard datasets across ranks, run multi-round
    ICL evaluation, then (on rank 0) print stats and save JSON results."""
    parser = argparse.ArgumentParser(description="ICL 多轮推理评测(支持多卡,log 对齐)")
    parser.add_argument("--model-path", required=True, help="合并后的 HF 模型路径")
    parser.add_argument("--category", type=str, default="")
    parser.add_argument("--dataset", type=str, default="")
    parser.add_argument("--split", type=str, default="val",
                        help="使用的数据 split(默认 val,与训练集 train 隔离)")
    parser.add_argument("--all-categories", action="store_true")
    parser.add_argument("--num-samples", type=int, default=100,
                        help="每个 dataset 采样数")
    parser.add_argument("--max-rounds", type=int, default=4)
    parser.add_argument("--batch-size", type=int, default=4,
                        help="每轮 batch 推理的样本数")
    parser.add_argument("--device", type=str, default="cuda:0",
                        help="单卡时用的设备")
    parser.add_argument("--output-dir", type=str,
                        default="/workspace/xiaobin/ICL/SFT_new/eval_results",
                        help="评测结果保存目录")
    parser.add_argument("--seed", type=int, default=42)
    args = parser.parse_args()

    random.seed(args.seed)

    # ---- Distributed initialization ----
    rank, world_size, dist_device = setup_distributed()
    # Fall back to the CLI device when not launched under torchrun.
    device = dist_device or args.device
    is_main = rank == 0

    log(f"World size: {world_size}", rank)
    log(f"Model: {args.model_path}", rank)
    log(f"Split: {args.split} (与训练集 train 隔离)", rank)

    # ---- Load model ----
    model, processor, ret_id, ans_id = load_model(args.model_path, device)
    log(f"Model loaded. <RET>={ret_id}, <ANS>={ans_id}", rank)

    # ---- Determine the dataset list ----
    if args.all_categories:
        categories = ["vqa", "captioning", "classification", "reasoning"]
    elif args.category:
        categories = [args.category]
    else:
        categories = ["vqa"]

    if args.dataset:
        ds_list = [(args.category or "vqa", args.dataset)]
    else:
        ds_list = discover_datasets(categories)

    # ---- Shard datasets across ranks (round-robin by index) ----
    my_ds_list = ds_list[rank::world_size]
    log(f"共 {len(ds_list)} 个 dataset,rank {rank} 分到 {len(my_ds_list)} 个", rank)

    local_results = []
    t_start = time.time()

    for ds_idx, (cat, ds) in enumerate(my_ds_list):
        log(f"[{ds_idx + 1}/{len(my_ds_list)}] Evaluating {cat}/{ds} ({args.split})", rank)

        # Over-fetch (x5) so there is slack for the top5-coverage filter below.
        records = load_records(cat, ds, args.split, limit=args.num_samples * 5)
        if not records:
            log(f" 跳过 {cat}/{ds}:无记录", rank)
            continue

        top5 = load_top5(cat, ds)
        if not top5:
            log(f" 跳过 {cat}/{ds}:无 top5 embedding", rank)
            continue

        caption_cache = load_caption_cache(cat, ds)
        instructions = load_instructions(cat, ds)

        # Filter: only keep records whose query image has top5 coverage.
        records = [r for r in records if r["image"] in top5]
        if not records:
            log(f" 跳过 {cat}/{ds}:val 图片无 top5 覆盖", rank)
            continue

        if len(records) > args.num_samples:
            records = random.sample(records, args.num_samples)
        log(f" {cat}/{ds}: {len(records)} 条, batch_size={args.batch_size}", rank)

        ds_results = run_icl_batch(
            model, processor, records, instructions, top5, caption_cache,
            max_rounds=args.max_rounds,
            batch_size=args.batch_size,
            rank=rank,
            ds_label=f"{cat}/{ds}",
        )
        # Tag each result with its origin so rank-0 can group after gathering.
        for r in ds_results:
            r["category"] = cat
            r["dataset"] = ds
        local_results.extend(ds_results)

    elapsed = time.time() - t_start
    log(f"\nrank {rank} 完成,{len(local_results)} 条,耗时 {elapsed:.1f}s", rank)

    # ---- Gather results from all ranks ----
    all_results = gather_results(local_results, rank, world_size)

    if is_main:
        # Deterministic ordering: category → dataset → image.
        all_results.sort(key=lambda r: (r.get("category", ""), r.get("dataset", ""), r.get("image", "")))

        # ---- Print stats per category / dataset ----
        cat_results = defaultdict(list)
        for r in all_results:
            cat_results[r["category"]].append(r)

        for cat in categories:
            if not cat_results.get(cat):
                continue
            ds_groups = defaultdict(list)
            for r in cat_results[cat]:
                ds_groups[r["dataset"]].append(r)
            for d in sorted(ds_groups):
                print_stats(ds_groups[d], cat, d)
            # Per-category roll-up (only worth printing with >1 dataset).
            if len(ds_groups) > 1:
                print_stats(cat_results[cat], cat)

        # Overall roll-up.
        if len(categories) > 1 or not args.dataset:
            print_stats(all_results)

        # ---- Save JSON log ----
        os.makedirs(args.output_dir, exist_ok=True)
        timestamp = time.strftime("%Y%m%d_%H%M%S")
        output_path = os.path.join(args.output_dir, f"eval_{args.split}_{timestamp}.json")

        # Build summary
        summary = {
            "model_path": args.model_path,
            "split": args.split,
            "num_samples_per_ds": args.num_samples,
            "max_rounds": args.max_rounds,
            "total_samples": len(all_results),
            "world_size": world_size,
            "elapsed_seconds": elapsed,
            "metrics": {},
        }

        # Overall metrics
        summary["metrics"]["overall"] = compute_metrics(all_results)

        # Per-category metrics
        for cat in categories:
            if cat_results.get(cat):
                summary["metrics"][cat] = compute_metrics(cat_results[cat])

        output_data = {
            "summary": summary,
            "results": all_results,
        }

        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(output_data, f, ensure_ascii=False, indent=2)
        print(f"\n详细结果已保存到: {output_path}")

        # Also write an un-timestamped "latest" copy for easy scripting.
        latest_path = os.path.join(args.output_dir, f"eval_{args.split}_latest.json")
        with open(latest_path, "w", encoding="utf-8") as f:
            json.dump(output_data, f, ensure_ascii=False, indent=2)
        print(f"Latest 链接: {latest_path}")

    if world_size > 1:
        dist.destroy_process_group()


if __name__ == "__main__":
    main()
|
ICL/SFT_new/launch_wrapper.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
"""Wrapper for northjob: receives torchrun args, launches run_multi_node.sh."""
import os
import subprocess
import sys

if __name__ == "__main__":
    # Resolve the bash launcher relative to this file so the wrapper works
    # regardless of the current working directory.
    here = os.path.dirname(os.path.abspath(__file__))
    launcher = os.path.join(here, "run_multi_node.sh")
    # Forward every CLI argument untouched and propagate the exit status.
    proc = subprocess.run(["bash", launcher, *sys.argv[1:]], env=os.environ.copy())
    sys.exit(proc.returncode)
|
ICL/SFT_new/rebuild_and_train.sh
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
# =============================================================================
# One-shot: rebuild the SFT dataset, then submit a 16-GPU training job.
#
# 1. Rebuild the data with the new mix (answer_at_weights=5,3,2,1,
#    trajectory-style generation: one action per unique input).
#    (Fixed: the header previously said 1,3,3,2, contradicting the actual
#    --answer-at-weights "5,3,2,1" flag below.)
# 2. Submit 16-GPU training via northjob.
#
# Usage:
#   bash rebuild_and_train.sh
# =============================================================================
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
ICL_DIR="$(dirname "${SCRIPT_DIR}")"
PYTHON_BIN="/workspace/miniconda3/envs/sft/bin/python3"

BUILD_SCRIPT="${ICL_DIR}/build_sft.py"
SFT_OUTPUT="/workspace/xiaobin/dataset/sft"
SFT_DATA="${SFT_OUTPUT}/all/sft.jsonl"

echo "============================================"
echo "Step 1: 重建 SFT 数据集"
echo " 权重: 5,3,2,1 (多给 0-shot ANS,轨迹式无矛盾)"
echo " 轨迹式生成:同一输入只出现一种 action"
echo "============================================"

# Back up the previous dataset before overwriting it.
if [ -f "${SFT_DATA}" ]; then
    BACKUP="${SFT_DATA}.bak.$(date +%Y%m%d_%H%M%S)"
    cp "${SFT_DATA}" "${BACKUP}"
    echo "旧数据已备份: ${BACKUP}"
fi

# Rebuild the data (4 categories, ~60k SFT samples total).
${PYTHON_BIN} "${BUILD_SCRIPT}" \
    --answer-at-weights "5,3,2,1" \
    --samples-per-cat 7800 \
    --shuffle

echo ""

# Sanity-check the new RET/ANS mix.
echo "============================================"
echo "Step 2: 验证新数据配比"
echo "============================================"
${PYTHON_BIN} -c "
import json
ret, ans = 0, 0
shot_ret, shot_ans = {}, {}
with open('${SFT_DATA}') as f:
    for line in f:
        r = json.loads(line)
        n = len(r.get('shots', []))
        if r['type'] == 'ret':
            ret += 1
            shot_ret[n] = shot_ret.get(n, 0) + 1
        else:
            ans += 1
            shot_ans[n] = shot_ans.get(n, 0) + 1
total = ret + ans
denom = max(total, 1)  # guard: empty dataset must not crash the check
print(f'总样本: {total}')
print(f'RET: {ret} ({ret/denom*100:.1f}%)')
print(f'ANS: {ans} ({ans/denom*100:.1f}%)')
print(f'RET/ANS 比: {ret/max(ans,1):.2f}:1')
print()
print('RET shot 分布:')
for k in sorted(shot_ret): print(f' {k}-shot: {shot_ret[k]}')
print('ANS shot 分布:')
for k in sorted(shot_ans): print(f' {k}-shot: {shot_ans[k]}')
r0 = shot_ret.get(0, 0); a0 = shot_ans.get(0, 0)
z0 = max(r0 + a0, 1)  # guard: dataset may contain no 0-shot samples at all
print(f'\n0-shot: RET={r0}({r0/z0*100:.1f}%) ANS={a0}({a0/z0*100:.1f}%)')
"

echo ""
echo "============================================"
echo "Step 3: 提交 16 卡训练任务"
echo "============================================"

bash "${SCRIPT_DIR}/submit_northjob.sh" 16

echo ""
echo "============================================"
echo "全部完成!"
echo " 数据: ${SFT_DATA}"
echo " 任务: 16 GPU via northjob"
echo "============================================"
|
ICL/SFT_new/run_eval.sh
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
# =============================================================================
# ICL evaluation launcher.
#
# Default: all four task categories (vqa, captioning, classification,
# reasoning), 500 samples each.
#
# Usage:
#   bash run_eval.sh                 # single GPU, 4 categories x 500 samples
#   bash run_eval.sh 8               # 8 GPUs, 4 categories x 500 samples
#   bash run_eval.sh 1 vqa vqav2 20  # single GPU, one dataset, 20 samples
#
# Environment variables:
#   MODEL_PATH=... bash run_eval.sh  # override the model path
#   BATCH_SIZE=8 bash run_eval.sh    # increase the per-round batch
# =============================================================================
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"

# ---- Default parameters (positional args override) ----
NUM_GPUS="${1:-1}"
CATEGORY="${2:-}"
DATASET="${3:-}"
NUM_SAMPLES="${4:-500}"
BATCH_SIZE="${BATCH_SIZE:-4}"
SPLIT="val"
MODEL_PATH="${MODEL_PATH:-/workspace/xiaobin/ICL/sft_model/epoch3_step1406_fp32}"
OUTPUT_DIR="${SCRIPT_DIR}/eval_results"

echo "============================================"
echo "ICL Evaluation"
echo " GPUs: ${NUM_GPUS}"
echo " Model: ${MODEL_PATH}"
echo " Split: ${SPLIT}"
echo " Batch size: ${BATCH_SIZE}"
echo " Samples/ds: ${NUM_SAMPLES}"
echo " Category: ${CATEGORY:-all (vqa,captioning,classification,reasoning)}"
echo " Dataset: ${DATASET:-all}"
echo " Output: ${OUTPUT_DIR}"
echo "============================================"

# ---- Build the category/dataset argument set ----
EXTRA_ARGS=""
if [ -n "${CATEGORY}" ] && [ -n "${DATASET}" ]; then
    EXTRA_ARGS="--category ${CATEGORY} --dataset ${DATASET}"
elif [ -n "${CATEGORY}" ]; then
    EXTRA_ARGS="--category ${CATEGORY}"
else
    EXTRA_ARGS="--all-categories"
fi

# Single-GPU runs go through plain python (with an explicit device);
# multi-GPU runs launch one process per GPU via torchrun.
if [ "${NUM_GPUS}" -eq 1 ]; then
    python3 "${SCRIPT_DIR}/eval.py" \
        --model-path "${MODEL_PATH}" \
        --split "${SPLIT}" \
        --num-samples "${NUM_SAMPLES}" \
        --batch-size "${BATCH_SIZE}" \
        --max-rounds 4 \
        --output-dir "${OUTPUT_DIR}" \
        --device cuda:0 \
        ${EXTRA_ARGS}
else
    torchrun \
        --nproc_per_node="${NUM_GPUS}" \
        --master_port=29501 \
        "${SCRIPT_DIR}/eval.py" \
        --model-path "${MODEL_PATH}" \
        --split "${SPLIT}" \
        --num-samples "${NUM_SAMPLES}" \
        --batch-size "${BATCH_SIZE}" \
        --max-rounds 4 \
        --output-dir "${OUTPUT_DIR}" \
        ${EXTRA_ARGS}
fi
|
ICL/SFT_new/run_single_node.sh
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
# =============================================================================
# Single-node training (1 machine, 8x H100)
# For debugging and quick iteration
#
# Usage:
#   bash run_single_node.sh <data.jsonl> [num_gpus]
#   bash run_single_node.sh /path/to/sft.jsonl 8
#   bash run_single_node.sh /path/to/sft.jsonl 2   # quick debug
# =============================================================================
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"

# ---- Config ----
MODEL_PATH="/workspace/models/Qwen3-VL-8B-Instruct"
DATA_PATH="${1:?Usage: $0 <data.jsonl> [num_gpus]}"
NUM_GPUS="${2:-8}"
OUTPUT_DIR="/workspace/xiaobin/ICL/sft_model"

# ---- Env ----
# Reduce CUDA allocator fragmentation for long-sequence training.
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
# Keep NCCL peer-to-peer and InfiniBand transports enabled (0 = not disabled).
export NCCL_P2P_DISABLE=0
export NCCL_IB_DISABLE=0

# ---- Launch ----
echo "============================================"
echo "Single-node SFT: ${NUM_GPUS} GPUs"
echo "Model: ${MODEL_PATH}"
echo "Data: ${DATA_PATH}"
echo "Output: ${OUTPUT_DIR}"
echo "============================================"

# Effective global batch = NUM_GPUS * batch-size * grad-accum steps.
torchrun \
    --nproc_per_node=${NUM_GPUS} \
    --master_port=29500 \
    ${SCRIPT_DIR}/train.py \
    --model-path ${MODEL_PATH} \
    --data-path ${DATA_PATH} \
    --output-dir ${OUTPUT_DIR} \
    --deepspeed ${SCRIPT_DIR}/ds_zero2.json \
    --num-epochs 3 \
    --batch-size 2 \
    --gradient-accumulation-steps 4 \
    --learning-rate 1e-6 \
    --max-length 32768 \
    --gradient-checkpointing \
    --log-interval 10 \
    --save-interval 500
|
ICL/SFT_new/submit_northjob.sh
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
# =============================================================================
# Submit multi-node job via northjob (16 GPUs = 2 nodes × 8 H100)
#
# Usage:
#   bash submit_northjob.sh [num_gpus]
#   bash submit_northjob.sh 16   # 2 nodes
#   bash submit_northjob.sh 8    # 1 node (debug)
# =============================================================================
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
GPU_NUMS="${1:-16}"
GPU_PER_NODE=8
# Integer division: GPU_NUMS must be a multiple of GPU_PER_NODE.
NNODES=$((GPU_NUMS / GPU_PER_NODE))

JOB_NAME="qwen3vl-sft-${GPU_NUMS}gpu"
WORK_DIR="${SCRIPT_DIR}"
TRAIN_SCRIPT="${SCRIPT_DIR}/launch_wrapper.py"

echo "Submitting: ${JOB_NAME} (${NNODES} nodes × ${GPU_PER_NODE} GPUs)"

# NOTE(review): only GPU_PER_NODE is forwarded to the wrapper script —
# confirm run_multi_node.sh derives node count / rendezvous info from the
# northjob-provided environment.
/workspace/miniconda3/envs/sft/bin/northjob \
    create \
    --job-type train \
    --nproc-per-node ${GPU_PER_NODE} \
    --gpu-per-node ${GPU_PER_NODE} \
    --nnodes ${NNODES} \
    --k8s-priority 3 \
    --k8s-queue bg-agentic-coding \
    --k8s-namespace bg-agentic-coding \
    --k8s-pvc-name i-xinsiyang-y4zy0sik0a \
    --k8s-pvc-mount-path /workspace \
    --k8s-no-reclaim \
    --k8s-images harbor.local.clusters/bp/megatron-bplm:25.03_fp8.ibgda.qwen3.next.fix_triton.fix_te.hf457.qwen3_vl \
    --job-name ${JOB_NAME} \
    --workspace ${WORK_DIR} \
    ${TRAIN_SCRIPT} ${GPU_PER_NODE}
|
ICL/SFT_new/train.py
ADDED
|
@@ -0,0 +1,659 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
"""
|
| 4 |
+
Qwen3-VL-8B SFT Training Script (single-step RET/ANS decision).
|
| 5 |
+
|
| 6 |
+
Supports:
|
| 7 |
+
- Full fine-tuning or LoRA
|
| 8 |
+
- DeepSpeed ZeRO-2/3
|
| 9 |
+
- Multi-image conversations
|
| 10 |
+
- Loss masking on user turns only
|
| 11 |
+
- Flash Attention 2 on H100
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
import argparse
|
| 15 |
+
import json
|
| 16 |
+
import logging
|
| 17 |
+
import math
|
| 18 |
+
import os
|
| 19 |
+
import sys
|
| 20 |
+
from pathlib import Path
|
| 21 |
+
from typing import Dict, List, Optional, Sequence
|
| 22 |
+
|
| 23 |
+
import torch
|
| 24 |
+
import torch.distributed as dist
|
| 25 |
+
from torch.utils.data import Dataset, DataLoader
|
| 26 |
+
|
| 27 |
+
from transformers import (
|
| 28 |
+
AutoProcessor,
|
| 29 |
+
Qwen3VLForConditionalGeneration,
|
| 30 |
+
get_cosine_schedule_with_warmup,
|
| 31 |
+
)
|
| 32 |
+
from peft import LoraConfig, get_peft_model, TaskType
|
| 33 |
+
from qwen_vl_utils import process_vision_info
|
| 34 |
+
|
| 35 |
+
try:
|
| 36 |
+
import deepspeed
|
| 37 |
+
HAS_DEEPSPEED = True
|
| 38 |
+
except ImportError:
|
| 39 |
+
HAS_DEEPSPEED = False
|
| 40 |
+
|
| 41 |
+
logging.basicConfig(
|
| 42 |
+
format="%(asctime)s [%(levelname)s] %(message)s",
|
| 43 |
+
level=logging.INFO,
|
| 44 |
+
)
|
| 45 |
+
logger = logging.getLogger(__name__)
|
| 46 |
+
|
| 47 |
+
# Special token IDs (Qwen3-VL)
|
| 48 |
+
IM_START_ID = 151644
|
| 49 |
+
IM_END_ID = 151645
|
| 50 |
+
IGNORE_INDEX = -100
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
# ============================================================================
|
| 54 |
+
# Dataset
|
| 55 |
+
# ============================================================================
|
| 56 |
+
|
| 57 |
+
class SFTDataset(Dataset):
|
| 58 |
+
"""Load single-step SFT JSONL (轻量引用格式).
|
| 59 |
+
|
| 60 |
+
支持两种格式:
|
| 61 |
+
|
| 62 |
+
格式 A (新,轻量引用):
|
| 63 |
+
{
|
| 64 |
+
"type": "ret" | "ans",
|
| 65 |
+
"query_image": "/path/to/query.jpg",
|
| 66 |
+
"question": "What color?",
|
| 67 |
+
"answer": "black",
|
| 68 |
+
"instruction": "Answer the question...",
|
| 69 |
+
"shots": [{"image": "/path/shot.jpg", "caption": "A cat..."}],
|
| 70 |
+
"next_description": "A dog..." // 仅 ret 类型
|
| 71 |
+
}
|
| 72 |
+
|
| 73 |
+
格式 B (旧,conversations):
|
| 74 |
+
{
|
| 75 |
+
"images": ["path1.jpg", ...],
|
| 76 |
+
"conversations": [
|
| 77 |
+
{"from": "human", "value": "...<image>..."},
|
| 78 |
+
{"from": "gpt", "value": "<ANS>answer</ANS>"}
|
| 79 |
+
]
|
| 80 |
+
}
|
| 81 |
+
"""
|
| 82 |
+
|
| 83 |
+
def __init__(self, data_path: str, processor, max_length: int = 4096,
|
| 84 |
+
min_pixels: int = 256 * 28 * 28,
|
| 85 |
+
max_pixels: int = 1280 * 28 * 28):
|
| 86 |
+
self.processor = processor
|
| 87 |
+
self.max_length = max_length
|
| 88 |
+
self.min_pixels = min_pixels
|
| 89 |
+
self.max_pixels = max_pixels
|
| 90 |
+
self.records = []
|
| 91 |
+
|
| 92 |
+
logger.info(f"Loading data from {data_path}")
|
| 93 |
+
with open(data_path, "r", encoding="utf-8") as f:
|
| 94 |
+
for line in f:
|
| 95 |
+
line = line.strip()
|
| 96 |
+
if not line:
|
| 97 |
+
continue
|
| 98 |
+
try:
|
| 99 |
+
self.records.append(json.loads(line))
|
| 100 |
+
except Exception:
|
| 101 |
+
continue
|
| 102 |
+
logger.info(f"Loaded {len(self.records)} samples")
|
| 103 |
+
|
| 104 |
+
    def __len__(self):
        # Dataset size is the number of successfully parsed JSONL records.
        return len(self.records)
|
| 106 |
+
|
| 107 |
+
# ---- 新格式: 从引用字段动态构建 messages ----
|
| 108 |
+
|
| 109 |
+
def _build_messages_v2(self, record: Dict) -> List[Dict]:
|
| 110 |
+
"""从轻量引用格式构建 Qwen3-VL chat messages."""
|
| 111 |
+
user_content = []
|
| 112 |
+
|
| 113 |
+
# 1. instruction
|
| 114 |
+
inst = record.get("instruction", "")
|
| 115 |
+
if inst:
|
| 116 |
+
user_content.append({"type": "text", "text": inst})
|
| 117 |
+
|
| 118 |
+
# 2. query image
|
| 119 |
+
user_content.append({
|
| 120 |
+
"type": "image",
|
| 121 |
+
"image": f"file://{record['query_image']}",
|
| 122 |
+
"min_pixels": self.min_pixels,
|
| 123 |
+
"max_pixels": self.max_pixels,
|
| 124 |
+
})
|
| 125 |
+
|
| 126 |
+
# 3. question (可能为空,如 captioning 类)
|
| 127 |
+
question = record.get("question", "")
|
| 128 |
+
if question:
|
| 129 |
+
user_content.append({"type": "text", "text": f"Question: {question}"})
|
| 130 |
+
|
| 131 |
+
# 4. context shots (image + caption)
|
| 132 |
+
for shot in record.get("shots", []):
|
| 133 |
+
user_content.append({
|
| 134 |
+
"type": "image",
|
| 135 |
+
"image": f"file://{shot['image']}",
|
| 136 |
+
"min_pixels": self.min_pixels,
|
| 137 |
+
"max_pixels": self.max_pixels,
|
| 138 |
+
})
|
| 139 |
+
cap = shot.get("caption", "")
|
| 140 |
+
if cap:
|
| 141 |
+
user_content.append({"type": "text", "text": f"Caption: {cap}"})
|
| 142 |
+
|
| 143 |
+
# 5. Action prompt
|
| 144 |
+
user_content.append({"type": "text", "text": "Action:"})
|
| 145 |
+
|
| 146 |
+
# 6. assistant response
|
| 147 |
+
if record["type"] == "ret":
|
| 148 |
+
desc = record.get("next_description", "")
|
| 149 |
+
assistant_text = f"<RET>\nDescription: {desc}"
|
| 150 |
+
else:
|
| 151 |
+
assistant_text = f"<ANS>{record['answer']}</ANS>"
|
| 152 |
+
|
| 153 |
+
messages = [
|
| 154 |
+
{"role": "user", "content": user_content},
|
| 155 |
+
{"role": "assistant", "content": [{"type": "text", "text": assistant_text}]},
|
| 156 |
+
]
|
| 157 |
+
return messages
|
| 158 |
+
|
| 159 |
+
# ---- 旧格式: conversations + <image> 占位符 ----
|
| 160 |
+
|
| 161 |
+
def _build_messages_v1(self, record: Dict) -> List[Dict]:
|
| 162 |
+
"""Convert conversations format → Qwen3-VL chat messages."""
|
| 163 |
+
convs = record["conversations"]
|
| 164 |
+
image_paths = record.get("images", [])
|
| 165 |
+
messages = []
|
| 166 |
+
|
| 167 |
+
for turn in convs:
|
| 168 |
+
role = "user" if turn["from"] == "human" else "assistant"
|
| 169 |
+
text = turn["value"]
|
| 170 |
+
|
| 171 |
+
if role == "user":
|
| 172 |
+
content = []
|
| 173 |
+
parts = text.split("<image>")
|
| 174 |
+
img_idx = 0
|
| 175 |
+
for i, part in enumerate(parts):
|
| 176 |
+
if i > 0 and img_idx < len(image_paths):
|
| 177 |
+
content.append({
|
| 178 |
+
"type": "image",
|
| 179 |
+
"image": f"file://{image_paths[img_idx]}",
|
| 180 |
+
"min_pixels": self.min_pixels,
|
| 181 |
+
"max_pixels": self.max_pixels,
|
| 182 |
+
})
|
| 183 |
+
img_idx += 1
|
| 184 |
+
if part.strip():
|
| 185 |
+
content.append({"type": "text", "text": part.strip()})
|
| 186 |
+
messages.append({"role": role, "content": content})
|
| 187 |
+
else:
|
| 188 |
+
messages.append({
|
| 189 |
+
"role": role,
|
| 190 |
+
"content": [{"type": "text", "text": text}],
|
| 191 |
+
})
|
| 192 |
+
|
| 193 |
+
return messages
|
| 194 |
+
|
| 195 |
+
def __getitem__(self, idx):
|
| 196 |
+
record = self.records[idx]
|
| 197 |
+
|
| 198 |
+
# 自动检测格式
|
| 199 |
+
if "type" in record and "query_image" in record:
|
| 200 |
+
messages = self._build_messages_v2(record)
|
| 201 |
+
else:
|
| 202 |
+
messages = self._build_messages_v1(record)
|
| 203 |
+
|
| 204 |
+
# Apply chat template (no generation prompt for training)
|
| 205 |
+
text = self.processor.apply_chat_template(
|
| 206 |
+
messages, tokenize=False, add_generation_prompt=False
|
| 207 |
+
)
|
| 208 |
+
|
| 209 |
+
# Process images
|
| 210 |
+
image_inputs = None
|
| 211 |
+
try:
|
| 212 |
+
image_inputs, _ = process_vision_info(messages)
|
| 213 |
+
except Exception:
|
| 214 |
+
pass
|
| 215 |
+
|
| 216 |
+
# Tokenize — 不截断,避免图片 token 不匹配
|
| 217 |
+
inputs = self.processor(
|
| 218 |
+
text=[text],
|
| 219 |
+
images=image_inputs if image_inputs else None,
|
| 220 |
+
return_tensors="pt",
|
| 221 |
+
padding=False,
|
| 222 |
+
truncation=False,
|
| 223 |
+
)
|
| 224 |
+
|
| 225 |
+
# Squeeze batch dim
|
| 226 |
+
input_ids = inputs["input_ids"].squeeze(0)
|
| 227 |
+
attention_mask = inputs["attention_mask"].squeeze(0)
|
| 228 |
+
|
| 229 |
+
# 超长时截断文本部分(保留前 max_length 个 token)
|
| 230 |
+
if input_ids.shape[0] > self.max_length:
|
| 231 |
+
input_ids = input_ids[:self.max_length]
|
| 232 |
+
attention_mask = attention_mask[:self.max_length]
|
| 233 |
+
|
| 234 |
+
# Build labels: mask user turns, keep assistant turns
|
| 235 |
+
labels = self._build_labels(input_ids)
|
| 236 |
+
|
| 237 |
+
result = {
|
| 238 |
+
"input_ids": input_ids,
|
| 239 |
+
"attention_mask": attention_mask,
|
| 240 |
+
"labels": labels,
|
| 241 |
+
}
|
| 242 |
+
# Pass through pixel values if present
|
| 243 |
+
if "pixel_values" in inputs:
|
| 244 |
+
result["pixel_values"] = inputs["pixel_values"].squeeze(0) \
|
| 245 |
+
if inputs["pixel_values"].dim() > 3 else inputs["pixel_values"]
|
| 246 |
+
if "image_grid_thw" in inputs:
|
| 247 |
+
result["image_grid_thw"] = inputs["image_grid_thw"]
|
| 248 |
+
|
| 249 |
+
return result
|
| 250 |
+
|
| 251 |
+
def _build_labels(self, input_ids: torch.Tensor) -> torch.Tensor:
|
| 252 |
+
"""Mask everything except assistant responses.
|
| 253 |
+
|
| 254 |
+
Strategy: find <|im_start|>assistant ... <|im_end|> spans,
|
| 255 |
+
only compute loss on tokens after 'assistant\n' until <|im_end|>.
|
| 256 |
+
"""
|
| 257 |
+
labels = torch.full_like(input_ids, IGNORE_INDEX)
|
| 258 |
+
ids = input_ids.tolist()
|
| 259 |
+
|
| 260 |
+
assist_tokens = self.processor.tokenizer.encode(
|
| 261 |
+
"assistant\n", add_special_tokens=False
|
| 262 |
+
)
|
| 263 |
+
|
| 264 |
+
i = 0
|
| 265 |
+
while i < len(ids):
|
| 266 |
+
if ids[i] == IM_START_ID:
|
| 267 |
+
start = i + 1
|
| 268 |
+
end = start + len(assist_tokens)
|
| 269 |
+
if end <= len(ids) and ids[start:end] == assist_tokens:
|
| 270 |
+
content_start = end
|
| 271 |
+
j = content_start
|
| 272 |
+
while j < len(ids) and ids[j] != IM_END_ID:
|
| 273 |
+
j += 1
|
| 274 |
+
labels[content_start:j + 1] = input_ids[content_start:j + 1]
|
| 275 |
+
i = j + 1
|
| 276 |
+
continue
|
| 277 |
+
i += 1
|
| 278 |
+
|
| 279 |
+
return labels
|
| 280 |
+
|
| 281 |
+
|
| 282 |
+
# ============================================================================
|
| 283 |
+
# Collator
|
| 284 |
+
# ============================================================================
|
| 285 |
+
|
| 286 |
+
class SFTCollator:
    """Right-pads variable-length samples into a rectangular batch.

    Sequences are truncated to min(longest sample, max_length), padded with
    pad_token_id / 0 / IGNORE_INDEX, and stacked; per-sample vision tensors
    are concatenated along dim 0.
    """

    def __init__(self, pad_token_id: int, max_length: int = 4096):
        self.pad_token_id = pad_token_id
        self.max_length = max_length

    def __call__(self, features: List[Dict]) -> Dict[str, torch.Tensor]:
        longest = max(f["input_ids"].size(0) for f in features)
        target_len = min(longest, self.max_length)

        def _fit(seq: torch.Tensor, fill) -> torch.Tensor:
            # Truncate to target_len, then right-pad with `fill` if short.
            seq = seq[:target_len]
            missing = target_len - seq.size(0)
            if missing <= 0:
                return seq
            tail = torch.full((missing,), fill, dtype=seq.dtype)
            return torch.cat([seq, tail])

        batch = {
            "input_ids": torch.stack(
                [_fit(f["input_ids"], self.pad_token_id) for f in features]),
            "attention_mask": torch.stack(
                [_fit(f["attention_mask"], 0) for f in features]),
            "labels": torch.stack(
                [_fit(f["labels"], IGNORE_INDEX) for f in features]),
        }

        pixels = [f["pixel_values"] for f in features if "pixel_values" in f]
        grids = [f["image_grid_thw"] for f in features if "image_grid_thw" in f]
        if pixels:
            batch["pixel_values"] = torch.cat(pixels, dim=0)
        if grids:
            batch["image_grid_thw"] = torch.cat(grids, dim=0)

        return batch
|
| 337 |
+
|
| 338 |
+
|
| 339 |
+
# ============================================================================
|
| 340 |
+
# Training
|
| 341 |
+
# ============================================================================
|
| 342 |
+
|
| 343 |
+
def train(args):
    """Full SFT training entry point: distributed setup, model/data loading,
    DeepSpeed or vanilla-DDP training loop, and checkpointing."""
    # ---- Distributed setup (env vars are set by the torchrun/deepspeed launcher) ----
    local_rank = int(os.environ.get("LOCAL_RANK", 0))
    world_size = int(os.environ.get("WORLD_SIZE", 1))
    rank = int(os.environ.get("RANK", 0))

    if world_size > 1 and not dist.is_initialized():
        dist.init_process_group("nccl")

    torch.cuda.set_device(local_rank)
    device = torch.device(f"cuda:{local_rank}")
    is_main = rank == 0

    if is_main:
        logger.info(f"World size: {world_size}, Local rank: {local_rank}")
        logger.info(f"Args: {vars(args)}")

    # ---- Load processor & model ----
    processor = AutoProcessor.from_pretrained(
        args.model_path, trust_remote_code=True,
        min_pixels=args.min_pixels, max_pixels=args.max_pixels,
    )

    model_kwargs = {
        "trust_remote_code": True,
        "torch_dtype": torch.bfloat16,
        "attn_implementation": "flash_attention_2",
    }
    # DeepSpeed manages device placement itself; only pin the device otherwise.
    if not (HAS_DEEPSPEED and args.deepspeed):
        model_kwargs["device_map"] = {"": device}

    model = Qwen3VLForConditionalGeneration.from_pretrained(
        args.model_path, **model_kwargs,
    )

    # Add special tokens used by the SFT action format; grow embeddings to match.
    special_tokens = ["<RET>", "<ANS>", "</ANS>", "<RETQ>", "</RETQ>"]
    num_added = processor.tokenizer.add_tokens(special_tokens, special_tokens=True)
    if num_added > 0:
        model.resize_token_embeddings(len(processor.tokenizer))
        if is_main:
            logger.info(f"Added {num_added} special tokens, vocab → {len(processor.tokenizer)}")

    # ---- LoRA (optional) ----
    if args.use_lora:
        lora_config = LoraConfig(
            task_type=TaskType.CAUSAL_LM,
            r=args.lora_rank,
            lora_alpha=args.lora_alpha,
            lora_dropout=args.lora_dropout,
            target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                            "gate_proj", "up_proj", "down_proj"],
        )
        model = get_peft_model(model, lora_config)
        if is_main:
            model.print_trainable_parameters()
    else:
        # Full fine-tune: trade compute for memory via activation checkpointing.
        if args.gradient_checkpointing:
            model.gradient_checkpointing_enable(
                gradient_checkpointing_kwargs={"use_reentrant": False}
            )

    # ---- Dataset ----
    train_dataset = SFTDataset(
        args.data_path, processor, args.max_length,
        args.min_pixels, args.max_pixels,
    )
    collator = SFTCollator(processor.tokenizer.pad_token_id, args.max_length)

    # ---- DeepSpeed or vanilla DDP ----
    if HAS_DEEPSPEED and args.deepspeed:
        # Load DS config and dynamically set scheduler params
        import copy  # NOTE(review): unused import, kept as-is
        with open(args.deepspeed, "r") as _f:
            ds_config = json.load(_f)

        # Explicitly set all batch-size params (avoid "auto" which some DS versions don't support)
        micro_bs = ds_config.get("train_micro_batch_size_per_gpu", args.batch_size)
        grad_accum_cfg = ds_config.get("gradient_accumulation_steps", args.gradient_accumulation_steps)
        ds_config["train_micro_batch_size_per_gpu"] = micro_bs
        ds_config["gradient_accumulation_steps"] = grad_accum_cfg
        ds_config["train_batch_size"] = micro_bs * grad_accum_cfg * world_size

        # Override LR from CLI args
        if "optimizer" in ds_config and "params" in ds_config["optimizer"]:
            ds_config["optimizer"]["params"]["lr"] = args.learning_rate

        if is_main:
            logger.info(f"DeepSpeed config: micro_bs={micro_bs}, grad_accum={grad_accum_cfg}, "
                        f"world_size={world_size}, train_batch_size={ds_config['train_batch_size']}")

        model_engine, optimizer, train_loader, _ = deepspeed.initialize(
            model=model,
            model_parameters=[p for p in model.parameters() if p.requires_grad],
            training_data=train_dataset,
            collate_fn=collator,
            config=ds_config,
        )
        # total_steps = optimizer steps (micro-batch steps per epoch / grad_accum * num_epochs)
        grad_accum = model_engine.gradient_accumulation_steps()
        steps_per_epoch = len(train_loader) // grad_accum
        total_steps = steps_per_epoch * args.num_epochs
        warmup_steps = int(total_steps * args.warmup_ratio)

        # Replace DS scheduler with cosine schedule
        # Note: model_engine.optimizer is DeepSpeedZeroOptimizer (not a torch.optim.Optimizer),
        # so we must use the underlying torch optimizer for LambdaLR.
        base_optimizer = model_engine.optimizer.optimizer  # unwrap to torch AdamW
        ds_scheduler = get_cosine_schedule_with_warmup(
            base_optimizer,
            num_warmup_steps=warmup_steps,
            num_training_steps=total_steps,
        )
        model_engine.lr_scheduler = ds_scheduler
        scheduler = None
    else:
        # Vanilla DDP
        if world_size > 1:
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[local_rank],
                find_unused_parameters=False,
            )
        sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset, num_replicas=world_size, rank=rank, shuffle=True,
        ) if world_size > 1 else None

        train_loader = DataLoader(
            train_dataset, batch_size=args.batch_size,
            sampler=sampler, shuffle=(sampler is None),
            collate_fn=collator, num_workers=args.num_workers,
            pin_memory=True, drop_last=True,
        )
        optimizer = torch.optim.AdamW(
            [p for p in model.parameters() if p.requires_grad],
            lr=args.learning_rate, weight_decay=args.weight_decay,
            betas=(0.9, 0.999),
        )
        total_steps = (len(train_loader) * args.num_epochs) // args.gradient_accumulation_steps
        warmup_steps = int(total_steps * args.warmup_ratio)
        scheduler = get_cosine_schedule_with_warmup(
            optimizer, warmup_steps, total_steps,
        )
        model_engine = None

    if is_main:
        logger.info(f"Dataset: {len(train_dataset)} samples")
        logger.info(f"Total steps: {total_steps}, Warmup: {warmup_steps}")

    # ---- Training loop ----
    optimizer_step = 0
    running_loss = 0.0
    running_count = 0
    accum_loss = 0.0  # accumulate loss across micro-batches within one grad accum cycle

    for epoch in range(args.num_epochs):
        # Reshuffle per epoch so DistributedSampler yields a new ordering.
        if hasattr(train_loader, "sampler") and hasattr(train_loader.sampler, "set_epoch"):
            train_loader.sampler.set_epoch(epoch)

        # Conditional expression used purely for its side effect (switch to train mode).
        model.train() if model_engine is None else model_engine.train()

        for step, batch in enumerate(train_loader):
            # Move batch to GPU
            batch = {k: v.to(device) if isinstance(v, torch.Tensor) else v
                     for k, v in batch.items()}

            # Forward
            if model_engine:
                outputs = model_engine(**batch)
                loss = outputs.loss
                model_engine.backward(loss)
                model_engine.step()  # DS handles grad accumulation internally

                # Accumulate loss across micro-batches
                accum_loss += loss.item()

                # Log/save only on optimizer step boundaries
                if model_engine.is_gradient_accumulation_boundary():
                    grad_accum = model_engine.gradient_accumulation_steps()
                    optimizer_step += 1
                    cur_loss = accum_loss / grad_accum  # average over micro-batches
                    accum_loss = 0.0

                    running_loss += cur_loss
                    running_count += 1
                    avg_loss = running_loss / running_count

                    if is_main and optimizer_step % args.log_interval == 0:
                        lr_now = ds_scheduler.get_last_lr()[0]
                        logger.info(
                            f"Epoch {epoch+1}/{args.num_epochs} "
                            f"Step {optimizer_step}/{total_steps} "
                            f"Loss {cur_loss:.4f} "
                            f"AvgLoss {avg_loss:.4f} "
                            f"LR {lr_now:.2e}"
                        )

                    # Save checkpoint
                    if optimizer_step > 0 and optimizer_step % args.save_interval == 0:
                        _save_checkpoint(args, model, model_engine, processor, epoch, optimizer_step, is_main)

            else:
                # Manual gradient accumulation for the vanilla path: scale the
                # loss down for backward, scale back up for logging.
                outputs = model(**batch)
                loss = outputs.loss / args.gradient_accumulation_steps
                loss.backward()
                accum_loss += loss.item() * args.gradient_accumulation_steps

                if (step + 1) % args.gradient_accumulation_steps == 0:
                    torch.nn.utils.clip_grad_norm_(
                        model.parameters(), args.max_grad_norm
                    )
                    optimizer.step()
                    scheduler.step()
                    optimizer.zero_grad()
                    optimizer_step += 1

                    cur_loss = accum_loss / args.gradient_accumulation_steps
                    accum_loss = 0.0

                    running_loss += cur_loss
                    running_count += 1
                    avg_loss = running_loss / running_count

                    if is_main and optimizer_step % args.log_interval == 0:
                        lr_now = scheduler.get_last_lr()[0]
                        logger.info(
                            f"Epoch {epoch+1}/{args.num_epochs} "
                            f"Step {optimizer_step}/{total_steps} "
                            f"Loss {cur_loss:.4f} "
                            f"AvgLoss {avg_loss:.4f} "
                            f"LR {lr_now:.2e}"
                        )

        # End of epoch save
        # NOTE(review): both branches are identical; the if/else is redundant.
        if model_engine:
            _save_checkpoint(args, model, model_engine, processor, epoch, optimizer_step, is_main)
        else:
            _save_checkpoint(args, model, model_engine, processor, epoch, optimizer_step, is_main)

    # Final save
    # NOTE(review): both branches are identical; the if/else is redundant.
    if model_engine:
        _save_checkpoint(args, model, model_engine, processor, args.num_epochs, optimizer_step, is_main, final=True)
    else:
        _save_checkpoint(args, model, model_engine, processor, args.num_epochs, optimizer_step, is_main, final=True)

    if is_main:
        logger.info("Training complete!")
|
| 589 |
+
|
| 590 |
+
|
| 591 |
+
def _save_checkpoint(args, model, model_engine, processor, epoch, step, is_main, final=False):
|
| 592 |
+
tag = "final" if final else f"epoch{epoch+1}_step{step}"
|
| 593 |
+
save_dir = Path(args.output_dir) / tag
|
| 594 |
+
|
| 595 |
+
if model_engine and HAS_DEEPSPEED:
|
| 596 |
+
# DeepSpeed save_checkpoint must be called by ALL ranks
|
| 597 |
+
model_engine.save_checkpoint(str(args.output_dir), tag=tag)
|
| 598 |
+
elif is_main:
|
| 599 |
+
unwrapped = model.module if hasattr(model, "module") else model
|
| 600 |
+
if args.use_lora:
|
| 601 |
+
unwrapped.save_pretrained(str(save_dir))
|
| 602 |
+
else:
|
| 603 |
+
unwrapped.save_pretrained(str(save_dir))
|
| 604 |
+
processor.save_pretrained(str(save_dir))
|
| 605 |
+
|
| 606 |
+
if is_main:
|
| 607 |
+
logger.info(f"Saved checkpoint → {save_dir}")
|
| 608 |
+
|
| 609 |
+
|
| 610 |
+
# ============================================================================
|
| 611 |
+
# Main
|
| 612 |
+
# ============================================================================
|
| 613 |
+
|
| 614 |
+
def parse_args():
    """Parse CLI arguments for Qwen3-VL-8B SFT training."""
    p = argparse.ArgumentParser(description="Qwen3-VL-8B SFT")

    # Model
    p.add_argument("--model-path", default="/workspace/models/Qwen3-VL-8B-Instruct")
    p.add_argument("--output-dir", default="/workspace/xiaobin/ICL/SFT_new/output/qwen3vl_sft")

    # Data
    p.add_argument("--data-path", required=True, help="Path to sft.jsonl")
    p.add_argument("--max-length", type=int, default=4096)
    p.add_argument("--min-pixels", type=int, default=256 * 28 * 28)
    p.add_argument("--max-pixels", type=int, default=1280 * 28 * 28)

    # Training
    p.add_argument("--num-epochs", type=int, default=3)
    p.add_argument("--batch-size", type=int, default=1,
                   help="Per-GPU micro batch size")
    p.add_argument("--gradient-accumulation-steps", type=int, default=4)
    p.add_argument("--learning-rate", type=float, default=1e-5)
    p.add_argument("--weight-decay", type=float, default=0.1)
    p.add_argument("--warmup-ratio", type=float, default=0.05)
    p.add_argument("--max-grad-norm", type=float, default=1.0)
    # Fix: with action="store_true" AND default=True the flag could never be
    # turned off.  Keep the original flag (now a documented no-op) for
    # backward compatibility and add an explicit disable switch.
    p.add_argument("--gradient-checkpointing", dest="gradient_checkpointing",
                   action="store_true", default=True)
    p.add_argument("--no-gradient-checkpointing", dest="gradient_checkpointing",
                   action="store_false",
                   help="Disable gradient checkpointing (enabled by default)")
    p.add_argument("--num-workers", type=int, default=4)

    # LoRA
    p.add_argument("--use-lora", action="store_true", default=False)
    p.add_argument("--lora-rank", type=int, default=64)
    p.add_argument("--lora-alpha", type=int, default=128)
    p.add_argument("--lora-dropout", type=float, default=0.05)

    # Logging
    p.add_argument("--log-interval", type=int, default=10)
    p.add_argument("--save-interval", type=int, default=500)

    # DeepSpeed
    p.add_argument("--deepspeed", type=str, default=None,
                   help="Path to DeepSpeed config JSON")
    p.add_argument("--local_rank", type=int, default=-1)  # torchrun sets this

    return p.parse_args()
|
| 655 |
+
|
| 656 |
+
|
| 657 |
+
if __name__ == "__main__":
    # Script entry point: parse CLI arguments and run training.
    train(parse_args())
|
ICL/build_embeddings.py
ADDED
|
@@ -0,0 +1,370 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
预算 SigLIP2 embeddings + Top5 相似图片映射(8卡 DataParallel)。
|
| 4 |
+
|
| 5 |
+
用法:
|
| 6 |
+
python3 build_embeddings.py # 8卡,全部
|
| 7 |
+
python3 build_embeddings.py --datasets vqa/shapes # 测试
|
| 8 |
+
python3 build_embeddings.py --force # 强制重建
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
import argparse
|
| 12 |
+
import json
|
| 13 |
+
import os
|
| 14 |
+
import numpy as np
|
| 15 |
+
from concurrent.futures import ThreadPoolExecutor
|
| 16 |
+
from typing import Dict, List, Tuple
|
| 17 |
+
|
| 18 |
+
try:
    from tqdm import tqdm
except ImportError:
    # Fallback shim.  The code below uses tqdm in two ways: as an iterable
    # wrapper (`for x in tqdm(seq, ...)`) AND as a manual progress bar
    # (`pbar = tqdm(total=n, ...)` followed by .update()/.close()).  The old
    # function fallback (`def tqdm(x, **kw): return x`) crashed on the second
    # usage (no positional arg, no update/close), so provide a minimal no-op
    # object that supports both.
    class tqdm:
        def __init__(self, iterable=None, **kwargs):
            self._iterable = iterable

        def __iter__(self):
            return iter(self._iterable if self._iterable is not None else ())

        def update(self, n=1):
            pass

        def close(self):
            pass
| 23 |
+
|
| 24 |
+
import torch
|
| 25 |
+
import torch.nn as nn
|
| 26 |
+
import cv2
|
| 27 |
+
import numpy as np
|
| 28 |
+
from PIL import Image
|
| 29 |
+
from transformers import AutoModel, AutoProcessor
|
| 30 |
+
|
| 31 |
+
# ---------------------------------------------------------------------------
|
| 32 |
+
IMAGES_ROOT = "/workspace/xiaobin/dataset/images"
|
| 33 |
+
CAPTION_CACHE_DIR = "/workspace/xiaobin/dataset/caption_cache"
|
| 34 |
+
EMBEDDINGS_DIR = "/workspace/xiaobin/dataset/embeddings"
|
| 35 |
+
DEFAULT_MODEL = "/workspace/models/siglip2-so400m-patch14-384"
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
# ---------------------------------------------------------------------------
|
| 39 |
+
# DataParallel wrappers
|
| 40 |
+
# ---------------------------------------------------------------------------
|
| 41 |
+
class SigLIPImageModule(nn.Module):
    """Wraps a SigLIP backbone so nn.DataParallel can shard image encoding.

    forward() returns L2-normalized image embeddings.
    """

    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, **kwargs):
        features = self.model.get_image_features(**kwargs)
        # Some backbones return a ModelOutput with pooler_output, others a tensor.
        if hasattr(features, "pooler_output"):
            features = features.pooler_output
        return features / features.norm(dim=-1, keepdim=True)
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
class SigLIPTextModule(nn.Module):
    """Wraps a SigLIP backbone so nn.DataParallel can shard text encoding.

    forward() returns L2-normalized text embeddings.
    """

    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, **kwargs):
        features = self.model.get_text_features(**kwargs)
        # Some backbones return a ModelOutput with pooler_output, others a tensor.
        if hasattr(features, "pooler_output"):
            features = features.pooler_output
        return features / features.norm(dim=-1, keepdim=True)
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
# ---------------------------------------------------------------------------
|
| 64 |
+
# Encoder: 单进程, 多线程读图, 小batch快速跑
|
| 65 |
+
# ---------------------------------------------------------------------------
|
| 66 |
+
class SigLIPEncoder:
    """Single-process SigLIP2 encoder: threaded image IO + DataParallel GPU batches."""

    def __init__(self, model_path: str, gpu_ids: List[int],
                 batch_size_per_gpu: int = 64, num_threads: int = 16):
        self.gpu_ids = gpu_ids
        self.n_gpus = len(gpu_ids)
        # Global batch size fed to DataParallel (which splits it across GPUs).
        self.batch_size = batch_size_per_gpu * self.n_gpus
        self.num_threads = num_threads
        self.primary = torch.device(f"cuda:{gpu_ids[0]}")

        print(f" GPU: {gpu_ids} ({self.n_gpus} 张)")
        print(f" batch: {batch_size_per_gpu}/卡 × {self.n_gpus}卡 = {self.batch_size}")

        self.processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
        base_model = AutoModel.from_pretrained(
            model_path, dtype=torch.bfloat16, trust_remote_code=True
        ).to(self.primary).eval()

        self.img_module = nn.DataParallel(
            SigLIPImageModule(base_model), device_ids=gpu_ids)
        self.txt_module = nn.DataParallel(
            SigLIPTextModule(base_model), device_ids=gpu_ids)

    @staticmethod
    def _load_and_preprocess(path):
        """Read image + OpenCV resize + normalize → numpy (3, 384, 384) float32"""
        try:
            img = cv2.imread(path)
            if img is None:
                return (path, None)
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            img = cv2.resize(img, (384, 384))
            img = img.astype(np.float32) / 255.0
            # Normalize with mean=0.5, std=0.5 per channel.
            img = (img - 0.5) / 0.5
            img = np.transpose(img, (2, 0, 1))  # (3, 384, 384)
            return (path, img)
        except Exception:
            # Unreadable/corrupt images are reported as (path, None) and skipped.
            return (path, None)

    def encode_images(self, paths: List[str]) -> Tuple[List[str], np.ndarray]:
        """Encode image files → (valid_paths, float16 embeddings).

        A thread pool prefetches/preprocesses the next batch while the GPUs
        encode the current one; images that failed to load are dropped, so
        the returned path list may be shorter than the input.
        """
        all_embs = []
        valid_paths = []
        n = len(paths)
        pbar = tqdm(total=n, desc=" encode-img", unit="张", dynamic_ncols=True)

        thread_pool = ThreadPoolExecutor(max_workers=self.num_threads)

        batches = [paths[s:s + self.batch_size]
                   for s in range(0, n, self.batch_size)]

        # Pre-submit the first batch (blocking map, so it is fully loaded).
        if batches:
            next_future = list(thread_pool.map(self._load_and_preprocess, batches[0]))
        else:
            next_future = []

        for i, batch_paths in enumerate(batches):
            loaded = next_future

            # Submit the next batch's IO + preprocessing ahead of time.
            if i + 1 < len(batches):
                next_futures_list = [thread_pool.submit(self._load_and_preprocess, p)
                                     for p in batches[i + 1]]
            else:
                next_futures_list = None

            # Keep only successfully loaded images.
            batch_valid = []
            batch_arrays = []
            for p, arr in loaded:
                if arr is not None:
                    batch_valid.append(p)
                    batch_arrays.append(arr)

            if not batch_arrays:
                pbar.update(len(batch_paths))
                if next_futures_list:
                    next_future = [f.result() for f in next_futures_list]
                continue

            # numpy stack → torch → GPU
            pixel_values = torch.from_numpy(np.stack(batch_arrays)).to(
                dtype=torch.bfloat16, device=self.primary)

            with torch.inference_mode():
                feat = self.img_module(pixel_values=pixel_values)
            all_embs.append(feat.cpu().float().numpy())
            valid_paths.extend(batch_valid)

            pbar.update(len(batch_paths))

            if next_futures_list:
                next_future = [f.result() for f in next_futures_list]

        thread_pool.shutdown(wait=False)
        pbar.close()
        if not all_embs:
            return [], np.empty((0, 0), dtype=np.float16)
        return valid_paths, np.concatenate(all_embs, axis=0).astype(np.float16)

    def encode_texts(self, texts: List[str]) -> np.ndarray:
        """Encode caption strings → float16 embedding matrix (N, D)."""
        all_embs = []
        n = len(texts)
        pbar = tqdm(total=n, desc=" encode-txt", unit="条", dynamic_ncols=True)

        for start in range(0, n, self.batch_size):
            batch = texts[start:start + self.batch_size]
            inp = self.processor(text=batch, return_tensors="pt",
                                 padding="max_length", truncation=True,
                                 max_length=64)
            # Forward only the token inputs that DataParallel can split.
            keys = {k: v.to(self.primary) for k, v in inp.items()
                    if k in ("input_ids", "attention_mask", "position_ids")}
            with torch.inference_mode():
                feat = self.txt_module(**keys)
            all_embs.append(feat.cpu().float().numpy())
            pbar.update(len(batch))

        pbar.close()
        if not all_embs:
            return np.empty((0, 0), dtype=np.float16)
        return np.concatenate(all_embs, axis=0).astype(np.float16)
|
| 185 |
+
|
| 186 |
+
|
| 187 |
+
# ---------------------------------------------------------------------------
|
| 188 |
+
# Top-K(GPU)
|
| 189 |
+
# ---------------------------------------------------------------------------
|
| 190 |
+
def compute_top_k(caption_embs, image_embs, image_paths, k=5,
                  chunk_size=5000, device="cuda:0"):
    """For each caption embedding, find the top-k most similar images.

    Similarity is a plain dot product computed on *device* in chunks of
    *chunk_size* captions against all image embeddings at once. Row i of
    a chunk corresponds to global image index start+i, and that diagonal
    entry is masked to -1 so a caption never retrieves its own image.

    Fix: the original called ``sim.topk(k, ...)`` unconditionally, which
    raises when k exceeds the number of candidate images; k is now clamped
    to n-1 (the count remaining after self-masking).

    Returns: dict mapping image path -> list of its top-k neighbor paths.
    """
    n = len(image_paths)
    img_gpu = torch.from_numpy(image_embs.astype(np.float32)).to(device)
    # After self-masking only n-1 candidates remain; 0 when n <= 1.
    k_eff = min(k, n - 1) if n > 1 else 0
    top_k_map = {}

    for start in tqdm(range(0, n, chunk_size), desc=" compute-top5", unit="chunk"):
        end = min(start + chunk_size, n)
        cap = torch.from_numpy(caption_embs[start:end].astype(np.float32)).to(device)
        sim = cap @ img_gpu.T
        # Mask each caption's own image (global index start+i for row i).
        idx_range = torch.arange(end - start, device=sim.device)
        sim[idx_range, torch.arange(start, end, device=sim.device)] = -1.0
        if k_eff == 0:
            # Degenerate dataset (0 or 1 image): no neighbors to report.
            for i in range(end - start):
                top_k_map[image_paths[start + i]] = []
            continue
        _, top_idx = sim.topk(k_eff, dim=1)
        top_idx_cpu = top_idx.cpu().numpy()
        for i in range(end - start):
            top_k_map[image_paths[start + i]] = [
                image_paths[j] for j in top_idx_cpu[i]]
    return top_k_map
|
| 208 |
+
|
| 209 |
+
|
| 210 |
+
# ---------------------------------------------------------------------------
|
| 211 |
+
# 数据集工具
|
| 212 |
+
# ---------------------------------------------------------------------------
|
| 213 |
+
def discover_datasets(categories=None, specific=None):
    """List (category, dataset) pairs to process.

    When *specific* is given (entries shaped "cat/ds") it wins and the
    filesystem is not scanned; otherwise IMAGES_ROOT is walked one level
    deep, optionally filtered to the given *categories*.
    """
    if specific:
        pairs = []
        for entry in specific:
            if "/" in entry:
                pieces = entry.split("/")
                pairs.append((pieces[0], pieces[1]))
        return pairs

    found = []
    for category in sorted(os.listdir(IMAGES_ROOT)):
        cat_dir = os.path.join(IMAGES_ROOT, category)
        if not os.path.isdir(cat_dir):
            continue
        if categories and category not in categories:
            continue
        found.extend((category, name)
                     for name in sorted(os.listdir(cat_dir))
                     if os.path.isdir(os.path.join(cat_dir, name)))
    return found
|
| 227 |
+
|
| 228 |
+
|
| 229 |
+
def load_captions(cat, ds):
    """Load the cached caption map for one dataset.

    Returns the "items" dict from CAPTION_CACHE_DIR/{cat}_{ds}.json, or an
    empty dict when the file is missing or unreadable.
    """
    cache_file = os.path.join(CAPTION_CACHE_DIR, f"{cat}_{ds}.json")
    if not os.path.exists(cache_file):
        return {}
    try:
        with open(cache_file) as fh:
            return json.load(fh).get("items", {})
    except Exception:
        # Corrupt/partial cache — treat as absent.
        return {}
|
| 238 |
+
|
| 239 |
+
|
| 240 |
+
def collect_images(cat, ds):
    """Gather all image file paths for a dataset.

    Splits are visited in the fixed order train/val/test/other and file
    names are sorted within each split, so the result is deterministic.
    """
    root = os.path.join(IMAGES_ROOT, cat, ds)
    collected = []
    for part in ("train", "val", "test", "other"):
        split_dir = os.path.join(root, part)
        if not os.path.isdir(split_dir):
            continue
        for name in sorted(os.listdir(split_dir)):
            full = os.path.join(split_dir, name)
            if os.path.isfile(full):
                collected.append(full)
    return collected
|
| 252 |
+
|
| 253 |
+
|
| 254 |
+
# ---------------------------------------------------------------------------
|
| 255 |
+
# 处理单个数据集(含断点续传)
|
| 256 |
+
# ---------------------------------------------------------------------------
|
| 257 |
+
def process_dataset(cat, ds, encoder, top_k, force):
    """Encode one dataset's images+captions and compute top-k neighbors.

    Supports two resume checkpoints (skipped when *force* is set):
      1. both the .npz and the top-k JSON exist and agree in count -> skip;
      2. the .npz exists but the top-k JSON is missing -> only recompute
         the top-k map from the stored embeddings.
    Otherwise everything is rebuilt from scratch.

    Returns True on success (or skip), False when the dataset cannot be
    processed (no images, no captions, empty intersection, encode failure).
    """
    tag = f"{cat}_{ds}"
    npz_path = os.path.join(EMBEDDINGS_DIR, f"{tag}.npz")
    top5_path = os.path.join(EMBEDDINGS_DIR, f"{tag}_top{top_k}.json")

    # Checkpoint 1: everything already done (embedding count matches
    # top-k entry count) — nothing to do.
    if not force and os.path.exists(npz_path) and os.path.exists(top5_path):
        try:
            data = np.load(npz_path, allow_pickle=True)
            n_emb = len(data["image_paths"])
            with open(top5_path) as f:
                n_top = len(json.load(f))
            if n_emb == n_top and n_emb > 0:
                print(f" [SKIP] {tag} ({n_emb} 张)")
                return True
        except Exception:
            # Unreadable artifacts — fall through and rebuild.
            pass

    # Checkpoint 2: embeddings exist but the top-k map is missing —
    # recompute only the (cheap) similarity step.
    if not force and os.path.exists(npz_path) and not os.path.exists(top5_path):
        try:
            data = np.load(npz_path, allow_pickle=True)
            sp = list(data["image_paths"])
            si, sc = data["image_embs"], data["caption_embs"]
            if len(sp) > 0 and si.shape[0] == len(sp):
                print(f" [RESUME] {tag} 只算 top{top_k} ({len(sp)} 张)")
                m = compute_top_k(sc, si, sp, k=top_k, device=str(encoder.primary))
                with open(top5_path, 'w') as f:
                    json.dump(m, f, ensure_ascii=False)
                print(f" top{top_k}: {os.path.getsize(top5_path)/1048576:.1f}MB")
                return True
        except Exception:
            # Corrupt .npz — fall through and rebuild from scratch.
            pass

    # Full rebuild: collect image paths, keep only those with a caption.
    all_paths = collect_images(cat, ds)
    if not all_paths:
        print(f" [SKIP] {tag} 无图片")
        return False

    captions = load_captions(cat, ds)
    if not captions:
        print(f" [WARN] {tag} 无 caption,跳过")
        return False

    paths_with_cap = [p for p in all_paths if p in captions]
    if not paths_with_cap:
        print(f" [WARN] {tag} 无交集,跳过")
        return False

    print(f"\n {tag}: {len(paths_with_cap)} 张图")

    # encode_images may drop unreadable files; valid_paths is the subset
    # that actually produced embeddings.
    valid_paths, image_embs = encoder.encode_images(paths_with_cap)
    if not valid_paths:
        print(f" [ERROR] {tag} 编码失败")
        return False

    # Caption order follows valid_paths so row i of both arrays match.
    caption_embs = encoder.encode_texts([captions[p] for p in valid_paths])

    os.makedirs(EMBEDDINGS_DIR, exist_ok=True)
    np.savez_compressed(npz_path, image_paths=np.array(valid_paths),
                        image_embs=image_embs, caption_embs=caption_embs)
    print(f" embeddings: {os.path.getsize(npz_path)/1048576:.1f}MB")

    m = compute_top_k(caption_embs, image_embs, valid_paths,
                      k=top_k, device=str(encoder.primary))
    with open(top5_path, 'w') as f:
        json.dump(m, f, ensure_ascii=False)
    print(f" top{top_k}: {os.path.getsize(top5_path)/1048576:.1f}MB")
    return True
|
| 327 |
+
|
| 328 |
+
|
| 329 |
+
# ---------------------------------------------------------------------------
|
| 330 |
+
def main():
    """CLI entry point: encode every selected dataset and report a summary.

    Parses GPU/batch/filter options, builds one shared SigLIPEncoder, then
    runs process_dataset over each discovered (category, dataset) pair.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--model-path", default=DEFAULT_MODEL)
    parser.add_argument("--gpus", default="")
    parser.add_argument("--batch-size-per-gpu", type=int, default=256,
                        help="每卡batch(预处理不再是瓶颈,可以开大)")
    parser.add_argument("--num-threads", type=int, default=16,
                        help="图片IO线程数")
    parser.add_argument("--top-k", type=int, default=5)
    parser.add_argument("--categories", default="")
    parser.add_argument("--datasets", default="")
    parser.add_argument("--force", action="store_true")
    args = parser.parse_args()

    # Empty --gpus means "use every visible CUDA device".
    gpu_ids = ([int(x) for x in args.gpus.split(",") if x.strip()]
               or list(range(torch.cuda.device_count())))
    total_batch = args.batch_size_per_gpu * len(gpu_ids)
    print(f"GPU: {gpu_ids} ({len(gpu_ids)} 张), batch: {total_batch}")

    # Comma-separated filters; None means "no filter".
    cats = [c.strip() for c in args.categories.split(",") if c.strip()] or None
    specific = [d.strip() for d in args.datasets.split(",") if d.strip()] or None
    datasets = discover_datasets(categories=cats, specific=specific)
    print(f"共 {len(datasets)} 个数据集\n")

    encoder = SigLIPEncoder(args.model_path, gpu_ids,
                            args.batch_size_per_gpu, args.num_threads)

    ok, fail = 0, 0
    pbar = tqdm(datasets, desc="总进度", unit="ds", dynamic_ncols=True)
    for i, (cat, ds) in enumerate(pbar, 1):
        pbar.set_postfix(current=f"{cat}/{ds}", ok=ok, fail=fail)
        # process_dataset returns False for skip/failure alike; both are
        # tallied under "fail" in the final summary.
        if process_dataset(cat, ds, encoder, args.top_k, args.force):
            ok += 1
        else:
            fail += 1
    pbar.close()
    print(f"\n完成: {ok} 成功, {fail} 失败/跳过 → {EMBEDDINGS_DIR}")
|
| 367 |
+
|
| 368 |
+
|
| 369 |
+
# Script entry point.
if __name__ == "__main__":
    main()
|
ICL/build_index.py
ADDED
|
@@ -0,0 +1,506 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
生成索引 JSONL:将原始 base64 JSONL 的文本字段 + 提取后的图片路径 + VLM描述 对应起来。
|
| 4 |
+
|
| 5 |
+
输入:
|
| 6 |
+
/workspace/xiaobin/dataset/data/{cat}/{ds}/{split}.jsonl (原始,含base64)
|
| 7 |
+
/workspace/xiaobin/dataset/images/{cat}/{ds}/{split}/ (已提取的图片)
|
| 8 |
+
/workspace/xiaobin/dataset/detail/{cat}/{ds}/{split}/captions.json (VLM描述)
|
| 9 |
+
|
| 10 |
+
输出:
|
| 11 |
+
/workspace/xiaobin/dataset/index/{cat}/{ds}/{split}.jsonl (轻量索引)
|
| 12 |
+
|
| 13 |
+
每条记录格式:
|
| 14 |
+
{
|
| 15 |
+
"image": "/workspace/xiaobin/dataset/images/vqa/shapes/test/00000000.jpg",
|
| 16 |
+
"images": ["/path/..."], # 多图时(video_str/images字段)
|
| 17 |
+
"question": "...",
|
| 18 |
+
"answer": "...",
|
| 19 |
+
"description": "A cat sitting...", # 来自 detail/captions.json
|
| 20 |
+
"meta": {...}, # 原始meta(如有)
|
| 21 |
+
"id": "...", # 原始id/img_id
|
| 22 |
+
"category": "vqa",
|
| 23 |
+
"dataset": "shapes",
|
| 24 |
+
"split": "test"
|
| 25 |
+
}
|
| 26 |
+
|
| 27 |
+
用法:
|
| 28 |
+
python3 build_index.py # 全部(已完成的自动跳过)
|
| 29 |
+
python3 build_index.py vqa/shapes # 某个数据集
|
| 30 |
+
python3 build_index.py --force # 全部强制重建
|
| 31 |
+
python3 build_index.py --force vqa/shapes # 某个数据集强制重建
|
| 32 |
+
"""
|
| 33 |
+
|
| 34 |
+
import os
|
| 35 |
+
import sys
|
| 36 |
+
import json
|
| 37 |
+
import glob
|
| 38 |
+
import re
|
| 39 |
+
from tqdm import tqdm
|
| 40 |
+
|
| 41 |
+
DATA_ROOT = "/workspace/xiaobin/dataset/data"
|
| 42 |
+
IMAGES_ROOT = "/workspace/xiaobin/dataset/images"
|
| 43 |
+
DETAIL_ROOT = "/workspace/xiaobin/dataset/detail"
|
| 44 |
+
INDEX_ROOT = "/workspace/xiaobin/dataset/index"
|
| 45 |
+
|
| 46 |
+
# 图片base64字段(用于判断"这行有图",和extract_images.py一致)
|
| 47 |
+
ALL_IMAGE_FIELDS = [
|
| 48 |
+
"image", "image_str", "image_base64_str", "img_str",
|
| 49 |
+
"base64", "image_base64", "image_base_url",
|
| 50 |
+
"video_str", "images",
|
| 51 |
+
]
|
| 52 |
+
|
| 53 |
+
# 文本字段提取
|
| 54 |
+
QUESTION_FIELDS = ["question", "text", "query", "prompt", "input", "inputs", "user_prompt"]
|
| 55 |
+
ANSWER_FIELDS = ["answer", "output", "outputs", "label", "target", "caption", "paraphrased_answer", "original_answer"]
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def classify_split(filename):
    """Map a JSONL filename to a split name by substring match.

    Markers are checked in the fixed order train > test > val (so e.g.
    "train_val.jsonl" classifies as "train"); anything else is "other".
    """
    lowered = filename.lower()
    for marker in ("train", "test", "val"):
        if marker in lowered:
            return marker
    return "other"
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
def has_image(record):
    """Return True if the record carries at least one base64 image payload.

    Uses the same field list and >100-char heuristic as extract_images.py:
    a long string, or a list containing at least one long string, counts.
    """
    for field in ALL_IMAGE_FIELDS:
        value = record.get(field)
        if not value:
            continue
        if isinstance(value, str):
            if len(value) > 100:
                return True
        elif isinstance(value, list):
            for item in value:
                if isinstance(item, str) and len(item) > 100:
                    return True
    return False
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
def is_multi_image(record):
    """Return True when a record contains more than one base64 image payload.

    Checks the list-typed multi-image fields (video_str/images) first, then
    image_str/image_base64 which may also arrive as lists.
    """

    def _payloads(value):
        # Count plausible base64 strings (>100 chars) in a list.
        return sum(1 for x in value if isinstance(x, str) and len(x) > 100)

    for field in ("video_str", "images"):
        value = record.get(field)
        if isinstance(value, list) and _payloads(value) > 1:
            return True
    # image_str / image_base64 may also hold lists.
    for field in ("image_str", "image_base64"):
        value = record.get(field)
        if isinstance(value, list) and _payloads(value) > 1:
            return True
    return False
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
def count_images_in_record(record):
    """Count base64 image payloads in a record.

    The first non-empty image field wins: a long string counts as one
    image; a list yields the number of long strings it contains (possibly
    zero). Returns 0 when no field matches.
    """
    for field in ALL_IMAGE_FIELDS:
        value = record.get(field)
        if not value:
            continue
        if isinstance(value, str):
            if len(value) > 100:
                return 1
        elif isinstance(value, list):
            return sum(1 for x in value if isinstance(x, str) and len(x) > 100)
    return 0
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
def extract_text(record, fields):
    """Return the first non-empty stripped string among *fields*.

    Falls back to the first non-empty entry of an "answers" list;
    returns None when nothing matches.
    """
    for key in fields:
        candidate = record.get(key)
        if isinstance(candidate, str) and candidate.strip():
            return candidate.strip()
    # Fallback: some datasets store answers as a list.
    fallback = record.get("answers")
    if isinstance(fallback, list):
        for item in fallback:
            if isinstance(item, str) and item.strip():
                return item.strip()
    return None
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
def extract_id(record):
    """Pull a record identifier as a string, or None.

    Top-level keys are tried first (id, image_id, img_id), then the same
    family inside a dict-valued "meta" (img_id, id, image_id).
    """
    for key in ("id", "image_id", "img_id"):
        value = record.get(key)
        if value is not None:
            return str(value)
    meta = record.get("meta")
    if isinstance(meta, dict):
        for key in ("img_id", "id", "image_id"):
            value = meta.get(key)
            if value is not None:
                return str(value)
    return None
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
def extract_meta(record):
    """Return a slimmed copy of record["meta"], or None.

    Drops keys that look image/base64/video-related, string values longer
    than 500 chars, and lists whose first element is a string longer than
    200 chars. None is returned when meta is absent, not a dict, or ends
    up empty after filtering.
    """
    meta = record.get("meta")
    if not isinstance(meta, dict):
        return None
    blocked = ("image", "img", "base64", "video")
    slimmed = {}
    for key, value in meta.items():
        lowered = key.lower()
        # Skip anything that smells like an image/base64 payload field.
        if any(word in lowered for word in blocked):
            continue
        # Skip oversized strings.
        if isinstance(value, str) and len(value) > 500:
            continue
        # Skip lists that likely carry big payloads.
        if (isinstance(value, list) and value
                and isinstance(value[0], str) and len(value[0]) > 200):
            continue
        slimmed[key] = value
    return slimmed or None
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
def load_detail(category, dataset, split):
    """Load the VLM description cache for one split.

    Returns the "items" dict from captions.json; {} when the file is
    missing, unreadable, or its "items" value is not a dict.
    """
    cache_path = os.path.join(DETAIL_ROOT, category, dataset, split, "captions.json")
    if not os.path.exists(cache_path):
        return {}
    try:
        with open(cache_path, 'r', encoding='utf-8') as fh:
            payload = json.load(fh)
        items = payload.get("items", {})
        if isinstance(items, dict):
            return items
    except Exception:
        # Treat any parse/IO failure as "no descriptions available".
        pass
    return {}
|
| 179 |
+
|
| 180 |
+
|
| 181 |
+
def count_lines(filepath):
    """Count the lines in *filepath*.

    Reads the file in 8 MiB chunks through the buffered binary reader
    (the original used ``f.raw.read``, needlessly bypassing buffering)
    and counts newline bytes. Fix: a final line without a trailing
    newline is now counted too; the original missed it, undercounting
    by one on files that do not end with b'\\n'.
    """
    buf_size = 8 * 1024 * 1024
    count = 0
    last_chunk = b''
    with open(filepath, 'rb') as f:
        while True:
            buf = f.read(buf_size)
            if not buf:
                break
            count += buf.count(b'\n')
            last_chunk = buf
    # Account for an unterminated final line.
    if last_chunk and not last_chunk.endswith(b'\n'):
        count += 1
    return count
|
| 190 |
+
|
| 191 |
+
|
| 192 |
+
def process_one(jsonl_path, file_idx, total_files):
    """Process a single raw JSONL file into a lightweight index JSONL.

    Records are matched to previously-extracted image files purely by
    position: a cursor (img_idx) walks the sorted image directory and each
    image-bearing record consumes as many files as it has payloads, so
    this relies on extract_images.py having processed the same file in the
    same order.

    NOTE(review): the visible main() routes everything through
    process_group; this single-file variant looks superseded — confirm
    before relying on it.

    Returns the number of index records written (0 on skip).
    """
    rel_path = os.path.relpath(jsonl_path, DATA_ROOT)
    parts = rel_path.split(os.sep)
    if len(parts) < 3:
        # Path does not match the expected {category}/{dataset}/file layout.
        return 0

    category, dataset, filename = parts[0], parts[1], parts[2]
    split = classify_split(filename)

    # Directory of images previously extracted for this split.
    img_dir = os.path.join(IMAGES_ROOT, category, dataset, split)
    if not os.path.isdir(img_dir):
        print(f" [SKIP] 无图片目录: {img_dir}")
        return 0

    # Sorted so positions line up with extraction order.
    img_files = sorted([f for f in os.listdir(img_dir) if os.path.isfile(os.path.join(img_dir, f))])
    if not img_files:
        print(f" [SKIP] 图片目录为空: {img_dir}")
        return 0

    # VLM description cache (may be empty).
    detail = load_detail(category, dataset, split)

    # Output index file.
    out_path = os.path.join(INDEX_ROOT, category, dataset, f"{split}.jsonl")
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

    total_lines = count_lines(jsonl_path)
    file_size_mb = os.path.getsize(jsonl_path) / (1024 * 1024)
    desc = f"[{file_idx}/{total_files}] {category}/{dataset}/{split} ({file_size_mb:.0f}MB)"

    img_idx = 0  # cursor into img_files
    written = 0
    skipped = 0

    with open(jsonl_path, 'r', encoding='utf-8') as fin, \
            open(out_path, 'w', encoding='utf-8') as fout:

        pbar = tqdm(fin, total=total_lines, desc=desc, unit="行",
                    dynamic_ncols=True, miniters=100)

        for line in pbar:
            line = line.strip()
            if not line:
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                # Malformed line: silently drop (no image was extracted
                # for it either, so the cursor stays aligned).
                continue

            if not has_image(record):
                skipped += 1
                continue

            n_imgs = count_images_in_record(record)
            if img_idx + n_imgs > len(img_files):
                # Ran out of extracted images — extraction likely failed
                # partway; skip the remainder to avoid misalignment.
                skipped += 1
                continue

            # Paths for this record's image(s), consumed from the cursor.
            if n_imgs == 1:
                img_path = os.path.join(img_dir, img_files[img_idx])
                img_paths = [img_path]
            else:
                img_paths = [os.path.join(img_dir, img_files[img_idx + i])
                             for i in range(n_imgs)]
                img_path = img_paths[0]

            # VLM description for the (first) image.
            desc_text = detail.get(img_path, "")
            # Per-image descriptions when the record is multi-image.
            if n_imgs > 1:
                descs = [detail.get(p, "") for p in img_paths]
            else:
                descs = None

            # Build the lightweight index record.
            idx_record = {
                "image": img_path,
                "question": extract_text(record, QUESTION_FIELDS),
                "answer": extract_text(record, ANSWER_FIELDS),
                "description": desc_text,
                "category": category,
                "dataset": dataset,
                "split": split,
            }

            # Multi-image extras.
            if n_imgs > 1:
                idx_record["images"] = img_paths
                idx_record["descriptions"] = descs

            # Original record id, when present.
            rid = extract_id(record)
            if rid:
                idx_record["id"] = rid

            # Slimmed meta (large/base64 fields removed).
            meta = extract_meta(record)
            if meta:
                idx_record["meta"] = meta

            # Pass instructions through untouched, when present.
            insts = record.get("instructions")
            if isinstance(insts, list) and insts:
                idx_record["instructions"] = insts

            fout.write(json.dumps(idx_record, ensure_ascii=False) + "\n")
            written += 1
            img_idx += n_imgs

            pbar.set_postfix(written=written, imgs=img_idx, skip=skipped, refresh=False)

        pbar.close()

    print(f" -> {written} 条, 用了 {img_idx} 张图, 跳过 {skipped} 行")
    if img_idx != len(img_files):
        # Cursor did not consume every image: the JSONL and the extracted
        # images are out of sync.
        print(f" [WARN] 图片游标 {img_idx} != 图片总数 {len(img_files)}")
    return written
|
| 314 |
+
|
| 315 |
+
|
| 316 |
+
def find_all_jsonl_files():
    """Enumerate the raw JSONL files under DATA_ROOT, in sorted order.

    Dated snapshots (*_YYYY-MM-DD.jsonl), _v2/_new variants and para_*
    files are excluded — only the canonical source files are indexed.
    """
    dated = re.compile(r'_\d{4}-\d{2}-\d{2}\.jsonl$')
    kept = []
    for path in sorted(glob.glob(os.path.join(DATA_ROOT, "*/*/*.jsonl"))):
        name = os.path.basename(path)
        if dated.search(name):
            continue
        if '_v2.jsonl' in name or '_new.jsonl' in name:
            continue
        if name.startswith('para_'):
            continue
        kept.append(path)
    return kept
|
| 328 |
+
|
| 329 |
+
|
| 330 |
+
def group_by_split(files):
    """Group JSONL paths by (category, dataset, split), preserving order.

    Files belonging to the same split stay in input order, because
    extract_images.py consumed them in exactly that order and the index
    cursor must follow suit.
    """
    from collections import OrderedDict
    grouped = OrderedDict()
    for path in files:
        pieces = os.path.relpath(path, DATA_ROOT).split(os.sep)
        if len(pieces) < 3:
            # Not shaped {category}/{dataset}/file — ignore.
            continue
        key = (pieces[0], pieces[1], classify_split(pieces[2]))
        grouped.setdefault(key, []).append(path)
    return grouped
|
| 345 |
+
|
| 346 |
+
|
| 347 |
+
def process_group(jsonl_files, category, dataset, split, group_idx, total_groups,
                  force=False):
    """Process a group of JSONL files belonging to one split.

    All files in *jsonl_files* are walked in order with a single image
    cursor (img_idx) over the sorted extracted-image directory, so record
    order must match the extraction order. Resume logic: when the index
    file already exists and its line count equals the image count, the
    group is skipped (unless *force*).

    Returns the number of index records written (or the existing count on
    skip).
    """
    out_path = os.path.join(INDEX_ROOT, category, dataset, f"{split}.jsonl")

    # Resume check: skip only when index line count == image file count.
    if not force and os.path.exists(out_path) and os.path.getsize(out_path) > 0:
        existing = sum(1 for _ in open(out_path, 'r', encoding='utf-8'))
        img_dir = os.path.join(IMAGES_ROOT, category, dataset, split)
        if os.path.isdir(img_dir):
            img_count = len([f for f in os.listdir(img_dir) if os.path.isfile(os.path.join(img_dir, f))])
            if existing == img_count:
                print(f" [SKIP] {category}/{dataset}/{split} 索引完整 ({existing}/{img_count})")
                return existing
            else:
                print(f" [REDO] {category}/{dataset}/{split} 索引不完整 ({existing}/{img_count}), 重建")

    img_dir = os.path.join(IMAGES_ROOT, category, dataset, split)
    if not os.path.isdir(img_dir):
        print(f" [SKIP] 无图片目录: {img_dir}")
        return 0

    # Sorted so positions line up with extraction order.
    img_files = sorted([f for f in os.listdir(img_dir) if os.path.isfile(os.path.join(img_dir, f))])
    if not img_files:
        print(f" [SKIP] 图片目录为空: {img_dir}")
        return 0

    # VLM description cache (may be empty).
    detail = load_detail(category, dataset, split)

    os.makedirs(os.path.dirname(out_path), exist_ok=True)

    img_idx = 0  # image cursor, accumulated ACROSS files in the group
    written = 0

    with open(out_path, 'w', encoding='utf-8') as fout:
        for fi, jsonl_path in enumerate(jsonl_files):
            total_lines = count_lines(jsonl_path)
            file_size_mb = os.path.getsize(jsonl_path) / (1024 * 1024)
            fn = os.path.basename(jsonl_path)
            # Show the filename only when the group has several files.
            if len(jsonl_files) > 1:
                desc = f"[{group_idx}/{total_groups}] {category}/{dataset}/{split} ({fn}, {file_size_mb:.0f}MB)"
            else:
                desc = f"[{group_idx}/{total_groups}] {category}/{dataset}/{split} ({file_size_mb:.0f}MB)"

            skipped = 0
            with open(jsonl_path, 'r', encoding='utf-8') as fin:
                pbar = tqdm(fin, total=total_lines, desc=desc, unit="行",
                            dynamic_ncols=True, miniters=100)
                for line in pbar:
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        record = json.loads(line)
                    except json.JSONDecodeError:
                        # Malformed line: drop (no image was extracted for
                        # it, so the cursor stays aligned).
                        continue

                    if not has_image(record):
                        skipped += 1
                        continue

                    n_imgs = count_images_in_record(record)
                    if img_idx + n_imgs > len(img_files):
                        # Ran out of extracted images; skip to avoid
                        # misaligned pairings.
                        skipped += 1
                        continue

                    # Consume this record's image path(s) from the cursor.
                    if n_imgs == 1:
                        img_path = os.path.join(img_dir, img_files[img_idx])
                        img_paths = [img_path]
                    else:
                        img_paths = [os.path.join(img_dir, img_files[img_idx + i])
                                     for i in range(n_imgs)]
                        img_path = img_paths[0]

                    desc_text = detail.get(img_path, "")

                    # Lightweight index record (no base64 payloads).
                    idx_record = {
                        "image": img_path,
                        "question": extract_text(record, QUESTION_FIELDS),
                        "answer": extract_text(record, ANSWER_FIELDS),
                        "description": desc_text,
                        "category": category,
                        "dataset": dataset,
                        "split": split,
                    }

                    # Multi-image extras: all paths plus per-image captions.
                    if n_imgs > 1:
                        idx_record["images"] = img_paths
                        idx_record["descriptions"] = [detail.get(p, "") for p in img_paths]

                    rid = extract_id(record)
                    if rid:
                        idx_record["id"] = rid

                    meta = extract_meta(record)
                    if meta:
                        idx_record["meta"] = meta

                    insts = record.get("instructions")
                    if isinstance(insts, list) and insts:
                        idx_record["instructions"] = insts

                    fout.write(json.dumps(idx_record, ensure_ascii=False) + "\n")
                    written += 1
                    img_idx += n_imgs

                    pbar.set_postfix(written=written, imgs=img_idx, skip=skipped, refresh=False)
                pbar.close()

    print(f" -> {written} 条, 用了 {img_idx}/{len(img_files)} 张图")
    if img_idx != len(img_files):
        # Cursor did not consume every image: data and extracted images
        # are out of sync for this split.
        print(f" [WARN] 图片游标 {img_idx} != 图片总数 {len(img_files)}")
    return written
|
| 460 |
+
|
| 461 |
+
|
| 462 |
+
def main():
    """CLI entry point: build index JSONLs for all (or selected) datasets.

    Accepts an optional positional target ("cat/ds" glob under DATA_ROOT
    or a direct file path) and a --force flag to rebuild complete indexes.
    """
    print("=" * 60)
    print("生成索引 JSONL (图片路径 + 文本 + VLM描述)")
    print(f"原始数据: {DATA_ROOT}")
    print(f"图片目录: {IMAGES_ROOT}")
    print(f"描述缓存: {DETAIL_ROOT}")
    print(f"输出索引: {INDEX_ROOT}")
    print("=" * 60)

    # Hand-rolled arg handling: --force anywhere, first remaining arg is
    # the optional target.
    force = "--force" in sys.argv
    args = [a for a in sys.argv[1:] if a != "--force"]

    if args:
        target = args[0]
        if os.path.isfile(target):
            files = [target]
        else:
            # Same exclusion rules as find_all_jsonl_files (dated
            # snapshots, _v2/_new variants, para_* files).
            files = sorted(glob.glob(os.path.join(DATA_ROOT, target, "*.jsonl")))
            files = [f for f in files
                     if not re.search(r'_\d{4}-\d{2}-\d{2}\.jsonl$', os.path.basename(f))
                     and '_v2.jsonl' not in os.path.basename(f)
                     and '_new.jsonl' not in os.path.basename(f)
                     and not os.path.basename(f).startswith('para_')]
    else:
        files = find_all_jsonl_files()

    groups = group_by_split(files)
    print(f"\n共 {len(groups)} 个 split 组 ({len(files)} 个文件):")
    for (cat, ds, split), flist in groups.items():
        for f in flist:
            size_mb = os.path.getsize(f) / (1024 * 1024)
            print(f" {cat}/{ds}/{split}: {os.path.basename(f):40s} {size_mb:>10.1f} MB")

    total = 0
    for i, ((cat, ds, split), flist) in enumerate(groups.items(), 1):
        n = process_group(flist, cat, ds, split, i, len(groups), force=force)
        total += n

    print(f"\n{'=' * 60}")
    print(f"全部完成!共生成 {total} 条索引记录")
    print(f"保存在: {INDEX_ROOT}")
|
| 503 |
+
|
| 504 |
+
|
| 505 |
+
# Script entry point.
if __name__ == "__main__":
    main()
|
ICL/build_sft.py
ADDED
|
@@ -0,0 +1,466 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
构建单步决策 SFT 数据集(轻量版,只存引用路径)。
|
| 4 |
+
|
| 5 |
+
输入:
|
| 6 |
+
/workspace/xiaobin/dataset/index/{cat}/{ds}/{split}.jsonl (索引)
|
| 7 |
+
/workspace/xiaobin/dataset/embeddings/{cat}_{ds}_top5.json (预计算相似图)
|
| 8 |
+
/workspace/xiaobin/dataset/caption_cache/{cat}_{ds}.json (VLM描述)
|
| 9 |
+
/workspace/xiaobin/dataset/index/{cat}/{ds}/instructions.json
|
| 10 |
+
|
| 11 |
+
输出:
|
| 12 |
+
/workspace/xiaobin/dataset/sft/{cat}/sft.part{shard}.jsonl
|
| 13 |
+
/workspace/xiaobin/dataset/sft/all/sft.jsonl (合并后)
|
| 14 |
+
|
| 15 |
+
每条记录格式(不含conversation,由train.py动态构建):
|
| 16 |
+
{
|
| 17 |
+
"type": "ret" | "ans",
|
| 18 |
+
"query_image": "/path/to/query.jpg",
|
| 19 |
+
"question": "...",
|
| 20 |
+
"answer": "...",
|
| 21 |
+
"instruction": "...",
|
| 22 |
+
"shots": [{"image": "...", "caption": "..."}],
|
| 23 |
+
"next_description": "...", # 仅 ret 类型
|
| 24 |
+
"category": "vqa",
|
| 25 |
+
"dataset": "vqav2"
|
| 26 |
+
}
|
| 27 |
+
|
| 28 |
+
用法:
|
| 29 |
+
python3 build_sft.py # 全部
|
| 30 |
+
python3 build_sft.py --categories vqa # 单类
|
| 31 |
+
python3 build_sft.py --shard-id 0 --num-shards 4 # 分片
|
| 32 |
+
python3 build_sft.py --merge --shuffle # 合并
|
| 33 |
+
"""
|
| 34 |
+
|
| 35 |
+
import argparse
|
| 36 |
+
import json
|
| 37 |
+
import os
|
| 38 |
+
import random
|
| 39 |
+
from pathlib import Path
|
| 40 |
+
from typing import Dict, List, Optional, Tuple
|
| 41 |
+
|
| 42 |
+
# Progress bars are optional: when tqdm is not installed, substitute a
# pass-through that simply hands back the iterable unchanged.
try:
    from tqdm import tqdm
except ImportError:
    def tqdm(iterable, **_kwargs):
        """No-op stand-in for tqdm: returns *iterable* as-is."""
        return iterable
|
| 47 |
+
|
| 48 |
+
# ---------------------------------------------------------------------------
|
| 49 |
+
# 默认路径
|
| 50 |
+
# ---------------------------------------------------------------------------
|
| 51 |
+
INDEX_ROOT = "/workspace/xiaobin/dataset/index"
|
| 52 |
+
EMBEDDINGS_DIR = "/workspace/xiaobin/dataset/embeddings"
|
| 53 |
+
CAPTION_CACHE_DIR = "/workspace/xiaobin/dataset/caption_cache"
|
| 54 |
+
OUTPUT_DIR = "/workspace/xiaobin/dataset/sft"
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
# ---------------------------------------------------------------------------
|
| 58 |
+
# 数据加载
|
| 59 |
+
# ---------------------------------------------------------------------------
|
| 60 |
+
def discover_datasets(index_root: str, categories: List[str]) -> List[Tuple[str, str]]:
    """Enumerate every (category, dataset) directory pair under *index_root*.

    When *categories* is non-empty, only those category names are kept;
    an empty list means "all categories". Non-directory entries are ignored.
    Results are sorted by category, then dataset name.
    """
    pairs: List[Tuple[str, str]] = []
    for category in sorted(os.listdir(index_root)):
        if categories and category not in categories:
            continue
        category_path = os.path.join(index_root, category)
        if not os.path.isdir(category_path):
            continue
        pairs.extend(
            (category, dataset)
            for dataset in sorted(os.listdir(category_path))
            if os.path.isdir(os.path.join(category_path, dataset))
        )
    return pairs
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def load_index(index_root: str, cat: str, ds: str, split: str) -> List[Dict]:
    """Read the index JSONL for one dataset split.

    Returns [] when the file does not exist. Blank lines, unparseable
    lines, and records without an "image" field are silently dropped.
    """
    jsonl_path = os.path.join(index_root, cat, ds, f"{split}.jsonl")
    if not os.path.exists(jsonl_path):
        return []
    entries: List[Dict] = []
    with open(jsonl_path, "r", encoding="utf-8") as fh:
        for raw in fh:
            raw = raw.strip()
            if not raw:
                continue
            try:
                parsed = json.loads(raw)
                # Keep only records that reference an image.
                if parsed.get("image"):
                    entries.append(parsed)
            except Exception:
                continue
    return entries
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
def load_top5(embeddings_dir: str, cat: str, ds: str, k: int = 5) -> Dict[str, List[str]]:
    """Load the precomputed top-*k* similar-image mapping for one dataset.

    Returns {} when no mapping file has been generated yet.
    """
    mapping_file = Path(embeddings_dir) / f"{cat}_{ds}_top{k}.json"
    if not mapping_file.exists():
        return {}
    with mapping_file.open("r", encoding="utf-8") as fh:
        return json.load(fh)
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
def load_captions(caption_cache_dir: str, cat: str, ds: str) -> Dict[str, str]:
    """Load the cached VLM captions as {image_path: description}.

    Best-effort loader: a missing file, malformed JSON, or an "items"
    entry that is not a dict all yield {}.
    """
    cache_path = os.path.join(caption_cache_dir, f"{cat}_{ds}.json")
    if not os.path.exists(cache_path):
        return {}
    try:
        with open(cache_path, "r", encoding="utf-8") as fh:
            payload = json.load(fh)
        captions = payload.get("items", {})
        return captions if isinstance(captions, dict) else {}
    except Exception:
        return {}
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
def load_instructions(index_root: str, cat: str, ds: str) -> List[str]:
    """Read the instruction templates for one dataset.

    Accepts either a bare JSON list or a dict keyed by "instructions",
    "instruction", or "prompts" (first list found wins). Entries are
    stringified and stripped; empty ones are dropped. Any failure → [].
    """
    inst_path = os.path.join(index_root, cat, ds, "instructions.json")
    if not os.path.exists(inst_path):
        return []
    try:
        with open(inst_path, "r", encoding="utf-8") as fh:
            payload = json.load(fh)
        if isinstance(payload, list):
            return [str(item).strip() for item in payload if str(item).strip()]
        if isinstance(payload, dict):
            # Historical files used a few different key spellings.
            for candidate in ("instructions", "instruction", "prompts"):
                entry = payload.get(candidate)
                if isinstance(entry, list):
                    return [str(item).strip() for item in entry if str(item).strip()]
        return []
    except Exception:
        return []
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
# ---------------------------------------------------------------------------
|
| 140 |
+
# 样本生成
|
| 141 |
+
# ---------------------------------------------------------------------------
|
| 142 |
+
def generate_samples(
    records: List[Dict],
    top5_map: Dict[str, List[str]],
    caption_map: Dict[str, str],
    instructions: List[str],
    cat: str, ds: str,
    rng: random.Random,
    max_shots: int = 3,
    answer_at_weights: Optional[List[float]] = None,
    target_count: int = 0,
) -> List[Dict]:
    """Generate single-step-decision SFT samples for one dataset.

    Each usable record yields one consistent RET→...→ANS trajectory:
    ``answer_at`` RET samples (one per in-context shot gathered so far)
    followed by exactly one ANS sample answering with ``answer_at`` shots.

    Args:
        records: index records; only those with an answer AND a top5_map
            entry are used.
        top5_map: query image path → precomputed similar-image paths.
        caption_map: image path → cached VLM caption (may be missing;
            a missing caption becomes "").
        instructions: instruction templates; one is drawn per record,
            falling back to a default English prompt when empty.
        cat, ds: category/dataset names stamped onto every sample.
        rng: random source — the call order below is part of the
            reproducibility contract for a given seed.
        max_shots: hard cap on the number of in-context shots.
        answer_at_weights: weights for answering at 0..len-1 shots
            (defaults to [1, 3, 3, 2], favouring multi-round RET).
        target_count: 0 = sweep every valid record once; >0 = sample
            records with replacement until roughly target_count samples.
    """
    if answer_at_weights is None:
        answer_at_weights = [1, 3, 3, 2]

    # Filter: require an answer + top5 coverage; the question may be empty
    # (captioning-style datasets have no question).
    valid = [r for r in records
             if r.get("answer") and r.get("image") in top5_map]
    if not valid:
        return []

    answer_at_values = list(range(len(answer_at_weights)))
    default_inst = "Please answer the question based on the image."
    samples = []

    # Decide the iteration source: full sweep vs random oversampling.
    if target_count > 0:
        # Sampling mode: draw with replacement, oversample 5x, truncate later.
        source = [rng.choice(valid) for _ in range(target_count * 5)]
    else:
        # Full mode: walk every valid record exactly once.
        source = valid

    for q in source:
        q_img = q["image"]
        q_question = q.get("question") or ""
        q_answer = q["answer"]

        inst = rng.choice(instructions) if instructions else default_inst

        # Draw how many shots to gather before answering, capped at max_shots.
        answer_at = rng.choices(answer_at_values, weights=answer_at_weights, k=1)[0]
        answer_at = min(answer_at, max_shots)

        top5 = top5_map.get(q_img, [])
        if answer_at > 0 and not top5:
            continue

        # Degrade gracefully when fewer neighbours exist than requested.
        if answer_at > len(top5):
            answer_at = len(top5)

        # Randomly pick answer_at shot images from the top5 neighbours.
        chosen = rng.sample(top5, answer_at) if answer_at > 0 else []

        shots = []
        for img_path in chosen:
            cap = caption_map.get(img_path, "")
            shots.append({"image": img_path, "caption": cap})

        remaining = [p for p in top5 if p not in chosen]

        # ---- Trajectory-style generation: one consistent RET→...→ANS path ----
        # answer_at=0: ANS directly (0-shot)
        # answer_at=2: RET(0-shot) → RET(1-shot) → ANS(2-shot)
        # Never emit both RET and ANS for the same (image, question, n-shot)
        # state, to avoid contradictory supervision signals.
        for n in range(answer_at):
            if n < len(chosen):
                next_desc = caption_map.get(chosen[n], "")
            elif remaining:
                next_desc = caption_map.get(rng.choice(remaining), "")
            else:
                break

            # RET sample: at n shots, the correct decision is to keep retrieving.
            samples.append({
                "type": "ret",
                "query_image": q_img,
                "question": q_question,
                "answer": q_answer,
                "instruction": inst,
                "shots": shots[:n],
                "next_description": next_desc,
                "category": cat,
                "dataset": ds,
            })

        # ANS sample: at answer_at shots, the correct decision is to answer.
        samples.append({
            "type": "ans",
            "query_image": q_img,
            "question": q_question,
            "answer": q_answer,
            "instruction": inst,
            "shots": shots[:answer_at],
            "category": cat,
            "dataset": ds,
        })

        if target_count > 0 and len(samples) >= target_count:
            break

    if target_count > 0:
        samples = samples[:target_count]

    return samples
|
| 250 |
+
|
| 251 |
+
|
| 252 |
+
# ---------------------------------------------------------------------------
|
| 253 |
+
# 文件工具
|
| 254 |
+
# ---------------------------------------------------------------------------
|
| 255 |
+
def write_jsonl(path: str, records: List[Dict]):
    """Write *records* to *path* as UTF-8 JSON Lines, creating parent dirs.

    Fix: ``os.path.dirname`` is "" for a bare filename, and
    ``os.makedirs("")`` raises ``FileNotFoundError`` — only create the
    parent directory when there actually is one.
    """
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        for r in records:
            # ensure_ascii=False keeps CJK text readable in the output file.
            f.write(json.dumps(r, ensure_ascii=False) + "\n")
|
| 260 |
+
|
| 261 |
+
|
| 262 |
+
def concat_and_shuffle(output_dir: str, categories: List[str], shuffle: bool, seed: int):
    """Merge per-category shard files into the final dataset files.

    For each category, concatenates {output_dir}/{cat}/sft.part*.jsonl into
    {output_dir}/{cat}/sft.jsonl, then concatenates all per-category files
    into {output_dir}/all/sft.jsonl. Shuffling uses a single seeded RNG so
    the merge is reproducible for a given *seed*.
    """
    rng = random.Random(seed)

    for cat in categories:
        cat_dir = os.path.join(output_dir, cat)
        if not os.path.isdir(cat_dir):
            continue
        parts = sorted(Path(cat_dir).glob("sft.part*.jsonl"))
        if not parts:
            continue
        out_path = os.path.join(cat_dir, "sft.jsonl")
        lines = []
        for p in parts:
            with open(p, "r", encoding="utf-8") as f:
                # Lines keep their trailing newline; blank lines are dropped.
                lines.extend(line for line in f if line.strip())
        if shuffle:
            rng.shuffle(lines)
        with open(out_path, "w", encoding="utf-8") as f:
            f.writelines(lines)
        print(f" [OK] {cat}: {len(lines)} 条")

    # Merge every per-category file into one combined dataset.
    all_lines = []
    for cat in categories:
        cat_file = os.path.join(output_dir, cat, "sft.jsonl")
        if os.path.exists(cat_file):
            with open(cat_file, "r", encoding="utf-8") as f:
                all_lines.extend(line for line in f if line.strip())
    if all_lines:
        if shuffle:
            rng.shuffle(all_lines)
        all_dir = os.path.join(output_dir, "all")
        os.makedirs(all_dir, exist_ok=True)
        all_path = os.path.join(all_dir, "sft.jsonl")
        with open(all_path, "w", encoding="utf-8") as f:
            f.writelines(all_lines)
        print(f" [OK] all: {len(all_lines)} 条 → {all_path}")
|
| 300 |
+
|
| 301 |
+
|
| 302 |
+
# ---------------------------------------------------------------------------
|
| 303 |
+
# Main
|
| 304 |
+
# ---------------------------------------------------------------------------
|
| 305 |
+
def main():
    """CLI entry point: build (or merge) the single-step-decision SFT dataset.

    Build mode walks every discovered (category, dataset) pair, loads the
    index / top-5 / caption / instruction artifacts, generates samples via
    ``generate_samples``, and writes one shard file per category.
    ``--merge`` instead concatenates previously written shards.
    """
    parser = argparse.ArgumentParser(description="构建单步决策 SFT 数据集")

    # Paths
    parser.add_argument("--index-root", default=INDEX_ROOT)
    parser.add_argument("--embeddings-dir", default=EMBEDDINGS_DIR)
    parser.add_argument("--caption-cache-dir", default=CAPTION_CACHE_DIR)
    parser.add_argument("--output-dir", default=OUTPUT_DIR)

    # Dataset selection
    parser.add_argument("--categories", default="vqa,captioning,classification,reasoning")
    parser.add_argument("--split", default="train", help="query 来自哪个 split")
    parser.add_argument("--top-k", type=int, default=5)

    # Sample parameters
    parser.add_argument("--samples-per-cat", type=int, default=0,
                        help="每类目标数,0=全量遍历所有记录")
    parser.add_argument("--samples-per-ds", type=int, default=0,
                        help="每个数据集最多取多少条原始记录(0=不限)")
    parser.add_argument("--max-shots", type=int, default=3)
    parser.add_argument("--answer-at-weights", default="1,3,3,2",
                        help="0/1/2/3-shot 的权重(默认 1,3,3,2,鼓励多轮 RET)")
    parser.add_argument("--seed", type=int, default=42)

    # Sharding
    parser.add_argument("--shard-id", type=int, default=0)
    parser.add_argument("--num-shards", type=int, default=1)

    # Modes
    parser.add_argument("--merge", action="store_true", help="合并分片")
    parser.add_argument("--shuffle", action="store_true", help="合并时 shuffle")

    args = parser.parse_args()
    categories = [c.strip() for c in args.categories.split(",") if c.strip()]

    # ---- Merge mode ----
    if args.merge:
        print("合并分片...")
        concat_and_shuffle(args.output_dir, categories, args.shuffle, args.seed)
        return

    # ---- Build mode ----
    aw = [float(x) for x in args.answer_at_weights.split(",") if x.strip()]
    # Offset the seed per shard so shards draw disjoint random streams.
    rng = random.Random(args.seed + args.shard_id * 1000003)

    datasets = discover_datasets(args.index_root, categories)
    print(f"共 {len(datasets)} 个数据集")

    # Group the (cat, ds) pairs by category.
    cat_datasets: Dict[str, List[Tuple[str, str]]] = {}
    for cat, ds in datasets:
        cat_datasets.setdefault(cat, []).append((cat, ds))

    for cat in categories:
        ds_list = cat_datasets.get(cat, [])
        if not ds_list:
            print(f"[SKIP] {cat}: 无数据集")
            continue

        # Load all artifacts for each dataset in this category.
        ds_data = []
        for c, d in ds_list:
            records = load_index(args.index_root, c, d, args.split)
            top5 = load_top5(args.embeddings_dir, c, d, args.top_k)
            captions = load_captions(args.caption_cache_dir, c, d)
            insts = load_instructions(args.index_root, c, d)
            if not records or not top5:
                print(f" [SKIP] {c}/{d}: records={len(records)} top5={len(top5)}")
                continue
            # Pre-check: count records with both an answer and top-5 coverage.
            n_valid = sum(1 for r in records
                          if r.get("answer") and r.get("image") in top5)
            if n_valid == 0:
                print(f" [SKIP] {c}/{d}: {len(records)} 条但无 answer+top5 覆盖")
                continue

            ds_data.append({
                "cat": c, "ds": d,
                "records": records, "top5": top5,
                "captions": captions, "instructions": insts,
            })
            # Report caption / top-5 coverage for visibility.
            n_cap = sum(1 for r in records if r.get("image") in captions)
            n_top5 = sum(1 for r in records if r.get("image") in top5)
            print(f" [OK] {c}/{d}: {len(records)} 条, "
                  f"valid={n_valid}, top5覆盖={n_top5}, caption覆盖={n_cap}, "
                  f"instructions={len(insts)}")

        if not ds_data:
            print(f"[WARN] {cat}: 无可用数据集")
            continue

        all_samples = []

        # Work out how many raw records to draw from each dataset.
        n_ds = len(ds_data)
        if args.samples_per_cat > 0:
            # Target: samples_per_cat SFT samples per category. Each record
            # yields a variable number of samples (captioning may yield
            # fewer), so oversample here and truncate to samples_per_cat below.
            records_per_ds = max(200, int(args.samples_per_cat / 1.0 / n_ds))
        elif args.samples_per_ds > 0:
            records_per_ds = args.samples_per_ds
        else:
            records_per_ds = 0  # full sweep
 
        print(f" {cat}: {n_ds} 个数据集, 每个抽 {records_per_ds} 条记录" if records_per_ds > 0
              else f" {cat}: {n_ds} 个数据集, 全量")

        for d in tqdm(ds_data, desc=f"{cat} shard{args.shard_id}"):
            recs = d["records"]

            # Subsample raw records when a per-dataset budget is set.
            if records_per_ds > 0 and len(recs) > records_per_ds:
                recs = rng.sample(recs, records_per_ds)

            samples = generate_samples(
                records=recs,
                top5_map=d["top5"],
                caption_map=d["captions"],
                instructions=d["instructions"],
                cat=d["cat"], ds=d["ds"],
                rng=rng,
                max_shots=args.max_shots,
                answer_at_weights=aw,
                target_count=0,  # sweep every drawn record
            )
            all_samples.extend(samples)

        # Truncate to the target count (only when samples-per-cat > 0).
        if args.samples_per_cat > 0 and len(all_samples) > args.samples_per_cat:
            rng.shuffle(all_samples)
            all_samples = all_samples[:args.samples_per_cat]

        # Shuffle so datasets within the category are mixed.
        rng.shuffle(all_samples)

        # Write this category's shard.
        out_path = os.path.join(args.output_dir, cat, f"sft.part{args.shard_id:02d}.jsonl")
        write_jsonl(out_path, all_samples)

        # Summary statistics: RET/ANS split and shot-count distribution.
        n_ret = sum(1 for r in all_samples if r["type"] == "ret")
        n_ans = sum(1 for r in all_samples if r["type"] == "ans")
        n_dist = {}
        for r in all_samples:
            nc = len(r.get("shots", []))
            n_dist[nc] = n_dist.get(nc, 0) + 1
        print(f"[OK] {cat} shard{args.shard_id}: {len(all_samples)} 条 "
              f"(ret={n_ret} ans={n_ans}) shot分布={dict(sorted(n_dist.items()))}")
        print(f" → {out_path}")

    # Single shard: merge + shuffle automatically, no separate --merge pass.
    if args.num_shards == 1:
        print("\n自动合并所有 category...")
        concat_and_shuffle(args.output_dir, categories, shuffle=True, seed=args.seed)

    print(f"\n完成!输出: {args.output_dir}")


if __name__ == "__main__":
    main()
|
ICL/dataset_inspect.tree.txt
ADDED
|
@@ -0,0 +1,456 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
M3IT/
|
| 2 |
+
.git/
|
| 3 |
+
data/
|
| 4 |
+
.gitattributes (2.8KB)
|
| 5 |
+
.gitignore (29.0B)
|
| 6 |
+
M3IT.py (54.5KB)
|
| 7 |
+
README.md (18.3KB)
|
| 8 |
+
branches/
|
| 9 |
+
hooks/
|
| 10 |
+
info/
|
| 11 |
+
lfs/
|
| 12 |
+
logs/
|
| 13 |
+
objects/
|
| 14 |
+
refs/
|
| 15 |
+
FETCH_HEAD (110.0B)
|
| 16 |
+
HEAD (21.0B)
|
| 17 |
+
config (339.0B)
|
| 18 |
+
description (73.0B)
|
| 19 |
+
packed-refs (112.0B)
|
| 20 |
+
refs/
|
| 21 |
+
HEAD (189.0B)
|
| 22 |
+
heads/
|
| 23 |
+
remotes/
|
| 24 |
+
main (189.0B)
|
| 25 |
+
heads/
|
| 26 |
+
remotes/
|
| 27 |
+
tags/
|
| 28 |
+
origin/
|
| 29 |
+
HEAD (30.0B)
|
| 30 |
+
main (41.0B)
|
| 31 |
+
info/
|
| 32 |
+
pack/
|
| 33 |
+
pack-ee3e40a1a23ec17affa3b8afb61dc14bdffb229c.idx (38.9KB)
|
| 34 |
+
pack-ee3e40a1a23ec17affa3b8afb61dc14bdffb229c.pack (195.5KB)
|
| 35 |
+
applypatch-msg.sample (478.0B)
|
| 36 |
+
commit-msg.sample (896.0B)
|
| 37 |
+
fsmonitor-watchman.sample (4.5KB)
|
| 38 |
+
post-checkout (280.0B)
|
| 39 |
+
post-commit (276.0B)
|
| 40 |
+
post-merge (274.0B)
|
| 41 |
+
post-update.sample (189.0B)
|
| 42 |
+
pre-applypatch.sample (424.0B)
|
| 43 |
+
pre-commit.sample (1.6KB)
|
| 44 |
+
pre-merge-commit.sample (416.0B)
|
| 45 |
+
pre-push (270.0B)
|
| 46 |
+
pre-push.sample (1.3KB)
|
| 47 |
+
pre-rebase.sample (4.8KB)
|
| 48 |
+
pre-receive.sample (544.0B)
|
| 49 |
+
prepare-commit-msg.sample (1.5KB)
|
| 50 |
+
push-to-checkout.sample (2.7KB)
|
| 51 |
+
update.sample (3.6KB)
|
| 52 |
+
incomplete/
|
| 53 |
+
logs/
|
| 54 |
+
objects/
|
| 55 |
+
tmp/
|
| 56 |
+
0152398d9443f2d300adc9e6099a773c66303d4e2e085812cd502cb36da7a0c73483193049 (0.0B)
|
| 57 |
+
0152398d9443f2d300adc9e6099a773c66303d4e2e085812cd502cb36da7a0c7763208216 (0.0B)
|
| 58 |
+
0152398d9443f2d300adc9e6099a773c66303d4e2e085812cd502cb36da7a0c789921672 (2.5MB)
|
| 59 |
+
0968a4438d46277583968011563e959e130feaee66f51bb2d66dbd7e8c979f8c.part (0.0B)
|
| 60 |
+
1f77f56225e10edca84be06b6e0d796c579cbf1d4884aee46da564438ad1ba9b1484563810 (437.0KB)
|
| 61 |
+
1f77f56225e10edca84be06b6e0d796c579cbf1d4884aee46da564438ad1ba9b3850099655 (326.7KB)
|
| 62 |
+
1f77f56225e10edca84be06b6e0d796c579cbf1d4884aee46da564438ad1ba9b3898577811 (4.1MB)
|
| 63 |
+
220d32d087b6b29d1c5aaa49324d32b32ae1c19f42e9800f40f24d3a695c2a8d1743027097 (0.0B)
|
| 64 |
+
220d32d087b6b29d1c5aaa49324d32b32ae1c19f42e9800f40f24d3a695c2a8d3014727128 (0.0B)
|
| 65 |
+
220d32d087b6b29d1c5aaa49324d32b32ae1c19f42e9800f40f24d3a695c2a8d71894927 (62.6KB)
|
| 66 |
+
24f014bb5bc7b1fa7d9183dd65fd4b43c0c49aafd6af01bb91ae3a0e7e65502b2818819757 (49.3MB)
|
| 67 |
+
3da69649bfbc671710f38c2c2f7c6aaecb8f8544de3446866054bf927257c9332854861486 (158.6KB)
|
| 68 |
+
3da69649bfbc671710f38c2c2f7c6aaecb8f8544de3446866054bf927257c9334214717938 (0.0B)
|
| 69 |
+
3da69649bfbc671710f38c2c2f7c6aaecb8f8544de3446866054bf927257c933593947826 (0.0B)
|
| 70 |
+
45e8c51ed0df8edb1ae51d2012b3f7d6cd9cc84addf41e6f9f9adb0f625d41033126870057 (259.2MB)
|
| 71 |
+
4a80559730d917177e4d13246da0ce23ca318735b29d519d0448bea5579b1a771450117433 (154.4MB)
|
| 72 |
+
4fda2aa4918e5dec847935db6d46e9bebc570a173bd4201c5f48e60a3f73813a1530155941 (1.1MB)
|
| 73 |
+
4fda2aa4918e5dec847935db6d46e9bebc570a173bd4201c5f48e60a3f73813a2738070238 (0.0B)
|
| 74 |
+
4fda2aa4918e5dec847935db6d46e9bebc570a173bd4201c5f48e60a3f73813a2828099128 (0.0B)
|
| 75 |
+
52a445f8a26cd898e64129e7f1d4bfa6d7203311442068684f5344fc73407310.part (0.0B)
|
| 76 |
+
6728a8fb7bad0bad3a2a27669232cb9ae66461c635172f1f7958c80a28e09fa32607733000 (150.2MB)
|
| 77 |
+
6bb6c9f17e77eab7d88e4a4501c38cb31a6cf792fe77e3b75d511b964a5667df2998182268 (91.8MB)
|
| 78 |
+
8cb15647ff6bbac322142fea1a38599c523f73acb3614ddb7d12e6a1975a79dc1986657385 (0.0B)
|
| 79 |
+
8cb15647ff6bbac322142fea1a38599c523f73acb3614ddb7d12e6a1975a79dc2743098052 (0.0B)
|
| 80 |
+
8cb15647ff6bbac322142fea1a38599c523f73acb3614ddb7d12e6a1975a79dc4193739161 (0.0B)
|
| 81 |
+
9919274ad6bc88e37235a4c7245d05e357e404ef3352a90a1ba0594e694893c01114223911 (0.0B)
|
| 82 |
+
9919274ad6bc88e37235a4c7245d05e357e404ef3352a90a1ba0594e694893c03545613611 (0.0B)
|
| 83 |
+
9919274ad6bc88e37235a4c7245d05e357e404ef3352a90a1ba0594e694893c0559090370 (2.8MB)
|
| 84 |
+
9cdf4d1a6972db893c8db1a4f2be0d1ec0362ba22a44542402b336760029c87253830692 (88.0MB)
|
| 85 |
+
b6aed90c79d180c5346994f8e7d0657b3d8a9aab002c057503736b4013a2096b.part (0.0B)
|
| 86 |
+
ba47b9680dc949322877399218d1f210a057249803bc70addfb9528152e4b1662004000729 (218.5MB)
|
| 87 |
+
ca49e0b3f3400f38519a1103b2a567db32c9fa990a7395b1024b94454601479b.part (0.0B)
|
| 88 |
+
d66a5b3267a7935b8ff272bcc166a8f43a8d66fb89c59503d536ac87661a02022501429466 (0.0B)
|
| 89 |
+
d66a5b3267a7935b8ff272bcc166a8f43a8d66fb89c59503d536ac87661a020230475132 (0.0B)
|
| 90 |
+
d66a5b3267a7935b8ff272bcc166a8f43a8d66fb89c59503d536ac87661a0202373225118 (62.5KB)
|
| 91 |
+
e5a3eb3e2d0c47d6f014e294ef7398bf26375920c8d2af80fd65e255396dcc78.part (0.0B)
|
| 92 |
+
f19cacf3a9f9a57abdcafc4a6d242aa9c6fa48188ad0a394b1a2558cb8ab4dc5372340294 (199.2MB)
|
| 93 |
+
20251021T152133.441099492.log (1.4KB)
|
| 94 |
+
01/
|
| 95 |
+
02/
|
| 96 |
+
03/
|
| 97 |
+
05/
|
| 98 |
+
06/
|
| 99 |
+
07/
|
| 100 |
+
09/
|
| 101 |
+
0b/
|
| 102 |
+
0f/
|
| 103 |
+
10/
|
| 104 |
+
12/
|
| 105 |
+
15/
|
| 106 |
+
16/
|
| 107 |
+
19/
|
| 108 |
+
1d/
|
| 109 |
+
1e/
|
| 110 |
+
1f/
|
| 111 |
+
21/
|
| 112 |
+
22/
|
| 113 |
+
23/
|
| 114 |
+
24/
|
| 115 |
+
2a/
|
| 116 |
+
2b/
|
| 117 |
+
2c/
|
| 118 |
+
2d/
|
| 119 |
+
2f/
|
| 120 |
+
30/
|
| 121 |
+
32/
|
| 122 |
+
34/
|
| 123 |
+
37/
|
| 124 |
+
3b/
|
| 125 |
+
3d/
|
| 126 |
+
44/
|
| 127 |
+
45/
|
| 128 |
+
4a/
|
| 129 |
+
4f/
|
| 130 |
+
50/
|
| 131 |
+
52/
|
| 132 |
+
54/
|
| 133 |
+
56/
|
| 134 |
+
58/
|
| 135 |
+
5a/
|
| 136 |
+
5b/
|
| 137 |
+
60/
|
| 138 |
+
61/
|
| 139 |
+
64/
|
| 140 |
+
65/
|
| 141 |
+
67/
|
| 142 |
+
68/
|
| 143 |
+
69/
|
| 144 |
+
6b/
|
| 145 |
+
6d/
|
| 146 |
+
6e/
|
| 147 |
+
70/
|
| 148 |
+
75/
|
| 149 |
+
76/
|
| 150 |
+
7b/
|
| 151 |
+
7c/
|
| 152 |
+
80/
|
| 153 |
+
87/
|
| 154 |
+
88/
|
| 155 |
+
89/
|
| 156 |
+
8b/
|
| 157 |
+
8c/
|
| 158 |
+
90/
|
| 159 |
+
91/
|
| 160 |
+
93/
|
| 161 |
+
99/
|
| 162 |
+
9a/
|
| 163 |
+
9b/
|
| 164 |
+
9c/
|
| 165 |
+
9e/
|
| 166 |
+
9f/
|
| 167 |
+
a0/
|
| 168 |
+
a5/
|
| 169 |
+
a9/
|
| 170 |
+
ac/
|
| 171 |
+
ae/
|
| 172 |
+
b1/
|
| 173 |
+
b3/
|
| 174 |
+
b4/
|
| 175 |
+
b6/
|
| 176 |
+
ba/
|
| 177 |
+
bb/
|
| 178 |
+
bc/
|
| 179 |
+
bd/
|
| 180 |
+
be/
|
| 181 |
+
c0/
|
| 182 |
+
c1/
|
| 183 |
+
c2/
|
| 184 |
+
c4/
|
| 185 |
+
c6/
|
| 186 |
+
c7/
|
| 187 |
+
c8/
|
| 188 |
+
ca/
|
| 189 |
+
cb/
|
| 190 |
+
d6/
|
| 191 |
+
d9/
|
| 192 |
+
dd/
|
| 193 |
+
e2/
|
| 194 |
+
e5/
|
| 195 |
+
e7/
|
| 196 |
+
e8/
|
| 197 |
+
e9/
|
| 198 |
+
ee/
|
| 199 |
+
ef/
|
| 200 |
+
f1/
|
| 201 |
+
f3/
|
| 202 |
+
f4/
|
| 203 |
+
f5/
|
| 204 |
+
f6/
|
| 205 |
+
f7/
|
| 206 |
+
f8/
|
| 207 |
+
f9/
|
| 208 |
+
fc/
|
| 209 |
+
exclude (240.0B)
|
| 210 |
+
captioning/
|
| 211 |
+
classification/
|
| 212 |
+
generation/
|
| 213 |
+
reasoning/
|
| 214 |
+
vqa/
|
| 215 |
+
chinesefoodnet-10/
|
| 216 |
+
coco-goi/
|
| 217 |
+
coco-text/
|
| 218 |
+
imagenet/
|
| 219 |
+
iqa/
|
| 220 |
+
itm/
|
| 221 |
+
mocheg/
|
| 222 |
+
refcoco/
|
| 223 |
+
snli-ve/
|
| 224 |
+
ss/
|
| 225 |
+
vsr/
|
| 226 |
+
winoground/
|
| 227 |
+
.gitattributes (141.0B)
|
| 228 |
+
README.md (211.0B)
|
| 229 |
+
instructions.json (1.4KB)
|
| 230 |
+
labels.json (9.0KB)
|
| 231 |
+
test.jsonl (223.5MB)
|
| 232 |
+
train.jsonl (238.9MB)
|
| 233 |
+
val.jsonl (227.6MB)
|
| 234 |
+
README.md (31.0B)
|
| 235 |
+
esnlive_test.jsonl (743.0MB)
|
| 236 |
+
esnlive_train.jsonl (1000.8MB)
|
| 237 |
+
esnlive_val.jsonl (717.9MB)
|
| 238 |
+
instructions.json (1.9KB)
|
| 239 |
+
test_2023-10-09.jsonl (2.9GB)
|
| 240 |
+
train_2023-10-09.jsonl (3.9GB)
|
| 241 |
+
instructions.json (825.0B)
|
| 242 |
+
mapping.txt (30.9KB)
|
| 243 |
+
test_2023-10-08.jsonl (10.6GB)
|
| 244 |
+
train.jsonl (1.5GB)
|
| 245 |
+
train_2023-10-08.jsonl (5.9GB)
|
| 246 |
+
val.jsonl (2.6GB)
|
| 247 |
+
instructions.json (907.0B)
|
| 248 |
+
test.jsonl (330.4MB)
|
| 249 |
+
test_2023-10-09.jsonl (1.3GB)
|
| 250 |
+
train.jsonl (1.9GB)
|
| 251 |
+
train_2023-10-08.jsonl (7.8GB)
|
| 252 |
+
val.jsonl (330.8MB)
|
| 253 |
+
instructions.json (773.0B)
|
| 254 |
+
test.jsonl (730.0MB)
|
| 255 |
+
test_2023-10-09.jsonl (2.9GB)
|
| 256 |
+
train.jsonl (4.3GB)
|
| 257 |
+
train_2023-10-08.jsonl (17.1GB)
|
| 258 |
+
val.jsonl (730.2MB)
|
| 259 |
+
instructions.json (1.4KB)
|
| 260 |
+
test_2023-10-09.jsonl (553.7MB)
|
| 261 |
+
train_2023-10-09.jsonl (1.9GB)
|
| 262 |
+
vsr_test.jsonl (137.7MB)
|
| 263 |
+
vsr_train.jsonl (483.3MB)
|
| 264 |
+
vsr_val.jsonl (68.8MB)
|
| 265 |
+
instructions.json (774.0B)
|
| 266 |
+
test_2023-10-10.jsonl (7.6GB)
|
| 267 |
+
train.jsonl (8.2GB)
|
| 268 |
+
train_2023-10-08.jsonl (32.8GB)
|
| 269 |
+
val.jsonl (1.9GB)
|
| 270 |
+
instructions.json (733.0B)
|
| 271 |
+
test_2023-10-07.jsonl (279.1MB)
|
| 272 |
+
train.jsonl (2.0GB)
|
| 273 |
+
train_2023-10-06.jsonl (4.1GB)
|
| 274 |
+
val.jsonl (138.9MB)
|
| 275 |
+
instructions.json (2.0KB)
|
| 276 |
+
winoground_test.jsonl (245.5MB)
|
| 277 |
+
instructions.json (1.3KB)
|
| 278 |
+
test.jsonl (122.9MB)
|
| 279 |
+
instructions.json (1.0KB)
|
| 280 |
+
mocheg_test.jsonl (60.3MB)
|
| 281 |
+
mocheg_train.jsonl (631.7MB)
|
| 282 |
+
mocheg_val.jsonl (28.2MB)
|
| 283 |
+
test_2023-10-08.jsonl (242.5MB)
|
| 284 |
+
train_2023-10-08.jsonl (2.5GB)
|
| 285 |
+
instructions.json (1.5KB)
|
| 286 |
+
test.jsonl (701.9MB)
|
| 287 |
+
test_2023-10-08.jsonl (2.7GB)
|
| 288 |
+
train.jsonl (3.9GB)
|
| 289 |
+
train_2023-10-08.jsonl (15.6GB)
|
| 290 |
+
val.jsonl (667.7MB)
|
| 291 |
+
clevr/
|
| 292 |
+
nlvr/
|
| 293 |
+
science_qa/
|
| 294 |
+
vcr/
|
| 295 |
+
visual_mrc/
|
| 296 |
+
instructions.json (2.5KB)
|
| 297 |
+
science_qa_test.jsonl (174.0MB)
|
| 298 |
+
science_qa_train.jsonl (531.3MB)
|
| 299 |
+
science_qa_validation.jsonl (176.4MB)
|
| 300 |
+
instructions.json (976.0B)
|
| 301 |
+
train.jsonl (5.6GB)
|
| 302 |
+
train_2023-10-07.jsonl (11.1GB)
|
| 303 |
+
val.jsonl (379.6MB)
|
| 304 |
+
val_2023-10-07.jsonl (760.4MB)
|
| 305 |
+
instructions.json (911.0B)
|
| 306 |
+
test.jsonl (1.2GB)
|
| 307 |
+
train.jsonl (3.9GB)
|
| 308 |
+
val.jsonl (266.9MB)
|
| 309 |
+
instructions.json (1.3KB)
|
| 310 |
+
test.jsonl (909.3MB)
|
| 311 |
+
train.jsonl (4.3GB)
|
| 312 |
+
val.jsonl (992.9MB)
|
| 313 |
+
instructions.json (1.2KB)
|
| 314 |
+
test.jsonl (489.0MB)
|
| 315 |
+
train.jsonl (7.9GB)
|
| 316 |
+
val.jsonl (533.3MB)
|
| 317 |
+
mmchat/
|
| 318 |
+
multi30k/
|
| 319 |
+
vist/
|
| 320 |
+
visual_dialog/
|
| 321 |
+
instructions.json (818.0B)
|
| 322 |
+
test.jsonl (65.2MB)
|
| 323 |
+
test_2023-10-10.jsonl (262.2MB)
|
| 324 |
+
train.jsonl (3.2GB)
|
| 325 |
+
train_2023-10-09.jsonl (13.0GB)
|
| 326 |
+
val.jsonl (66.0MB)
|
| 327 |
+
instructions.json (1.2KB)
|
| 328 |
+
test.jsonl (610.6MB)
|
| 329 |
+
train.jsonl (4.4GB)
|
| 330 |
+
val.jsonl (301.1MB)
|
| 331 |
+
instructions.json (809.0B)
|
| 332 |
+
test.jsonl (2.3GB)
|
| 333 |
+
train.jsonl (6.2GB)
|
| 334 |
+
train_new.jsonl (6.2GB)
|
| 335 |
+
validation.jsonl (2.0GB)
|
| 336 |
+
instructions.json (1.0KB)
|
| 337 |
+
test.jsonl (14.0GB)
|
| 338 |
+
train.jsonl (15.4GB)
|
| 339 |
+
val.jsonl (13.0GB)
|
| 340 |
+
a-okvqa/
|
| 341 |
+
activitynet-qa/
|
| 342 |
+
docvqa/
|
| 343 |
+
fm-iqa/
|
| 344 |
+
gqa/
|
| 345 |
+
ivqa/
|
| 346 |
+
msrvtt-qa/
|
| 347 |
+
msvd-qa/
|
| 348 |
+
ocr-vqa/
|
| 349 |
+
okvqa/
|
| 350 |
+
shapes/
|
| 351 |
+
st-vqa/
|
| 352 |
+
text-vqa/
|
| 353 |
+
viquae/
|
| 354 |
+
vqav2/
|
| 355 |
+
instruction.json (905.0B)
|
| 356 |
+
train.jsonl (533.5MB)
|
| 357 |
+
train_new.jsonl (533.5MB)
|
| 358 |
+
validation.jsonl (228.3MB)
|
| 359 |
+
instructions.json (1.9KB)
|
| 360 |
+
train.jsonl (1.2GB)
|
| 361 |
+
train_v2.jsonl (1.2GB)
|
| 362 |
+
val.jsonl (77.7MB)
|
| 363 |
+
val_v2.jsonl (78.2MB)
|
| 364 |
+
instruction.json (905.0B)
|
| 365 |
+
test.jsonl (713.3MB)
|
| 366 |
+
train.jsonl (3.3GB)
|
| 367 |
+
validation_new.jsonl (529.5MB)
|
| 368 |
+
instruction.json (772.0B)
|
| 369 |
+
train.jsonl (1.5GB)
|
| 370 |
+
validation.jsonl (260.3MB)
|
| 371 |
+
instruction.json (853.0B)
|
| 372 |
+
test.jsonl (229.4MB)
|
| 373 |
+
train.jsonl (1.4GB)
|
| 374 |
+
README.md (288.0B)
|
| 375 |
+
instructions.json (1.2KB)
|
| 376 |
+
test.jsonl (132.4MB)
|
| 377 |
+
train.jsonl (343.1MB)
|
| 378 |
+
val.jsonl (60.9MB)
|
| 379 |
+
instructions.json (853.0B)
|
| 380 |
+
train.jsonl (1.9GB)
|
| 381 |
+
val.jsonl (1.9GB)
|
| 382 |
+
instructions.json (1.7KB)
|
| 383 |
+
train.jsonl (7.2GB)
|
| 384 |
+
val.jsonl (976.6MB)
|
| 385 |
+
instructions.json (1.5KB)
|
| 386 |
+
test.jsonl (1.4MB)
|
| 387 |
+
test_2023-10-08.jsonl (7.0MB)
|
| 388 |
+
train.large.jsonl (18.3MB)
|
| 389 |
+
train_2023-10-08.jsonl (92.6MB)
|
| 390 |
+
val.jsonl (1.4MB)
|
| 391 |
+
README.md (334.0B)
|
| 392 |
+
instructions.json (1.0KB)
|
| 393 |
+
test.jsonl (500.8MB)
|
| 394 |
+
train.jsonl (1.5GB)
|
| 395 |
+
val.jsonl (485.4MB)
|
| 396 |
+
README.md (434.0B)
|
| 397 |
+
instructions.json (1.0KB)
|
| 398 |
+
test.jsonl (348.1MB)
|
| 399 |
+
train.jsonl (757.5MB)
|
| 400 |
+
val.jsonl (58.0MB)
|
| 401 |
+
.gitattributes (141.0B)
|
| 402 |
+
README.md (332.0B)
|
| 403 |
+
instructions.json (1.4KB)
|
| 404 |
+
test.jsonl (474.7MB)
|
| 405 |
+
train.jsonl (2.1GB)
|
| 406 |
+
val.jsonl (1.1GB)
|
| 407 |
+
instructions.json (1.2KB)
|
| 408 |
+
train.jsonl (594.8MB)
|
| 409 |
+
train_v2.jsonl (596.3MB)
|
| 410 |
+
val.jsonl (334.3MB)
|
| 411 |
+
val_v2.jsonl (335.2MB)
|
| 412 |
+
instructions.json (802.0B)
|
| 413 |
+
para_train.jsonl (10.5GB)
|
| 414 |
+
para_val.jsonl (4.8GB)
|
| 415 |
+
train.jsonl (10.5GB)
|
| 416 |
+
val.jsonl (4.8GB)
|
| 417 |
+
instructions.json (1.2KB)
|
| 418 |
+
test.jsonl (122.5MB)
|
| 419 |
+
test_v2.jsonl (120.9MB)
|
| 420 |
+
train.jsonl (110.1MB)
|
| 421 |
+
train_v2.jsonl (110.2MB)
|
| 422 |
+
validation.jsonl (125.5MB)
|
| 423 |
+
validation_v2.jsonl (125.6MB)
|
| 424 |
+
coco/
|
| 425 |
+
coco-cn/
|
| 426 |
+
flickr8k-cn/
|
| 427 |
+
image_paragraph_captioning/
|
| 428 |
+
msrvtt/
|
| 429 |
+
textcap/
|
| 430 |
+
.gitattributes (141.0B)
|
| 431 |
+
README.md (490.0B)
|
| 432 |
+
instructions.json (1010.0B)
|
| 433 |
+
test.jsonl (117.1MB)
|
| 434 |
+
train.jsonl (231.1MB)
|
| 435 |
+
val.jsonl (116.9MB)
|
| 436 |
+
instructions.json (541.0B)
|
| 437 |
+
test.jsonl (49.4MB)
|
| 438 |
+
train.jsonl (300.0MB)
|
| 439 |
+
val.jsonl (49.9MB)
|
| 440 |
+
instructions.json (790.0B)
|
| 441 |
+
test.jsonl (66.4MB)
|
| 442 |
+
train.jsonl (1.2GB)
|
| 443 |
+
val.jsonl (65.0MB)
|
| 444 |
+
image_paragraph_captioning_test.jsonl (120.7MB)
|
| 445 |
+
image_paragraph_captioning_train.jsonl (701.2MB)
|
| 446 |
+
image_paragraph_captioning_val.jsonl (118.0MB)
|
| 447 |
+
instruction.json (1.4KB)
|
| 448 |
+
README.md (73.0B)
|
| 449 |
+
create_dataset.py (5.5KB)
|
| 450 |
+
instructions.json (882.0B)
|
| 451 |
+
test.jsonl (333.1MB)
|
| 452 |
+
train.jsonl (7.4GB)
|
| 453 |
+
val.jsonl (333.4MB)
|
| 454 |
+
instructions.json (1.1KB)
|
| 455 |
+
train.jsonl (5.7GB)
|
| 456 |
+
val.jsonl (851.3MB)
|
ICL/eval_icl.py
ADDED
|
@@ -0,0 +1,524 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
ICL 推理评测脚本:模拟多轮 RET/ANS 决策循环。支持多卡并行。
|
| 4 |
+
|
| 5 |
+
流程:
|
| 6 |
+
1. 给模型 query_image + question(0-shot)
|
| 7 |
+
2. 模型输出 <RET> → 用预计算 top5 检索下一张图+caption,追加到 context,再问
|
| 8 |
+
3. 模型输出 <ANS> → 结束,提取答案
|
| 9 |
+
4. 最多 max_rounds 轮(防止一直 RET)
|
| 10 |
+
|
| 11 |
+
多卡策略:
|
| 12 |
+
每张 GPU 加载一份模型,按 dataset 粒度分配任务,最后 rank 0 汇总。
|
| 13 |
+
|
| 14 |
+
用法:
|
| 15 |
+
# 单卡
|
| 16 |
+
python3 eval_icl.py \
|
| 17 |
+
--model-path /workspace/xiaobin/ICL/sft_model/merged_hf \
|
| 18 |
+
--category vqa --dataset vqav2 --split val \
|
| 19 |
+
--num-samples 200 --max-rounds 4 --device cuda:0
|
| 20 |
+
|
| 21 |
+
# 多卡(8 GPU)
|
| 22 |
+
torchrun --nproc_per_node=8 eval_icl.py \
|
| 23 |
+
--model-path /workspace/xiaobin/ICL/sft_model/merged_hf \
|
| 24 |
+
--all-categories --num-samples 100 --max-rounds 4
|
| 25 |
+
"""
|
| 26 |
+
|
| 27 |
+
import argparse
|
| 28 |
+
import json
|
| 29 |
+
import os
|
| 30 |
+
import random
|
| 31 |
+
import sys
|
| 32 |
+
import time
|
| 33 |
+
from collections import defaultdict
|
| 34 |
+
from typing import Dict, List, Optional, Tuple
|
| 35 |
+
|
| 36 |
+
import torch
|
| 37 |
+
import torch.distributed as dist
|
| 38 |
+
from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
|
| 39 |
+
from qwen_vl_utils import process_vision_info
|
| 40 |
+
|
| 41 |
+
# ---------------------------------------------------------------------------
|
| 42 |
+
# 默认路径
|
| 43 |
+
# ---------------------------------------------------------------------------
|
| 44 |
+
INDEX_ROOT = "/workspace/xiaobin/dataset/index"
|
| 45 |
+
EMBEDDINGS_DIR = "/workspace/xiaobin/dataset/embeddings"
|
| 46 |
+
CAPTION_CACHE_DIR = "/workspace/xiaobin/dataset/caption_cache"
|
| 47 |
+
|
| 48 |
+
# ---------------------------------------------------------------------------
|
| 49 |
+
# 分布式工具
|
| 50 |
+
# ---------------------------------------------------------------------------
|
| 51 |
+
|
| 52 |
+
def setup_distributed():
|
| 53 |
+
"""初始化分布式环境,返回 (rank, world_size, device)。
|
| 54 |
+
单卡时 rank=0, world_size=1。"""
|
| 55 |
+
if "RANK" in os.environ and "WORLD_SIZE" in os.environ:
|
| 56 |
+
rank = int(os.environ["RANK"])
|
| 57 |
+
world_size = int(os.environ["WORLD_SIZE"])
|
| 58 |
+
local_rank = int(os.environ.get("LOCAL_RANK", rank))
|
| 59 |
+
dist.init_process_group("nccl")
|
| 60 |
+
torch.cuda.set_device(local_rank)
|
| 61 |
+
device = f"cuda:{local_rank}"
|
| 62 |
+
else:
|
| 63 |
+
rank, world_size = 0, 1
|
| 64 |
+
device = None # 由 args.device 决定
|
| 65 |
+
return rank, world_size, device
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
def gather_results(local_results: List[Dict], rank: int, world_size: int) -> List[Dict]:
|
| 69 |
+
"""把各 rank 的结果汇总到 rank 0。"""
|
| 70 |
+
if world_size == 1:
|
| 71 |
+
return local_results
|
| 72 |
+
|
| 73 |
+
# 序列化 → bytes → tensor
|
| 74 |
+
data = json.dumps(local_results, ensure_ascii=False).encode("utf-8")
|
| 75 |
+
size = torch.tensor([len(data)], dtype=torch.long, device=f"cuda:{rank}")
|
| 76 |
+
|
| 77 |
+
# 收集各 rank 的大小
|
| 78 |
+
size_list = [torch.zeros(1, dtype=torch.long, device=f"cuda:{rank}") for _ in range(world_size)]
|
| 79 |
+
dist.all_gather(size_list, size)
|
| 80 |
+
max_size = max(s.item() for s in size_list)
|
| 81 |
+
|
| 82 |
+
# pad 到相同长度
|
| 83 |
+
padded = data + b"\x00" * (max_size - len(data))
|
| 84 |
+
tensor = torch.ByteTensor(list(padded)).cuda(rank)
|
| 85 |
+
|
| 86 |
+
tensor_list = [torch.zeros(max_size, dtype=torch.uint8, device=f"cuda:{rank}") for _ in range(world_size)]
|
| 87 |
+
dist.all_gather(tensor_list, tensor)
|
| 88 |
+
|
| 89 |
+
if rank == 0:
|
| 90 |
+
all_results = []
|
| 91 |
+
for i, (t, s) in enumerate(zip(tensor_list, size_list)):
|
| 92 |
+
raw = bytes(t[:s.item()].cpu().tolist())
|
| 93 |
+
all_results.extend(json.loads(raw.decode("utf-8")))
|
| 94 |
+
return all_results
|
| 95 |
+
return []
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
# ---------------------------------------------------------------------------
|
| 99 |
+
# 数据加载
|
| 100 |
+
# ---------------------------------------------------------------------------
|
| 101 |
+
|
| 102 |
+
def load_records(cat: str, ds: str, split: str, limit: int = 0) -> List[Dict]:
|
| 103 |
+
path = os.path.join(INDEX_ROOT, cat, ds, f"{split}.jsonl")
|
| 104 |
+
if not os.path.exists(path):
|
| 105 |
+
return []
|
| 106 |
+
records = []
|
| 107 |
+
with open(path, "r", encoding="utf-8") as f:
|
| 108 |
+
for line in f:
|
| 109 |
+
line = line.strip()
|
| 110 |
+
if not line:
|
| 111 |
+
continue
|
| 112 |
+
r = json.loads(line)
|
| 113 |
+
if r.get("image") and r.get("answer"):
|
| 114 |
+
records.append(r)
|
| 115 |
+
if limit and len(records) >= limit:
|
| 116 |
+
break
|
| 117 |
+
return records
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
def load_top5(cat: str, ds: str) -> Dict[str, List[str]]:
|
| 121 |
+
path = os.path.join(EMBEDDINGS_DIR, f"{cat}_{ds}_top5.json")
|
| 122 |
+
if not os.path.exists(path):
|
| 123 |
+
return {}
|
| 124 |
+
with open(path, "r", encoding="utf-8") as f:
|
| 125 |
+
return json.load(f)
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
def load_caption_cache(cat: str, ds: str) -> Dict[str, str]:
|
| 129 |
+
path = os.path.join(CAPTION_CACHE_DIR, f"{cat}_{ds}.json")
|
| 130 |
+
if not os.path.exists(path):
|
| 131 |
+
return {}
|
| 132 |
+
with open(path, "r", encoding="utf-8") as f:
|
| 133 |
+
return json.load(f)
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
def load_instructions(cat: str, ds: str) -> List[str]:
|
| 137 |
+
path = os.path.join(INDEX_ROOT, cat, ds, "instructions.json")
|
| 138 |
+
if not os.path.exists(path):
|
| 139 |
+
return ["Look at the image and answer the question."]
|
| 140 |
+
with open(path, "r", encoding="utf-8") as f:
|
| 141 |
+
return json.load(f)
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
def discover_datasets(categories: List[str]) -> List[Tuple[str, str]]:
|
| 145 |
+
results = []
|
| 146 |
+
for cat in sorted(os.listdir(INDEX_ROOT)):
|
| 147 |
+
if categories and cat not in categories:
|
| 148 |
+
continue
|
| 149 |
+
cat_dir = os.path.join(INDEX_ROOT, cat)
|
| 150 |
+
if not os.path.isdir(cat_dir):
|
| 151 |
+
continue
|
| 152 |
+
for ds in sorted(os.listdir(cat_dir)):
|
| 153 |
+
ds_dir = os.path.join(cat_dir, ds)
|
| 154 |
+
if os.path.isdir(ds_dir):
|
| 155 |
+
results.append((cat, ds))
|
| 156 |
+
return results
|
| 157 |
+
|
| 158 |
+
|
| 159 |
+
# ---------------------------------------------------------------------------
|
| 160 |
+
# 模型加载
|
| 161 |
+
# ---------------------------------------------------------------------------
|
| 162 |
+
|
| 163 |
+
def load_model(model_path: str, device: str):
|
| 164 |
+
print(f"[{device}] Loading model from {model_path} ...")
|
| 165 |
+
processor = AutoProcessor.from_pretrained(
|
| 166 |
+
model_path, trust_remote_code=True,
|
| 167 |
+
min_pixels=256 * 28 * 28,
|
| 168 |
+
max_pixels=1280 * 28 * 28,
|
| 169 |
+
)
|
| 170 |
+
|
| 171 |
+
model = Qwen3VLForConditionalGeneration.from_pretrained(
|
| 172 |
+
model_path,
|
| 173 |
+
trust_remote_code=True,
|
| 174 |
+
torch_dtype=torch.bfloat16,
|
| 175 |
+
attn_implementation="flash_attention_2",
|
| 176 |
+
device_map=device,
|
| 177 |
+
)
|
| 178 |
+
|
| 179 |
+
special_tokens = ["<RET>", "<ANS>", "</ANS>", "<RETQ>", "</RETQ>"]
|
| 180 |
+
num_added = processor.tokenizer.add_tokens(special_tokens, special_tokens=True)
|
| 181 |
+
if num_added > 0:
|
| 182 |
+
model.resize_token_embeddings(len(processor.tokenizer))
|
| 183 |
+
|
| 184 |
+
model.eval()
|
| 185 |
+
|
| 186 |
+
ret_id = processor.tokenizer.convert_tokens_to_ids("<RET>")
|
| 187 |
+
ans_id = processor.tokenizer.convert_tokens_to_ids("<ANS>")
|
| 188 |
+
print(f"[{device}] Ready. <RET>={ret_id}, <ANS>={ans_id}")
|
| 189 |
+
|
| 190 |
+
return model, processor
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
# ---------------------------------------------------------------------------
|
| 194 |
+
# 推理核心
|
| 195 |
+
# ---------------------------------------------------------------------------
|
| 196 |
+
|
| 197 |
+
def build_messages(
|
| 198 |
+
instruction: str,
|
| 199 |
+
query_image: str,
|
| 200 |
+
question: Optional[str],
|
| 201 |
+
shots: List[Dict],
|
| 202 |
+
min_pixels: int = 256 * 28 * 28,
|
| 203 |
+
max_pixels: int = 1280 * 28 * 28,
|
| 204 |
+
) -> List[Dict]:
|
| 205 |
+
user_content = []
|
| 206 |
+
|
| 207 |
+
if instruction:
|
| 208 |
+
user_content.append({"type": "text", "text": instruction})
|
| 209 |
+
|
| 210 |
+
user_content.append({
|
| 211 |
+
"type": "image",
|
| 212 |
+
"image": f"file://{query_image}",
|
| 213 |
+
"min_pixels": min_pixels, "max_pixels": max_pixels,
|
| 214 |
+
})
|
| 215 |
+
|
| 216 |
+
if question:
|
| 217 |
+
user_content.append({"type": "text", "text": f"Question: {question}"})
|
| 218 |
+
|
| 219 |
+
for shot in shots:
|
| 220 |
+
user_content.append({
|
| 221 |
+
"type": "image",
|
| 222 |
+
"image": f"file://{shot['image']}",
|
| 223 |
+
"min_pixels": min_pixels, "max_pixels": max_pixels,
|
| 224 |
+
})
|
| 225 |
+
if shot.get("caption"):
|
| 226 |
+
user_content.append({"type": "text", "text": f"Caption: {shot['caption']}"})
|
| 227 |
+
|
| 228 |
+
user_content.append({"type": "text", "text": "Action:"})
|
| 229 |
+
return [{"role": "user", "content": user_content}]
|
| 230 |
+
|
| 231 |
+
|
| 232 |
+
@torch.no_grad()
|
| 233 |
+
def generate_action(model, processor, messages: List[Dict], max_new_tokens: int = 256) -> str:
|
| 234 |
+
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
| 235 |
+
|
| 236 |
+
image_inputs = None
|
| 237 |
+
try:
|
| 238 |
+
image_inputs, _ = process_vision_info(messages)
|
| 239 |
+
except Exception:
|
| 240 |
+
pass
|
| 241 |
+
|
| 242 |
+
inputs = processor(
|
| 243 |
+
text=[text],
|
| 244 |
+
images=image_inputs if image_inputs else None,
|
| 245 |
+
return_tensors="pt",
|
| 246 |
+
padding=False,
|
| 247 |
+
truncation=False,
|
| 248 |
+
)
|
| 249 |
+
|
| 250 |
+
device = next(model.parameters()).device
|
| 251 |
+
inputs = {k: v.to(device) if hasattr(v, 'to') else v for k, v in inputs.items()}
|
| 252 |
+
|
| 253 |
+
outputs = model.generate(
|
| 254 |
+
**inputs,
|
| 255 |
+
max_new_tokens=max_new_tokens,
|
| 256 |
+
do_sample=False,
|
| 257 |
+
temperature=None,
|
| 258 |
+
top_p=None,
|
| 259 |
+
)
|
| 260 |
+
|
| 261 |
+
input_len = inputs["input_ids"].shape[1]
|
| 262 |
+
generated = outputs[0][input_len:]
|
| 263 |
+
return processor.tokenizer.decode(generated, skip_special_tokens=False)
|
| 264 |
+
|
| 265 |
+
|
| 266 |
+
def parse_action(text: str) -> Tuple[str, str]:
|
| 267 |
+
text = text.strip()
|
| 268 |
+
|
| 269 |
+
if text.startswith("<RET>"):
|
| 270 |
+
desc = text[len("<RET>"):].strip()
|
| 271 |
+
if desc.startswith("Description:"):
|
| 272 |
+
desc = desc[len("Description:"):].strip()
|
| 273 |
+
for tok in ["<|im_end|>", "</s>", "<|endoftext|>"]:
|
| 274 |
+
desc = desc.replace(tok, "").strip()
|
| 275 |
+
return "ret", desc
|
| 276 |
+
|
| 277 |
+
if text.startswith("<ANS>"):
|
| 278 |
+
ans = text[len("<ANS>"):]
|
| 279 |
+
end_idx = ans.find("</ANS>")
|
| 280 |
+
if end_idx != -1:
|
| 281 |
+
ans = ans[:end_idx]
|
| 282 |
+
else:
|
| 283 |
+
for tok in ["<|im_end|>", "</s>", "<|endoftext|>"]:
|
| 284 |
+
ans = ans.replace(tok, "").strip()
|
| 285 |
+
return "ans", ans.strip()
|
| 286 |
+
|
| 287 |
+
return "unknown", text
|
| 288 |
+
|
| 289 |
+
|
| 290 |
+
def run_icl_loop(
|
| 291 |
+
model, processor,
|
| 292 |
+
record: Dict,
|
| 293 |
+
instruction: str,
|
| 294 |
+
top5: Dict[str, List[str]],
|
| 295 |
+
caption_cache: Dict[str, str],
|
| 296 |
+
max_rounds: int = 4,
|
| 297 |
+
) -> Dict:
|
| 298 |
+
query_image = record["image"]
|
| 299 |
+
question = record.get("question", "")
|
| 300 |
+
gt_answer = record.get("answer", "")
|
| 301 |
+
|
| 302 |
+
shots = []
|
| 303 |
+
used_images = {query_image}
|
| 304 |
+
rounds = []
|
| 305 |
+
candidates = top5.get(query_image, [])
|
| 306 |
+
|
| 307 |
+
for round_idx in range(max_rounds):
|
| 308 |
+
messages = build_messages(instruction, query_image, question, shots)
|
| 309 |
+
raw_output = generate_action(model, processor, messages)
|
| 310 |
+
action, content = parse_action(raw_output)
|
| 311 |
+
|
| 312 |
+
rounds.append({
|
| 313 |
+
"round": round_idx,
|
| 314 |
+
"action": action,
|
| 315 |
+
"content": content,
|
| 316 |
+
"raw": raw_output[:200],
|
| 317 |
+
})
|
| 318 |
+
|
| 319 |
+
if action == "ans":
|
| 320 |
+
return {
|
| 321 |
+
"image": query_image, "question": question,
|
| 322 |
+
"gt_answer": gt_answer, "rounds": rounds,
|
| 323 |
+
"final_answer": content, "num_rounds": round_idx + 1,
|
| 324 |
+
"terminated_by": "ans",
|
| 325 |
+
}
|
| 326 |
+
|
| 327 |
+
if action == "ret":
|
| 328 |
+
next_image = None
|
| 329 |
+
for c in candidates:
|
| 330 |
+
if c not in used_images:
|
| 331 |
+
next_image = c
|
| 332 |
+
break
|
| 333 |
+
|
| 334 |
+
if next_image is None:
|
| 335 |
+
return {
|
| 336 |
+
"image": query_image, "question": question,
|
| 337 |
+
"gt_answer": gt_answer, "rounds": rounds,
|
| 338 |
+
"final_answer": None, "num_rounds": round_idx + 1,
|
| 339 |
+
"terminated_by": "no_more_shots",
|
| 340 |
+
}
|
| 341 |
+
|
| 342 |
+
cap = caption_cache.get(next_image, content)
|
| 343 |
+
shots.append({"image": next_image, "caption": cap})
|
| 344 |
+
used_images.add(next_image)
|
| 345 |
+
else:
|
| 346 |
+
return {
|
| 347 |
+
"image": query_image, "question": question,
|
| 348 |
+
"gt_answer": gt_answer, "rounds": rounds,
|
| 349 |
+
"final_answer": content, "num_rounds": round_idx + 1,
|
| 350 |
+
"terminated_by": "unknown_action",
|
| 351 |
+
}
|
| 352 |
+
|
| 353 |
+
return {
|
| 354 |
+
"image": query_image, "question": question,
|
| 355 |
+
"gt_answer": gt_answer, "rounds": rounds,
|
| 356 |
+
"final_answer": None, "num_rounds": max_rounds,
|
| 357 |
+
"terminated_by": "max_rounds",
|
| 358 |
+
}
|
| 359 |
+
|
| 360 |
+
|
| 361 |
+
# ---------------------------------------------------------------------------
|
| 362 |
+
# 统计
|
| 363 |
+
# ---------------------------------------------------------------------------
|
| 364 |
+
|
| 365 |
+
def print_stats(results: List[Dict], cat: str = "", ds: str = ""):
|
| 366 |
+
prefix = f"[{cat}/{ds}]" if ds else f"[{cat}]" if cat else "[ALL]"
|
| 367 |
+
n = len(results)
|
| 368 |
+
if n == 0:
|
| 369 |
+
print(f"{prefix} 无结果")
|
| 370 |
+
return
|
| 371 |
+
|
| 372 |
+
term_counts = defaultdict(int)
|
| 373 |
+
for r in results:
|
| 374 |
+
term_counts[r["terminated_by"]] += 1
|
| 375 |
+
|
| 376 |
+
round_actions = defaultdict(lambda: defaultdict(int))
|
| 377 |
+
for r in results:
|
| 378 |
+
for rd in r["rounds"]:
|
| 379 |
+
round_actions[rd["round"]][rd["action"]] += 1
|
| 380 |
+
|
| 381 |
+
avg_rounds = sum(r["num_rounds"] for r in results) / n
|
| 382 |
+
|
| 383 |
+
print(f"\n{'='*60}")
|
| 384 |
+
print(f"{prefix} 共 {n} 条样本")
|
| 385 |
+
print(f" 平均轮次: {avg_rounds:.2f}")
|
| 386 |
+
print(f" 终止原因:")
|
| 387 |
+
for k, v in sorted(term_counts.items()):
|
| 388 |
+
print(f" {k}: {v} ({v/n*100:.1f}%)")
|
| 389 |
+
|
| 390 |
+
print(f" 每轮 RET/ANS 分布:")
|
| 391 |
+
for rd_idx in sorted(round_actions.keys()):
|
| 392 |
+
actions = round_actions[rd_idx]
|
| 393 |
+
total = sum(actions.values())
|
| 394 |
+
parts = [f"{a}={c}({c/total*100:.0f}%)" for a, c in sorted(actions.items())]
|
| 395 |
+
print(f" Round {rd_idx}: {' | '.join(parts)} (共{total}条)")
|
| 396 |
+
|
| 397 |
+
answered = [r for r in results if r["final_answer"] is not None]
|
| 398 |
+
print(f" 产出答案: {len(answered)}/{n} ({len(answered)/n*100:.1f}%)")
|
| 399 |
+
print(f"{'='*60}")
|
| 400 |
+
|
| 401 |
+
|
| 402 |
+
# ---------------------------------------------------------------------------
|
| 403 |
+
# Main
|
| 404 |
+
# ---------------------------------------------------------------------------
|
| 405 |
+
|
| 406 |
+
def main():
|
| 407 |
+
parser = argparse.ArgumentParser(description="ICL 多轮推理评测(支持多卡)")
|
| 408 |
+
parser.add_argument("--model-path", required=True, help="合并后的 HF 模型路径")
|
| 409 |
+
parser.add_argument("--category", type=str, default="")
|
| 410 |
+
parser.add_argument("--dataset", type=str, default="")
|
| 411 |
+
parser.add_argument("--split", type=str, default="val")
|
| 412 |
+
parser.add_argument("--all-categories", action="store_true")
|
| 413 |
+
parser.add_argument("--num-samples", type=int, default=100, help="每个 dataset 采样数")
|
| 414 |
+
parser.add_argument("--max-rounds", type=int, default=4)
|
| 415 |
+
parser.add_argument("--device", type=str, default="cuda:0", help="单卡时用的设备")
|
| 416 |
+
parser.add_argument("--output", type=str, default="")
|
| 417 |
+
parser.add_argument("--seed", type=int, default=42)
|
| 418 |
+
args = parser.parse_args()
|
| 419 |
+
|
| 420 |
+
random.seed(args.seed)
|
| 421 |
+
|
| 422 |
+
# 分布式初始化
|
| 423 |
+
rank, world_size, dist_device = setup_distributed()
|
| 424 |
+
device = dist_device or args.device
|
| 425 |
+
is_main = (rank == 0)
|
| 426 |
+
|
| 427 |
+
if is_main:
|
| 428 |
+
print(f"World size: {world_size}")
|
| 429 |
+
|
| 430 |
+
# 加载模型(每张卡一份)
|
| 431 |
+
model, processor = load_model(args.model_path, device)
|
| 432 |
+
|
| 433 |
+
# 确定 dataset 列表
|
| 434 |
+
if args.all_categories:
|
| 435 |
+
categories = ["vqa", "captioning", "classification", "reasoning"]
|
| 436 |
+
elif args.category:
|
| 437 |
+
categories = [args.category]
|
| 438 |
+
else:
|
| 439 |
+
categories = ["vqa"]
|
| 440 |
+
|
| 441 |
+
if args.dataset:
|
| 442 |
+
ds_list = [(args.category or "vqa", args.dataset)]
|
| 443 |
+
else:
|
| 444 |
+
ds_list = discover_datasets(categories)
|
| 445 |
+
|
| 446 |
+
# ---- 按 rank 分配 dataset ----
|
| 447 |
+
my_ds_list = ds_list[rank::world_size]
|
| 448 |
+
if is_main:
|
| 449 |
+
print(f"共 {len(ds_list)} 个 dataset,每卡约 {len(my_ds_list)} 个")
|
| 450 |
+
|
| 451 |
+
local_results = []
|
| 452 |
+
|
| 453 |
+
for cat, ds in my_ds_list:
|
| 454 |
+
print(f"[rank {rank}] Evaluating {cat}/{ds} ({args.split})")
|
| 455 |
+
|
| 456 |
+
records = load_records(cat, ds, args.split, limit=args.num_samples * 5)
|
| 457 |
+
if not records:
|
| 458 |
+
print(f" [rank {rank}] 跳过 {cat}/{ds}:无记录")
|
| 459 |
+
continue
|
| 460 |
+
|
| 461 |
+
top5 = load_top5(cat, ds)
|
| 462 |
+
if not top5:
|
| 463 |
+
print(f" [rank {rank}] 跳过 {cat}/{ds}:无 top5")
|
| 464 |
+
continue
|
| 465 |
+
|
| 466 |
+
caption_cache = load_caption_cache(cat, ds)
|
| 467 |
+
instructions = load_instructions(cat, ds)
|
| 468 |
+
|
| 469 |
+
records = [r for r in records if r["image"] in top5]
|
| 470 |
+
if not records:
|
| 471 |
+
print(f" [rank {rank}] 跳过 {cat}/{ds}:无 top5 覆盖")
|
| 472 |
+
continue
|
| 473 |
+
|
| 474 |
+
if len(records) > args.num_samples:
|
| 475 |
+
records = random.sample(records, args.num_samples)
|
| 476 |
+
print(f" [rank {rank}] {cat}/{ds}: {len(records)} 条")
|
| 477 |
+
|
| 478 |
+
for i, rec in enumerate(records):
|
| 479 |
+
inst = random.choice(instructions)
|
| 480 |
+
result = run_icl_loop(
|
| 481 |
+
model, processor, rec, inst, top5, caption_cache,
|
| 482 |
+
max_rounds=args.max_rounds,
|
| 483 |
+
)
|
| 484 |
+
result["category"] = cat
|
| 485 |
+
result["dataset"] = ds
|
| 486 |
+
local_results.append(result)
|
| 487 |
+
|
| 488 |
+
if (i + 1) % 10 == 0 or (i + 1) == len(records):
|
| 489 |
+
action_seq = " → ".join(rd["action"].upper() for rd in result["rounds"])
|
| 490 |
+
print(f" [rank {rank}] [{i+1}/{len(records)}] {action_seq} | "
|
| 491 |
+
f"{result['terminated_by']}")
|
| 492 |
+
|
| 493 |
+
# ---- 汇总结果 ----
|
| 494 |
+
all_results = gather_results(local_results, rank, world_size)
|
| 495 |
+
|
| 496 |
+
if is_main:
|
| 497 |
+
# 按 category 统计
|
| 498 |
+
cat_results = defaultdict(list)
|
| 499 |
+
for r in all_results:
|
| 500 |
+
cat_results[r["category"]].append(r)
|
| 501 |
+
|
| 502 |
+
for cat in categories:
|
| 503 |
+
if cat_results[cat]:
|
| 504 |
+
# 按 dataset 子统计
|
| 505 |
+
ds_groups = defaultdict(list)
|
| 506 |
+
for r in cat_results[cat]:
|
| 507 |
+
ds_groups[r["dataset"]].append(r)
|
| 508 |
+
for d in sorted(ds_groups):
|
| 509 |
+
print_stats(ds_groups[d], cat, d)
|
| 510 |
+
print_stats(cat_results[cat], cat)
|
| 511 |
+
|
| 512 |
+
print_stats(all_results)
|
| 513 |
+
|
| 514 |
+
output_path = args.output or f"/workspace/xiaobin/ICL/eval_results_{args.split}.json"
|
| 515 |
+
with open(output_path, "w", encoding="utf-8") as f:
|
| 516 |
+
json.dump(all_results, f, ensure_ascii=False, indent=2)
|
| 517 |
+
print(f"\n详细结果已保存到: {output_path}")
|
| 518 |
+
|
| 519 |
+
if world_size > 1:
|
| 520 |
+
dist.destroy_process_group()
|
| 521 |
+
|
| 522 |
+
|
| 523 |
+
if __name__ == "__main__":
|
| 524 |
+
main()
|
ICL/extract_images.py
ADDED
|
@@ -0,0 +1,231 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
从 /workspace/xiaobin/dataset/data 下所有 JSONL 文件中提取 base64 编码的图片,
|
| 4 |
+
保存到 /workspace/xiaobin/dataset/images/{category}/{dataset}/{split}/ 目录。
|
| 5 |
+
|
| 6 |
+
split 由文件名推断:含 train -> train, 含 test -> test, 含 val/validation -> val
|
| 7 |
+
|
| 8 |
+
图片字段名自动检测,支持:
|
| 9 |
+
image_str, image_base64_str, img_str, base64, image_base64, image_base_url,
|
| 10 |
+
video_str (list), images (list)
|
| 11 |
+
|
| 12 |
+
依赖:无需额外安装(tqdm 已有)
|
| 13 |
+
|
| 14 |
+
用法:
|
| 15 |
+
python3 extract_images.py # 处理全部
|
| 16 |
+
python3 extract_images.py vqa/shapes # 只处理某个数据集
|
| 17 |
+
python3 extract_images.py /path/to/some.jsonl # 只处理某个文件
|
| 18 |
+
"""
|
| 19 |
+
|
| 20 |
+
import os
|
| 21 |
+
import sys
|
| 22 |
+
import json
|
| 23 |
+
import base64
|
| 24 |
+
import glob
|
| 25 |
+
import re
|
| 26 |
+
from tqdm import tqdm
|
| 27 |
+
|
| 28 |
+
DATA_ROOT = "/workspace/xiaobin/dataset/data"
|
| 29 |
+
OUTPUT_ROOT = "/workspace/xiaobin/dataset/images"
|
| 30 |
+
|
| 31 |
+
# 所有可能的图片字段名(优先级顺序)
|
| 32 |
+
# 注意:有些字段在不同数据集中可能是 str 也可能是 list,统一处理
|
| 33 |
+
ALL_IMAGE_FIELDS = [
|
| 34 |
+
"image", # captioning/coco
|
| 35 |
+
"image_str", # 多个数据集(str 或 list)
|
| 36 |
+
"image_base64_str", # snli-ve, multi30k, vcr, visual_mrc
|
| 37 |
+
"img_str", # gqa, ocr-vqa, st-vqa, text-vqa, viquae, vqav2
|
| 38 |
+
"base64", # fm-iqa
|
| 39 |
+
"image_base64", # coco-cn, mmchat(str 或 list,如 chinesefoodnet-10)
|
| 40 |
+
"image_base_url", # textcap
|
| 41 |
+
"video_str", # msrvtt, ss, activitynet-qa, ivqa, msrvtt-qa, msvd-qa (list)
|
| 42 |
+
"images", # vist (list)
|
| 43 |
+
]
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def detect_extension(data_bytes):
|
| 47 |
+
"""根据文件头判断图片格式"""
|
| 48 |
+
if data_bytes[:2] == b'\xff\xd8':
|
| 49 |
+
return ".jpg"
|
| 50 |
+
elif data_bytes[:8] == b'\x89PNG\r\n\x1a\n':
|
| 51 |
+
return ".png"
|
| 52 |
+
elif data_bytes[:4] == b'GIF8':
|
| 53 |
+
return ".gif"
|
| 54 |
+
elif data_bytes[:4] == b'RIFF' and data_bytes[8:12] == b'WEBP':
|
| 55 |
+
return ".webp"
|
| 56 |
+
else:
|
| 57 |
+
return ".jpg"
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def classify_split(filename):
|
| 61 |
+
"""从文件名推断 split 类型"""
|
| 62 |
+
fn = filename.lower()
|
| 63 |
+
if "train" in fn:
|
| 64 |
+
return "train"
|
| 65 |
+
elif "test" in fn:
|
| 66 |
+
return "test"
|
| 67 |
+
elif "val" in fn:
|
| 68 |
+
return "val"
|
| 69 |
+
else:
|
| 70 |
+
return "other"
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def extract_images_from_record(record):
|
| 74 |
+
"""从一条 JSONL 记录中提取图片 base64 字符串列表"""
|
| 75 |
+
for field in ALL_IMAGE_FIELDS:
|
| 76 |
+
if field not in record or not record[field]:
|
| 77 |
+
continue
|
| 78 |
+
val = record[field]
|
| 79 |
+
if isinstance(val, str) and len(val) > 100:
|
| 80 |
+
return [val]
|
| 81 |
+
elif isinstance(val, list):
|
| 82 |
+
return [item for item in val if isinstance(item, str) and len(item) > 100]
|
| 83 |
+
return []
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
def count_lines(filepath):
|
| 87 |
+
"""快速统计文件行数(用于 tqdm total)"""
|
| 88 |
+
count = 0
|
| 89 |
+
with open(filepath, 'rb') as f:
|
| 90 |
+
# 用 buffer 读取,比逐行快很多
|
| 91 |
+
buf_size = 1024 * 1024 * 8 # 8MB
|
| 92 |
+
buf = f.raw.read(buf_size)
|
| 93 |
+
while buf:
|
| 94 |
+
count += buf.count(b'\n')
|
| 95 |
+
buf = f.raw.read(buf_size)
|
| 96 |
+
return count
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
def process_jsonl_file(jsonl_path, file_idx, total_files):
|
| 100 |
+
"""处理单个 JSONL 文件,提取图片并保存"""
|
| 101 |
+
rel_path = os.path.relpath(jsonl_path, DATA_ROOT)
|
| 102 |
+
parts = rel_path.split(os.sep)
|
| 103 |
+
if len(parts) < 3:
|
| 104 |
+
print(f" [SKIP] 路径层级不够: {rel_path}")
|
| 105 |
+
return 0
|
| 106 |
+
|
| 107 |
+
category = parts[0]
|
| 108 |
+
dataset = parts[1]
|
| 109 |
+
filename = parts[2]
|
| 110 |
+
split = classify_split(filename)
|
| 111 |
+
|
| 112 |
+
out_dir = os.path.join(OUTPUT_ROOT, category, dataset, split)
|
| 113 |
+
os.makedirs(out_dir, exist_ok=True)
|
| 114 |
+
|
| 115 |
+
# 断点续传:统计已有图片数
|
| 116 |
+
existing_count = len([f for f in os.listdir(out_dir) if os.path.isfile(os.path.join(out_dir, f))])
|
| 117 |
+
|
| 118 |
+
# 快速统计总行数
|
| 119 |
+
file_size_mb = os.path.getsize(jsonl_path) / (1024 * 1024)
|
| 120 |
+
total_lines = count_lines(jsonl_path)
|
| 121 |
+
|
| 122 |
+
count = 0
|
| 123 |
+
skipped = 0
|
| 124 |
+
errors = 0
|
| 125 |
+
|
| 126 |
+
desc = f"[{file_idx}/{total_files}] {category}/{dataset}/{split} ({file_size_mb:.0f}MB)"
|
| 127 |
+
|
| 128 |
+
try:
|
| 129 |
+
with open(jsonl_path, 'r', encoding='utf-8') as f:
|
| 130 |
+
pbar = tqdm(f, total=total_lines, desc=desc, unit="行",
|
| 131 |
+
dynamic_ncols=True, miniters=50)
|
| 132 |
+
for line in pbar:
|
| 133 |
+
line = line.strip()
|
| 134 |
+
if not line:
|
| 135 |
+
continue
|
| 136 |
+
|
| 137 |
+
try:
|
| 138 |
+
record = json.loads(line)
|
| 139 |
+
except json.JSONDecodeError:
|
| 140 |
+
errors += 1
|
| 141 |
+
continue
|
| 142 |
+
|
| 143 |
+
b64_list = extract_images_from_record(record)
|
| 144 |
+
if not b64_list:
|
| 145 |
+
skipped += 1
|
| 146 |
+
continue
|
| 147 |
+
|
| 148 |
+
for img_idx, b64_str in enumerate(b64_list):
|
| 149 |
+
global_idx = existing_count + count
|
| 150 |
+
try:
|
| 151 |
+
img_bytes = base64.b64decode(b64_str)
|
| 152 |
+
ext = detect_extension(img_bytes)
|
| 153 |
+
if len(b64_list) > 1:
|
| 154 |
+
img_name = f"{global_idx:08d}_f{img_idx:03d}{ext}"
|
| 155 |
+
else:
|
| 156 |
+
img_name = f"{global_idx:08d}{ext}"
|
| 157 |
+
with open(os.path.join(out_dir, img_name), 'wb') as img_f:
|
| 158 |
+
img_f.write(img_bytes)
|
| 159 |
+
count += 1
|
| 160 |
+
except Exception as e:
|
| 161 |
+
errors += 1
|
| 162 |
+
if errors <= 3:
|
| 163 |
+
tqdm.write(f" [ERROR] {e}")
|
| 164 |
+
|
| 165 |
+
# 更新后缀信息
|
| 166 |
+
pbar.set_postfix(imgs=count, skip=skipped, err=errors, refresh=False)
|
| 167 |
+
pbar.close()
|
| 168 |
+
|
| 169 |
+
except Exception as e:
|
| 170 |
+
print(f" [FATAL] {e}")
|
| 171 |
+
|
| 172 |
+
print(f" -> 完成: {count} 张图片, 跳过 {skipped} 行(无图), 错误 {errors}")
|
| 173 |
+
return count
|
| 174 |
+
|
| 175 |
+
|
| 176 |
+
def find_all_jsonl_files():
|
| 177 |
+
"""查找所有需要处理的 JSONL 文件"""
|
| 178 |
+
all_files = []
|
| 179 |
+
for jsonl_path in sorted(glob.glob(os.path.join(DATA_ROOT, "*/*/*.jsonl"))):
|
| 180 |
+
filename = os.path.basename(jsonl_path)
|
| 181 |
+
if re.search(r'_\d{4}-\d{2}-\d{2}\.jsonl$', filename):
|
| 182 |
+
continue
|
| 183 |
+
if '_v2.jsonl' in filename or '_new.jsonl' in filename:
|
| 184 |
+
continue
|
| 185 |
+
if filename.startswith('para_'):
|
| 186 |
+
continue
|
| 187 |
+
all_files.append(jsonl_path)
|
| 188 |
+
return all_files
|
| 189 |
+
|
| 190 |
+
|
| 191 |
+
def main():
|
| 192 |
+
print("=" * 60)
|
| 193 |
+
print("JSONL 图片提取工具")
|
| 194 |
+
print(f"数据源: {DATA_ROOT}")
|
| 195 |
+
print(f"输出到: {OUTPUT_ROOT}")
|
| 196 |
+
print("=" * 60)
|
| 197 |
+
|
| 198 |
+
if len(sys.argv) > 1:
|
| 199 |
+
target = sys.argv[1]
|
| 200 |
+
if os.path.isfile(target):
|
| 201 |
+
files = [target]
|
| 202 |
+
else:
|
| 203 |
+
files = sorted(glob.glob(os.path.join(DATA_ROOT, target, "*.jsonl")))
|
| 204 |
+
files = [f for f in files
|
| 205 |
+
if not re.search(r'_\d{4}-\d{2}-\d{2}\.jsonl$', os.path.basename(f))
|
| 206 |
+
and '_v2.jsonl' not in os.path.basename(f)
|
| 207 |
+
and '_new.jsonl' not in os.path.basename(f)
|
| 208 |
+
and not os.path.basename(f).startswith('para_')]
|
| 209 |
+
else:
|
| 210 |
+
files = find_all_jsonl_files()
|
| 211 |
+
|
| 212 |
+
print(f"\n共 {len(files)} 个 JSONL 文件:")
|
| 213 |
+
total_size = 0
|
| 214 |
+
for f in files:
|
| 215 |
+
size_mb = os.path.getsize(f) / (1024 * 1024)
|
| 216 |
+
total_size += size_mb
|
| 217 |
+
print(f" {os.path.relpath(f, DATA_ROOT):50s} {size_mb:>10.1f} MB")
|
| 218 |
+
print(f" {'合计':50s} {total_size/1024:>10.1f} GB")
|
| 219 |
+
|
| 220 |
+
total_images = 0
|
| 221 |
+
for i, jsonl_path in enumerate(files, 1):
|
| 222 |
+
n = process_jsonl_file(jsonl_path, i, len(files))
|
| 223 |
+
total_images += n
|
| 224 |
+
|
| 225 |
+
print(f"\n{'=' * 60}")
|
| 226 |
+
print(f"全部完成!共提取 {total_images} 张图片")
|
| 227 |
+
print(f"保存在: {OUTPUT_ROOT}")
|
| 228 |
+
|
| 229 |
+
|
| 230 |
+
if __name__ == "__main__":
|
| 231 |
+
main()
|
ICL/merge_captions.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
把 detail/{cat}/{ds}/{split}/captions.json 合并成 build_sft.py 需要的格式:
|
| 4 |
+
caption_cache/{cat}_{ds}.json = {"items": {img_path: caption, ...}}
|
| 5 |
+
|
| 6 |
+
这样 build_sft.py --caption-cache-dir caption_cache 就能直接复用。
|
| 7 |
+
|
| 8 |
+
用法:
|
| 9 |
+
python3 merge_captions.py
|
| 10 |
+
python3 merge_captions.py --force # 强制重建
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
import os
|
| 14 |
+
import sys
|
| 15 |
+
import json
|
| 16 |
+
import glob
|
| 17 |
+
|
| 18 |
+
DETAIL_ROOT = "/workspace/xiaobin/dataset/detail"
|
| 19 |
+
CAPTION_CACHE_DIR = "/workspace/xiaobin/dataset/caption_cache"
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def main():
|
| 23 |
+
force = "--force" in sys.argv
|
| 24 |
+
os.makedirs(CAPTION_CACHE_DIR, exist_ok=True)
|
| 25 |
+
|
| 26 |
+
# 找所有 dataset 目录 (cat/ds)
|
| 27 |
+
datasets = set()
|
| 28 |
+
for captions_file in glob.glob(os.path.join(DETAIL_ROOT, "*/*/*/captions.json")):
|
| 29 |
+
rel = os.path.relpath(captions_file, DETAIL_ROOT)
|
| 30 |
+
parts = rel.split(os.sep) # cat/ds/split/captions.json
|
| 31 |
+
datasets.add((parts[0], parts[1]))
|
| 32 |
+
|
| 33 |
+
print(f"共 {len(datasets)} 个数据集")
|
| 34 |
+
|
| 35 |
+
for cat, ds in sorted(datasets):
|
| 36 |
+
out_name = f"{cat}_{ds}.json"
|
| 37 |
+
out_path = os.path.join(CAPTION_CACHE_DIR, out_name)
|
| 38 |
+
|
| 39 |
+
if not force and os.path.exists(out_path) and os.path.getsize(out_path) > 0:
|
| 40 |
+
print(f" [SKIP] {out_name}")
|
| 41 |
+
continue
|
| 42 |
+
|
| 43 |
+
merged = {}
|
| 44 |
+
for split in ("train", "val", "test"):
|
| 45 |
+
src = os.path.join(DETAIL_ROOT, cat, ds, split, "captions.json")
|
| 46 |
+
if not os.path.exists(src):
|
| 47 |
+
continue
|
| 48 |
+
try:
|
| 49 |
+
with open(src, 'r', encoding='utf-8') as f:
|
| 50 |
+
data = json.load(f)
|
| 51 |
+
items = data.get("items", {})
|
| 52 |
+
if isinstance(items, dict):
|
| 53 |
+
merged.update(items)
|
| 54 |
+
except Exception as e:
|
| 55 |
+
print(f" [WARN] {src}: {e}")
|
| 56 |
+
|
| 57 |
+
if not merged:
|
| 58 |
+
print(f" [EMPTY] {cat}/{ds}")
|
| 59 |
+
continue
|
| 60 |
+
|
| 61 |
+
with open(out_path, 'w', encoding='utf-8') as f:
|
| 62 |
+
json.dump({"items": merged}, f, ensure_ascii=False)
|
| 63 |
+
|
| 64 |
+
print(f" [OK] {out_name}: {len(merged)} 条")
|
| 65 |
+
|
| 66 |
+
print(f"\n完成! 输出: {CAPTION_CACHE_DIR}")
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
if __name__ == "__main__":
|
| 70 |
+
main()
|
ICL/sft_model/epoch3_step1406_fp32/chat_template.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].role == 'system' %}\n {%- if messages[0].content is string %}\n {{- messages[0].content }}\n {%- else %}\n {%- for content in messages[0].content %}\n {%- if 'text' in content %}\n {{- content.text }}\n {%- endif %}\n {%- endfor %}\n {%- endif %}\n {{- '\\n\\n' }}\n {%- endif %}\n {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0].role == 'system' %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].content is string %}\n {{- messages[0].content }}\n {%- else %}\n {%- for content in messages[0].content %}\n {%- if 'text' in content %}\n {{- content.text }}\n {%- endif %}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- set image_count = namespace(value=0) %}\n{%- set video_count = namespace(value=0) %}\n{%- for message in messages %}\n {%- if message.role == \"user\" %}\n {{- '<|im_start|>' + message.role + '\\n' }}\n {%- if message.content is string %}\n {{- message.content }}\n {%- else %}\n {%- for content in message.content %}\n {%- if content.type == 'image' or 'image' in content or 'image_url' in content %}\n {%- set image_count.value = image_count.value + 1 %}\n {%- if add_vision_id %}Picture {{ image_count.value }}: {% endif -%}\n <|vision_start|><|image_pad|><|vision_end|>\n {%- elif content.type == 'video' or 'video' in content %}\n {%- set video_count.value = video_count.value + 1 %}\n {%- if add_vision_id %}Video {{ 
video_count.value }}: {% endif -%}\n <|vision_start|><|video_pad|><|vision_end|>\n {%- elif 'text' in content %}\n {{- content.text }}\n {%- endif %}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role + '\\n' }}\n {%- if message.content is string %}\n {{- message.content }}\n {%- else %}\n {%- for content_item in message.content %}\n {%- if 'text' in content_item %}\n {{- content_item.text }}\n {%- endif %}\n {%- endfor %}\n {%- endif %}\n {%- if message.tool_calls %}\n {%- for tool_call in message.tool_calls %}\n {%- if (loop.first and message.content) or (not loop.first) %}\n {{- '\\n' }}\n {%- endif %}\n {%- if tool_call.function %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {%- if tool_call.arguments is string %}\n {{- tool_call.arguments }}\n {%- else %}\n {{- tool_call.arguments | tojson }}\n {%- endif %}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {%- if message.content is string %}\n {{- message.content }}\n {%- else %}\n {%- for content in message.content %}\n {%- if content.type == 'image' or 'image' in content or 'image_url' in content %}\n {%- set image_count.value = image_count.value + 1 %}\n {%- if add_vision_id %}Picture {{ image_count.value }}: {% endif -%}\n <|vision_start|><|image_pad|><|vision_end|>\n {%- elif content.type == 'video' or 'video' in content %}\n {%- set video_count.value = video_count.value + 1 %}\n {%- if add_vision_id %}Video {{ video_count.value }}: {% endif -%}\n <|vision_start|><|video_pad|><|vision_end|>\n {%- elif 'text' in content %}\n {{- content.text }}\n {%- endif %}\n {%- endfor %}\n {%- endif %}\n {{- 
'\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n"
|
| 3 |
+
}
|
ICL/sft_model/epoch3_step1406_fp32/config.json
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"Qwen3VLForConditionalGeneration"
|
| 4 |
+
],
|
| 5 |
+
"image_token_id": 151655,
|
| 6 |
+
"model_type": "qwen3_vl",
|
| 7 |
+
"text_config": {
|
| 8 |
+
"attention_bias": false,
|
| 9 |
+
"attention_dropout": 0.0,
|
| 10 |
+
"bos_token_id": 151643,
|
| 11 |
+
"dtype": "bfloat16",
|
| 12 |
+
"eos_token_id": 151645,
|
| 13 |
+
"head_dim": 128,
|
| 14 |
+
"hidden_act": "silu",
|
| 15 |
+
"hidden_size": 4096,
|
| 16 |
+
"initializer_range": 0.02,
|
| 17 |
+
"intermediate_size": 12288,
|
| 18 |
+
"max_position_embeddings": 262144,
|
| 19 |
+
"model_type": "qwen3_vl_text",
|
| 20 |
+
"num_attention_heads": 32,
|
| 21 |
+
"num_hidden_layers": 36,
|
| 22 |
+
"num_key_value_heads": 8,
|
| 23 |
+
"rms_norm_eps": 1e-06,
|
| 24 |
+
"rope_scaling": {
|
| 25 |
+
"mrope_interleaved": true,
|
| 26 |
+
"mrope_section": [
|
| 27 |
+
24,
|
| 28 |
+
20,
|
| 29 |
+
20
|
| 30 |
+
],
|
| 31 |
+
"rope_type": "default"
|
| 32 |
+
},
|
| 33 |
+
"rope_theta": 5000000,
|
| 34 |
+
"use_cache": true,
|
| 35 |
+
"vocab_size": 151936
|
| 36 |
+
},
|
| 37 |
+
"tie_word_embeddings": false,
|
| 38 |
+
"transformers_version": "4.57.0.dev0",
|
| 39 |
+
"video_token_id": 151656,
|
| 40 |
+
"vision_config": {
|
| 41 |
+
"deepstack_visual_indexes": [
|
| 42 |
+
8,
|
| 43 |
+
16,
|
| 44 |
+
24
|
| 45 |
+
],
|
| 46 |
+
"depth": 27,
|
| 47 |
+
"hidden_act": "gelu_pytorch_tanh",
|
| 48 |
+
"hidden_size": 1152,
|
| 49 |
+
"in_channels": 3,
|
| 50 |
+
"initializer_range": 0.02,
|
| 51 |
+
"intermediate_size": 4304,
|
| 52 |
+
"model_type": "qwen3_vl",
|
| 53 |
+
"num_heads": 16,
|
| 54 |
+
"num_position_embeddings": 2304,
|
| 55 |
+
"out_hidden_size": 4096,
|
| 56 |
+
"patch_size": 16,
|
| 57 |
+
"spatial_merge_size": 2,
|
| 58 |
+
"temporal_patch_size": 2
|
| 59 |
+
},
|
| 60 |
+
"vision_end_token_id": 151653,
|
| 61 |
+
"vision_start_token_id": 151652
|
| 62 |
+
}
|
ICL/sft_model/epoch3_step1406_fp32/generation_config.json
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token_id": 151643,
|
| 3 |
+
"pad_token_id": 151643,
|
| 4 |
+
"do_sample": true,
|
| 5 |
+
"eos_token_id": [
|
| 6 |
+
151645,
|
| 7 |
+
151643
|
| 8 |
+
],
|
| 9 |
+
"top_k": 20,
|
| 10 |
+
"top_p": 0.8,
|
| 11 |
+
"repetition_penalty": 1.0,
|
| 12 |
+
"temperature": 0.7,
|
| 13 |
+
"transformers_version": "4.56.0"
|
| 14 |
+
}
|
ICL/sft_model/epoch3_step1406_fp32/merges.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
ICL/sft_model/epoch3_step1406_fp32/model.safetensors.index.json
ADDED
|
@@ -0,0 +1,757 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"metadata": {
|
| 3 |
+
"total_size": 35059909568
|
| 4 |
+
},
|
| 5 |
+
"weight_map": {
|
| 6 |
+
"lm_head.weight": "model-00008-of-00008.safetensors",
|
| 7 |
+
"model.language_model.embed_tokens.weight": "model-00001-of-00008.safetensors",
|
| 8 |
+
"model.language_model.layers.0.input_layernorm.weight": "model-00002-of-00008.safetensors",
|
| 9 |
+
"model.language_model.layers.0.mlp.down_proj.weight": "model-00002-of-00008.safetensors",
|
| 10 |
+
"model.language_model.layers.0.mlp.gate_proj.weight": "model-00002-of-00008.safetensors",
|
| 11 |
+
"model.language_model.layers.0.mlp.up_proj.weight": "model-00002-of-00008.safetensors",
|
| 12 |
+
"model.language_model.layers.0.post_attention_layernorm.weight": "model-00002-of-00008.safetensors",
|
| 13 |
+
"model.language_model.layers.0.self_attn.k_norm.weight": "model-00001-of-00008.safetensors",
|
| 14 |
+
"model.language_model.layers.0.self_attn.k_proj.weight": "model-00001-of-00008.safetensors",
|
| 15 |
+
"model.language_model.layers.0.self_attn.o_proj.weight": "model-00001-of-00008.safetensors",
|
| 16 |
+
"model.language_model.layers.0.self_attn.q_norm.weight": "model-00001-of-00008.safetensors",
|
| 17 |
+
"model.language_model.layers.0.self_attn.q_proj.weight": "model-00001-of-00008.safetensors",
|
| 18 |
+
"model.language_model.layers.0.self_attn.v_proj.weight": "model-00001-of-00008.safetensors",
|
| 19 |
+
"model.language_model.layers.1.input_layernorm.weight": "model-00002-of-00008.safetensors",
|
| 20 |
+
"model.language_model.layers.1.mlp.down_proj.weight": "model-00002-of-00008.safetensors",
|
| 21 |
+
"model.language_model.layers.1.mlp.gate_proj.weight": "model-00002-of-00008.safetensors",
|
| 22 |
+
"model.language_model.layers.1.mlp.up_proj.weight": "model-00002-of-00008.safetensors",
|
| 23 |
+
"model.language_model.layers.1.post_attention_layernorm.weight": "model-00002-of-00008.safetensors",
|
| 24 |
+
"model.language_model.layers.1.self_attn.k_norm.weight": "model-00002-of-00008.safetensors",
|
| 25 |
+
"model.language_model.layers.1.self_attn.k_proj.weight": "model-00002-of-00008.safetensors",
|
| 26 |
+
"model.language_model.layers.1.self_attn.o_proj.weight": "model-00002-of-00008.safetensors",
|
| 27 |
+
"model.language_model.layers.1.self_attn.q_norm.weight": "model-00002-of-00008.safetensors",
|
| 28 |
+
"model.language_model.layers.1.self_attn.q_proj.weight": "model-00002-of-00008.safetensors",
|
| 29 |
+
"model.language_model.layers.1.self_attn.v_proj.weight": "model-00002-of-00008.safetensors",
|
| 30 |
+
"model.language_model.layers.10.input_layernorm.weight": "model-00003-of-00008.safetensors",
|
| 31 |
+
"model.language_model.layers.10.mlp.down_proj.weight": "model-00003-of-00008.safetensors",
|
| 32 |
+
"model.language_model.layers.10.mlp.gate_proj.weight": "model-00003-of-00008.safetensors",
|
| 33 |
+
"model.language_model.layers.10.mlp.up_proj.weight": "model-00003-of-00008.safetensors",
|
| 34 |
+
"model.language_model.layers.10.post_attention_layernorm.weight": "model-00003-of-00008.safetensors",
|
| 35 |
+
"model.language_model.layers.10.self_attn.k_norm.weight": "model-00003-of-00008.safetensors",
|
| 36 |
+
"model.language_model.layers.10.self_attn.k_proj.weight": "model-00003-of-00008.safetensors",
|
| 37 |
+
"model.language_model.layers.10.self_attn.o_proj.weight": "model-00003-of-00008.safetensors",
|
| 38 |
+
"model.language_model.layers.10.self_attn.q_norm.weight": "model-00003-of-00008.safetensors",
|
| 39 |
+
"model.language_model.layers.10.self_attn.q_proj.weight": "model-00003-of-00008.safetensors",
|
| 40 |
+
"model.language_model.layers.10.self_attn.v_proj.weight": "model-00003-of-00008.safetensors",
|
| 41 |
+
"model.language_model.layers.11.input_layernorm.weight": "model-00003-of-00008.safetensors",
|
| 42 |
+
"model.language_model.layers.11.mlp.down_proj.weight": "model-00003-of-00008.safetensors",
|
| 43 |
+
"model.language_model.layers.11.mlp.gate_proj.weight": "model-00003-of-00008.safetensors",
|
| 44 |
+
"model.language_model.layers.11.mlp.up_proj.weight": "model-00003-of-00008.safetensors",
|
| 45 |
+
"model.language_model.layers.11.post_attention_layernorm.weight": "model-00003-of-00008.safetensors",
|
| 46 |
+
"model.language_model.layers.11.self_attn.k_norm.weight": "model-00003-of-00008.safetensors",
|
| 47 |
+
"model.language_model.layers.11.self_attn.k_proj.weight": "model-00003-of-00008.safetensors",
|
| 48 |
+
"model.language_model.layers.11.self_attn.o_proj.weight": "model-00003-of-00008.safetensors",
|
| 49 |
+
"model.language_model.layers.11.self_attn.q_norm.weight": "model-00003-of-00008.safetensors",
|
| 50 |
+
"model.language_model.layers.11.self_attn.q_proj.weight": "model-00003-of-00008.safetensors",
|
| 51 |
+
"model.language_model.layers.11.self_attn.v_proj.weight": "model-00003-of-00008.safetensors",
|
| 52 |
+
"model.language_model.layers.12.input_layernorm.weight": "model-00004-of-00008.safetensors",
|
| 53 |
+
"model.language_model.layers.12.mlp.down_proj.weight": "model-00004-of-00008.safetensors",
|
| 54 |
+
"model.language_model.layers.12.mlp.gate_proj.weight": "model-00003-of-00008.safetensors",
|
| 55 |
+
"model.language_model.layers.12.mlp.up_proj.weight": "model-00003-of-00008.safetensors",
|
| 56 |
+
"model.language_model.layers.12.post_attention_layernorm.weight": "model-00004-of-00008.safetensors",
|
| 57 |
+
"model.language_model.layers.12.self_attn.k_norm.weight": "model-00003-of-00008.safetensors",
|
| 58 |
+
"model.language_model.layers.12.self_attn.k_proj.weight": "model-00003-of-00008.safetensors",
|
| 59 |
+
"model.language_model.layers.12.self_attn.o_proj.weight": "model-00003-of-00008.safetensors",
|
| 60 |
+
"model.language_model.layers.12.self_attn.q_norm.weight": "model-00003-of-00008.safetensors",
|
| 61 |
+
"model.language_model.layers.12.self_attn.q_proj.weight": "model-00003-of-00008.safetensors",
|
| 62 |
+
"model.language_model.layers.12.self_attn.v_proj.weight": "model-00003-of-00008.safetensors",
|
| 63 |
+
"model.language_model.layers.13.input_layernorm.weight": "model-00004-of-00008.safetensors",
|
| 64 |
+
"model.language_model.layers.13.mlp.down_proj.weight": "model-00004-of-00008.safetensors",
|
| 65 |
+
"model.language_model.layers.13.mlp.gate_proj.weight": "model-00004-of-00008.safetensors",
|
| 66 |
+
"model.language_model.layers.13.mlp.up_proj.weight": "model-00004-of-00008.safetensors",
|
| 67 |
+
"model.language_model.layers.13.post_attention_layernorm.weight": "model-00004-of-00008.safetensors",
|
| 68 |
+
"model.language_model.layers.13.self_attn.k_norm.weight": "model-00004-of-00008.safetensors",
|
| 69 |
+
"model.language_model.layers.13.self_attn.k_proj.weight": "model-00004-of-00008.safetensors",
|
| 70 |
+
"model.language_model.layers.13.self_attn.o_proj.weight": "model-00004-of-00008.safetensors",
|
| 71 |
+
"model.language_model.layers.13.self_attn.q_norm.weight": "model-00004-of-00008.safetensors",
|
| 72 |
+
"model.language_model.layers.13.self_attn.q_proj.weight": "model-00004-of-00008.safetensors",
|
| 73 |
+
"model.language_model.layers.13.self_attn.v_proj.weight": "model-00004-of-00008.safetensors",
|
| 74 |
+
"model.language_model.layers.14.input_layernorm.weight": "model-00004-of-00008.safetensors",
|
| 75 |
+
"model.language_model.layers.14.mlp.down_proj.weight": "model-00004-of-00008.safetensors",
|
| 76 |
+
"model.language_model.layers.14.mlp.gate_proj.weight": "model-00004-of-00008.safetensors",
|
| 77 |
+
"model.language_model.layers.14.mlp.up_proj.weight": "model-00004-of-00008.safetensors",
|
| 78 |
+
"model.language_model.layers.14.post_attention_layernorm.weight": "model-00004-of-00008.safetensors",
|
| 79 |
+
"model.language_model.layers.14.self_attn.k_norm.weight": "model-00004-of-00008.safetensors",
|
| 80 |
+
"model.language_model.layers.14.self_attn.k_proj.weight": "model-00004-of-00008.safetensors",
|
| 81 |
+
"model.language_model.layers.14.self_attn.o_proj.weight": "model-00004-of-00008.safetensors",
|
| 82 |
+
"model.language_model.layers.14.self_attn.q_norm.weight": "model-00004-of-00008.safetensors",
|
| 83 |
+
"model.language_model.layers.14.self_attn.q_proj.weight": "model-00004-of-00008.safetensors",
|
| 84 |
+
"model.language_model.layers.14.self_attn.v_proj.weight": "model-00004-of-00008.safetensors",
|
| 85 |
+
"model.language_model.layers.15.input_layernorm.weight": "model-00004-of-00008.safetensors",
|
| 86 |
+
"model.language_model.layers.15.mlp.down_proj.weight": "model-00004-of-00008.safetensors",
|
| 87 |
+
"model.language_model.layers.15.mlp.gate_proj.weight": "model-00004-of-00008.safetensors",
|
| 88 |
+
"model.language_model.layers.15.mlp.up_proj.weight": "model-00004-of-00008.safetensors",
|
| 89 |
+
"model.language_model.layers.15.post_attention_layernorm.weight": "model-00004-of-00008.safetensors",
|
| 90 |
+
"model.language_model.layers.15.self_attn.k_norm.weight": "model-00004-of-00008.safetensors",
|
| 91 |
+
"model.language_model.layers.15.self_attn.k_proj.weight": "model-00004-of-00008.safetensors",
|
| 92 |
+
"model.language_model.layers.15.self_attn.o_proj.weight": "model-00004-of-00008.safetensors",
|
| 93 |
+
"model.language_model.layers.15.self_attn.q_norm.weight": "model-00004-of-00008.safetensors",
|
| 94 |
+
"model.language_model.layers.15.self_attn.q_proj.weight": "model-00004-of-00008.safetensors",
|
| 95 |
+
"model.language_model.layers.15.self_attn.v_proj.weight": "model-00004-of-00008.safetensors",
|
| 96 |
+
"model.language_model.layers.16.input_layernorm.weight": "model-00004-of-00008.safetensors",
|
| 97 |
+
"model.language_model.layers.16.mlp.down_proj.weight": "model-00004-of-00008.safetensors",
|
| 98 |
+
"model.language_model.layers.16.mlp.gate_proj.weight": "model-00004-of-00008.safetensors",
|
| 99 |
+
"model.language_model.layers.16.mlp.up_proj.weight": "model-00004-of-00008.safetensors",
|
| 100 |
+
"model.language_model.layers.16.post_attention_layernorm.weight": "model-00004-of-00008.safetensors",
|
| 101 |
+
"model.language_model.layers.16.self_attn.k_norm.weight": "model-00004-of-00008.safetensors",
|
| 102 |
+
"model.language_model.layers.16.self_attn.k_proj.weight": "model-00004-of-00008.safetensors",
|
| 103 |
+
"model.language_model.layers.16.self_attn.o_proj.weight": "model-00004-of-00008.safetensors",
|
| 104 |
+
"model.language_model.layers.16.self_attn.q_norm.weight": "model-00004-of-00008.safetensors",
|
| 105 |
+
"model.language_model.layers.16.self_attn.q_proj.weight": "model-00004-of-00008.safetensors",
|
| 106 |
+
"model.language_model.layers.16.self_attn.v_proj.weight": "model-00004-of-00008.safetensors",
|
| 107 |
+
"model.language_model.layers.17.input_layernorm.weight": "model-00004-of-00008.safetensors",
|
| 108 |
+
"model.language_model.layers.17.mlp.down_proj.weight": "model-00004-of-00008.safetensors",
|
| 109 |
+
"model.language_model.layers.17.mlp.gate_proj.weight": "model-00004-of-00008.safetensors",
|
| 110 |
+
"model.language_model.layers.17.mlp.up_proj.weight": "model-00004-of-00008.safetensors",
|
| 111 |
+
"model.language_model.layers.17.post_attention_layernorm.weight": "model-00004-of-00008.safetensors",
|
| 112 |
+
"model.language_model.layers.17.self_attn.k_norm.weight": "model-00004-of-00008.safetensors",
|
| 113 |
+
"model.language_model.layers.17.self_attn.k_proj.weight": "model-00004-of-00008.safetensors",
|
| 114 |
+
"model.language_model.layers.17.self_attn.o_proj.weight": "model-00004-of-00008.safetensors",
|
| 115 |
+
"model.language_model.layers.17.self_attn.q_norm.weight": "model-00004-of-00008.safetensors",
|
| 116 |
+
"model.language_model.layers.17.self_attn.q_proj.weight": "model-00004-of-00008.safetensors",
|
| 117 |
+
"model.language_model.layers.17.self_attn.v_proj.weight": "model-00004-of-00008.safetensors",
|
| 118 |
+
"model.language_model.layers.18.input_layernorm.weight": "model-00004-of-00008.safetensors",
|
| 119 |
+
"model.language_model.layers.18.mlp.down_proj.weight": "model-00004-of-00008.safetensors",
|
| 120 |
+
"model.language_model.layers.18.mlp.gate_proj.weight": "model-00004-of-00008.safetensors",
|
| 121 |
+
"model.language_model.layers.18.mlp.up_proj.weight": "model-00004-of-00008.safetensors",
|
| 122 |
+
"model.language_model.layers.18.post_attention_layernorm.weight": "model-00004-of-00008.safetensors",
|
| 123 |
+
"model.language_model.layers.18.self_attn.k_norm.weight": "model-00004-of-00008.safetensors",
|
| 124 |
+
"model.language_model.layers.18.self_attn.k_proj.weight": "model-00004-of-00008.safetensors",
|
| 125 |
+
"model.language_model.layers.18.self_attn.o_proj.weight": "model-00004-of-00008.safetensors",
|
| 126 |
+
"model.language_model.layers.18.self_attn.q_norm.weight": "model-00004-of-00008.safetensors",
|
| 127 |
+
"model.language_model.layers.18.self_attn.q_proj.weight": "model-00004-of-00008.safetensors",
|
| 128 |
+
"model.language_model.layers.18.self_attn.v_proj.weight": "model-00004-of-00008.safetensors",
|
| 129 |
+
"model.language_model.layers.19.input_layernorm.weight": "model-00005-of-00008.safetensors",
|
| 130 |
+
"model.language_model.layers.19.mlp.down_proj.weight": "model-00005-of-00008.safetensors",
|
| 131 |
+
"model.language_model.layers.19.mlp.gate_proj.weight": "model-00005-of-00008.safetensors",
|
| 132 |
+
"model.language_model.layers.19.mlp.up_proj.weight": "model-00005-of-00008.safetensors",
|
| 133 |
+
"model.language_model.layers.19.post_attention_layernorm.weight": "model-00005-of-00008.safetensors",
|
| 134 |
+
"model.language_model.layers.19.self_attn.k_norm.weight": "model-00004-of-00008.safetensors",
|
| 135 |
+
"model.language_model.layers.19.self_attn.k_proj.weight": "model-00004-of-00008.safetensors",
|
| 136 |
+
"model.language_model.layers.19.self_attn.o_proj.weight": "model-00004-of-00008.safetensors",
|
| 137 |
+
"model.language_model.layers.19.self_attn.q_norm.weight": "model-00004-of-00008.safetensors",
|
| 138 |
+
"model.language_model.layers.19.self_attn.q_proj.weight": "model-00004-of-00008.safetensors",
|
| 139 |
+
"model.language_model.layers.19.self_attn.v_proj.weight": "model-00004-of-00008.safetensors",
|
| 140 |
+
"model.language_model.layers.2.input_layernorm.weight": "model-00002-of-00008.safetensors",
|
| 141 |
+
"model.language_model.layers.2.mlp.down_proj.weight": "model-00002-of-00008.safetensors",
|
| 142 |
+
"model.language_model.layers.2.mlp.gate_proj.weight": "model-00002-of-00008.safetensors",
|
| 143 |
+
"model.language_model.layers.2.mlp.up_proj.weight": "model-00002-of-00008.safetensors",
|
| 144 |
+
"model.language_model.layers.2.post_attention_layernorm.weight": "model-00002-of-00008.safetensors",
|
| 145 |
+
"model.language_model.layers.2.self_attn.k_norm.weight": "model-00002-of-00008.safetensors",
|
| 146 |
+
"model.language_model.layers.2.self_attn.k_proj.weight": "model-00002-of-00008.safetensors",
|
| 147 |
+
"model.language_model.layers.2.self_attn.o_proj.weight": "model-00002-of-00008.safetensors",
|
| 148 |
+
"model.language_model.layers.2.self_attn.q_norm.weight": "model-00002-of-00008.safetensors",
|
| 149 |
+
"model.language_model.layers.2.self_attn.q_proj.weight": "model-00002-of-00008.safetensors",
|
| 150 |
+
"model.language_model.layers.2.self_attn.v_proj.weight": "model-00002-of-00008.safetensors",
|
| 151 |
+
"model.language_model.layers.20.input_layernorm.weight": "model-00005-of-00008.safetensors",
|
| 152 |
+
"model.language_model.layers.20.mlp.down_proj.weight": "model-00005-of-00008.safetensors",
|
| 153 |
+
"model.language_model.layers.20.mlp.gate_proj.weight": "model-00005-of-00008.safetensors",
|
| 154 |
+
"model.language_model.layers.20.mlp.up_proj.weight": "model-00005-of-00008.safetensors",
|
| 155 |
+
"model.language_model.layers.20.post_attention_layernorm.weight": "model-00005-of-00008.safetensors",
|
| 156 |
+
"model.language_model.layers.20.self_attn.k_norm.weight": "model-00005-of-00008.safetensors",
|
| 157 |
+
"model.language_model.layers.20.self_attn.k_proj.weight": "model-00005-of-00008.safetensors",
|
| 158 |
+
"model.language_model.layers.20.self_attn.o_proj.weight": "model-00005-of-00008.safetensors",
|
| 159 |
+
"model.language_model.layers.20.self_attn.q_norm.weight": "model-00005-of-00008.safetensors",
|
| 160 |
+
"model.language_model.layers.20.self_attn.q_proj.weight": "model-00005-of-00008.safetensors",
|
| 161 |
+
"model.language_model.layers.20.self_attn.v_proj.weight": "model-00005-of-00008.safetensors",
|
| 162 |
+
"model.language_model.layers.21.input_layernorm.weight": "model-00005-of-00008.safetensors",
|
| 163 |
+
"model.language_model.layers.21.mlp.down_proj.weight": "model-00005-of-00008.safetensors",
|
| 164 |
+
"model.language_model.layers.21.mlp.gate_proj.weight": "model-00005-of-00008.safetensors",
|
| 165 |
+
"model.language_model.layers.21.mlp.up_proj.weight": "model-00005-of-00008.safetensors",
|
| 166 |
+
"model.language_model.layers.21.post_attention_layernorm.weight": "model-00005-of-00008.safetensors",
|
| 167 |
+
"model.language_model.layers.21.self_attn.k_norm.weight": "model-00005-of-00008.safetensors",
|
| 168 |
+
"model.language_model.layers.21.self_attn.k_proj.weight": "model-00005-of-00008.safetensors",
|
| 169 |
+
"model.language_model.layers.21.self_attn.o_proj.weight": "model-00005-of-00008.safetensors",
|
| 170 |
+
"model.language_model.layers.21.self_attn.q_norm.weight": "model-00005-of-00008.safetensors",
|
| 171 |
+
"model.language_model.layers.21.self_attn.q_proj.weight": "model-00005-of-00008.safetensors",
|
| 172 |
+
"model.language_model.layers.21.self_attn.v_proj.weight": "model-00005-of-00008.safetensors",
|
| 173 |
+
"model.language_model.layers.22.input_layernorm.weight": "model-00005-of-00008.safetensors",
|
| 174 |
+
"model.language_model.layers.22.mlp.down_proj.weight": "model-00005-of-00008.safetensors",
|
| 175 |
+
"model.language_model.layers.22.mlp.gate_proj.weight": "model-00005-of-00008.safetensors",
|
| 176 |
+
"model.language_model.layers.22.mlp.up_proj.weight": "model-00005-of-00008.safetensors",
|
| 177 |
+
"model.language_model.layers.22.post_attention_layernorm.weight": "model-00005-of-00008.safetensors",
|
| 178 |
+
"model.language_model.layers.22.self_attn.k_norm.weight": "model-00005-of-00008.safetensors",
|
| 179 |
+
"model.language_model.layers.22.self_attn.k_proj.weight": "model-00005-of-00008.safetensors",
|
| 180 |
+
"model.language_model.layers.22.self_attn.o_proj.weight": "model-00005-of-00008.safetensors",
|
| 181 |
+
"model.language_model.layers.22.self_attn.q_norm.weight": "model-00005-of-00008.safetensors",
|
| 182 |
+
"model.language_model.layers.22.self_attn.q_proj.weight": "model-00005-of-00008.safetensors",
|
| 183 |
+
"model.language_model.layers.22.self_attn.v_proj.weight": "model-00005-of-00008.safetensors",
|
| 184 |
+
"model.language_model.layers.23.input_layernorm.weight": "model-00005-of-00008.safetensors",
|
| 185 |
+
"model.language_model.layers.23.mlp.down_proj.weight": "model-00005-of-00008.safetensors",
|
| 186 |
+
"model.language_model.layers.23.mlp.gate_proj.weight": "model-00005-of-00008.safetensors",
|
| 187 |
+
"model.language_model.layers.23.mlp.up_proj.weight": "model-00005-of-00008.safetensors",
|
| 188 |
+
"model.language_model.layers.23.post_attention_layernorm.weight": "model-00005-of-00008.safetensors",
|
| 189 |
+
"model.language_model.layers.23.self_attn.k_norm.weight": "model-00005-of-00008.safetensors",
|
| 190 |
+
"model.language_model.layers.23.self_attn.k_proj.weight": "model-00005-of-00008.safetensors",
|
| 191 |
+
"model.language_model.layers.23.self_attn.o_proj.weight": "model-00005-of-00008.safetensors",
|
| 192 |
+
"model.language_model.layers.23.self_attn.q_norm.weight": "model-00005-of-00008.safetensors",
|
| 193 |
+
"model.language_model.layers.23.self_attn.q_proj.weight": "model-00005-of-00008.safetensors",
|
| 194 |
+
"model.language_model.layers.23.self_attn.v_proj.weight": "model-00005-of-00008.safetensors",
|
| 195 |
+
"model.language_model.layers.24.input_layernorm.weight": "model-00005-of-00008.safetensors",
|
| 196 |
+
"model.language_model.layers.24.mlp.down_proj.weight": "model-00005-of-00008.safetensors",
|
| 197 |
+
"model.language_model.layers.24.mlp.gate_proj.weight": "model-00005-of-00008.safetensors",
|
| 198 |
+
"model.language_model.layers.24.mlp.up_proj.weight": "model-00005-of-00008.safetensors",
|
| 199 |
+
"model.language_model.layers.24.post_attention_layernorm.weight": "model-00005-of-00008.safetensors",
|
| 200 |
+
"model.language_model.layers.24.self_attn.k_norm.weight": "model-00005-of-00008.safetensors",
|
| 201 |
+
"model.language_model.layers.24.self_attn.k_proj.weight": "model-00005-of-00008.safetensors",
|
| 202 |
+
"model.language_model.layers.24.self_attn.o_proj.weight": "model-00005-of-00008.safetensors",
|
| 203 |
+
"model.language_model.layers.24.self_attn.q_norm.weight": "model-00005-of-00008.safetensors",
|
| 204 |
+
"model.language_model.layers.24.self_attn.q_proj.weight": "model-00005-of-00008.safetensors",
|
| 205 |
+
"model.language_model.layers.24.self_attn.v_proj.weight": "model-00005-of-00008.safetensors",
|
| 206 |
+
"model.language_model.layers.25.input_layernorm.weight": "model-00006-of-00008.safetensors",
|
| 207 |
+
"model.language_model.layers.25.mlp.down_proj.weight": "model-00006-of-00008.safetensors",
|
| 208 |
+
"model.language_model.layers.25.mlp.gate_proj.weight": "model-00005-of-00008.safetensors",
|
| 209 |
+
"model.language_model.layers.25.mlp.up_proj.weight": "model-00006-of-00008.safetensors",
|
| 210 |
+
"model.language_model.layers.25.post_attention_layernorm.weight": "model-00006-of-00008.safetensors",
|
| 211 |
+
"model.language_model.layers.25.self_attn.k_norm.weight": "model-00005-of-00008.safetensors",
|
| 212 |
+
"model.language_model.layers.25.self_attn.k_proj.weight": "model-00005-of-00008.safetensors",
|
| 213 |
+
"model.language_model.layers.25.self_attn.o_proj.weight": "model-00005-of-00008.safetensors",
|
| 214 |
+
"model.language_model.layers.25.self_attn.q_norm.weight": "model-00005-of-00008.safetensors",
|
| 215 |
+
"model.language_model.layers.25.self_attn.q_proj.weight": "model-00005-of-00008.safetensors",
|
| 216 |
+
"model.language_model.layers.25.self_attn.v_proj.weight": "model-00005-of-00008.safetensors",
|
| 217 |
+
"model.language_model.layers.26.input_layernorm.weight": "model-00006-of-00008.safetensors",
|
| 218 |
+
"model.language_model.layers.26.mlp.down_proj.weight": "model-00006-of-00008.safetensors",
|
| 219 |
+
"model.language_model.layers.26.mlp.gate_proj.weight": "model-00006-of-00008.safetensors",
|
| 220 |
+
"model.language_model.layers.26.mlp.up_proj.weight": "model-00006-of-00008.safetensors",
|
| 221 |
+
"model.language_model.layers.26.post_attention_layernorm.weight": "model-00006-of-00008.safetensors",
|
| 222 |
+
"model.language_model.layers.26.self_attn.k_norm.weight": "model-00006-of-00008.safetensors",
|
| 223 |
+
"model.language_model.layers.26.self_attn.k_proj.weight": "model-00006-of-00008.safetensors",
|
| 224 |
+
"model.language_model.layers.26.self_attn.o_proj.weight": "model-00006-of-00008.safetensors",
|
| 225 |
+
"model.language_model.layers.26.self_attn.q_norm.weight": "model-00006-of-00008.safetensors",
|
| 226 |
+
"model.language_model.layers.26.self_attn.q_proj.weight": "model-00006-of-00008.safetensors",
|
| 227 |
+
"model.language_model.layers.26.self_attn.v_proj.weight": "model-00006-of-00008.safetensors",
|
| 228 |
+
"model.language_model.layers.27.input_layernorm.weight": "model-00006-of-00008.safetensors",
|
| 229 |
+
"model.language_model.layers.27.mlp.down_proj.weight": "model-00006-of-00008.safetensors",
|
| 230 |
+
"model.language_model.layers.27.mlp.gate_proj.weight": "model-00006-of-00008.safetensors",
|
| 231 |
+
"model.language_model.layers.27.mlp.up_proj.weight": "model-00006-of-00008.safetensors",
|
| 232 |
+
"model.language_model.layers.27.post_attention_layernorm.weight": "model-00006-of-00008.safetensors",
|
| 233 |
+
"model.language_model.layers.27.self_attn.k_norm.weight": "model-00006-of-00008.safetensors",
|
| 234 |
+
"model.language_model.layers.27.self_attn.k_proj.weight": "model-00006-of-00008.safetensors",
|
| 235 |
+
"model.language_model.layers.27.self_attn.o_proj.weight": "model-00006-of-00008.safetensors",
|
| 236 |
+
"model.language_model.layers.27.self_attn.q_norm.weight": "model-00006-of-00008.safetensors",
|
| 237 |
+
"model.language_model.layers.27.self_attn.q_proj.weight": "model-00006-of-00008.safetensors",
|
| 238 |
+
"model.language_model.layers.27.self_attn.v_proj.weight": "model-00006-of-00008.safetensors",
|
| 239 |
+
"model.language_model.layers.28.input_layernorm.weight": "model-00006-of-00008.safetensors",
|
| 240 |
+
"model.language_model.layers.28.mlp.down_proj.weight": "model-00006-of-00008.safetensors",
|
| 241 |
+
"model.language_model.layers.28.mlp.gate_proj.weight": "model-00006-of-00008.safetensors",
|
| 242 |
+
"model.language_model.layers.28.mlp.up_proj.weight": "model-00006-of-00008.safetensors",
|
| 243 |
+
"model.language_model.layers.28.post_attention_layernorm.weight": "model-00006-of-00008.safetensors",
|
| 244 |
+
"model.language_model.layers.28.self_attn.k_norm.weight": "model-00006-of-00008.safetensors",
|
| 245 |
+
"model.language_model.layers.28.self_attn.k_proj.weight": "model-00006-of-00008.safetensors",
|
| 246 |
+
"model.language_model.layers.28.self_attn.o_proj.weight": "model-00006-of-00008.safetensors",
|
| 247 |
+
"model.language_model.layers.28.self_attn.q_norm.weight": "model-00006-of-00008.safetensors",
|
| 248 |
+
"model.language_model.layers.28.self_attn.q_proj.weight": "model-00006-of-00008.safetensors",
|
| 249 |
+
"model.language_model.layers.28.self_attn.v_proj.weight": "model-00006-of-00008.safetensors",
|
| 250 |
+
"model.language_model.layers.29.input_layernorm.weight": "model-00006-of-00008.safetensors",
|
| 251 |
+
"model.language_model.layers.29.mlp.down_proj.weight": "model-00006-of-00008.safetensors",
|
| 252 |
+
"model.language_model.layers.29.mlp.gate_proj.weight": "model-00006-of-00008.safetensors",
|
| 253 |
+
"model.language_model.layers.29.mlp.up_proj.weight": "model-00006-of-00008.safetensors",
|
| 254 |
+
"model.language_model.layers.29.post_attention_layernorm.weight": "model-00006-of-00008.safetensors",
|
| 255 |
+
"model.language_model.layers.29.self_attn.k_norm.weight": "model-00006-of-00008.safetensors",
|
| 256 |
+
"model.language_model.layers.29.self_attn.k_proj.weight": "model-00006-of-00008.safetensors",
|
| 257 |
+
"model.language_model.layers.29.self_attn.o_proj.weight": "model-00006-of-00008.safetensors",
|
| 258 |
+
"model.language_model.layers.29.self_attn.q_norm.weight": "model-00006-of-00008.safetensors",
|
| 259 |
+
"model.language_model.layers.29.self_attn.q_proj.weight": "model-00006-of-00008.safetensors",
|
| 260 |
+
"model.language_model.layers.29.self_attn.v_proj.weight": "model-00006-of-00008.safetensors",
|
| 261 |
+
"model.language_model.layers.3.input_layernorm.weight": "model-00002-of-00008.safetensors",
|
| 262 |
+
"model.language_model.layers.3.mlp.down_proj.weight": "model-00002-of-00008.safetensors",
|
| 263 |
+
"model.language_model.layers.3.mlp.gate_proj.weight": "model-00002-of-00008.safetensors",
|
| 264 |
+
"model.language_model.layers.3.mlp.up_proj.weight": "model-00002-of-00008.safetensors",
|
| 265 |
+
"model.language_model.layers.3.post_attention_layernorm.weight": "model-00002-of-00008.safetensors",
|
| 266 |
+
"model.language_model.layers.3.self_attn.k_norm.weight": "model-00002-of-00008.safetensors",
|
| 267 |
+
"model.language_model.layers.3.self_attn.k_proj.weight": "model-00002-of-00008.safetensors",
|
| 268 |
+
"model.language_model.layers.3.self_attn.o_proj.weight": "model-00002-of-00008.safetensors",
|
| 269 |
+
"model.language_model.layers.3.self_attn.q_norm.weight": "model-00002-of-00008.safetensors",
|
| 270 |
+
"model.language_model.layers.3.self_attn.q_proj.weight": "model-00002-of-00008.safetensors",
|
| 271 |
+
"model.language_model.layers.3.self_attn.v_proj.weight": "model-00002-of-00008.safetensors",
|
| 272 |
+
"model.language_model.layers.30.input_layernorm.weight": "model-00006-of-00008.safetensors",
|
| 273 |
+
"model.language_model.layers.30.mlp.down_proj.weight": "model-00006-of-00008.safetensors",
|
| 274 |
+
"model.language_model.layers.30.mlp.gate_proj.weight": "model-00006-of-00008.safetensors",
|
| 275 |
+
"model.language_model.layers.30.mlp.up_proj.weight": "model-00006-of-00008.safetensors",
|
| 276 |
+
"model.language_model.layers.30.post_attention_layernorm.weight": "model-00006-of-00008.safetensors",
|
| 277 |
+
"model.language_model.layers.30.self_attn.k_norm.weight": "model-00006-of-00008.safetensors",
|
| 278 |
+
"model.language_model.layers.30.self_attn.k_proj.weight": "model-00006-of-00008.safetensors",
|
| 279 |
+
"model.language_model.layers.30.self_attn.o_proj.weight": "model-00006-of-00008.safetensors",
|
| 280 |
+
"model.language_model.layers.30.self_attn.q_norm.weight": "model-00006-of-00008.safetensors",
|
| 281 |
+
"model.language_model.layers.30.self_attn.q_proj.weight": "model-00006-of-00008.safetensors",
|
| 282 |
+
"model.language_model.layers.30.self_attn.v_proj.weight": "model-00006-of-00008.safetensors",
|
| 283 |
+
"model.language_model.layers.31.input_layernorm.weight": "model-00007-of-00008.safetensors",
|
| 284 |
+
"model.language_model.layers.31.mlp.down_proj.weight": "model-00007-of-00008.safetensors",
|
| 285 |
+
"model.language_model.layers.31.mlp.gate_proj.weight": "model-00006-of-00008.safetensors",
|
| 286 |
+
"model.language_model.layers.31.mlp.up_proj.weight": "model-00006-of-00008.safetensors",
|
| 287 |
+
"model.language_model.layers.31.post_attention_layernorm.weight": "model-00007-of-00008.safetensors",
|
| 288 |
+
"model.language_model.layers.31.self_attn.k_norm.weight": "model-00006-of-00008.safetensors",
|
| 289 |
+
"model.language_model.layers.31.self_attn.k_proj.weight": "model-00006-of-00008.safetensors",
|
| 290 |
+
"model.language_model.layers.31.self_attn.o_proj.weight": "model-00006-of-00008.safetensors",
|
| 291 |
+
"model.language_model.layers.31.self_attn.q_norm.weight": "model-00006-of-00008.safetensors",
|
| 292 |
+
"model.language_model.layers.31.self_attn.q_proj.weight": "model-00006-of-00008.safetensors",
|
| 293 |
+
"model.language_model.layers.31.self_attn.v_proj.weight": "model-00006-of-00008.safetensors",
|
| 294 |
+
"model.language_model.layers.32.input_layernorm.weight": "model-00007-of-00008.safetensors",
|
| 295 |
+
"model.language_model.layers.32.mlp.down_proj.weight": "model-00007-of-00008.safetensors",
|
| 296 |
+
"model.language_model.layers.32.mlp.gate_proj.weight": "model-00007-of-00008.safetensors",
|
| 297 |
+
"model.language_model.layers.32.mlp.up_proj.weight": "model-00007-of-00008.safetensors",
|
| 298 |
+
"model.language_model.layers.32.post_attention_layernorm.weight": "model-00007-of-00008.safetensors",
|
| 299 |
+
"model.language_model.layers.32.self_attn.k_norm.weight": "model-00007-of-00008.safetensors",
|
| 300 |
+
"model.language_model.layers.32.self_attn.k_proj.weight": "model-00007-of-00008.safetensors",
|
| 301 |
+
"model.language_model.layers.32.self_attn.o_proj.weight": "model-00007-of-00008.safetensors",
|
| 302 |
+
"model.language_model.layers.32.self_attn.q_norm.weight": "model-00007-of-00008.safetensors",
|
| 303 |
+
"model.language_model.layers.32.self_attn.q_proj.weight": "model-00007-of-00008.safetensors",
|
| 304 |
+
"model.language_model.layers.32.self_attn.v_proj.weight": "model-00007-of-00008.safetensors",
|
| 305 |
+
"model.language_model.layers.33.input_layernorm.weight": "model-00007-of-00008.safetensors",
|
| 306 |
+
"model.language_model.layers.33.mlp.down_proj.weight": "model-00007-of-00008.safetensors",
|
| 307 |
+
"model.language_model.layers.33.mlp.gate_proj.weight": "model-00007-of-00008.safetensors",
|
| 308 |
+
"model.language_model.layers.33.mlp.up_proj.weight": "model-00007-of-00008.safetensors",
|
| 309 |
+
"model.language_model.layers.33.post_attention_layernorm.weight": "model-00007-of-00008.safetensors",
|
| 310 |
+
"model.language_model.layers.33.self_attn.k_norm.weight": "model-00007-of-00008.safetensors",
|
| 311 |
+
"model.language_model.layers.33.self_attn.k_proj.weight": "model-00007-of-00008.safetensors",
|
| 312 |
+
"model.language_model.layers.33.self_attn.o_proj.weight": "model-00007-of-00008.safetensors",
|
| 313 |
+
"model.language_model.layers.33.self_attn.q_norm.weight": "model-00007-of-00008.safetensors",
|
| 314 |
+
"model.language_model.layers.33.self_attn.q_proj.weight": "model-00007-of-00008.safetensors",
|
| 315 |
+
"model.language_model.layers.33.self_attn.v_proj.weight": "model-00007-of-00008.safetensors",
|
| 316 |
+
"model.language_model.layers.34.input_layernorm.weight": "model-00007-of-00008.safetensors",
|
| 317 |
+
"model.language_model.layers.34.mlp.down_proj.weight": "model-00007-of-00008.safetensors",
|
| 318 |
+
"model.language_model.layers.34.mlp.gate_proj.weight": "model-00007-of-00008.safetensors",
|
| 319 |
+
"model.language_model.layers.34.mlp.up_proj.weight": "model-00007-of-00008.safetensors",
|
| 320 |
+
"model.language_model.layers.34.post_attention_layernorm.weight": "model-00007-of-00008.safetensors",
|
| 321 |
+
"model.language_model.layers.34.self_attn.k_norm.weight": "model-00007-of-00008.safetensors",
|
| 322 |
+
"model.language_model.layers.34.self_attn.k_proj.weight": "model-00007-of-00008.safetensors",
|
| 323 |
+
"model.language_model.layers.34.self_attn.o_proj.weight": "model-00007-of-00008.safetensors",
|
| 324 |
+
"model.language_model.layers.34.self_attn.q_norm.weight": "model-00007-of-00008.safetensors",
|
| 325 |
+
"model.language_model.layers.34.self_attn.q_proj.weight": "model-00007-of-00008.safetensors",
|
| 326 |
+
"model.language_model.layers.34.self_attn.v_proj.weight": "model-00007-of-00008.safetensors",
|
| 327 |
+
"model.language_model.layers.35.input_layernorm.weight": "model-00007-of-00008.safetensors",
|
| 328 |
+
"model.language_model.layers.35.mlp.down_proj.weight": "model-00007-of-00008.safetensors",
|
| 329 |
+
"model.language_model.layers.35.mlp.gate_proj.weight": "model-00007-of-00008.safetensors",
|
| 330 |
+
"model.language_model.layers.35.mlp.up_proj.weight": "model-00007-of-00008.safetensors",
|
| 331 |
+
"model.language_model.layers.35.post_attention_layernorm.weight": "model-00007-of-00008.safetensors",
|
| 332 |
+
"model.language_model.layers.35.self_attn.k_norm.weight": "model-00007-of-00008.safetensors",
|
| 333 |
+
"model.language_model.layers.35.self_attn.k_proj.weight": "model-00007-of-00008.safetensors",
|
| 334 |
+
"model.language_model.layers.35.self_attn.o_proj.weight": "model-00007-of-00008.safetensors",
|
| 335 |
+
"model.language_model.layers.35.self_attn.q_norm.weight": "model-00007-of-00008.safetensors",
|
| 336 |
+
"model.language_model.layers.35.self_attn.q_proj.weight": "model-00007-of-00008.safetensors",
|
| 337 |
+
"model.language_model.layers.35.self_attn.v_proj.weight": "model-00007-of-00008.safetensors",
|
| 338 |
+
"model.language_model.layers.4.input_layernorm.weight": "model-00002-of-00008.safetensors",
|
| 339 |
+
"model.language_model.layers.4.mlp.down_proj.weight": "model-00002-of-00008.safetensors",
|
| 340 |
+
"model.language_model.layers.4.mlp.gate_proj.weight": "model-00002-of-00008.safetensors",
|
| 341 |
+
"model.language_model.layers.4.mlp.up_proj.weight": "model-00002-of-00008.safetensors",
|
| 342 |
+
"model.language_model.layers.4.post_attention_layernorm.weight": "model-00002-of-00008.safetensors",
|
| 343 |
+
"model.language_model.layers.4.self_attn.k_norm.weight": "model-00002-of-00008.safetensors",
|
| 344 |
+
"model.language_model.layers.4.self_attn.k_proj.weight": "model-00002-of-00008.safetensors",
|
| 345 |
+
"model.language_model.layers.4.self_attn.o_proj.weight": "model-00002-of-00008.safetensors",
|
| 346 |
+
"model.language_model.layers.4.self_attn.q_norm.weight": "model-00002-of-00008.safetensors",
|
| 347 |
+
"model.language_model.layers.4.self_attn.q_proj.weight": "model-00002-of-00008.safetensors",
|
| 348 |
+
"model.language_model.layers.4.self_attn.v_proj.weight": "model-00002-of-00008.safetensors",
|
| 349 |
+
"model.language_model.layers.5.input_layernorm.weight": "model-00002-of-00008.safetensors",
|
| 350 |
+
"model.language_model.layers.5.mlp.down_proj.weight": "model-00002-of-00008.safetensors",
|
| 351 |
+
"model.language_model.layers.5.mlp.gate_proj.weight": "model-00002-of-00008.safetensors",
|
| 352 |
+
"model.language_model.layers.5.mlp.up_proj.weight": "model-00002-of-00008.safetensors",
|
| 353 |
+
"model.language_model.layers.5.post_attention_layernorm.weight": "model-00002-of-00008.safetensors",
|
| 354 |
+
"model.language_model.layers.5.self_attn.k_norm.weight": "model-00002-of-00008.safetensors",
|
| 355 |
+
"model.language_model.layers.5.self_attn.k_proj.weight": "model-00002-of-00008.safetensors",
|
| 356 |
+
"model.language_model.layers.5.self_attn.o_proj.weight": "model-00002-of-00008.safetensors",
|
| 357 |
+
"model.language_model.layers.5.self_attn.q_norm.weight": "model-00002-of-00008.safetensors",
|
| 358 |
+
"model.language_model.layers.5.self_attn.q_proj.weight": "model-00002-of-00008.safetensors",
|
| 359 |
+
"model.language_model.layers.5.self_attn.v_proj.weight": "model-00002-of-00008.safetensors",
|
| 360 |
+
"model.language_model.layers.6.input_layernorm.weight": "model-00003-of-00008.safetensors",
|
| 361 |
+
"model.language_model.layers.6.mlp.down_proj.weight": "model-00003-of-00008.safetensors",
|
| 362 |
+
"model.language_model.layers.6.mlp.gate_proj.weight": "model-00002-of-00008.safetensors",
|
| 363 |
+
"model.language_model.layers.6.mlp.up_proj.weight": "model-00003-of-00008.safetensors",
|
| 364 |
+
"model.language_model.layers.6.post_attention_layernorm.weight": "model-00003-of-00008.safetensors",
|
| 365 |
+
"model.language_model.layers.6.self_attn.k_norm.weight": "model-00002-of-00008.safetensors",
|
| 366 |
+
"model.language_model.layers.6.self_attn.k_proj.weight": "model-00002-of-00008.safetensors",
|
| 367 |
+
"model.language_model.layers.6.self_attn.o_proj.weight": "model-00002-of-00008.safetensors",
|
| 368 |
+
"model.language_model.layers.6.self_attn.q_norm.weight": "model-00002-of-00008.safetensors",
|
| 369 |
+
"model.language_model.layers.6.self_attn.q_proj.weight": "model-00002-of-00008.safetensors",
|
| 370 |
+
"model.language_model.layers.6.self_attn.v_proj.weight": "model-00002-of-00008.safetensors",
|
| 371 |
+
"model.language_model.layers.7.input_layernorm.weight": "model-00003-of-00008.safetensors",
|
| 372 |
+
"model.language_model.layers.7.mlp.down_proj.weight": "model-00003-of-00008.safetensors",
|
| 373 |
+
"model.language_model.layers.7.mlp.gate_proj.weight": "model-00003-of-00008.safetensors",
|
| 374 |
+
"model.language_model.layers.7.mlp.up_proj.weight": "model-00003-of-00008.safetensors",
|
| 375 |
+
"model.language_model.layers.7.post_attention_layernorm.weight": "model-00003-of-00008.safetensors",
|
| 376 |
+
"model.language_model.layers.7.self_attn.k_norm.weight": "model-00003-of-00008.safetensors",
|
| 377 |
+
"model.language_model.layers.7.self_attn.k_proj.weight": "model-00003-of-00008.safetensors",
|
| 378 |
+
"model.language_model.layers.7.self_attn.o_proj.weight": "model-00003-of-00008.safetensors",
|
| 379 |
+
"model.language_model.layers.7.self_attn.q_norm.weight": "model-00003-of-00008.safetensors",
|
| 380 |
+
"model.language_model.layers.7.self_attn.q_proj.weight": "model-00003-of-00008.safetensors",
|
| 381 |
+
"model.language_model.layers.7.self_attn.v_proj.weight": "model-00003-of-00008.safetensors",
|
| 382 |
+
"model.language_model.layers.8.input_layernorm.weight": "model-00003-of-00008.safetensors",
|
| 383 |
+
"model.language_model.layers.8.mlp.down_proj.weight": "model-00003-of-00008.safetensors",
|
| 384 |
+
"model.language_model.layers.8.mlp.gate_proj.weight": "model-00003-of-00008.safetensors",
|
| 385 |
+
"model.language_model.layers.8.mlp.up_proj.weight": "model-00003-of-00008.safetensors",
|
| 386 |
+
"model.language_model.layers.8.post_attention_layernorm.weight": "model-00003-of-00008.safetensors",
|
| 387 |
+
"model.language_model.layers.8.self_attn.k_norm.weight": "model-00003-of-00008.safetensors",
|
| 388 |
+
"model.language_model.layers.8.self_attn.k_proj.weight": "model-00003-of-00008.safetensors",
|
| 389 |
+
"model.language_model.layers.8.self_attn.o_proj.weight": "model-00003-of-00008.safetensors",
|
| 390 |
+
"model.language_model.layers.8.self_attn.q_norm.weight": "model-00003-of-00008.safetensors",
|
| 391 |
+
"model.language_model.layers.8.self_attn.q_proj.weight": "model-00003-of-00008.safetensors",
|
| 392 |
+
"model.language_model.layers.8.self_attn.v_proj.weight": "model-00003-of-00008.safetensors",
|
| 393 |
+
"model.language_model.layers.9.input_layernorm.weight": "model-00003-of-00008.safetensors",
|
| 394 |
+
"model.language_model.layers.9.mlp.down_proj.weight": "model-00003-of-00008.safetensors",
|
| 395 |
+
"model.language_model.layers.9.mlp.gate_proj.weight": "model-00003-of-00008.safetensors",
|
| 396 |
+
"model.language_model.layers.9.mlp.up_proj.weight": "model-00003-of-00008.safetensors",
|
| 397 |
+
"model.language_model.layers.9.post_attention_layernorm.weight": "model-00003-of-00008.safetensors",
|
| 398 |
+
"model.language_model.layers.9.self_attn.k_norm.weight": "model-00003-of-00008.safetensors",
|
| 399 |
+
"model.language_model.layers.9.self_attn.k_proj.weight": "model-00003-of-00008.safetensors",
|
| 400 |
+
"model.language_model.layers.9.self_attn.o_proj.weight": "model-00003-of-00008.safetensors",
|
| 401 |
+
"model.language_model.layers.9.self_attn.q_norm.weight": "model-00003-of-00008.safetensors",
|
| 402 |
+
"model.language_model.layers.9.self_attn.q_proj.weight": "model-00003-of-00008.safetensors",
|
| 403 |
+
"model.language_model.layers.9.self_attn.v_proj.weight": "model-00003-of-00008.safetensors",
|
| 404 |
+
"model.language_model.norm.weight": "model-00007-of-00008.safetensors",
|
| 405 |
+
"model.visual.blocks.0.attn.proj.bias": "model-00001-of-00008.safetensors",
|
| 406 |
+
"model.visual.blocks.0.attn.proj.weight": "model-00001-of-00008.safetensors",
|
| 407 |
+
"model.visual.blocks.0.attn.qkv.bias": "model-00001-of-00008.safetensors",
|
| 408 |
+
"model.visual.blocks.0.attn.qkv.weight": "model-00001-of-00008.safetensors",
|
| 409 |
+
"model.visual.blocks.0.mlp.linear_fc1.bias": "model-00001-of-00008.safetensors",
|
| 410 |
+
"model.visual.blocks.0.mlp.linear_fc1.weight": "model-00001-of-00008.safetensors",
|
| 411 |
+
"model.visual.blocks.0.mlp.linear_fc2.bias": "model-00001-of-00008.safetensors",
|
| 412 |
+
"model.visual.blocks.0.mlp.linear_fc2.weight": "model-00001-of-00008.safetensors",
|
| 413 |
+
"model.visual.blocks.0.norm1.bias": "model-00001-of-00008.safetensors",
|
| 414 |
+
"model.visual.blocks.0.norm1.weight": "model-00001-of-00008.safetensors",
|
| 415 |
+
"model.visual.blocks.0.norm2.bias": "model-00001-of-00008.safetensors",
|
| 416 |
+
"model.visual.blocks.0.norm2.weight": "model-00001-of-00008.safetensors",
|
| 417 |
+
"model.visual.blocks.1.attn.proj.bias": "model-00001-of-00008.safetensors",
|
| 418 |
+
"model.visual.blocks.1.attn.proj.weight": "model-00001-of-00008.safetensors",
|
| 419 |
+
"model.visual.blocks.1.attn.qkv.bias": "model-00001-of-00008.safetensors",
|
| 420 |
+
"model.visual.blocks.1.attn.qkv.weight": "model-00001-of-00008.safetensors",
|
| 421 |
+
"model.visual.blocks.1.mlp.linear_fc1.bias": "model-00001-of-00008.safetensors",
|
| 422 |
+
"model.visual.blocks.1.mlp.linear_fc1.weight": "model-00001-of-00008.safetensors",
|
| 423 |
+
"model.visual.blocks.1.mlp.linear_fc2.bias": "model-00001-of-00008.safetensors",
|
| 424 |
+
"model.visual.blocks.1.mlp.linear_fc2.weight": "model-00001-of-00008.safetensors",
|
| 425 |
+
"model.visual.blocks.1.norm1.bias": "model-00001-of-00008.safetensors",
|
| 426 |
+
"model.visual.blocks.1.norm1.weight": "model-00001-of-00008.safetensors",
|
| 427 |
+
"model.visual.blocks.1.norm2.bias": "model-00001-of-00008.safetensors",
|
| 428 |
+
"model.visual.blocks.1.norm2.weight": "model-00001-of-00008.safetensors",
|
| 429 |
+
"model.visual.blocks.10.attn.proj.bias": "model-00001-of-00008.safetensors",
|
| 430 |
+
"model.visual.blocks.10.attn.proj.weight": "model-00001-of-00008.safetensors",
|
| 431 |
+
"model.visual.blocks.10.attn.qkv.bias": "model-00001-of-00008.safetensors",
|
| 432 |
+
"model.visual.blocks.10.attn.qkv.weight": "model-00001-of-00008.safetensors",
|
| 433 |
+
"model.visual.blocks.10.mlp.linear_fc1.bias": "model-00001-of-00008.safetensors",
|
| 434 |
+
"model.visual.blocks.10.mlp.linear_fc1.weight": "model-00001-of-00008.safetensors",
|
| 435 |
+
"model.visual.blocks.10.mlp.linear_fc2.bias": "model-00001-of-00008.safetensors",
|
| 436 |
+
"model.visual.blocks.10.mlp.linear_fc2.weight": "model-00001-of-00008.safetensors",
|
| 437 |
+
"model.visual.blocks.10.norm1.bias": "model-00001-of-00008.safetensors",
|
| 438 |
+
"model.visual.blocks.10.norm1.weight": "model-00001-of-00008.safetensors",
|
| 439 |
+
"model.visual.blocks.10.norm2.bias": "model-00001-of-00008.safetensors",
|
| 440 |
+
"model.visual.blocks.10.norm2.weight": "model-00001-of-00008.safetensors",
|
| 441 |
+
"model.visual.blocks.11.attn.proj.bias": "model-00001-of-00008.safetensors",
|
| 442 |
+
"model.visual.blocks.11.attn.proj.weight": "model-00001-of-00008.safetensors",
|
| 443 |
+
"model.visual.blocks.11.attn.qkv.bias": "model-00001-of-00008.safetensors",
|
| 444 |
+
"model.visual.blocks.11.attn.qkv.weight": "model-00001-of-00008.safetensors",
|
| 445 |
+
"model.visual.blocks.11.mlp.linear_fc1.bias": "model-00001-of-00008.safetensors",
|
| 446 |
+
"model.visual.blocks.11.mlp.linear_fc1.weight": "model-00001-of-00008.safetensors",
|
| 447 |
+
"model.visual.blocks.11.mlp.linear_fc2.bias": "model-00001-of-00008.safetensors",
|
| 448 |
+
"model.visual.blocks.11.mlp.linear_fc2.weight": "model-00001-of-00008.safetensors",
|
| 449 |
+
"model.visual.blocks.11.norm1.bias": "model-00001-of-00008.safetensors",
|
| 450 |
+
"model.visual.blocks.11.norm1.weight": "model-00001-of-00008.safetensors",
|
| 451 |
+
"model.visual.blocks.11.norm2.bias": "model-00001-of-00008.safetensors",
|
| 452 |
+
"model.visual.blocks.11.norm2.weight": "model-00001-of-00008.safetensors",
|
| 453 |
+
"model.visual.blocks.12.attn.proj.bias": "model-00001-of-00008.safetensors",
|
| 454 |
+
"model.visual.blocks.12.attn.proj.weight": "model-00001-of-00008.safetensors",
|
| 455 |
+
"model.visual.blocks.12.attn.qkv.bias": "model-00001-of-00008.safetensors",
|
| 456 |
+
"model.visual.blocks.12.attn.qkv.weight": "model-00001-of-00008.safetensors",
|
| 457 |
+
"model.visual.blocks.12.mlp.linear_fc1.bias": "model-00001-of-00008.safetensors",
|
| 458 |
+
"model.visual.blocks.12.mlp.linear_fc1.weight": "model-00001-of-00008.safetensors",
|
| 459 |
+
"model.visual.blocks.12.mlp.linear_fc2.bias": "model-00001-of-00008.safetensors",
|
| 460 |
+
"model.visual.blocks.12.mlp.linear_fc2.weight": "model-00001-of-00008.safetensors",
|
| 461 |
+
"model.visual.blocks.12.norm1.bias": "model-00001-of-00008.safetensors",
|
| 462 |
+
"model.visual.blocks.12.norm1.weight": "model-00001-of-00008.safetensors",
|
| 463 |
+
"model.visual.blocks.12.norm2.bias": "model-00001-of-00008.safetensors",
|
| 464 |
+
"model.visual.blocks.12.norm2.weight": "model-00001-of-00008.safetensors",
|
| 465 |
+
"model.visual.blocks.13.attn.proj.bias": "model-00001-of-00008.safetensors",
|
| 466 |
+
"model.visual.blocks.13.attn.proj.weight": "model-00001-of-00008.safetensors",
|
| 467 |
+
"model.visual.blocks.13.attn.qkv.bias": "model-00001-of-00008.safetensors",
|
| 468 |
+
"model.visual.blocks.13.attn.qkv.weight": "model-00001-of-00008.safetensors",
|
| 469 |
+
"model.visual.blocks.13.mlp.linear_fc1.bias": "model-00001-of-00008.safetensors",
|
| 470 |
+
"model.visual.blocks.13.mlp.linear_fc1.weight": "model-00001-of-00008.safetensors",
|
| 471 |
+
"model.visual.blocks.13.mlp.linear_fc2.bias": "model-00001-of-00008.safetensors",
|
| 472 |
+
"model.visual.blocks.13.mlp.linear_fc2.weight": "model-00001-of-00008.safetensors",
|
| 473 |
+
"model.visual.blocks.13.norm1.bias": "model-00001-of-00008.safetensors",
|
| 474 |
+
"model.visual.blocks.13.norm1.weight": "model-00001-of-00008.safetensors",
|
| 475 |
+
"model.visual.blocks.13.norm2.bias": "model-00001-of-00008.safetensors",
|
| 476 |
+
"model.visual.blocks.13.norm2.weight": "model-00001-of-00008.safetensors",
|
| 477 |
+
"model.visual.blocks.14.attn.proj.bias": "model-00001-of-00008.safetensors",
|
| 478 |
+
"model.visual.blocks.14.attn.proj.weight": "model-00001-of-00008.safetensors",
|
| 479 |
+
"model.visual.blocks.14.attn.qkv.bias": "model-00001-of-00008.safetensors",
|
| 480 |
+
"model.visual.blocks.14.attn.qkv.weight": "model-00001-of-00008.safetensors",
|
| 481 |
+
"model.visual.blocks.14.mlp.linear_fc1.bias": "model-00001-of-00008.safetensors",
|
| 482 |
+
"model.visual.blocks.14.mlp.linear_fc1.weight": "model-00001-of-00008.safetensors",
|
| 483 |
+
"model.visual.blocks.14.mlp.linear_fc2.bias": "model-00001-of-00008.safetensors",
|
| 484 |
+
"model.visual.blocks.14.mlp.linear_fc2.weight": "model-00001-of-00008.safetensors",
|
| 485 |
+
"model.visual.blocks.14.norm1.bias": "model-00001-of-00008.safetensors",
|
| 486 |
+
"model.visual.blocks.14.norm1.weight": "model-00001-of-00008.safetensors",
|
| 487 |
+
"model.visual.blocks.14.norm2.bias": "model-00001-of-00008.safetensors",
|
| 488 |
+
"model.visual.blocks.14.norm2.weight": "model-00001-of-00008.safetensors",
|
| 489 |
+
"model.visual.blocks.15.attn.proj.bias": "model-00001-of-00008.safetensors",
|
| 490 |
+
"model.visual.blocks.15.attn.proj.weight": "model-00001-of-00008.safetensors",
|
| 491 |
+
"model.visual.blocks.15.attn.qkv.bias": "model-00001-of-00008.safetensors",
|
| 492 |
+
"model.visual.blocks.15.attn.qkv.weight": "model-00001-of-00008.safetensors",
|
| 493 |
+
"model.visual.blocks.15.mlp.linear_fc1.bias": "model-00001-of-00008.safetensors",
|
| 494 |
+
"model.visual.blocks.15.mlp.linear_fc1.weight": "model-00001-of-00008.safetensors",
|
| 495 |
+
"model.visual.blocks.15.mlp.linear_fc2.bias": "model-00001-of-00008.safetensors",
|
| 496 |
+
"model.visual.blocks.15.mlp.linear_fc2.weight": "model-00001-of-00008.safetensors",
|
| 497 |
+
"model.visual.blocks.15.norm1.bias": "model-00001-of-00008.safetensors",
|
| 498 |
+
"model.visual.blocks.15.norm1.weight": "model-00001-of-00008.safetensors",
|
| 499 |
+
"model.visual.blocks.15.norm2.bias": "model-00001-of-00008.safetensors",
|
| 500 |
+
"model.visual.blocks.15.norm2.weight": "model-00001-of-00008.safetensors",
|
| 501 |
+
"model.visual.blocks.16.attn.proj.bias": "model-00001-of-00008.safetensors",
|
| 502 |
+
"model.visual.blocks.16.attn.proj.weight": "model-00001-of-00008.safetensors",
|
| 503 |
+
"model.visual.blocks.16.attn.qkv.bias": "model-00001-of-00008.safetensors",
|
| 504 |
+
"model.visual.blocks.16.attn.qkv.weight": "model-00001-of-00008.safetensors",
|
| 505 |
+
"model.visual.blocks.16.mlp.linear_fc1.bias": "model-00001-of-00008.safetensors",
|
| 506 |
+
"model.visual.blocks.16.mlp.linear_fc1.weight": "model-00001-of-00008.safetensors",
|
| 507 |
+
"model.visual.blocks.16.mlp.linear_fc2.bias": "model-00001-of-00008.safetensors",
|
| 508 |
+
"model.visual.blocks.16.mlp.linear_fc2.weight": "model-00001-of-00008.safetensors",
|
| 509 |
+
"model.visual.blocks.16.norm1.bias": "model-00001-of-00008.safetensors",
|
| 510 |
+
"model.visual.blocks.16.norm1.weight": "model-00001-of-00008.safetensors",
|
| 511 |
+
"model.visual.blocks.16.norm2.bias": "model-00001-of-00008.safetensors",
|
| 512 |
+
"model.visual.blocks.16.norm2.weight": "model-00001-of-00008.safetensors",
|
| 513 |
+
"model.visual.blocks.17.attn.proj.bias": "model-00001-of-00008.safetensors",
|
| 514 |
+
"model.visual.blocks.17.attn.proj.weight": "model-00001-of-00008.safetensors",
|
| 515 |
+
"model.visual.blocks.17.attn.qkv.bias": "model-00001-of-00008.safetensors",
|
| 516 |
+
"model.visual.blocks.17.attn.qkv.weight": "model-00001-of-00008.safetensors",
|
| 517 |
+
"model.visual.blocks.17.mlp.linear_fc1.bias": "model-00001-of-00008.safetensors",
|
| 518 |
+
"model.visual.blocks.17.mlp.linear_fc1.weight": "model-00001-of-00008.safetensors",
|
| 519 |
+
"model.visual.blocks.17.mlp.linear_fc2.bias": "model-00001-of-00008.safetensors",
|
| 520 |
+
"model.visual.blocks.17.mlp.linear_fc2.weight": "model-00001-of-00008.safetensors",
|
| 521 |
+
"model.visual.blocks.17.norm1.bias": "model-00001-of-00008.safetensors",
|
| 522 |
+
"model.visual.blocks.17.norm1.weight": "model-00001-of-00008.safetensors",
|
| 523 |
+
"model.visual.blocks.17.norm2.bias": "model-00001-of-00008.safetensors",
|
| 524 |
+
"model.visual.blocks.17.norm2.weight": "model-00001-of-00008.safetensors",
|
| 525 |
+
"model.visual.blocks.18.attn.proj.bias": "model-00001-of-00008.safetensors",
|
| 526 |
+
"model.visual.blocks.18.attn.proj.weight": "model-00001-of-00008.safetensors",
|
| 527 |
+
"model.visual.blocks.18.attn.qkv.bias": "model-00001-of-00008.safetensors",
|
| 528 |
+
"model.visual.blocks.18.attn.qkv.weight": "model-00001-of-00008.safetensors",
|
| 529 |
+
"model.visual.blocks.18.mlp.linear_fc1.bias": "model-00001-of-00008.safetensors",
|
| 530 |
+
"model.visual.blocks.18.mlp.linear_fc1.weight": "model-00001-of-00008.safetensors",
|
| 531 |
+
"model.visual.blocks.18.mlp.linear_fc2.bias": "model-00001-of-00008.safetensors",
|
| 532 |
+
"model.visual.blocks.18.mlp.linear_fc2.weight": "model-00001-of-00008.safetensors",
|
| 533 |
+
"model.visual.blocks.18.norm1.bias": "model-00001-of-00008.safetensors",
|
| 534 |
+
"model.visual.blocks.18.norm1.weight": "model-00001-of-00008.safetensors",
|
| 535 |
+
"model.visual.blocks.18.norm2.bias": "model-00001-of-00008.safetensors",
|
| 536 |
+
"model.visual.blocks.18.norm2.weight": "model-00001-of-00008.safetensors",
|
| 537 |
+
"model.visual.blocks.19.attn.proj.bias": "model-00001-of-00008.safetensors",
|
| 538 |
+
"model.visual.blocks.19.attn.proj.weight": "model-00001-of-00008.safetensors",
|
| 539 |
+
"model.visual.blocks.19.attn.qkv.bias": "model-00001-of-00008.safetensors",
|
| 540 |
+
"model.visual.blocks.19.attn.qkv.weight": "model-00001-of-00008.safetensors",
|
| 541 |
+
"model.visual.blocks.19.mlp.linear_fc1.bias": "model-00001-of-00008.safetensors",
|
| 542 |
+
"model.visual.blocks.19.mlp.linear_fc1.weight": "model-00001-of-00008.safetensors",
|
| 543 |
+
"model.visual.blocks.19.mlp.linear_fc2.bias": "model-00001-of-00008.safetensors",
|
| 544 |
+
"model.visual.blocks.19.mlp.linear_fc2.weight": "model-00001-of-00008.safetensors",
|
| 545 |
+
"model.visual.blocks.19.norm1.bias": "model-00001-of-00008.safetensors",
|
| 546 |
+
"model.visual.blocks.19.norm1.weight": "model-00001-of-00008.safetensors",
|
| 547 |
+
"model.visual.blocks.19.norm2.bias": "model-00001-of-00008.safetensors",
|
| 548 |
+
"model.visual.blocks.19.norm2.weight": "model-00001-of-00008.safetensors",
|
| 549 |
+
"model.visual.blocks.2.attn.proj.bias": "model-00001-of-00008.safetensors",
|
| 550 |
+
"model.visual.blocks.2.attn.proj.weight": "model-00001-of-00008.safetensors",
|
| 551 |
+
"model.visual.blocks.2.attn.qkv.bias": "model-00001-of-00008.safetensors",
|
| 552 |
+
"model.visual.blocks.2.attn.qkv.weight": "model-00001-of-00008.safetensors",
|
| 553 |
+
"model.visual.blocks.2.mlp.linear_fc1.bias": "model-00001-of-00008.safetensors",
|
| 554 |
+
"model.visual.blocks.2.mlp.linear_fc1.weight": "model-00001-of-00008.safetensors",
|
| 555 |
+
"model.visual.blocks.2.mlp.linear_fc2.bias": "model-00001-of-00008.safetensors",
|
| 556 |
+
"model.visual.blocks.2.mlp.linear_fc2.weight": "model-00001-of-00008.safetensors",
|
| 557 |
+
"model.visual.blocks.2.norm1.bias": "model-00001-of-00008.safetensors",
|
| 558 |
+
"model.visual.blocks.2.norm1.weight": "model-00001-of-00008.safetensors",
|
| 559 |
+
"model.visual.blocks.2.norm2.bias": "model-00001-of-00008.safetensors",
|
| 560 |
+
"model.visual.blocks.2.norm2.weight": "model-00001-of-00008.safetensors",
|
| 561 |
+
"model.visual.blocks.20.attn.proj.bias": "model-00001-of-00008.safetensors",
|
| 562 |
+
"model.visual.blocks.20.attn.proj.weight": "model-00001-of-00008.safetensors",
|
| 563 |
+
"model.visual.blocks.20.attn.qkv.bias": "model-00001-of-00008.safetensors",
|
| 564 |
+
"model.visual.blocks.20.attn.qkv.weight": "model-00001-of-00008.safetensors",
|
| 565 |
+
"model.visual.blocks.20.mlp.linear_fc1.bias": "model-00001-of-00008.safetensors",
|
| 566 |
+
"model.visual.blocks.20.mlp.linear_fc1.weight": "model-00001-of-00008.safetensors",
|
| 567 |
+
"model.visual.blocks.20.mlp.linear_fc2.bias": "model-00001-of-00008.safetensors",
|
| 568 |
+
"model.visual.blocks.20.mlp.linear_fc2.weight": "model-00001-of-00008.safetensors",
|
| 569 |
+
"model.visual.blocks.20.norm1.bias": "model-00001-of-00008.safetensors",
|
| 570 |
+
"model.visual.blocks.20.norm1.weight": "model-00001-of-00008.safetensors",
|
| 571 |
+
"model.visual.blocks.20.norm2.bias": "model-00001-of-00008.safetensors",
|
| 572 |
+
"model.visual.blocks.20.norm2.weight": "model-00001-of-00008.safetensors",
|
| 573 |
+
"model.visual.blocks.21.attn.proj.bias": "model-00001-of-00008.safetensors",
|
| 574 |
+
"model.visual.blocks.21.attn.proj.weight": "model-00001-of-00008.safetensors",
|
| 575 |
+
"model.visual.blocks.21.attn.qkv.bias": "model-00001-of-00008.safetensors",
|
| 576 |
+
"model.visual.blocks.21.attn.qkv.weight": "model-00001-of-00008.safetensors",
|
| 577 |
+
"model.visual.blocks.21.mlp.linear_fc1.bias": "model-00001-of-00008.safetensors",
|
| 578 |
+
"model.visual.blocks.21.mlp.linear_fc1.weight": "model-00001-of-00008.safetensors",
|
| 579 |
+
"model.visual.blocks.21.mlp.linear_fc2.bias": "model-00001-of-00008.safetensors",
|
| 580 |
+
"model.visual.blocks.21.mlp.linear_fc2.weight": "model-00001-of-00008.safetensors",
|
| 581 |
+
"model.visual.blocks.21.norm1.bias": "model-00001-of-00008.safetensors",
|
| 582 |
+
"model.visual.blocks.21.norm1.weight": "model-00001-of-00008.safetensors",
|
| 583 |
+
"model.visual.blocks.21.norm2.bias": "model-00001-of-00008.safetensors",
|
| 584 |
+
"model.visual.blocks.21.norm2.weight": "model-00001-of-00008.safetensors",
|
| 585 |
+
"model.visual.blocks.22.attn.proj.bias": "model-00001-of-00008.safetensors",
|
| 586 |
+
"model.visual.blocks.22.attn.proj.weight": "model-00001-of-00008.safetensors",
|
| 587 |
+
"model.visual.blocks.22.attn.qkv.bias": "model-00001-of-00008.safetensors",
|
| 588 |
+
"model.visual.blocks.22.attn.qkv.weight": "model-00001-of-00008.safetensors",
|
| 589 |
+
"model.visual.blocks.22.mlp.linear_fc1.bias": "model-00001-of-00008.safetensors",
|
| 590 |
+
"model.visual.blocks.22.mlp.linear_fc1.weight": "model-00001-of-00008.safetensors",
|
| 591 |
+
"model.visual.blocks.22.mlp.linear_fc2.bias": "model-00001-of-00008.safetensors",
|
| 592 |
+
"model.visual.blocks.22.mlp.linear_fc2.weight": "model-00001-of-00008.safetensors",
|
| 593 |
+
"model.visual.blocks.22.norm1.bias": "model-00001-of-00008.safetensors",
|
| 594 |
+
"model.visual.blocks.22.norm1.weight": "model-00001-of-00008.safetensors",
|
| 595 |
+
"model.visual.blocks.22.norm2.bias": "model-00001-of-00008.safetensors",
|
| 596 |
+
"model.visual.blocks.22.norm2.weight": "model-00001-of-00008.safetensors",
|
| 597 |
+
"model.visual.blocks.23.attn.proj.bias": "model-00001-of-00008.safetensors",
|
| 598 |
+
"model.visual.blocks.23.attn.proj.weight": "model-00001-of-00008.safetensors",
|
| 599 |
+
"model.visual.blocks.23.attn.qkv.bias": "model-00001-of-00008.safetensors",
|
| 600 |
+
"model.visual.blocks.23.attn.qkv.weight": "model-00001-of-00008.safetensors",
|
| 601 |
+
"model.visual.blocks.23.mlp.linear_fc1.bias": "model-00001-of-00008.safetensors",
|
| 602 |
+
"model.visual.blocks.23.mlp.linear_fc1.weight": "model-00001-of-00008.safetensors",
|
| 603 |
+
"model.visual.blocks.23.mlp.linear_fc2.bias": "model-00001-of-00008.safetensors",
|
| 604 |
+
"model.visual.blocks.23.mlp.linear_fc2.weight": "model-00001-of-00008.safetensors",
|
| 605 |
+
"model.visual.blocks.23.norm1.bias": "model-00001-of-00008.safetensors",
|
| 606 |
+
"model.visual.blocks.23.norm1.weight": "model-00001-of-00008.safetensors",
|
| 607 |
+
"model.visual.blocks.23.norm2.bias": "model-00001-of-00008.safetensors",
|
| 608 |
+
"model.visual.blocks.23.norm2.weight": "model-00001-of-00008.safetensors",
|
| 609 |
+
"model.visual.blocks.24.attn.proj.bias": "model-00001-of-00008.safetensors",
|
| 610 |
+
"model.visual.blocks.24.attn.proj.weight": "model-00001-of-00008.safetensors",
|
| 611 |
+
"model.visual.blocks.24.attn.qkv.bias": "model-00001-of-00008.safetensors",
|
| 612 |
+
"model.visual.blocks.24.attn.qkv.weight": "model-00001-of-00008.safetensors",
|
| 613 |
+
"model.visual.blocks.24.mlp.linear_fc1.bias": "model-00001-of-00008.safetensors",
|
| 614 |
+
"model.visual.blocks.24.mlp.linear_fc1.weight": "model-00001-of-00008.safetensors",
|
| 615 |
+
"model.visual.blocks.24.mlp.linear_fc2.bias": "model-00001-of-00008.safetensors",
|
| 616 |
+
"model.visual.blocks.24.mlp.linear_fc2.weight": "model-00001-of-00008.safetensors",
|
| 617 |
+
"model.visual.blocks.24.norm1.bias": "model-00001-of-00008.safetensors",
|
| 618 |
+
"model.visual.blocks.24.norm1.weight": "model-00001-of-00008.safetensors",
|
| 619 |
+
"model.visual.blocks.24.norm2.bias": "model-00001-of-00008.safetensors",
|
| 620 |
+
"model.visual.blocks.24.norm2.weight": "model-00001-of-00008.safetensors",
|
| 621 |
+
"model.visual.blocks.25.attn.proj.bias": "model-00001-of-00008.safetensors",
|
| 622 |
+
"model.visual.blocks.25.attn.proj.weight": "model-00001-of-00008.safetensors",
|
| 623 |
+
"model.visual.blocks.25.attn.qkv.bias": "model-00001-of-00008.safetensors",
|
| 624 |
+
"model.visual.blocks.25.attn.qkv.weight": "model-00001-of-00008.safetensors",
|
| 625 |
+
"model.visual.blocks.25.mlp.linear_fc1.bias": "model-00001-of-00008.safetensors",
|
| 626 |
+
"model.visual.blocks.25.mlp.linear_fc1.weight": "model-00001-of-00008.safetensors",
|
| 627 |
+
"model.visual.blocks.25.mlp.linear_fc2.bias": "model-00001-of-00008.safetensors",
|
| 628 |
+
"model.visual.blocks.25.mlp.linear_fc2.weight": "model-00001-of-00008.safetensors",
|
| 629 |
+
"model.visual.blocks.25.norm1.bias": "model-00001-of-00008.safetensors",
|
| 630 |
+
"model.visual.blocks.25.norm1.weight": "model-00001-of-00008.safetensors",
|
| 631 |
+
"model.visual.blocks.25.norm2.bias": "model-00001-of-00008.safetensors",
|
| 632 |
+
"model.visual.blocks.25.norm2.weight": "model-00001-of-00008.safetensors",
|
| 633 |
+
"model.visual.blocks.26.attn.proj.bias": "model-00001-of-00008.safetensors",
|
| 634 |
+
"model.visual.blocks.26.attn.proj.weight": "model-00001-of-00008.safetensors",
|
| 635 |
+
"model.visual.blocks.26.attn.qkv.bias": "model-00001-of-00008.safetensors",
|
| 636 |
+
"model.visual.blocks.26.attn.qkv.weight": "model-00001-of-00008.safetensors",
|
| 637 |
+
"model.visual.blocks.26.mlp.linear_fc1.bias": "model-00001-of-00008.safetensors",
|
| 638 |
+
"model.visual.blocks.26.mlp.linear_fc1.weight": "model-00001-of-00008.safetensors",
|
| 639 |
+
"model.visual.blocks.26.mlp.linear_fc2.bias": "model-00001-of-00008.safetensors",
|
| 640 |
+
"model.visual.blocks.26.mlp.linear_fc2.weight": "model-00001-of-00008.safetensors",
|
| 641 |
+
"model.visual.blocks.26.norm1.bias": "model-00001-of-00008.safetensors",
|
| 642 |
+
"model.visual.blocks.26.norm1.weight": "model-00001-of-00008.safetensors",
|
| 643 |
+
"model.visual.blocks.26.norm2.bias": "model-00001-of-00008.safetensors",
|
| 644 |
+
"model.visual.blocks.26.norm2.weight": "model-00001-of-00008.safetensors",
|
| 645 |
+
"model.visual.blocks.3.attn.proj.bias": "model-00001-of-00008.safetensors",
|
| 646 |
+
"model.visual.blocks.3.attn.proj.weight": "model-00001-of-00008.safetensors",
|
| 647 |
+
"model.visual.blocks.3.attn.qkv.bias": "model-00001-of-00008.safetensors",
|
| 648 |
+
"model.visual.blocks.3.attn.qkv.weight": "model-00001-of-00008.safetensors",
|
| 649 |
+
"model.visual.blocks.3.mlp.linear_fc1.bias": "model-00001-of-00008.safetensors",
|
| 650 |
+
"model.visual.blocks.3.mlp.linear_fc1.weight": "model-00001-of-00008.safetensors",
|
| 651 |
+
"model.visual.blocks.3.mlp.linear_fc2.bias": "model-00001-of-00008.safetensors",
|
| 652 |
+
"model.visual.blocks.3.mlp.linear_fc2.weight": "model-00001-of-00008.safetensors",
|
| 653 |
+
"model.visual.blocks.3.norm1.bias": "model-00001-of-00008.safetensors",
|
| 654 |
+
"model.visual.blocks.3.norm1.weight": "model-00001-of-00008.safetensors",
|
| 655 |
+
"model.visual.blocks.3.norm2.bias": "model-00001-of-00008.safetensors",
|
| 656 |
+
"model.visual.blocks.3.norm2.weight": "model-00001-of-00008.safetensors",
|
| 657 |
+
"model.visual.blocks.4.attn.proj.bias": "model-00001-of-00008.safetensors",
|
| 658 |
+
"model.visual.blocks.4.attn.proj.weight": "model-00001-of-00008.safetensors",
|
| 659 |
+
"model.visual.blocks.4.attn.qkv.bias": "model-00001-of-00008.safetensors",
|
| 660 |
+
"model.visual.blocks.4.attn.qkv.weight": "model-00001-of-00008.safetensors",
|
| 661 |
+
"model.visual.blocks.4.mlp.linear_fc1.bias": "model-00001-of-00008.safetensors",
|
| 662 |
+
"model.visual.blocks.4.mlp.linear_fc1.weight": "model-00001-of-00008.safetensors",
|
| 663 |
+
"model.visual.blocks.4.mlp.linear_fc2.bias": "model-00001-of-00008.safetensors",
|
| 664 |
+
"model.visual.blocks.4.mlp.linear_fc2.weight": "model-00001-of-00008.safetensors",
|
| 665 |
+
"model.visual.blocks.4.norm1.bias": "model-00001-of-00008.safetensors",
|
| 666 |
+
"model.visual.blocks.4.norm1.weight": "model-00001-of-00008.safetensors",
|
| 667 |
+
"model.visual.blocks.4.norm2.bias": "model-00001-of-00008.safetensors",
|
| 668 |
+
"model.visual.blocks.4.norm2.weight": "model-00001-of-00008.safetensors",
|
| 669 |
+
"model.visual.blocks.5.attn.proj.bias": "model-00001-of-00008.safetensors",
|
| 670 |
+
"model.visual.blocks.5.attn.proj.weight": "model-00001-of-00008.safetensors",
|
| 671 |
+
"model.visual.blocks.5.attn.qkv.bias": "model-00001-of-00008.safetensors",
|
| 672 |
+
"model.visual.blocks.5.attn.qkv.weight": "model-00001-of-00008.safetensors",
|
| 673 |
+
"model.visual.blocks.5.mlp.linear_fc1.bias": "model-00001-of-00008.safetensors",
|
| 674 |
+
"model.visual.blocks.5.mlp.linear_fc1.weight": "model-00001-of-00008.safetensors",
|
| 675 |
+
"model.visual.blocks.5.mlp.linear_fc2.bias": "model-00001-of-00008.safetensors",
|
| 676 |
+
"model.visual.blocks.5.mlp.linear_fc2.weight": "model-00001-of-00008.safetensors",
|
| 677 |
+
"model.visual.blocks.5.norm1.bias": "model-00001-of-00008.safetensors",
|
| 678 |
+
"model.visual.blocks.5.norm1.weight": "model-00001-of-00008.safetensors",
|
| 679 |
+
"model.visual.blocks.5.norm2.bias": "model-00001-of-00008.safetensors",
|
| 680 |
+
"model.visual.blocks.5.norm2.weight": "model-00001-of-00008.safetensors",
|
| 681 |
+
"model.visual.blocks.6.attn.proj.bias": "model-00001-of-00008.safetensors",
|
| 682 |
+
"model.visual.blocks.6.attn.proj.weight": "model-00001-of-00008.safetensors",
|
| 683 |
+
"model.visual.blocks.6.attn.qkv.bias": "model-00001-of-00008.safetensors",
|
| 684 |
+
"model.visual.blocks.6.attn.qkv.weight": "model-00001-of-00008.safetensors",
|
| 685 |
+
"model.visual.blocks.6.mlp.linear_fc1.bias": "model-00001-of-00008.safetensors",
|
| 686 |
+
"model.visual.blocks.6.mlp.linear_fc1.weight": "model-00001-of-00008.safetensors",
|
| 687 |
+
"model.visual.blocks.6.mlp.linear_fc2.bias": "model-00001-of-00008.safetensors",
|
| 688 |
+
"model.visual.blocks.6.mlp.linear_fc2.weight": "model-00001-of-00008.safetensors",
|
| 689 |
+
"model.visual.blocks.6.norm1.bias": "model-00001-of-00008.safetensors",
|
| 690 |
+
"model.visual.blocks.6.norm1.weight": "model-00001-of-00008.safetensors",
|
| 691 |
+
"model.visual.blocks.6.norm2.bias": "model-00001-of-00008.safetensors",
|
| 692 |
+
"model.visual.blocks.6.norm2.weight": "model-00001-of-00008.safetensors",
|
| 693 |
+
"model.visual.blocks.7.attn.proj.bias": "model-00001-of-00008.safetensors",
|
| 694 |
+
"model.visual.blocks.7.attn.proj.weight": "model-00001-of-00008.safetensors",
|
| 695 |
+
"model.visual.blocks.7.attn.qkv.bias": "model-00001-of-00008.safetensors",
|
| 696 |
+
"model.visual.blocks.7.attn.qkv.weight": "model-00001-of-00008.safetensors",
|
| 697 |
+
"model.visual.blocks.7.mlp.linear_fc1.bias": "model-00001-of-00008.safetensors",
|
| 698 |
+
"model.visual.blocks.7.mlp.linear_fc1.weight": "model-00001-of-00008.safetensors",
|
| 699 |
+
"model.visual.blocks.7.mlp.linear_fc2.bias": "model-00001-of-00008.safetensors",
|
| 700 |
+
"model.visual.blocks.7.mlp.linear_fc2.weight": "model-00001-of-00008.safetensors",
|
| 701 |
+
"model.visual.blocks.7.norm1.bias": "model-00001-of-00008.safetensors",
|
| 702 |
+
"model.visual.blocks.7.norm1.weight": "model-00001-of-00008.safetensors",
|
| 703 |
+
"model.visual.blocks.7.norm2.bias": "model-00001-of-00008.safetensors",
|
| 704 |
+
"model.visual.blocks.7.norm2.weight": "model-00001-of-00008.safetensors",
|
| 705 |
+
"model.visual.blocks.8.attn.proj.bias": "model-00001-of-00008.safetensors",
|
| 706 |
+
"model.visual.blocks.8.attn.proj.weight": "model-00001-of-00008.safetensors",
|
| 707 |
+
"model.visual.blocks.8.attn.qkv.bias": "model-00001-of-00008.safetensors",
|
| 708 |
+
"model.visual.blocks.8.attn.qkv.weight": "model-00001-of-00008.safetensors",
|
| 709 |
+
"model.visual.blocks.8.mlp.linear_fc1.bias": "model-00001-of-00008.safetensors",
|
| 710 |
+
"model.visual.blocks.8.mlp.linear_fc1.weight": "model-00001-of-00008.safetensors",
|
| 711 |
+
"model.visual.blocks.8.mlp.linear_fc2.bias": "model-00001-of-00008.safetensors",
|
| 712 |
+
"model.visual.blocks.8.mlp.linear_fc2.weight": "model-00001-of-00008.safetensors",
|
| 713 |
+
"model.visual.blocks.8.norm1.bias": "model-00001-of-00008.safetensors",
|
| 714 |
+
"model.visual.blocks.8.norm1.weight": "model-00001-of-00008.safetensors",
|
| 715 |
+
"model.visual.blocks.8.norm2.bias": "model-00001-of-00008.safetensors",
|
| 716 |
+
"model.visual.blocks.8.norm2.weight": "model-00001-of-00008.safetensors",
|
| 717 |
+
"model.visual.blocks.9.attn.proj.bias": "model-00001-of-00008.safetensors",
|
| 718 |
+
"model.visual.blocks.9.attn.proj.weight": "model-00001-of-00008.safetensors",
|
| 719 |
+
"model.visual.blocks.9.attn.qkv.bias": "model-00001-of-00008.safetensors",
|
| 720 |
+
"model.visual.blocks.9.attn.qkv.weight": "model-00001-of-00008.safetensors",
|
| 721 |
+
"model.visual.blocks.9.mlp.linear_fc1.bias": "model-00001-of-00008.safetensors",
|
| 722 |
+
"model.visual.blocks.9.mlp.linear_fc1.weight": "model-00001-of-00008.safetensors",
|
| 723 |
+
"model.visual.blocks.9.mlp.linear_fc2.bias": "model-00001-of-00008.safetensors",
|
| 724 |
+
"model.visual.blocks.9.mlp.linear_fc2.weight": "model-00001-of-00008.safetensors",
|
| 725 |
+
"model.visual.blocks.9.norm1.bias": "model-00001-of-00008.safetensors",
|
| 726 |
+
"model.visual.blocks.9.norm1.weight": "model-00001-of-00008.safetensors",
|
| 727 |
+
"model.visual.blocks.9.norm2.bias": "model-00001-of-00008.safetensors",
|
| 728 |
+
"model.visual.blocks.9.norm2.weight": "model-00001-of-00008.safetensors",
|
| 729 |
+
"model.visual.deepstack_merger_list.0.linear_fc1.bias": "model-00001-of-00008.safetensors",
|
| 730 |
+
"model.visual.deepstack_merger_list.0.linear_fc1.weight": "model-00001-of-00008.safetensors",
|
| 731 |
+
"model.visual.deepstack_merger_list.0.linear_fc2.bias": "model-00001-of-00008.safetensors",
|
| 732 |
+
"model.visual.deepstack_merger_list.0.linear_fc2.weight": "model-00001-of-00008.safetensors",
|
| 733 |
+
"model.visual.deepstack_merger_list.0.norm.bias": "model-00001-of-00008.safetensors",
|
| 734 |
+
"model.visual.deepstack_merger_list.0.norm.weight": "model-00001-of-00008.safetensors",
|
| 735 |
+
"model.visual.deepstack_merger_list.1.linear_fc1.bias": "model-00001-of-00008.safetensors",
|
| 736 |
+
"model.visual.deepstack_merger_list.1.linear_fc1.weight": "model-00001-of-00008.safetensors",
|
| 737 |
+
"model.visual.deepstack_merger_list.1.linear_fc2.bias": "model-00001-of-00008.safetensors",
|
| 738 |
+
"model.visual.deepstack_merger_list.1.linear_fc2.weight": "model-00001-of-00008.safetensors",
|
| 739 |
+
"model.visual.deepstack_merger_list.1.norm.bias": "model-00001-of-00008.safetensors",
|
| 740 |
+
"model.visual.deepstack_merger_list.1.norm.weight": "model-00001-of-00008.safetensors",
|
| 741 |
+
"model.visual.deepstack_merger_list.2.linear_fc1.bias": "model-00001-of-00008.safetensors",
|
| 742 |
+
"model.visual.deepstack_merger_list.2.linear_fc1.weight": "model-00001-of-00008.safetensors",
|
| 743 |
+
"model.visual.deepstack_merger_list.2.linear_fc2.bias": "model-00001-of-00008.safetensors",
|
| 744 |
+
"model.visual.deepstack_merger_list.2.linear_fc2.weight": "model-00001-of-00008.safetensors",
|
| 745 |
+
"model.visual.deepstack_merger_list.2.norm.bias": "model-00001-of-00008.safetensors",
|
| 746 |
+
"model.visual.deepstack_merger_list.2.norm.weight": "model-00001-of-00008.safetensors",
|
| 747 |
+
"model.visual.merger.linear_fc1.bias": "model-00001-of-00008.safetensors",
|
| 748 |
+
"model.visual.merger.linear_fc1.weight": "model-00001-of-00008.safetensors",
|
| 749 |
+
"model.visual.merger.linear_fc2.bias": "model-00001-of-00008.safetensors",
|
| 750 |
+
"model.visual.merger.linear_fc2.weight": "model-00001-of-00008.safetensors",
|
| 751 |
+
"model.visual.merger.norm.bias": "model-00001-of-00008.safetensors",
|
| 752 |
+
"model.visual.merger.norm.weight": "model-00001-of-00008.safetensors",
|
| 753 |
+
"model.visual.patch_embed.proj.bias": "model-00001-of-00008.safetensors",
|
| 754 |
+
"model.visual.patch_embed.proj.weight": "model-00001-of-00008.safetensors",
|
| 755 |
+
"model.visual.pos_embed.weight": "model-00001-of-00008.safetensors"
|
| 756 |
+
}
|
| 757 |
+
}
|
ICL/sft_model/epoch3_step1406_fp32/preprocessor_config.json
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"size": {
|
| 3 |
+
"longest_edge": 16777216,
|
| 4 |
+
"shortest_edge": 65536
|
| 5 |
+
},
|
| 6 |
+
"patch_size": 16,
|
| 7 |
+
"temporal_patch_size": 2,
|
| 8 |
+
"merge_size": 2,
|
| 9 |
+
"image_mean": [
|
| 10 |
+
0.5,
|
| 11 |
+
0.5,
|
| 12 |
+
0.5
|
| 13 |
+
],
|
| 14 |
+
"image_std": [
|
| 15 |
+
0.5,
|
| 16 |
+
0.5,
|
| 17 |
+
0.5
|
| 18 |
+
],
|
| 19 |
+
"processor_class": "Qwen3VLProcessor",
|
| 20 |
+
"image_processor_type": "Qwen2VLImageProcessorFast"
|
| 21 |
+
}
|
ICL/sft_model/epoch3_step1406_fp32/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
ICL/sft_model/epoch3_step1406_fp32/tokenizer_config.json
ADDED
|
@@ -0,0 +1,239 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_bos_token": false,
|
| 3 |
+
"add_prefix_space": false,
|
| 4 |
+
"added_tokens_decoder": {
|
| 5 |
+
"151643": {
|
| 6 |
+
"content": "<|endoftext|>",
|
| 7 |
+
"lstrip": false,
|
| 8 |
+
"normalized": false,
|
| 9 |
+
"rstrip": false,
|
| 10 |
+
"single_word": false,
|
| 11 |
+
"special": true
|
| 12 |
+
},
|
| 13 |
+
"151644": {
|
| 14 |
+
"content": "<|im_start|>",
|
| 15 |
+
"lstrip": false,
|
| 16 |
+
"normalized": false,
|
| 17 |
+
"rstrip": false,
|
| 18 |
+
"single_word": false,
|
| 19 |
+
"special": true
|
| 20 |
+
},
|
| 21 |
+
"151645": {
|
| 22 |
+
"content": "<|im_end|>",
|
| 23 |
+
"lstrip": false,
|
| 24 |
+
"normalized": false,
|
| 25 |
+
"rstrip": false,
|
| 26 |
+
"single_word": false,
|
| 27 |
+
"special": true
|
| 28 |
+
},
|
| 29 |
+
"151646": {
|
| 30 |
+
"content": "<|object_ref_start|>",
|
| 31 |
+
"lstrip": false,
|
| 32 |
+
"normalized": false,
|
| 33 |
+
"rstrip": false,
|
| 34 |
+
"single_word": false,
|
| 35 |
+
"special": true
|
| 36 |
+
},
|
| 37 |
+
"151647": {
|
| 38 |
+
"content": "<|object_ref_end|>",
|
| 39 |
+
"lstrip": false,
|
| 40 |
+
"normalized": false,
|
| 41 |
+
"rstrip": false,
|
| 42 |
+
"single_word": false,
|
| 43 |
+
"special": true
|
| 44 |
+
},
|
| 45 |
+
"151648": {
|
| 46 |
+
"content": "<|box_start|>",
|
| 47 |
+
"lstrip": false,
|
| 48 |
+
"normalized": false,
|
| 49 |
+
"rstrip": false,
|
| 50 |
+
"single_word": false,
|
| 51 |
+
"special": true
|
| 52 |
+
},
|
| 53 |
+
"151649": {
|
| 54 |
+
"content": "<|box_end|>",
|
| 55 |
+
"lstrip": false,
|
| 56 |
+
"normalized": false,
|
| 57 |
+
"rstrip": false,
|
| 58 |
+
"single_word": false,
|
| 59 |
+
"special": true
|
| 60 |
+
},
|
| 61 |
+
"151650": {
|
| 62 |
+
"content": "<|quad_start|>",
|
| 63 |
+
"lstrip": false,
|
| 64 |
+
"normalized": false,
|
| 65 |
+
"rstrip": false,
|
| 66 |
+
"single_word": false,
|
| 67 |
+
"special": true
|
| 68 |
+
},
|
| 69 |
+
"151651": {
|
| 70 |
+
"content": "<|quad_end|>",
|
| 71 |
+
"lstrip": false,
|
| 72 |
+
"normalized": false,
|
| 73 |
+
"rstrip": false,
|
| 74 |
+
"single_word": false,
|
| 75 |
+
"special": true
|
| 76 |
+
},
|
| 77 |
+
"151652": {
|
| 78 |
+
"content": "<|vision_start|>",
|
| 79 |
+
"lstrip": false,
|
| 80 |
+
"normalized": false,
|
| 81 |
+
"rstrip": false,
|
| 82 |
+
"single_word": false,
|
| 83 |
+
"special": true
|
| 84 |
+
},
|
| 85 |
+
"151653": {
|
| 86 |
+
"content": "<|vision_end|>",
|
| 87 |
+
"lstrip": false,
|
| 88 |
+
"normalized": false,
|
| 89 |
+
"rstrip": false,
|
| 90 |
+
"single_word": false,
|
| 91 |
+
"special": true
|
| 92 |
+
},
|
| 93 |
+
"151654": {
|
| 94 |
+
"content": "<|vision_pad|>",
|
| 95 |
+
"lstrip": false,
|
| 96 |
+
"normalized": false,
|
| 97 |
+
"rstrip": false,
|
| 98 |
+
"single_word": false,
|
| 99 |
+
"special": true
|
| 100 |
+
},
|
| 101 |
+
"151655": {
|
| 102 |
+
"content": "<|image_pad|>",
|
| 103 |
+
"lstrip": false,
|
| 104 |
+
"normalized": false,
|
| 105 |
+
"rstrip": false,
|
| 106 |
+
"single_word": false,
|
| 107 |
+
"special": true
|
| 108 |
+
},
|
| 109 |
+
"151656": {
|
| 110 |
+
"content": "<|video_pad|>",
|
| 111 |
+
"lstrip": false,
|
| 112 |
+
"normalized": false,
|
| 113 |
+
"rstrip": false,
|
| 114 |
+
"single_word": false,
|
| 115 |
+
"special": true
|
| 116 |
+
},
|
| 117 |
+
"151657": {
|
| 118 |
+
"content": "<tool_call>",
|
| 119 |
+
"lstrip": false,
|
| 120 |
+
"normalized": false,
|
| 121 |
+
"rstrip": false,
|
| 122 |
+
"single_word": false,
|
| 123 |
+
"special": false
|
| 124 |
+
},
|
| 125 |
+
"151658": {
|
| 126 |
+
"content": "</tool_call>",
|
| 127 |
+
"lstrip": false,
|
| 128 |
+
"normalized": false,
|
| 129 |
+
"rstrip": false,
|
| 130 |
+
"single_word": false,
|
| 131 |
+
"special": false
|
| 132 |
+
},
|
| 133 |
+
"151659": {
|
| 134 |
+
"content": "<|fim_prefix|>",
|
| 135 |
+
"lstrip": false,
|
| 136 |
+
"normalized": false,
|
| 137 |
+
"rstrip": false,
|
| 138 |
+
"single_word": false,
|
| 139 |
+
"special": false
|
| 140 |
+
},
|
| 141 |
+
"151660": {
|
| 142 |
+
"content": "<|fim_middle|>",
|
| 143 |
+
"lstrip": false,
|
| 144 |
+
"normalized": false,
|
| 145 |
+
"rstrip": false,
|
| 146 |
+
"single_word": false,
|
| 147 |
+
"special": false
|
| 148 |
+
},
|
| 149 |
+
"151661": {
|
| 150 |
+
"content": "<|fim_suffix|>",
|
| 151 |
+
"lstrip": false,
|
| 152 |
+
"normalized": false,
|
| 153 |
+
"rstrip": false,
|
| 154 |
+
"single_word": false,
|
| 155 |
+
"special": false
|
| 156 |
+
},
|
| 157 |
+
"151662": {
|
| 158 |
+
"content": "<|fim_pad|>",
|
| 159 |
+
"lstrip": false,
|
| 160 |
+
"normalized": false,
|
| 161 |
+
"rstrip": false,
|
| 162 |
+
"single_word": false,
|
| 163 |
+
"special": false
|
| 164 |
+
},
|
| 165 |
+
"151663": {
|
| 166 |
+
"content": "<|repo_name|>",
|
| 167 |
+
"lstrip": false,
|
| 168 |
+
"normalized": false,
|
| 169 |
+
"rstrip": false,
|
| 170 |
+
"single_word": false,
|
| 171 |
+
"special": false
|
| 172 |
+
},
|
| 173 |
+
"151664": {
|
| 174 |
+
"content": "<|file_sep|>",
|
| 175 |
+
"lstrip": false,
|
| 176 |
+
"normalized": false,
|
| 177 |
+
"rstrip": false,
|
| 178 |
+
"single_word": false,
|
| 179 |
+
"special": false
|
| 180 |
+
},
|
| 181 |
+
"151665": {
|
| 182 |
+
"content": "<tool_response>",
|
| 183 |
+
"lstrip": false,
|
| 184 |
+
"normalized": false,
|
| 185 |
+
"rstrip": false,
|
| 186 |
+
"single_word": false,
|
| 187 |
+
"special": false
|
| 188 |
+
},
|
| 189 |
+
"151666": {
|
| 190 |
+
"content": "</tool_response>",
|
| 191 |
+
"lstrip": false,
|
| 192 |
+
"normalized": false,
|
| 193 |
+
"rstrip": false,
|
| 194 |
+
"single_word": false,
|
| 195 |
+
"special": false
|
| 196 |
+
},
|
| 197 |
+
"151667": {
|
| 198 |
+
"content": "<think>",
|
| 199 |
+
"lstrip": false,
|
| 200 |
+
"normalized": false,
|
| 201 |
+
"rstrip": false,
|
| 202 |
+
"single_word": false,
|
| 203 |
+
"special": false
|
| 204 |
+
},
|
| 205 |
+
"151668": {
|
| 206 |
+
"content": "</think>",
|
| 207 |
+
"lstrip": false,
|
| 208 |
+
"normalized": false,
|
| 209 |
+
"rstrip": false,
|
| 210 |
+
"single_word": false,
|
| 211 |
+
"special": false
|
| 212 |
+
}
|
| 213 |
+
},
|
| 214 |
+
"additional_special_tokens": [
|
| 215 |
+
"<|im_start|>",
|
| 216 |
+
"<|im_end|>",
|
| 217 |
+
"<|object_ref_start|>",
|
| 218 |
+
"<|object_ref_end|>",
|
| 219 |
+
"<|box_start|>",
|
| 220 |
+
"<|box_end|>",
|
| 221 |
+
"<|quad_start|>",
|
| 222 |
+
"<|quad_end|>",
|
| 223 |
+
"<|vision_start|>",
|
| 224 |
+
"<|vision_end|>",
|
| 225 |
+
"<|vision_pad|>",
|
| 226 |
+
"<|image_pad|>",
|
| 227 |
+
"<|video_pad|>"
|
| 228 |
+
],
|
| 229 |
+
"bos_token": null,
|
| 230 |
+
"chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].role == 'system' %}\n {%- if messages[0].content is string %}\n {{- messages[0].content }}\n {%- else %}\n {%- for content in messages[0].content %}\n {%- if 'text' in content %}\n {{- content.text }}\n {%- endif %}\n {%- endfor %}\n {%- endif %}\n {{- '\\n\\n' }}\n {%- endif %}\n {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0].role == 'system' %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].content is string %}\n {{- messages[0].content }}\n {%- else %}\n {%- for content in messages[0].content %}\n {%- if 'text' in content %}\n {{- content.text }}\n {%- endif %}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- set image_count = namespace(value=0) %}\n{%- set video_count = namespace(value=0) %}\n{%- for message in messages %}\n {%- if message.role == \"user\" %}\n {{- '<|im_start|>' + message.role + '\\n' }}\n {%- if message.content is string %}\n {{- message.content }}\n {%- else %}\n {%- for content in message.content %}\n {%- if content.type == 'image' or 'image' in content or 'image_url' in content %}\n {%- set image_count.value = image_count.value + 1 %}\n {%- if add_vision_id %}Picture {{ image_count.value }}: {% endif -%}\n <|vision_start|><|image_pad|><|vision_end|>\n {%- elif content.type == 'video' or 'video' in content %}\n {%- set video_count.value = video_count.value + 1 %}\n {%- if add_vision_id %}Video {{ 
video_count.value }}: {% endif -%}\n <|vision_start|><|video_pad|><|vision_end|>\n {%- elif 'text' in content %}\n {{- content.text }}\n {%- endif %}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role + '\\n' }}\n {%- if message.content is string %}\n {{- message.content }}\n {%- else %}\n {%- for content_item in message.content %}\n {%- if 'text' in content_item %}\n {{- content_item.text }}\n {%- endif %}\n {%- endfor %}\n {%- endif %}\n {%- if message.tool_calls %}\n {%- for tool_call in message.tool_calls %}\n {%- if (loop.first and message.content) or (not loop.first) %}\n {{- '\\n' }}\n {%- endif %}\n {%- if tool_call.function %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {%- if tool_call.arguments is string %}\n {{- tool_call.arguments }}\n {%- else %}\n {{- tool_call.arguments | tojson }}\n {%- endif %}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {%- if message.content is string %}\n {{- message.content }}\n {%- else %}\n {%- for content in message.content %}\n {%- if content.type == 'image' or 'image' in content or 'image_url' in content %}\n {%- set image_count.value = image_count.value + 1 %}\n {%- if add_vision_id %}Picture {{ image_count.value }}: {% endif -%}\n <|vision_start|><|image_pad|><|vision_end|>\n {%- elif content.type == 'video' or 'video' in content %}\n {%- set video_count.value = video_count.value + 1 %}\n {%- if add_vision_id %}Video {{ video_count.value }}: {% endif -%}\n <|vision_start|><|video_pad|><|vision_end|>\n {%- elif 'text' in content %}\n {{- content.text }}\n {%- endif %}\n {%- endfor %}\n {%- endif %}\n {{- 
'\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
|
| 231 |
+
"clean_up_tokenization_spaces": false,
|
| 232 |
+
"eos_token": "<|im_end|>",
|
| 233 |
+
"errors": "replace",
|
| 234 |
+
"model_max_length": 262144,
|
| 235 |
+
"pad_token": "<|endoftext|>",
|
| 236 |
+
"split_special_tokens": false,
|
| 237 |
+
"tokenizer_class": "Qwen2Tokenizer",
|
| 238 |
+
"unk_token": null
|
| 239 |
+
}
|
ICL/sft_model/epoch3_step1406_fp32/video_preprocessor_config.json
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"size": {
|
| 3 |
+
"longest_edge": 25165824,
|
| 4 |
+
"shortest_edge": 4096
|
| 5 |
+
},
|
| 6 |
+
"patch_size": 16,
|
| 7 |
+
"temporal_patch_size": 2,
|
| 8 |
+
"merge_size": 2,
|
| 9 |
+
"image_mean": [
|
| 10 |
+
0.5,
|
| 11 |
+
0.5,
|
| 12 |
+
0.5
|
| 13 |
+
],
|
| 14 |
+
"image_std": [
|
| 15 |
+
0.5,
|
| 16 |
+
0.5,
|
| 17 |
+
0.5
|
| 18 |
+
],
|
| 19 |
+
"processor_class": "Qwen3VLProcessor",
|
| 20 |
+
"video_processor_type": "Qwen3VLVideoProcessor"
|
| 21 |
+
}
|
ICL/sft_model/epoch3_step1406_fp32/vocab.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
ICL/sft_model/zero_to_fp32.py
ADDED
|
@@ -0,0 +1,760 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python
|
| 2 |
+
|
| 3 |
+
# Copyright (c) Microsoft Corporation.
|
| 4 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 5 |
+
|
| 6 |
+
# DeepSpeed Team
|
| 7 |
+
|
| 8 |
+
# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets
|
| 9 |
+
# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
|
| 10 |
+
# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
|
| 11 |
+
# application.
|
| 12 |
+
#
|
| 13 |
+
# example:
|
| 14 |
+
# python zero_to_fp32.py . output_dir/
|
| 15 |
+
# or
|
| 16 |
+
# python zero_to_fp32.py . output_dir/ --safe_serialization
|
| 17 |
+
|
| 18 |
+
import argparse
|
| 19 |
+
import torch
|
| 20 |
+
import glob
|
| 21 |
+
import math
|
| 22 |
+
import os
|
| 23 |
+
import re
|
| 24 |
+
import gc
|
| 25 |
+
import json
|
| 26 |
+
import numpy as np
|
| 27 |
+
from tqdm import tqdm
|
| 28 |
+
from collections import OrderedDict
|
| 29 |
+
from dataclasses import dataclass
|
| 30 |
+
|
| 31 |
+
# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
|
| 32 |
+
# DeepSpeed data structures it has to be available in the current python environment.
|
| 33 |
+
from deepspeed.utils import logger
|
| 34 |
+
from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
|
| 35 |
+
FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
|
| 36 |
+
FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
@dataclass
|
| 40 |
+
class zero_model_state:
|
| 41 |
+
buffers: dict()
|
| 42 |
+
param_shapes: dict()
|
| 43 |
+
shared_params: list
|
| 44 |
+
ds_version: int
|
| 45 |
+
frozen_param_shapes: dict()
|
| 46 |
+
frozen_param_fragments: dict()
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
debug = 0
|
| 50 |
+
|
| 51 |
+
# load to cpu
|
| 52 |
+
device = torch.device('cpu')
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def atoi(text):
|
| 56 |
+
return int(text) if text.isdigit() else text
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
def natural_keys(text):
|
| 60 |
+
'''
|
| 61 |
+
alist.sort(key=natural_keys) sorts in human order
|
| 62 |
+
http://nedbatchelder.com/blog/200712/human_sorting.html
|
| 63 |
+
(See Toothy's implementation in the comments)
|
| 64 |
+
'''
|
| 65 |
+
return [atoi(c) for c in re.split(r'(\d+)', text)]
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
def get_model_state_file(checkpoint_dir, zero_stage):
|
| 69 |
+
if not os.path.isdir(checkpoint_dir):
|
| 70 |
+
raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
|
| 71 |
+
|
| 72 |
+
# there should be only one file
|
| 73 |
+
if zero_stage <= 2:
|
| 74 |
+
file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
|
| 75 |
+
elif zero_stage == 3:
|
| 76 |
+
file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
|
| 77 |
+
|
| 78 |
+
if not os.path.exists(file):
|
| 79 |
+
raise FileNotFoundError(f"can't find model states file at '{file}'")
|
| 80 |
+
|
| 81 |
+
return file
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
def get_checkpoint_files(checkpoint_dir, glob_pattern):
|
| 85 |
+
# XXX: need to test that this simple glob rule works for multi-node setup too
|
| 86 |
+
ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
|
| 87 |
+
|
| 88 |
+
if len(ckpt_files) == 0:
|
| 89 |
+
raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
|
| 90 |
+
|
| 91 |
+
return ckpt_files
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
def get_optim_files(checkpoint_dir):
|
| 95 |
+
return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def get_model_state_files(checkpoint_dir):
|
| 99 |
+
return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
def parse_model_states(files):
|
| 103 |
+
zero_model_states = []
|
| 104 |
+
for file in files:
|
| 105 |
+
state_dict = torch.load(file, map_location=device, weights_only=False)
|
| 106 |
+
|
| 107 |
+
if BUFFER_NAMES not in state_dict:
|
| 108 |
+
raise ValueError(f"{file} is not a model state checkpoint")
|
| 109 |
+
buffer_names = state_dict[BUFFER_NAMES]
|
| 110 |
+
if debug:
|
| 111 |
+
print("Found buffers:", buffer_names)
|
| 112 |
+
|
| 113 |
+
# recover just the buffers while restoring them to fp32 if they were saved in fp16
|
| 114 |
+
buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
|
| 115 |
+
param_shapes = state_dict[PARAM_SHAPES]
|
| 116 |
+
|
| 117 |
+
# collect parameters that are included in param_shapes
|
| 118 |
+
param_names = []
|
| 119 |
+
for s in param_shapes:
|
| 120 |
+
for name in s.keys():
|
| 121 |
+
param_names.append(name)
|
| 122 |
+
|
| 123 |
+
# update with frozen parameters
|
| 124 |
+
frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
|
| 125 |
+
if frozen_param_shapes is not None:
|
| 126 |
+
if debug:
|
| 127 |
+
print(f"Found frozen_param_shapes: {frozen_param_shapes}")
|
| 128 |
+
param_names += list(frozen_param_shapes.keys())
|
| 129 |
+
|
| 130 |
+
# handle shared params
|
| 131 |
+
shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
|
| 132 |
+
|
| 133 |
+
ds_version = state_dict.get(DS_VERSION, None)
|
| 134 |
+
|
| 135 |
+
frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
|
| 136 |
+
|
| 137 |
+
z_model_state = zero_model_state(buffers=buffers,
|
| 138 |
+
param_shapes=param_shapes,
|
| 139 |
+
shared_params=shared_params,
|
| 140 |
+
ds_version=ds_version,
|
| 141 |
+
frozen_param_shapes=frozen_param_shapes,
|
| 142 |
+
frozen_param_fragments=frozen_param_fragments)
|
| 143 |
+
zero_model_states.append(z_model_state)
|
| 144 |
+
|
| 145 |
+
return zero_model_states
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
def parse_optim_states(files, ds_checkpoint_dir):
|
| 149 |
+
total_files = len(files)
|
| 150 |
+
state_dicts = []
|
| 151 |
+
for f in tqdm(files, desc='Loading checkpoint shards'):
|
| 152 |
+
state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False)
|
| 153 |
+
# immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
|
| 154 |
+
# and also handle the case where it was already removed by another helper script
|
| 155 |
+
state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
|
| 156 |
+
state_dicts.append(state_dict)
|
| 157 |
+
|
| 158 |
+
if ZERO_STAGE not in state_dicts[0][OPTIMIZER_STATE_DICT]:
|
| 159 |
+
raise ValueError(f"{files[0]} is not a zero checkpoint")
|
| 160 |
+
zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
|
| 161 |
+
world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
|
| 162 |
+
|
| 163 |
+
# For ZeRO-2 each param group can have different partition_count as data parallelism for expert
|
| 164 |
+
# parameters can be different from data parallelism for non-expert parameters. So we can just
|
| 165 |
+
# use the max of the partition_count to get the dp world_size.
|
| 166 |
+
|
| 167 |
+
if type(world_size) is list:
|
| 168 |
+
world_size = max(world_size)
|
| 169 |
+
|
| 170 |
+
if world_size != total_files:
|
| 171 |
+
raise ValueError(
|
| 172 |
+
f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
|
| 173 |
+
"Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
|
| 174 |
+
)
|
| 175 |
+
|
| 176 |
+
# the groups are named differently in each stage
|
| 177 |
+
if zero_stage <= 2:
|
| 178 |
+
fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
|
| 179 |
+
elif zero_stage == 3:
|
| 180 |
+
fp32_groups_key = FP32_FLAT_GROUPS
|
| 181 |
+
else:
|
| 182 |
+
raise ValueError(f"unknown zero stage {zero_stage}")
|
| 183 |
+
|
| 184 |
+
fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
|
| 185 |
+
return zero_stage, world_size, fp32_flat_groups
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
|
| 189 |
+
"""
|
| 190 |
+
Returns fp32 state_dict reconstructed from ds checkpoint
|
| 191 |
+
|
| 192 |
+
Args:
|
| 193 |
+
- ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
|
| 194 |
+
|
| 195 |
+
"""
|
| 196 |
+
print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
|
| 197 |
+
|
| 198 |
+
optim_files = get_optim_files(ds_checkpoint_dir)
|
| 199 |
+
zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
|
| 200 |
+
print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
|
| 201 |
+
|
| 202 |
+
model_files = get_model_state_files(ds_checkpoint_dir)
|
| 203 |
+
|
| 204 |
+
zero_model_states = parse_model_states(model_files)
|
| 205 |
+
print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
|
| 206 |
+
|
| 207 |
+
if zero_stage <= 2:
|
| 208 |
+
return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
|
| 209 |
+
exclude_frozen_parameters)
|
| 210 |
+
elif zero_stage == 3:
|
| 211 |
+
return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
|
| 212 |
+
exclude_frozen_parameters)
|
| 213 |
+
|
| 214 |
+
|
| 215 |
+
def _zero2_merge_frozen_params(state_dict, zero_model_states):
|
| 216 |
+
if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
|
| 217 |
+
return
|
| 218 |
+
|
| 219 |
+
frozen_param_shapes = zero_model_states[0].frozen_param_shapes
|
| 220 |
+
frozen_param_fragments = zero_model_states[0].frozen_param_fragments
|
| 221 |
+
|
| 222 |
+
if debug:
|
| 223 |
+
num_elem = sum(s.numel() for s in frozen_param_shapes.values())
|
| 224 |
+
print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
|
| 225 |
+
|
| 226 |
+
wanted_params = len(frozen_param_shapes)
|
| 227 |
+
wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
|
| 228 |
+
avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
|
| 229 |
+
print(f'Frozen params: Have {avail_numel} numels to process.')
|
| 230 |
+
print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
|
| 231 |
+
|
| 232 |
+
total_params = 0
|
| 233 |
+
total_numel = 0
|
| 234 |
+
for name, shape in frozen_param_shapes.items():
|
| 235 |
+
total_params += 1
|
| 236 |
+
unpartitioned_numel = shape.numel()
|
| 237 |
+
total_numel += unpartitioned_numel
|
| 238 |
+
|
| 239 |
+
state_dict[name] = frozen_param_fragments[name]
|
| 240 |
+
|
| 241 |
+
if debug:
|
| 242 |
+
print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
|
| 243 |
+
|
| 244 |
+
print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
|
| 245 |
+
|
| 246 |
+
|
| 247 |
+
def _has_callable(obj, fn):
|
| 248 |
+
attr = getattr(obj, fn, None)
|
| 249 |
+
return callable(attr)
|
| 250 |
+
|
| 251 |
+
|
| 252 |
+
def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
|
| 253 |
+
param_shapes = zero_model_states[0].param_shapes
|
| 254 |
+
|
| 255 |
+
# Reconstruction protocol:
|
| 256 |
+
#
|
| 257 |
+
# XXX: document this
|
| 258 |
+
|
| 259 |
+
if debug:
|
| 260 |
+
for i in range(world_size):
|
| 261 |
+
for j in range(len(fp32_flat_groups[0])):
|
| 262 |
+
print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
|
| 263 |
+
|
| 264 |
+
# XXX: memory usage doubles here (zero2)
|
| 265 |
+
num_param_groups = len(fp32_flat_groups[0])
|
| 266 |
+
merged_single_partition_of_fp32_groups = []
|
| 267 |
+
for i in range(num_param_groups):
|
| 268 |
+
merged_partitions = [sd[i] for sd in fp32_flat_groups]
|
| 269 |
+
full_single_fp32_vector = torch.cat(merged_partitions, 0)
|
| 270 |
+
merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
|
| 271 |
+
avail_numel = sum(
|
| 272 |
+
[full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
|
| 273 |
+
|
| 274 |
+
if debug:
|
| 275 |
+
wanted_params = sum([len(shapes) for shapes in param_shapes])
|
| 276 |
+
wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
|
| 277 |
+
# not asserting if there is a mismatch due to possible padding
|
| 278 |
+
print(f"Have {avail_numel} numels to process.")
|
| 279 |
+
print(f"Need {wanted_numel} numels in {wanted_params} params.")
|
| 280 |
+
|
| 281 |
+
# params
|
| 282 |
+
# XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
|
| 283 |
+
# out-of-core computing solution
|
| 284 |
+
total_numel = 0
|
| 285 |
+
total_params = 0
|
| 286 |
+
for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
|
| 287 |
+
offset = 0
|
| 288 |
+
avail_numel = full_single_fp32_vector.numel()
|
| 289 |
+
for name, shape in shapes.items():
|
| 290 |
+
|
| 291 |
+
unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
|
| 292 |
+
total_numel += unpartitioned_numel
|
| 293 |
+
total_params += 1
|
| 294 |
+
|
| 295 |
+
if debug:
|
| 296 |
+
print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
|
| 297 |
+
state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
|
| 298 |
+
offset += unpartitioned_numel
|
| 299 |
+
|
| 300 |
+
# Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
|
| 301 |
+
# avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
|
| 302 |
+
# paddings performed in the code it's almost impossible to predict the exact numbers w/o the
|
| 303 |
+
# live optimizer object, so we are checking that the numbers are within the right range
|
| 304 |
+
align_to = 2 * world_size
|
| 305 |
+
|
| 306 |
+
def zero2_align(x):
|
| 307 |
+
return align_to * math.ceil(x / align_to)
|
| 308 |
+
|
| 309 |
+
if debug:
|
| 310 |
+
print(f"original offset={offset}, avail_numel={avail_numel}")
|
| 311 |
+
|
| 312 |
+
offset = zero2_align(offset)
|
| 313 |
+
avail_numel = zero2_align(avail_numel)
|
| 314 |
+
|
| 315 |
+
if debug:
|
| 316 |
+
print(f"aligned offset={offset}, avail_numel={avail_numel}")
|
| 317 |
+
|
| 318 |
+
# Sanity check
|
| 319 |
+
if offset != avail_numel:
|
| 320 |
+
raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
|
| 321 |
+
|
| 322 |
+
print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
|
| 323 |
+
|
| 324 |
+
|
| 325 |
+
def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
                                               exclude_frozen_parameters):
    """Rebuild a consolidated fp32 state_dict from ZeRO-2 shards.

    Order of assembly: model buffers first, then frozen parameters (unless
    excluded), then trainable parameters, and finally re-link parameters
    that share storage.
    """
    rank0_state = zero_model_states[0]
    state_dict = OrderedDict()

    # buffers come straight from rank 0's model state
    state_dict.update(rank0_state.buffers)
    if debug:
        print(f"added {len(rank0_state.buffers)} buffers")

    if not exclude_frozen_parameters:
        _zero2_merge_frozen_params(state_dict, zero_model_states)

    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)

    # Shared parameters were stored only once; alias each duplicate name back
    # to the tensor of its source name.
    for alias_name, source_name in rank0_state.shared_params:
        if source_name in state_dict:
            state_dict[alias_name] = state_dict[source_name]

    return state_dict
|
| 346 |
+
|
| 347 |
+
|
| 348 |
+
def zero3_partitioned_param_info(unpartitioned_numel, world_size):
    """Describe how a param of ``unpartitioned_numel`` elements shards across ranks.

    Args:
        unpartitioned_numel: total element count of the full parameter.
        world_size: number of ranks the parameter is partitioned over.

    Returns:
        (partitioned_numel, padding_numel): the per-rank shard size (ceil
        division) and the number of pad elements appended so the shards
        tile the parameter evenly.
    """
    remainder = unpartitioned_numel % world_size
    padding_numel = (world_size - remainder) if remainder else 0
    # Integer ceil-division. math.ceil(a / b) routes through float true-division
    # and can silently round wrong for element counts above 2**53; pure integer
    # arithmetic is exact for any size.
    partitioned_numel = (unpartitioned_numel + world_size - 1) // world_size
    return partitioned_numel, padding_numel
|
| 353 |
+
|
| 354 |
+
|
| 355 |
+
def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
    """Reconstruct frozen (non-trainable) fp32 params from ZeRO-3 shards.

    Each rank holds one fragment per frozen param; fragments are concatenated
    in rank order and trimmed of tail padding to recover the full tensor,
    which is written into ``state_dict`` in place.
    """
    # Nothing to do when the checkpoint carries no frozen params.
    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
        return

    if debug:
        for i in range(world_size):
            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')

    # Shapes are identical across ranks, so rank 0's copy is authoritative.
    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
    wanted_params = len(frozen_param_shapes)
    wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
    # Fragments are equal-sized across ranks, so rank 0's total times
    # world_size gives the total (padded) element count available.
    avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
    print(f'Frozen params: Have {avail_numel} numels to process.')
    print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')

    total_params = 0
    total_numel = 0
    for name, shape in zero_model_states[0].frozen_param_shapes.items():
        total_params += 1
        unpartitioned_numel = shape.numel()
        total_numel += unpartitioned_numel

        # Concatenate this param's per-rank fragments in rank order, then drop
        # the tail padding that was added so the param split evenly.
        param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
        state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)

        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)

        if debug:
            print(
                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
            )

    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
|
| 389 |
+
|
| 390 |
+
|
| 391 |
+
class GatheredTensor:
    """
    A pseudo tensor that collects partitioned weights.
    It is more memory efficient when there are multiple groups.
    """

    def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape):
        # flat_groups: per-rank sequence of flat fp32 tensors (one per param group).
        # flat_groups_offset: cumulative element offsets marking group boundaries,
        #   starting at 0 (length == number of groups + 1).
        # offset: start position of this param's shard within a rank's flat data.
        # partitioned_numel: per-rank shard size for this param.
        # shape: the full (unpartitioned) shape to restore on materialization.
        self.flat_groups = flat_groups
        self.flat_groups_offset = flat_groups_offset
        self.offset = offset
        self.partitioned_numel = partitioned_numel
        self.shape = shape
        # dtype of the first group's flat tensor on rank 0; assumes all groups
        # share one dtype.
        self.dtype = self.flat_groups[0][0].dtype

    def contiguous(self):
        """
        Merge partitioned weights from flat_groups into a single tensor.
        """
        end_idx = self.offset + self.partitioned_numel
        world_size = len(self.flat_groups)
        pad_flat_param_chunks = []

        for rank_i in range(world_size):
            # for each rank, we need to collect weights from related group/groups
            flat_groups_at_rank_i = self.flat_groups[rank_i]
            start_group_id = None
            end_group_id = None
            # Find the first and last groups overlapping [offset, end_idx);
            # a shard may straddle a group boundary.
            for group_id in range(len(self.flat_groups_offset)):
                if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]:
                    start_group_id = group_id
                if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]:
                    end_group_id = group_id
                    break
            # collect weights from related group/groups
            for group_id in range(start_group_id, end_group_id + 1):
                flat_tensor = flat_groups_at_rank_i[group_id]
                # NOTE(review): for group_ids after start_group_id this start
                # offset goes negative (Python then slices from the end) —
                # verify a shard never spans more than one group boundary.
                start_offset = self.offset - self.flat_groups_offset[group_id]
                end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id]
                pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset])

        # collect weights from all ranks
        pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0)
        # Trim the tail padding and restore the original parameter shape.
        param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous()
        return param
|
| 435 |
+
|
| 436 |
+
|
| 437 |
+
def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
    """Reconstruct trainable fp32 params from ZeRO-3 flat shards.

    Entries placed into ``state_dict`` are lazy ``GatheredTensor`` objects;
    callers materialize them with ``.contiguous()`` (see ``to_torch_tensor``).
    """
    param_shapes = zero_model_states[0].param_shapes
    # Per-rank shards are equal-sized, so rank 0's total times world_size is
    # the full (padded) element count.
    avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size

    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
    # param, re-consolidating each param, while dealing with padding if any

    # merge list of dicts, preserving order
    param_shapes = {k: v for d in param_shapes for k, v in d.items()}

    if debug:
        for i in range(world_size):
            # NOTE(review): fp32_flat_groups[i] is treated as a list of tensors
            # elsewhere in this function; `.shape` here assumes a tensor — verify.
            print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")

    wanted_params = len(param_shapes)
    wanted_numel = sum(shape.numel() for shape in param_shapes.values())
    # not asserting if there is a mismatch due to possible padding
    # NOTE(review): this overwrite calls `.numel()` directly on
    # fp32_flat_groups[0], which the cumsum below iterates as a list of
    # tensors — confirm which layout actually applies here.
    avail_numel = fp32_flat_groups[0].numel() * world_size
    print(f"Trainable params: Have {avail_numel} numels to process.")
    print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")

    # params
    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
    # out-of-core computing solution
    offset = 0
    total_numel = 0
    total_params = 0
    # Cumulative element offsets marking where each param group starts in a
    # rank's flat data (shared by every GatheredTensor created below).
    flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]]))
    for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'):
        unpartitioned_numel = shape.numel()
        total_numel += unpartitioned_numel
        total_params += 1
        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)

        if debug:
            print(
                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
            )

        # memory efficient tensor: the actual gather/copy is deferred until
        # the caller materializes it via .contiguous()
        tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape)
        state_dict[name] = tensor
        offset += partitioned_numel

    # offset advanced by per-rank shard sizes; scale by world_size to compare
    # against the global element count.
    offset *= world_size

    # Sanity check
    if offset != avail_numel:
        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")

    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
|
| 488 |
+
|
| 489 |
+
|
| 490 |
+
def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
                                               exclude_frozen_parameters):
    """Rebuild a consolidated fp32 state_dict from ZeRO-3 shards.

    Assembly order: model buffers, then frozen params (unless excluded),
    then trainable params, and finally re-link shared parameters.
    """
    rank0_state = zero_model_states[0]
    state_dict = OrderedDict()

    # buffers are taken verbatim from rank 0
    state_dict.update(rank0_state.buffers)
    if debug:
        print(f"added {len(rank0_state.buffers)} buffers")

    if not exclude_frozen_parameters:
        _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)

    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)

    # Shared parameters are stored once; point each alias at its source tensor.
    for alias_name, source_name in rank0_state.shared_params:
        if source_name in state_dict:
            state_dict[alias_name] = state_dict[source_name]

    return state_dict
|
| 511 |
+
|
| 512 |
+
|
| 513 |
+
def to_torch_tensor(state_dict, return_empty_tensor=False):
    """
    Convert state_dict of GatheredTensor to torch tensor
    """
    result = {}
    # Maps id(source tensor) -> name of its first materialization, so tensors
    # that appear under several names stay shared after conversion.
    first_seen = {}
    for name, tensor in state_dict.items():
        prior_name = first_seen.get(id(tensor))
        if prior_name is not None:
            # shared tensors: alias the already-materialized entry
            result[name] = result[prior_name]
            continue
        first_seen[id(tensor)] = name
        if return_empty_tensor:
            # metadata-only placeholder with matching shape/dtype
            result[name] = torch.empty(tensor.shape, dtype=tensor.dtype)
        else:
            result[name] = tensor.contiguous()
    return result
|
| 531 |
+
|
| 532 |
+
|
| 533 |
+
def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
                                             tag=None,
                                             exclude_frozen_parameters=False,
                                             lazy_mode=False):
    """
    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
    via a model hub.

    Args:
        - ``checkpoint_dir``: path to the desired checkpoint folder
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
        - ``exclude_frozen_parameters``: exclude frozen parameters
        - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pseudo tensors instead of torch tensors, which is more memory efficient.
          Convert the pseudo tensor to torch tensor by ``.contiguous()``

    Returns:
        - pytorch ``state_dict``

    A typical usage might be ::

        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
        # do the training and checkpoint saving
        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
        model = model.cpu() # move to cpu
        model.load_state_dict(state_dict)
        # submit to model hub or save the model to share with others

    In this example the ``model`` will no longer be usable in the deepspeed context of the same
    application. i.e. you will need to re-initialize the deepspeed engine, since
    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.

    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.

    Note: the above usage may not work if your application doesn't have sufficient free CPU memory.
    You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
    the checkpoint. Or you can load state_dict in lazy mode ::

        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu
        for name, lazy_tensor in state_dict.items():
            tensor = lazy_tensor.contiguous() # to cpu
            print(name, tensor)
            # del tensor to release memory if it no longer in use
    """
    # Resolve the checkpoint tag from the 'latest' marker file when not given explicitly.
    if tag is None:
        latest_path = os.path.join(checkpoint_dir, 'latest')
        if os.path.isfile(latest_path):
            with open(latest_path, 'r') as fd:
                tag = fd.read().strip()
        else:
            raise ValueError(f"Unable to find 'latest' file at {latest_path}")

    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)

    if not os.path.isdir(ds_checkpoint_dir):
        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")

    state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
    if lazy_mode:
        # Entries are GatheredTensor placeholders; caller materializes via .contiguous().
        return state_dict
    else:
        return to_torch_tensor(state_dict)
|
| 596 |
+
|
| 597 |
+
|
| 598 |
+
def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
                                               output_dir,
                                               max_shard_size="5GB",
                                               safe_serialization=False,
                                               tag=None,
                                               exclude_frozen_parameters=False):
    """
    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
    loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.

    Args:
        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
        - ``output_dir``: directory to the pytorch fp32 state_dict output files
        - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB
        - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
        - ``exclude_frozen_parameters``: exclude frozen parameters
    """

    # Dependency pre-check: fail early with a helpful hint before doing any heavy work.
    if safe_serialization:
        try:
            from safetensors.torch import save_file
        except ImportError:
            print('If you want to use `safe_serialization`, please `pip install safetensors`')
            raise
    if max_shard_size is not None:
        try:
            from huggingface_hub import split_torch_state_dict_into_shards
        except ImportError:
            print('If you want to use `max_shard_size`, please `pip install huggingface_hub`')
            raise

    # Convert zero checkpoint to state_dict (lazy: values are pseudo tensors
    # that are only materialized shard-by-shard below).
    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
                                                          tag,
                                                          exclude_frozen_parameters,
                                                          lazy_mode=True)

    # Shard the model if it is too big.
    weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin"
    if max_shard_size is not None:
        filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
        # A memory-efficient approach for sharding: plan the shard layout with
        # empty (metadata-only) tensors so no weights are materialized yet.
        empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True)
        state_dict_split = split_torch_state_dict_into_shards(empty_state_dict,
                                                              filename_pattern=filename_pattern,
                                                              max_shard_size=max_shard_size)
    else:
        # No sharding requested: emulate the split result with a single shard.
        from collections import namedtuple
        StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"])
        state_dict_split = StateDictSplit(is_sharded=False,
                                          filename_to_tensors={weights_name: list(state_dict.keys())})

    # Save the model by shard
    os.makedirs(output_dir, exist_ok=True)
    filename_to_tensors = state_dict_split.filename_to_tensors.items()
    for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"):
        shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors}
        # Materialize only this shard's tensors on CPU.
        shard_state_dict = to_torch_tensor(shard_state_dict)
        output_path = os.path.join(output_dir, shard_file)
        if safe_serialization:
            save_file(shard_state_dict, output_path, metadata={"format": "pt"})
        else:
            torch.save(shard_state_dict, output_path)
        # release the memory of current shard
        for tensor_name in list(shard_state_dict.keys()):
            del state_dict[tensor_name]
            del shard_state_dict[tensor_name]
        del shard_state_dict
        gc.collect()

    # Save index if sharded
    if state_dict_split.is_sharded:
        index = {
            "metadata": state_dict_split.metadata,
            "weight_map": state_dict_split.tensor_to_filename,
        }
        save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json"
        save_index_file = os.path.join(output_dir, save_index_file)
        with open(save_index_file, "w", encoding="utf-8") as f:
            content = json.dumps(index, indent=2, sort_keys=True) + "\n"
            f.write(content)
|
| 681 |
+
|
| 682 |
+
|
| 683 |
+
def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
    """
    1. Put the provided model to cpu
    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
    3. Load it into the provided model

    Args:
        - ``model``: the model object to update
        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``

    Returns:
        - ``model``: modified model

    Make sure you have plenty of CPU memory available before you call this function. If you don't
    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
    conveniently placed for you in the checkpoint folder.

    A typical usage might be ::

        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
        # submit to model hub or save the model to share with others

    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.

    """
    logger.info("Extracting fp32 weights")
    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)

    logger.info("Overwriting model with fp32 weights")
    model = model.cpu()
    # NOTE(review): strict=False tolerates missing/unexpected keys — presumably
    # to cope with excluded frozen params; confirm against callers.
    model.load_state_dict(state_dict, strict=False)

    return model
|
| 720 |
+
|
| 721 |
+
|
| 722 |
+
if __name__ == "__main__":
    # Command-line entry point: consolidate a ZeRO checkpoint into fp32 files.
    parser = argparse.ArgumentParser()
    parser.add_argument("checkpoint_dir",
                        type=str,
                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
    parser.add_argument("output_dir",
                        type=str,
                        # Implicit concatenation previously produced "filese.g." with
                        # no separating space; fixed by adding the trailing space.
                        help="directory to the pytorch fp32 state_dict output files "
                        "(e.g. path/checkpoint-12-output/)")
    parser.add_argument(
        "--max_shard_size",
        type=str,
        default="5GB",
        # Fixed missing spaces between the implicitly-concatenated fragments and
        # the unclosed parenthesis after `5MB`.
        help="The maximum size for a checkpoint before being sharded. Checkpoint shards will then each be of a size "
        "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`). "
        "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances "
        "without CPU OOM issues.")
    parser.add_argument(
        "--safe_serialization",
        default=False,
        action='store_true',
        help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).")
    parser.add_argument("-t",
                        "--tag",
                        type=str,
                        default=None,
                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
    args = parser.parse_args()

    # `debug` is read as a module-level flag by the reconstruction helpers above.
    debug = args.debug

    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
                                               args.output_dir,
                                               max_shard_size=args.max_shard_size,
                                               safe_serialization=args.safe_serialization,
                                               tag=args.tag,
                                               exclude_frozen_parameters=args.exclude_frozen_parameters)
|
RL_dataset/.gitattributes
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bin.* filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.mat filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.hdf5 filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zstandard filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*.tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
*.db* filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
*.ark* filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
**/*ckpt*data* filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
**/*ckpt*.meta filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
**/*ckpt*.index filter=lfs diff=lfs merge=lfs -text
|
| 41 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 42 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 43 |
+
*.jpg filter=lfs diff=lfs merge=lfs -text
|
| 44 |
+
*.png filter=lfs diff=lfs merge=lfs -text
|
| 45 |
+
*.jpeg filter=lfs diff=lfs merge=lfs -text
|
| 46 |
+
*.bmp filter=lfs diff=lfs merge=lfs -text
|
| 47 |
+
*.gif filter=lfs diff=lfs merge=lfs -text
|
| 48 |
+
*.webp filter=lfs diff=lfs merge=lfs -text
|
| 49 |
+
*.mp3 filter=lfs diff=lfs merge=lfs -text
|
| 50 |
+
*.wav filter=lfs diff=lfs merge=lfs -text
|
| 51 |
+
*.wma filter=lfs diff=lfs merge=lfs -text
|
| 52 |
+
*.aac filter=lfs diff=lfs merge=lfs -text
|
| 53 |
+
*.ogg filter=lfs diff=lfs merge=lfs -text
|
| 54 |
+
*.m4a filter=lfs diff=lfs merge=lfs -text
|
| 55 |
+
*.m3u8 filter=lfs diff=lfs merge=lfs -text
|
| 56 |
+
*.amr filter=lfs diff=lfs merge=lfs -text
|
| 57 |
+
*.audio filter=lfs diff=lfs merge=lfs -text
|
| 58 |
+
*.avi filter=lfs diff=lfs merge=lfs -text
|
| 59 |
+
*.flv filter=lfs diff=lfs merge=lfs -text
|
| 60 |
+
*.mp4 filter=lfs diff=lfs merge=lfs -text
|
| 61 |
+
*.mpg filter=lfs diff=lfs merge=lfs -text
|
| 62 |
+
*.asf filter=lfs diff=lfs merge=lfs -text
|
| 63 |
+
*.mov filter=lfs diff=lfs merge=lfs -text
|
| 64 |
+
*.mpeg filter=lfs diff=lfs merge=lfs -text
|
| 65 |
+
*.3gp filter=lfs diff=lfs merge=lfs -text
|
| 66 |
+
*.wmv filter=lfs diff=lfs merge=lfs -text
|
| 67 |
+
*.rmvb filter=lfs diff=lfs merge=lfs -text
|
| 68 |
+
*.rm filter=lfs diff=lfs merge=lfs -text
|
| 69 |
+
*.ts filter=lfs diff=lfs merge=lfs -text
|
| 70 |
+
*.mkv filter=lfs diff=lfs merge=lfs -text
|
| 71 |
+
*.flash filter=lfs diff=lfs merge=lfs -text
|
| 72 |
+
*.vob filter=lfs diff=lfs merge=lfs -text
|
| 73 |
+
*.pdf filter=lfs diff=lfs merge=lfs -text
|
| 74 |
+
*.ost filter=lfs diff=lfs merge=lfs -text
|
| 75 |
+
*.pst filter=lfs diff=lfs merge=lfs -text
|
| 76 |
+
*.doc filter=lfs diff=lfs merge=lfs -text
|
| 77 |
+
*.docx filter=lfs diff=lfs merge=lfs -text
|
| 78 |
+
*.txt filter=lfs diff=lfs merge=lfs -text
|
| 79 |
+
*.ppt filter=lfs diff=lfs merge=lfs -text
|
| 80 |
+
*.pptx filter=lfs diff=lfs merge=lfs -text
|
| 81 |
+
*.xls filter=lfs diff=lfs merge=lfs -text
|
| 82 |
+
*.xlsx filter=lfs diff=lfs merge=lfs -text
|
| 83 |
+
*.vsd filter=lfs diff=lfs merge=lfs -text
|
| 84 |
+
*.vsdx filter=lfs diff=lfs merge=lfs -text
|
| 85 |
+
*.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 86 |
+
*.json filter=lfs diff=lfs merge=lfs -text
|
| 87 |
+
dataset_infos.json ignore
|
| 88 |
+
*.csv filter=lfs diff=lfs merge=lfs -text
|
| 89 |
+
*.tsv filter=lfs diff=lfs merge=lfs -text
|
RL_dataset/.msc
ADDED
|
Binary file (546 Bytes). View file
|
|
|
RL_dataset/.mv
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
master
|
RL_dataset/INFOSEEK_DOWNLOAD.md
ADDED
|
@@ -0,0 +1,337 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# InfoSeek Data Download
|
| 2 |
+
|
| 3 |
+
This document collects ready-to-run scripts for downloading the InfoSeek dataset into:
|
| 4 |
+
|
| 5 |
+
`/workspace/xiaobin/RL_dataset/data`
|
| 6 |
+
|
| 7 |
+
It covers:
|
| 8 |
+
|
| 9 |
+
- InfoSeek annotations
|
| 10 |
+
- InfoSeek KB mapping files
|
| 11 |
+
- InfoSeek human set
|
| 12 |
+
- Wiki6M text files
|
| 13 |
+
- OVEN image snapshot on Hugging Face
|
| 14 |
+
- OVEN original-source image download workflow
|
| 15 |
+
|
| 16 |
+
InfoSeek images are derived from OVEN, so image download is handled through the OVEN release pipeline.
|
| 17 |
+
|
| 18 |
+
## 1. Recommended Directory Layout
|
| 19 |
+
|
| 20 |
+
```bash
|
| 21 |
+
mkdir -p /workspace/xiaobin/RL_dataset/data/infoseek
|
| 22 |
+
mkdir -p /workspace/xiaobin/RL_dataset/data/oven_hf
|
| 23 |
+
mkdir -p /workspace/xiaobin/RL_dataset/data/oven_source
|
| 24 |
+
```
|
| 25 |
+
|
| 26 |
+
Suggested usage:
|
| 27 |
+
|
| 28 |
+
- `/workspace/xiaobin/RL_dataset/data/infoseek`: InfoSeek jsonl files
|
| 29 |
+
- `/workspace/xiaobin/RL_dataset/data/oven_hf`: Hugging Face image snapshot files
|
| 30 |
+
- `/workspace/xiaobin/RL_dataset/data/oven_source`: upstream OVEN repo for original-source image download
|
| 31 |
+
|
| 32 |
+
## 2. Proxy Workaround
|
| 33 |
+
|
| 34 |
+
If your shell is configured with an invalid local proxy such as `127.0.0.1:7890`, use one of these patterns.
|
| 35 |
+
|
| 36 |
+
Temporarily disable proxy for a single command:
|
| 37 |
+
|
| 38 |
+
```bash
|
| 39 |
+
env -u http_proxy -u https_proxy -u HTTP_PROXY -u HTTPS_PROXY wget -c URL
|
| 40 |
+
```
|
| 41 |
+
|
| 42 |
+
Or disable proxy for the current shell session:
|
| 43 |
+
|
| 44 |
+
```bash
|
| 45 |
+
unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY
|
| 46 |
+
```
|
| 47 |
+
|
| 48 |
+
## 3. Download All InfoSeek Text Data With `wget`
|
| 49 |
+
|
| 50 |
+
This is the simplest full download for the released InfoSeek jsonl files.
|
| 51 |
+
|
| 52 |
+
```bash
|
| 53 |
+
#!/usr/bin/env bash
|
| 54 |
+
set -euo pipefail
|
| 55 |
+
|
| 56 |
+
TARGET_DIR="/workspace/xiaobin/RL_dataset/data/infoseek"
|
| 57 |
+
mkdir -p "${TARGET_DIR}"
|
| 58 |
+
cd "${TARGET_DIR}"
|
| 59 |
+
|
| 60 |
+
wget -c http://storage.googleapis.com/gresearch/open-vision-language/infoseek/infoseek_train.jsonl
|
| 61 |
+
wget -c http://storage.googleapis.com/gresearch/open-vision-language/infoseek/infoseek_val.jsonl
|
| 62 |
+
wget -c http://storage.googleapis.com/gresearch/open-vision-language/infoseek/infoseek_test.jsonl
|
| 63 |
+
wget -c http://storage.googleapis.com/gresearch/open-vision-language/infoseek/infoseek_train_withkb.jsonl
|
| 64 |
+
wget -c http://storage.googleapis.com/gresearch/open-vision-language/infoseek/infoseek_val_withkb.jsonl
|
| 65 |
+
wget -c http://storage.googleapis.com/gresearch/open-vision-language/infoseek/infoseek_human.jsonl
|
| 66 |
+
wget -c http://storage.googleapis.com/gresearch/open-vision-language/Wiki6M_ver_1_0.jsonl.gz
|
| 67 |
+
wget -c http://storage.googleapis.com/gresearch/open-vision-language/Wiki6M_ver_1_0_title_only.jsonl
|
| 68 |
+
|
| 69 |
+
ls -lh "${TARGET_DIR}"
|
| 70 |
+
```
|
| 71 |
+
|
| 72 |
+
## 4. Download All InfoSeek Text Data With `curl`
|
| 73 |
+
|
| 74 |
+
Use this if `wget` is not available.
|
| 75 |
+
|
| 76 |
+
```bash
|
| 77 |
+
#!/usr/bin/env bash
|
| 78 |
+
set -euo pipefail
|
| 79 |
+
|
| 80 |
+
TARGET_DIR="/workspace/xiaobin/RL_dataset/data/infoseek"
|
| 81 |
+
mkdir -p "${TARGET_DIR}"
|
| 82 |
+
cd "${TARGET_DIR}"
|
| 83 |
+
|
| 84 |
+
curl -L -O http://storage.googleapis.com/gresearch/open-vision-language/infoseek/infoseek_train.jsonl
|
| 85 |
+
curl -L -O http://storage.googleapis.com/gresearch/open-vision-language/infoseek/infoseek_val.jsonl
|
| 86 |
+
curl -L -O http://storage.googleapis.com/gresearch/open-vision-language/infoseek/infoseek_test.jsonl
|
| 87 |
+
curl -L -O http://storage.googleapis.com/gresearch/open-vision-language/infoseek/infoseek_train_withkb.jsonl
|
| 88 |
+
curl -L -O http://storage.googleapis.com/gresearch/open-vision-language/infoseek/infoseek_val_withkb.jsonl
|
| 89 |
+
curl -L -O http://storage.googleapis.com/gresearch/open-vision-language/infoseek/infoseek_human.jsonl
|
| 90 |
+
curl -L -O http://storage.googleapis.com/gresearch/open-vision-language/Wiki6M_ver_1_0.jsonl.gz
|
| 91 |
+
curl -L -O http://storage.googleapis.com/gresearch/open-vision-language/Wiki6M_ver_1_0_title_only.jsonl
|
| 92 |
+
|
| 93 |
+
ls -lh "${TARGET_DIR}"
|
| 94 |
+
```
|
| 95 |
+
|
| 96 |
+
## 5. Download Only Core InfoSeek Splits
|
| 97 |
+
|
| 98 |
+
If you only need the standard train/val/test annotations:
|
| 99 |
+
|
| 100 |
+
```bash
|
| 101 |
+
#!/usr/bin/env bash
|
| 102 |
+
set -euo pipefail
|
| 103 |
+
|
| 104 |
+
TARGET_DIR="/workspace/xiaobin/RL_dataset/data/infoseek"
|
| 105 |
+
mkdir -p "${TARGET_DIR}"
|
| 106 |
+
cd "${TARGET_DIR}"
|
| 107 |
+
|
| 108 |
+
wget -c http://storage.googleapis.com/gresearch/open-vision-language/infoseek/infoseek_train.jsonl
|
| 109 |
+
wget -c http://storage.googleapis.com/gresearch/open-vision-language/infoseek/infoseek_val.jsonl
|
| 110 |
+
wget -c http://storage.googleapis.com/gresearch/open-vision-language/infoseek/infoseek_test.jsonl
|
| 111 |
+
```
|
| 112 |
+
|
| 113 |
+
## 6. Download Only KB Mapping Files
|
| 114 |
+
|
| 115 |
+
```bash
|
| 116 |
+
#!/usr/bin/env bash
|
| 117 |
+
set -euo pipefail
|
| 118 |
+
|
| 119 |
+
TARGET_DIR="/workspace/xiaobin/RL_dataset/data/infoseek"
|
| 120 |
+
mkdir -p "${TARGET_DIR}"
|
| 121 |
+
cd "${TARGET_DIR}"
|
| 122 |
+
|
| 123 |
+
wget -c http://storage.googleapis.com/gresearch/open-vision-language/infoseek/infoseek_train_withkb.jsonl
|
| 124 |
+
wget -c http://storage.googleapis.com/gresearch/open-vision-language/infoseek/infoseek_val_withkb.jsonl
|
| 125 |
+
```
|
| 126 |
+
|
| 127 |
+
## 7. Download Only Human Eval Set
|
| 128 |
+
|
| 129 |
+
```bash
|
| 130 |
+
#!/usr/bin/env bash
|
| 131 |
+
set -euo pipefail
|
| 132 |
+
|
| 133 |
+
TARGET_DIR="/workspace/xiaobin/RL_dataset/data/infoseek"
|
| 134 |
+
mkdir -p "${TARGET_DIR}"
|
| 135 |
+
cd "${TARGET_DIR}"
|
| 136 |
+
|
| 137 |
+
wget -c http://storage.googleapis.com/gresearch/open-vision-language/infoseek/infoseek_human.jsonl
|
| 138 |
+
```
|
| 139 |
+
|
| 140 |
+
## 8. Download Only Wiki6M Files
|
| 141 |
+
|
| 142 |
+
```bash
|
| 143 |
+
#!/usr/bin/env bash
|
| 144 |
+
set -euo pipefail
|
| 145 |
+
|
| 146 |
+
TARGET_DIR="/workspace/xiaobin/RL_dataset/data/infoseek"
|
| 147 |
+
mkdir -p "${TARGET_DIR}"
|
| 148 |
+
cd "${TARGET_DIR}"
|
| 149 |
+
|
| 150 |
+
wget -c http://storage.googleapis.com/gresearch/open-vision-language/Wiki6M_ver_1_0.jsonl.gz
|
| 151 |
+
wget -c http://storage.googleapis.com/gresearch/open-vision-language/Wiki6M_ver_1_0_title_only.jsonl
|
| 152 |
+
```
|
| 153 |
+
|
| 154 |
+
Optional decompression:
|
| 155 |
+
|
| 156 |
+
```bash
|
| 157 |
+
gunzip -k /workspace/xiaobin/RL_dataset/data/infoseek/Wiki6M_ver_1_0.jsonl.gz
|
| 158 |
+
```
|
| 159 |
+
|
| 160 |
+
## 9. Download OVEN Image Snapshot From Hugging Face
|
| 161 |
+
|
| 162 |
+
Upstream OVEN now points image snapshot downloads to the gated dataset `ychenNLP/oven` on Hugging Face. Before downloading:
|
| 163 |
+
|
| 164 |
+
1. Open `https://huggingface.co/datasets/ychenNLP/oven`
|
| 165 |
+
2. Accept the dataset access conditions
|
| 166 |
+
3. Log in with the Hugging Face CLI
|
| 167 |
+
|
| 168 |
+
Install the CLI if needed:
|
| 169 |
+
|
| 170 |
+
```bash
|
| 171 |
+
python -m pip install -U "huggingface_hub[cli]"
|
| 172 |
+
```
|
| 173 |
+
|
| 174 |
+
Login:
|
| 175 |
+
|
| 176 |
+
```bash
|
| 177 |
+
hf auth login
|
| 178 |
+
```
|
| 179 |
+
|
| 180 |
+
Download the image snapshot and mapping file into `/workspace/xiaobin/RL_dataset/data/oven_hf`:
|
| 181 |
+
|
| 182 |
+
```bash
|
| 183 |
+
#!/usr/bin/env bash
|
| 184 |
+
set -euo pipefail
|
| 185 |
+
|
| 186 |
+
TARGET_DIR="/workspace/xiaobin/RL_dataset/data/oven_hf"
|
| 187 |
+
mkdir -p "${TARGET_DIR}"
|
| 188 |
+
|
| 189 |
+
hf download ychenNLP/oven \
|
| 190 |
+
--repo-type dataset \
|
| 191 |
+
--local-dir "${TARGET_DIR}" \
|
| 192 |
+
  --include "shard*.tar" "all_wikipedia_images.tar" "ovenid2impath.csv"
|
| 195 |
+
```
|
| 196 |
+
|
| 197 |
+
Extract the snapshot tar files:
|
| 198 |
+
|
| 199 |
+
```bash
|
| 200 |
+
#!/usr/bin/env bash
|
| 201 |
+
set -euo pipefail
|
| 202 |
+
|
| 203 |
+
HF_DIR="/workspace/xiaobin/RL_dataset/data/oven_hf"
|
| 204 |
+
IMG_DIR="/workspace/xiaobin/RL_dataset/data/infoseek/images"
|
| 205 |
+
mkdir -p "${IMG_DIR}"
|
| 206 |
+
|
| 207 |
+
for f in "${HF_DIR}"/shard*.tar; do
|
| 208 |
+
tar -xf "${f}" -C "${IMG_DIR}"
|
| 209 |
+
done
|
| 210 |
+
|
| 211 |
+
tar -xf "${HF_DIR}/all_wikipedia_images.tar" -C "${IMG_DIR}"
|
| 212 |
+
```
|
| 213 |
+
|
| 214 |
+
Notes:
|
| 215 |
+
|
| 216 |
+
- Hugging Face file listing shows `shard01.tar` to `shard08.tar` plus `all_wikipedia_images.tar`
|
| 217 |
+
- The compressed download is very large, roughly 293 GB based on the published file sizes
|
| 218 |
+
- You need additional free space for extraction
|
| 219 |
+
|
| 220 |
+
## 10. Download OVEN Images From Original Sources
|
| 221 |
+
|
| 222 |
+
This follows the upstream `oven_eval/image_downloads` workflow.
|
| 223 |
+
|
| 224 |
+
### 10.1 Clone the Upstream Repo
|
| 225 |
+
|
| 226 |
+
```bash
|
| 227 |
+
git clone https://github.com/edchengg/oven_eval /workspace/xiaobin/RL_dataset/data/oven_source/oven_eval
|
| 228 |
+
```
|
| 229 |
+
|
| 230 |
+
### 10.2 Run All Source Download Scripts
|
| 231 |
+
|
| 232 |
+
The upstream image download directory contains these scripts:
|
| 233 |
+
|
| 234 |
+
- `download_aircraft.sh`
|
| 235 |
+
- `download_car196.sh`
|
| 236 |
+
- `download_coco.sh`
|
| 237 |
+
- `download_food101.sh`
|
| 238 |
+
- `download_gldv2.sh`
|
| 239 |
+
- `download_imagenet.sh`
|
| 240 |
+
- `download_inat.sh`
|
| 241 |
+
- `download_oxfordflower.sh`
|
| 242 |
+
- `download_sports100.sh`
|
| 243 |
+
- `download_sun397.sh`
|
| 244 |
+
- `download_textvqa.sh`
|
| 245 |
+
- `download_v7w.sh`
|
| 246 |
+
- `download_vg.sh`
|
| 247 |
+
|
| 248 |
+
Run them one by one:
|
| 249 |
+
|
| 250 |
+
```bash
|
| 251 |
+
#!/usr/bin/env bash
|
| 252 |
+
set -euo pipefail
|
| 253 |
+
|
| 254 |
+
cd /workspace/xiaobin/RL_dataset/data/oven_source/oven_eval/image_downloads
|
| 255 |
+
|
| 256 |
+
bash download_aircraft.sh
|
| 257 |
+
bash download_car196.sh
|
| 258 |
+
bash download_coco.sh
|
| 259 |
+
bash download_food101.sh
|
| 260 |
+
bash download_gldv2.sh
|
| 261 |
+
bash download_imagenet.sh
|
| 262 |
+
bash download_inat.sh
|
| 263 |
+
bash download_oxfordflower.sh
|
| 264 |
+
bash download_sports100.sh
|
| 265 |
+
bash download_sun397.sh
|
| 266 |
+
bash download_textvqa.sh
|
| 267 |
+
bash download_v7w.sh
|
| 268 |
+
bash download_vg.sh
|
| 269 |
+
```
|
| 270 |
+
|
| 271 |
+
Or run them in a loop:
|
| 272 |
+
|
| 273 |
+
```bash
|
| 274 |
+
#!/usr/bin/env bash
|
| 275 |
+
set -euo pipefail
|
| 276 |
+
|
| 277 |
+
cd /workspace/xiaobin/RL_dataset/data/oven_source/oven_eval/image_downloads
|
| 278 |
+
|
| 279 |
+
for script in download_*.sh; do
|
| 280 |
+
bash "${script}"
|
| 281 |
+
done
|
| 282 |
+
```
|
| 283 |
+
|
| 284 |
+
### 10.3 Download `ovenid2impath.csv`
|
| 285 |
+
|
| 286 |
+
You need `ovenid2impath.csv` for the merge step. The current recommended source is the Hugging Face dataset:
|
| 287 |
+
|
| 288 |
+
```bash
|
| 289 |
+
#!/usr/bin/env bash
|
| 290 |
+
set -euo pipefail
|
| 291 |
+
|
| 292 |
+
TARGET_DIR="/workspace/xiaobin/RL_dataset/data/oven_hf"
|
| 293 |
+
mkdir -p "${TARGET_DIR}"
|
| 294 |
+
|
| 295 |
+
hf download ychenNLP/oven \
|
| 296 |
+
--repo-type dataset \
|
| 297 |
+
--local-dir "${TARGET_DIR}" \
|
| 298 |
+
--include "ovenid2impath.csv"
|
| 299 |
+
```
|
| 300 |
+
|
| 301 |
+
### 10.4 Merge Into the Final OVEN Image Layout
|
| 302 |
+
|
| 303 |
+
Run the upstream merge script after all downloads finish:
|
| 304 |
+
|
| 305 |
+
```bash
|
| 306 |
+
cd /workspace/xiaobin/RL_dataset/data/oven_source/oven_eval/image_downloads
|
| 307 |
+
python merge_oven_images.py
|
| 308 |
+
```
|
| 309 |
+
|
| 310 |
+
The upstream documentation states that `merge_oven_images.py` should be run after all image download scripts complete and after `ovenid2impath.csv` is available.
|
| 311 |
+
|
| 312 |
+
## 11. Verify the Downloaded Files
|
| 313 |
+
|
| 314 |
+
Check text files:
|
| 315 |
+
|
| 316 |
+
```bash
|
| 317 |
+
ls -lh /workspace/xiaobin/RL_dataset/data/infoseek
|
| 318 |
+
```
|
| 319 |
+
|
| 320 |
+
Check Hugging Face snapshot files:
|
| 321 |
+
|
| 322 |
+
```bash
|
| 323 |
+
ls -lh /workspace/xiaobin/RL_dataset/data/oven_hf
|
| 324 |
+
```
|
| 325 |
+
|
| 326 |
+
Check extracted images:
|
| 327 |
+
|
| 328 |
+
```bash
|
| 329 |
+
find /workspace/xiaobin/RL_dataset/data/infoseek/images -type f | wc -l
|
| 330 |
+
```
|
| 331 |
+
|
| 332 |
+
## 12. Upstream References
|
| 333 |
+
|
| 334 |
+
- InfoSeek release page: `https://github.com/open-vision-language/infoseek`
|
| 335 |
+
- OVEN image download page: `https://github.com/edchengg/oven_eval/tree/main/image_downloads`
|
| 336 |
+
- Hugging Face OVEN dataset: `https://huggingface.co/datasets/ychenNLP/oven`
|
| 337 |
+
- Hugging Face CLI download docs: `https://huggingface.co/docs/huggingface_hub/guides/cli`
|
RL_dataset/README.md
ADDED
|
@@ -0,0 +1,171 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: apache-2.0
|
| 3 |
+
task_categories:
|
| 4 |
+
- question-answering
|
| 5 |
+
tags:
|
| 6 |
+
- deep-research
|
| 7 |
+
- hierarchical-reasoning
|
| 8 |
+
- multi-hop-qa
|
| 9 |
+
- synthetic-data
|
| 10 |
+
- data-synthesis
|
| 11 |
+
language:
|
| 12 |
+
- en
|
| 13 |
+
---
|
| 14 |
+
|
| 15 |
+
# InfoSeek: Open Data Synthesis For Deep Research
|
| 16 |
+
|
| 17 |
+
[Paper](https://huggingface.co/papers/2509.00375) | [Code](https://github.com/VectorSpaceLab/InfoSeek)
|
| 18 |
+
|
| 19 |
+
## Dataset Information
|
| 20 |
+
|
| 21 |
+
* **`data/InfoSeek.jsonl`**
|
| 22 |
+
Contains the full research tree structures of *InfoSeek*. Each sample starts from a root node with a research question, its corresponding entity, and process information for sub-questions (stored in `root`). Also expands into intermediate tree structure during each step of construction (stored in `all_tree_list`). Totally 52K samples.
|
| 23 |
+
|
| 24 |
+
* **`data/InfoSeekQA.jsonl`**
|
| 25 |
+
A collection of QA pairs derived from *InfoSeek*. Each entry corresponds to the final question (`sample['root']['question']`) and its answer entity (`sample['root']['entity']`) in `InfoSeek.jsonl`.
|
| 26 |
+
|
| 27 |
+
* **`data/InfoSeek-Hard-18K.jsonl`**
|
| 28 |
+
A challenging subset of *InfoSeek* (18K samples), better suited for end-to-end RL, identified using an LLM with a dedicated prompt for complex deep research.
|
| 29 |
+
|
| 30 |
+
* **`data/Trajectory-RFT-17K.jsonl`**
|
| 31 |
+
Contains 17K reasoning trajectories generated through the workflow described in our paper. These can be used as training data for supervised fine-tuning (SFT).
|
| 32 |
+
|
| 33 |
+
## Abstract
|
| 34 |
+
Large language models (LLMs) are increasingly expected to go beyond simple factual queries toward Deep Research-tasks that require decomposing questions into sub-problems, coordinating multi-step reasoning, and synthesizing evidence from diverse sources. We formalize Deep Research tasks with verifiable answers as Hierarchical Constraint Satisfaction Problems (HCSPs), which are fundamentally different from single-constraint, multi-hop, or flat CSP formulations. However, existing benchmarks (e.g., Natural Questions, HotpotQA) fail to capture this complexity, while recent synthetic datasets often introduce shortcut reasoning, knowledge leakage, or lack sufficient structural depth. To address this gap, we introduce InfoSeek, a scalable framework for synthesizing complex Deep Research tasks. InfoSeek uses a dual-agent system to recursively build a Research Tree from large-scale webpages, blurring intermediate nodes into valid sub-problems, and converting these trees into natural language questions that require traversing the full hierarchy. It also enables rapid scaling, yielding over 50K training examples, a curated test set, and reasoning trajectories generated via reject sampling. Experiments show that models trained on InfoSeek consistently outperform strong baselines. On a challenging benchmark BrowseComp-Plus, 3B LLMs optimized with InfoSeek surpass much larger 32B models and lightweight commercial APIs (e.g., Gemini2.5-Flash), while achieving performance comparable to stronger APIs (e.g., Gemini2.5-Pro). By preserving meta-information such as intermediate steps and retrieval labels, InfoSeek further supports advanced optimization strategies, including compound reward design and trajectory-level exploration.
|
| 35 |
+
|
| 36 |
+
## 🔆 Overview
|
| 37 |
+
We propose **InfoSeek**, a scalable data synthesis framework for constructing structurally complex Deep Research tasks. InfoSeek designs a dual-agent system to recursively build a *Research Tree* by mining entities and relations from large-scale text, and blurring intermediate vertices to ensure they form valid sub-problems. The agents then transform these trees into natural language questions whose solutions require traversing the entire hierarchy. Using the InfoSeek pipeline, we construct a high-quality, complexity-controllable, and intrinsically verifiable dataset.
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
### Example 1:
|
| 41 |
+
**Question:** What is a species of bird that was named by a person employed under his father between 1818 and 1824, whose wife was a British artist, and which has three subspecies and body length is generally no more than 6 inches?
|
| 42 |
+
|
| 43 |
+
**Answer:** Russet sparrow
|
| 44 |
+
|
| 45 |
+
<details>
|
| 46 |
+
<summary>Tree Structure</summary>
|
| 47 |
+
|
| 48 |
+
```
|
| 49 |
+
{
|
| 50 |
+
"root": {
|
| 51 |
+
"id": "A",
|
| 52 |
+
"entity": "Russet sparrow",
|
| 53 |
+
"question": "What is a species of bird that was named by a person employed under his father between 1818 and 1824, whose wife was a British artist, and which has three subspecies and body length is generally no more than 6 inches?",
|
| 54 |
+
"claims": [
|
| 55 |
+
{ "target_id": "B", "claim": "A was named by B" },
|
| 56 |
+
{ "target_id": "C", "claim": "A has three subspecies" },
|
| 57 |
+
{ "target_id": "D", "claim": "A's body length is generally no more than 6 inches" }
|
| 58 |
+
],
|
| 59 |
+
"children": [
|
| 60 |
+
{
|
| 61 |
+
"id": "B",
|
| 62 |
+
"entity": "John Gould",
|
| 63 |
+
"claims": [
|
| 64 |
+
{ "target_id": "E", "claim": "B was employed by his father between 1818 and 1824" },
|
| 65 |
+
{ "target_id": "F", "claim": "B's wife was F" }
|
| 66 |
+
],
|
| 67 |
+
"children": [
|
| 68 |
+
{ "id": "E", "entity": "None", "claims": [], "children": [] },
|
| 69 |
+
{ "id": "F", "entity": "Elizabeth Gould", "claims": [], "children": [] }
|
| 70 |
+
]
|
| 71 |
+
},
|
| 72 |
+
{ "id": "C", "entity": "None", "claims": [], "children": [] },
|
| 73 |
+
{ "id": "D", "entity": "None", "claims": [], "children": [] }
|
| 74 |
+
]
|
| 75 |
+
}
|
| 76 |
+
}
|
| 77 |
+
```
|
| 78 |
+
|
| 79 |
+
```
|
| 80 |
+
(A: Russet sparrow)
|
| 81 |
+
│
|
| 82 |
+
│
|
| 83 |
+
│── [claim] "was named by" ──> (B: John Gould)
|
| 84 |
+
│ │
|
| 85 |
+
│ │
|
| 86 |
+
│ │── [claim] "was employed by his father (1818-1824)"
|
| 87 |
+
│ │
|
| 88 |
+
│ │
|
| 89 |
+
│ │── [claim] "wife was" ──> (F: Elizabeth Gould)
|
| 90 |
+
│
|
| 91 |
+
│
|
| 92 |
+
│── [claim] "has three subspecies"
|
| 93 |
+
│
|
| 94 |
+
│
|
| 95 |
+
│── [claim] "body length is generally no more than 6 inches"
|
| 96 |
+
```
|
| 97 |
+
</details>
|
| 98 |
+
|
| 99 |
+
### Example 2:
|
| 100 |
+
|
| 101 |
+
**Question:** What is a women's football team whose first goals in the 2. Bundesliga were scored by a player born in Korogocho, who was discovered and developed by the Mathare Youth Sports Association?
|
| 102 |
+
|
| 103 |
+
**Answer:** SV Werder Bremen (women)
|
| 104 |
+
|
| 105 |
+
<details>
|
| 106 |
+
<summary>Tree Structure</summary>
|
| 107 |
+
|
| 108 |
+
```
|
| 109 |
+
{
|
| 110 |
+
"root": {
|
| 111 |
+
"id": "A",
|
| 112 |
+
"entity": "SV Werder Bremen (women)",
|
| 113 |
+
"question": "What is a women's football team whose first goals in the 2. Bundesliga were scored by a player born in Korogocho, who was discovered and developed by the Mathare Youth Sports Association?",
|
| 114 |
+
"claims": [
|
| 115 |
+
{ "target_id": "B", "claim": "A's first goals in the 2. Bundesliga were scored by B" }
|
| 116 |
+
],
|
| 117 |
+
"children": [
|
| 118 |
+
{
|
| 119 |
+
"id": "B",
|
| 120 |
+
"entity": "Doreen Nabwire",
|
| 121 |
+
"claims": [
|
| 122 |
+
{ "target_id": "C", "claim": "B was discovered and developed by C" },
|
| 123 |
+
{ "target_id": "D", "claim": "B was born in D" }
|
| 124 |
+
],
|
| 125 |
+
"children": [
|
| 126 |
+
{ "id": "C", "entity": "Mathare Youth Sports Association", "claims": [], "children": [] },
|
| 127 |
+
{ "id": "D", "entity": "Korogocho", "claims": [], "children": [] }
|
| 128 |
+
]
|
| 129 |
+
}
|
| 130 |
+
]
|
| 131 |
+
}
|
| 132 |
+
}
|
| 133 |
+
```
|
| 134 |
+
|
| 135 |
+
```
|
| 136 |
+
(A: SV Werder Bremen (women))
|
| 137 |
+
│
|
| 138 |
+
│
|
| 139 |
+
│── [claim] "first goals scored by" ──> (B: Doreen Nabwire)
|
| 140 |
+
│
|
| 141 |
+
│
|
| 142 |
+
│── [claim] "discovered and developed by" ──> (C:Mathare Youth Sports Association)
|
| 143 |
+
│
|
| 144 |
+
│
|
| 145 |
+
│── [claim] "was born in" ──> (D: Korogocho)
|
| 146 |
+
```
|
| 147 |
+
</details>
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
## 📊 Performance
|
| 151 |
+
Model trained on InfoSeek and our framework shows strong performances on traditional multi-hop benchmarks:
|
| 152 |
+
|
| 153 |
+
<img src="https://github.com/VectorSpaceLab/InfoSeek/raw/main/assets/results.png" width="800">
|
| 154 |
+
|
| 155 |
+
Our 3B model shows competitive results on [BrowseComp-Plus](https://github.com/texttron/BrowseComp-Plus):
|
| 156 |
+
|
| 157 |
+
<img src="https://github.com/VectorSpaceLab/InfoSeek/raw/main/assets/browsecomp_plus.png" width="800">
|
| 158 |
+
|
| 159 |
+
## ❤️ Citing Us
|
| 160 |
+
If you find this repository or our work useful, please consider giving a star ⭐ and or citing our work, which would be greatly appreciated:
|
| 161 |
+
```bibtex
|
| 162 |
+
@misc{xia2025opendatasynthesisdeep,
|
| 163 |
+
title={Open Data Synthesis For Deep Research},
|
| 164 |
+
author={Ziyi Xia and Kun Luo and Hongjin Qian and Zheng Liu},
|
| 165 |
+
      year={2025},
|
| 166 |
+
eprint={2509.00375},
|
| 167 |
+
archivePrefix={arXiv},
|
| 168 |
+
      primaryClass={cs.CL},
|
| 169 |
+
url={https://arxiv.org/abs/2509.00375},
|
| 170 |
+
}
|
| 171 |
+
```
|
RL_dataset/dataset_infos.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"default": {"features": {"root": {"_type": "Value"}, "all_tree_list": {"_type": "Value"}, "vertices": {"_type": "Value"}}, "splits": {"train": {"name": "train", "dataset_name": "InfoSeek"}}}}
|
RL_dataset/download_oven_hf_mirror.sh
ADDED
|
@@ -0,0 +1,189 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
#
# Download the gated OVEN dataset (metadata files and/or image tar shards)
# from a Hugging Face mirror into /workspace/xiaobin/RL_dataset/data.
#
# Usage:
#   bash download_oven_hf_mirror.sh [meta|images|all]
set -euo pipefail

MODE="${1:-all}"

REPO_ID="ychenNLP/oven"
TARGET_DIR="/workspace/xiaobin/RL_dataset/data"
CACHE_DIR="${TARGET_DIR}/.hf_cache"
ASSETS_DIR="${TARGET_DIR}/.hf_assets"
DEFAULT_ENDPOINT="https://hf-mirror.com"
# Endpoint precedence: HF_ENDPOINT > HF_ENDPOINT_OVERRIDE > mirror default.
MIRROR_URL="${HF_ENDPOINT:-${HF_ENDPOINT_OVERRIDE:-${DEFAULT_ENDPOINT}}}"
# SECURITY: never commit a real token here. A live token was previously
# hardcoded in this variable and must be considered leaked -- revoke it.
# Authenticate via `export HF_TOKEN=...` or `hf auth login` instead.
HARDCODED_TOKEN=""
META_FILES=(
  "download_infoseek_jsonl.sh"
  "download_oven_jsonl.sh"
  "ovenid2impath.csv"
)
IMAGE_FILES=(
  "shard01.tar"
  "shard02.tar"
  "shard03.tar"
  "shard04.tar"
  "shard05.tar"
  "shard06.tar"
  "shard07.tar"
  "shard08.tar"
  "all_wikipedia_images.tar"
)

# Clear any (possibly broken, e.g. 127.0.0.1:7890) proxy settings for this run.
unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY all_proxy ALL_PROXY

export HF_ENDPOINT="${MIRROR_URL}"
export HF_HUB_CACHE="${CACHE_DIR}"
export HF_ASSETS_CACHE="${ASSETS_DIR}"

mkdir -p "${TARGET_DIR}" "${CACHE_DIR}" "${ASSETS_DIR}"

# Prefer the new `hf` CLI; fall back to the legacy `huggingface-cli`.
# Record a matching whoami command so require_auth works with either binary
# (the original always ran `hf auth whoami`, which fails when only
# huggingface-cli is installed even though the user is logged in).
if command -v hf >/dev/null 2>&1; then
  HF_BIN=(hf download)
  WHOAMI_BIN=(hf auth whoami)
elif command -v huggingface-cli >/dev/null 2>&1; then
  HF_BIN=(huggingface-cli download)
  WHOAMI_BIN=(huggingface-cli whoami)
else
  echo "Missing Hugging Face CLI. Install it with:" >&2
  echo "  python -m pip install -U \"huggingface_hub[cli]\"" >&2
  exit 1
fi

# Pass --token explicitly when one is available via the environment.
TOKEN_ARGS=()
if [[ -n "${HF_TOKEN:-}" ]]; then
  TOKEN_ARGS=(--token "${HF_TOKEN}")
elif [[ -n "${HARDCODED_TOKEN}" ]]; then
  TOKEN_ARGS=(--token "${HARDCODED_TOKEN}")
fi

print_help() {
  cat <<'EOF'
Usage:
  bash download_oven_hf_mirror.sh [meta|images|all]

Modes:
  meta     Download metadata files only:
           - download_infoseek_jsonl.sh
           - download_oven_jsonl.sh
           - ovenid2impath.csv
  images   Download image tar files only:
           - shard01.tar ... shard08.tar
           - all_wikipedia_images.tar
  all      Download both metadata and image tar files

Behavior:
  - unsets proxy variables before downloading
  - uses the mirror endpoint: https://hf-mirror.com
  - endpoint can be overridden:
      HF_ENDPOINT=https://huggingface.co bash download_oven_hf_mirror.sh meta
  - stores downloaded files in: /workspace/xiaobin/RL_dataset/data
  - stores Hugging Face cache in: /workspace/xiaobin/RL_dataset/data/.hf_cache

Notes:
  - The dataset is gated. First accept access at:
      https://huggingface.co/datasets/ychenNLP/oven
  - Authenticate with `hf auth login`, or export a token before running:
      export HF_TOKEN=hf_xxx
EOF
}

if [[ "${MODE}" == "-h" || "${MODE}" == "--help" || "${MODE}" == "help" ]]; then
  print_help
  exit 0
fi

# Fail early with actionable guidance when no authentication is available.
require_auth() {
  if [[ -n "${HF_TOKEN:-}" ]]; then
    return 0
  fi

  if "${WHOAMI_BIN[@]}" >/dev/null 2>&1; then
    return 0
  fi

  echo "No Hugging Face authentication detected." >&2
  echo "Do this first:" >&2
  echo "  1. Open https://huggingface.co/datasets/ychenNLP/oven and accept access." >&2
  echo "  2. Run: hf auth login" >&2
  echo "     or: export HF_TOKEN=hf_xxx" >&2
  exit 2
}

# Run a download command; on failure print troubleshooting hints and exit.
run_download() {
  if ! "$@"; then
    echo >&2
    echo "Download failed." >&2
    echo "Check these items:" >&2
    echo "  - access was approved for https://huggingface.co/datasets/ychenNLP/oven" >&2
    echo "  - HF_TOKEN is valid, or 'hf auth login' succeeded" >&2
    echo "  - the mirror endpoint is reachable: ${MIRROR_URL}" >&2
    exit 1
  fi
}

# Verify every expected file landed in TARGET_DIR; exit non-zero otherwise.
verify_files() {
  local missing=0
  local file

  for file in "$@"; do
    if [[ ! -f "${TARGET_DIR}/${file}" ]]; then
      echo "Missing expected file: ${TARGET_DIR}/${file}" >&2
      missing=1
    fi
  done

  if [[ "${missing}" -ne 0 ]]; then
    echo >&2
    echo "Download did not complete successfully." >&2
    echo "This usually means one of these:" >&2
    echo "  - the mirror endpoint could not be reached" >&2
    echo "  - access to the gated dataset was not approved" >&2
    echo "  - authentication was missing or invalid" >&2
    exit 1
  fi
}

# NOTE: `--include` accepts a list of patterns after a single flag. The
# original passed `--include` several times, but argparse with nargs="*"
# keeps only the LAST occurrence, so all but one pattern were silently
# dropped. Pass all patterns after one `--include` instead.
# `${ARR[@]+"${ARR[@]}"}` expands an empty array safely under `set -u`
# on bash < 4.4.
download_meta() {
  run_download "${HF_BIN[@]}" "${REPO_ID}" \
    --repo-type dataset \
    --local-dir "${TARGET_DIR}" \
    --include "${META_FILES[@]}" \
    ${TOKEN_ARGS[@]+"${TOKEN_ARGS[@]}"}
  verify_files "${META_FILES[@]}"
}

download_images() {
  run_download "${HF_BIN[@]}" "${REPO_ID}" \
    --repo-type dataset \
    --local-dir "${TARGET_DIR}" \
    --include "${IMAGE_FILES[@]}" \
    ${TOKEN_ARGS[@]+"${TOKEN_ARGS[@]}"}
  verify_files "${IMAGE_FILES[@]}"
}

require_auth

case "${MODE}" in
  meta)
    download_meta
    ;;
  images)
    download_images
    ;;
  all)
    download_meta
    download_images
    ;;
  *)
    echo "Unknown mode: ${MODE}" >&2
    print_help >&2
    exit 1
    ;;
esac

echo "Download completed. Files are under: ${TARGET_DIR}"
|
RL_dataset/download_scienceqa_hf.sh
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# Download the ScienceQA dataset: HF parquet metadata and/or original images.
# Fail fast on any error, unset variable, or pipeline failure.
set -euo pipefail

# Mode comes from the first CLI argument; defaults to "all".
MODE="${1:-all}"

REPO_ID="derek-thomas/ScienceQA"
ROOT_DIR="/workspace/xiaobin/RL_dataset/data/ScienceQA"
HF_DIR="${ROOT_DIR}/hf"            # parquet files + repo docs land here
IMG_DIR="${ROOT_DIR}/images"       # extracted image splits land here
CACHE_DIR="${ROOT_DIR}/.hf_cache"  # dedicated HF cache, kept out of $HOME
DEFAULT_ENDPOINT="https://hf-mirror.com"
# Endpoint precedence: HF_ENDPOINT > HF_ENDPOINT_OVERRIDE > mirror default.
HF_ENDPOINT_VALUE="${HF_ENDPOINT:-${HF_ENDPOINT_OVERRIDE:-${DEFAULT_ENDPOINT}}}"

# Drop any inherited proxy settings so downloads go straight to the mirror.
# (unset on a nonexistent variable is harmless even under `set -u`.)
unset http_proxy
unset https_proxy
unset HTTP_PROXY
unset HTTPS_PROXY
unset all_proxy
unset ALL_PROXY

export HF_ENDPOINT="${HF_ENDPOINT_VALUE}"

mkdir -p "${HF_DIR}" "${IMG_DIR}" "${CACHE_DIR}"

# Prefer the newer `hf` CLI; fall back to the legacy `huggingface-cli`.
# HF_BIN is an array so the two-word command expands safely when invoked.
if command -v hf >/dev/null 2>&1; then
  HF_BIN=(hf download)
elif command -v huggingface-cli >/dev/null 2>&1; then
  HF_BIN=(huggingface-cli download)
else
  echo "Missing Hugging Face CLI. Install it with:" >&2
  echo "  python -m pip install -U \"huggingface_hub[cli]\"" >&2
  exit 1
fi
|
| 34 |
+
|
| 35 |
+
# Print usage text to stdout. The heredoc delimiter is quoted ('EOF'), so
# no variable expansion happens inside the help body.
print_help() {
  cat <<'EOF'
Usage:
  bash download_scienceqa_hf.sh [parquet|images|all]

Modes:
  parquet   Download the public Hugging Face parquet files only
  images    Download the original ScienceQA image zip files only
  all       Download both parquet files and images

Output layout:
  /workspace/xiaobin/RL_dataset/data/ScienceQA/hf
  /workspace/xiaobin/RL_dataset/data/ScienceQA/images

Notes:
  - This dataset is public and should not require an HF token.
  - Image URLs are adapted from:
      /workspace/xiaobin/RL_dataset/ScienceQA/tools/download.sh
  - Proxies are unset before download.
  - Default HF endpoint: https://hf-mirror.com
  - To override and use the official endpoint:
      HF_ENDPOINT=https://huggingface.co bash download_scienceqa_hf.sh parquet
EOF
}
|
| 59 |
+
|
| 60 |
+
if [[ "${MODE}" == "-h" || "${MODE}" == "--help" || "${MODE}" == "help" ]]; then
|
| 61 |
+
print_help
|
| 62 |
+
exit 0
|
| 63 |
+
fi
|
| 64 |
+
|
| 65 |
+
# Abort the script unless at least one file matches the given glob.
verify_glob() {
  local glob_pat="$1"

  # compgen -G expands the glob; exit status 0 means >= 1 match exists.
  if compgen -G "${glob_pat}" >/dev/null; then
    return 0
  fi

  echo "Missing expected file matching: ${glob_pat}" >&2
  exit 1
}
|
| 73 |
+
|
| 74 |
+
# Fetch the parquet splits (plus the repo README and loader script) into
# HF_DIR, then verify every split actually landed on disk.
download_parquet() {
  # BUG FIX: all patterns must be given to a single --include. The CLI's
  # --include option is argparse nargs="*", so repeating the flag keeps
  # only the LAST occurrence — the original three --include flags silently
  # dropped the "data/*.parquet" pattern.
  "${HF_BIN[@]}" "${REPO_ID}" \
    --repo-type dataset \
    --cache-dir "${CACHE_DIR}" \
    --local-dir "${HF_DIR}" \
    --include "data/*.parquet" "README.md" "ScienceQA.py"

  verify_glob "${HF_DIR}/data/train-*.parquet"
  verify_glob "${HF_DIR}/data/validation-*.parquet"
  verify_glob "${HF_DIR}/data/test-*.parquet"
}
|
| 87 |
+
|
| 88 |
+
# Download and extract one image split ("train", "val" or "test").
# Idempotent: skips entirely if the split directory already exists; the
# zip archive is deleted after successful extraction.
download_one_split() {
  local split="$1"
  local zip_path="${IMG_DIR}/${split}.zip"
  local split_dir="${IMG_DIR}/${split}"
  # Official ScienceQA S3 bucket hosting the original images.
  local url="https://scienceqa.s3.us-west-1.amazonaws.com/images/${split}.zip"

  if [[ -d "${split_dir}" ]]; then
    echo "Image split already exists: ${split_dir}"
    return 0
  fi

  # -c resumes a partial download into the fixed output path (-O).
  wget -c -O "${zip_path}" "${url}"
  unzip -q -o "${zip_path}" -d "${IMG_DIR}"
  rm -f "${zip_path}"

  # The archive is expected to contain a top-level "<split>/" directory;
  # treat its absence as a failed extraction.
  if [[ ! -d "${split_dir}" ]]; then
    echo "Failed to extract image split: ${split}" >&2
    exit 1
  fi
}
|
| 108 |
+
|
| 109 |
+
# Fetch every image split in sequence (train, then val, then test).
download_images() {
  local split
  for split in train val test; do
    download_one_split "${split}"
  done
}
|
| 114 |
+
|
| 115 |
+
# Dispatch on the requested mode; any unknown value prints usage to
# stderr and exits non-zero.
case "${MODE}" in
  parquet)
    download_parquet
    ;;
  images)
    download_images
    ;;
  all)
    # "all" runs both phases, parquet metadata first.
    download_parquet
    download_images
    ;;
  *)
    echo "Unknown mode: ${MODE}" >&2
    print_help >&2
    exit 1
    ;;
esac

echo "Download completed."
echo "Parquet dir: ${HF_DIR}"
echo "Image dir: ${IMG_DIR}"
|
download_hf.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
"""
Hugging Face resumable download script.

Mirror endpoint: hf-mirror.com
Target repo: MMInstruction/M3IT
"""

import os
import sys

# Point the hub client at the mirror. Set BEFORE importing huggingface_hub
# so the library picks it up — presumably it reads HF_ENDPOINT at import
# time; verify against the installed huggingface_hub version.
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"

from huggingface_hub import snapshot_download
from huggingface_hub import hf_hub_download  # NOTE(review): unused in this file
import huggingface_hub  # NOTE(review): unused in this file

REPO_ID = "MMInstruction/M3IT"  # dataset repository to mirror locally
LOCAL_DIR = "/workspace/xiaobin/dataset"  # destination directory on disk
REPO_TYPE = "dataset"  # M3IT is a dataset repo (not a model)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def download() -> None:
    """Snapshot-download REPO_ID into LOCAL_DIR via the mirror endpoint.

    Prints progress banners, creates the target directory, and exits the
    process with status 1 on any download failure.
    """
    for banner in (
        f"镜像站: {os.environ['HF_ENDPOINT']}",
        f"下载仓库: {REPO_ID}",
        f"保存目录: {LOCAL_DIR}",
        "-" * 50,
    ):
        print(banner)

    os.makedirs(LOCAL_DIR, exist_ok=True)

    # Collect the call arguments up front, then fire the download once.
    fetch_kwargs = dict(
        repo_id=REPO_ID,
        repo_type=REPO_TYPE,
        local_dir=LOCAL_DIR,
        # Copy real files rather than cache symlinks.
        # NOTE(review): deprecated/ignored on recent huggingface_hub — confirm.
        local_dir_use_symlinks=False,
        # Resume partially downloaded files.
        # NOTE(review): deprecated on recent huggingface_hub (always resumes).
        resume_download=True,
        ignore_patterns=["*.gitattributes"],
    )

    try:
        snapshot_download(**fetch_kwargs)
        print("\n下载完成!")
    except Exception as e:  # top-level CLI boundary: report and exit non-zero
        print(f"\n出错: {e}")
        print("提示: 如果是模型仓库,请将 REPO_TYPE 改为 'model' 后重试")
        sys.exit(1)
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
# Script entry point: run the download only when executed directly.
if __name__ == "__main__":
    download()
|