Zhaoming213 commited on
Commit
9985989
·
verified ·
1 Parent(s): 4684152

Upload 58 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +23 -0
  2. minimind-master/.DS_Store +0 -0
  3. minimind-master/.gitignore +4 -0
  4. minimind-master/CODE_OF_CONDUCT.md +128 -0
  5. minimind-master/LICENSE +201 -0
  6. minimind-master/README.md +0 -0
  7. minimind-master/README_en.md +0 -0
  8. minimind-master/dataset/__init__.py +0 -0
  9. minimind-master/dataset/dataset.md +5 -0
  10. minimind-master/dataset/lm_dataset.py +218 -0
  11. minimind-master/dataset/sft_mini_512.jsonl +3 -0
  12. minimind-master/eval_llm.py +92 -0
  13. minimind-master/images/1-wiki.png +3 -0
  14. minimind-master/images/2-wiki.png +0 -0
  15. minimind-master/images/3-wiki.png +3 -0
  16. minimind-master/images/4-wiki.png +3 -0
  17. minimind-master/images/5-wiki.png +3 -0
  18. minimind-master/images/LLM-structure-moe.png +3 -0
  19. minimind-master/images/LLM-structure.png +3 -0
  20. minimind-master/images/and_huggingface.png +3 -0
  21. minimind-master/images/and_modelscope.png +3 -0
  22. minimind-master/images/compare_radar.png +3 -0
  23. minimind-master/images/dataset.jpg +3 -0
  24. minimind-master/images/gpt3_config.png +0 -0
  25. minimind-master/images/logo.png +3 -0
  26. minimind-master/images/logo2.png +3 -0
  27. minimind-master/images/minimind2.gif +3 -0
  28. minimind-master/images/pre_512_loss.png +3 -0
  29. minimind-master/images/pre_768_loss.png +3 -0
  30. minimind-master/images/rope_ppl.png +0 -0
  31. minimind-master/images/sft_512_loss.png +3 -0
  32. minimind-master/images/sft_768_loss.png +3 -0
  33. minimind-master/images/train_grpo_512.png +3 -0
  34. minimind-master/images/train_grpo_768.png +3 -0
  35. minimind-master/images/train_ppo_512.png +3 -0
  36. minimind-master/images/train_ppo_768.png +3 -0
  37. minimind-master/images/train_spo_768.png +3 -0
  38. minimind-master/model/__init__.py +0 -0
  39. minimind-master/model/model_lora.py +53 -0
  40. minimind-master/model/model_minimind.py +463 -0
  41. minimind-master/model/tokenizer.json +0 -0
  42. minimind-master/model/tokenizer_config.json +43 -0
  43. minimind-master/out/pretrain_512.pth +3 -0
  44. minimind-master/requirements.txt +31 -0
  45. minimind-master/scripts/chat_openai_api.py +33 -0
  46. minimind-master/scripts/convert_model.py +77 -0
  47. minimind-master/scripts/serve_openai_api.py +177 -0
  48. minimind-master/scripts/web_demo.py +328 -0
  49. minimind-master/trainer/train_distillation.py +235 -0
  50. minimind-master/trainer/train_dpo.py +219 -0
.gitattributes CHANGED
@@ -33,3 +33,26 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ minimind-master/dataset/sft_mini_512.jsonl filter=lfs diff=lfs merge=lfs -text
37
+ minimind-master/images/1-wiki.png filter=lfs diff=lfs merge=lfs -text
38
+ minimind-master/images/3-wiki.png filter=lfs diff=lfs merge=lfs -text
39
+ minimind-master/images/4-wiki.png filter=lfs diff=lfs merge=lfs -text
40
+ minimind-master/images/5-wiki.png filter=lfs diff=lfs merge=lfs -text
41
+ minimind-master/images/and_huggingface.png filter=lfs diff=lfs merge=lfs -text
42
+ minimind-master/images/and_modelscope.png filter=lfs diff=lfs merge=lfs -text
43
+ minimind-master/images/compare_radar.png filter=lfs diff=lfs merge=lfs -text
44
+ minimind-master/images/dataset.jpg filter=lfs diff=lfs merge=lfs -text
45
+ minimind-master/images/LLM-structure-moe.png filter=lfs diff=lfs merge=lfs -text
46
+ minimind-master/images/LLM-structure.png filter=lfs diff=lfs merge=lfs -text
47
+ minimind-master/images/logo.png filter=lfs diff=lfs merge=lfs -text
48
+ minimind-master/images/logo2.png filter=lfs diff=lfs merge=lfs -text
49
+ minimind-master/images/minimind2.gif filter=lfs diff=lfs merge=lfs -text
50
+ minimind-master/images/pre_512_loss.png filter=lfs diff=lfs merge=lfs -text
51
+ minimind-master/images/pre_768_loss.png filter=lfs diff=lfs merge=lfs -text
52
+ minimind-master/images/sft_512_loss.png filter=lfs diff=lfs merge=lfs -text
53
+ minimind-master/images/sft_768_loss.png filter=lfs diff=lfs merge=lfs -text
54
+ minimind-master/images/train_grpo_512.png filter=lfs diff=lfs merge=lfs -text
55
+ minimind-master/images/train_grpo_768.png filter=lfs diff=lfs merge=lfs -text
56
+ minimind-master/images/train_ppo_512.png filter=lfs diff=lfs merge=lfs -text
57
+ minimind-master/images/train_ppo_768.png filter=lfs diff=lfs merge=lfs -text
58
+ minimind-master/images/train_spo_768.png filter=lfs diff=lfs merge=lfs -text
minimind-master/.DS_Store ADDED
Binary file (6.15 kB). View file
 
minimind-master/.gitignore ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ model/__pycache__
2
+ out
3
+ website/
4
+ docs-minimind/
minimind-master/CODE_OF_CONDUCT.md ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Contributor Covenant Code of Conduct
2
+
3
+ ## Our Pledge
4
+
5
+ We as members, contributors, and leaders pledge to make participation in our
6
+ community a harassment-free experience for everyone, regardless of age, body
7
+ size, visible or invisible disability, ethnicity, sex characteristics, gender
8
+ identity and expression, level of experience, education, socio-economic status,
9
+ nationality, personal appearance, race, religion, or sexual identity
10
+ and orientation.
11
+
12
+ We pledge to act and interact in ways that contribute to an open, welcoming,
13
+ diverse, inclusive, and healthy community.
14
+
15
+ ## Our Standards
16
+
17
+ Examples of behavior that contributes to a positive environment for our
18
+ community include:
19
+
20
+ * Demonstrating empathy and kindness toward other people
21
+ * Being respectful of differing opinions, viewpoints, and experiences
22
+ * Giving and gracefully accepting constructive feedback
23
+ * Accepting responsibility and apologizing to those affected by our mistakes,
24
+ and learning from the experience
25
+ * Focusing on what is best not just for us as individuals, but for the
26
+ overall community
27
+
28
+ Examples of unacceptable behavior include:
29
+
30
+ * The use of sexualized language or imagery, and sexual attention or
31
+ advances of any kind
32
+ * Trolling, insulting or derogatory comments, and personal or political attacks
33
+ * Public or private harassment
34
+ * Publishing others' private information, such as a physical or email
35
+ address, without their explicit permission
36
+ * Other conduct which could reasonably be considered inappropriate in a
37
+ professional setting
38
+
39
+ ## Enforcement Responsibilities
40
+
41
+ Community leaders are responsible for clarifying and enforcing our standards of
42
+ acceptable behavior and will take appropriate and fair corrective action in
43
+ response to any behavior that they deem inappropriate, threatening, offensive,
44
+ or harmful.
45
+
46
+ Community leaders have the right and responsibility to remove, edit, or reject
47
+ comments, commits, code, wiki edits, issues, and other contributions that are
48
+ not aligned to this Code of Conduct, and will communicate reasons for moderation
49
+ decisions when appropriate.
50
+
51
+ ## Scope
52
+
53
+ This Code of Conduct applies within all community spaces, and also applies when
54
+ an individual is officially representing the community in public spaces.
55
+ Examples of representing our community include using an official e-mail address,
56
+ posting via an official social media account, or acting as an appointed
57
+ representative at an online or offline event.
58
+
59
+ ## Enforcement
60
+
61
+ Instances of abusive, harassing, or otherwise unacceptable behavior may be
62
+ reported to the community leaders responsible for enforcement at
63
+ [INSERT CONTACT METHOD].
64
+ All complaints will be reviewed and investigated promptly and fairly.
65
+
66
+ All community leaders are obligated to respect the privacy and security of the
67
+ reporter of any incident.
68
+
69
+ ## Enforcement Guidelines
70
+
71
+ Community leaders will follow these Community Impact Guidelines in determining
72
+ the consequences for any action they deem in violation of this Code of Conduct:
73
+
74
+ ### 1. Correction
75
+
76
+ **Community Impact**: Use of inappropriate language or other behavior deemed
77
+ unprofessional or unwelcome in the community.
78
+
79
+ **Consequence**: A private, written warning from community leaders, providing
80
+ clarity around the nature of the violation and an explanation of why the
81
+ behavior was inappropriate. A public apology may be requested.
82
+
83
+ ### 2. Warning
84
+
85
+ **Community Impact**: A violation through a single incident or series
86
+ of actions.
87
+
88
+ **Consequence**: A warning with consequences for continued behavior. No
89
+ interaction with the people involved, including unsolicited interaction with
90
+ those enforcing the Code of Conduct, for a specified period of time. This
91
+ includes avoiding interactions in community spaces as well as external channels
92
+ like social media. Violating these terms may lead to a temporary or
93
+ permanent ban.
94
+
95
+ ### 3. Temporary Ban
96
+
97
+ **Community Impact**: A serious violation of community standards, including
98
+ sustained inappropriate behavior.
99
+
100
+ **Consequence**: A temporary ban from any sort of interaction or public
101
+ communication with the community for a specified period of time. No public or
102
+ private interaction with the people involved, including unsolicited interaction
103
+ with those enforcing the Code of Conduct, is allowed during this period.
104
+ Violating these terms may lead to a permanent ban.
105
+
106
+ ### 4. Permanent Ban
107
+
108
+ **Community Impact**: Demonstrating a pattern of violation of community
109
+ standards, including sustained inappropriate behavior, harassment of an
110
+ individual, or aggression toward or disparagement of classes of individuals.
111
+
112
+ **Consequence**: A permanent ban from any sort of public interaction within
113
+ the community.
114
+
115
+ ## Attribution
116
+
117
+ This Code of Conduct is adapted from the [Contributor Covenant][homepage],
118
+ version 2.0, available at
119
+ https://www.contributor-covenant.org/version/2/0/code_of_conduct.html.
120
+
121
+ Community Impact Guidelines were inspired by [Mozilla's code of conduct
122
+ enforcement ladder](https://github.com/mozilla/diversity).
123
+
124
+ [homepage]: https://www.contributor-covenant.org
125
+
126
+ For answers to common questions about this code of conduct, see the FAQ at
127
+ https://www.contributor-covenant.org/faq. Translations are available at
128
+ https://www.contributor-covenant.org/translations.
minimind-master/LICENSE ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
minimind-master/README.md ADDED
The diff for this file is too large to render. See raw diff
 
minimind-master/README_en.md ADDED
The diff for this file is too large to render. See raw diff
 
minimind-master/dataset/__init__.py ADDED
File without changes
minimind-master/dataset/dataset.md ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ # MiniMind Datasets
2
+
3
+ 将所有下载的数据集文件放置到当前目录.
4
+
5
+ Place the downloaded dataset file in the current directory.
minimind-master/dataset/lm_dataset.py ADDED
@@ -0,0 +1,218 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from torch.utils.data import Dataset
2
+ import torch
3
+ import os
4
+ import random
5
+ from datasets import load_dataset
6
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
7
+
8
def pre_processing_chat(conversations, add_system_ratio=0.2):
    """Optionally prepend a random system prompt to a conversation.

    If the conversation does not already start with a 'system' turn, a
    system message (drawn uniformly from a fixed pool) is inserted with
    probability ``add_system_ratio``; otherwise the input is returned
    unchanged. The input list is never mutated.
    """
    system_prompt_pool = [
        "你是一个知识丰富的AI,尽力为用户提供准确的信息。",
        "你是minimind,一个小巧但有用的语言模型。",
        "你是一个专业的AI助手,请提供有价值的回答。",
        "你是minimind,请尽力帮助用户解决问题。",
        "你是一个可靠的AI,请给出准确的回答。",
        "You are a helpful AI assistant.",
        "You are minimind, a lightweight intelligent assistant.",
        "You are a friendly chatbot. Please answer the user's questions carefully.",
        "You are a knowledgeable AI. Try your best to provide accurate information.",
        "You are minimind, a small but useful language model."
    ]
    # Empty conversation, or one that already carries a system turn: leave as-is.
    if not conversations or conversations[0].get('role') == 'system':
        return conversations
    # Coin flip against the requested injection ratio.
    if random.random() >= add_system_ratio:
        return conversations
    system_turn = {'role': 'system', 'content': random.choice(system_prompt_pool)}
    return [system_turn] + conversations
25
+
26
def post_processing_chat(prompt_content, empty_think_ratio=0.05):
    """Strip empty ``<think>`` blocks from a rendered chat prompt, usually.

    With probability ``1 - empty_think_ratio`` all occurrences of the empty
    reasoning marker are removed, so a small fraction of samples keep it.
    """
    empty_think = '<think>\n\n</think>\n\n'
    if empty_think in prompt_content and random.random() > empty_think_ratio:
        return prompt_content.replace(empty_think, '')
    return prompt_content
30
+
31
class PretrainDataset(Dataset):
    """Next-token-prediction dataset over a JSON-lines file with a 'text' field.

    Each item is a ``(input_ids, labels)`` pair of fixed length
    ``max_length``: BOS + truncated text tokens + EOS, right-padded; padding
    positions are set to -100 in the labels so the loss ignores them.
    """

    def __init__(self, data_path, tokenizer, max_length=512):
        super().__init__()
        self.tokenizer = tokenizer
        self.max_length = max_length
        # JSON-lines corpus; every record is expected to carry a 'text' key.
        self.samples = load_dataset('json', data_files=data_path, split='train')

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, index):
        record = self.samples[index]
        tok = self.tokenizer
        # Reserve two slots for BOS/EOS, truncating the raw text tokens.
        body = tok(str(record['text']), add_special_tokens=False,
                   max_length=self.max_length - 2, truncation=True).input_ids
        seq = [tok.bos_token_id, *body, tok.eos_token_id]
        pad_count = self.max_length - len(seq)
        input_ids = torch.tensor(seq + [tok.pad_token_id] * pad_count, dtype=torch.long)
        labels = input_ids.clone()
        # Mask padding out of the loss.
        labels[input_ids == tok.pad_token_id] = -100
        return input_ids, labels
50
+
51
+
52
class SFTDataset(Dataset):
    """Supervised fine-tuning dataset.

    Conversations are rendered through the tokenizer's chat template; labels
    are -100 everywhere except inside assistant replies (delimited by the
    ``bos_id``/``eos_id`` token patterns), so only the assistant's tokens
    contribute to the loss.
    """

    def __init__(self, jsonl_path, tokenizer, max_length=1024):
        super().__init__()
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.samples = load_dataset('json', data_files=jsonl_path, split='train')
        # Token patterns delimiting an assistant turn in the rendered template.
        self.bos_id = tokenizer(f'{tokenizer.bos_token}assistant\n', add_special_tokens=False).input_ids
        self.eos_id = tokenizer(f'{tokenizer.eos_token}\n', add_special_tokens=False).input_ids

    def __len__(self):
        return len(self.samples)

    def create_chat_prompt(self, conversations):
        """Render *conversations* with the chat template, forwarding any
        function/tool schema declared on a leading system turn."""
        messages = conversations.copy()
        first = conversations[0] if conversations else None
        tools = None
        if first is not None and first["role"] == "system" and first.get("functions"):
            tools = first["functions"]
        return self.tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=False,
            tools=tools
        )

    def generate_labels(self, input_ids):
        """Copy assistant-reply tokens (including the closing EOS pattern)
        into the labels; every other position is -100."""
        labels = [-100] * len(input_ids)
        bos, eos = self.bos_id, self.eos_id
        pos = 0
        while pos < len(input_ids):
            if input_ids[pos:pos + len(bos)] != bos:
                pos += 1
                continue
            # Start of an assistant reply: scan forward for the EOS pattern.
            reply_start = pos + len(bos)
            reply_end = reply_start
            while reply_end < len(input_ids) and input_ids[reply_end:reply_end + len(eos)] != eos:
                reply_end += 1
            # Unmask the reply (and its EOS pattern), clipped to max_length.
            stop = min(reply_end + len(eos), self.max_length)
            labels[reply_start:stop] = input_ids[reply_start:stop]
            pos = reply_end + len(eos) if reply_end < len(input_ids) else len(input_ids)
        return labels

    def __getitem__(self, index):
        sample = self.samples[index]
        conversations = pre_processing_chat(sample['conversations'])
        rendered = post_processing_chat(self.create_chat_prompt(conversations))
        input_ids = self.tokenizer(rendered).input_ids[:self.max_length]
        input_ids += [self.tokenizer.pad_token_id] * (self.max_length - len(input_ids))
        labels = self.generate_labels(input_ids)
        return torch.tensor(input_ids, dtype=torch.long), torch.tensor(labels, dtype=torch.long)
106
+
107
+
108
class DPODataset(Dataset):
    """Preference-pair dataset for DPO training.

    Each sample holds a 'chosen' and a 'rejected' conversation (each a list
    of ``{role, content}`` turns). Both sides are rendered, tokenized to a
    fixed length, and returned as shifted input/target pairs together with
    loss masks that select only assistant-reply tokens.
    """

    def __init__(self, file_path, tokenizer, max_length=4096):
        super().__init__()
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Fallback pad id 0 when the tokenizer declares none.
        self.padding = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0
        # Token patterns delimiting an assistant turn in the rendered template.
        self.bos_id = tokenizer(f'{tokenizer.bos_token}assistant\n', add_special_tokens=False).input_ids
        self.eos_id = tokenizer(f'{tokenizer.eos_token}\n', add_special_tokens=False).input_ids
        self.samples = load_dataset('json', data_files=file_path, split='train')

    def __len__(self):
        return len(self.samples)

    def _encode_side(self, messages):
        """Render one conversation and tokenize it to a max_length id list."""
        rendered = self.tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=False
        )
        rendered = post_processing_chat(rendered)
        encoding = self.tokenizer(
            rendered, truncation=True, max_length=self.max_length, padding='max_length'
        )
        return encoding['input_ids']

    def __getitem__(self, index):
        sample = self.samples[index]
        chosen_ids = self._encode_side(sample['chosen'])
        rejected_ids = self._encode_side(sample['rejected'])
        chosen_mask = self.generate_loss_mask(chosen_ids)
        rejected_mask = self.generate_loss_mask(rejected_ids)
        # Shift by one position: x predicts y; masks align with the targets.
        return {
            'x_chosen': torch.tensor(chosen_ids[:-1], dtype=torch.long),
            'y_chosen': torch.tensor(chosen_ids[1:], dtype=torch.long),
            'mask_chosen': torch.tensor(chosen_mask[1:], dtype=torch.long),
            'x_rejected': torch.tensor(rejected_ids[:-1], dtype=torch.long),
            'y_rejected': torch.tensor(rejected_ids[1:], dtype=torch.long),
            'mask_rejected': torch.tensor(rejected_mask[1:], dtype=torch.long)
        }

    def generate_loss_mask(self, input_ids):
        """Return a 0/1 mask marking every token inside an assistant reply
        (including its closing EOS pattern), clipped to max_length."""
        mask = [0] * len(input_ids)
        bos, eos = self.bos_id, self.eos_id
        pos = 0
        while pos < len(input_ids):
            if input_ids[pos:pos + len(bos)] != bos:
                pos += 1
                continue
            reply_start = pos + len(bos)
            reply_end = reply_start
            while reply_end < len(input_ids) and input_ids[reply_end:reply_end + len(eos)] != eos:
                reply_end += 1
            for j in range(reply_start, min(reply_end + len(eos), self.max_length)):
                mask[j] = 1
            pos = reply_end + len(eos) if reply_end < len(input_ids) else len(input_ids)
        return mask
179
+
180
+
181
class RLAIFDataset(Dataset):
    """RL-from-AI-feedback dataset.

    Each item yields a generation prompt (all turns but the last, rendered
    with ``add_generation_prompt=True``) plus the reference answer, which is
    the content of the conversation's final turn.
    """

    def __init__(self, jsonl_path, tokenizer, max_length=1024):
        super().__init__()
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.samples = load_dataset('json', data_files=jsonl_path, split='train')
        # NOTE: unlike SFTDataset, these patterns carry no trailing newline.
        self.bos_id = tokenizer(f'{tokenizer.bos_token}assistant', add_special_tokens=False).input_ids
        self.eos_id = tokenizer(f'{tokenizer.eos_token}', add_special_tokens=False).input_ids

    def __len__(self):
        return len(self.samples)

    def create_chat_prompt(self, conversations):
        """Build the generation prompt and pull out the reference answer.

        Roles are assigned by position (even turns = user, odd = assistant),
        ignoring any role field stored on the turns themselves.
        """
        messages = [
            {"role": 'user' if idx % 2 == 0 else 'assistant', "content": turn['content']}
            for idx, turn in enumerate(conversations)
        ]
        answer = messages[-1]['content'] if messages else ''
        # add_generation_prompt=True: the model is expected to continue here.
        prompt = self.tokenizer.apply_chat_template(
            messages[:-1],
            tokenize=False,
            add_generation_prompt=True
        )
        return post_processing_chat(prompt), answer

    def __getitem__(self, index):
        sample = self.samples[index]
        prompt, answer = self.create_chat_prompt(sample['conversations'])
        return {
            'prompt': prompt,
            'answer': answer
        }
216
+
217
# Script entry point: no CLI behavior; this module only exports dataset classes.
if __name__ == "__main__":
    pass
minimind-master/dataset/sft_mini_512.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cf3ddf2329b3fdec5e79a7444fb44923aa7e007f161538f0c3f3ab6515a4d93e
3
+ size 226717278
minimind-master/eval_llm.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ import argparse
3
+ import random
4
+ import warnings
5
+ import torch
6
+ from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer
7
+ from model.model_minimind import MiniMindConfig, MiniMindForCausalLM
8
+ from model.model_lora import *
9
+ from trainer.trainer_utils import setup_seed, get_model_params
10
+ warnings.filterwarnings('ignore')
11
+
12
def init_model(args):
    """Load tokenizer + model.

    If args.load_from contains the substring 'model', native MiniMind .pth
    weights are loaded (optionally with a LoRA adapter); otherwise the path
    is treated as a transformers checkpoint.
    """
    tokenizer = AutoTokenizer.from_pretrained(args.load_from)
    if 'model' in args.load_from:
        config = MiniMindConfig(
            hidden_size=args.hidden_size,
            num_hidden_layers=args.num_hidden_layers,
            use_moe=bool(args.use_moe),
            inference_rope_scaling=args.inference_rope_scaling
        )
        model = MiniMindForCausalLM(config)
        suffix = '_moe' if args.use_moe else ''
        ckpt_path = f'./{args.save_dir}/{args.weight}_{args.hidden_size}{suffix}.pth'
        model.load_state_dict(torch.load(ckpt_path, map_location=args.device), strict=True)
        if args.lora_weight != 'None':
            apply_lora(model)
            load_lora(model, f'./{args.save_dir}/lora/{args.lora_weight}_{args.hidden_size}.pth')
    else:
        model = AutoModelForCausalLM.from_pretrained(args.load_from, trust_remote_code=True)
    get_model_params(model, model.config)
    return model.eval().to(args.device), tokenizer
31
+
32
def main():
    """Interactive / scripted chat loop for MiniMind: parse CLI args, load the
    model, then stream generations for fixed test prompts or stdin input."""
    parser = argparse.ArgumentParser(description="MiniMind模型推理与对话")
    parser.add_argument('--load_from', default='model', type=str, help="模型加载路径(model=原生torch权重,其他路径=transformers格式)")
    parser.add_argument('--save_dir', default='out', type=str, help="模型权重目录")
    parser.add_argument('--weight', default='full_sft', type=str, help="权重名称前缀(pretrain, full_sft, rlhf, reason, ppo_actor, grpo, spo)")
    parser.add_argument('--lora_weight', default='None', type=str, help="LoRA权重名称(None表示不使用,可选:lora_identity, lora_medical)")
    parser.add_argument('--hidden_size', default=512, type=int, help="隐藏层维度(512=Small-26M, 640=MoE-145M, 768=Base-104M)")
    parser.add_argument('--num_hidden_layers', default=8, type=int, help="隐藏层数量(Small/MoE=8, Base=16)")
    parser.add_argument('--use_moe', default=0, type=int, choices=[0, 1], help="是否使用MoE架构(0=否,1=是)")
    parser.add_argument('--inference_rope_scaling', default=False, action='store_true', help="启用RoPE位置编码外推(4倍,仅解决位置编码问题)")
    parser.add_argument('--max_new_tokens', default=8192, type=int, help="最大生成长度(注意:并非模型实际长文本能力)")
    parser.add_argument('--temperature', default=0.85, type=float, help="生成温度,控制随机性(0-1,越大越随机)")
    parser.add_argument('--top_p', default=0.85, type=float, help="nucleus采样阈值(0-1)")
    parser.add_argument('--historys', default=0, type=int, help="携带历史对话轮数(需为偶数,0表示不携带历史)")
    parser.add_argument('--show_speed', default=1, type=int, help="显示decode速度(tokens/s)")
    parser.add_argument('--device', default='cuda' if torch.cuda.is_available() else 'cpu', type=str, help="运行设备")
    args = parser.parse_args()

    # Fixed prompts used in automatic test mode (input_mode == 0).
    prompts = [
        '你有什么特长?',
        '为什么天空是蓝色的',
        '请用Python写一个计算斐波那契数列的函数',
        '解释一下"光合作用"的基本过程',
        '如果明天下雨,我应该如何出门',
        '比较一下猫和狗作为宠物的优缺点',
        '解释什么是机器学习',
        '推荐一些中国的美食'
    ]

    conversation = []
    model, tokenizer = init_model(args)
    input_mode = int(input('[0] 自动测试\n[1] 手动输入\n'))
    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    # Iterate the fixed prompts, or read stdin until an empty line is entered.
    prompt_iter = prompts if input_mode == 0 else iter(lambda: input('💬: '), '')
    for prompt in prompt_iter:
        setup_seed(2026)  # or setup_seed(random.randint(0, 2048))
        if input_mode == 0: print(f'💬: {prompt}')
        # Keep only the last `historys` turns of context (0 = stateless chat).
        conversation = conversation[-args.historys:] if args.historys else []
        conversation.append({"role": "user", "content": prompt})

        templates = {"conversation": conversation, "tokenize": False, "add_generation_prompt": True}
        if args.weight == 'reason': templates["enable_thinking"] = True  # only used by the Reason model
        # Pretrain weights have no chat template: feed BOS + raw prompt instead.
        inputs = tokenizer.apply_chat_template(**templates) if args.weight != 'pretrain' else (tokenizer.bos_token + prompt)
        inputs = tokenizer(inputs, return_tensors="pt", truncation=True).to(args.device)

        print('🤖: ', end='')
        st = time.time()
        generated_ids = model.generate(
            inputs=inputs["input_ids"], attention_mask=inputs["attention_mask"],
            max_new_tokens=args.max_new_tokens, do_sample=True, streamer=streamer,
            pad_token_id=tokenizer.pad_token_id, eos_token_id=tokenizer.eos_token_id,
            top_p=args.top_p, temperature=args.temperature, repetition_penalty=1.0
        )
        # Decode only the newly generated portion (skip the prompt tokens).
        response = tokenizer.decode(generated_ids[0][len(inputs["input_ids"][0]):], skip_special_tokens=True)
        conversation.append({"role": "assistant", "content": response})
        gen_tokens = len(generated_ids[0]) - len(inputs["input_ids"][0])
        print(f'\n[Speed]: {gen_tokens / (time.time() - st):.2f} tokens/s\n\n') if args.show_speed else print('\n\n')
90
+
91
+ if __name__ == "__main__":
92
+ main()
minimind-master/images/1-wiki.png ADDED

Git LFS Details

  • SHA256: 4cc25bf63913b5d7e1dfd67da3a3818eb2c2d614fa060a09cdb6918a07f29883
  • Pointer size: 131 Bytes
  • Size of remote file: 139 kB
minimind-master/images/2-wiki.png ADDED
minimind-master/images/3-wiki.png ADDED

Git LFS Details

  • SHA256: 049da16ef3d962b598d2a16aaae0e41cc9be992d404cd2d79c69a215fe7f903c
  • Pointer size: 131 Bytes
  • Size of remote file: 235 kB
minimind-master/images/4-wiki.png ADDED

Git LFS Details

  • SHA256: 02be48be19f24d1028e9a776906585524075c925bdc78cf02f8a0c6d6cef3cee
  • Pointer size: 131 Bytes
  • Size of remote file: 107 kB
minimind-master/images/5-wiki.png ADDED

Git LFS Details

  • SHA256: 280dc978404ed8dbdc3d6e5a3dd3033460d117f3e464dd696005194edffe53a9
  • Pointer size: 131 Bytes
  • Size of remote file: 245 kB
minimind-master/images/LLM-structure-moe.png ADDED

Git LFS Details

  • SHA256: 469f0fd91e0e6864d2f73b3fb8e2ad8ef2840030f780d0b0e3a474422ad81d55
  • Pointer size: 131 Bytes
  • Size of remote file: 124 kB
minimind-master/images/LLM-structure.png ADDED

Git LFS Details

  • SHA256: a909fe278f195db69f24a1e06f6ca6bf80588b1d4c4f90266fa9f5314e6a3c2e
  • Pointer size: 131 Bytes
  • Size of remote file: 380 kB
minimind-master/images/and_huggingface.png ADDED

Git LFS Details

  • SHA256: 29b2b47a7d8f1ecac4ea1949bc6047408a64e88cf8eb3b8d988e41e5ff111a5b
  • Pointer size: 131 Bytes
  • Size of remote file: 182 kB
minimind-master/images/and_modelscope.png ADDED

Git LFS Details

  • SHA256: ef021a8aff9f2db44a23be35d06a16a1a2c99a672f8e912de35baf5b49989cf7
  • Pointer size: 131 Bytes
  • Size of remote file: 154 kB
minimind-master/images/compare_radar.png ADDED

Git LFS Details

  • SHA256: 0600236c6ea91a3ce41183940cd077177f09fd78eca6380bcbfa07611cbc0510
  • Pointer size: 131 Bytes
  • Size of remote file: 563 kB
minimind-master/images/dataset.jpg ADDED

Git LFS Details

  • SHA256: 2a11afbad089f7ea5f62dec5c429e6a254eb443fb20b3b07789528715c533dff
  • Pointer size: 131 Bytes
  • Size of remote file: 149 kB
minimind-master/images/gpt3_config.png ADDED
minimind-master/images/logo.png ADDED

Git LFS Details

  • SHA256: f7f2a414ac9d3e79a239c832fbd731fe0ab2e1e285dc9ce3516f2b77315a9316
  • Pointer size: 131 Bytes
  • Size of remote file: 507 kB
minimind-master/images/logo2.png ADDED

Git LFS Details

  • SHA256: 768882e94fd7c9f75edc288f08f4fafceadcb9640dc8df44bd532bc6877a6a60
  • Pointer size: 131 Bytes
  • Size of remote file: 630 kB
minimind-master/images/minimind2.gif ADDED

Git LFS Details

  • SHA256: cf7feeafd822eee6ed3c91f646fb436c4003cb69d8939dc14f34caf1412dae5b
  • Pointer size: 132 Bytes
  • Size of remote file: 3.98 MB
minimind-master/images/pre_512_loss.png ADDED

Git LFS Details

  • SHA256: 7ddf3a9de9c3c20a40e91bc964617bbf03d90a62fc293fe5ae961bc15ad53b53
  • Pointer size: 131 Bytes
  • Size of remote file: 573 kB
minimind-master/images/pre_768_loss.png ADDED

Git LFS Details

  • SHA256: 746988cfdc36a2a8af65d43cf4e753b9397e1a4705d9d9583f7a2a65e5940633
  • Pointer size: 131 Bytes
  • Size of remote file: 544 kB
minimind-master/images/rope_ppl.png ADDED
minimind-master/images/sft_512_loss.png ADDED

Git LFS Details

  • SHA256: 774b132997e5560fe58897ad6467a79900bcaa4166848157e797469f37d8e35d
  • Pointer size: 132 Bytes
  • Size of remote file: 1.03 MB
minimind-master/images/sft_768_loss.png ADDED

Git LFS Details

  • SHA256: cf8ff28c49773e5f3529583a7408ac27abc71b699a2b302d59be528e90b4dd51
  • Pointer size: 131 Bytes
  • Size of remote file: 966 kB
minimind-master/images/train_grpo_512.png ADDED

Git LFS Details

  • SHA256: da13111bad0cbbf06a10b78a517ed2c0c3f37c6a91c3eefa93a747e390b93f9f
  • Pointer size: 131 Bytes
  • Size of remote file: 220 kB
minimind-master/images/train_grpo_768.png ADDED

Git LFS Details

  • SHA256: 08a156b2d353ad8ccd8020d466e1591831b9cc312cd598614488bd87d8c3bf58
  • Pointer size: 131 Bytes
  • Size of remote file: 252 kB
minimind-master/images/train_ppo_512.png ADDED

Git LFS Details

  • SHA256: e578a16f1c0dd0d41e3f71be9a38fb07657cc9c1c910afedde20c4b99c9cda2a
  • Pointer size: 131 Bytes
  • Size of remote file: 252 kB
minimind-master/images/train_ppo_768.png ADDED

Git LFS Details

  • SHA256: d4cb7747eeecda74990279901367b85c8341fd07f20ce96a529615d2af52538a
  • Pointer size: 131 Bytes
  • Size of remote file: 247 kB
minimind-master/images/train_spo_768.png ADDED

Git LFS Details

  • SHA256: 7ea16f9f0633e491fddffd4cfa4739c73235d67ca02746c7dfdf4f5d240f3901
  • Pointer size: 131 Bytes
  • Size of remote file: 239 kB
minimind-master/model/__init__.py ADDED
File without changes
minimind-master/model/model_lora.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch import optim, nn
3
+
4
+
5
+ # 定义Lora网络结构
6
class LoRA(nn.Module):
    """Low-rank adapter: y = B(A(x)).

    A is Gaussian-initialized (std 0.02) and B is zero-initialized, so the
    adapter contributes exactly zero before training.
    """

    def __init__(self, in_features, out_features, rank):
        super().__init__()
        self.rank = rank  # rank of the low-rank decomposition
        self.A = nn.Linear(in_features, rank, bias=False)
        self.B = nn.Linear(rank, out_features, bias=False)
        nn.init.normal_(self.A.weight, mean=0.0, std=0.02)
        nn.init.zeros_(self.B.weight)

    def forward(self, x):
        return self.B(self.A(x))
19
+
20
+
21
def apply_lora(model, rank=8):
    """Attach a LoRA adapter to every square nn.Linear in *model* and patch
    its forward so the adapter output is added to the base output."""
    for _, module in model.named_modules():
        if not isinstance(module, nn.Linear):
            continue
        out_features, in_features = module.weight.shape
        if out_features != in_features:
            continue
        lora = LoRA(in_features, out_features, rank=rank).to(model.device)
        setattr(module, "lora", lora)
        original_forward = module.forward

        # Bind the current forward and adapter explicitly via default arguments
        # so each patched module keeps its own pair.
        def forward_with_lora(x, base=original_forward, adapter=lora):
            return base(x) + adapter(x)

        module.forward = forward_with_lora
33
+
34
+
35
def load_lora(model, path):
    """Load LoRA adapter weights from *path* into the adapters previously
    attached by apply_lora."""
    state_dict = torch.load(path, map_location=model.device)
    # Strip a possible DistributedDataParallel 'module.' prefix.
    state_dict = {(key[7:] if key.startswith('module.') else key): value
                  for key, value in state_dict.items()}

    for name, module in model.named_modules():
        if not hasattr(module, 'lora'):
            continue
        prefix = f'{name}.lora.'
        lora_state = {key.replace(prefix, ''): value
                      for key, value in state_dict.items() if prefix in key}
        module.lora.load_state_dict(lora_state)
43
+
44
+
45
def save_lora(model, path):
    """Collect all LoRA adapter weights from *model* and save them as a
    single state dict at *path*."""
    base_model = getattr(model, '_orig_mod', model)  # unwrap torch.compile if present
    collected = {}
    for name, module in base_model.named_modules():
        if not hasattr(module, 'lora'):
            continue
        clean_name = name[7:] if name.startswith("module.") else name
        for key, value in module.lora.state_dict().items():
            collected[f'{clean_name}.lora.{key}'] = value
    torch.save(collected, path)
minimind-master/model/model_minimind.py ADDED
@@ -0,0 +1,463 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘
2
+ # MiniMind Config
3
+ # 📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘
4
+
5
+ from transformers import PretrainedConfig
6
+
7
+
8
class MiniMindConfig(PretrainedConfig):
    """Configuration for MiniMind models (dense and MoE variants)."""

    model_type = "minimind"

    def __init__(
            self,
            dropout: float = 0.0,
            bos_token_id: int = 1,
            eos_token_id: int = 2,
            hidden_act: str = 'silu',
            hidden_size: int = 512,
            intermediate_size: int = None,
            max_position_embeddings: int = 32768,
            num_attention_heads: int = 8,
            num_hidden_layers: int = 8,
            num_key_value_heads: int = 2,
            vocab_size: int = 6400,
            rms_norm_eps: float = 1e-05,
            rope_theta: float = 1000000.0,
            inference_rope_scaling: bool = False,
            flash_attn: bool = True,
            ####################################################
            # Here are the specific configurations of MOE
            # When use_moe is false, the following is invalid
            ####################################################
            use_moe: bool = False,
            num_experts_per_tok: int = 2,
            n_routed_experts: int = 4,
            n_shared_experts: int = 1,
            scoring_func: str = 'softmax',
            aux_loss_alpha: float = 0.01,
            seq_aux: bool = True,
            norm_topk_prob: bool = True,
            **kwargs
    ):
        super().__init__(**kwargs)
        self.dropout = dropout
        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id
        self.hidden_act = hidden_act
        self.hidden_size = hidden_size
        # None: FeedForward derives a default (~8/3 * hidden, rounded to 64) and writes it back.
        self.intermediate_size = intermediate_size
        self.max_position_embeddings = max_position_embeddings
        self.num_attention_heads = num_attention_heads
        self.num_hidden_layers = num_hidden_layers
        self.num_key_value_heads = num_key_value_heads
        self.vocab_size = vocab_size
        self.rms_norm_eps = rms_norm_eps
        self.rope_theta = rope_theta
        self.inference_rope_scaling = inference_rope_scaling
        # Extrapolated length = factor * original_max_position_embeddings = 32768
        self.rope_scaling = {
            "beta_fast": 32,
            "beta_slow": 1,
            "factor": 16,
            "original_max_position_embeddings": 2048,
            "attention_factor": 1.0,
            "type": "yarn"
        } if self.inference_rope_scaling else None
        self.flash_attn = flash_attn
        ####################################################
        # Here are the specific configurations of MOE
        # When use_moe is false, the following is invalid
        ####################################################
        self.use_moe = use_moe
        self.num_experts_per_tok = num_experts_per_tok  # number of experts selected per token
        self.n_routed_experts = n_routed_experts  # total number of routed experts
        self.n_shared_experts = n_shared_experts  # always-active shared experts
        self.scoring_func = scoring_func  # gating score function, 'softmax' by default
        self.aux_loss_alpha = aux_loss_alpha  # weight of the load-balancing auxiliary loss
        self.seq_aux = seq_aux  # whether the auxiliary loss is computed at sequence level
        self.norm_topk_prob = norm_topk_prob  # whether to renormalize the top-k probabilities
79
+
80
+
81
+ # 📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘
82
+ # MiniMind Model
83
+ # 📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘📘
84
+
85
+ import math
86
+ import torch
87
+ import torch.nn.init as init
88
+ import torch.nn.functional as F
89
+ from torch import nn
90
+ from transformers.activations import ACT2FN
91
+ from typing import Optional, Tuple, List, Union
92
+ from transformers import PreTrainedModel, GenerationMixin, PretrainedConfig
93
+ from transformers.modeling_outputs import CausalLMOutputWithPast
94
+
95
+
96
class RMSNorm(torch.nn.Module):
    """Root-mean-square layer norm (no mean subtraction); normalization is
    computed in float32 and the result cast back to the input dtype."""

    def __init__(self, dim: int, eps: float = 1e-5):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def _norm(self, x):
        # 1 / RMS over the last dimension, with eps for numerical stability.
        inv_rms = torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + self.eps)
        return x * inv_rms

    def forward(self, x):
        normalized = self._norm(x.float()).type_as(x)
        return self.weight * normalized
107
+
108
+
109
def precompute_freqs_cis(dim: int, end: int = int(32 * 1024), rope_base: float = 1e6,
                         rope_scaling: Optional[dict] = None):
    """Precompute RoPE cos/sin tables of shape (end, dim).

    With `rope_scaling` (YaRN-style dict), per-dimension frequencies are
    interpolated between unscaled and 1/factor via a linear ramp, and the
    tables are multiplied by `attention_factor`.
    """
    freqs, attn_factor = 1.0 / (rope_base ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim)), 1.0
    if rope_scaling is not None:
        orig_max, factor, beta_fast, beta_slow, attn_factor = (
            rope_scaling.get("original_max_position_embeddings", 2048), rope_scaling.get("factor", 16),
            rope_scaling.get("beta_fast", 32.0), rope_scaling.get("beta_slow", 1.0), rope_scaling.get("attention_factor", 1.0)
        )
        # Only rescale when the requested length actually exceeds the trained length.
        if end / orig_max > 1.0:
            # YaRN: f'(i) = f(i)((1-γ) + γ/s), where γ∈[0,1] is linear ramp
            inv_dim = lambda b: (dim * math.log(orig_max / (b * 2 * math.pi))) / (2 * math.log(rope_base))
            low, high = max(math.floor(inv_dim(beta_fast)), 0), min(math.ceil(inv_dim(beta_slow)), dim // 2 - 1)
            ramp = torch.clamp((torch.arange(dim // 2, device=freqs.device).float() - low) / max(high - low, 0.001), 0, 1)
            freqs = freqs * (1 - ramp + ramp / factor)

    t = torch.arange(end, device=freqs.device)
    freqs = torch.outer(t, freqs).float()
    # Duplicate along the last dim to cover the full head dimension (rotate-half layout).
    freqs_cos = torch.cat([torch.cos(freqs), torch.cos(freqs)], dim=-1) * attn_factor
    freqs_sin = torch.cat([torch.sin(freqs), torch.sin(freqs)], dim=-1) * attn_factor
    return freqs_cos, freqs_sin
129
+
130
+
131
def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    """Apply rotary position embeddings to q and k using precomputed cos/sin
    tables; `unsqueeze_dim` inserts the broadcast axis for the head dim."""
    def rotate_half(t):
        half = t.shape[-1] // 2
        return torch.cat((-t[..., half:], t[..., :half]), dim=-1)

    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)
    q_embed = q * cos + rotate_half(q) * sin
    k_embed = k * cos + rotate_half(k) * sin
    return q_embed, k_embed
138
+
139
+
140
def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor:
    """Equivalent of torch.repeat_interleave(x, dim=2, repeats=n_rep) for a
    (bs, slen, num_key_value_heads, head_dim) tensor, done via expand+reshape."""
    if n_rep == 1:
        return x
    bs, slen, num_key_value_heads, head_dim = x.shape
    expanded = x[:, :, :, None, :].expand(bs, slen, num_key_value_heads, n_rep, head_dim)
    return expanded.reshape(bs, slen, num_key_value_heads * n_rep, head_dim)
148
+
149
+
150
class Attention(nn.Module):
    """Grouped-query self-attention with RoPE, optional KV cache, and an
    SDPA fast path when available."""

    def __init__(self, args: MiniMindConfig):
        super().__init__()
        # GQA: fewer KV heads than query heads; each KV head serves n_rep query heads.
        self.num_key_value_heads = args.num_attention_heads if args.num_key_value_heads is None else args.num_key_value_heads
        assert args.num_attention_heads % self.num_key_value_heads == 0
        self.n_local_heads = args.num_attention_heads
        self.n_local_kv_heads = self.num_key_value_heads
        self.n_rep = self.n_local_heads // self.n_local_kv_heads
        self.head_dim = args.hidden_size // args.num_attention_heads
        self.q_proj = nn.Linear(args.hidden_size, args.num_attention_heads * self.head_dim, bias=False)
        self.k_proj = nn.Linear(args.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
        self.v_proj = nn.Linear(args.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
        self.o_proj = nn.Linear(args.num_attention_heads * self.head_dim, args.hidden_size, bias=False)
        self.attn_dropout = nn.Dropout(args.dropout)
        self.resid_dropout = nn.Dropout(args.dropout)
        self.dropout = args.dropout
        self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention') and args.flash_attn
        # print("WARNING: using slow attention. Flash Attention requires PyTorch >= 2.0")

    def forward(self,
                x: torch.Tensor,
                position_embeddings: Tuple[torch.Tensor, torch.Tensor],  # precomputed (cos, sin) tables
                past_key_value: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
                use_cache=False,
                attention_mask: Optional[torch.Tensor] = None):
        """Return (output, past_kv); past_kv is (k, v) when use_cache else None."""
        bsz, seq_len, _ = x.shape
        xq, xk, xv = self.q_proj(x), self.k_proj(x), self.v_proj(x)
        xq = xq.view(bsz, seq_len, self.n_local_heads, self.head_dim)
        xk = xk.view(bsz, seq_len, self.n_local_kv_heads, self.head_dim)
        xv = xv.view(bsz, seq_len, self.n_local_kv_heads, self.head_dim)

        cos, sin = position_embeddings
        xq, xk = apply_rotary_pos_emb(xq, xk, cos, sin)

        # KV cache: prepend cached keys/values along the sequence dimension.
        if past_key_value is not None:
            xk = torch.cat([past_key_value[0], xk], dim=1)
            xv = torch.cat([past_key_value[1], xv], dim=1)
        past_kv = (xk, xv) if use_cache else None

        # Duplicate KV heads to match the query head count, then move heads to dim 1.
        xq, xk, xv = (
            xq.transpose(1, 2),
            repeat_kv(xk, self.n_rep).transpose(1, 2),
            repeat_kv(xv, self.n_rep).transpose(1, 2)
        )

        # SDPA fast path: prefill only (no cache), no padding mask.
        if self.flash and (seq_len > 1) and (past_key_value is None) and (attention_mask is None or torch.all(attention_mask == 1)):
            output = F.scaled_dot_product_attention(xq, xk, xv, dropout_p=self.dropout if self.training else 0.0, is_causal=True)
        else:
            scores = (xq @ xk.transpose(-2, -1)) / math.sqrt(self.head_dim)
            # Causal mask applied only to the last seq_len key positions (cached keys stay visible).
            scores[:, :, :, -seq_len:] += torch.triu(torch.full((seq_len, seq_len), float("-inf"), device=scores.device), diagonal=1)

            if attention_mask is not None:
                # Padding mask: 0 -> large negative bias before softmax.
                extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
                extended_attention_mask = (1.0 - extended_attention_mask) * -1e9
                scores = scores + extended_attention_mask

            scores = F.softmax(scores.float(), dim=-1).type_as(xq)
            scores = self.attn_dropout(scores)
            output = scores @ xv

        output = output.transpose(1, 2).reshape(bsz, seq_len, -1)
        output = self.resid_dropout(self.o_proj(output))
        return output, past_kv
214
+
215
+
216
class FeedForward(nn.Module):
    """Gated feed-forward network: down(act(gate(x)) * up(x)) with dropout."""

    def __init__(self, config: MiniMindConfig):
        super().__init__()
        if config.intermediate_size is None:
            # Default to ~8/3 * hidden, rounded up to a multiple of 64;
            # the derived value is written back into the shared config.
            raw_size = int(config.hidden_size * 8 / 3)
            config.intermediate_size = 64 * ((raw_size + 64 - 1) // 64)
        self.gate_proj = nn.Linear(config.hidden_size, config.intermediate_size, bias=False)
        self.down_proj = nn.Linear(config.intermediate_size, config.hidden_size, bias=False)
        self.up_proj = nn.Linear(config.hidden_size, config.intermediate_size, bias=False)
        self.dropout = nn.Dropout(config.dropout)
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        gated = self.act_fn(self.gate_proj(x)) * self.up_proj(x)
        return self.dropout(self.down_proj(gated))
230
+
231
+
232
class MoEGate(nn.Module):
    """Top-k softmax gating for MoE with an optional load-balancing auxiliary loss."""

    def __init__(self, config: MiniMindConfig):
        super().__init__()
        self.config = config
        self.top_k = config.num_experts_per_tok
        self.n_routed_experts = config.n_routed_experts

        self.scoring_func = config.scoring_func
        self.alpha = config.aux_loss_alpha  # weight of the auxiliary loss (0 disables it)
        self.seq_aux = config.seq_aux  # compute the aux loss per sequence instead of over the whole batch

        self.norm_topk_prob = config.norm_topk_prob
        self.gating_dim = config.hidden_size
        self.weight = nn.Parameter(torch.empty((self.n_routed_experts, self.gating_dim)))
        self.reset_parameters()

    def reset_parameters(self) -> None:
        init.kaiming_uniform_(self.weight, a=math.sqrt(5))

    def forward(self, hidden_states):
        """Return (topk_idx, topk_weight, aux_loss) for tokens flattened to (bsz*seq_len, h)."""
        bsz, seq_len, h = hidden_states.shape
        hidden_states = hidden_states.view(-1, h)
        logits = F.linear(hidden_states, self.weight, None)
        if self.scoring_func == 'softmax':
            scores = logits.softmax(dim=-1)
        else:
            raise NotImplementedError(f'insupportable scoring function for MoE gating: {self.scoring_func}')

        topk_weight, topk_idx = torch.topk(scores, k=self.top_k, dim=-1, sorted=False)

        # Renormalize the selected weights so they sum to 1 per token.
        if self.top_k > 1 and self.norm_topk_prob:
            denominator = topk_weight.sum(dim=-1, keepdim=True) + 1e-20
            topk_weight = topk_weight / denominator

        # Auxiliary load-balancing loss (training only).
        if self.training and self.alpha > 0.0:
            scores_for_aux = scores
            aux_topk = self.top_k
            topk_idx_for_aux_loss = topk_idx.view(bsz, -1)
            if self.seq_aux:
                # Per-sequence balance: normalized routing counts vs. mean gate scores.
                scores_for_seq_aux = scores_for_aux.view(bsz, seq_len, -1)
                ce = torch.zeros(bsz, self.n_routed_experts, device=hidden_states.device)
                ce.scatter_add_(1, topk_idx_for_aux_loss,
                                torch.ones(bsz, seq_len * aux_topk, device=hidden_states.device)).div_(
                    seq_len * aux_topk / self.n_routed_experts)
                aux_loss = (ce * scores_for_seq_aux.mean(dim=1)).sum(dim=1).mean() * self.alpha
            else:
                # Batch-level balance: fraction routed to each expert times its mean gate probability.
                mask_ce = F.one_hot(topk_idx_for_aux_loss.view(-1), num_classes=self.n_routed_experts)
                ce = mask_ce.float().mean(0)
                Pi = scores_for_aux.mean(0)
                fi = ce * self.n_routed_experts
                aux_loss = (Pi * fi).sum() * self.alpha
        else:
            aux_loss = scores.new_zeros(1).squeeze()
        return topk_idx, topk_weight, aux_loss
286
+
287
+
288
class MOEFeedForward(nn.Module):
    """Mixture-of-experts feed-forward: routed experts selected by MoEGate,
    plus optional always-on shared experts added to every token."""

    def __init__(self, config: MiniMindConfig):
        super().__init__()
        self.config = config
        self.experts = nn.ModuleList([
            FeedForward(config)
            for _ in range(config.n_routed_experts)
        ])
        self.gate = MoEGate(config)
        if config.n_shared_experts > 0:
            self.shared_experts = nn.ModuleList([
                FeedForward(config)
                for _ in range(config.n_shared_experts)
            ])

    def forward(self, x):
        identity = x
        orig_shape = x.shape
        bsz, seq_len, _ = x.shape
        # Route each token to its top-k experts.
        topk_idx, topk_weight, aux_loss = self.gate(x)
        x = x.view(-1, x.shape[-1])
        flat_topk_idx = topk_idx.view(-1)
        if self.training:
            # Duplicate each token once per selected expert, then run every expert on its slice.
            x = x.repeat_interleave(self.config.num_experts_per_tok, dim=0)
            y = torch.empty_like(x, dtype=x.dtype)
            for i, expert in enumerate(self.experts):
                expert_out = expert(x[flat_topk_idx == i])
                if expert_out.shape[0] > 0: y[flat_topk_idx == i] = expert_out.to(y.dtype)
                # Zero-valued touch of the expert's params keeps an unused expert in the
                # autograd graph — presumably to satisfy DDP's all-params check; verify.
                else: y[flat_topk_idx == i] = expert_out.to(y.dtype) + 0 * sum(p.sum() for p in expert.parameters())
            y = (y.view(*topk_weight.shape, -1) * topk_weight.unsqueeze(-1)).sum(dim=1)
            y = y.view(*orig_shape)
        else:
            y = self.moe_infer(x, flat_topk_idx, topk_weight.view(-1, 1)).view(*orig_shape)
        if self.config.n_shared_experts > 0:
            for expert in self.shared_experts:
                y = y + expert(identity)
        self.aux_loss = aux_loss  # read later by MiniMindModel.forward
        return y

    @torch.no_grad()
    def moe_infer(self, x, flat_expert_indices, flat_expert_weights):
        """Inference routing: group tokens by expert, run each expert once,
        scatter the weighted outputs back into a zero-initialized cache."""
        expert_cache = torch.zeros_like(x)
        idxs = flat_expert_indices.argsort()
        tokens_per_expert = flat_expert_indices.bincount().cpu().numpy().cumsum(0)
        token_idxs = idxs // self.config.num_experts_per_tok
        # Example: tokens_per_expert = [6, 15, 20, 26] (cumulative counts; its length is the
        # number of experts, here 4) and token_idxs = [3, 7, 19, 21, 24, 25, 4, 5, 6, 10, 11, 12...]:
        # token_idxs[:6] -> [3, 7, 19, 21, 24, 25] are the 6 token positions handled by expert 0
        # (a token may appear under several experts, depending on num_experts_per_tok);
        # the next 9 positions token_idxs[6:15] -> [4, 5, 6, 10, 11, 12...] belong to expert 1, etc.
        for i, end_idx in enumerate(tokens_per_expert):
            start_idx = 0 if i == 0 else tokens_per_expert[i - 1]
            if start_idx == end_idx:
                continue
            expert = self.experts[i]
            exp_token_idx = token_idxs[start_idx:end_idx]
            expert_tokens = x[exp_token_idx]
            expert_out = expert(expert_tokens).to(expert_cache.dtype)
            expert_out.mul_(flat_expert_weights[idxs[start_idx:end_idx]])
            expert_cache.scatter_add_(0, exp_token_idx.view(-1, 1).repeat(1, x.shape[-1]), expert_out)

        return expert_cache
350
+
351
+
352
class MiniMindBlock(nn.Module):
    """One decoder block: pre-norm self-attention with residual, then a
    pre-norm MLP (dense FeedForward or MOEFeedForward) with residual."""

    def __init__(self, layer_id: int, config: MiniMindConfig):
        super().__init__()
        self.num_attention_heads = config.num_attention_heads
        self.hidden_size = config.hidden_size
        self.head_dim = config.hidden_size // config.num_attention_heads
        self.self_attn = Attention(config)

        self.layer_id = layer_id
        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.mlp = MOEFeedForward(config) if config.use_moe else FeedForward(config)

    def forward(self, hidden_states, position_embeddings, past_key_value=None, use_cache=False, attention_mask=None):
        attn_out, present_key_value = self.self_attn(
            self.input_layernorm(hidden_states), position_embeddings,
            past_key_value, use_cache, attention_mask
        )
        hidden_states = hidden_states + attn_out
        hidden_states = hidden_states + self.mlp(self.post_attention_layernorm(hidden_states))
        return hidden_states, present_key_value
374
+
375
+
376
class MiniMindModel(nn.Module):
    """Decoder-only transformer backbone: token embedding, stacked
    MiniMindBlocks, final RMSNorm; returns hidden states + per-layer KV cache."""

    def __init__(self, config: MiniMindConfig):
        super().__init__()
        self.config = config
        self.vocab_size, self.num_hidden_layers = config.vocab_size, config.num_hidden_layers
        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)
        self.dropout = nn.Dropout(config.dropout)
        self.layers = nn.ModuleList([MiniMindBlock(l, config) for l in range(self.num_hidden_layers)])
        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

        # Precomputed RoPE tables; non-persistent so they are rebuilt from config, not loaded.
        freqs_cos, freqs_sin = precompute_freqs_cis(dim=config.hidden_size // config.num_attention_heads,
                                                    end=config.max_position_embeddings, rope_base=config.rope_theta,
                                                    rope_scaling=config.rope_scaling)
        self.register_buffer("freqs_cos", freqs_cos, persistent=False)
        self.register_buffer("freqs_sin", freqs_sin, persistent=False)

    def forward(self,
                input_ids: Optional[torch.Tensor] = None,
                attention_mask: Optional[torch.Tensor] = None,
                past_key_values: Optional[List[Tuple[torch.Tensor, torch.Tensor]]] = None,
                use_cache: bool = False,
                **kwargs):
        """Return (hidden_states, presents, aux_loss); presents is the per-layer KV cache."""
        batch_size, seq_length = input_ids.shape
        # Discard transformers' Cache objects; this model uses plain per-layer (k, v) tuples.
        if hasattr(past_key_values, 'layers'): past_key_values = None
        past_key_values = past_key_values or [None] * len(self.layers)
        # Offset into the RoPE tables = number of positions already in the cache.
        start_pos = past_key_values[0][0].shape[1] if past_key_values[0] is not None else 0

        hidden_states = self.dropout(self.embed_tokens(input_ids))

        position_embeddings = (
            self.freqs_cos[start_pos:start_pos + seq_length],
            self.freqs_sin[start_pos:start_pos + seq_length]
        )

        presents = []
        for layer_idx, (layer, past_key_value) in enumerate(zip(self.layers, past_key_values)):
            hidden_states, present = layer(
                hidden_states,
                position_embeddings,
                past_key_value=past_key_value,
                use_cache=use_cache,
                attention_mask=attention_mask
            )
            presents.append(present)

        hidden_states = self.norm(hidden_states)

        # Sum MoE auxiliary losses over all MoE layers (zero tensor when none exist).
        aux_loss = sum([l.mlp.aux_loss for l in self.layers if isinstance(l.mlp, MOEFeedForward)], hidden_states.new_zeros(1).squeeze())
        return hidden_states, presents, aux_loss
425
+
426
+
427
class MiniMindForCausalLM(PreTrainedModel, GenerationMixin):
    """Causal-LM head over MiniMindModel; the output projection is weight-tied
    to the token embedding."""

    config_class = MiniMindConfig

    def __init__(self, config: MiniMindConfig = None):
        self.config = config or MiniMindConfig()
        super().__init__(self.config)
        self.model = MiniMindModel(self.config)
        self.lm_head = nn.Linear(self.config.hidden_size, self.config.vocab_size, bias=False)
        # Weight tying: embedding and lm_head share the same parameter tensor.
        self.model.embed_tokens.weight = self.lm_head.weight

    def forward(self,
                input_ids: Optional[torch.Tensor] = None,
                attention_mask: Optional[torch.Tensor] = None,
                labels: Optional[torch.Tensor] = None,
                past_key_values: Optional[List[Tuple[torch.Tensor, torch.Tensor]]] = None,
                use_cache: bool = False,
                logits_to_keep: Union[int, torch.Tensor] = 0,
                **args):
        """Causal-LM forward; when labels are given, loss is shift-by-one
        cross entropy with ignore_index=-100."""
        hidden_states, past_key_values, aux_loss = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            **args
        )
        # logits_to_keep == 0 keeps all positions (slice(0, None)); a positive int keeps only the last N.
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        loss = None
        if labels is not None:
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            loss = F.cross_entropy(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1), ignore_index=-100)

        output = CausalLMOutputWithPast(loss=loss, logits=logits, past_key_values=past_key_values, hidden_states=hidden_states)
        # NOTE: aux_loss is attached to the output but not added into `loss`; the trainer must add it.
        output.aux_loss = aux_loss
        return output
minimind-master/model/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
minimind-master/model/tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": false,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<|endoftext|>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<|im_start|>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "<|im_end|>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "additional_special_tokens": [],
32
+ "bos_token": "<|im_start|>",
33
+ "clean_up_tokenization_spaces": false,
34
+ "eos_token": "<|im_end|>",
35
+ "legacy": true,
36
+ "model_max_length": 32768,
37
+ "pad_token": "<|endoftext|>",
38
+ "sp_model_kwargs": {},
39
+ "spaces_between_special_tokens": false,
40
+ "tokenizer_class": "PreTrainedTokenizerFast",
41
+ "unk_token": "<|endoftext|>",
42
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].role == 'system' %}\n {{- messages[0].content + '\\n\\n' }}\n {%- endif %}\n {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' -%}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else -%}\n {{- '<|im_start|>system\\nYou are a helpful assistant<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n{%- for message in messages[::-1] %}\n {%- set index = (messages|length - 1) - loop.index0 %}\n {%- if ns.multi_step_tool and message.role == \"user\" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}\n {%- set ns.multi_step_tool = false %}\n {%- set ns.last_query_index = index %}\n {%- endif %}\n{%- endfor %}\n{%- for message in messages %}\n {%- if message.content is string %}\n {%- set content = message.content %}\n {%- else %}\n {%- set content = '' %}\n {%- endif %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) %}\n {{- '<|im_start|>' + message.role + '\\n' + content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- if message.tool_calls %}\n {%- for tool_call in message.tool_calls %}\n {%- if (loop.first and content) or (not loop.first) 
%}\n {{- '\\n' }}\n {%- endif %}\n {%- if tool_call.function %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {%- if tool_call.arguments is string %}\n {{- tool_call.arguments }}\n {%- else %}\n {{- tool_call.arguments | tojson }}\n {%- endif %}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n {%- if enable_thinking is defined and enable_thinking is false %}\n {{- '<think>\\n\\n</think>\\n\\n' }}\n {%- endif %}\n{%- endif %}"
43
+ }
minimind-master/out/pretrain_512.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8a5433ae05c6e0f74b582e8c5ad4b38bbff6a4ba1ae128494b13dd8a076f1d3f
3
+ size 58237975
minimind-master/requirements.txt ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ datasets==3.6.0
2
+ datasketch==1.6.4
3
+ Flask==3.0.3
4
+ Flask_Cors==4.0.0
5
+ jieba==0.42.1
6
+ jsonlines==4.0.0
7
+ marshmallow==3.22.0
8
+ matplotlib==3.10.0
9
+ ngrok==1.4.0
10
+ nltk==3.8
11
+ numpy==1.26.4
12
+ openai==1.59.6
13
+ peft==0.7.1
14
+ psutil==5.9.8
15
+ pydantic==2.11.5
16
+ rich==13.7.1
17
+ scikit_learn==1.5.1
18
+ sentence_transformers==2.3.1
19
+ simhash==2.1.2
20
+ tiktoken==0.10.0
21
+ transformers==4.57.1
22
+ jinja2==3.1.2
23
+ jsonlines==4.0.0
24
+ trl==0.13.0
25
+ ujson==5.1.0
26
+ wandb==0.18.3
27
+ streamlit==1.50.0
28
+ einops==0.8.1
29
+ swanlab==0.6.8
30
+ torch==2.6.0
31
+ torchvision==0.21.0
minimind-master/scripts/chat_openai_api.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from openai import OpenAI
2
+
3
+ client = OpenAI(
4
+ api_key="ollama",
5
+ base_url="http://127.0.0.1:8998/v1"
6
+ )
7
+ stream = True
8
+ conversation_history_origin = []
9
+ conversation_history = conversation_history_origin.copy()
10
+ history_messages_num = 0 # 必须设置为偶数(Q+A),为0则不携带历史对话
11
+ while True:
12
+ query = input('[Q]: ')
13
+ conversation_history.append({"role": "user", "content": query})
14
+ response = client.chat.completions.create(
15
+ model="minimind",
16
+ messages=conversation_history[-(history_messages_num or 1):],
17
+ stream=stream,
18
+ temperature=0.7,
19
+ max_tokens=2048,
20
+ top_p=0.9
21
+ )
22
+ if not stream:
23
+ assistant_res = response.choices[0].message.content
24
+ print('[A]: ', assistant_res)
25
+ else:
26
+ print('[A]: ', end='')
27
+ assistant_res = ''
28
+ for chunk in response:
29
+ print(chunk.choices[0].delta.content or "", end="")
30
+ assistant_res += chunk.choices[0].delta.content or ""
31
+
32
+ conversation_history.append({"role": "assistant", "content": assistant_res})
33
+ print('\n\n')
minimind-master/scripts/convert_model.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import json
4
+
5
+ __package__ = "scripts"
6
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
7
+ import torch
8
+ import warnings
9
+ from transformers import AutoTokenizer, AutoModelForCausalLM, LlamaConfig, LlamaForCausalLM
10
+ from model.model_minimind import MiniMindConfig, MiniMindForCausalLM
11
+
12
+ warnings.filterwarnings('ignore', category=UserWarning)
13
+
14
+
15
+ # MoE模型需使用此函数转换
16
+ def convert_torch2transformers_minimind(torch_path, transformers_path, dtype=torch.float16):
17
+ MiniMindConfig.register_for_auto_class()
18
+ MiniMindForCausalLM.register_for_auto_class("AutoModelForCausalLM")
19
+ lm_model = MiniMindForCausalLM(lm_config)
20
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
21
+ state_dict = torch.load(torch_path, map_location=device)
22
+ lm_model.load_state_dict(state_dict, strict=False)
23
+ lm_model = lm_model.to(dtype) # 转换模型权重精度
24
+ model_params = sum(p.numel() for p in lm_model.parameters() if p.requires_grad)
25
+ print(f'模型参数: {model_params / 1e6} 百万 = {model_params / 1e9} B (Billion)')
26
+ lm_model.save_pretrained(transformers_path, safe_serialization=False)
27
+ tokenizer = AutoTokenizer.from_pretrained('../model/')
28
+ tokenizer.save_pretrained(transformers_path)
29
+ # 兼容transformers-5.0的写法
30
+ config_path = os.path.join(transformers_path, "tokenizer_config.json")
31
+ json.dump({**json.load(open(config_path, 'r', encoding='utf-8')), "tokenizer_class": "PreTrainedTokenizerFast", "extra_special_tokens": {}}, open(config_path, 'w', encoding='utf-8'), indent=2, ensure_ascii=False)
32
+ print(f"模型已保存为 Transformers-MiniMind 格式: {transformers_path}")
33
+
34
+
35
+ # LlamaForCausalLM结构兼容第三方生态
36
+ def convert_torch2transformers_llama(torch_path, transformers_path, dtype=torch.float16):
37
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
38
+ state_dict = torch.load(torch_path, map_location=device)
39
+ llama_config = LlamaConfig(
40
+ vocab_size=lm_config.vocab_size,
41
+ hidden_size=lm_config.hidden_size,
42
+ intermediate_size=64 * ((int(lm_config.hidden_size * 8 / 3) + 64 - 1) // 64),
43
+ num_hidden_layers=lm_config.num_hidden_layers,
44
+ num_attention_heads=lm_config.num_attention_heads,
45
+ num_key_value_heads=lm_config.num_key_value_heads,
46
+ max_position_embeddings=lm_config.max_position_embeddings,
47
+ rms_norm_eps=lm_config.rms_norm_eps,
48
+ rope_theta=lm_config.rope_theta,
49
+ tie_word_embeddings=True
50
+ )
51
+ llama_model = LlamaForCausalLM(llama_config)
52
+ llama_model.load_state_dict(state_dict, strict=False)
53
+ llama_model = llama_model.to(dtype) # 转换模型权重精度
54
+ llama_model.save_pretrained(transformers_path)
55
+ model_params = sum(p.numel() for p in llama_model.parameters() if p.requires_grad)
56
+ print(f'模型参数: {model_params / 1e6} 百万 = {model_params / 1e9} B (Billion)')
57
+ tokenizer = AutoTokenizer.from_pretrained('../model/')
58
+ tokenizer.save_pretrained(transformers_path)
59
+ # 兼容transformers-5.0的写法
60
+ config_path = os.path.join(transformers_path, "tokenizer_config.json")
61
+ json.dump({**json.load(open(config_path, 'r', encoding='utf-8')), "tokenizer_class": "PreTrainedTokenizerFast", "extra_special_tokens": {}}, open(config_path, 'w', encoding='utf-8'), indent=2, ensure_ascii=False)
62
+ print(f"模型已保存为 Transformers-Llama 格式: {transformers_path}")
63
+
64
+
65
+ def convert_transformers2torch(transformers_path, torch_path):
66
+ model = AutoModelForCausalLM.from_pretrained(transformers_path, trust_remote_code=True)
67
+ torch.save({k: v.cpu().half() for k, v in model.state_dict().items()}, torch_path)
68
+ print(f"模型已保存为 PyTorch 格式 (half精度): {torch_path}")
69
+
70
+
71
+ if __name__ == '__main__':
72
+ lm_config = MiniMindConfig(hidden_size=512, num_hidden_layers=8, max_seq_len=8192, use_moe=False)
73
+ torch_path = f"../out/full_sft_{lm_config.hidden_size}{'_moe' if lm_config.use_moe else ''}.pth"
74
+ transformers_path = '../MiniMind2-Small'
75
+ convert_torch2transformers_llama(torch_path, transformers_path)
76
+ # # convert transformers to torch model
77
+ # convert_transformers2torch(transformers_path, torch_path)
minimind-master/scripts/serve_openai_api.py ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import json
3
+ import os
4
+ import sys
5
+
6
+ __package__ = "scripts"
7
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
8
+ import time
9
+ import torch
10
+ import warnings
11
+ import uvicorn
12
+
13
+ from threading import Thread
14
+ from queue import Queue
15
+ from fastapi import FastAPI, HTTPException
16
+ from fastapi.responses import StreamingResponse
17
+ from pydantic import BaseModel
18
+ from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer
19
+ from model.model_minimind import MiniMindConfig, MiniMindForCausalLM
20
+ from model.model_lora import apply_lora, load_lora
21
+
22
+ warnings.filterwarnings('ignore')
23
+
24
+ app = FastAPI()
25
+
26
+
27
+ def init_model(args):
28
+ tokenizer = AutoTokenizer.from_pretrained(args.load_from)
29
+ if 'model' in args.load_from:
30
+ moe_suffix = '_moe' if args.use_moe else ''
31
+ ckp = f'../{args.save_dir}/{args.weight}_{args.hidden_size}{moe_suffix}.pth'
32
+ model = MiniMindForCausalLM(MiniMindConfig(
33
+ hidden_size=args.hidden_size,
34
+ num_hidden_layers=args.num_hidden_layers,
35
+ max_seq_len=args.max_seq_len,
36
+ use_moe=bool(args.use_moe),
37
+ inference_rope_scaling=args.inference_rope_scaling
38
+ ))
39
+ model.load_state_dict(torch.load(ckp, map_location=device), strict=True)
40
+ if args.lora_weight != 'None':
41
+ apply_lora(model)
42
+ load_lora(model, f'../{args.save_dir}/lora/{args.lora_weight}_{args.hidden_size}.pth')
43
+ else:
44
+ model = AutoModelForCausalLM.from_pretrained(args.load_from, trust_remote_code=True)
45
+ print(f'MiniMind模型参数量: {sum(p.numel() for p in model.parameters()) / 1e6:.2f} M(illion)')
46
+ return model.eval().to(device), tokenizer
47
+
48
+
49
+ class ChatRequest(BaseModel):
50
+ model: str
51
+ messages: list
52
+ temperature: float = 0.7
53
+ top_p: float = 0.92
54
+ max_tokens: int = 8192
55
+ stream: bool = False
56
+ tools: list = []
57
+
58
+
59
+ class CustomStreamer(TextStreamer):
60
+ def __init__(self, tokenizer, queue):
61
+ super().__init__(tokenizer, skip_prompt=True, skip_special_tokens=True)
62
+ self.queue = queue
63
+ self.tokenizer = tokenizer
64
+
65
+ def on_finalized_text(self, text: str, stream_end: bool = False):
66
+ self.queue.put(text)
67
+ if stream_end:
68
+ self.queue.put(None)
69
+
70
+
71
+ def generate_stream_response(messages, temperature, top_p, max_tokens):
72
+ try:
73
+ new_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)[-max_tokens:]
74
+ inputs = tokenizer(new_prompt, return_tensors="pt", truncation=True).to(device)
75
+
76
+ queue = Queue()
77
+ streamer = CustomStreamer(tokenizer, queue)
78
+
79
+ def _generate():
80
+ model.generate(
81
+ inputs.input_ids,
82
+ max_new_tokens=max_tokens,
83
+ do_sample=True,
84
+ temperature=temperature,
85
+ top_p=top_p,
86
+ attention_mask=inputs.attention_mask,
87
+ pad_token_id=tokenizer.pad_token_id,
88
+ eos_token_id=tokenizer.eos_token_id,
89
+ streamer=streamer
90
+ )
91
+
92
+ Thread(target=_generate).start()
93
+
94
+ while True:
95
+ text = queue.get()
96
+ if text is None:
97
+ yield json.dumps({
98
+ "choices": [{
99
+ "delta": {},
100
+ "finish_reason": "stop"
101
+ }]
102
+ }, ensure_ascii=False)
103
+ break
104
+
105
+ yield json.dumps({
106
+ "choices": [{"delta": {"content": text}}]
107
+ }, ensure_ascii=False)
108
+
109
+ except Exception as e:
110
+ yield json.dumps({"error": str(e)})
111
+
112
+
113
+ @app.post("/v1/chat/completions")
114
+ async def chat_completions(request: ChatRequest):
115
+ try:
116
+ if request.stream:
117
+ return StreamingResponse(
118
+ (f"data: {chunk}\n\n" for chunk in generate_stream_response(
119
+ messages=request.messages,
120
+ temperature=request.temperature,
121
+ top_p=request.top_p,
122
+ max_tokens=request.max_tokens
123
+ )),
124
+ media_type="text/event-stream"
125
+ )
126
+ else:
127
+ new_prompt = tokenizer.apply_chat_template(
128
+ request.messages,
129
+ tokenize=False,
130
+ add_generation_prompt=True
131
+ )[-request.max_tokens:]
132
+ inputs = tokenizer(new_prompt, return_tensors="pt", truncation=True).to(device)
133
+ with torch.no_grad():
134
+ generated_ids = model.generate(
135
+ inputs["input_ids"],
136
+ max_length=inputs["input_ids"].shape[1] + request.max_tokens,
137
+ do_sample=True,
138
+ attention_mask=inputs["attention_mask"],
139
+ pad_token_id=tokenizer.pad_token_id,
140
+ eos_token_id=tokenizer.eos_token_id,
141
+ top_p=request.top_p,
142
+ temperature=request.temperature
143
+ )
144
+ answer = tokenizer.decode(generated_ids[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
145
+ return {
146
+ "id": f"chatcmpl-{int(time.time())}",
147
+ "object": "chat.completion",
148
+ "created": int(time.time()),
149
+ "model": "minimind",
150
+ "choices": [
151
+ {
152
+ "index": 0,
153
+ "message": {"role": "assistant", "content": answer},
154
+ "finish_reason": "stop"
155
+ }
156
+ ]
157
+ }
158
+ except Exception as e:
159
+ raise HTTPException(status_code=500, detail=str(e))
160
+
161
+
162
+ if __name__ == "__main__":
163
+ parser = argparse.ArgumentParser(description="Server for MiniMind")
164
+ parser.add_argument('--load_from', default='../model', type=str, help="模型加载路径(model=原生torch权重,其他路径=transformers格式)")
165
+ parser.add_argument('--save_dir', default='out', type=str, help="模型权重目录")
166
+ parser.add_argument('--weight', default='full_sft', type=str, help="权重名称前缀(pretrain, full_sft, dpo, reason, ppo_actor, grpo, spo)")
167
+ parser.add_argument('--lora_weight', default='None', type=str, help="LoRA权重名称(None表示不使用,可选:lora_identity, lora_medical)")
168
+ parser.add_argument('--hidden_size', default=512, type=int, help="隐藏层维度(512=Small-26M, 640=MoE-145M, 768=Base-104M)")
169
+ parser.add_argument('--num_hidden_layers', default=8, type=int, help="隐藏层数量(Small/MoE=8, Base=16)")
170
+ parser.add_argument('--max_seq_len', default=8192, type=int, help="最大序列长度")
171
+ parser.add_argument('--use_moe', default=0, type=int, choices=[0, 1], help="是否使用MoE架构(0=否,1=是)")
172
+ parser.add_argument('--inference_rope_scaling', default=False, action='store_true', help="启用RoPE位置编码外推(4倍,仅解决位置编码问题)")
173
+ parser.add_argument('--device', default='cuda' if torch.cuda.is_available() else 'cpu', type=str, help="运行设备")
174
+ args = parser.parse_args()
175
+ device = args.device
176
+ model, tokenizer = init_model(args)
177
+ uvicorn.run(app, host="0.0.0.0", port=8998)
minimind-master/scripts/web_demo.py ADDED
@@ -0,0 +1,328 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import random
2
+ import re
3
+ from threading import Thread
4
+
5
+ import torch
6
+ import numpy as np
7
+ import streamlit as st
8
+
9
+ st.set_page_config(page_title="MiniMind", initial_sidebar_state="collapsed")
10
+
11
+ st.markdown("""
12
+ <style>
13
+ /* 添加操作按钮样式 */
14
+ .stButton button {
15
+ border-radius: 50% !important; /* 改为圆形 */
16
+ width: 32px !important; /* 固定宽度 */
17
+ height: 32px !important; /* 固定高度 */
18
+ padding: 0 !important; /* 移除内边距 */
19
+ background-color: transparent !important;
20
+ border: 1px solid #ddd !important;
21
+ display: flex !important;
22
+ align-items: center !important;
23
+ justify-content: center !important;
24
+ font-size: 14px !important;
25
+ color: #666 !important; /* 更柔和的颜色 */
26
+ margin: 5px 10px 5px 0 !important; /* 调整按钮间距 */
27
+ }
28
+ .stButton button:hover {
29
+ border-color: #999 !important;
30
+ color: #333 !important;
31
+ background-color: #f5f5f5 !important;
32
+ }
33
+ .stMainBlockContainer > div:first-child {
34
+ margin-top: -50px !important;
35
+ }
36
+ .stApp > div:last-child {
37
+ margin-bottom: -35px !important;
38
+ }
39
+
40
+ /* 重置按钮基础样式 */
41
+ .stButton > button {
42
+ all: unset !important; /* 重置所有默认样式 */
43
+ box-sizing: border-box !important;
44
+ border-radius: 50% !important;
45
+ width: 18px !important;
46
+ height: 18px !important;
47
+ min-width: 18px !important;
48
+ min-height: 18px !important;
49
+ max-width: 18px !important;
50
+ max-height: 18px !important;
51
+ padding: 0 !important;
52
+ background-color: transparent !important;
53
+ border: 1px solid #ddd !important;
54
+ display: flex !important;
55
+ align-items: center !important;
56
+ justify-content: center !important;
57
+ font-size: 14px !important;
58
+ color: #888 !important;
59
+ cursor: pointer !important;
60
+ transition: all 0.2s ease !important;
61
+ margin: 0 2px !important; /* 调整这里的 margin 值 */
62
+ }
63
+
64
+ </style>
65
+ """, unsafe_allow_html=True)
66
+
67
+ system_prompt = []
68
+ device = "cuda" if torch.cuda.is_available() else "cpu"
69
+
70
+
71
+ def process_assistant_content(content):
72
+ if model_source == "API" and 'R1' not in api_model_name:
73
+ return content
74
+ if model_source != "API" and 'R1' not in MODEL_PATHS[selected_model][1]:
75
+ return content
76
+
77
+ if '<think>' in content and '</think>' in content:
78
+ content = re.sub(r'(<think>)(.*?)(</think>)',
79
+ r'<details style="font-style: italic; background: rgba(222, 222, 222, 0.5); padding: 10px; border-radius: 10px;"><summary style="font-weight:bold;">推理内容(展开)</summary>\2</details>',
80
+ content,
81
+ flags=re.DOTALL)
82
+
83
+ if '<think>' in content and '</think>' not in content:
84
+ content = re.sub(r'<think>(.*?)$',
85
+ r'<details open style="font-style: italic; background: rgba(222, 222, 222, 0.5); padding: 10px; border-radius: 10px;"><summary style="font-weight:bold;">推理中...</summary>\1</details>',
86
+ content,
87
+ flags=re.DOTALL)
88
+
89
+ if '<think>' not in content and '</think>' in content:
90
+ content = re.sub(r'(.*?)</think>',
91
+ r'<details style="font-style: italic; background: rgba(222, 222, 222, 0.5); padding: 10px; border-radius: 10px;"><summary style="font-weight:bold;">推理内容(展开)</summary>\1</details>',
92
+ content,
93
+ flags=re.DOTALL)
94
+
95
+ return content
96
+
97
+
98
+ @st.cache_resource
99
+ def load_model_tokenizer(model_path):
100
+ model = AutoModelForCausalLM.from_pretrained(
101
+ model_path,
102
+ trust_remote_code=True
103
+ )
104
+ tokenizer = AutoTokenizer.from_pretrained(
105
+ model_path,
106
+ trust_remote_code=True
107
+ )
108
+ model = model.eval().to(device)
109
+ return model, tokenizer
110
+
111
+
112
+ def clear_chat_messages():
113
+ del st.session_state.messages
114
+ del st.session_state.chat_messages
115
+
116
+
117
+ def init_chat_messages():
118
+ if "messages" in st.session_state:
119
+ for i, message in enumerate(st.session_state.messages):
120
+ if message["role"] == "assistant":
121
+ with st.chat_message("assistant", avatar=image_url):
122
+ st.markdown(process_assistant_content(message["content"]), unsafe_allow_html=True)
123
+ if st.button("🗑", key=f"delete_{i}"):
124
+ st.session_state.messages.pop(i)
125
+ st.session_state.messages.pop(i - 1)
126
+ st.session_state.chat_messages.pop(i)
127
+ st.session_state.chat_messages.pop(i - 1)
128
+ st.rerun()
129
+ else:
130
+ st.markdown(
131
+ f'<div style="display: flex; justify-content: flex-end;"><div style="display: inline-block; margin: 10px 0; padding: 8px 12px 8px 12px; background-color: #ddd; border-radius: 10px; color: black;">{message["content"]}</div></div>',
132
+ unsafe_allow_html=True)
133
+
134
+ else:
135
+ st.session_state.messages = []
136
+ st.session_state.chat_messages = []
137
+
138
+ return st.session_state.messages
139
+
140
+ def regenerate_answer(index):
141
+ st.session_state.messages.pop()
142
+ st.session_state.chat_messages.pop()
143
+ st.rerun()
144
+
145
+
146
+ def delete_conversation(index):
147
+ st.session_state.messages.pop(index)
148
+ st.session_state.messages.pop(index - 1)
149
+ st.session_state.chat_messages.pop(index)
150
+ st.session_state.chat_messages.pop(index - 1)
151
+ st.rerun()
152
+
153
+
154
+ st.sidebar.title("模型设定调整")
155
+
156
+ # st.sidebar.text("训练数据偏差,增加上下文记忆时\n多轮对话(较单轮)容易出现能力衰减")
157
+ st.session_state.history_chat_num = st.sidebar.slider("Number of Historical Dialogues", 0, 6, 0, step=2)
158
+ # st.session_state.history_chat_num = 0
159
+ st.session_state.max_new_tokens = st.sidebar.slider("Max Sequence Length", 256, 8192, 8192, step=1)
160
+ st.session_state.temperature = st.sidebar.slider("Temperature", 0.6, 1.2, 0.85, step=0.01)
161
+
162
+ model_source = st.sidebar.radio("选择模型来源", ["本地模型", "API"], index=0)
163
+
164
+ if model_source == "API":
165
+ api_url = st.sidebar.text_input("API URL", value="http://127.0.0.1:8000/v1")
166
+ api_model_id = st.sidebar.text_input("Model ID", value="minimind")
167
+ api_model_name = st.sidebar.text_input("Model Name", value="MiniMind2")
168
+ api_key = st.sidebar.text_input("API Key", value="none", type="password")
169
+ slogan = f"Hi, I'm {api_model_name}"
170
+ else:
171
+ MODEL_PATHS = {
172
+ "MiniMind2-R1 (0.1B)": ["../MiniMind2-R1", "MiniMind2-R1"],
173
+ "MiniMind2-Small-R1 (0.02B)": ["../MiniMind2-Small-R1", "MiniMind2-Small-R1"],
174
+ "MiniMind2 (0.1B)": ["../MiniMind2", "MiniMind2"],
175
+ "MiniMind2-MoE (0.15B)": ["../MiniMind2-MoE", "MiniMind2-MoE"],
176
+ "MiniMind2-Small (0.02B)": ["../MiniMind2-Small", "MiniMind2-Small"]
177
+ }
178
+
179
+ selected_model = st.sidebar.selectbox('Models', list(MODEL_PATHS.keys()), index=2) # 默认选择 MiniMind2
180
+ model_path = MODEL_PATHS[selected_model][0]
181
+ slogan = f"Hi, I'm {MODEL_PATHS[selected_model][1]}"
182
+
183
+ image_url = "https://www.modelscope.cn/api/v1/studio/gongjy/MiniMind/repo?Revision=master&FilePath=images%2Flogo2.png&View=true"
184
+
185
+ st.markdown(
186
+ f'<div style="display: flex; flex-direction: column; align-items: center; text-align: center; margin: 0; padding: 0;">'
187
+ '<div style="font-style: italic; font-weight: 900; margin: 0; padding-top: 4px; display: flex; align-items: center; justify-content: center; flex-wrap: wrap; width: 100%;">'
188
+ f'<img src="{image_url}" style="width: 45px; height: 45px; "> '
189
+ f'<span style="font-size: 26px; margin-left: 10px;">{slogan}</span>'
190
+ '</div>'
191
+ '<span style="color: #bbb; font-style: italic; margin-top: 6px; margin-bottom: 10px;">内容完全由AI生成,请务必仔细甄别<br>Content AI-generated, please discern with care</span>'
192
+ '</div>',
193
+ unsafe_allow_html=True
194
+ )
195
+
196
+
197
+ def setup_seed(seed):
198
+ random.seed(seed)
199
+ np.random.seed(seed)
200
+ torch.manual_seed(seed)
201
+ torch.cuda.manual_seed(seed)
202
+ torch.cuda.manual_seed_all(seed)
203
+ torch.backends.cudnn.deterministic = True
204
+ torch.backends.cudnn.benchmark = False
205
+
206
+
207
+ def main():
208
+ if model_source == "本地模型":
209
+ model, tokenizer = load_model_tokenizer(model_path)
210
+ else:
211
+ model, tokenizer = None, None
212
+
213
+ if "messages" not in st.session_state:
214
+ st.session_state.messages = []
215
+ st.session_state.chat_messages = []
216
+
217
+ messages = st.session_state.messages
218
+
219
+ for i, message in enumerate(messages):
220
+ if message["role"] == "assistant":
221
+ with st.chat_message("assistant", avatar=image_url):
222
+ st.markdown(process_assistant_content(message["content"]), unsafe_allow_html=True)
223
+ if st.button("×", key=f"delete_{i}"):
224
+ st.session_state.messages = st.session_state.messages[:i - 1]
225
+ st.session_state.chat_messages = st.session_state.chat_messages[:i - 1]
226
+ st.rerun()
227
+ else:
228
+ st.markdown(
229
+ f'<div style="display: flex; justify-content: flex-end;"><div style="display: inline-block; margin: 10px 0; padding: 8px 12px 8px 12px; background-color: gray; border-radius: 10px; color:white; ">{message["content"]}</div></div>',
230
+ unsafe_allow_html=True)
231
+
232
+ prompt = st.chat_input(key="input", placeholder="给 MiniMind 发送消息")
233
+
234
+ if hasattr(st.session_state, 'regenerate') and st.session_state.regenerate:
235
+ prompt = st.session_state.last_user_message
236
+ regenerate_index = st.session_state.regenerate_index
237
+ delattr(st.session_state, 'regenerate')
238
+ delattr(st.session_state, 'last_user_message')
239
+ delattr(st.session_state, 'regenerate_index')
240
+
241
+ if prompt:
242
+ st.markdown(
243
+ f'<div style="display: flex; justify-content: flex-end;"><div style="display: inline-block; margin: 10px 0; padding: 8px 12px 8px 12px; background-color: gray; border-radius: 10px; color:white; ">{prompt}</div></div>',
244
+ unsafe_allow_html=True)
245
+ messages.append({"role": "user", "content": prompt[-st.session_state.max_new_tokens:]})
246
+ st.session_state.chat_messages.append({"role": "user", "content": prompt[-st.session_state.max_new_tokens:]})
247
+
248
+ with st.chat_message("assistant", avatar=image_url):
249
+ placeholder = st.empty()
250
+
251
+ if model_source == "API":
252
+ try:
253
+ from openai import OpenAI
254
+
255
+ client = OpenAI(
256
+ api_key=api_key,
257
+ base_url=api_url
258
+ )
259
+ history_num = st.session_state.history_chat_num + 1 # +1 是为了包含当前的用户消息
260
+ conversation_history = system_prompt + st.session_state.chat_messages[-history_num:]
261
+ answer = ""
262
+ response = client.chat.completions.create(
263
+ model=api_model_id,
264
+ messages=conversation_history,
265
+ stream=True,
266
+ temperature=st.session_state.temperature
267
+ )
268
+
269
+ for chunk in response:
270
+ content = chunk.choices[0].delta.content or ""
271
+ answer += content
272
+ placeholder.markdown(process_assistant_content(answer), unsafe_allow_html=True)
273
+
274
+ except Exception as e:
275
+ answer = f"API调用出错: {str(e)}"
276
+ placeholder.markdown(answer, unsafe_allow_html=True)
277
+ else:
278
+ random_seed = random.randint(0, 2 ** 32 - 1)
279
+ setup_seed(random_seed)
280
+
281
+ st.session_state.chat_messages = system_prompt + st.session_state.chat_messages[
282
+ -(st.session_state.history_chat_num + 1):]
283
+ new_prompt = tokenizer.apply_chat_template(
284
+ st.session_state.chat_messages,
285
+ tokenize=False,
286
+ add_generation_prompt=True
287
+ )
288
+
289
+ inputs = tokenizer(
290
+ new_prompt,
291
+ return_tensors="pt",
292
+ truncation=True
293
+ ).to(device)
294
+
295
+ streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
296
+ generation_kwargs = {
297
+ "input_ids": inputs.input_ids,
298
+ "max_length": inputs.input_ids.shape[1] + st.session_state.max_new_tokens,
299
+ "num_return_sequences": 1,
300
+ "do_sample": True,
301
+ "attention_mask": inputs.attention_mask,
302
+ "pad_token_id": tokenizer.pad_token_id,
303
+ "eos_token_id": tokenizer.eos_token_id,
304
+ "temperature": st.session_state.temperature,
305
+ "top_p": 0.85,
306
+ "streamer": streamer,
307
+ }
308
+
309
+ Thread(target=model.generate, kwargs=generation_kwargs).start()
310
+
311
+ answer = ""
312
+ for new_text in streamer:
313
+ answer += new_text
314
+ placeholder.markdown(process_assistant_content(answer), unsafe_allow_html=True)
315
+
316
+ messages.append({"role": "assistant", "content": answer})
317
+ st.session_state.chat_messages.append({"role": "assistant", "content": answer})
318
+ with st.empty():
319
+ if st.button("×", key=f"delete_{len(messages) - 1}"):
320
+ st.session_state.messages = st.session_state.messages[:-2]
321
+ st.session_state.chat_messages = st.session_state.chat_messages[:-2]
322
+ st.rerun()
323
+
324
+
325
+ if __name__ == "__main__":
326
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
327
+
328
+ main()
minimind-master/trainer/train_distillation.py ADDED
@@ -0,0 +1,235 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+
4
+ __package__ = "trainer"
5
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
6
+
7
+ import argparse
8
+ import time
9
+ import warnings
10
+ import torch
11
+ import torch.nn.functional as F
12
+ import torch.distributed as dist
13
+ from contextlib import nullcontext
14
+ from torch import optim
15
+ from torch.nn.parallel import DistributedDataParallel
16
+ from torch.utils.data import DataLoader, DistributedSampler
17
+ from model.model_minimind import MiniMindConfig
18
+ from dataset.lm_dataset import SFTDataset
19
+ from trainer.trainer_utils import get_lr, Logger, is_main_process, lm_checkpoint, init_distributed_mode, setup_seed, init_model, SkipBatchSampler
20
+
21
+ warnings.filterwarnings('ignore')
22
+
23
+
24
def distillation_loss(student_logits, teacher_logits, temperature=1.0, reduction='batchmean'):
    """Soft-label knowledge-distillation loss (temperature-scaled KL).

    Computes KL(teacher || student) between temperature-softened
    distributions and rescales by T^2 so gradient magnitudes stay
    comparable across temperature settings.

    Args:
        student_logits: (N, vocab) raw logits of the student model.
        teacher_logits: (N, vocab) raw logits of the teacher model.
        temperature: softening temperature T (>1 flattens both distributions).
        reduction: reduction mode forwarded to F.kl_div ('batchmean'
            divides the summed KL by the batch dimension N).

    Returns:
        Scalar tensor: T^2 * KL divergence.
    """
    # Teacher targets are constants. no_grad already disables gradient
    # tracking, so the original extra .detach() was redundant and is removed.
    with torch.no_grad():
        teacher_probs = F.softmax(teacher_logits / temperature, dim=-1)

    student_log_probs = F.log_softmax(student_logits / temperature, dim=-1)

    # F.kl_div expects log-probabilities as input and probabilities as target.
    kl = F.kl_div(
        student_log_probs,
        teacher_probs,
        reduction=reduction
    )
    # T^2 keeps soft-target gradients on the same scale as the hard-label CE.
    return (temperature ** 2) * kl
36
+
37
+
38
def train_epoch(epoch, loader, iters, teacher_model, lm_config_student, start_step=0, wandb=None, alpha=0.0, temperature=1.0):
    """Run one knowledge-distillation epoch for the student model.

    Relies on module-level globals assigned in __main__: `args`, `model`
    (the student), `optimizer`, `scaler`, `autocast_ctx`.

    Args:
        epoch: zero-based epoch index (used by the LR schedule and logging).
        loader: DataLoader yielding (input_ids, labels) batches.
        iters: total step count for this epoch (including skipped steps on resume).
        teacher_model: frozen teacher model; None disables the distillation term.
        lm_config_student: student config (MoE flag controls aux-loss handling).
        start_step: step offset when resuming mid-epoch.
        wandb: optional logger handle (swanlab aliased as wandb) or None.
        alpha: weight of the CE term; total loss = alpha*CE + (1-alpha)*KL.
        temperature: distillation temperature passed to distillation_loss.
    """
    start_time = time.time()

    # Freeze the teacher once up front; it is inference-only in this loop.
    if teacher_model is not None:
        teacher_model.eval()
        teacher_model.requires_grad_(False)

    for step, (input_ids, labels) in enumerate(loader, start=start_step + 1):
        input_ids = input_ids.to(args.device)
        labels = labels.to(args.device)
        # Marks next-token positions whose shifted label is not ignore_index.
        loss_mask = (labels[..., 1:] != -100).float()
        # Scheduled LR from trainer_utils, applied to every param group.
        lr = get_lr(epoch * iters + step, args.epochs * iters, args.learning_rate)
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr

        # Forward pass (student model), under mixed precision.
        with autocast_ctx:
            res = model(input_ids)
            student_logits = res.logits[..., :-1, :].contiguous()

        # Teacher forward pass (eval mode, no gradients).
        if teacher_model is not None:
            with torch.no_grad():
                teacher_logits = teacher_model(input_ids).logits[..., :-1, :].contiguous()
                # Teacher vocab may be wider; truncate to the student's vocab.
                vocab_size_student = student_logits.size(-1)
                teacher_logits = teacher_logits[..., :vocab_size_student]

        # ========== Loss computation ==========
        # 1) Ground-truth cross-entropy loss.
        shift_labels = labels[..., 1:].contiguous()
        loss_mask_flat = loss_mask.view(-1)
        ce_loss = F.cross_entropy(
            student_logits.view(-1, student_logits.size(-1)),
            shift_labels.view(-1),
            ignore_index=-100,
            reduction='none'
        )
        # Token-average over unmasked positions; epsilon guards an empty mask.
        ce_loss_raw = torch.sum(ce_loss * loss_mask_flat) / (loss_mask_flat.sum() + 1e-8)
        # MoE models add the load-balancing auxiliary loss to the CE term.
        if lm_config_student.use_moe: ce_loss = ce_loss_raw + res.aux_loss
        else: ce_loss = ce_loss_raw

        # 2) Distillation (KL) loss, computed on unmasked tokens only.
        if teacher_model is not None:
            distill_loss = distillation_loss(
                student_logits.view(-1, student_logits.size(-1))[loss_mask_flat == 1],
                teacher_logits.view(-1, teacher_logits.size(-1))[loss_mask_flat == 1],
                temperature=temperature
            )
        else:
            distill_loss = torch.tensor(0.0, device=args.device)

        # 3) Total loss = alpha * CE + (1 - alpha) * distill.
        loss = (alpha * ce_loss + (1 - alpha) * distill_loss) / args.accumulation_steps

        scaler.scale(loss).backward()

        # Optimizer step once per `accumulation_steps` micro-batches.
        if (step + 1) % args.accumulation_steps == 0:
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad(set_to_none=True)

        if step % args.log_interval == 0 or step == iters - 1:
            spend_time = time.time() - start_time
            current_loss = loss.item() * args.accumulation_steps
            current_ce_loss = ce_loss_raw.item()
            current_aux_loss = res.aux_loss.item() if lm_config_student.use_moe else 0.0
            current_lr = optimizer.param_groups[-1]['lr']
            # Remaining minutes estimated from the average step time so far.
            eta_min = spend_time / (step + 1) * iters // 60 - spend_time // 60

            Logger(f'Epoch:[{epoch + 1}/{args.epochs}]({step}/{iters}), loss: {current_loss:.4f}, ce: {current_ce_loss:.4f}, aux_loss: {current_aux_loss:.4f}, distill: {distill_loss.item():.4f}, learning_rate: {current_lr:.8f}, epoch_time: {eta_min:.3f}min')

            if wandb:
                wandb.log({
                    "loss": current_loss,
                    "ce_loss": current_ce_loss,
                    "aux_loss": current_aux_loss,
                    "distill_loss": distill_loss.item() if teacher_model is not None else 0.0,
                    "learning_rate": current_lr,
                    "epoch_time": eta_min
                })

        # Periodic checkpoint on the main process: half-precision weights plus
        # full resume state (optimizer/scaler/step) via lm_checkpoint.
        if (step % args.save_interval == 0 or step == iters - 1) and is_main_process():
            model.eval()
            moe_suffix = '_moe' if lm_config_student.use_moe else ''
            ckp = f'{args.save_dir}/{args.save_weight}_{lm_config_student.hidden_size}{moe_suffix}.pth'
            # Unwrap DDP and torch.compile wrappers before saving weights.
            raw_model = model.module if isinstance(model, DistributedDataParallel) else model
            raw_model = getattr(raw_model, '_orig_mod', raw_model)
            state_dict = raw_model.state_dict()
            torch.save({k: v.half().cpu() for k, v in state_dict.items()}, ckp)
            lm_checkpoint(lm_config_student, weight=args.save_weight, model=model, optimizer=optimizer, scaler=scaler, epoch=epoch, step=step, wandb=wandb, save_dir='../checkpoints')
            model.train()
            del state_dict

        # Drop per-step tensors promptly to reduce peak memory.
        del input_ids, labels, loss_mask, res, student_logits, ce_loss, distill_loss, loss
134
+
135
+
136
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="MiniMind Knowledge Distillation")
    parser.add_argument("--save_dir", type=str, default="../out", help="模型保存目录")
    parser.add_argument('--save_weight', default='full_dist', type=str, help="保存权重的前缀名")
    parser.add_argument("--epochs", type=int, default=6, help="训练轮数")
    parser.add_argument("--batch_size", type=int, default=32, help="batch size")
    parser.add_argument("--learning_rate", type=float, default=5e-6, help="初始学习率")
    parser.add_argument("--device", type=str, default="cuda:0" if torch.cuda.is_available() else "cpu", help="训练设备")
    parser.add_argument("--dtype", type=str, default="bfloat16", help="混合精度类型")
    parser.add_argument("--num_workers", type=int, default=8, help="数据加载线程数")
    parser.add_argument("--accumulation_steps", type=int, default=1, help="梯度累积步数")
    parser.add_argument("--grad_clip", type=float, default=1.0, help="梯度裁剪阈值")
    parser.add_argument("--log_interval", type=int, default=100, help="日志打印间隔")
    parser.add_argument("--save_interval", type=int, default=100, help="模型保存间隔")
    parser.add_argument("--max_seq_len", type=int, default=340, help="训练的最大截断长度(中文1token≈1.5~1.7字符)")
    parser.add_argument("--data_path", type=str, default="../dataset/sft_mini_512.jsonl", help="训练数据路径")
    parser.add_argument('--student_hidden_size', default=512, type=int, help="学生模型隐藏层维度")
    parser.add_argument('--student_num_layers', default=8, type=int, help="学生模型隐藏层数量")
    parser.add_argument('--teacher_hidden_size', default=768, type=int, help="教师模型隐藏层维度")
    parser.add_argument('--teacher_num_layers', default=16, type=int, help="教师模型隐藏层数量")
    parser.add_argument('--use_moe', default=0, type=int, choices=[0, 1], help="是否使用MoE架构(0=否,1=是)")
    parser.add_argument('--from_student_weight', default='full_sft', type=str, help="学生模型基于哪个权重")
    parser.add_argument('--from_teacher_weight', default='full_sft', type=str, help="教师模型基于哪个权重")
    parser.add_argument('--from_resume', default=0, type=int, choices=[0, 1], help="是否自动检测&续训(0=否,1=是)")
    parser.add_argument('--alpha', default=0.5, type=float, help="CE损失权重,总损失=alpha*CE+(1-alpha)*KL")
    parser.add_argument('--temperature', default=1.5, type=float, help="蒸馏温度(推荐范围1.0-2.0)")
    parser.add_argument("--use_wandb", action="store_true", help="是否使用wandb")
    parser.add_argument("--wandb_project", type=str, default="MiniMind-Distillation", help="wandb项目名")
    parser.add_argument("--use_compile", default=0, type=int, choices=[0, 1], help="是否使用torch.compile加速(0=否,1=是)")
    args = parser.parse_args()

    # ========== 1. Initialize distributed environment and random seed ==========
    local_rank = init_distributed_mode()
    if dist.is_initialized(): args.device = f"cuda:{local_rank}"
    # Per-rank seed offset so ranks do not produce identical randomness.
    setup_seed(42 + (dist.get_rank() if dist.is_initialized() else 0))

    # ========== 2. Set up directories, model configs, and resume checkpoint ==========
    os.makedirs(args.save_dir, exist_ok=True)
    lm_config_student = MiniMindConfig(hidden_size=args.student_hidden_size, num_hidden_layers=args.student_num_layers, use_moe=bool(args.use_moe))
    lm_config_teacher = MiniMindConfig(hidden_size=args.teacher_hidden_size, num_hidden_layers=args.teacher_num_layers, use_moe=bool(args.use_moe))
    # ckp_data is None unless --from_resume=1 and a matching checkpoint exists.
    ckp_data = lm_checkpoint(lm_config_student, weight=args.save_weight, save_dir='../checkpoints') if args.from_resume==1 else None

    # ========== 3. Configure mixed precision ==========
    device_type = "cuda" if "cuda" in args.device else "cpu"
    dtype = torch.bfloat16 if args.dtype == "bfloat16" else torch.float16
    autocast_ctx = nullcontext() if device_type == "cpu" else torch.cuda.amp.autocast(dtype=dtype)

    # ========== 4. Configure wandb (swanlab) logging ==========
    wandb = None
    if args.use_wandb and is_main_process():
        # swanlab exposes a wandb-compatible API; imported under the same name.
        import swanlab as wandb
        wandb_id = ckp_data.get('wandb_id') if ckp_data else None
        resume = 'must' if wandb_id else None
        wandb_run_name = f"MiniMind-Distill-S{args.student_hidden_size}T{args.teacher_hidden_size}-Epoch-{args.epochs}-BS-{args.batch_size}-LR-{args.learning_rate}"
        wandb.init(project=args.wandb_project, name=wandb_run_name, id=wandb_id, resume=resume)

    # ========== 5. Build student and teacher models ==========
    model, tokenizer = init_model(lm_config_student, args.from_student_weight, device=args.device)
    if args.use_compile == 1:
        model = torch.compile(model)
        Logger('torch.compile enabled')
    Logger(f'学生模型总参数量:{sum(p.numel() for p in model.parameters()) / 1e6:.3f} M')
    # The teacher is frozen for the whole run.
    teacher_model, _ = init_model(lm_config_teacher, args.from_teacher_weight, device=args.device)
    teacher_model.eval()
    teacher_model.requires_grad_(False)
    Logger(f'教师模型总参数量:{sum(p.numel() for p in teacher_model.parameters()) / 1e6:.3f} M')
    train_ds = SFTDataset(args.data_path, tokenizer, max_length=args.max_seq_len)
    train_sampler = DistributedSampler(train_ds) if dist.is_initialized() else None
    # GradScaler is a no-op under bfloat16; only float16 needs loss scaling.
    scaler = torch.cuda.amp.GradScaler(enabled=(args.dtype == 'float16'))
    optimizer = optim.AdamW(model.parameters(), lr=args.learning_rate)

    # ========== 6. Restore state from checkpoint ==========
    start_epoch, start_step = 0, 0
    if ckp_data:
        model.load_state_dict(ckp_data['model'])
        optimizer.load_state_dict(ckp_data['optimizer'])
        scaler.load_state_dict(ckp_data['scaler'])
        start_epoch = ckp_data['epoch']
        start_step = ckp_data.get('step', 0)

    # ========== 7. Wrap model with DDP ==========
    if dist.is_initialized():
        # RoPE caches are buffers that must not be synchronized across ranks.
        model._ddp_params_and_buffers_to_ignore = {"freqs_cos", "freqs_sin"}
        model = DistributedDataParallel(model, device_ids=[local_rank])

    # ========== 8. Train ==========
    for epoch in range(start_epoch, args.epochs):
        # Reshuffle the DistributedSampler each epoch (no-op when sampler is None).
        train_sampler and train_sampler.set_epoch(epoch)
        # Reseed per epoch so the shuffle order is reproducible across resumes.
        setup_seed(42 + epoch); indices = torch.randperm(len(train_ds)).tolist()
        skip = start_step if (epoch == start_epoch and start_step > 0) else 0
        batch_sampler = SkipBatchSampler(train_sampler or indices, args.batch_size, skip)
        loader = DataLoader(train_ds, batch_sampler=batch_sampler, num_workers=args.num_workers, pin_memory=True)
        if skip > 0:
            Logger(f'Epoch [{epoch + 1}/{args.epochs}]: 跳过前{start_step}个step,从step {start_step + 1}开始')
            train_epoch(epoch, loader, len(loader) + skip, teacher_model, lm_config_student, start_step, wandb, args.alpha, args.temperature)
        else:
            train_epoch(epoch, loader, len(loader), teacher_model, lm_config_student, 0, wandb, args.alpha, args.temperature)

    # ========== 9. Clean up distributed process group ==========
    if dist.is_initialized(): dist.destroy_process_group()
minimind-master/trainer/train_dpo.py ADDED
@@ -0,0 +1,219 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+
4
+ __package__ = "trainer"
5
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
6
+
7
+ import argparse
8
+ import time
9
+ import warnings
10
+ import torch
11
+ import torch.nn.functional as F
12
+ import torch.distributed as dist
13
+ from contextlib import nullcontext
14
+ from torch import optim
15
+ from torch.nn.parallel import DistributedDataParallel
16
+ from torch.utils.data import DataLoader, DistributedSampler
17
+ from model.model_minimind import MiniMindConfig
18
+ from dataset.lm_dataset import DPODataset
19
+ from trainer.trainer_utils import get_lr, Logger, is_main_process, lm_checkpoint, init_distributed_mode, setup_seed, init_model, SkipBatchSampler
20
+
21
+ warnings.filterwarnings('ignore')
22
+
23
+
24
def logits_to_log_probs(logits, labels):
    """Per-token log-probability of each label under the model distribution.

    Args:
        logits: (batch_size, seq_len, vocab_size) raw model outputs.
        labels: (batch_size, seq_len) token ids to score.

    Returns:
        (batch_size, seq_len) tensor of log P(label_t | context).
    """
    all_log_probs = F.log_softmax(logits, dim=2)
    # Pick out the log-prob of each target token along the vocab axis.
    gathered = torch.gather(all_log_probs, dim=2, index=labels.unsqueeze(2))
    return gathered.squeeze(-1)
31
+
32
+
33
def dpo_loss(ref_log_probs, policy_log_probs, mask, beta):
    """DPO loss over a batch whose first half is chosen, second half rejected.

    Args:
        ref_log_probs: (batch_size, seq_len) per-token log-probs from the
            frozen reference model.
        policy_log_probs: (batch_size, seq_len) per-token log-probs from the
            policy being trained.
        mask: (batch_size, seq_len) 1 for response tokens that count, 0 elsewhere.
        beta: inverse-temperature of the implicit DPO reward.

    Returns:
        Scalar mean DPO loss.
    See https://github.com/jingyaogong/minimind/issues/298
    """
    # Length-normalized sequence log-probs; the clamp prevents an all-zero
    # mask from producing a divide-by-zero NaN.
    lengths = mask.sum(dim=1).clamp_min(1e-8)
    ref_seq = (ref_log_probs * mask).sum(dim=1) / lengths
    policy_seq = (policy_log_probs * mask).sum(dim=1) / lengths

    # Split into chosen (first half) and rejected (second half) examples.
    half = ref_seq.shape[0] // 2
    chosen_ref, reject_ref = ref_seq[:half], ref_seq[half:]
    chosen_pol, reject_pol = policy_seq[:half], policy_seq[half:]

    # Preference margin: policy log-ratio minus reference log-ratio.
    margin = (chosen_pol - reject_pol) - (chosen_ref - reject_ref)
    return -F.logsigmoid(beta * margin).mean()
52
+
53
+
54
def train_epoch(epoch, loader, iters, ref_model, lm_config, start_step=0, wandb=None, beta=0.1):
    """Run one DPO (Direct Preference Optimization) training epoch.

    Relies on module-level globals assigned in __main__: `args`, `model`
    (the policy), `optimizer`, `scaler`, `autocast_ctx`.

    Args:
        epoch: zero-based epoch index (used by the LR schedule and logging).
        loader: DataLoader yielding dict batches with chosen/rejected pairs.
        iters: total step count for this epoch (including skipped steps on resume).
        ref_model: frozen reference model for the DPO log-ratio baseline.
        lm_config: model config (MoE flag controls checkpoint file suffix).
        start_step: step offset when resuming mid-epoch.
        wandb: optional logger handle (swanlab aliased as wandb) or None.
        beta: DPO beta (implicit-reward inverse temperature).
    """
    start_time = time.time()

    for step, batch in enumerate(loader, start=start_step + 1):
        x_chosen = batch['x_chosen'].to(args.device)
        x_rejected = batch['x_rejected'].to(args.device)
        y_chosen = batch['y_chosen'].to(args.device)
        y_rejected = batch['y_rejected'].to(args.device)
        mask_chosen = batch['mask_chosen'].to(args.device)
        mask_rejected = batch['mask_rejected'].to(args.device)
        # Concatenate so chosen occupies the first half of the batch and
        # rejected the second half — dpo_loss splits on this convention.
        x = torch.cat([x_chosen, x_rejected], dim=0)
        y = torch.cat([y_chosen, y_rejected], dim=0)
        mask = torch.cat([mask_chosen, mask_rejected], dim=0)

        # Scheduled LR from trainer_utils, applied to every param group.
        lr = get_lr(epoch * iters + step, args.epochs * iters, args.learning_rate)
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr

        with autocast_ctx:
            # Reference log-probs are a frozen baseline; no gradients needed.
            with torch.no_grad():
                ref_outputs = ref_model(x)
                ref_logits = ref_outputs.logits
            ref_log_probs = logits_to_log_probs(ref_logits, y)

            outputs = model(x)
            logits = outputs.logits
            policy_log_probs = logits_to_log_probs(logits, y)

            dpo_loss_val = dpo_loss(ref_log_probs, policy_log_probs, mask, beta=beta)
            # aux_loss is the MoE load-balancing term (zero for dense models).
            loss = dpo_loss_val + outputs.aux_loss
            loss = loss / args.accumulation_steps

        scaler.scale(loss).backward()

        # Optimizer step once per `accumulation_steps` micro-batches.
        if (step + 1) % args.accumulation_steps == 0:
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad(set_to_none=True)

        if step % args.log_interval == 0 or step == iters - 1:
            spend_time = time.time() - start_time
            current_loss = loss.item() * args.accumulation_steps
            current_dpo_loss = dpo_loss_val.item()
            current_aux_loss = outputs.aux_loss.item()
            current_lr = optimizer.param_groups[-1]['lr']
            # Remaining minutes estimated from the average step time so far.
            eta_min = spend_time / (step + 1) * iters // 60 - spend_time // 60

            Logger(f'Epoch:[{epoch + 1}/{args.epochs}]({step}/{iters}), loss: {current_loss:.4f}, dpo_loss: {current_dpo_loss:.4f}, aux_loss: {current_aux_loss:.4f}, learning_rate: {current_lr:.8f}, epoch_time: {eta_min:.3f}min')

            if wandb: wandb.log({"loss": current_loss, "dpo_loss": current_dpo_loss, "aux_loss": current_aux_loss, "learning_rate": current_lr, "epoch_time": eta_min})

        # Periodic checkpoint on the main process: half-precision weights plus
        # full resume state (optimizer/scaler/step) via lm_checkpoint.
        if (step % args.save_interval == 0 or step == iters - 1) and is_main_process():
            model.eval()
            moe_suffix = '_moe' if lm_config.use_moe else ''
            ckp = f'{args.save_dir}/{args.save_weight}_{lm_config.hidden_size}{moe_suffix}.pth'
            # Unwrap DDP and torch.compile wrappers before saving weights.
            raw_model = model.module if isinstance(model, DistributedDataParallel) else model
            raw_model = getattr(raw_model, '_orig_mod', raw_model)
            state_dict = raw_model.state_dict()
            torch.save({k: v.half().cpu() for k, v in state_dict.items()}, ckp)
            lm_checkpoint(lm_config, weight=args.save_weight, model=model, optimizer=optimizer, scaler=scaler, epoch=epoch, step=step, wandb=wandb, save_dir='../checkpoints')
            model.train()
            del state_dict

        # Drop per-step tensors promptly to reduce peak memory.
        del x_chosen, x_rejected, y_chosen, y_rejected, mask_chosen, mask_rejected, x, y, mask
        del ref_outputs, ref_logits, ref_log_probs, outputs, logits, policy_log_probs, loss
121
+
122
+
123
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="MiniMind DPO (Direct Preference Optimization)")
    parser.add_argument("--save_dir", type=str, default="../out", help="模型保存目录")
    parser.add_argument('--save_weight', default='dpo', type=str, help="保存权重的前缀名")
    parser.add_argument("--epochs", type=int, default=1, help="训练轮数")
    parser.add_argument("--batch_size", type=int, default=4, help="batch size")
    parser.add_argument("--learning_rate", type=float, default=4e-8, help="初始学习率(建议<=5e-8避免遗忘)")
    parser.add_argument("--device", type=str, default="cuda:0" if torch.cuda.is_available() else "cpu", help="训练设备")
    parser.add_argument("--dtype", type=str, default="bfloat16", help="混合精度类型")
    parser.add_argument("--num_workers", type=int, default=8, help="数据加载线程数")
    parser.add_argument("--accumulation_steps", type=int, default=1, help="梯度累积步数")
    parser.add_argument("--grad_clip", type=float, default=1.0, help="梯度裁剪阈值")
    parser.add_argument("--log_interval", type=int, default=100, help="日志打印间隔")
    parser.add_argument("--save_interval", type=int, default=100, help="模型保存间隔")
    parser.add_argument('--hidden_size', default=512, type=int, help="隐藏层维度")
    parser.add_argument('--num_hidden_layers', default=8, type=int, help="隐藏层数量")
    parser.add_argument('--max_seq_len', default=1024, type=int, help="训练的最大截断长度(中文1token≈1.5~1.7字符)")
    parser.add_argument('--use_moe', default=0, type=int, choices=[0, 1], help="是否使用MoE架构(0=否,1=是)")
    parser.add_argument("--data_path", type=str, default="../dataset/dpo.jsonl", help="DPO训练数据路径")
    parser.add_argument('--from_weight', default='full_sft', type=str, help="基于哪个权重训练")
    parser.add_argument('--from_resume', default=0, type=int, choices=[0, 1], help="是否自动检测&续训(0=否,1=是)")
    parser.add_argument('--beta', default=0.1, type=float, help="DPO中的beta参数")
    parser.add_argument("--use_wandb", action="store_true", help="是否使用wandb")
    parser.add_argument("--wandb_project", type=str, default="MiniMind-DPO", help="wandb项目名")
    parser.add_argument("--use_compile", default=0, type=int, choices=[0, 1], help="是否使用torch.compile加速(0=否,1=是)")
    args = parser.parse_args()

    # ========== 1. Initialize distributed environment and random seed ==========
    local_rank = init_distributed_mode()
    if dist.is_initialized(): args.device = f"cuda:{local_rank}"
    # Per-rank seed offset so ranks do not produce identical randomness.
    setup_seed(42 + (dist.get_rank() if dist.is_initialized() else 0))

    # ========== 2. Set up directories, model config, and resume checkpoint ==========
    os.makedirs(args.save_dir, exist_ok=True)
    lm_config = MiniMindConfig(hidden_size=args.hidden_size, num_hidden_layers=args.num_hidden_layers, use_moe=bool(args.use_moe))
    # ckp_data is None unless --from_resume=1 and a matching checkpoint exists.
    ckp_data = lm_checkpoint(lm_config, weight=args.save_weight, save_dir='../checkpoints') if args.from_resume==1 else None

    # ========== 3. Configure mixed precision ==========
    device_type = "cuda" if "cuda" in args.device else "cpu"
    dtype = torch.bfloat16 if args.dtype == "bfloat16" else torch.float16
    autocast_ctx = nullcontext() if device_type == "cpu" else torch.cuda.amp.autocast(dtype=dtype)

    # ========== 4. Configure wandb (swanlab) logging ==========
    wandb = None
    if args.use_wandb and is_main_process():
        # swanlab exposes a wandb-compatible API; imported under the same name.
        import swanlab as wandb
        wandb_id = ckp_data.get('wandb_id') if ckp_data else None
        resume = 'must' if wandb_id else None
        wandb_run_name = f"MiniMind-DPO-Epoch-{args.epochs}-BatchSize-{args.batch_size}-LR-{args.learning_rate}"
        wandb.init(project=args.wandb_project, name=wandb_run_name, id=wandb_id, resume=resume)

    # ========== 5. Build the policy model and the frozen reference model ==========
    model, tokenizer = init_model(lm_config, args.from_weight, device=args.device)
    if args.use_compile == 1:
        model = torch.compile(model)
        Logger('torch.compile enabled')
    Logger(f'策略模型总参数量:{sum(p.numel() for p in model.parameters()) / 1e6:.3f} M')
    # Initialize the reference model (ref_model stays frozen for the whole run).
    ref_model, _ = init_model(lm_config, args.from_weight, device=args.device)
    ref_model.eval()
    ref_model.requires_grad_(False)
    Logger(f'参考模型总参数量:{sum(p.numel() for p in ref_model.parameters()) / 1e6:.3f} M')

    train_ds = DPODataset(args.data_path, tokenizer, max_length=args.max_seq_len)
    train_sampler = DistributedSampler(train_ds) if dist.is_initialized() else None
    # GradScaler is a no-op under bfloat16; only float16 needs loss scaling.
    scaler = torch.cuda.amp.GradScaler(enabled=(args.dtype == 'float16'))
    optimizer = optim.AdamW(model.parameters(), lr=args.learning_rate)

    # ========== 6. Restore state from checkpoint ==========
    start_epoch, start_step = 0, 0
    if ckp_data:
        model.load_state_dict(ckp_data['model'])
        optimizer.load_state_dict(ckp_data['optimizer'])
        scaler.load_state_dict(ckp_data['scaler'])
        start_epoch = ckp_data['epoch']
        start_step = ckp_data.get('step', 0)

    # ========== 7. Wrap model with DDP ==========
    if dist.is_initialized():
        # RoPE caches are buffers that must not be synchronized across ranks.
        model._ddp_params_and_buffers_to_ignore = {"freqs_cos", "freqs_sin"}
        model = DistributedDataParallel(model, device_ids=[local_rank])

    # ========== 8. Train ==========
    for epoch in range(start_epoch, args.epochs):
        # Reshuffle the DistributedSampler each epoch (no-op when sampler is None).
        train_sampler and train_sampler.set_epoch(epoch)
        # Reseed per epoch so the shuffle order is reproducible across resumes.
        setup_seed(42 + epoch); indices = torch.randperm(len(train_ds)).tolist()
        skip = start_step if (epoch == start_epoch and start_step > 0) else 0
        batch_sampler = SkipBatchSampler(train_sampler or indices, args.batch_size, skip)
        loader = DataLoader(train_ds, batch_sampler=batch_sampler, num_workers=args.num_workers, pin_memory=True)
        if skip > 0:
            Logger(f'Epoch [{epoch + 1}/{args.epochs}]: 跳过前{start_step}个step,从step {start_step + 1}开始')
            train_epoch(epoch, loader, len(loader) + skip, ref_model, lm_config, start_step, wandb, args.beta)
        else:
            train_epoch(epoch, loader, len(loader), ref_model, lm_config, 0, wandb, args.beta)

    # ========== 9. Clean up distributed process group ==========
    if dist.is_initialized(): dist.destroy_process_group()