Lekr0 commited on
Commit
7c50656
·
verified ·
1 Parent(s): 40d87dd

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. SpecForge-ext/.pre-commit-config.yaml +53 -0
  2. SpecForge-ext/convert_mtbench.py +22 -0
  3. SpecForge-ext/download_datasets.py +64 -0
  4. SpecForge-ext/download_mtbench.sh +23 -0
  5. SpecForge-ext/download_mtbench_data.py +51 -0
  6. SpecForge-ext/mtbench_sample.json +26 -0
  7. SpecForge-ext/pyproject.toml +44 -0
  8. SpecForge-ext/requirements.txt +0 -0
  9. SpecForge-ext/setup.py +33 -0
  10. SpecForge-ext/test_accept_length.md +300 -0
  11. SpecForge/.editorconfig +25 -0
  12. SpecForge/.isort.cfg +3 -0
  13. SpecForge/.pre-commit-config.yaml +53 -0
  14. SpecForge/LICENSE +21 -0
  15. SpecForge/MANIFEST.in +2 -0
  16. SpecForge/README.md +70 -0
  17. SpecForge/pyproject.toml +47 -0
  18. SpecForge/requirements-rocm.txt +20 -0
  19. SpecForge/version.txt +1 -0
  20. idea1/.editorconfig +25 -0
  21. idea1/.isort.cfg +3 -0
  22. idea1/.pre-commit-config.yaml +53 -0
  23. idea1/LICENSE +21 -0
  24. idea1/requirements-rocm.txt +20 -0
  25. idea1/version.txt +1 -0
  26. qwen3-8b_dflash_regen/.gitattributes +36 -0
  27. syxin/backup.log +0 -0
  28. syxin/dflash_lora_changelog.md +232 -0
  29. syxin/eval_accepted_length.md +217 -0
  30. syxin/eval_dflash_b16_baseline.py +354 -0
  31. syxin/eval_dflash_lora_inject.py +627 -0
  32. syxin/idea.md +23 -0
  33. syxin/launch_train.sh +37 -0
  34. syxin/launch_train_wrapper.py +21 -0
  35. syxin/list.md +12 -0
  36. syxin/merge_lora.py +66 -0
  37. syxin/oom_fix_progress.md +42 -0
  38. syxin/requirements.txt +0 -0
  39. syxin/run_bench.sh +68 -0
  40. syxin/run_bench_dflash.sh +71 -0
  41. syxin/run_bench_dflash_b16_baseline.sh +60 -0
  42. syxin/run_qwen3_8b_sft_32gpu.sh +31 -0
  43. syxin/run_train_dflash_direct_inject.sh +56 -0
  44. syxin/run_train_dflash_lora_inject.sh +71 -0
  45. syxin/run_train_multinode.sh +67 -0
  46. syxin/run_train_qwen3_8b_sft_32gpu.sh +66 -0
  47. syxin/server.log +186 -0
  48. syxin/start_server.sh +42 -0
  49. syxin/start_server_dflash.sh +54 -0
  50. syxin/step1.md +139 -0
SpecForge-ext/.pre-commit-config.yaml ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ default_stages: [pre-commit, pre-push, manual]
2
+
3
+ repos:
4
+ - repo: https://github.com/PyCQA/autoflake
5
+ rev: v2.3.1
6
+ hooks:
7
+ - id: autoflake
8
+ args: [--remove-all-unused-imports, --in-place]
9
+ - repo: https://github.com/pre-commit/pre-commit-hooks
10
+ rev: v5.0.0
11
+ hooks:
12
+ - id: check-symlinks
13
+ - id: destroyed-symlinks
14
+ - id: trailing-whitespace
15
+ - id: end-of-file-fixer
16
+ - id: check-yaml
17
+ args: [--allow-multiple-documents]
18
+ - id: check-toml
19
+ - id: check-ast
20
+ - id: check-added-large-files
21
+ - id: check-merge-conflict
22
+ - id: check-shebang-scripts-are-executable
23
+ - id: detect-private-key
24
+ - id: debug-statements
25
+ - id: no-commit-to-branch
26
+ - repo: https://github.com/PyCQA/isort
27
+ rev: 5.13.2
28
+ hooks:
29
+ - id: isort
30
+ - repo: https://github.com/astral-sh/ruff-pre-commit
31
+ rev: v0.11.10
32
+ hooks:
33
+ - id: ruff
34
+ args: [--select=F401, --fixable=F401]
35
+ files: ^(benchmark/|docs/|examples/)
36
+ exclude: \.ipynb$
37
+ - repo: https://github.com/psf/black
38
+ rev: 24.10.0
39
+ hooks:
40
+ - id: black-jupyter
41
+ - repo: https://github.com/pre-commit/mirrors-clang-format
42
+ rev: v18.1.8
43
+ hooks:
44
+ - id: clang-format
45
+ types_or: [c++, cuda]
46
+ args: [--style=file, --verbose]
47
+ - repo: https://github.com/kynan/nbstripout
48
+ rev: 0.8.1
49
+ hooks:
50
+ - id: nbstripout
51
+ args:
52
+ - '--keep-output'
53
+ - '--extra-keys=metadata.kernelspec metadata.language_info.version'
SpecForge-ext/convert_mtbench.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""Convert an MT-Bench question file from a JSON array to JSONL.

Reads a JSON file containing a list of question records and rewrites it
as one compact JSON object per line in the sglang cache directory,
which is where the benchmark scripts expect ``mtbench.jsonl``.
"""
import json
import os

# Source file: a JSON array of MT-Bench question objects.
DEFAULT_INPUT = "/workspace/hanrui/SpecForge-ext/mtbench_sample.json"


def convert(input_file, output_file):
    """Convert *input_file* (JSON array) to *output_file* (JSONL).

    Returns the list of records that were written so the caller can
    report on them.
    """
    with open(input_file, 'r') as f:
        data = json.load(f)

    # One compact JSON object per line -- the JSONL format consumed by
    # the sglang benchmark loader.
    with open(output_file, 'w') as f:
        for item in data:
            f.write(json.dumps(item) + '\n')
    return data


def main():
    cache_dir = os.path.expanduser("~/.cache/sglang")
    os.makedirs(cache_dir, exist_ok=True)
    output_file = os.path.join(cache_dir, "mtbench.jsonl")

    data = convert(DEFAULT_INPUT, output_file)

    print(f"Converted {len(data)} questions")
    print(f"Saved to {output_file}")
    print(f"\nFirst question:")
    print(json.dumps(data[0], indent=2))


if __name__ == "__main__":
    main()
SpecForge-ext/download_datasets.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""Download the GSM8K and HumanEval test splits to local JSONL files.

Each dataset is fetched from the Hugging Face Hub and written as one
JSON object per line under ``DATA_DIR/<name>/test.jsonl``.
"""
import json
import os

# Root directory where all benchmark datasets are stored.
DATA_DIR = "/workspace/hanrui/datasets"


def save_jsonl(records, output_file):
    """Write an iterable of JSON-serializable records as JSONL.

    Returns the number of records written.
    """
    count = 0
    with open(output_file, 'w') as f:
        for item in records:
            f.write(json.dumps(item) + '\n')
            count += 1
    return count


def _banner(title, leading_newline=False):
    # Section banner for the console log, matching the original output.
    if leading_newline:
        print("\n" + "=" * 60)
    else:
        print("=" * 60)
    print(title)
    print("=" * 60)


def _download_split(title, subdir, *load_args, **load_kwargs):
    """Download one dataset split and save it as ``<subdir>/test.jsonl``.

    ``load_args``/``load_kwargs`` are forwarded to
    ``datasets.load_dataset``. Failures are reported but do not abort,
    so the remaining datasets are still attempted.
    """
    try:
        # Imported lazily so this module stays importable without the
        # third-party ``datasets`` package installed.
        from datasets import load_dataset

        target_dir = os.path.join(DATA_DIR, subdir)
        os.makedirs(target_dir, exist_ok=True)

        print(f"Loading {title} from HuggingFace...")
        dataset = load_dataset(*load_args, **load_kwargs)

        output_file = os.path.join(target_dir, "test.jsonl")
        total = save_jsonl(dataset, output_file)

        print(f"✓ {title} saved to {output_file}")
        print(f"  Total samples: {total}")
    except Exception as e:
        print(f"✗ {title} download failed: {e}")


def main():
    os.makedirs(DATA_DIR, exist_ok=True)

    _banner("下载 GSM8K 数据集")
    _download_split("GSM8K", "gsm8k", "gsm8k", "main", split="test")

    _banner("下载 HumanEval 数据集", leading_newline=True)
    _download_split("HumanEval", "humaneval", "openai_humaneval", split="test")

    _banner("下载完成", leading_newline=True)
    print(f"数据保存在: {DATA_DIR}")


if __name__ == "__main__":
    main()
SpecForge-ext/download_mtbench.sh ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash

# Download the MT-Bench question file into the sglang cache directory.
# If GitHub is unreachable, print manual-download instructions instead.

CACHE_DIR="$HOME/.cache/sglang"
mkdir -p "$CACHE_DIR"

URL="https://raw.githubusercontent.com/lm-sys/FastChat/main/fastchat/llm_judge/data/mt_bench/question.jsonl"

echo "Downloading mtbench data..."

# -f makes curl exit non-zero on HTTP errors (e.g. 404); without it an
# HTML error page would be saved and reported as success. The proxy is
# only needed on hosts without direct GitHub access.
if https_proxy=http://10.1.2.1:7890 http_proxy=http://10.1.2.1:7890 \
    curl -fL "$URL" -o "$CACHE_DIR/mtbench.jsonl"; then
    echo "Downloaded to $CACHE_DIR/mtbench.jsonl"
    ls -lh "$CACHE_DIR/mtbench.jsonl"
else
    echo "Download failed. Please manually download the file from:"
    echo "$URL"
    echo "And save it to: $CACHE_DIR/mtbench.jsonl"
fi
SpecForge-ext/download_mtbench_data.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""Download the MT-Bench question set into the local datasets directory.

Fetches ``question.jsonl`` from the FastChat repository (through a
local HTTP proxy) and prints a sample of the result as a sanity check.
"""
import json
import os

# Target directory for the MT-Bench questions.
DATA_DIR = "/workspace/hanrui/datasets/mtbench"

# Upstream location of the MT-Bench question file.
URL = (
    "https://raw.githubusercontent.com/lm-sys/FastChat/main/"
    "fastchat/llm_judge/data/mt_bench/question.jsonl"
)

# HTTP proxy used from hosts without direct GitHub access.
PROXIES = {
    'http': 'http://10.1.2.1:7890',
    'https': 'http://10.1.2.1:7890',
}


def download(url, output_file, proxies=None):
    """Download *url* to *output_file* and return its raw lines.

    Raises on network or HTTP errors; the caller decides how to report
    them.
    """
    # Imported lazily so this module stays importable without requests.
    import requests

    response = requests.get(url, proxies=proxies, timeout=30)
    response.raise_for_status()

    with open(output_file, 'wb') as f:
        f.write(response.content)

    # Re-read as text to validate the file and count the questions.
    with open(output_file, 'r') as f:
        return f.readlines()


def main():
    os.makedirs(DATA_DIR, exist_ok=True)
    output_file = os.path.join(DATA_DIR, "question.jsonl")

    print(f"Downloading MT-Bench questions from {URL}")
    print(f"Saving to {output_file}")

    try:
        lines = download(URL, output_file, proxies=PROXIES)

        print(f"✓ Downloaded successfully")
        print(f"✓ Total questions: {len(lines)}")

        # Show the first question as a sanity check.
        first_question = json.loads(lines[0])
        print(f"\nFirst question:")
        print(json.dumps(first_question, indent=2))

    except Exception as e:
        print(f"✗ Download failed: {e}")
        print(f"\nPlease manually download from:")
        print(f"  {URL}")
        print(f"And save to:")
        print(f"  {output_file}")


if __name__ == "__main__":
    main()
SpecForge-ext/mtbench_sample.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "question_id": 1,
4
+ "category": "writing",
5
+ "turns": [
6
+ "Compose an engaging travel blog post about a recent trip to Hawaii, highlighting cultural experiences and must-see attractions.",
7
+ "Rewrite your previous response. Start every sentence with the letter A."
8
+ ]
9
+ },
10
+ {
11
+ "question_id": 2,
12
+ "category": "roleplay",
13
+ "turns": [
14
+ "Imagine you are writing a blog post comparing two popular smartphone models. Develop an outline for the blog post, including key points and subheadings to effectively compare and contrast the features, performance, and user experience of the two models. Please answer in fewer than 200 words.",
15
+ "Take your previous response and rephrase it as a limerick."
16
+ ]
17
+ },
18
+ {
19
+ "question_id": 3,
20
+ "category": "reasoning",
21
+ "turns": [
22
+ "Describe a vivid and unique character, using strong imagery and creative language. Please answer in fewer than two paragraphs.",
23
+ "Revise your previous response and incorporate an allusion to a famous work of literature or historical event in each sentence."
24
+ ]
25
+ }
26
+ ]
SpecForge-ext/pyproject.toml ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
[build-system]
requires = ["setuptools>=61.0", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "specforge"
dynamic = ["version", "description"]
readme = "README.md"
requires-python = ">=3.11"
dependencies = [
    "pre-commit",
    "torch==2.9.1",
    "torchaudio==2.9.1",
    "torchvision==0.24.1",
    "transformers==4.57.1",
    "qwen-vl-utils==0.0.11",
    "datasets",
    "setuptools",
    "tqdm",
    "wandb",
    "psutil",
    "numpy",
    "accelerate",
    "pydantic",
    "sglang==0.5.6",
    "openai-harmony",
    "ninja",
    "packaging",
    "yunchang",
]

[tool.setuptools]
packages = ["specforge"]

[project.optional-dependencies]
# NOTE: do not list "unittest" here -- it ships with the Python
# standard library, and the PyPI project of that name is an unrelated
# placeholder package that would be installed instead.
dev = [
    "pre-commit",
]
fa = ["flash-attn"]

[tool.setuptools.dynamic]
version = {file = "version.txt"}
description = {file = "README.md"}
SpecForge-ext/requirements.txt ADDED
File without changes
SpecForge-ext/setup.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import tomllib
from pathlib import Path

from setuptools import find_packages, setup


def read_readme():
    """Return the long description from README.md."""
    # Explicit encoding: the README contains non-ASCII characters and
    # the platform default encoding is not guaranteed to be UTF-8.
    with open("README.md", "r", encoding="utf-8") as f:
        return f.read()


def read_version():
    """Return the package version stored in version.txt."""
    with open("version.txt", "r", encoding="utf-8") as f:
        return f.read().strip()


def read_dependencies():
    """Return the runtime dependency list declared in pyproject.toml."""
    pyproject_path = Path(__file__).parent / "pyproject.toml"
    # tomllib (Python 3.11+) requires the file to be opened in binary mode.
    with open(pyproject_path, "rb") as f:
        pyproject = tomllib.load(f)
    return pyproject.get("project", {}).get("dependencies", [])


setup(
    name="specforge",
    packages=find_packages(exclude=["configs", "scripts", "tests"]),
    version=read_version(),
    install_requires=read_dependencies(),
    long_description=read_readme(),
    long_description_content_type="text/markdown",
    author="SGLang Team",
    url="https://github.com/sgl-project/SpecForge",
)
SpecForge-ext/test_accept_length.md ADDED
@@ -0,0 +1,300 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Accept Length 测试指南
2
+
3
+ ## 0. 准备工作
4
+
5
+ ### 创建目录
6
+ ```bash
7
+ cd /workspace/hanrui/SpecForge-ext
8
+ mkdir -p logs results
9
+ ```
10
+
11
+ ### 下载数据集(首次运行)
12
+ ```bash
13
+ cd /workspace/hanrui/SpecForge-ext
14
+ python download_datasets.py
15
+ ```
16
+
17
+ 数据保存位置:
18
+ - MT-Bench: `/workspace/hanrui/datasets/mtbench/question.jsonl`
19
+ - GSM8K: `/workspace/hanrui/datasets/gsm8k/test.jsonl`
20
+ - HumanEval: `/workspace/hanrui/datasets/humaneval/test.jsonl`
21
+
22
+ ---
23
+
24
+ ## 1. 测试 Baseline 模型
25
+
26
+ ### 启动服务器(终端1)
27
+ ```bash
28
+ cd /workspace/hanrui/SpecForge-ext
29
+
30
+ # 设置环境变量
31
+ export NO_PROXY="localhost,127.0.0.1,::1,10.0.0.0/8,172.16.0.0/12,192.168.0.0/16"
32
+ export no_proxy="localhost,127.0.0.1,::1,10.0.0.0/8,172.16.0.0/12,192.168.0.0/16"
33
+
34
+ # 启动 baseline 服务器
35
+ python3 -m sglang.launch_server \
36
+ --model /workspace/Qwen3-8B \
37
+ --speculative-algorithm EAGLE3 \
38
+ --speculative-draft-model-path /workspace/qwen3_8b_eagle3 \
39
+ --speculative-num-steps 3 \
40
+ --speculative-eagle-topk 1 \
41
+ --speculative-num-draft-tokens 4 \
42
+ --mem-fraction-static 0.75 \
43
+ --cuda-graph-max-bs 1 \
44
+ --tp 1 \
45
+ --trust-remote-code \
46
+ --host 0.0.0.0 \
47
+ --port 30000 \
48
+ --dtype bfloat16 \
49
+ --skip-server-warmup
50
+ ```
51
+
52
+ 等待看到 `Application startup complete` 后,继续下一步。
53
+
54
+ ### 运行三个 Benchmark(终端2)
55
+ ```bash
56
+ cd /workspace/hanrui/SpecForge-ext
57
+ conda activate /workspace/Hanrui/
58
+
59
+ # 设置环境变量
60
+ export NO_PROXY="localhost,127.0.0.1,::1,10.0.0.0/8,172.16.0.0/12,192.168.0.0/16"
61
+ export no_proxy="localhost,127.0.0.1,::1,10.0.0.0/8,172.16.0.0/12,192.168.0.0/16"
62
+
63
+ # 1. MT-Bench
64
+ echo "=== Running MT-Bench (Baseline) ==="
65
+ python benchmarks/bench_eagle3.py \
66
+ --model-path /workspace/Qwen3-8B \
67
+ --host 10.1.1.31 \
68
+ --port 30000 \
69
+ --config-list 1,3,1,4 \
70
+ --benchmark-list mtbench:80 \
71
+ --dtype bfloat16 \
72
+ --skip-launch-server \
73
+ --name baseline_mtbench \
74
+ --output-dir ./results \
75
+ 2>&1 | tee logs/baseline_mtbench_$(date +%Y%m%d_%H%M%S).log
76
+
77
+ # 2. GSM8K
78
+ echo "=== Running GSM8K (Baseline) ==="
79
+ python benchmarks/bench_eagle3.py \
80
+ --model-path /workspace/Qwen3-8B \
81
+ --host 10.1.1.31 \
82
+ --port 30000 \
83
+ --config-list 1,3,1,4 \
84
+ --benchmark-list gsm8k:100 \
85
+ --dtype bfloat16 \
86
+ --skip-launch-server \
87
+ --name baseline_gsm8k \
88
+ --output-dir ./results \
89
+ 2>&1 | tee logs/baseline_gsm8k_$(date +%Y%m%d_%H%M%S).log
90
+
91
+ # 3. HumanEval
92
+ echo "=== Running HumanEval (Baseline) ==="
93
+ python benchmarks/bench_eagle3.py \
94
+ --model-path /workspace/Qwen3-8B \
95
+ --host 10.1.1.31 \
96
+ --port 30000 \
97
+ --config-list 1,3,1,4 \
98
+ --benchmark-list humaneval:164 \
99
+ --dtype bfloat16 \
100
+ --skip-launch-server \
101
+ --name baseline_humaneval \
102
+ --output-dir ./results \
103
+ 2>&1 | tee logs/baseline_humaneval_$(date +%Y%m%d_%H%M%S).log
104
+
105
+ echo "=== Baseline 测试完成 ==="
106
+ ```
107
+
108
+ ---
109
+
110
+ ## 2. 测试训练后的模型
111
+
112
+ ### 停止 Baseline 服务器并启动训练后的服务器(终端1)
113
+ ```bash
114
+ cd /workspace/hanrui/SpecForge-ext
115
+
116
+ # 停止旧服务器
117
+ pkill -f "sglang.launch_server"
118
+ sleep 5
119
+
120
+ # 设置环境变量
121
+ export NO_PROXY="localhost,127.0.0.1,::1,10.0.0.0/8,172.16.0.0/12,192.168.0.0/16"
122
+ export no_proxy="localhost,127.0.0.1,::1,10.0.0.0/8,172.16.0.0/12,192.168.0.0/16"
123
+
124
+ # 启动训练后的服务器
125
+ python3 -m sglang.launch_server \
126
+ --model /workspace/Qwen3-8B \
127
+ --speculative-algorithm EAGLE3 \
128
+ --speculative-draft-model-path /workspace/hanrui/SpecForge-ext/outputs/qwen3-8b-qwen3eagle-5layer/epoch_9_step_12310 \
129
+ --speculative-num-steps 3 \
130
+ --speculative-eagle-topk 1 \
131
+ --speculative-num-draft-tokens 4 \
132
+ --mem-fraction-static 0.75 \
133
+ --cuda-graph-max-bs 1 \
134
+ --tp 1 \
135
+ --trust-remote-code \
136
+ --host 0.0.0.0 \
137
+ --port 30000 \
138
+ --dtype bfloat16 \
139
+ --skip-server-warmup
140
+ ```
141
+
142
+ 等待看到 `Application startup complete` 后,继续下一步。
143
+
144
+ ### 运行三个 Benchmark(终端2)
145
+ ```bash
146
+ cd /workspace/hanrui/SpecForge-ext
147
+ conda activate /workspace/Hanrui/
148
+
149
+ # 设置环境变量
150
+ export NO_PROXY="localhost,127.0.0.1,::1,10.0.0.0/8,172.16.0.0/12,192.168.0.0/16"
151
+ export no_proxy="localhost,127.0.0.1,::1,10.0.0.0/8,172.16.0.0/12,192.168.0.0/16"
152
+
153
+ # 1. MT-Bench
154
+ echo "=== Running MT-Bench (Trained) ==="
155
+ python benchmarks/bench_eagle3.py \
156
+ --model-path /workspace/Qwen3-8B \
157
+ --host 10.1.1.31 \
158
+ --port 30000 \
159
+ --config-list 1,3,1,4 \
160
+ --benchmark-list mtbench:80 \
161
+ --dtype bfloat16 \
162
+ --skip-launch-server \
163
+ --name trained_mtbench \
164
+ --output-dir ./results \
165
+ 2>&1 | tee logs/trained_mtbench_$(date +%Y%m%d_%H%M%S).log
166
+
167
+ # 2. GSM8K
168
+ echo "=== Running GSM8K (Trained) ==="
169
+ python benchmarks/bench_eagle3.py \
170
+ --model-path /workspace/Qwen3-8B \
171
+ --host 10.1.1.31 \
172
+ --port 30000 \
173
+ --config-list 1,3,1,4 \
174
+ --benchmark-list gsm8k:100 \
175
+ --dtype bfloat16 \
176
+ --skip-launch-server \
177
+ --name trained_gsm8k \
178
+ --output-dir ./results \
179
+ 2>&1 | tee logs/trained_gsm8k_$(date +%Y%m%d_%H%M%S).log
180
+
181
+ # 3. HumanEval
182
+ echo "=== Running HumanEval (Trained) ==="
183
+ python benchmarks/bench_eagle3.py \
184
+ --model-path /workspace/Qwen3-8B \
185
+ --host 10.1.1.31 \
186
+ --port 30000 \
187
+ --config-list 1,3,1,4 \
188
+ --benchmark-list humaneval:164 \
189
+ --dtype bfloat16 \
190
+ --skip-launch-server \
191
+ --name trained_humaneval \
192
+ --output-dir ./results \
193
+ 2>&1 | tee logs/trained_humaneval_$(date +%Y%m%d_%H%M%S).log
194
+
195
+ echo "=== Trained 测试完成 ==="
196
+ ```
197
+
198
+ ---
199
+
200
+ ## 3. 查看结果
201
+
202
+ ### 日志文件位置
203
+ 所有日志保存在:`/workspace/hanrui/SpecForge-ext/logs/`
204
+ - `baseline_mtbench_*.log`
205
+ - `baseline_gsm8k_*.log`
206
+ - `baseline_humaneval_*.log`
207
+ - `trained_mtbench_*.log`
208
+ - `trained_gsm8k_*.log`
209
+ - `trained_humaneval_*.log`
210
+
211
+ 所有结果保存在:`/workspace/hanrui/SpecForge-ext/results/`
212
+ - `baseline_mtbench_*.jsonl`
213
+ - `baseline_gsm8k_*.jsonl`
214
+ - `baseline_humaneval_*.jsonl`
215
+ - `trained_mtbench_*.jsonl`
216
+ - `trained_gsm8k_*.jsonl`
217
+ - `trained_humaneval_*.jsonl`
218
+
219
+ ### 生成对比报告
220
+ ```bash
221
+ cd /workspace/hanrui/SpecForge-ext
222
+
223
+ python3 << 'EOF'
224
+ import json
225
+ import glob
226
+
227
+ print("=" * 80)
228
+ print("Accept Length 对比报告")
229
+ print("=" * 80)
230
+
231
+ datasets = ['mtbench', 'gsm8k', 'humaneval']
232
+
233
+ for dataset in datasets:
234
+ print(f"\n{'=' * 80}")
235
+ print(f"{dataset.upper()} 结果对比")
236
+ print('=' * 80)
237
+
238
+ baseline_files = sorted(glob.glob(f'results/baseline_{dataset}_*.jsonl'))
239
+ trained_files = sorted(glob.glob(f'results/trained_{dataset}_*.jsonl'))
240
+
241
+ if not baseline_files or not trained_files:
242
+ print(f" 未找到 {dataset} 的结果文件")
243
+ continue
244
+
245
+ with open(baseline_files[-1], 'r') as f:
246
+ baseline = json.load(f)
247
+
248
+ with open(trained_files[-1], 'r') as f:
249
+ trained = json.load(f)
250
+
251
+ baseline_metrics = baseline[dataset][0]['metrics'][0]
252
+ trained_metrics = trained[dataset][0]['metrics'][0]
253
+
254
+ print(f"\nBaseline:")
255
+ print(f" Accept Length: {baseline_metrics['accept_length']:.4f}")
256
+ print(f" Output Throughput: {baseline_metrics['output_throughput']:.2f} tokens/s")
257
+ if 'accuracy' in baseline_metrics and baseline_metrics['accuracy'] is not None:
258
+ print(f" Accuracy: {baseline_metrics['accuracy']:.2%}")
259
+
260
+ print(f"\nTrained:")
261
+ print(f" Accept Length: {trained_metrics['accept_length']:.4f}")
262
+ print(f" Output Throughput: {trained_metrics['output_throughput']:.2f} tokens/s")
263
+ if 'accuracy' in trained_metrics and trained_metrics['accuracy'] is not None:
264
+ print(f" Accuracy: {trained_metrics['accuracy']:.2%}")
265
+
266
+ accept_diff = trained_metrics['accept_length'] - baseline_metrics['accept_length']
267
+ accept_pct = (accept_diff / baseline_metrics['accept_length']) * 100
268
+
269
+ throughput_diff = trained_metrics['output_throughput'] - baseline_metrics['output_throughput']
270
+ throughput_pct = (throughput_diff / baseline_metrics['output_throughput']) * 100
271
+
272
+ print(f"\n差异:")
273
+ print(f" Accept Length: {accept_diff:+.4f} ({accept_pct:+.2f}%)")
274
+ print(f" Throughput: {throughput_diff:+.2f} tokens/s ({throughput_pct:+.2f}%)")
275
+
276
+ if 'accuracy' in baseline_metrics and baseline_metrics['accuracy'] is not None:
277
+ acc_diff = trained_metrics['accuracy'] - baseline_metrics['accuracy']
278
+ acc_pct = acc_diff * 100
279
+ print(f" Accuracy: {acc_pct:+.2f} percentage points")
280
+
281
+ print("\n" + "=" * 80)
282
+ EOF
283
+ ```
284
+
285
+ ---
286
+
287
+ ## 4. 快速查看单个结果
288
+ ```bash
289
+ cd /workspace/hanrui/SpecForge-ext
290
+
291
+ # 查看 baseline 的 accept_length
292
+ cat results/baseline_mtbench_*.jsonl | jq '.mtbench[0].metrics[0].accept_length'
293
+ cat results/baseline_gsm8k_*.jsonl | jq '.gsm8k[0].metrics[0].accept_length'
294
+ cat results/baseline_humaneval_*.jsonl | jq '.humaneval[0].metrics[0].accept_length'
295
+
296
+ # 查看 trained 的 accept_length
297
+ cat results/trained_mtbench_*.jsonl | jq '.mtbench[0].metrics[0].accept_length'
298
+ cat results/trained_gsm8k_*.jsonl | jq '.gsm8k[0].metrics[0].accept_length'
299
+ cat results/trained_humaneval_*.jsonl | jq '.humaneval[0].metrics[0].accept_length'
300
+ ```
SpecForge/.editorconfig ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # https://editorconfig.org/
2
+
3
+ root = true
4
+
5
+ [*]
6
+ charset = utf-8
7
+ end_of_line = lf
8
+ indent_style = space
9
+ indent_size = 4
10
+ trim_trailing_whitespace = true
11
+ insert_final_newline = true
12
+
13
+ [*.{json,yaml,yml}]
14
+ indent_size = 2
15
+
16
+ [*.md]
17
+ indent_size = 2
18
+ x-soft-wrap-text = true
19
+
20
+ [*.rst]
21
+ indent_size = 4
22
+ x-soft-wrap-text = true
23
+
24
+ [Makefile]
25
+ indent_style = tab
SpecForge/.isort.cfg ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ [settings]
2
+ profile=black
3
+ known_first_party=sgl-eagle
SpecForge/.pre-commit-config.yaml ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ default_stages: [pre-commit, pre-push, manual]
2
+
3
+ repos:
4
+ - repo: https://github.com/PyCQA/autoflake
5
+ rev: v2.3.1
6
+ hooks:
7
+ - id: autoflake
8
+ args: [--remove-all-unused-imports, --in-place]
9
+ - repo: https://github.com/pre-commit/pre-commit-hooks
10
+ rev: v5.0.0
11
+ hooks:
12
+ - id: check-symlinks
13
+ - id: destroyed-symlinks
14
+ - id: trailing-whitespace
15
+ - id: end-of-file-fixer
16
+ - id: check-yaml
17
+ args: [--allow-multiple-documents]
18
+ - id: check-toml
19
+ - id: check-ast
20
+ - id: check-added-large-files
21
+ - id: check-merge-conflict
22
+ - id: check-shebang-scripts-are-executable
23
+ - id: detect-private-key
24
+ - id: debug-statements
25
+ - id: no-commit-to-branch
26
+ - repo: https://github.com/PyCQA/isort
27
+ rev: 5.13.2
28
+ hooks:
29
+ - id: isort
30
+ - repo: https://github.com/astral-sh/ruff-pre-commit
31
+ rev: v0.11.10
32
+ hooks:
33
+ - id: ruff
34
+ args: [--select=F401, --fixable=F401]
35
+ files: ^(benchmark/|docs/|examples/)
36
+ exclude: \.ipynb$
37
+ - repo: https://github.com/psf/black
38
+ rev: 24.10.0
39
+ hooks:
40
+ - id: black-jupyter
41
+ - repo: https://github.com/pre-commit/mirrors-clang-format
42
+ rev: v18.1.8
43
+ hooks:
44
+ - id: clang-format
45
+ types_or: [c++, cuda]
46
+ args: [--style=file, --verbose]
47
+ - repo: https://github.com/kynan/nbstripout
48
+ rev: 0.8.1
49
+ hooks:
50
+ - id: nbstripout
51
+ args:
52
+ - '--keep-output'
53
+ - '--extra-keys=metadata.kernelspec metadata.language_info.version'
SpecForge/LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2025 sgl-project
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
SpecForge/MANIFEST.in ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ include requirements.txt
2
+ include version.txt
SpecForge/README.md ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <div align="center" id="sglangtop">
2
+ <img src="./assets/logo.png" alt="logo" width="400" margin="10px"></img>
3
+
4
+ [![documentation](https://img.shields.io/badge/📖-Documentation-red.svg?style=flat)](https://docs.sglang.ai/SpecForge/)
5
+ [![SpecBundle](https://img.shields.io/badge/🤗%20SpecBundle-yellow.svg?style=flat)](https://huggingface.co/collections/lmsys/specbundle)
6
+ [![DeepWiki](https://img.shields.io/badge/DeepWiki-SpecForge-blue.svg?logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAACwAAAAyCAYAAAAnWDnqAAAAAXNSR0IArs4c6QAAA05JREFUaEPtmUtyEzEQhtWTQyQLHNak2AB7ZnyXZMEjXMGeK/AIi+QuHrMnbChYY7MIh8g01fJoopFb0uhhEqqcbWTp06/uv1saEDv4O3n3dV60RfP947Mm9/SQc0ICFQgzfc4CYZoTPAswgSJCCUJUnAAoRHOAUOcATwbmVLWdGoH//PB8mnKqScAhsD0kYP3j/Yt5LPQe2KvcXmGvRHcDnpxfL2zOYJ1mFwrryWTz0advv1Ut4CJgf5uhDuDj5eUcAUoahrdY/56ebRWeraTjMt/00Sh3UDtjgHtQNHwcRGOC98BJEAEymycmYcWwOprTgcB6VZ5JK5TAJ+fXGLBm3FDAmn6oPPjR4rKCAoJCal2eAiQp2x0vxTPB3ALO2CRkwmDy5WohzBDwSEFKRwPbknEggCPB/imwrycgxX2NzoMCHhPkDwqYMr9tRcP5qNrMZHkVnOjRMWwLCcr8ohBVb1OMjxLwGCvjTikrsBOiA6fNyCrm8V1rP93iVPpwaE+gO0SsWmPiXB+jikdf6SizrT5qKasx5j8ABbHpFTx+vFXp9EnYQmLx02h1QTTrl6eDqxLnGjporxl3NL3agEvXdT0WmEost648sQOYAeJS9Q7bfUVoMGnjo4AZdUMQku50McDcMWcBPvr0SzbTAFDfvJqwLzgxwATnCgnp4wDl6Aa+Ax283gghmj+vj7feE2KBBRMW3FzOpLOADl0Isb5587h/U4gGvkt5v60Z1VLG8BhYjbzRwyQZemwAd6cCR5/XFWLYZRIMpX39AR0tjaGGiGzLVyhse5C9RKC6ai42ppWPKiBagOvaYk8lO7DajerabOZP46Lby5wKjw1HCRx7p9sVMOWGzb/vA1hwiWc6jm3MvQDTogQkiqIhJV0nBQBTU+3okKCFDy9WwferkHjtxib7t3xIUQtHxnIwtx4mpg26/HfwVNVDb4oI9RHmx5WGelRVlrtiw43zboCLaxv46AZeB3IlTkwouebTr1y2NjSpHz68WNFjHvupy3q8TFn3Hos2IAk4Ju5dCo8B3wP7VPr/FGaKiG+T+v+TQqIrOqMTL1VdWV1DdmcbO8KXBz6esmYWYKPwDL5b5FA1a0hwapHiom0r/cKaoqr+27/XcrS5UwSMbQAAAABJRU5ErkJggg==)](https://deepwiki.com/sgl-project/SpecForge)
7
+
8
+ [![github badge](https://img.shields.io/badge/📃%20LMSYS-Blog-black.svg?style=flat)](https://lmsys.org/blog/2025-07-25-spec-forge/)
9
+ [![slack badge](https://img.shields.io/badge/Slack-join-blueviolet?logo=slack&amp)](https://sgl-fru7574.slack.com/archives/C09784E3EN6)
10
+ [![license](https://img.shields.io/badge/License-MIT%202.0-blue)](./LICENSE)
11
+
12
+ </div>
13
+
14
+ ## 📍 Overview
15
+
16
+ SpecForge is an ecosystem project developed by the SGLang team. It is a framework for training speculative decoding models so that you can smoothly port them over to the SGLang serving framework to speed up your inference.
17
+
18
+ We have seen many open-source projects for speculative decoding, but most of them are not well-maintained or not directly compatible with SGLang. We prepared this project because we wish that the open-source community can enjoy a speculative decoding framework that is
19
+ - regularly maintained by the SpecForge team: the code is runnable out-of-the-box
20
+ - directly compatible with SGLang: there is no additional efforts for porting to SGLang
21
+ - provide performant training capabilities: we provided online/offline/tensor-parallel/FSDP to suit your needs
22
+
23
+
24
+ Check out [**our documentation**](https://docs.sglang.ai/SpecForge/) to get started.
25
+
26
+
27
+ ## 🚀 Accelerate with SpecBundle
28
+
29
+ SpecBundle is a collection of production-grade speculative decoding models that are released by the SpecForge team and our industry partners. They provide higher acceptance rate compared to the existing open-source checkpoints over a wide range of domains. Together with SGLang, you can experience up to 4x speedup for inference. Check out our resources below:
30
+
31
+
32
+ | Item | Link |
33
+ | --- | --- |
34
+ | 📝 Documentation | [Link](https://docs.sglang.ai/SpecForge/community_resources/specbundle.html) |
35
+ | 📊 Performance Dashboard | [Link](https://docs.sglang.ai/SpecForge/SpecBundle/index.html) |
36
+ | 🤗 Hugging Face Collection | [Link](https://huggingface.co/collections/lmsys/specbundle) |
37
+
38
+
39
+ ## 🎉 News
40
+
41
+ - [2025-12] 🎉 Released SpecBundle (phase 1) and SpecForge v0.2. Check out our blog at [LMSYS.org](https://lmsys.org/blog/2025-12-23-spec-bundle-phase-1/)
42
+ - [2025-12] 🔔 Released the roadmap for 2026 Q1.
43
+ - [2025-08] 🔔 SpecForge is listed as a [flagship project](https://lmsys.org/about/) in LMSYS. Congratulations to the SpecForge team!
44
+ - [2025-08] 🔥 SpecForge powered the Eagle3 draft model for GPT-OSS. Check out the blog at [LMSYS.org](https://lmsys.org/blog/2025-08-27-gpt-oss/)
45
+ - [2025-07] 🔥 SpecForge is released together with Llama4-Eagle3 checkpoints. Check out our blog at [LMSYS.org](https://lmsys.org/blog/2025-07-25-spec-forge/)
46
+
47
+ ## ✨ Acknowledgements
48
+
49
+ <img src="./assets/acknowledgements.png" alt="acknowledgements"></img>
50
+
51
+ We would like to express our sincere gratitude to the official EAGLE team, especially Hongyang Zhang and Yuhui Li, for their invaluable contributions and support. Our thanks also go to the NVIDIA team—particularly Avery H and Izzy Putterman—and to the Google team, especially Ying Wang, for their insightful discussions and generous assistance throughout the project.
52
+
53
+ We are especially grateful to Meituan for their strong backing and meaningful contributions, which played a vital role in driving this project forward.
54
+
55
+ This project has also been inspired by many outstanding open-source projects from the LLM community, including [EAGLE](https://github.com/SafeAILab/EAGLE), [BaldEagle](https://github.com/NickL77/BaldEagle), and [TensorRT-Model-Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer) and others. Their contributions and shared knowledge have greatly benefited our work.
56
+
57
+ ## 💡 Special Thanks to Voltage Park
58
+
59
+ We would like to extend our sincere thanks to [Voltage Park](https://www.voltagepark.com/), our official infrastructure partner. As part of a formal collaboration with the SGLang team, Voltage Park provided critical GPU resources that empowered us to train and evaluate large-scale speculative decoding models efficiently and reliably. This partnership was instrumental in making SpecForge possible. We deeply appreciate Voltage Park’s mission to make cutting-edge AI infrastructure more accessible, and we look forward to continued collaboration as we push the boundaries of open-source LLM serving and optimization.
60
+
61
+ ## 📃 Citation
62
+
63
+ ```bibtex
64
+ @misc{specforge2025,
65
+ title={SpecForge: Train speculative decoding models effortlessly},
66
+ author={Shenggui Li, Yikai Zhu, Chao Wang, Fan Yin, Shuai Shi, Yubo Wang, Yi Zhang, Yingyi Huang, Haoshuai Zheng, Yineng Zhang},
67
+ year={2025},
68
+ publisher={GitHub},
69
+ howpublished={\url{https://github.com/sgl-project/specforge}},
70
+ }
SpecForge/pyproject.toml ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [build-system]
2
+ requires = ["setuptools>=61.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "specforge"
7
+ dynamic = ["version"]
8
+ readme = "README.md"
9
+ requires-python = ">=3.11"
10
+ description = "SpecForge: Speculative Decoding Training Framework"
11
+ authors = [{name = "SGLang Team"}]
12
+ urls = {Homepage = "https://github.com/sgl-project/SpecForge"}
13
+ dependencies = [
14
+ "pre-commit",
15
+ "torch==2.9.1",
16
+ "torchaudio==2.9.1",
17
+ "torchvision==0.24.1",
18
+ "transformers==4.57.1",
19
+ "qwen-vl-utils==0.0.11",
20
+ "datasets",
21
+ "setuptools",
22
+ "tqdm",
23
+ "wandb",
24
+ "psutil",
25
+ "numpy",
26
+ "accelerate",
27
+ "pydantic",
28
+ "sglang==0.5.9",
29
+ "openai-harmony",
30
+ "ninja",
31
+ "packaging",
32
+ "yunchang",
33
+ "tensorboard",
34
+ ]
35
+
36
+ [tool.setuptools.packages.find]
37
+ exclude = ["configs*", "scripts*", "tests*"]
38
+
39
+ [project.optional-dependencies]
40
+ dev = [
41
+ "pre-commit",
42
+ "pytest"
43
+ ]
44
+ fa = ["flash-attn"]
45
+
46
+ [tool.setuptools.dynamic]
47
+ version = {file = "version.txt"}
SpecForge/requirements-rocm.txt ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Use the PyTorch ROCm wheel index (choose the stream that matches your system)
2
+ --extra-index-url https://download.pytorch.org/whl/rocm6.3
3
+
4
+ pre-commit
5
+ torch==2.8.0+rocm6.3
6
+ torchaudio==2.8.0+rocm6.3
7
+ torchvision==0.23.0+rocm6.3
8
+ transformers==4.57.1
9
+ qwen-vl-utils==0.0.11
10
+ datasets
11
+ setuptools
12
+ tqdm
13
+ wandb
14
+ psutil
15
+ numpy
16
+ accelerate
17
+ pydantic
18
+ sglang[all]==0.5.4
19
+ openai-harmony
20
+ tensorboard
SpecForge/version.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ 0.2.0
idea1/.editorconfig ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # https://editorconfig.org/
2
+
3
+ root = true
4
+
5
+ [*]
6
+ charset = utf-8
7
+ end_of_line = lf
8
+ indent_style = space
9
+ indent_size = 4
10
+ trim_trailing_whitespace = true
11
+ insert_final_newline = true
12
+
13
+ [*.{json,yaml,yml}]
14
+ indent_size = 2
15
+
16
+ [*.md]
17
+ indent_size = 2
18
+ x-soft-wrap-text = true
19
+
20
+ [*.rst]
21
+ indent_size = 4
22
+ x-soft-wrap-text = true
23
+
24
+ [Makefile]
25
+ indent_style = tab
idea1/.isort.cfg ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ [settings]
2
+ profile=black
3
+ known_first_party=sgl-eagle
idea1/.pre-commit-config.yaml ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ default_stages: [pre-commit, pre-push, manual]
2
+
3
+ repos:
4
+ - repo: https://github.com/PyCQA/autoflake
5
+ rev: v2.3.1
6
+ hooks:
7
+ - id: autoflake
8
+ args: [--remove-all-unused-imports, --in-place]
9
+ - repo: https://github.com/pre-commit/pre-commit-hooks
10
+ rev: v5.0.0
11
+ hooks:
12
+ - id: check-symlinks
13
+ - id: destroyed-symlinks
14
+ - id: trailing-whitespace
15
+ - id: end-of-file-fixer
16
+ - id: check-yaml
17
+ args: [--allow-multiple-documents]
18
+ - id: check-toml
19
+ - id: check-ast
20
+ - id: check-added-large-files
21
+ - id: check-merge-conflict
22
+ - id: check-shebang-scripts-are-executable
23
+ - id: detect-private-key
24
+ - id: debug-statements
25
+ - id: no-commit-to-branch
26
+ - repo: https://github.com/PyCQA/isort
27
+ rev: 5.13.2
28
+ hooks:
29
+ - id: isort
30
+ - repo: https://github.com/astral-sh/ruff-pre-commit
31
+ rev: v0.11.10
32
+ hooks:
33
+ - id: ruff
34
+ args: [--select=F401, --fixable=F401]
35
+ files: ^(benchmark/|docs/|examples/)
36
+ exclude: \.ipynb$
37
+ - repo: https://github.com/psf/black
38
+ rev: 24.10.0
39
+ hooks:
40
+ - id: black-jupyter
41
+ - repo: https://github.com/pre-commit/mirrors-clang-format
42
+ rev: v18.1.8
43
+ hooks:
44
+ - id: clang-format
45
+ types_or: [c++, cuda]
46
+ args: [--style=file, --verbose]
47
+ - repo: https://github.com/kynan/nbstripout
48
+ rev: 0.8.1
49
+ hooks:
50
+ - id: nbstripout
51
+ args:
52
+ - '--keep-output'
53
+ - '--extra-keys=metadata.kernelspec metadata.language_info.version'
idea1/LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2025 sgl-project
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
idea1/requirements-rocm.txt ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Use the PyTorch ROCm wheel index (choose the stream that matches your system)
2
+ --extra-index-url https://download.pytorch.org/whl/rocm6.3
3
+
4
+ pre-commit
5
+ torch==2.8.0+rocm6.3
6
+ torchaudio==2.8.0+rocm6.3
7
+ torchvision==0.23.0+rocm6.3
8
+ transformers==4.57.1
9
+ qwen-vl-utils==0.0.11
10
+ datasets
11
+ setuptools
12
+ tqdm
13
+ wandb
14
+ psutil
15
+ numpy
16
+ accelerate
17
+ pydantic
18
+ sglang[all]==0.5.4
19
+ openai-harmony
20
+ tensorboard
idea1/version.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ 0.2.0
qwen3-8b_dflash_regen/.gitattributes ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ sharegpt_train_regenerated.jsonl filter=lfs diff=lfs merge=lfs -text
syxin/backup.log ADDED
The diff for this file is too large to render. See raw diff
 
syxin/dflash_lora_changelog.md ADDED
@@ -0,0 +1,232 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # DFlash LoRA 全部改动记录
2
+
3
+ ## 概述
4
+
5
+ 为了让 Qwen3-8B DFlash LoRA 训练在 2×H100 上跑通(解决 OOM),共新增/修改了 **5 个文件,1084 行代码**。改动分为两大阶段:基础搭建 + OOM 修复。
6
+
7
+ ---
8
+
9
+ ## 新增文件清单
10
+
11
+ | 文件 | 行数 | 用途 |
12
+ |------|------|------|
13
+ | `specforge/core/dflash_lora.py` | 453 | 训练 wrapper(OnlineDFlashLoRAModel) |
14
+ | `specforge/modeling/draft/dflash_lora.py` | 141 | LoRA draft 模型(DFlashLoRADraftModel) |
15
+ | `scripts/train_dflash_lora.py` | 449 | 训练入口脚本 |
16
+ | `scripts/run_train_dflash_lora.sh` | 31 | 启动 shell 脚本 |
17
+ | `configs/qwen3-8b-dflash-lora.json` | 10 | LoRA 配置文件 |
18
+
19
+ ---
20
+
21
+ ## Step 1 完成过程
22
+
23
+ ### 1.1 分析现有代码
24
+
25
+ 首先分析了非 LoRA 版 `train_dflash.py` 的完整流程:
26
+
27
+ ```
28
+ input_ids → target_model.generate_dflash_data() → hidden_states
29
+ → OnlineDFlashModel.forward():
30
+ 1. 截断到 block 边界
31
+ 2. prepare_noise_input(): anchor 保留,其余 → MASK
32
+ 3. embed_tokens(noise_input_ids) → noise_embedding
33
+ 4. 构建 DFlash attention mask
34
+ 5. draft_model(noise_embedding, target_hidden, mask)
35
+ 6. lm_head(hidden) → logits → CE loss
36
+ ```
37
+
38
+ 非 LoRA 版使用独立的小型 draft model + 冻结 target model 提取 hidden states。
39
+
40
+ ### 1.2 确定 LoRA 版设计差异
41
+
42
+ | 方面 | 非 LoRA 版 (`train_dflash.py`) | LoRA 版 (`train_dflash_lora.py`) |
43
+ |------|------|------|
44
+ | Draft model | 自定义小模型 (1-10 层) | Qwen3-8B + PEFT LoRA |
45
+ | Target model | 冻结大模型提取 hidden states | 无需 — 模型用自身表征 |
46
+ | Attention | 自定义 Qwen3DFlashAttention,KV = [ctx, noise] concat | 标准 HF attention + DFlash mask |
47
+ | KV 结构 | Q_LEN = noise_len, KV_LEN = 2×noise_len | Q_LEN = KV_LEN = seq_len |
48
+ | 可训练参数 | 全部 draft model 参数 | 仅 LoRA (q/k/v/o_proj) |
49
+
50
+ ### 1.3 新建 LoRA 版三个核心文件
51
+
52
+ #### `specforge/modeling/draft/dflash_lora.py` — DFlashLoRADraftModel
53
+
54
+ - `from_pretrained()`: 加载 Qwen3-8B,注入 PEFT LoRA,支持 `attn_implementation` 参数
55
+ - `forward()`: 标准 HF forward,支持 `output_hidden_states` 参数(chunked loss 需要)
56
+ - `get_lm_head()`: 穿透 PEFT 层级获取 lm_head 引用
57
+ - `gradient_checkpointing_enable()`: 代理到底层模型
58
+ - `save_pretrained()`: 仅保存 LoRA adapter 权重
59
+
60
+ #### `specforge/core/dflash_lora.py` — OnlineDFlashLoRAModel
61
+
62
+ - `prepare_noise_input()`: context 部分保持不变,block 部分只保留 anchor,其余替换为 MASK
63
+ - `build_dflash_full_attn_mask_fast()`: 向量化构建 4D additive mask `[bsz, 1, seq, seq]`
64
+ - `_compute_loss_weights()`: context + anchor 权重为 0,非 anchor 权重为 1(或 decay)
65
+ - `_full_lm_loss()`: 标准 CE loss 路径
66
+ - `_compute_accuracy()`: block-wise acceptance rate(累积正确预测长度 / block 非 anchor 长度)
67
+ - `forward()`: 完整训练 forward pass
68
+
69
+ LoRA 版 mask 规则:
70
+ - context token i → 因果注意力 (j ≤ i)
71
+ - block token i (属于 block b) → 所有 context + 同 block 内双向注意力
72
+
73
+ #### `scripts/train_dflash_lora.py` — 训练脚本
74
+
75
+ - 参数解析:model/lora/dataset/training/output/distributed/tracker 7 组参数
76
+ - `build_model()`: 加载模型 + 注入 LoRA + 包装 OnlineDFlashLoRAModel
77
+ - `build_dataloader()`: 复用 `build_eagle3_dataset` 和 `prepare_dp_dataloaders`
78
+ - FSDP 包装 + BF16Optimizer
79
+ - 训练循环:forward → backward → accumulation → optimizer step
80
+ - checkpoint 保存/恢复
81
+
82
+ ---
83
+
84
+ ## OOM 修复改动(4 项)
85
+
86
+ ### 改动 1: FSDP FULL_SHARD (ZeRO-3)
87
+
88
+ **问题**: `SHARD_GRAD_OP` (ZeRO-2) 每卡持有完整 Qwen3-8B 参数 (~16GB bf16)
89
+
90
+ **修复**: `train_dflash_lora.py:362`
91
+ ```python
92
+ # 之前
93
+ sharding_strategy=ShardingStrategy.SHARD_GRAD_OP
94
+ # 之后
95
+ sharding_strategy=ShardingStrategy.FULL_SHARD
96
+ ```
97
+
98
+ **效果**: 参数跨卡分片,每卡省 ~8-12GB
99
+
100
+ ### 改动 2: batch_size=1 + accumulation_steps=8
101
+
102
+ **问题**: `batch_size=2` 时峰值显存过高
103
+
104
+ **修复**: `run_train_dflash_lora.sh`
105
+ ```bash
106
+ --batch-size 1 \
107
+ --accumulation-steps 8 \
108
+ ```
109
+
110
+ **效果**: 等效 global batch size 不变,峰值显存减半
111
+
112
+ ### 改动 3: flex_attention + BlockMask 替换 4D additive mask
113
+
114
+ **问题**: SDPA 不支持 4D additive mask → fallback 到 math backend → 每层 materialize 完整 `[bsz, 32heads, 2048, 2048]` attention scores
115
+
116
+ **修复**: 从非 LoRA 版 `dflash.py` 移植 `_get_or_create_block_mask()` 方法,适配 LoRA 场景
117
+
118
+ 涉及文件:
119
+
120
+ 1. **`specforge/core/dflash_lora.py`**
121
+ - `__init__()`: 添加 `attention_backend` 参数(默认 `"flex_attention"`),BlockMask 缓存字段
122
+ - 新增 `_get_or_create_block_mask()`: 用 `create_block_mask()` 构建零显存的 BlockMask
123
+ - `forward()`: 根据 `attention_backend` 选择 BlockMask 或 additive mask
124
+
125
+ 2. **`specforge/modeling/draft/dflash_lora.py`**
126
+ - `from_pretrained()`: 当 backend 为 flex_attention 时,传 `attn_implementation="flex_attention"` 给 HuggingFace
127
+
128
+ 3. **`scripts/train_dflash_lora.py`**
129
+ - `parse_args()`: `--attention-backend` 参数 (`flex_attention` | `additive`)
130
+ - `build_model()`: 根据 backend 选择 `attn_implementation`
131
+
132
+ BlockMask mask function(LoRA 版):
133
+ ```python
134
+ def dflash_lora_mask_fn(b, h, q_idx, kv_idx):
135
+ # Context query: 标准因果
136
+ is_q_ctx = q_idx < context_len
137
+ ctx_visible = is_q_ctx & (kv_idx <= q_idx)
138
+
139
+ # Block query: 全部 context + 同 block 双向
140
+ is_q_block = q_idx >= context_len
141
+ is_k_ctx = kv_idx < context_len
142
+ q_block_id = (q_idx - context_len) // block_size
143
+ k_block_id = (kv_idx - context_len) // block_size
144
+ block_attend_ctx = is_q_block & is_k_ctx
145
+ block_attend_same = is_q_block & (~is_k_ctx) & (q_block_id == k_block_id)
146
+
147
+ return ctx_visible | (block_attend_ctx | block_attend_same)
148
+ ```
149
+
150
+ **验证**: 手动逐元素对比 BlockMask 和 additive mask 输出,三组测试 (context_len=4/0, seq=12/16/64) pattern 完全一致。
151
+
152
+ **效果**: 不再 fallback 到 SDPA math backend,省去 `[bsz, heads, seq, seq]` attention scores 显存
153
+
154
+ ### 改动 4: chunked cross-entropy loss
155
+
156
+ **问题**: `[bsz, 2048, 151936]` bf16 logits ≈ 1.18GB,加梯度 ~2.4GB+
157
+
158
+ **修复**: 从非 LoRA 版 `dflash.py:419-478` 移植 chunked loss
159
+
160
+ 涉及文件:
161
+
162
+ 1. **`specforge/core/dflash_lora.py`**
163
+ - `__init__()`: 添加 `lm_head_chunk_size` 参数(默认 0 = 不启用)
164
+ - 新增 `_chunked_lm_loss()`: 分 chunk 过 lm_head + CE loss + gradient checkpointing
165
+ - 提取 `_full_lm_loss()`: 原始非 chunked 路径
166
+ - `forward()`: `lm_head_chunk_size > 0` 时走 chunked 路径
167
+
168
+ 2. **`specforge/modeling/draft/dflash_lora.py`**
169
+ - `forward()`: 新增 `output_hidden_states` 参数,True 时返回 last hidden state 而非 logits
170
+ - `get_lm_head()`: 穿透 PEFT 层级返回 `base_model.lm_head` 引用
171
+
172
+ 3. **`scripts/train_dflash_lora.py`**
173
+ - `parse_args()`: `--lm-head-chunk-size` 参数(默认 0,推荐 256)
174
+ - `build_model()`: 传递到 OnlineDFlashLoRAModel
175
+
176
+ Chunked loss 核心逻辑:
177
+ ```python
178
+ # 分 chunk 计算,每 chunk 用 gradient checkpointing(backward 时重算 logits,不存储)
179
+ for start in range(0, effective_len, chunk_size):
180
+ end = min(start + chunk_size, effective_len)
181
+ chunk_loss, chunk_weight = grad_checkpoint(
182
+ _chunk_ce, # lm_head + CE
183
+ hidden[:, start:end, :], # 只取当前 chunk
184
+ input_ids[:, start:end],
185
+ combined_mask[:, start:end],
186
+ use_reentrant=False,
187
+ )
188
+ total_loss += chunk_loss
189
+ total_weight += chunk_weight
190
+ loss = total_loss / total_weight
191
+ ```
192
+
193
+ **效果**: logits 峰值显存从 `O(seq_len × vocab_size)` 降至 `O(chunk_size × vocab_size)`,256 chunk → ~150MB vs 1.18GB
194
+
195
+ ---
196
+
197
+ ## 当前训练命令
198
+
199
+ ```bash
200
+ bash run_train_dflash_lora.sh 2 # 2 = GPU 数量
201
+ ```
202
+
203
+ 对应完整参数:
204
+ ```bash
205
+ torchrun --nproc_per_node 2 scripts/train_dflash_lora.py \
206
+ --model-path /workspace/Qwen3-8B \
207
+ --train-data-path /workspace/hanrui/datasets/Nemotron-CodeAlpaca-qwen3-8b-800K \
208
+ --output-dir outputs/qwen3-8b-dflash-lora \
209
+ --lora-config configs/qwen3-8b-dflash-lora.json \
210
+ --block-size 16 \
211
+ --max-length 2048 \
212
+ --batch-size 1 \
213
+ --num-epochs 3 \
214
+ --learning-rate 2e-4 \
215
+ --accumulation-steps 8 \
216
+ --loss-decay-gamma 7 \
217
+ --attention-backend flex_attention \
218
+ --lm-head-chunk-size 256 \
219
+ --gradient-checkpointing \
220
+ --chat-template qwen \
221
+ --log-interval 50 \
222
+ --save-interval 500
223
+ ```
224
+
225
+ ---
226
+
227
+ ## 待验证
228
+
229
+ - [ ] 跑 `bash run_train_dflash_lora.sh 2` 确认不再 OOM
230
+ - [ ] 确认无 SDPA math fallback warning
231
+ - [ ] 观察 GPU 显存峰值
232
+ - [ ] 确认 loss 下降和 accuracy 上升趋势正常
syxin/eval_accepted_length.md ADDED
@@ -0,0 +1,217 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # DFlash-LoRA-Inject 评测:Accepted Length & Accuracy
2
+
3
+ ## 为什么不能用 sglang 在线评测?
4
+
5
+ DFlash-LoRA-Inject 的推理需要**逐层注入 target 模型的 hidden states** 到 draft 模型中,
6
+ 这是 LoRA-Inject 训练时的核心机制。但 sglang 不支持这种推理模式:
7
+
8
+ | sglang 算法 | 问题 |
9
+ |---|---|
10
+ | `STANDALONE` | 把 draft 当独立自回归模型跑,**完全忽略 layer injection**。merged 模型 ≈ 原始 Qwen3-8B,accept_length 恒 ≈ 4.7,跟 LoRA 训没训没关系 |
11
+ | `DFLASH` | 期望 DFlash-b16 架构(5 层 + fc + hidden_norm),跟 LoRA-Inject(36 层全模型)结构不匹配 |
12
+
13
+ 因此必须**离线评测**:加载 target + draft 两个模型,手动实现带 layer injection 的 speculative decoding 循环。
14
+
15
+ ---
16
+
17
+ ## 基本信息
18
+
19
+ | 项目 | 路径 / 值 |
20
+ |---|---|
21
+ | conda 环境 | `spec` |
22
+ | 基座模型(target) | `/workspace/models/Qwen3-8B` |
23
+ | 训练输出(最终 ckpt) | `.../outputs/qwen3-8b-dflash-lora-inject/epoch_3_step_1400` |
24
+ | 合并后 draft 模型 | `.../outputs/qwen3-8b-dflash-lora-inject-merged` |
25
+ | 评测脚本 | `/workspace/hanrui/syxin_old/eval_dflash_lora_inject.py` |
26
+ | 本地数据集 | `/workspace/hanrui/datasets/{humaneval,mtbench,gsm8k}` |
27
+ | 结果输出目录 | `/workspace/hanrui/syxin_old/Specforge/benchmarks/results/` |
28
+ | GPU | 8 × H100 80GB(单卡即可,需 ~32GB 加载两个 8B 模型) |
29
+
30
+ ---
31
+
32
+ ## Step 1:合并 LoRA 权重
33
+
34
+ LoRA-Inject 训练只保存 adapter 权重,评测时需要完整模型。
35
+
36
+ ```bash
37
+ conda activate spec
38
+
39
+ python3 -c "
40
+ from peft import PeftModel
41
+ from transformers import AutoModelForCausalLM, AutoTokenizer
42
+ import torch, os
43
+
44
+ BASE = '/workspace/models/Qwen3-8B'
45
+ ADAPTER = '/workspace/hanrui/syxin_old/Specforge/outputs/qwen3-8b-dflash-lora-inject/epoch_3_step_1400'
46
+ MERGED = '/workspace/hanrui/syxin_old/Specforge/outputs/qwen3-8b-dflash-lora-inject-merged'
47
+
48
+ if os.path.exists(MERGED):
49
+ print(f'[skip] Merged model already exists: {MERGED}')
50
+ else:
51
+ print('[1/4] Loading base model to CPU ...')
52
+ model = AutoModelForCausalLM.from_pretrained(BASE, torch_dtype=torch.bfloat16, device_map='cpu')
53
+ print('[2/4] Loading LoRA adapter ...')
54
+ model = PeftModel.from_pretrained(model, ADAPTER)
55
+ print('[3/4] Merging weights ...')
56
+ model = model.merge_and_unload()
57
+ print('[4/4] Saving merged model ...')
58
+ os.makedirs(MERGED, exist_ok=True)
59
+ model.save_pretrained(MERGED, safe_serialization=True)
60
+ AutoTokenizer.from_pretrained(BASE).save_pretrained(MERGED)
61
+ print(f'Done. Merged model saved to: {MERGED}')
62
+ "
63
+ ```
64
+
65
+ > 耗时约 3–5 分钟,CPU 内存占用 ≈ 16 GB。已存在则自动跳过。
66
+
67
+ ---
68
+
69
+ ## Step 2:离线评测 accepted length
70
+
71
+ **不需要启动 sglang server**,直接跑:
72
+
73
+ ### 全部 Bench(推荐)
74
+
75
+ ```bash
76
+ bash /workspace/hanrui/syxin_old/run_bench_dflash.sh
77
+ ```
78
+
79
+ ### 单独跑 / 快速测试
80
+
81
+ ```bash
82
+ # 只跑 HumanEval
83
+ bash /workspace/hanrui/syxin_old/run_bench_dflash.sh humaneval
84
+
85
+ # 快速测试(每个 bench 20 条)
86
+ bash /workspace/hanrui/syxin_old/run_bench_dflash.sh --quick
87
+
88
+ # 指定 checkpoint
89
+ bash /workspace/hanrui/syxin_old/run_bench_dflash.sh --ckpt epoch_0_step_1000
90
+
91
+ # 组合
92
+ bash /workspace/hanrui/syxin_old/run_bench_dflash.sh humaneval gsm8k --quick
93
+ ```
94
+
95
+ ### 或者直接调 Python
96
+
97
+ ```bash
98
+ conda activate spec
99
+
100
+ python3 /workspace/hanrui/syxin_old/eval_dflash_lora_inject.py \
101
+ --benchmarks humaneval mtbench gsm8k \
102
+ --block-size 16 \
103
+ --max-new-tokens 512 \
104
+ --temperature 0.0
105
+ ```
106
+
107
+ ---
108
+
109
+ ## 结果文件说明
110
+
111
+ 结果保存在 `results/` 下,文件名示例:
112
+ ```
113
+ dflash_lora_inject_offline_epoch_3_step_1400_20260314_150000.json
114
+ ```
115
+
116
+ ```json
117
+ {
118
+ "model": "dflash-lora-inject/epoch_3_step_1400",
119
+ "block_size": 16,
120
+ "humaneval": {
121
+ "avg_accept_length": 3.42,
122
+ "total_tokens": 28500,
123
+ "latency": 120.5,
124
+ "throughput": 236.5,
125
+ "num_samples": 164,
126
+ "num_verify_rounds": 8320
127
+ },
128
+ "mtbench": { ... },
129
+ "gsm8k": { ... }
130
+ }
131
+ ```
132
+
133
+ | 字段 | 含义 |
134
+ |---|---|
135
+ | `avg_accept_length` | **核心指标**:平均每次 verify 接受的 token 数(含 injection)。越高越好,`1.0` = draft 完全无效 |
136
+ | `total_tokens` | 总生成 token 数 |
137
+ | `throughput` | tokens/s(离线评测,不含 batching 优化) |
138
+ | `num_verify_rounds` | 总验证轮数 |
139
+
140
+ ---
141
+
142
+ ## 对比 baseline
143
+
144
+ 对比未经 LoRA 训练的原始 Qwen3-8B 当 draft 的 accept_length:
145
+
146
+ ```bash
147
+ python3 /workspace/hanrui/syxin_old/eval_dflash_lora_inject.py \
148
+ --merged-path /workspace/models/Qwen3-8B \
149
+ --benchmarks humaneval mtbench gsm8k \
150
+ --num-samples 50
151
+ ```
152
+
153
+ > 这会用原始 Qwen3-8B 同时当 target 和 draft(带 injection),
154
+ > 对比 LoRA 训练前后 accept_length 是否有提升。
155
+
156
+ ---
157
+
158
+ ## 如何测其他 checkpoint
159
+
160
+ ```bash
161
+ # 方法 1:直接加载 adapter(自动 merge,不保存)
162
+ python3 /workspace/hanrui/syxin_old/eval_dflash_lora_inject.py \
163
+ --ckpt epoch_0_step_1000 \
164
+ --benchmarks humaneval --num-samples 50
165
+
166
+ # 方法 2:预先 merge 到不同目录
167
+ python3 -c "
168
+ from peft import PeftModel
169
+ from transformers import AutoModelForCausalLM, AutoTokenizer
170
+ import torch, os
171
+ BASE = '/workspace/models/Qwen3-8B'
172
+ ADAPTER = '/workspace/hanrui/syxin_old/Specforge/outputs/qwen3-8b-dflash-lora-inject/epoch_0_step_1000'
173
+ MERGED = '/workspace/hanrui/syxin_old/Specforge/outputs/qwen3-8b-dflash-lora-inject-merged-epoch_0_step_1000'
174
+ model = AutoModelForCausalLM.from_pretrained(BASE, torch_dtype=torch.bfloat16, device_map='cpu')
175
+ model = PeftModel.from_pretrained(model, ADAPTER).merge_and_unload()
176
+ os.makedirs(MERGED, exist_ok=True)
177
+ model.save_pretrained(MERGED, safe_serialization=True)
178
+ AutoTokenizer.from_pretrained(BASE).save_pretrained(MERGED)
179
+ "
180
+
181
+ python3 /workspace/hanrui/syxin_old/eval_dflash_lora_inject.py \
182
+ --merged-path .../qwen3-8b-dflash-lora-inject-merged-epoch_0_step_1000 \
183
+ --benchmarks humaneval --num-samples 50
184
+ ```
185
+
186
+ 可用 checkpoint:`epoch_0_step_500` / `epoch_0_step_1000` / `epoch_0_step_1400` / `epoch_2_step_34500` / `epoch_2_step_35000` / `epoch_3_step_1400`
187
+
188
+ ---
189
+
190
+ ## 常见问题
191
+
192
+ ### Q1:accept_length 和 STANDALONE 模式下差不多(都 ≈ 4.7)
193
+
194
+ 这说明 layer injection 没有真正起作用。检查:
195
+ - 评测脚本确实用的是 `eval_dflash_lora_inject.py`(离线),不是 sglang bench
196
+ - merged 模型确实是 LoRA-Inject 版本(不是原始 Qwen3-8B)
197
+
198
+ ### Q2:OOM(单卡放不下两个 8B 模型)
199
+
200
+ 两个 bf16 的 Qwen3-8B ≈ 32GB,单卡 H100 80GB 够用。如果 OOM:
201
+ - 检查是否有其他进程占用显存
202
+ - 减小 `--max-new-tokens`(试 256)
203
+ - 减小 `--num-samples`
204
+
205
+ ### Q3:数据集下载失败(无外网)
206
+
207
+ 评测脚本优先读本地文件:
208
+
209
+ | bench | 本地文件 |
210
+ |---|---|
211
+ | GSM8K | `/workspace/hanrui/datasets/gsm8k/test.jsonl` |
212
+ | MT-Bench | `/workspace/hanrui/datasets/mtbench/question.jsonl` |
213
+ | HumanEval | `/workspace/hanrui/datasets/humaneval/test.jsonl` |
214
+
215
+ ---
216
+
217
+ *基座:`/workspace/models/Qwen3-8B` | 最终 ckpt:`epoch_3_step_1400` | block_size:16*
syxin/eval_dflash_b16_baseline.py ADDED
@@ -0,0 +1,354 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Offline evaluation for DFlash-b16 baseline: measure accepted length.
4
+ 8 GPUs parallel, each GPU loads target + draft independently.
5
+
6
+ Usage:
7
+ # 8 GPUs
8
+ torchrun --nproc_per_node 8 eval_dflash_b16_baseline.py
9
+
10
+ # quick test
11
+ torchrun --nproc_per_node 8 eval_dflash_b16_baseline.py --num-samples 20
12
+
13
+ # single GPU
14
+ python3 eval_dflash_b16_baseline.py --benchmarks humaneval
15
+ """
16
+ import argparse
17
+ import json
18
+ import os
19
+ import sys
20
+ import time
21
+ from typing import List, Optional, Tuple
22
+
23
+ import torch
24
+ import torch.nn as nn
25
+ import torch.distributed as dist
26
+ from tqdm import tqdm
27
+ from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, DynamicCache
28
+
29
+ # Add DFlash model path so we can import utils
30
+ sys.path.insert(0, "/workspace/models/Qwen3-8B-DFlash-b16")
31
+ from utils import extract_context_feature, sample
32
+
33
+ # ──────────────────────────────────────────────────────────────────
34
+ BASE_MODEL = "/workspace/models/Qwen3-8B"
35
+ DRAFT_MODEL = "/workspace/models/Qwen3-8B-DFlash-b16"
36
+ RESULT_DIR = "/workspace/hanrui/syxin_old/Specforge/benchmarks/results"
37
+
38
+
39
+ # ──────────────────────────────────────────────────────────────────
40
+ # Distributed helpers
41
+ # ──────────────────────────────────────────────────────────────────
42
+ def is_distributed():
43
+ return dist.is_available() and dist.is_initialized()
44
+
45
+ def get_rank():
46
+ return dist.get_rank() if is_distributed() else 0
47
+
48
+ def get_world_size():
49
+ return dist.get_world_size() if is_distributed() else 1
50
+
51
+ def is_main():
52
+ return get_rank() == 0
53
+
54
+ def print_rank0(*args, **kwargs):
55
+ if is_main():
56
+ print(*args, **kwargs)
57
+
58
+ def split_list(lst, rank, world_size):
59
+ return [x for i, x in enumerate(lst) if i % world_size == rank]
60
+
61
+
62
+ # ──────────────────────────────────────────────────────────────────
63
+ # Prompts
64
+ # ──────────────────────────────────────────────────────────────────
65
def load_prompts(bench_name: str, num_samples: Optional[int] = None) -> List[str]:
    """Load evaluation prompts for one benchmark.

    Prefers the local JSONL copies under ``/workspace/hanrui/datasets`` (the
    cluster has no external network); falls back to downloading via the
    HuggingFace ``datasets`` library when the local file is missing.

    Args:
        bench_name: One of ``"humaneval"``, ``"mtbench"``, ``"gsm8k"``.
        num_samples: Optional cap on the number of prompts returned.

    Returns:
        List of prompt strings, truncated to ``num_samples`` when given.
        An unrecognized ``bench_name`` with no local file yields an empty list.
    """
    local_paths = {
        "humaneval": "/workspace/hanrui/datasets/humaneval/test.jsonl",
        "mtbench": "/workspace/hanrui/datasets/mtbench/question.jsonl",
        "gsm8k": "/workspace/hanrui/datasets/gsm8k/test.jsonl",
    }
    prompts = []
    path = local_paths.get(bench_name)
    if path and os.path.exists(path):
        # Explicit encoding: prompts contain non-ASCII text and the platform
        # default encoding is not guaranteed to be UTF-8.
        with open(path, encoding="utf-8") as f:
            for line in f:
                # Tolerate blank/trailing lines in hand-maintained JSONL files
                # (json.loads("") would raise).
                if not line.strip():
                    continue
                item = json.loads(line)
                if bench_name == "humaneval":
                    p = f"Write a solution to the following problem and make sure that it passes the tests:\n```python\n{item['prompt']}\n```"
                elif bench_name == "mtbench":
                    p = item.get("turns", [item.get("prompt", "")])[0]
                elif bench_name == "gsm8k":
                    p = item["question"] + "\nPlease reason step by step, and put your final answer within \\boxed{}."
                else:
                    p = str(item)
                prompts.append(p)
    else:
        # Online fallback; requires network access and the `datasets` package.
        from datasets import load_dataset
        if bench_name == "humaneval":
            ds = load_dataset("openai/openai_humaneval", split="test")
            prompts = [f"Write a solution to the following problem and make sure that it passes the tests:\n```python\n{x['prompt']}\n```" for x in ds]
        elif bench_name == "mtbench":
            ds = load_dataset("HuggingFaceH4/mt_bench_prompts", split="train")
            prompts = [x["prompt"][0] for x in ds]
        elif bench_name == "gsm8k":
            ds = load_dataset("openai/gsm8k", "main", split="test")
            prompts = [x["question"] + "\nPlease reason step by step, and put your final answer within \\boxed{}." for x in ds]
    if num_samples is not None:
        prompts = prompts[:num_samples]
    return prompts
100
+
101
+
102
+ # ──────────────────────────────────────────────────────────────────
103
+ # spec_generate with acceptance_lengths returned
104
+ # (Same logic as DFlashDraftModel.spec_generate but returns accept lens)
105
+ # ──────────────────────────────────────────────────────────────────
106
+ @torch.inference_mode()
107
+ def spec_generate_b16(
108
+ draft_model,
109
+ target_model: nn.Module,
110
+ input_ids: torch.LongTensor,
111
+ max_new_tokens: int = 512,
112
+ temperature: float = 0.0,
113
+ stop_token_ids: Optional[List[int]] = None,
114
+ ) -> Tuple[torch.Tensor, List[int]]:
115
+ """Same as DFlashDraftModel.spec_generate but also returns acceptance_lengths."""
116
+ draft_model.eval()
117
+ device = target_model.device if hasattr(target_model, 'device') else input_ids.device
118
+ num_input_tokens = input_ids.shape[1]
119
+ max_length = num_input_tokens + max_new_tokens
120
+ block_size = draft_model.block_size
121
+ mask_token_id = draft_model.mask_token_id
122
+
123
+ output_ids = torch.full(
124
+ (1, max_length + block_size), mask_token_id,
125
+ dtype=torch.long, device=device,
126
+ )
127
+ position_ids = torch.arange(output_ids.shape[1], device=device).unsqueeze(0)
128
+
129
+ past_key_values_target = DynamicCache()
130
+ past_key_values_draft = DynamicCache()
131
+
132
+ # Prefill
133
+ output = target_model(
134
+ input_ids,
135
+ position_ids=position_ids[:, :num_input_tokens],
136
+ past_key_values=past_key_values_target,
137
+ use_cache=True,
138
+ logits_to_keep=1,
139
+ output_hidden_states=True,
140
+ )
141
+ output_ids[:, :num_input_tokens] = input_ids
142
+ output_ids[:, num_input_tokens:num_input_tokens + 1] = sample(output.logits, temperature)
143
+ target_hidden = extract_context_feature(output.hidden_states, draft_model.target_layer_ids)
144
+
145
+ # Decode
146
+ acceptance_lengths = []
147
+ start = num_input_tokens
148
+ while start < max_length:
149
+ block_output_ids = output_ids[:, start:start + block_size].clone()
150
+ block_position_ids = position_ids[:, start:start + block_size]
151
+ noise_embedding = target_model.model.embed_tokens(block_output_ids)
152
+
153
+ draft_logits = target_model.lm_head(
154
+ draft_model(
155
+ target_hidden=target_hidden,
156
+ noise_embedding=noise_embedding,
157
+ position_ids=position_ids[:, past_key_values_draft.get_seq_length():start + block_size],
158
+ past_key_values=past_key_values_draft,
159
+ use_cache=True,
160
+ is_causal=False,
161
+ )[:, -block_size + 1:, :]
162
+ )
163
+ past_key_values_draft.crop(start)
164
+ block_output_ids[:, 1:] = sample(draft_logits)
165
+
166
+ output = target_model(
167
+ block_output_ids,
168
+ position_ids=block_position_ids,
169
+ past_key_values=past_key_values_target,
170
+ use_cache=True,
171
+ output_hidden_states=True,
172
+ )
173
+
174
+ posterior = sample(output.logits, temperature)
175
+ acceptance_length = (
176
+ (block_output_ids[:, 1:] == posterior[:, :-1])
177
+ .cumprod(dim=1).sum(dim=1)[0].item()
178
+ )
179
+ output_ids[:, start:start + int(acceptance_length) + 1] = block_output_ids[:, :int(acceptance_length) + 1]
180
+ output_ids[:, start + int(acceptance_length) + 1] = posterior[:, int(acceptance_length)]
181
+ start += int(acceptance_length) + 1
182
+ past_key_values_target.crop(start)
183
+ target_hidden = extract_context_feature(
184
+ output.hidden_states, draft_model.target_layer_ids
185
+ )[:, :int(acceptance_length) + 1, :]
186
+ acceptance_lengths.append(int(acceptance_length) + 1)
187
+
188
+ if stop_token_ids is not None and any(
189
+ sid in output_ids[:, num_input_tokens:start] for sid in stop_token_ids
190
+ ):
191
+ break
192
+
193
+ output_ids = output_ids[:, :max_length]
194
+ output_ids = output_ids[:, output_ids[0] != mask_token_id]
195
+ if stop_token_ids is not None:
196
+ stop_t = torch.tensor(stop_token_ids, device=output_ids.device)
197
+ stop_idx = torch.isin(output_ids[0][num_input_tokens:], stop_t).nonzero(as_tuple=True)[0]
198
+ if stop_idx.numel() > 0:
199
+ output_ids = output_ids[:, :num_input_tokens + stop_idx[0] + 1]
200
+
201
+ return output_ids, acceptance_lengths
202
+
203
+
204
+ # ──────────────────────────────────────────────────────────────────
205
def parse_args():
    """Build and parse the CLI options for the DFlash-b16 baseline eval."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--base-model", default=BASE_MODEL)
    parser.add_argument("--draft-model", default=DRAFT_MODEL)
    parser.add_argument("--max-new-tokens", type=int, default=512)
    parser.add_argument("--temperature", type=float, default=0.0)
    parser.add_argument("--benchmarks", nargs="+", default=["humaneval", "mtbench", "gsm8k"])
    parser.add_argument("--num-samples", type=int, default=None)
    parser.add_argument("--output-dir", default=RESULT_DIR)
    return parser.parse_args()
215
+
216
+
217
def main():
    """Entry point: run the DFlash-b16 baseline over the selected benchmarks.

    Expects torchrun-style env vars (LOCAL_RANK / WORLD_SIZE) for multi-GPU
    runs; falls back to a single-GPU run when they are absent. Per-GPU
    accept-length/token counts are all-reduced before reporting; rank 0
    writes a JSON result file.
    """
    args = parse_args()

    local_rank = int(os.environ.get("LOCAL_RANK", 0))
    world_size = int(os.environ.get("WORLD_SIZE", 1))

    # NCCL process group only when launched with more than one process.
    if world_size > 1:
        dist.init_process_group(backend="nccl")
        torch.cuda.set_device(local_rank)

    device = f"cuda:{local_rank}"
    rank = get_rank()  # NOTE(review): helper defined earlier in this file — assumed to return global rank

    print_rank0(f"Running DFlash-b16 baseline on {world_size} GPU(s)")

    # ── Load models ──
    print_rank0(f"Loading target: {args.base_model}")
    target_model = AutoModelForCausalLM.from_pretrained(
        args.base_model,
        torch_dtype=torch.bfloat16,
        device_map=device,
        trust_remote_code=True,
    )
    target_model.eval()

    print_rank0(f"Loading DFlash-b16 draft: {args.draft_model}")
    draft_model = AutoModel.from_pretrained(
        args.draft_model,
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
    ).to(device)
    draft_model.eval()

    tokenizer = AutoTokenizer.from_pretrained(args.base_model, trust_remote_code=True)
    stop_token_ids = [tokenizer.eos_token_id]

    print_rank0(f"DFlash-b16: block_size={draft_model.block_size}, "
                f"target_layer_ids={draft_model.target_layer_ids}, "
                f"num_layers={len(draft_model.layers)}")

    # ── Run benchmarks ──
    results = {"model": "Qwen3-8B-DFlash-b16", "type": "baseline",
               "block_size": draft_model.block_size}

    for bench_name in args.benchmarks:
        print_rank0(f"\n{'='*60}")
        print_rank0(f"Benchmark: {bench_name} ({world_size} GPUs)")
        print_rank0(f"{'='*60}")

        # Shard prompts across ranks; load_prompts/split_list are defined
        # earlier in this file (not visible here).
        all_prompts = load_prompts(bench_name, args.num_samples)
        my_prompts = split_list(all_prompts, rank, world_size)
        print_rank0(f"Total {len(all_prompts)} prompts, ~{len(my_prompts)} per GPU")

        local_accept_lengths = []  # accepted tokens per verify round, this rank only
        local_tokens = 0           # generated-token count, this rank only
        t0 = time.time()

        # Progress bar only on rank 0 to keep multi-GPU logs readable.
        iterator = tqdm(my_prompts, desc=f"[GPU{rank}] {bench_name}", unit="sample",
                        disable=(rank != 0))
        for prompt in iterator:
            messages = [{"role": "user", "content": prompt}]
            text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
            input_ids = tokenizer(text, return_tensors="pt").input_ids.to(device)

            output_ids, accept_lens = spec_generate_b16(
                draft_model=draft_model,
                target_model=target_model,
                input_ids=input_ids,
                max_new_tokens=args.max_new_tokens,
                temperature=args.temperature,
                stop_token_ids=stop_token_ids,
            )

            local_accept_lengths.extend(accept_lens)
            num_gen = output_ids.shape[1] - input_ids.shape[1]
            local_tokens += num_gen

            # Live running average in the progress bar (rank 0 only).
            if rank == 0 and len(local_accept_lengths) > 0:
                avg = sum(local_accept_lengths) / len(local_accept_lengths)
                iterator.set_postfix(accept_len=f"{avg:.2f}", tokens=local_tokens, gen=num_gen)

        elapsed = time.time() - t0

        # ── Gather ──
        # All-reduce scalar sums so every rank ends up with the global totals.
        if world_size > 1:
            local_sum = torch.tensor(sum(local_accept_lengths), dtype=torch.float64, device=device)
            local_count = torch.tensor(len(local_accept_lengths), dtype=torch.long, device=device)
            local_tok = torch.tensor(local_tokens, dtype=torch.long, device=device)
            dist.all_reduce(local_sum, op=dist.ReduceOp.SUM)
            dist.all_reduce(local_count, op=dist.ReduceOp.SUM)
            dist.all_reduce(local_tok, op=dist.ReduceOp.SUM)
            total_accept_sum = local_sum.item()
            total_count = local_count.item()
            total_tokens = local_tok.item()
        else:
            total_accept_sum = sum(local_accept_lengths)
            total_count = len(local_accept_lengths)
            total_tokens = local_tokens

        # max(..., 1) guards against a zero-round division.
        avg_accept_length = total_accept_sum / max(total_count, 1)
        # NOTE(review): "throughput" here divides the GLOBAL token count by this
        # rank's wall time, i.e. it is an aggregate across GPUs as printed below.
        throughput = total_tokens / elapsed if elapsed > 0 else 0

        print_rank0(f"\n{bench_name} Results:")
        print_rank0(f"  Avg Accept Length: {avg_accept_length:.3f}")
        print_rank0(f"  Total tokens: {total_tokens}")
        print_rank0(f"  Latency: {elapsed:.1f}s")
        print_rank0(f"  Throughput: {throughput:.1f} tok/s (aggregate {world_size} GPUs)")
        print_rank0(f"  Num verify rounds: {total_count}")
        print_rank0(f"  Num samples: {len(all_prompts)}")

        results[bench_name] = {
            "avg_accept_length": avg_accept_length,
            "total_tokens": total_tokens,
            "latency": elapsed,
            "throughput": throughput,
            "num_samples": len(all_prompts),
            "num_verify_rounds": total_count,
            "num_gpus": world_size,
        }

    # ── Save ──
    # Only the main process writes the timestamped JSON report.
    if is_main():
        os.makedirs(args.output_dir, exist_ok=True)
        timestamp = time.strftime("%Y%m%d_%H%M%S")
        result_file = os.path.join(
            args.output_dir,
            f"dflash_b16_baseline_offline_{timestamp}.json",
        )
        with open(result_file, "w") as f:
            json.dump(results, f, indent=2)
        print(f"\nResults saved to: {result_file}")

    if world_size > 1:
        dist.destroy_process_group()


if __name__ == "__main__":
    main()
syxin/eval_dflash_lora_inject.py ADDED
@@ -0,0 +1,627 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Offline evaluation for DFlash-LoRA-Inject: measure accepted length & speedup.
4
+ Aligned with official DFlash benchmark.py methodology.
5
+
6
+ Unlike DFlash-b16 which uses a small 5-layer draft model with fc/hidden_norm,
7
+ LoRA-Inject uses a full Qwen3-8B with LoRA adapters that receives target hidden
8
+ states via layer-by-layer injection.
9
+
10
+ Usage:
11
+ conda activate spec
12
+
13
+ # 8 GPU parallel (default, all 10 benchmarks)
14
+ torchrun --nproc_per_node 8 eval_dflash_lora_inject.py
15
+
16
+ # single GPU
17
+ python3 eval_dflash_lora_inject.py
18
+
19
+ # specific checkpoint / benchmark
20
+ torchrun --nproc_per_node 8 eval_dflash_lora_inject.py --ckpt epoch_0_step_1000 --datasets humaneval
21
+
22
+ # quick test
23
+ torchrun --nproc_per_node 8 eval_dflash_lora_inject.py --max-samples 20
24
+ """
25
+ import argparse
26
+ import json
27
+ import os
28
+ import random
29
+ import sys
30
+ import time
31
+ import warnings
32
+ from itertools import chain
33
+ from types import SimpleNamespace
34
+ from typing import List, Optional, Tuple
35
+
36
+ import numpy as np
37
+ import torch
38
+ import torch.nn as nn
39
+ import torch.distributed as dist
40
+ from peft import PeftModel
41
+ from tqdm import tqdm
42
+ from transformers import AutoModelForCausalLM, AutoTokenizer, DynamicCache
43
+
44
+ # Import official dataset loader
45
+ sys.path.insert(0, "/workspace/hanrui/dflash")
46
+ from model.utils import load_and_process_dataset
47
+
48
+ # ──────────────────────────────────────────────────────────────────
49
+ # Config defaults
50
+ # ──────────────────────────────────────────────────────────────────
51
# Paths and defaults for the LoRA-Inject evaluation; all local to this cluster.
BASE_MODEL = "/workspace/models/Qwen3-8B"
ADAPTER_ROOT = "/workspace/hanrui/syxin/Specforge/outputs/qwen3-8b-dflash-lora-inject"
DEFAULT_CKPT = "epoch_3_step_1400"
MASK_TOKEN_ID = 151669  # Qwen3 <|mask|> — placeholder id for not-yet-generated positions
BLOCK_SIZE = 16  # speculative block width (draft proposes block_size-1 tokens per round)
RESULT_DIR = "/workspace/hanrui/syxin/Specforge/benchmarks/results"

# Official benchmark tasks (from run_benchmark.sh)
# Maps task name -> number of samples evaluated per task.
OFFICIAL_TASKS = {
    "gsm8k": 128,
    "math500": 128,
    "aime24": 30,
    "aime25": 30,
    "humaneval": 164,
    "mbpp": 128,
    "livecodebench": 128,
    "swe-bench": 128,
    "mt-bench": 80,
    "alpaca": 128,
}
71
+
72
+
73
+ # ──────────────────────────────────────────────────────────────────
74
+ # CUDA-synchronised timer (matches official benchmark.py)
75
+ # ──────────────────────────────────────────────────────────────────
76
def cuda_time() -> float:
    """Timestamp taken only after every queued CUDA kernel has finished.

    Ensures timing deltas measure GPU work, not just async launch time
    (matches the official benchmark.py timer).
    """
    torch.cuda.synchronize()
    timestamp = time.perf_counter()
    return timestamp
79
+
80
+
81
def has_flash_attn() -> bool:
    """Return True when the ``flash_attn`` package is importable.

    On failure a one-line warning is printed so the caller's fallback to
    SDPA attention is visible in the logs.
    """
    try:
        import flash_attn  # noqa: F401
    except ImportError:
        print("[WARN] flash_attn not installed, falling back to sdpa.")
        return False
    return True
88
+
89
+
90
+ # ──────────────────────────────────────────────────────────────────
91
+ # Distributed helpers (mirrors official distributed.py)
92
+ # ──────────────────────────────────────────────────────────────────
93
def dist_init():
    """Initialise the NCCL process group when launched under torchrun.

    A plain (non-torchrun) invocation has no RANK in the environment; in
    that case we warn and run single-process instead of raising.
    """
    if "RANK" in os.environ:
        dist.init_process_group(backend="nccl", init_method="env://")
        return
    warnings.warn("RANK not set. Skipping distributed init.")
98
+
99
def _env_int(name, default):
    # Shared helper: read an integer rank/size variable set by torchrun,
    # falling back when running single-process.
    return int(os.environ.get(name, default))

def dist_rank():
    """Global rank of this process (0 when not distributed)."""
    return _env_int("RANK", 0)

def dist_size():
    """Total number of processes (1 when not distributed)."""
    return _env_int("WORLD_SIZE", 1)

def dist_local_rank():
    """Rank of this process on its node (0 when not distributed)."""
    return _env_int("LOCAL_RANK", 0)

def dist_is_main():
    """True for the single process that should log and write results."""
    return dist_rank() == 0
110
+
111
def dist_gather(obj, dst=0):
    """Gather one picklable object per rank onto rank ``dst``.

    Returns the list of all ranks' objects on the main rank, None on the
    others. When torch.distributed was never initialised, behaves as a
    single-rank gather and returns ``[obj]``.
    """
    if not dist.is_initialized():
        return [obj]
    if not dist_is_main():
        dist.gather_object(obj, dst=dst)
        return None
    collected = [None] * dist_size()
    dist.gather_object(obj, collected, dst=dst)
    return collected
121
+
122
def print_rank0(*args, **kwargs):
    """print() that is a no-op on every rank except the main one."""
    if not dist_is_main():
        return
    print(*args, **kwargs)
125
+
126
+
127
+ # ──────────────────────────────────────────────────────────────────
128
+ # Sampling (matches official model/utils.py::sample)
129
+ # ──────────────────────────────────────────────────────────────────
130
def sample(logits: torch.Tensor, temperature: float = 0.0) -> torch.Tensor:
    """Sample token ids from ``logits`` of shape (batch, seq, vocab).

    Greedy argmax when temperature is (near) zero, otherwise multinomial
    sampling from the temperature-scaled softmax. Returns a LongTensor of
    shape (batch, seq). Matches official model/utils.py::sample.
    """
    # Treat tiny temperatures as exact greedy decoding.
    if temperature < 1e-5:
        return torch.argmax(logits, dim=-1)
    batch, length, vocab = logits.shape
    flat = logits.reshape(-1, vocab) / temperature
    distribution = torch.softmax(flat, dim=-1)
    drawn = torch.multinomial(distribution, num_samples=1)
    return drawn.view(batch, length)
138
+
139
+
140
+ # ──────────────────────────────────────────────────────────────────
141
+ # Build DFlash attention mask (vectorized, no Python loops)
142
+ # ──────────────────────────────────────────────────────────────────
143
+ def build_dflash_mask(ctx_len: int, block_size: int, device, dtype=torch.bfloat16):
144
+ """
145
+ Build DFlash attention mask for [context | block] sequence.
146
+ - Context part: standard causal
147
+ - Block part: each token sees all context + all tokens in same block (bidirectional)
148
+ """
149
+ full_len = ctx_len + block_size
150
+ neg_inf = torch.finfo(dtype).min
151
+
152
+ mask = torch.full((1, 1, full_len, full_len), neg_inf, device=device, dtype=dtype)
153
+
154
+ if ctx_len > 0:
155
+ ctx_rows = torch.arange(ctx_len, device=device)
156
+ ctx_cols = torch.arange(ctx_len, device=device)
157
+ causal = ctx_cols.unsqueeze(0) <= ctx_rows.unsqueeze(1)
158
+ mask[0, 0, :ctx_len, :ctx_len].masked_fill_(causal, 0)
159
+
160
+ if ctx_len > 0:
161
+ mask[0, 0, ctx_len:, :ctx_len] = 0
162
+ mask[0, 0, ctx_len:, ctx_len:] = 0
163
+
164
+ return mask
165
+
166
+
167
+ # ──────────────────────────────────────────────────────────────────
168
+ # Pure autoregressive generation (target model only, no draft)
169
+ # Used for AR baseline timing -- avoids inflating AR time with draft overhead.
170
+ # ──────────────────────────────────────────────────────────────────
171
@torch.inference_mode()
def ar_generate(
    target_model: nn.Module,
    input_ids: torch.LongTensor,
    max_new_tokens: int = 2048,
    mask_token_id: int = MASK_TOKEN_ID,
    temperature: float = 0.0,
    stop_token_ids: Optional[List[int]] = None,
) -> SimpleNamespace:
    """
    Pure autoregressive generation using only the target model.
    Mirrors official benchmark.py with block_size=1 (no draft model involved).
    Returns SimpleNamespace matching official dflash_generate output format.

    Assumes batch size 1 (input_ids is (1, seq)); the post-processing below
    indexes row 0 only. Timing uses cuda_time(), so this path requires CUDA.
    """
    device = input_ids.device
    num_input_tokens = input_ids.shape[1]
    max_length = num_input_tokens + max_new_tokens

    # Output buffer pre-filled with the mask id; unwritten positions are
    # stripped at the end. +1 slot so the final sampled token always fits.
    output_ids = torch.full(
        (1, max_length + 1), mask_token_id,
        dtype=torch.long, device=device,
    )
    output_ids[:, :num_input_tokens] = input_ids
    position_ids = torch.arange(output_ids.shape[1], device=device).unsqueeze(0)
    past_key_values = DynamicCache()

    # Prefill
    prefill_start = cuda_time()
    output = target_model(
        input_ids,
        position_ids=position_ids[:, :num_input_tokens],
        past_key_values=past_key_values,
        use_cache=True,
        logits_to_keep=1,  # only the last position's logits are needed
        output_hidden_states=False,
    )
    first_token = sample(output.logits, temperature)
    output_ids[:, num_input_tokens:num_input_tokens + 1] = first_token
    time_to_first_token = cuda_time() - prefill_start

    # Decode (autoregressive, one token at a time)
    decode_start = cuda_time()
    start = num_input_tokens

    while start < max_length:
        cur_token = output_ids[:, start:start + 1]
        cur_pos = position_ids[:, start:start + 1]

        output = target_model(
            cur_token,
            position_ids=cur_pos,
            past_key_values=past_key_values,
            use_cache=True,
            output_hidden_states=False,
        )

        next_token = sample(output.logits, temperature)
        start += 1
        output_ids[:, start:start + 1] = next_token
        # Keep the cache length in lockstep with `start` (mirrors the
        # speculative path, where crop actually truncates).
        past_key_values.crop(start)

        # Check stop tokens (matches official: check all generated)
        if stop_token_ids is not None and any(
            sid in output_ids[:, num_input_tokens:] for sid in stop_token_ids
        ):
            break

    # Drop the spare slot, then remove any remaining mask placeholders.
    output_ids = output_ids[:, :max_length]
    output_ids = output_ids[:, output_ids[0] != mask_token_id]
    # Truncate at the first stop token among the generated positions.
    if stop_token_ids is not None:
        stop_t = torch.tensor(stop_token_ids, device=output_ids.device)
        stop_idx = torch.isin(output_ids[0][num_input_tokens:], stop_t).nonzero(as_tuple=True)[0]
        if stop_idx.numel() > 0:
            output_ids = output_ids[:, :num_input_tokens + stop_idx[0] + 1]

    num_output_tokens = output_ids.shape[1] - num_input_tokens
    total_decode_time = cuda_time() - decode_start
    time_per_output_token = total_decode_time / max(num_output_tokens, 1)

    return SimpleNamespace(
        output_ids=output_ids,
        num_input_tokens=num_input_tokens,
        num_output_tokens=num_output_tokens,
        time_to_first_token=time_to_first_token,
        time_per_output_token=time_per_output_token,
        acceptance_lengths=[1] * max(num_output_tokens, 0),  # AR: always 1
    )
258
+
259
+
260
+ # ──────────────────────────────────────────────────────────────────
261
+ # Core: spec_generate with layer-by-layer injection (KV-cached)
262
+ # ──────────────────────────────────────────────────────────────────
263
@torch.inference_mode()
def spec_generate_inject(
    target_model: nn.Module,
    draft_model: nn.Module,
    input_ids: torch.LongTensor,
    max_new_tokens: int = 2048,
    block_size: int = 16,
    mask_token_id: int = MASK_TOKEN_ID,
    temperature: float = 0.0,
    stop_token_ids: Optional[List[int]] = None,
) -> SimpleNamespace:
    """
    Speculative generation using DFlash-LoRA-Inject inference pattern.
    Returns SimpleNamespace matching official dflash_generate output format.

    Per round: the draft (LoRA-merged full model) fills a block of mask
    tokens in one bidirectional pass, conditioning each of its layers on the
    target model's per-layer hidden states for the accepted context
    ("layer-by-layer injection"); the target then verifies the block in a
    single forward. Assumes batch size 1; requires CUDA (cuda_time).
    Assumes draft and target share the same layer count — TODO confirm.
    """
    device = input_ids.device
    num_input_tokens = input_ids.shape[1]
    max_length = num_input_tokens + max_new_tokens

    draft_layers = draft_model.model.layers
    draft_norm = draft_model.model.norm
    draft_lm_head = draft_model.lm_head
    rotary_emb = draft_model.model.rotary_emb
    num_layers = len(draft_layers)

    # Buffer padded by block_size so a full block plus the bonus token can
    # always be written near max_length without bounds checks.
    output_ids = torch.full(
        (1, max_length + block_size), mask_token_id,
        dtype=torch.long, device=device,
    )
    output_ids[:, :num_input_tokens] = input_ids

    # ── Prefill: target with KV cache + hidden states ──
    prefill_start = cuda_time()
    target_kv = DynamicCache()
    target_output = target_model(
        input_ids,
        past_key_values=target_kv,
        use_cache=True,
        output_hidden_states=True,
    )
    first_token = sample(target_output.logits[:, -1:, :], temperature)
    output_ids[:, num_input_tokens] = first_token.squeeze()

    # hidden_states[0] is the embedding output; index i+1 is the output of
    # decoder layer i — that is what gets injected into draft layer i.
    ctx_hidden_per_layer = [
        target_output.hidden_states[i + 1]
        for i in range(num_layers)
    ]

    time_to_first_token = cuda_time() - prefill_start

    # Decode
    decode_start = cuda_time()
    acceptance_lengths = []
    start = num_input_tokens
    draft_prefill = True

    while start < max_length:
        # Last block may be shorter than block_size.
        end = min(start + block_size, max_length)
        actual_block_size = end - start

        # Slot 0 holds the already-committed token; the rest are mask ids.
        block_ids = output_ids[:, start:end].clone()

        # ── Draft: forward with layer-by-layer injection ──
        draft_hidden = draft_model.model.embed_tokens(block_ids)
        ctx_len = ctx_hidden_per_layer[0].shape[1]

        # Causal over context, bidirectional within the block.
        dflash_mask = build_dflash_mask(ctx_len, actual_block_size, device)
        combined_pos = torch.arange(ctx_len + actual_block_size, device=device).unsqueeze(0)

        # rotary_emb only reads dtype/device from its first arg, so an
        # uninitialised tensor of the right shape is sufficient here.
        dummy_combined = torch.empty(1, ctx_len + actual_block_size, draft_hidden.shape[-1],
                                     device=device, dtype=torch.bfloat16)
        position_embeddings = rotary_emb(dummy_combined, combined_pos)

        for layer_idx in range(num_layers):
            # Prepend the target's layer-idx context hiddens, run the draft
            # layer over [context | block], keep only the block outputs.
            target_ctx = ctx_hidden_per_layer[layer_idx]
            combined = torch.cat([target_ctx, draft_hidden], dim=1)

            layer_output = draft_layers[layer_idx](
                combined,
                attention_mask=dflash_mask,
                position_ids=combined_pos,
                position_embeddings=position_embeddings,
            )
            if isinstance(layer_output, tuple):
                layer_output = layer_output[0]
            draft_hidden = layer_output[:, ctx_len:, :]

        draft_hidden = draft_norm(draft_hidden)
        draft_logits = draft_lm_head(draft_hidden)

        # Position i predicts token i+1, so the draft fills slots 1..end.
        draft_predictions = sample(draft_logits[:, :-1, :], temperature)
        block_ids[:, 1:actual_block_size] = draft_predictions[:, :actual_block_size - 1]

        # Exclude draft's first prefill from decode timing (matches official pattern)
        if draft_prefill:
            draft_prefill = False
            decode_start = cuda_time()

        # ── Verify: target forward on block tokens (with KV cache) ──
        position_ids_block = torch.arange(
            start, start + actual_block_size, device=device
        ).unsqueeze(0)

        target_verify = target_model(
            block_ids,
            position_ids=position_ids_block,
            past_key_values=target_kv,
            use_cache=True,
            output_hidden_states=True,
        )
        target_tokens = sample(target_verify.logits, temperature)

        # Acceptance
        # Longest prefix of draft tokens that matches the target's own choices.
        matches = (block_ids[:, 1:actual_block_size] == target_tokens[:, :actual_block_size - 1])
        acceptance_length = int(matches.cumprod(dim=1).sum(dim=1)[0].item())

        # Commit accepted prefix, plus the target's "bonus" token after it.
        output_ids[:, start:start + acceptance_length + 1] = block_ids[:, :acceptance_length + 1]
        output_ids[:, start + acceptance_length + 1] = target_tokens[:, acceptance_length]

        # Roll the target KV cache back to the accepted length.
        accepted_end = start + acceptance_length + 1
        target_kv.crop(accepted_end)

        # Extend the injected context with the accepted positions' hiddens.
        for i in range(num_layers):
            new_hidden = target_verify.hidden_states[i + 1][:, :acceptance_length + 1, :]
            ctx_hidden_per_layer[i] = torch.cat([ctx_hidden_per_layer[i], new_hidden], dim=1)

        start += acceptance_length + 1
        acceptance_lengths.append(acceptance_length + 1)

        # Official: check ALL generated tokens
        if stop_token_ids is not None and any(
            sid in output_ids[:, num_input_tokens:] for sid in stop_token_ids
        ):
            break

    # Trim to generated length, drop remaining mask placeholders, then
    # truncate at the first stop token among the generated positions.
    output_ids = output_ids[:, :min(start, max_length)]
    output_ids = output_ids[:, output_ids[0] != mask_token_id]
    if stop_token_ids is not None:
        stop_t = torch.tensor(stop_token_ids, device=output_ids.device)
        stop_idx = torch.isin(output_ids[0][num_input_tokens:], stop_t).nonzero(as_tuple=True)[0]
        if stop_idx.numel() > 0:
            output_ids = output_ids[:, :num_input_tokens + stop_idx[0] + 1]

    num_output_tokens = output_ids.shape[1] - num_input_tokens
    total_decode_time = cuda_time() - decode_start
    time_per_output_token = total_decode_time / max(num_output_tokens, 1)

    return SimpleNamespace(
        output_ids=output_ids,
        num_input_tokens=num_input_tokens,
        num_output_tokens=num_output_tokens,
        time_to_first_token=time_to_first_token,
        time_per_output_token=time_per_output_token,
        acceptance_lengths=acceptance_lengths,
    )
418
+
419
+
420
+ # ──────────────────────────────────────────────────────────────────
421
+ # Main
422
+ # ──────────────────────────────────────────────────────────────────
423
def parse_args():
    """Parse CLI options for the offline LoRA-Inject evaluation."""
    parser = argparse.ArgumentParser(description="Offline eval for DFlash-LoRA-Inject (aligned with official)")
    parser.add_argument("--base-model", default=BASE_MODEL)
    parser.add_argument("--adapter-root", default=ADAPTER_ROOT)
    parser.add_argument("--ckpt", default=DEFAULT_CKPT, help="Checkpoint folder name")
    parser.add_argument(
        "--merged-path",
        default="/workspace/hanrui/syxin/Specforge/outputs/qwen3-8b-dflash-lora-inject-merged",
        help="Path to pre-merged model. If None, will merge on the fly.",
    )
    parser.add_argument("--block-size", type=int, default=BLOCK_SIZE)
    parser.add_argument(
        "--max-new-tokens", type=int, default=2048,
        help="Max new tokens per turn (official shell uses 2048)",
    )
    parser.add_argument("--temperature", type=float, default=0.0)
    parser.add_argument(
        "--datasets", nargs="+", default=list(OFFICIAL_TASKS.keys()),
        help="Benchmarks to run (default: all 10 official tasks)",
    )
    parser.add_argument(
        "--max-samples", type=int, default=None,
        help="Override max samples per dataset (None = use official per-task counts)",
    )
    parser.add_argument("--output-dir", default=RESULT_DIR)
    return parser.parse_args()
441
+
442
+
443
def main():
    """Entry point: benchmark DFlash-LoRA-Inject against an AR baseline.

    For every sample we run both pure autoregressive decoding and the
    speculative inject path on the same input, so per-dataset speedup is
    t_AR / t_spec under identical conditions. Multi-GPU sharding follows
    the official benchmark.py (round-robin indices, gather to rank 0).
    """
    args = parse_args()

    # Fix random seeds (matches official)
    random.seed(0)
    np.random.seed(0)
    torch.manual_seed(0)
    torch.cuda.manual_seed_all(0)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # ── Init distributed ──
    dist_init()
    torch.cuda.set_device(dist_local_rank())
    device = torch.device(f"cuda:{dist_local_rank()}")

    print_rank0(f"Running on {dist_size()} GPU(s)")

    # Detect flash_attn (only for target model; draft needs sdpa for custom DFlash mask)
    installed_flash_attn = has_flash_attn()
    target_attn_impl = "flash_attention_2" if installed_flash_attn else "sdpa"
    draft_attn_impl = "sdpa"  # DFlash injection uses custom attention mask
    print_rank0(f"Using attn_implementation: target={target_attn_impl}, draft={draft_attn_impl}")

    # ── Load models ──
    print_rank0(f"Loading target model: {args.base_model}")
    target_model = AutoModelForCausalLM.from_pretrained(
        args.base_model,
        torch_dtype=torch.bfloat16,
        attn_implementation=target_attn_impl,
        device_map=device,
        trust_remote_code=True,
    )
    target_model.eval()

    # Draft: prefer a pre-merged checkpoint; otherwise merge LoRA on the fly.
    if args.merged_path and os.path.isdir(args.merged_path):
        print_rank0(f"Loading pre-merged draft model: {args.merged_path}")
        draft_model = AutoModelForCausalLM.from_pretrained(
            args.merged_path,
            torch_dtype=torch.bfloat16,
            attn_implementation=draft_attn_impl,
            device_map=device,
            trust_remote_code=True,
        )
    else:
        adapter_path = os.path.join(args.adapter_root, args.ckpt)
        print_rank0(f"Loading base + LoRA adapter: {adapter_path}")
        draft_model = AutoModelForCausalLM.from_pretrained(
            args.base_model,
            torch_dtype=torch.bfloat16,
            attn_implementation=draft_attn_impl,
            device_map=device,
            trust_remote_code=True,
        )
        draft_model = PeftModel.from_pretrained(draft_model, adapter_path)
        # Fold LoRA weights into the base so inference has no adapter overhead.
        draft_model = draft_model.merge_and_unload()
    draft_model.eval()

    tokenizer = AutoTokenizer.from_pretrained(args.base_model, trust_remote_code=True)
    stop_token_ids = [tokenizer.eos_token_id]

    block_size = args.block_size

    # ── Run benchmarks ──
    all_results = {"model": f"dflash-lora-inject/{args.ckpt}", "block_size": block_size}

    for dataset_name in args.datasets:
        print_rank0(f"\n{'=' * 60}")
        print_rank0(f"Benchmark: {dataset_name} ({dist_size()} GPUs)")
        print_rank0(f"{'=' * 60}")

        # Load dataset using official loader
        dataset = load_and_process_dataset(dataset_name)

        # Sample selection: official uses shuffle(seed=0).select()
        max_samples = args.max_samples if args.max_samples is not None else OFFICIAL_TASKS.get(dataset_name)
        if max_samples is not None and len(dataset) > max_samples:
            dataset = dataset.shuffle(seed=0).select(range(max_samples))

        print_rank0(f"Total {len(dataset)} samples, distributed across {dist_size()} GPUs")

        responses = []
        # Round-robin sharding: rank r handles indices r, r+world, r+2*world, ...
        indices = range(dist_rank(), len(dataset), dist_size())

        iterator = tqdm(indices, desc=f"[GPU{dist_rank()}] {dataset_name}",
                        unit="sample", disable=not dist_is_main())

        for idx in iterator:
            instance = dataset[idx]

            # Multi-turn support (matches official benchmark.py)
            messages = []
            for turn_index, user_content in enumerate(instance["turns"]):
                messages.append({"role": "user", "content": user_content})
                input_text = tokenizer.apply_chat_template(
                    messages, tokenize=False, add_generation_prompt=True,
                    enable_thinking=False,
                )
                input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)

                # Keyed by "block size": 1 = AR baseline, block_size = speculative.
                response = {}

                # AR baseline: pure target-only autoregressive (no draft overhead)
                response[1] = ar_generate(
                    target_model=target_model,
                    input_ids=input_ids,
                    max_new_tokens=args.max_new_tokens,
                    mask_token_id=MASK_TOKEN_ID,
                    temperature=args.temperature,
                    stop_token_ids=stop_token_ids,
                )

                # Speculative: DFlash-LoRA-Inject
                response[block_size] = spec_generate_inject(
                    target_model=target_model,
                    draft_model=draft_model,
                    input_ids=input_ids,
                    max_new_tokens=args.max_new_tokens,
                    block_size=block_size,
                    mask_token_id=MASK_TOKEN_ID,
                    temperature=args.temperature,
                    stop_token_ids=stop_token_ids,
                )

                # Append assistant response for multi-turn context
                spec_response = response[block_size]
                generated_ids = spec_response.output_ids[0, spec_response.num_input_tokens:]
                output_text = tokenizer.decode(generated_ids, skip_special_tokens=True)
                messages.append({"role": "assistant", "content": output_text})
                responses.append(response)

            # Rolling accept-length over the last few responses for the bar.
            if dist_is_main() and responses:
                recent_tau = np.mean([np.mean(r[block_size].acceptance_lengths) for r in responses[-5:]])
                iterator.set_postfix(accept_len=f"{recent_tau:.2f}")

        # ── Gather to rank 0 (matches official) ──
        if dist_size() > 1:
            gathered = dist_gather(responses, dst=0)
            if not dist_is_main():
                continue
            responses = list(chain(*gathered))
        elif not dist_is_main():
            continue

        # ── Compute metrics (exact official formulas) ──
        t1 = np.mean([r[1].time_per_output_token for r in responses])
        tb = np.mean([r[block_size].time_per_output_token for r in responses])
        speedup = t1 / tb if tb > 0 else 0

        # Acceptance length: per-sample mean, then mean of means (official)
        tau = np.mean([np.mean(r[block_size].acceptance_lengths) for r in responses])

        # Histogram
        acceptance_lengths = list(chain(*[r[block_size].acceptance_lengths for r in responses]))
        histogram = [acceptance_lengths.count(b) / len(acceptance_lengths) for b in range(block_size + 1)]

        print_rank0(f"\n{dataset_name} Results:")
        print_rank0(f"  Decoding speedup: {speedup:.2f}x")
        print_rank0(f"  Average Acceptance length: {tau:.2f}")
        print_rank0(f"  Acceptance length histogram: {[f'{x * 100:.1f}%' for x in histogram]}")
        print_rank0(f"  Num responses: {len(responses)}")

        all_results[dataset_name] = {
            "decoding_speedup": speedup,
            "avg_accept_length": tau,
            "acceptance_histogram": histogram,
            "num_responses": len(responses),
            "num_gpus": dist_size(),
        }

    # ── Save results ──
    # Rank 0 writes one timestamped JSON covering all requested datasets.
    if dist_is_main():
        os.makedirs(args.output_dir, exist_ok=True)
        timestamp = time.strftime("%Y%m%d_%H%M%S")
        result_file = os.path.join(
            args.output_dir,
            f"dflash_lora_inject_offline_{args.ckpt}_{timestamp}.json",
        )
        with open(result_file, "w") as f:
            json.dump(all_results, f, indent=2)
        print(f"\nResults saved to: {result_file}")


if __name__ == "__main__":
    main()
syxin/idea.md ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 现在关于target model的hidden state注入
2
+
3
+ dflash的做法是,抽5层的feature过一下fc然后concat到mask token对应的hidden state前面
4
+
5
+ 但是如果我们的draft是用lora的原始模型
6
+
7
+ 我们不用这样注入
8
+
9
+ 我们可以直接把target model的hidden state直接层对层拉过来
10
+
11
+ 我是把加了lora后的模型作为draft model用的
12
+
13
+ 它本质上还是一个speculative decode
14
+
15
+ 我的想法的核心是,因为这个draft model足够大,也和target model足够像,把他转为和dflash一样每次用mask直接生成16个token,可能能得到很长的accept len,以此获得加速
16
+
17
+ 而dflash能work的核心是,它在生成阶段是使用的部分target model的hidden state,注入到mask token的hidden state前面
18
+
19
+ 我们也用相同的做法
20
+
21
+ 带lora的模型,lora只负责让它能并行解码16个mask token,但是前面的上下文信息,依然用原始model跑出来的,通过注入放进draft的时候
22
+
23
+ 而且由于模型结构的一致,我们可以直接层对层注入进去
syxin/launch_train.sh ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Launch single-node (8 GPU) DFlash-LoRA-inject training via torchrun.
set -euo pipefail

cd /workspace/hanrui/syxin/Specforge

# Cache locations and allocator tuning shared by every rank.
export TORCHINDUCTOR_CACHE_DIR=/workspace/hanrui/cache/compiled_kernels
export SPECFORGE_DATA_NUM_PROC=16
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export PYTORCH_ALLOC_CONF=expandable_segments:True
export HF_DATASETS_CACHE=/workspace/hanrui/cache/hf_datasets
export HF_HOME=/workspace/hanrui/cache/hf_home

# Keep the training arguments in one array for readability.
TRAIN_ARGS=(
    --target-model-path /workspace/models/Qwen3-8B
    --target-model-backend hf
    --train-data-path /workspace/hanrui/datasets/Nemotron-CodeAlpaca-qwen3-8b-800K
    --output-dir outputs/qwen3-8b-sft-32gpu-v2
    --block-size 16
    --attention-backend additive
    --attn-implementation sdpa
    --max-length 2048
    --batch-size 4
    --accumulation-steps 8
    --num-epochs 3
    --learning-rate 5e-5
    --loss-decay-gamma 7
    --gradient-checkpointing
    --chat-template qwen
    --log-interval 50
    --save-interval 500
    --cache-dir /workspace/hanrui/cache
    --lora-rank 32
    --lora-alpha 64
    --lora-dropout 0.1
    --trust-remote-code
    --dataloader-num-workers 0
)

torchrun --nproc_per_node=8 scripts/train_dflash_lora_inject.py "${TRAIN_ARGS[@]}"
syxin/launch_train_wrapper.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""
Python wrapper to launch bash training script via torchrun
"""
import os
import subprocess
import sys


def _launch() -> int:
    """Run the multinode training bash script, forwarding all CLI args.

    Returns the bash script's exit code so the caller can propagate it.
    """
    bash_script = "/workspace/hanrui/syxin/run_train_multinode.sh"
    command = ["bash", bash_script, *sys.argv[1:]]
    # Inherit the full environment (torchrun/northjob distributed vars).
    completed = subprocess.run(command, env=os.environ.copy())
    return completed.returncode


if __name__ == "__main__":
    sys.exit(_launch())
syxin/list.md ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ### 1. `train_dflash_lora.py`
2
+ * 加了lora,原来是调用小模型,现在是hidden states+lora预测。
3
+ * `dflash_lora_mask_fn`函数是在处理预测的那一块草稿Block时,可以同时看到这一块里的所有词。
4
+
5
+ ### 2. OOM优化
6
+ * 分片策略ZeRO-3,FSDP切分从`SHARD_GRAD_OP`升级到`FULL_SHARD`。
7
+ * `batch-size=1`,`accumulation-steps=8`。
8
+ * 参考之前的代码用了FlexAttention(`dflash_lora_mask_fn`)。
9
+ * `_chunked_lm_loss()`,把算loss切片成256块来算+梯度检查。
10
+
11
+ ### 运行
12
+ * bash /workspace/hanrui/junquan/SpecForge/scripts/run_train_dflash_lora.sh 2
syxin/merge_lora.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Step 1: Merge DFlash-LoRA adapter into base model.
3
+ Usage:
4
+ conda activate sglang
5
+ python3 merge_lora.py
6
+ python3 merge_lora.py --ckpt epoch_2_step_15000 # 测其他 checkpoint
7
+ """
8
+ import argparse
9
+ import os
10
+
11
+ import torch
12
+ from peft import PeftModel
13
+ from transformers import AutoModelForCausalLM, AutoTokenizer
14
+
15
+ BASE_MODEL = "/workspace/models/Qwen3-8B"
16
+ OUTPUT_ROOT = "/workspace/hanrui/syxin/Specforge/outputs/qwen3-8b-dflash-lora"
17
+ MERGE_ROOT = "/workspace/hanrui/syxin/Specforge/outputs/qwen3-8b-dflash-lora-merged"
18
+
19
+ def parse_args():
20
+ p = argparse.ArgumentParser()
21
+ p.add_argument("--ckpt", default="epoch_3_step_18576",
22
+ help="Checkpoint folder name under OUTPUT_ROOT")
23
+ p.add_argument("--merged-path", default=MERGE_ROOT,
24
+ help="Where to save the merged model")
25
+ return p.parse_args()
26
+
27
+
28
+ def main():
29
+ args = parse_args()
30
+ adapter_path = os.path.join(OUTPUT_ROOT, args.ckpt)
31
+ merged_path = args.merged_path
32
+
33
+ if os.path.exists(merged_path):
34
+ print(f"[skip] Merged model already exists: {merged_path}")
35
+ return
36
+
37
+ assert os.path.isdir(adapter_path), f"Adapter not found: {adapter_path}"
38
+
39
+ print(f"Base model : {BASE_MODEL}")
40
+ print(f"Adapter : {adapter_path}")
41
+ print(f"Output : {merged_path}")
42
+ print()
43
+
44
+ print("[1/4] Loading base model to CPU ...")
45
+ model = AutoModelForCausalLM.from_pretrained(
46
+ BASE_MODEL,
47
+ torch_dtype=torch.bfloat16,
48
+ device_map="cpu",
49
+ )
50
+
51
+ print("[2/4] Loading LoRA adapter ...")
52
+ model = PeftModel.from_pretrained(model, adapter_path)
53
+
54
+ print("[3/4] Merging weights ...")
55
+ model = model.merge_and_unload()
56
+
57
+ print("[4/4] Saving merged model ...")
58
+ os.makedirs(merged_path, exist_ok=True)
59
+ model.save_pretrained(merged_path, safe_serialization=True)
60
+ AutoTokenizer.from_pretrained(BASE_MODEL).save_pretrained(merged_path)
61
+
62
+ print(f"\nDone. Merged model saved to: {merged_path}")
63
+
64
+
65
+ if __name__ == "__main__":
66
+ main()
syxin/oom_fix_progress.md ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # DFlash LoRA OOM 修复记录
2
+
3
+ ## OOM 根因分析
4
+
5
+ 1. **SHARD_GRAD_OP (ZeRO-2)** — 每卡持有完整 Qwen3-8B 参数 (~16GB bf16),参数未分片
6
+ 2. **SDPA + 4D additive mask** — FlashAttention 不支持 4D additive mask,fallback 到 math backend,每层 materialize 完整 attention scores (`bsz × 32heads × 2048 × 2048`)
7
+ 3. **大 vocab logits** — `[bsz, 2048, 151936]` bf16 ≈ 1.18GB,加上梯度和 boolean indexing 拷贝,峰值 ~3-4GB
8
+ 4. **机器只有 2 张 H100**,脚本默认 `NUM_GPUS=4`
9
+
10
+ ## 已完成的改动
11
+
12
+ ### 1. FSDP sharding 改为 FULL_SHARD (ZeRO-3)
13
+ - 文件: `SpecForge/scripts/train_dflash_lora.py:347`
14
+ - `ShardingStrategy.SHARD_GRAD_OP` → `ShardingStrategy.FULL_SHARD`
15
+ - 效果: 参数跨卡分片,每卡省 ~8-12GB
16
+
17
+ ### 2. 降 batch-size,提高 accumulation-steps
18
+ - 文件: `SpecForge/scripts/run_train_dflash_lora.sh`
19
+ - `--batch-size 2` → `1`,`--accumulation-steps 4` → `8`
20
+ - 效果: 等效 global batch size 不变,峰值显存减半
21
+
22
+ ## 待验证 / 后续优化
23
+
24
+ - [ ] 运行时传 `bash run_train_dflash_lora.sh 2` 确保用 2 卡
25
+ - [x] 如仍 OOM,考虑 chunked cross-entropy loss 避免大 vocab logits 全量 materialize
26
+ - [x] 长期可探索自定义 attention kernel 支持 block-sparse mask,绕过 SDPA math fallback
27
+
28
+ ### 3. flex_attention + BlockMask 替换 4D additive mask
29
+ - 文件: `SpecForge/specforge/core/dflash_lora.py`, `specforge/modeling/draft/dflash_lora.py`, `scripts/train_dflash_lora.py`
30
+ - 从非 LoRA 版 `dflash.py` 移植 `_get_or_create_block_mask()` 方法,适配 LoRA 场景 (Q_LEN == KV_LEN == seq_len)
31
+ - LoRA 版 mask: context causal + block bidirectional (非 LoRA 版是 [context, noise] concat KV)
32
+ - 用 `--attention-backend flex_attention` 启用 (默认),退回 `--attention-backend additive` 走原有 4D mask
33
+ - HuggingFace model 用 `attn_implementation="flex_attention"` 加载
34
+ - 效果: 不再 fallback 到 SDPA math backend,省去 `[bsz, heads, seq, seq]` attention scores 的显存
35
+
36
+ ### 4. chunked cross-entropy loss
37
+ - 文件: `SpecForge/specforge/core/dflash_lora.py`, `specforge/modeling/draft/dflash_lora.py`, `scripts/train_dflash_lora.py`
38
+ - 从非 LoRA 版 `dflash.py` 移植 `_chunked_lm_loss()` 方法
39
+ - 分 chunk 过 lm_head + CE loss + gradient checkpointing,避免 materialize 完整 `[bsz, seq, vocab]` logits
40
+ - 用 `--lm-head-chunk-size 256` 启用 (默认 0 = 不启用)
41
+ - `DFlashLoRADraftModel.forward()` 新增 `output_hidden_states` 参数,chunked 时返回 hidden states
42
+ - 效果: logits 峰值显存从 O(seq_len × vocab_size) 降至 O(chunk_size × vocab_size)
syxin/requirements.txt ADDED
File without changes
syxin/run_bench.sh ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Step 3: Run HumanEval / MT-Bench / GSM8K benchmarks.
# Run AFTER start_server.sh is up.
# Usage:
#   bash run_bench.sh                # all three benches, full dataset
#   bash run_bench.sh humaneval      # only humaneval
#   bash run_bench.sh mtbench gsm8k  # pick any subset

set -e

INTRANET_IP=10.1.1.131
PORT=30000
BASE_MODEL=/workspace/models/Qwen3-8B
MERGED=/workspace/hanrui/syxin_old/Specforge/outputs/qwen3-8b-dflash-lora-merged
BENCH_DIR=/workspace/hanrui/syxin_old/Specforge/benchmarks
RESULT_DIR=$BENCH_DIR/results

# ---- sanity check ----
echo "Checking server at http://$INTRANET_IP:$PORT ..."
curl -sf "http://$INTRANET_IP:$PORT/v1/models" > /dev/null || {
    echo "[ERROR] Server not reachable. Start it first: bash start_server.sh"
    exit 1
}
echo "Server OK."

mkdir -p "$RESULT_DIR"
cd "$BENCH_DIR"
export PYTHONPATH=/workspace/hanrui/syxin_old/Specforge:$PYTHONPATH

# ---- decide which benches to run ----
TARGETS=("$@")
if [ ${#TARGETS[@]} -eq 0 ]; then
    TARGETS=(humaneval mtbench gsm8k)
fi

# Build the benchmark spec list as a bash array instead of a space-joined
# string: each entry survives as a single argv word regardless of IFS.
BENCH_ARGS=()
for t in "${TARGETS[@]}"; do
    case $t in
        humaneval) BENCH_ARGS+=(humaneval:164) ;;
        mtbench)   BENCH_ARGS+=(mtbench:80) ;;
        gsm8k)     BENCH_ARGS+=(gsm8k:1319) ;;
        *)
            echo "[ERROR] Unknown bench: $t (choices: humaneval mtbench gsm8k)"
            exit 1
            ;;
    esac
done

TIMESTAMP=$(date +%Y%m%d_%H%M%S)
echo "Running: ${BENCH_ARGS[*]}"
echo "Results -> $RESULT_DIR"
echo ""

python3 bench_eagle3.py \
    --model-path "$BASE_MODEL" \
    --speculative-draft-model-path "$MERGED" \
    --host "$INTRANET_IP" \
    --port "$PORT" \
    --config-list "16,4,1,4" \
    --benchmark-list "${BENCH_ARGS[@]}" \
    --output-dir "$RESULT_DIR" \
    --name "dflash_lora_${TIMESTAMP}" \
    --skip-launch-server \
    2>&1 | tee "$RESULT_DIR/bench_${TIMESTAMP}.log"

echo ""
echo "Done. Latest result files:"
ls -lht "$RESULT_DIR"/*.jsonl 2>/dev/null | head -5
syxin/run_bench_dflash.sh ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Evaluate DFlash-LoRA-Inject accepted length (offline, 8 GPUs parallel).
# No sglang server needed. Each GPU loads its own target+draft and processes a shard.
#
# Usage:
#   bash run_bench_dflash.sh                         # 8 GPUs, all 3 benches
#   bash run_bench_dflash.sh humaneval               # only humaneval
#   bash run_bench_dflash.sh mtbench gsm8k           # pick any subset
#   bash run_bench_dflash.sh --quick                 # quick test (20 samples)
#   bash run_bench_dflash.sh --ckpt epoch_0_step_500 # specific checkpoint
#   NUM_GPUS=4 bash run_bench_dflash.sh              # use 4 GPUs

set -e

SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
PYTHON=/workspace/miniconda3/envs/spec/bin/python3
RESULT_DIR=/workspace/hanrui/syxin_old/Specforge/benchmarks/results
NUM_GPUS=${NUM_GPUS:-8}

# ---- parse args ----
BENCHMARKS=()
EXTRA_ARGS=()
QUICK=false

for arg in "$@"; do
    case $arg in
        humaneval|mtbench|gsm8k)
            BENCHMARKS+=("$arg")
            ;;
        --quick)
            QUICK=true
            ;;
        *)
            EXTRA_ARGS+=("$arg")
            ;;
    esac
done

if [ ${#BENCHMARKS[@]} -eq 0 ]; then
    BENCHMARKS=(humaneval mtbench gsm8k)
fi

if [ "$QUICK" = true ]; then
    EXTRA_ARGS+=(--num-samples 20)
fi

TIMESTAMP=$(date +%Y%m%d_%H%M%S)

echo "============================================"
echo " DFlash-LoRA-Inject Offline Eval"
echo " GPUs       : $NUM_GPUS"
echo " benchmarks : ${BENCHMARKS[*]}"
echo " extra args : ${EXTRA_ARGS[*]}"
echo " results    : $RESULT_DIR"
echo "============================================"
echo ""

mkdir -p "$RESULT_DIR"

# Quote the array expansion so every benchmark name is passed as its own
# intact argv word (unquoted ${BENCHMARKS[@]} is subject to word splitting
# and globbing).
"$PYTHON" -m torch.distributed.run \
    --standalone \
    --nproc_per_node "$NUM_GPUS" \
    "$SCRIPT_DIR/eval_dflash_lora_inject.py" \
    --benchmarks "${BENCHMARKS[@]}" \
    --output-dir "$RESULT_DIR" \
    "${EXTRA_ARGS[@]}" \
    2>&1 | tee "$RESULT_DIR/bench_dflash_lora_inject_offline_${TIMESTAMP}.log"

echo ""
echo "Done. Latest result files:"
ls -lht "$RESULT_DIR"/*.json 2>/dev/null | head -5
syxin/run_bench_dflash_b16_baseline.sh ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# DFlash-b16 baseline: measure accepted length offline, 8 GPUs parallel.
# Usage:
#   bash run_bench_dflash_b16_baseline.sh              # 8 GPUs, all 3 benches
#   bash run_bench_dflash_b16_baseline.sh humaneval    # only humaneval
#   bash run_bench_dflash_b16_baseline.sh --quick      # 20 samples per bench
#   NUM_GPUS=4 bash run_bench_dflash_b16_baseline.sh   # 4 GPUs

set -e

SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
PYTHON=/workspace/miniconda3/envs/spec/bin/python3
RESULT_DIR=/workspace/hanrui/syxin_old/Specforge/benchmarks/results
NUM_GPUS=${NUM_GPUS:-8}

BENCHMARKS=()
EXTRA_ARGS=()
QUICK=false

for arg in "$@"; do
    case $arg in
        humaneval|mtbench|gsm8k) BENCHMARKS+=("$arg") ;;
        --quick) QUICK=true ;;
        *) EXTRA_ARGS+=("$arg") ;;
    esac
done

if [ ${#BENCHMARKS[@]} -eq 0 ]; then
    BENCHMARKS=(humaneval mtbench gsm8k)
fi

if [ "$QUICK" = true ]; then
    EXTRA_ARGS+=(--num-samples 20)
fi

TIMESTAMP=$(date +%Y%m%d_%H%M%S)

echo "============================================"
echo " DFlash-b16 Baseline Offline Eval"
echo " GPUs       : $NUM_GPUS"
echo " draft      : /workspace/models/Qwen3-8B-DFlash-b16"
echo " benchmarks : ${BENCHMARKS[*]}"
echo " extra args : ${EXTRA_ARGS[*]}"
echo "============================================"
echo ""

mkdir -p "$RESULT_DIR"

# Quote the array expansion so every benchmark name is passed as its own
# intact argv word (unquoted ${BENCHMARKS[@]} undergoes word splitting).
"$PYTHON" -m torch.distributed.run \
    --standalone \
    --nproc_per_node "$NUM_GPUS" \
    "$SCRIPT_DIR/eval_dflash_b16_baseline.py" \
    --benchmarks "${BENCHMARKS[@]}" \
    --output-dir "$RESULT_DIR" \
    "${EXTRA_ARGS[@]}" \
    2>&1 | tee "$RESULT_DIR/bench_dflash_b16_baseline_${TIMESTAMP}.log"

echo ""
echo "Done. Latest result files:"
ls -lht "$RESULT_DIR"/*.json 2>/dev/null | head -5
syxin/run_qwen3_8b_sft_32gpu.sh ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Submit the qwen3-8b SFT job to the cluster through northjob.
export JOB_NAME='qwen3-8b-sft'
export GPU_NUMS=64
export TRAIN_SCRIPT='/workspace/hanrui/syxin/launch_train_wrapper.py'
export WORK_DIR='/workspace/hanrui/syxin/Specforge'

# Derive topology: a single node when fewer than 8 GPUs are requested,
# otherwise spread across nodes at 8 GPUs each.
if [ $GPU_NUMS -lt 8 ]; then
    export NNODES=1
    export GPU_NUMS_PER_NODE=$GPU_NUMS
else
    export NNODES=$((GPU_NUMS/8))
    export GPU_NUMS_PER_NODE=8
fi

# Use the northjob binary from the "spec" conda environment.
/workspace/miniconda3/envs/spec/bin/northjob \
    create \
    --job-type train \
    --nproc-per-node $GPU_NUMS_PER_NODE \
    --gpu-per-node $GPU_NUMS_PER_NODE \
    --nnodes $NNODES \
    --k8s-priority 3 \
    --k8s-queue bg-agentic-coding \
    --k8s-namespace bg-agentic-coding \
    --k8s-pvc-name i-xinsiyang-y4zy0sik0a \
    --k8s-pvc-mount-path /workspace \
    --k8s-no-reclaim \
    --k8s-images harbor.local.clusters/bp/megatron-bplm:25.03_fp8.ibgda.qwen3.next.fix_triton.fix_te.hf457.qwen3_vl \
    --job-name $JOB_NAME \
    --workspace $WORK_DIR \
    $TRAIN_SCRIPT $GPU_NUMS_PER_NODE
syxin/run_train_dflash_direct_inject.sh ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Train the DFlash direct-inject draft model on Qwen3-8B.
set -euo pipefail

ROOT_DIR=/workspace/hanrui/syxin_old/Specforge
NUM_GPUS=8
OUTPUT_DIR=$ROOT_DIR/outputs/qwen3-8b-dflash-direct-inject

# Positional overrides: [num_gpus] [output_dir] [extra training args...]
if [[ $# -ge 1 ]]; then
    NUM_GPUS=$1
    shift
fi
if [[ $# -ge 1 && "${1:0:1}" != "-" ]]; then
    OUTPUT_DIR=$1
    shift
fi
EXTRA_ARGS=("$@")

export TORCHINDUCTOR_CACHE_DIR=$ROOT_DIR/cache/compiled_kernels
export SPECFORGE_DATA_NUM_PROC=16
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export PYTORCH_ALLOC_CONF=expandable_segments:True
export PYTHONPATH="$ROOT_DIR:${PYTHONPATH:-}"

# Prefer the dedicated specforge interpreter when it exists.
DEFAULT_SPECFORGE_PY=/workspace/hanrui/specforge/bin/python3
if [[ -z "${PYTHON_BIN:-}" ]]; then
    if [[ -x "$DEFAULT_SPECFORGE_PY" ]]; then
        PYTHON_BIN="$DEFAULT_SPECFORGE_PY"
    else
        PYTHON_BIN=python3
    fi
fi

cd $ROOT_DIR

# Training hyperparameters grouped in one array for readability.
TRAIN_ARGS=(
    --target-model-path /workspace/models/Qwen3-8B
    --target-model-backend sglang
    --train-data-path /workspace/hanrui/datasets/Nemotron-CodeAlpaca-qwen3-8b-800K
    --output-dir "$OUTPUT_DIR"
    --block-size 16
    --num-draft-layers 36
    --attention-backend flex_attention
    --max-length 2048
    --batch-size 1
    --accumulation-steps 8
    --num-epochs 3
    --learning-rate 6e-4
    --loss-decay-gamma 7
    --lm-head-chunk-size 256
    --gradient-checkpointing
    --chat-template qwen
    --log-interval 50
    --save-interval 500
    --cache-dir "$ROOT_DIR/cache"
)

$PYTHON_BIN -m torch.distributed.run \
    --standalone \
    --nproc_per_node $NUM_GPUS \
    scripts/train_dflash.py \
    "${TRAIN_ARGS[@]}" \
    "${EXTRA_ARGS[@]}"
syxin/run_train_dflash_lora_inject.sh ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Train the DFlash-LoRA-inject draft model on Qwen3-8B.
set -euo pipefail

ROOT_DIR=/workspace/hanrui/syxin/Specforge
NUM_GPUS=8
OUTPUT_DIR=$ROOT_DIR/outputs/qwen3-8b-dflash-lora-inject
CACHE_DIR=/tmp/specforge_cache

# Positional overrides: [num_gpus] [output_dir] [extra training args...]
if [[ $# -ge 1 ]]; then
    NUM_GPUS=$1
    shift
fi
if [[ $# -ge 1 && "${1:0:1}" != "-" ]]; then
    OUTPUT_DIR=$1
    shift
fi
EXTRA_ARGS=("$@")

# Environment variables
export TORCHINDUCTOR_CACHE_DIR=/tmp/specforge_cache/compiled_kernels
export SPECFORGE_DATA_NUM_PROC=16
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export PYTORCH_ALLOC_CONF=expandable_segments:True
export PYTHONPATH="$ROOT_DIR:${PYTHONPATH:-}"
export HF_DATASETS_CACHE=/tmp/specforge_cache/hf_datasets
export HF_HOME=/tmp/specforge_cache/hf_home

# Prefer the dedicated specforge interpreter when it exists.
DEFAULT_SPECFORGE_PY=/workspace/hanrui/specforge/bin/python3
if [[ -z "${PYTHON_BIN:-}" ]]; then
    if [[ -x "$DEFAULT_SPECFORGE_PY" ]]; then
        PYTHON_BIN="$DEFAULT_SPECFORGE_PY"
    else
        PYTHON_BIN=python3
    fi
fi

cd $ROOT_DIR

# Training hyperparameters grouped in one array for readability.
TRAIN_ARGS=(
    --target-model-path /workspace/models/Qwen3-8B
    --target-model-backend hf
    --train-data-path /workspace/hanrui/datasets/Nemotron-CodeAlpaca-qwen3-8b-800K
    --output-dir "$OUTPUT_DIR"
    --block-size 16
    --attention-backend additive
    --attn-implementation sdpa
    --max-length 2048
    --batch-size 8
    --accumulation-steps 8
    --num-epochs 3
    --learning-rate 5e-5
    --loss-decay-gamma 7
    --gradient-checkpointing
    --chat-template qwen
    --log-interval 50
    --save-interval 500
    --cache-dir "$CACHE_DIR"
    --lora-rank 32
    --lora-alpha 64
    --lora-dropout 0.1
    --trust-remote-code
    --dataloader-num-workers 0
    --early-stop
    --early-stop-patience 5
    --early-stop-min-delta 0.005
)

$PYTHON_BIN -m torch.distributed.run \
    --standalone \
    --nproc_per_node $NUM_GPUS \
    scripts/train_dflash_lora_inject.py \
    "${TRAIN_ARGS[@]}" \
    "${EXTRA_ARGS[@]}"
syxin/run_train_multinode.sh ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Multinode training entrypoint: northjob already sets up the distributed
# environment via torchrun, so the training script is invoked directly.
set -euo pipefail

ROOT_DIR=/workspace/hanrui/syxin/Specforge
NUM_GPUS=8
OUTPUT_DIR=$ROOT_DIR/outputs/qwen3-8b-sft-32gpu-v3
CACHE_DIR=/tmp/specforge_cache

# Positional overrides: [num_gpus] [output_dir] [extra training args...]
if [[ $# -ge 1 ]]; then
    NUM_GPUS=$1
    shift
fi
if [[ $# -ge 1 && "${1:0:1}" != "-" ]]; then
    OUTPUT_DIR=$1
    shift
fi
EXTRA_ARGS=("$@")

# Environment variables
export TORCHINDUCTOR_CACHE_DIR=/tmp/specforge_cache/compiled_kernels
export SPECFORGE_DATA_NUM_PROC=16
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export PYTORCH_ALLOC_CONF=expandable_segments:True
export PYTHONPATH="$ROOT_DIR:${PYTHONPATH:-}"
export HF_DATASETS_CACHE=/tmp/specforge_cache/hf_datasets
export HF_HOME=/tmp/specforge_cache/hf_home

# Prefer the "spec" conda interpreter when it exists.
DEFAULT_SPECFORGE_PY=/workspace/miniconda3/envs/spec/bin/python3
if [[ -z "${PYTHON_BIN:-}" ]]; then
    if [[ -x "$DEFAULT_SPECFORGE_PY" ]]; then
        PYTHON_BIN="$DEFAULT_SPECFORGE_PY"
    else
        PYTHON_BIN=python3
    fi
fi

cd $ROOT_DIR

# Do NOT launch torch.distributed.run here — the northjob-provided torchrun
# already exported the distributed env vars; run the script directly.
TRAIN_ARGS=(
    --target-model-path /workspace/models/Qwen3-8B
    --target-model-backend hf
    --train-data-path /workspace/hanrui/datasets/Nemotron-CodeAlpaca-qwen3-8b-800K
    --output-dir "$OUTPUT_DIR"
    --block-size 16
    --attention-backend additive
    --attn-implementation sdpa
    --max-length 2048
    --batch-size 4
    --accumulation-steps 16
    --num-epochs 3
    --learning-rate 5e-5
    --loss-decay-gamma 7
    --gradient-checkpointing
    --chat-template qwen
    --log-interval 50
    --save-interval 500
    --cache-dir "$CACHE_DIR"
    --lora-rank 32
    --lora-alpha 64
    --lora-dropout 0.1
    --trust-remote-code
    --dataloader-num-workers 0
)

$PYTHON_BIN scripts/train_dflash_lora_inject.py \
    "${TRAIN_ARGS[@]}" \
    "${EXTRA_ARGS[@]}"
syxin/run_train_qwen3_8b_sft_32gpu.sh ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# SFT training entrypoint for northjob: the distributed environment is
# already set up by torchrun upstream, so the script is run directly.
set -euo pipefail

ROOT_DIR=/workspace/hanrui/syxin_old/Specforge
NUM_GPUS=8
OUTPUT_DIR=$ROOT_DIR/outputs/qwen3-8b-sft-32gpu-v2
CACHE_DIR=/tmp/specforge_cache_sft

# Positional overrides: [num_gpus] [output_dir] [extra training args...]
if [[ $# -ge 1 ]]; then
    NUM_GPUS=$1
    shift
fi
if [[ $# -ge 1 && "${1:0:1}" != "-" ]]; then
    OUTPUT_DIR=$1
    shift
fi
EXTRA_ARGS=("$@")

# Environment variables
export TORCHINDUCTOR_CACHE_DIR=/tmp/specforge_cache_sft/compiled_kernels
export SPECFORGE_DATA_NUM_PROC=16
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export PYTORCH_ALLOC_CONF=expandable_segments:True
export PYTHONPATH="$ROOT_DIR:${PYTHONPATH:-}"
export HF_DATASETS_CACHE=/tmp/specforge_cache_sft/hf_datasets
export HF_HOME=/tmp/specforge_cache_sft/hf_home

# Prefer the dedicated specforge interpreter when it exists.
DEFAULT_SPECFORGE_PY=/workspace/hanrui/specforge/bin/python3
if [[ -z "${PYTHON_BIN:-}" ]]; then
    if [[ -x "$DEFAULT_SPECFORGE_PY" ]]; then
        PYTHON_BIN="$DEFAULT_SPECFORGE_PY"
    else
        PYTHON_BIN=python3
    fi
fi

cd $ROOT_DIR

# northjob launched us through torchrun already; invoke the script directly.
TRAIN_ARGS=(
    --target-model-path /workspace/models/Qwen3-8B
    --target-model-backend hf
    --train-data-path /workspace/hanrui/datasets/Nemotron-CodeAlpaca-qwen3-8b-800K
    --output-dir "$OUTPUT_DIR"
    --block-size 16
    --attention-backend additive
    --attn-implementation sdpa
    --max-length 2048
    --batch-size 8
    --accumulation-steps 8
    --num-epochs 3
    --learning-rate 5e-5
    --loss-decay-gamma 7
    --gradient-checkpointing
    --chat-template qwen
    --log-interval 50
    --save-interval 500
    --cache-dir "$CACHE_DIR"
    --lora-rank 32
    --lora-alpha 64
    --lora-dropout 0.1
    --trust-remote-code
    --dataloader-num-workers 0
)

$PYTHON_BIN $ROOT_DIR/scripts/train_dflash_lora_inject.py \
    "${TRAIN_ARGS[@]}" \
    "${EXTRA_ARGS[@]}"
syxin/server.log ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /workspace/hanrui/sglang/python/sglang/launch_server.py:51: UserWarning: 'python -m sglang.launch_server' is still supported, but 'sglang serve' is the recommended entrypoint.
2
+ Example: sglang serve --model-path <model> [options]
3
+ warnings.warn(
4
+ [2026-03-07 15:24:13] INFO server_args.py:2048: Attention backend not specified. Use fa3 backend by default.
5
+ [2026-03-07 15:24:13] WARNING server_args.py:2629: Max running requests is reset to 48 for speculative decoding. You can override this by explicitly setting --max-running-requests.
6
+ [2026-03-07 15:24:13] WARNING server_args.py:2650: Overlap scheduler is disabled when spec v2 is off or using unsupported speculative algorithm. You can set env SGLANG_ENABLE_SPEC_V2=True to enable the experimental overlap scheduler.
7
+ [2026-03-07 15:24:13] WARNING server_args.py:2712: speculative_num_draft_tokens is adjusted to speculative_num_steps + 1 when speculative_eagle_topk == 1
8
+ [2026-03-07 15:24:14] server_args=ServerArgs(model_path='/workspace/models/Qwen3-8B', tokenizer_path='/workspace/models/Qwen3-8B', tokenizer_mode='auto', tokenizer_worker_num=1, skip_tokenizer_init=False, load_format='auto', model_loader_extra_config='{}', trust_remote_code=True, context_length=None, is_embedding=False, enable_multimodal=None, revision=None, model_impl='auto', host='10.233.100.123', port=30000, fastapi_root_path='', grpc_mode=False, skip_server_warmup=False, warmups=None, nccl_port=None, checkpoint_engine_wait_weights_before_ready=False, ssl_keyfile=None, ssl_certfile=None, ssl_ca_certs=None, ssl_keyfile_password=None, enable_ssl_refresh=False, dtype='bfloat16', quantization=None, quantization_param_path=None, kv_cache_dtype='auto', enable_fp32_lm_head=False, modelopt_quant=None, modelopt_checkpoint_restore_path=None, modelopt_checkpoint_save_path=None, modelopt_export_path=None, quantize_and_serve=False, rl_quant_profile=None, mem_fraction_static=0.8, max_running_requests=48, max_queued_requests=None, max_total_tokens=None, chunked_prefill_size=8192, enable_dynamic_chunking=False, max_prefill_tokens=16384, prefill_max_requests=None, schedule_policy='fcfs', enable_priority_scheduling=False, disable_priority_preemption=False, default_priority_value=None, abort_on_priority_when_disabled=False, schedule_low_priority_values_first=False, priority_scheduling_preemption_threshold=10, schedule_conservativeness=1.0, page_size=1, swa_full_tokens_ratio=0.8, disable_hybrid_swa_memory=False, radix_eviction_policy='lru', enable_prefill_delayer=False, prefill_delayer_max_delay_passes=30, prefill_delayer_token_usage_low_watermark=None, prefill_delayer_forward_passes_buckets=None, prefill_delayer_wait_seconds_buckets=None, device='cuda', tp_size=4, pp_size=1, pp_max_micro_batch_size=None, pp_async_batch_depth=0, stream_interval=1, stream_output=False, enable_streaming_session=False, random_seed=551181117, constrained_json_whitespace_pattern=None, 
constrained_json_disable_any_whitespace=False, watchdog_timeout=300, soft_watchdog_timeout=None, dist_timeout=None, download_dir=None, model_checksum=None, base_gpu_id=0, gpu_id_step=1, sleep_on_idle=False, use_ray=False, custom_sigquit_handler=None, log_level='info', log_level_http=None, log_requests=False, log_requests_level=2, log_requests_format='text', log_requests_target=None, uvicorn_access_log_exclude_prefixes=[], crash_dump_folder=None, show_time_cost=False, enable_metrics=False, enable_metrics_for_all_schedulers=False, tokenizer_metrics_custom_labels_header='x-custom-labels', tokenizer_metrics_allowed_custom_labels=None, extra_metric_labels=None, bucket_time_to_first_token=None, bucket_inter_token_latency=None, bucket_e2e_request_latency=None, collect_tokens_histogram=False, prompt_tokens_buckets=None, generation_tokens_buckets=None, gc_warning_threshold_secs=0.0, decode_log_interval=40, enable_request_time_stats_logging=False, kv_events_config=None, enable_trace=False, otlp_traces_endpoint='localhost:4317', export_metrics_to_file=False, export_metrics_to_file_dir=None, api_key=None, admin_api_key=None, served_model_name='/workspace/models/Qwen3-8B', weight_version='default', chat_template=None, hf_chat_template_name=None, completion_template=None, file_storage_path='sglang_storage', enable_cache_report=False, reasoning_parser=None, tool_call_parser=None, tool_server=None, sampling_defaults='model', dp_size=1, load_balance_method='round_robin', attn_cp_size=1, moe_dp_size=1, dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', preferred_sampling_params=None, enable_lora=None, enable_lora_overlap_loading=None, max_lora_rank=None, lora_target_modules=None, lora_paths=None, max_loaded_loras=None, max_loras_per_batch=8, lora_eviction_policy='lru', lora_backend='csgmv', max_lora_chunk_size=16, attention_backend='fa3', decode_attention_backend=None, prefill_attention_backend=None, sampling_backend='flashinfer', grammar_backend='xgrammar', 
mm_attention_backend=None, fp8_gemm_runner_backend='auto', fp4_gemm_runner_backend='flashinfer_cutlass', nsa_prefill_backend=None, nsa_decode_backend=None, disable_flashinfer_autotune=False, mamba_backend='triton', speculative_algorithm='STANDALONE', speculative_draft_model_path='/workspace/hanrui/syxin_old/Specforge/outputs/qwen3-8b-dflash-lora-merged', speculative_draft_model_revision='main', speculative_draft_load_format=None, speculative_num_steps=4, speculative_eagle_topk=1, speculative_num_draft_tokens=5, speculative_accept_threshold_single=1.0, speculative_accept_threshold_acc=1.0, speculative_token_map=None, speculative_attention_mode='prefill', speculative_draft_attention_backend=None, speculative_moe_runner_backend='auto', speculative_moe_a2a_backend=None, speculative_draft_model_quantization=None, speculative_ngram_min_match_window_size=1, speculative_ngram_max_match_window_size=12, speculative_ngram_min_bfs_breadth=1, speculative_ngram_max_bfs_breadth=10, speculative_ngram_match_type='BFS', speculative_ngram_branch_length=18, speculative_ngram_capacity=10000000, enable_multi_layer_eagle=False, ep_size=1, moe_a2a_backend='none', moe_runner_backend='auto', flashinfer_mxfp4_moe_precision='default', enable_flashinfer_allreduce_fusion=False, enable_aiter_allreduce_fusion=False, deepep_mode='auto', ep_num_redundant_experts=0, ep_dispatch_algorithm=None, init_expert_location='trivial', enable_eplb=False, eplb_algorithm='auto', eplb_rebalance_num_iterations=1000, eplb_rebalance_layers_per_chunk=None, eplb_min_rebalancing_utilization_threshold=1.0, expert_distribution_recorder_mode=None, expert_distribution_recorder_buffer_size=1000, enable_expert_distribution_metrics=False, deepep_config=None, moe_dense_tp_size=None, elastic_ep_backend=None, enable_elastic_expert_backup=False, mooncake_ib_device=None, max_mamba_cache_size=None, mamba_ssm_dtype=None, mamba_full_memory_ratio=0.9, mamba_scheduler_strategy='no_buffer', mamba_track_interval=256, 
linear_attn_backend='triton', linear_attn_decode_backend=None, linear_attn_prefill_backend=None, enable_hierarchical_cache=False, hicache_ratio=2.0, hicache_size=0, hicache_write_policy='write_through', hicache_io_backend='kernel', hicache_mem_layout='layer_first', disable_hicache_numa_detect=False, hicache_storage_backend=None, hicache_storage_prefetch_policy='best_effort', hicache_storage_backend_extra_config=None, hierarchical_sparse_attention_extra_config=None, enable_lmcache=False, kt_weight_path=None, kt_method='AMXINT4', kt_cpuinfer=None, kt_threadpool_count=2, kt_num_gpu_experts=None, kt_max_deferred_experts_per_token=None, dllm_algorithm=None, dllm_algorithm_config=None, enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, cpu_offload_gb=0, offload_group_size=-1, offload_num_in_group=1, offload_prefetch_step=1, offload_mode='cpu', multi_item_scoring_delimiter=None, disable_radix_cache=False, cuda_graph_max_bs=512, cuda_graph_bs=[1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 40, 44, 48, 52, 56, 60, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], disable_cuda_graph=False, disable_cuda_graph_padding=False, enable_profile_cuda_graph=False, enable_cudagraph_gc=False, enable_layerwise_nvtx_marker=False, enable_nccl_nvls=False, enable_symm_mem=False, disable_flashinfer_cutlass_moe_fp4_allgather=False, enable_tokenizer_batch_encode=False, disable_tokenizer_batch_decode=False, disable_outlines_disk_cache=False, disable_custom_all_reduce=False, enable_mscclpp=False, enable_torch_symm_mem=False, disable_overlap_schedule=True, enable_mixed_chunk=False, enable_dp_attention=False, enable_dp_lm_head=False, enable_two_batch_overlap=False, enable_single_batch_overlap=False, 
tbo_token_distribution_threshold=0.48, enable_torch_compile=False, disable_piecewise_cuda_graph=True, enforce_piecewise_cuda_graph=False, enable_torch_compile_debug_mode=False, torch_compile_max_bs=32, piecewise_cuda_graph_max_tokens=8192, piecewise_cuda_graph_tokens=[4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 576, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048, 2304, 2560, 2816, 3072, 3328, 3584, 3840, 4096, 4608, 5120, 5632, 6144, 6656, 7168, 7680, 8192], piecewise_cuda_graph_compiler='eager', torchao_config='', enable_nan_detection=False, enable_p2p_check=False, triton_attention_reduce_in_fp32=False, triton_attention_num_kv_splits=8, triton_attention_split_tile_size=None, num_continuous_decode_steps=1, delete_ckpt_after_loading=False, enable_memory_saver=False, enable_weights_cpu_backup=False, enable_draft_weights_cpu_backup=False, allow_auto_truncate=False, enable_custom_logit_processor=False, flashinfer_mla_disable_ragged=False, disable_shared_experts_fusion=False, disable_chunked_prefix_cache=False, disable_fast_image_processor=False, keep_mm_feature_on_device=False, enable_return_hidden_states=False, enable_return_routed_experts=False, scheduler_recv_interval=1, numa_node=None, enable_deterministic_inference=False, rl_on_policy_target=None, enable_attn_tp_input_scattered=False, enable_nsa_prefill_context_parallel=False, nsa_prefill_cp_mode='round-robin-split', enable_fused_qk_norm_rope=False, enable_precise_embedding_interpolation=False, enable_fused_moe_sum_all_reduce=False, enable_dynamic_batch_tokenizer=False, dynamic_batch_tokenizer_batch_size=32, dynamic_batch_tokenizer_batch_timeout=0.002, debug_tensor_dump_output_folder=None, debug_tensor_dump_layers=None, debug_tensor_dump_input_file=None, debug_tensor_dump_inject=False, disaggregation_mode='null', disaggregation_transfer_backend='mooncake', disaggregation_bootstrap_port=8998, 
disaggregation_ib_device=None, disaggregation_decode_enable_offload_kvcache=False, num_reserved_decode_tokens=512, disaggregation_decode_polling_interval=1, encoder_only=False, language_only=False, encoder_transfer_backend='zmq_to_scheduler', encoder_urls=[], enable_adaptive_dispatch_to_encoder=False, custom_weight_loader=[], weight_loader_disable_mmap=False, remote_instance_weight_loader_seed_instance_ip=None, remote_instance_weight_loader_seed_instance_service_port=None, remote_instance_weight_loader_send_weights_group_ports=None, remote_instance_weight_loader_backend='nccl', remote_instance_weight_loader_start_seed_via_transfer_engine=False, enable_pdmux=False, pdmux_config_path=None, sm_group_num=8, mm_max_concurrent_calls=32, mm_per_request_timeout=10.0, enable_broadcast_mm_inputs_process=False, enable_prefix_mm_cache=False, mm_enable_dp_encoder=False, mm_process_config={}, limit_mm_data_per_request=None, enable_mm_global_cache=False, decrypted_config_file=None, decrypted_draft_config_file=None, forward_hooks=None)
9
+ [2026-03-07 15:24:15] Using default HuggingFace chat template with detected content format: string
10
+ [2026-03-07 15:24:25 TP2] Mamba selective_state_update backend initialized: triton
11
+ [2026-03-07 15:24:25 TP2] Init torch distributed begin.
12
+ [2026-03-07 15:24:26 TP0] Mamba selective_state_update backend initialized: triton
13
+ [2026-03-07 15:24:26 TP0] Init torch distributed begin.
14
+ [2026-03-07 15:24:26 TP3] Mamba selective_state_update backend initialized: triton
15
+ [2026-03-07 15:24:26 TP1] Mamba selective_state_update backend initialized: triton
16
+ [2026-03-07 15:24:26 TP3] Init torch distributed begin.
17
+ [2026-03-07 15:24:26 TP1] Init torch distributed begin.
18
+ [Gloo] Rank 1 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3
19
+ [Gloo] Rank 0 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3
20
+ [Gloo] Rank 3 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3
21
+ [Gloo] Rank 2 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3
22
+ [Gloo] Rank 0 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3
23
+ [Gloo] Rank 2 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3
24
+ [Gloo] Rank 1 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3
25
+ [Gloo] Rank 3 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3
26
+ [2026-03-07 15:24:27 TP0] sglang is using nccl==2.27.5
27
+ [2026-03-07 15:24:29 TP0] Scheduler hit an exception: Traceback (most recent call last):
28
+ File "/workspace/hanrui/sglang/python/sglang/srt/managers/scheduler.py", line 3239, in run_scheduler_process
29
+ scheduler = Scheduler(
30
+ ^^^^^^^^^^
31
+ File "/workspace/hanrui/sglang/python/sglang/srt/managers/scheduler.py", line 365, in __init__
32
+ self.init_model_worker()
33
+ File "/workspace/hanrui/sglang/python/sglang/srt/managers/scheduler.py", line 561, in init_model_worker
34
+ self.init_tp_model_worker()
35
+ File "/workspace/hanrui/sglang/python/sglang/srt/managers/scheduler.py", line 519, in init_tp_model_worker
36
+ self.tp_worker = TpModelWorker(
37
+ ^^^^^^^^^^^^^^
38
+ File "/workspace/hanrui/sglang/python/sglang/srt/managers/tp_worker.py", line 258, in __init__
39
+ self._init_model_runner()
40
+ File "/workspace/hanrui/sglang/python/sglang/srt/managers/tp_worker.py", line 341, in _init_model_runner
41
+ self._model_runner = ModelRunner(
42
+ ^^^^^^^^^^^^
43
+ File "/workspace/hanrui/sglang/python/sglang/srt/model_executor/model_runner.py", line 395, in __init__
44
+ pre_model_load_memory = self.init_torch_distributed()
45
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
46
+ File "/workspace/hanrui/sglang/python/sglang/srt/model_executor/model_runner.py", line 813, in init_torch_distributed
47
+ initialize_model_parallel(
48
+ File "/workspace/hanrui/sglang/python/sglang/srt/distributed/parallel_state.py", line 1764, in initialize_model_parallel
49
+ _TP = init_model_parallel_group(
50
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^
51
+ File "/workspace/hanrui/sglang/python/sglang/srt/distributed/parallel_state.py", line 1450, in init_model_parallel_group
52
+ return GroupCoordinator(
53
+ ^^^^^^^^^^^^^^^^^
54
+ File "/workspace/hanrui/sglang/python/sglang/srt/distributed/parallel_state.py", line 357, in __init__
55
+ self.pynccl_comm = PyNcclCommunicator(
56
+ ^^^^^^^^^^^^^^^^^^^
57
+ File "/workspace/hanrui/sglang/python/sglang/srt/distributed/device_communicators/pynccl.py", line 113, in __init__
58
+ self.comm: ncclComm_t = self.nccl.ncclCommInitRank(
59
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^
60
+ File "/workspace/hanrui/sglang/python/sglang/srt/distributed/device_communicators/pynccl_wrapper.py", line 401, in ncclCommInitRank
61
+ self.NCCL_CHECK(
62
+ File "/workspace/hanrui/sglang/python/sglang/srt/distributed/device_communicators/pynccl_wrapper.py", line 376, in NCCL_CHECK
63
+ raise RuntimeError(f"NCCL error: {error_str}")
64
+ RuntimeError: NCCL error: unhandled system error (run with NCCL_DEBUG=INFO for details)
65
+
66
+ [2026-03-07 15:24:29] Received sigquit from a child process. It usually means the child failed.
67
+ [2026-03-07 15:24:29 TP2] Scheduler hit an exception: Traceback (most recent call last):
68
+ File "/workspace/hanrui/sglang/python/sglang/srt/managers/scheduler.py", line 3239, in run_scheduler_process
69
+ scheduler = Scheduler(
70
+ ^^^^^^^^^^
71
+ File "/workspace/hanrui/sglang/python/sglang/srt/managers/scheduler.py", line 365, in __init__
72
+ self.init_model_worker()
73
+ File "/workspace/hanrui/sglang/python/sglang/srt/managers/scheduler.py", line 561, in init_model_worker
74
+ self.init_tp_model_worker()
75
+ File "/workspace/hanrui/sglang/python/sglang/srt/managers/scheduler.py", line 519, in init_tp_model_worker
76
+ self.tp_worker = TpModelWorker(
77
+ ^^^^^^^^^^^^^^
78
+ File "/workspace/hanrui/sglang/python/sglang/srt/managers/tp_worker.py", line 258, in __init__
79
+ self._init_model_runner()
80
+ File "/workspace/hanrui/sglang/python/sglang/srt/managers/tp_worker.py", line 341, in _init_model_runner
81
+ self._model_runner = ModelRunner(
82
+ ^^^^^^^^^^^^
83
+ File "/workspace/hanrui/sglang/python/sglang/srt/model_executor/model_runner.py", line 395, in __init__
84
+ pre_model_load_memory = self.init_torch_distributed()
85
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
86
+ File "/workspace/hanrui/sglang/python/sglang/srt/model_executor/model_runner.py", line 813, in init_torch_distributed
87
+ initialize_model_parallel(
88
+ File "/workspace/hanrui/sglang/python/sglang/srt/distributed/parallel_state.py", line 1764, in initialize_model_parallel
89
+ _TP = init_model_parallel_group(
90
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^
91
+ File "/workspace/hanrui/sglang/python/sglang/srt/distributed/parallel_state.py", line 1450, in init_model_parallel_group
92
+ return GroupCoordinator(
93
+ ^^^^^^^^^^^^^^^^^
94
+ File "/workspace/hanrui/sglang/python/sglang/srt/distributed/parallel_state.py", line 357, in __init__
95
+ self.pynccl_comm = PyNcclCommunicator(
96
+ ^^^^^^^^^^^^^^^^^^^
97
+ File "/workspace/hanrui/sglang/python/sglang/srt/distributed/device_communicators/pynccl.py", line 113, in __init__
98
+ self.comm: ncclComm_t = self.nccl.ncclCommInitRank(
99
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^
100
+ File "/workspace/hanrui/sglang/python/sglang/srt/distributed/device_communicators/pynccl_wrapper.py", line 401, in ncclCommInitRank
101
+ self.NCCL_CHECK(
102
+ File "/workspace/hanrui/sglang/python/sglang/srt/distributed/device_communicators/pynccl_wrapper.py", line 376, in NCCL_CHECK
103
+ raise RuntimeError(f"NCCL error: {error_str}")
104
+ RuntimeError: NCCL error: unhandled system error (run with NCCL_DEBUG=INFO for details)
105
+
106
+ [2026-03-07 15:24:29 TP1] Scheduler hit an exception: Traceback (most recent call last):
107
+ File "/workspace/hanrui/sglang/python/sglang/srt/managers/scheduler.py", line 3239, in run_scheduler_process
108
+ scheduler = Scheduler(
109
+ ^^^^^^^^^^
110
+ File "/workspace/hanrui/sglang/python/sglang/srt/managers/scheduler.py", line 365, in __init__
111
+ self.init_model_worker()
112
+ File "/workspace/hanrui/sglang/python/sglang/srt/managers/scheduler.py", line 561, in init_model_worker
113
+ self.init_tp_model_worker()
114
+ File "/workspace/hanrui/sglang/python/sglang/srt/managers/scheduler.py", line 519, in init_tp_model_worker
115
+ self.tp_worker = TpModelWorker(
116
+ ^^^^^^^^^^^^^^
117
+ File "/workspace/hanrui/sglang/python/sglang/srt/managers/tp_worker.py", line 258, in __init__
118
+ self._init_model_runner()
119
+ File "/workspace/hanrui/sglang/python/sglang/srt/managers/tp_worker.py", line 341, in _init_model_runner
120
+ self._model_runner = ModelRunner(
121
+ ^^^^^^^^^^^^
122
+ File "/workspace/hanrui/sglang/python/sglang/srt/model_executor/model_runner.py", line 395, in __init__
123
+ pre_model_load_memory = self.init_torch_distributed()
124
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
125
+ File "/workspace/hanrui/sglang/python/sglang/srt/model_executor/model_runner.py", line 813, in init_torch_distributed
126
+ initialize_model_parallel(
127
+ File "/workspace/hanrui/sglang/python/sglang/srt/distributed/parallel_state.py", line 1764, in initialize_model_parallel
128
+ _TP = init_model_parallel_group(
129
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^
130
+ File "/workspace/hanrui/sglang/python/sglang/srt/distributed/parallel_state.py", line 1450, in init_model_parallel_group
131
+ return GroupCoordinator(
132
+ ^^^^^^^^^^^^^^^^^
133
+ File "/workspace/hanrui/sglang/python/sglang/srt/distributed/parallel_state.py", line 357, in __init__
134
+ self.pynccl_comm = PyNcclCommunicator(
135
+ ^^^^^^^^^^^^^^^^^^^
136
+ File "/workspace/hanrui/sglang/python/sglang/srt/distributed/device_communicators/pynccl.py", line 113, in __init__
137
+ self.comm: ncclComm_t = self.nccl.ncclCommInitRank(
138
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^
139
+ File "/workspace/hanrui/sglang/python/sglang/srt/distributed/device_communicators/pynccl_wrapper.py", line 401, in ncclCommInitRank
140
+ self.NCCL_CHECK(
141
+ File "/workspace/hanrui/sglang/python/sglang/srt/distributed/device_communicators/pynccl_wrapper.py", line 376, in NCCL_CHECK
142
+ raise RuntimeError(f"NCCL error: {error_str}")
143
+ RuntimeError: NCCL error: unhandled system error (run with NCCL_DEBUG=INFO for details)
144
+
145
+ [2026-03-07 15:24:29] Received sigquit from a child process. It usually means the child failed.
146
+ [2026-03-07 15:24:29] Received sigquit from a child process. It usually means the child failed.
147
+ [2026-03-07 15:24:29 TP3] Scheduler hit an exception: Traceback (most recent call last):
148
+ File "/workspace/hanrui/sglang/python/sglang/srt/managers/scheduler.py", line 3239, in run_scheduler_process
149
+ scheduler = Scheduler(
150
+ ^^^^^^^^^^
151
+ File "/workspace/hanrui/sglang/python/sglang/srt/managers/scheduler.py", line 365, in __init__
152
+ self.init_model_worker()
153
+ File "/workspace/hanrui/sglang/python/sglang/srt/managers/scheduler.py", line 561, in init_model_worker
154
+ self.init_tp_model_worker()
155
+ File "/workspace/hanrui/sglang/python/sglang/srt/managers/scheduler.py", line 519, in init_tp_model_worker
156
+ self.tp_worker = TpModelWorker(
157
+ ^^^^^^^^^^^^^^
158
+ File "/workspace/hanrui/sglang/python/sglang/srt/managers/tp_worker.py", line 258, in __init__
159
+ self._init_model_runner()
160
+ File "/workspace/hanrui/sglang/python/sglang/srt/managers/tp_worker.py", line 341, in _init_model_runner
161
+ self._model_runner = ModelRunner(
162
+ ^^^^^^^^^^^^
163
+ File "/workspace/hanrui/sglang/python/sglang/srt/model_executor/model_runner.py", line 395, in __init__
164
+ pre_model_load_memory = self.init_torch_distributed()
165
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
166
+ File "/workspace/hanrui/sglang/python/sglang/srt/model_executor/model_runner.py", line 813, in init_torch_distributed
167
+ initialize_model_parallel(
168
+ File "/workspace/hanrui/sglang/python/sglang/srt/distributed/parallel_state.py", line 1764, in initialize_model_parallel
169
+ _TP = init_model_parallel_group(
170
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^
171
+ File "/workspace/hanrui/sglang/python/sglang/srt/distributed/parallel_state.py", line 1450, in init_model_parallel_group
172
+ return GroupCoordinator(
173
+ ^^^^^^^^^^^^^^^^^
174
+ File "/workspace/hanrui/sglang/python/sglang/srt/distributed/parallel_state.py", line 357, in __init__
175
+ self.pynccl_comm = PyNcclCommunicator(
176
+ ^^^^^^^^^^^^^^^^^^^
177
+ File "/workspace/hanrui/sglang/python/sglang/srt/distributed/device_communicators/pynccl.py", line 113, in __init__
178
+ self.comm: ncclComm_t = self.nccl.ncclCommInitRank(
179
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^
180
+ File "/workspace/hanrui/sglang/python/sglang/srt/distributed/device_communicators/pynccl_wrapper.py", line 401, in ncclCommInitRank
181
+ self.NCCL_CHECK(
182
+ File "/workspace/hanrui/sglang/python/sglang/srt/distributed/device_communicators/pynccl_wrapper.py", line 376, in NCCL_CHECK
183
+ raise RuntimeError(f"NCCL error: {error_str}")
184
+ RuntimeError: NCCL error: unhandled system error (run with NCCL_DEBUG=INFO for details)
185
+
186
+ [2026-03-07 15:24:29] Received sigquit from a child process. It usually means the child failed.
syxin/start_server.sh ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ # Step 2: Launch SGLang server with STANDALONE speculative decoding.
3
+ # Usage:
4
+ # bash start_server.sh
5
+ # bash start_server.sh 8 # use tp=8
6
+
7
+ set -e
8
+
9
+ TP=${1:-2}
10
+
11
+ BASE_MODEL=/workspace/models/Qwen3-8B
12
+ MERGED=/workspace/hanrui/syxin_old/Specforge/outputs/qwen3-8b-dflash-lora-merged
13
+ INTRANET_IP=10.1.1.131
14
+ PORT=30000
15
+
16
+ if [ ! -d "$MERGED" ]; then
17
+ echo "[ERROR] Merged model not found: $MERGED"
18
+ echo " Run: conda activate sglang && python3 merge_lora.py"
19
+ exit 1
20
+ fi
21
+
22
+ echo "============================================"
23
+ echo " SGLang STANDALONE Speculative Decoding"
24
+ echo " target : $BASE_MODEL"
25
+ echo " draft : $MERGED"
26
+ echo " host : $INTRANET_IP:$PORT"
27
+ echo " tp : $TP"
28
+ echo "============================================"
29
+
30
+ /workspace/miniconda3/envs/sglang/bin/python3 -m sglang.launch_server \
31
+ --model-path $BASE_MODEL \
32
+ --speculative-algorithm STANDALONE \
33
+ --speculative-draft-model-path $MERGED \
34
+ --speculative-num-steps 4 \
35
+ --speculative-eagle-topk 1 \
36
+ --speculative-num-draft-tokens 4 \
37
+ --tp-size $TP \
38
+ --mem-fraction-static 0.30 \
39
+ --trust-remote-code \
40
+ --host $INTRANET_IP \
41
+ --port $PORT \
42
+ --dtype bfloat16
syxin/start_server_dflash.sh ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ # Evaluate DFlash-LoRA-Inject: measure accepted length OFFLINE.
3
+ # 8 GPUs parallel by default, each GPU runs a shard of prompts independently.
4
+ #
5
+ # WHY offline?
6
+ # sglang STANDALONE treats draft as an independent autoregressive model,
7
+ # completely ignoring the layer-by-layer injection that LoRA-Inject was
8
+ # trained with. Result: accept_length ≈ 4.7 for ALL models (no signal).
9
+ #
10
+ # sglang DFLASH expects the DFlash-b16 architecture (5-layer, fc+hidden_norm),
11
+ # which is structurally different from LoRA-Inject (full 36-layer + LoRA).
12
+ #
13
+ # So we run offline spec-generate with the correct injection pattern.
14
+ #
15
+ # Usage:
16
+ # bash start_server_dflash.sh # 8 GPUs, all benchmarks
17
+ # bash start_server_dflash.sh 4 # 4 GPUs
18
+ # bash start_server_dflash.sh 8 humaneval # specific benchmark
19
+ # bash start_server_dflash.sh 8 --num-samples 20 # quick test
20
+
21
+ set -e
22
+
23
+ SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
24
+
25
+ NUM_GPUS=${1:-8}
26
+ shift 2>/dev/null || true
27
+
28
+ # ---- defaults ----
29
+ BASE_MODEL=/workspace/models/Qwen3-8B
30
+ ADAPTER_ROOT=/workspace/hanrui/syxin_old/Specforge/outputs/qwen3-8b-dflash-lora-inject
31
+ CKPT=epoch_3_step_1400
32
+ MERGED=/workspace/hanrui/syxin_old/Specforge/outputs/qwen3-8b-dflash-lora-inject-merged
33
+ RESULT_DIR=/workspace/hanrui/syxin_old/Specforge/benchmarks/results
34
+ PYTHON=/workspace/miniconda3/envs/spec/bin/python3
35
+
36
+ echo "============================================"
37
+ echo " DFlash-LoRA-Inject Offline Evaluation"
38
+ echo " target : $BASE_MODEL"
39
+ echo " ckpt : $CKPT"
40
+ echo " merged : $MERGED"
41
+ echo " GPUs : $NUM_GPUS"
42
+ echo "============================================"
43
+
44
+ $PYTHON -m torch.distributed.run \
45
+ --standalone \
46
+ --nproc_per_node $NUM_GPUS \
47
+ $SCRIPT_DIR/eval_dflash_lora_inject.py \
48
+ --base-model $BASE_MODEL \
49
+ --adapter-root $ADAPTER_ROOT \
50
+ --ckpt $CKPT \
51
+ --merged-path $MERGED \
52
+ --block-size 16 \
53
+ --output-dir $RESULT_DIR \
54
+ "$@"
syxin/step1.md ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Now I have a complete picture. Here's the full analysis and proposed plan.                                                               
2
+                                                                                                                                            
3
+   ---                                                                                                                                      
4
+   Current Logic of train_dflash.py                                                                                                         
5
+                                                                                                                                            
6
+   This script trains a small, standalone DFlash draft model (1–10 transformer layers) that learns to predict token blocks in parallel,     
7
+   guided by hidden states extracted from a frozen target model.
8
+
9
+   Architecture (two separate models)
10
+
11
+   1. Target model (frozen, e.g. Qwen3-8B/32B) — runs a full forward pass to produce hidden states at selected layers.
12
+   2. Draft model (DFlashDraftModel, ~1–10 layers) — a lightweight Qwen3-based decoder that takes noise embeddings + target hidden states
13
+   and predicts the block tokens.
14
+   3. Target embed_tokens + lm_head — loaded separately via TargetEmbeddingsAndHead to avoid duplicating the full target model in memory.
15
+
16
+   Key locations
17
+
18
+   ┌──────────────────────────┬────────────────────────────────────┬───────────────────────────────────────────────────────┐
19
+   │        Component         │                File                │                         Lines                         │
20
+   ├──────────────────────────┼────────────────────────────────────┼───────────────────────────────────────────────────────┤
21
+   │ Model init               │ scripts/train_dflash.py            │ build_models() L254–311                               │
22
+   ├──────────────────────────┼────────────────────────────────────┼───────────────────────────────────────────────────────┤
23
+   │ Target hidden extraction │ scripts/train_dflash.py            │ L644–647 (target_model.generate_dflash_data)          │
24
+   ├──────────────────────────┼────────────────────────────────────┼───────────────────────────────────────────────────────┤
25
+   │ Forward pass             │ specforge/core/dflash.py           │ OnlineDFlashModel.forward() L243–332                  │
26
+   ├──────────────────────────┼────────────────────────────────────┼───────────────────────────────────────────────────────┤
27
+   │ Loss calculation         │ specforge/core/dflash.py           │ _full_lm_loss() L382–417, _chunked_lm_loss() L419–478 │
28
+   ├──────────────────────────┼────────────────────────────────────┼───────────────────────────────────────────────────────┤
29
+   │ Loss mask                │ specforge/core/dflash.py           │ create_dflash_loss_mask() L481–509                    │
30
+   ├──────────────────────────┼────────────────────────────────────┼───────────────────────────────────────────────────────┤
31
+   │ Draft model architecture │ specforge/modeling/draft/dflash.py │ DFlashDraftModel L212–266                             │
32
+   ├──────────────────────────┼────────────────────────────────────┼───────────────────────────────────────────────────────┤
33
+   │ DFlash attention         │ specforge/modeling/draft/dflash.py │ Qwen3DFlashAttention L42–134                          │
34
+   └──────────────────────────┴────────────────────────────────────┴───────────────────────────────────────────────────────┘
35
+
36
+   Forward pass flow (per training step)
37
+
38
+   input_ids, attention_mask, loss_mask  →  target_model.generate_dflash_data()
39
+                                                 ↓
40
+                                        hidden_states (from target layers [1,9,17,25,33])
41
+                                                 ↓
42
+                                 OnlineDFlashModel.forward():
43
+                                   1. Truncate to block boundary
44
+                                   2. prepare_noise_input(): anchor tokens kept, rest → MASK
45
+                                   3. embed_tokens(noise_input_ids) → noise_embedding
46
+                                   4. Build DFlash attention mask (flex_attention or additive)
47
+                                   5. draft_model(noise_embedding, target_hidden, mask)
48
+                                   6. lm_head(hidden) → logits
49
+                                   7. CE loss on non-anchor positions (weighted by loss_mask × decay)
50
+
51
+   The draft model's custom Qwen3DFlashAttention concatenates [context_hidden, noise_hidden] as KV, with queries only from noise tokens. The
52
+    attention mask enforces: block tokens see all preceding blocks' context + bidirectional within their own block.
53
+
54
+   ---
55
+   What already exists: train_dflash_lora.py
56
+
57
+   Interestingly, the repo already has a LoRA variant at scripts/train_dflash_lora.py with its own model (DFlashLoRADraftModel) and wrapper
58
+   (OnlineDFlashLoRAModel). This is exactly the approach you described — Qwen3-8B + LoRA, no separate target model, 1-step diffusion
59
+   training. The key differences from train_dflash.py:
60
+
61
+   ┌─────────────────┬─────────────────────────────────────────────────────────────┬────────────────────────────────────────────────────┐
62
+   │     Aspect      │                       train_dflash.py                       │                train_dflash_lora.py                │
63
+   ├─────────────────┼─────────────────────────────────────────────────────────────┼────────────────────────────────────────────────────┤
64
+   │ Draft model     │ Small custom DFlashDraftModel (1–10 layers)                 │ Full Qwen3-8B + LoRA adapters                      │
65
+   ├─────────────────┼─────────────────────────────────────────────────────────────┼────────────────────────────────────────────────────┤
66
+   │ Target model    │ Separate frozen model for hidden state extraction           │ None — model uses its own representations          │
67
+   ├─────────────────┼─────────────────────────────────────────────────────────────┼────────────────────────────────────────────────────┤
68
+   │ Attention       │ Custom Qwen3DFlashAttention (Q from noise, KV from [ctx,    │ Standard HF attention with 4D additive DFlash mask │
69
+   │                 │ noise])                                                     │                                                    │
70
+   ├─────────────────┼─────────────────────────────────────────────────────────────┼────────────────────────────────────────────────────┤
71
+   │ Forward         │ draft_model(noise_emb, target_hidden, mask)                 │ model(noise_input_ids, 4d_mask, position_ids) →    │
72
+   │                 │                                                             │ logits                                             │
73
+   ├─────────────────┼─────────────────────────────────────────────────────────────┼────────────────────────────────────────────────────┤
74
+   │ Trainable       │ All draft model params                                      │ Only LoRA (q/k/v/o_proj), base frozen              │
75
+   │ params          │                                                             │                                                    │
76
+   ├─────────────────┼─────────────────────────────────────────────────────────────┼────────────────────────────────────────────────────┤
77
+   │ FSDP strategy   │ SHARD_GRAD_OP                                               │ FULL_SHARD                                         │
78
+   └─────────────────┴─────────────────────────────────────────────────────────────┴────────────────────────────────────────────────────┘
79
+
80
+   ---
81
+   Proposed Modification Plan
82
+
83
+   Since train_dflash_lora.py already implements the core idea, the plan focuses on what's missing or needs improvement to make it a proper
84
+   "1-step dLLM draft model" for your research:
85
+
86
+   Phase 1: Validate and extend the existing LoRA pipeline
87
+
88
+   1. Add MLP to LoRA targets — The current config only targets q_proj, k_proj, v_proj, o_proj. For stronger 1-step diffusion capability,
89
+   add gate_proj, up_proj, down_proj to lora_target_modules. This gives the model more capacity to learn the non-autoregressive distribution
90
+    shift.
91
+   2. Add multi-step noise schedule support — Currently the training is strictly 1-step (all non-anchors → MASK). For a proper diffusion/AR
92
+   fusion, add an option for a noise schedule where a fraction of block tokens are revealed (not just the anchor), controlled by a
93
+   noise_ratio parameter. This would modify prepare_noise_input() in OnlineDFlashLoRAModel:
94
+   # Instead of: all non-anchor → MASK
95
+   # Allow: randomly keep some non-anchor tokens with probability (1 - noise_ratio)
96
+   3. Add configurable context_len strategy — Currently context_len=0 treats the whole sequence as blocks. Add a --context-ratio arg that
97
+   dynamically sets context_len as a fraction of the sequence, so the model learns to condition on varying amounts of AR-decoded prefix.
98
+
99
+   Phase 2: Training logic improvements
100
+
101
+   4. Add KL divergence loss — In addition to CE loss against ground truth, add an optional KL loss against the base model's AR distribution
102
+    (teacher forcing). This regularizes the LoRA model to stay close to the original Qwen3-8B distribution. Modify
103
+   OnlineDFlashLoRAModel.forward():
104
+   # Compute base model logits (no_grad, no LoRA) as teacher
105
+   # KL(draft_logits || teacher_logits) on block positions
106
+   # total_loss = ce_loss + kl_weight * kl_loss
107
+   5. Add evaluation with speculative decoding metrics — The current accuracy metric is block-wise acceptance rate. Add an eval loop that
108
+   actually runs speculative decoding (draft → verify) to measure real speedup, using the LoRA model as the drafter and the base model (with
109
+    LoRA disabled) as the verifier.
110
+
111
+   Phase 3: Integration with train_dflash.py style features
112
+
113
+   6. Port random anchor sampling — train_dflash.py has --random-anchor (L147–156) which samples diverse anchor positions per sequence. This
114
+    is missing from the LoRA variant and would improve training diversity. Port _sample_anchor_positions and _build_blocks_from_anchors from
115
+    OnlineDFlashModel to OnlineDFlashLoRAModel.
116
+   7. Port chunked lm_head — The LoRA variant materializes full [bsz, seq_len, vocab_size] logits. For Qwen3-8B's 152K vocab, this is ~1.2GB
117
+    per sample at seq_len=2048. Port the _chunked_lm_loss from OnlineDFlashModel for memory efficiency.
118
+   8. Add tensor parallelism support — The LoRA script currently forces tp_size=1. For Qwen3-8B on multi-GPU setups, add TP support to shard
119
+    the base model across GPUs while keeping LoRA params on each rank.
120
+
121
+   Minimal starting point
122
+
123
+   If you want to just get training running immediately, the existing train_dflash_lora.py + configs/qwen3-8b-dflash-lora.json should work
124
+   out of the box:
125
+
126
+   torchrun --nproc_per_node=4 scripts/train_dflash_lora.py \
127
+     --model-path Qwen/Qwen3-8B \
128
+     --lora-config configs/qwen3-8b-dflash-lora.json \
129
+     --train-data-path /path/to/data.jsonl \
130
+     --output-dir ./outputs/qwen3-8b-dflash-lora \
131
+     --block-size 16 \
132
+     --gradient-checkpointing \
133
+     --learning-rate 2e-4 \
134
+     --batch-size 1 \
135
+     --max-length 2048 \
136
+     --num-epochs 3
137
+
138
+   Want me to start implementing any of these phases? I'd suggest starting with Phase 1 (noise schedule + MLP targets) since that's the most
139
+    impactful for the diffusion/AR fusion research angle.