#!/usr/bin/env python3
"""
从 HuggingFace 风格的 config.json 构建 inference/model.py 中的 Transformer,
导出 state_dict,键名与仓库内 model.safetensors.index.json / info.html 中的 Metadata 列一致
(例如 embed.weight、layers.0.attn.wq_a.weight、layers.0.ffn.shared_experts.w1.weight,
无 transformers 的 model. 前缀)。
分片保存:指定 --output-dir 时,按 --index-json(默认仓库根目录 model.safetensors.index.json)
的 weight_map 将张量写入对应 model-XXXX-of-YYYY.safetensors,并生成同目录下的
model.safetensors.index.json(键名与参考索引一致,分片文件名与参考索引一致)。
用法示例:
python export_state_dict_from_config.py --config config.json --output /tmp/out.safetensors
python export_state_dict_from_config.py --config config.json --output-dir /tmp/shards
完整 max_position_embeddings 会在每层分配 RoPE 缓冲,内存很大;默认将 max_seq_len 限制为
65536,可用 --max-seq-len 覆盖。state_dict 不包含 persistent=False 的 buffer,与官方分片键集合对齐。
"""
from __future__ import annotations
import argparse
import json
import sys
from collections import defaultdict
from pathlib import Path
from typing import Any
REPO_ROOT = Path(__file__).resolve().parent
INFERENCE_DIR = REPO_ROOT / "inference"
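# Make inference/model.py importable as a top-level "model" module.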
sys.path.insert(0, str(INFERENCE_DIR))
from model import ModelArgs, Transformer # noqa: E402
def without_mtp_tied_aliases(sd: dict[str, Any]) -> dict[str, Any]:
"""MTP 的 embed/head 与主模块共享参数,safetensors 不能重复保存这些别名。"""
out: dict[str, Any] = {}
for k, v in sd.items():
parts = k.split(".")
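        # Tied aliases look like "mtp.<i>.embed.*" or "mtp.<i>.head.*"; skip them.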
if len(parts) >= 4 and parts[0] == "mtp" and parts[2] in {"embed", "head"}:
continue
out[k] = v
return out
def _load_index_weight_map(index_path: Path) -> dict[str, str]:
with open(index_path, encoding="utf-8") as f:
idx = json.load(f)
wm = idx.get("weight_map")
if not isinstance(wm, dict):
raise SystemExit(f"{index_path} 缺少 weight_map")
return dict(wm)
def save_sharded_index_style(
sd_inference: dict[str, Any],
weight_map: dict[str, str],
out_dir: Path,
) -> None:
"""按 weight_map 中的分片文件名写入多个 .safetensors,并写 model.safetensors.index.json。"""
from safetensors.torch import save_file
out_dir.mkdir(parents=True, exist_ok=True)
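    # Group tensors by target shard file, keeping only keys present in this state_dict.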
shard_to_tensors: dict[str, dict[str, Any]] = defaultdict(dict)
new_weight_map: dict[str, str] = {}
for inf_key, shard_file in weight_map.items():
if inf_key not in sd_inference:
continue
shard_to_tensors[shard_file][inf_key] = sd_inference[inf_key]
new_weight_map[inf_key] = shard_file
if not new_weight_map:
raise SystemExit("weight_map 与当前 state_dict 无交集,无法分片写出")
total_size = 0
for shard_file, part in shard_to_tensors.items():
for t in part.values():
total_size += int(t.numel()) * int(t.element_size())
save_file(part, str(out_dir / shard_file))
index_out = {
"metadata": {"total_size": total_size},
"weight_map": new_weight_map,
}
with open(out_dir / "model.safetensors.index.json", "w", encoding="utf-8") as f:
json.dump(index_out, f, indent=2)
f.write("\n")
def _is_hf_deepseek_config(cfg: dict[str, Any]) -> bool:
return cfg.get("model_type") == "deepseek_v4" or "num_hidden_layers" in cfg
def hf_config_to_model_args(
cfg: dict[str, Any],
*,
max_batch_size: int,
max_seq_len: int,
) -> ModelArgs:
"""将仓库根目录下 HuggingFace 的 config.json 转为 inference.ModelArgs。"""
rope = cfg.get("rope_scaling") or {}
qc = cfg.get("quantization_config") or {}
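    # Treat the checkpoint as fp8 if the quantization config says so, either via
    # quant_method or an e4m3-style fmt field.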
use_fp8 = qc.get("quant_method") == "fp8" or str(qc.get("fmt", "")).lower() in (
"e4m3",
"e4m3fn",
"fp8",
)
n_layers = int(cfg["num_hidden_layers"])
ratios = list(cfg.get("compress_ratios") or ())
compress_ratios = tuple(int(x) for x in ratios)
return ModelArgs(
max_batch_size=max_batch_size,
max_seq_len=max_seq_len,
dtype="fp8" if use_fp8 else "bf16",
scale_fmt=qc.get("scale_fmt") or cfg.get("scale_fmt") or "ue8m0",
expert_dtype=cfg.get("expert_dtype"),
scale_dtype=cfg.get("scale_dtype") or ("fp8" if use_fp8 else "fp32"),
vocab_size=int(cfg["vocab_size"]),
dim=int(cfg["hidden_size"]),
moe_inter_dim=int(cfg["moe_intermediate_size"]),
n_layers=n_layers,
n_hash_layers=int(cfg.get("num_hash_layers", 0)),
n_mtp_layers=int(cfg.get("num_nextn_predict_layers", 0)),
n_heads=int(cfg["num_attention_heads"]),
n_routed_experts=int(cfg["n_routed_experts"]),
n_shared_experts=int(cfg.get("n_shared_experts", 1)),
n_activated_experts=int(cfg["num_experts_per_tok"]),
        score_func=str(cfg.get("scoring_func", cfg.get("score_func", "sqrtsoftplus"))),
        route_scale=float(cfg.get("routed_scaling_factor", cfg.get("route_scale", 1.0))),
swiglu_limit=float(cfg.get("swiglu_limit", 0.0)),
q_lora_rank=int(cfg["q_lora_rank"]),
head_dim=int(cfg["head_dim"]),
rope_head_dim=int(cfg["qk_rope_head_dim"]),
norm_eps=float(cfg.get("rms_norm_eps", cfg.get("norm_eps", 1e-6))),
o_groups=int(cfg["o_groups"]),
o_lora_rank=int(cfg["o_lora_rank"]),
window_size=int(cfg["sliding_window"]),
compress_ratios=compress_ratios,
compress_rope_theta=float(cfg.get("compress_rope_theta", 160000.0)),
original_seq_len=int(
rope.get(
"original_max_position_embeddings",
cfg.get("original_seq_len", 0),
)
),
rope_theta=float(cfg.get("rope_theta", 10000.0)),
rope_factor=float(rope.get("factor", cfg.get("rope_factor", 1.0))),
beta_fast=int(rope.get("beta_fast", cfg.get("beta_fast", 32))),
beta_slow=int(rope.get("beta_slow", cfg.get("beta_slow", 1))),
index_n_heads=int(cfg["index_n_heads"]),
index_head_dim=int(cfg["index_head_dim"]),
index_topk=int(cfg["index_topk"]),
hc_mult=int(cfg["hc_mult"]),
hc_sinkhorn_iters=int(cfg["hc_sinkhorn_iters"]),
hc_eps=float(cfg.get("hc_eps", 1e-6)),
)
def load_model_args(
config_path: Path,
*,
max_batch_size: int,
max_seq_len: int | None,
cap_seq_len: int,
) -> ModelArgs:
with open(config_path, encoding="utf-8") as f:
raw = json.load(f)
if _is_hf_deepseek_config(raw):
cfg_max = int(raw.get("max_position_embeddings", cap_seq_len))
        mseq = max_seq_len if max_seq_len is not None else min(cfg_max, cap_seq_len)
return hf_config_to_model_args(
raw, max_batch_size=max_batch_size, max_seq_len=mseq
)
return ModelArgs(**raw)
def optional_prefix_keys(
sd: dict[str, Any], prefix: str | None
) -> dict[str, Any]:
if not prefix:
return sd
p = prefix.rstrip(".") + "."
return {p + k: v for k, v in sd.items()}
def validate_against_index(
keys: set[str], index_path: Path
) -> tuple[set[str], set[str]]:
with open(index_path, encoding="utf-8") as f:
idx = json.load(f)
ref = set(idx.get("weight_map", {}).keys())
missing = ref - keys
extra = keys - ref
return missing, extra
def main() -> None:
ap = argparse.ArgumentParser(description=__doc__)
ap.add_argument(
"--config",
type=Path,
default=REPO_ROOT / "config.json",
help="config.json(HF 或 inference 格式)",
)
out_group = ap.add_mutually_exclusive_group(required=True)
out_group.add_argument(
"--output",
type=Path,
help="输出单个 .safetensors 或 .pt / .pth",
)
out_group.add_argument(
"--output-dir",
type=Path,
default=None,
help="按 index 的 weight_map 分片写入该目录(仅 safetensors + model.safetensors.index.json)",
)
ap.add_argument("--device", type=str, default="cpu",
help="cpu / cuda / meta")
ap.add_argument("--max-batch-size", type=int, default=4)
ap.add_argument(
"--max-seq-len",
type=int,
default=None,
help="覆盖 ModelArgs.max_seq_len;默认 min(max_position_embeddings, --cap-seq-len)",
)
ap.add_argument(
"--cap-seq-len",
type=int,
default=65536,
help="HF 配置下默认 max_seq_len 上限,避免每层分配过大 RoPE 缓冲",
)
ap.add_argument(
"--prefix",
type=str,
default="",
        help='prepend this prefix to output key names (e.g. "model."); default is no prefix, matching the index',
)
ap.add_argument(
"--index-json",
type=Path,
default=None,
help="校验或分片布局:默认在 --output-dir 时为仓库根 model.safetensors.index.json",
)
ap.add_argument("--strict-index", action="store_true",
help="与 --index-json 联用,要求键完全一致")
args = ap.parse_args()
import torch
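    # Create floating-point parameters in bf16 by default.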
torch.set_default_dtype(torch.bfloat16)
margs = load_model_args(
args.config,
max_batch_size=args.max_batch_size,
max_seq_len=args.max_seq_len,
cap_seq_len=args.cap_seq_len,
)
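    # Dump the resolved ModelArgs to ds_config.json in the repo root for inspection.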
with open(REPO_ROOT / "ds_config.json", "w", encoding="utf-8") as f:
json.dump(margs.__dict__, f, indent=2)
dev = torch.device(args.device)
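    # Build the model under the target device context; device=meta creates
    # shape-only tensors without allocating real storage.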
with torch.device(dev):
from transformers import set_seed
set_seed(42)
model = Transformer(margs)
n_params = sum(p.numel() for p in model.parameters())
print(f"Number of parameters: {n_params}")
for k, v in model.named_modules():
if k.count('.') <= 3:
n_params_k = sum(p.numel() for p in v.parameters())
print(k, f"{n_params_k} {n_params_k / n_params:.2%}")
if dev.type != "meta":
model = model.to(dev)
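    # Keep norm weights and the output head in bf16 regardless of the model dtype.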
for k, v in model.named_parameters():
if 'norm.weight' in k or k == "head.weight":
v.data = v.data.to(torch.bfloat16)
sd_inference = without_mtp_tied_aliases(model.state_dict())
index_path = args.index_json
if args.output_dir is not None and index_path is None:
index_path = REPO_ROOT / "model.safetensors.index.json"
if index_path is not None:
missing, extra = validate_against_index(
set(sd_inference.keys()), index_path)
if missing or extra:
msg = f"index 对比: missing={len(missing)} extra={len(extra)}"
if missing and len(missing) <= 20:
msg += f"\n missing 样例: {sorted(missing)[:20]}"
elif missing:
msg += f"\n missing 样例: {sorted(missing)[:5]} ..."
if extra and len(extra) <= 20:
msg += f"\n extra 样例: {sorted(extra)[:20]}"
elif extra:
msg += f"\n extra 样例: {sorted(extra)[:5]} ..."
if args.strict_index:
raise SystemExit(msg)
print(msg, file=sys.stderr)
if args.output_dir is not None:
if dev.type == "meta":
raise SystemExit("device=meta 时无法写 safetensors,请改用 cpu/cuda")
wm = _load_index_weight_map(index_path) # type: ignore[arg-type]
save_sharded_index_style(
sd_inference,
wm,
args.output_dir,
)
written = sum(1 for k in wm if k in sd_inference)
n_shards = len({wm[k] for k in wm if k in sd_inference})
print(
f"Wrote {written} tensors in {n_shards} shard files under {args.output_dir}"
)
return
sd = optional_prefix_keys(sd_inference, args.prefix or None)
out = args.output
out.parent.mkdir(parents=True, exist_ok=True)
suffix = out.suffix.lower()
for k, v in sd.items():
print(k, v.shape, v.dtype)
if suffix == ".safetensors":
from safetensors.torch import save_file
        # meta tensors cannot be written to safetensors
        if dev.type == "meta":
            raise SystemExit(
                "cannot write safetensors with device=meta; use cpu/cuda or a .pt output instead")
save_file(sd, str(out))
    else:
        torch.save(sd, out)
print(f"Wrote {len(sd)} tensors to {out}")
if __name__ == "__main__":
main()