Add files using upload-large-folder tool

1792b03 verified 21 days ago

336 kB

	#!/usr/bin/env python3
	from __future__ import annotations

	import os
	import argparse
	import contextlib
	import inspect
	import json
	from pathlib import Path
	from typing import Any, List, Tuple, Optional

	import torch
	import torch.nn as nn
	from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

	try:
	from .llopa_utils.saving_utils import load_embedding_layer, load_llopa_specials, read_backbone_ref
	except Exception:
	from llopa_utils.saving_utils import load_embedding_layer, load_llopa_specials, read_backbone_ref
	from transformers.cache_utils import DynamicCache

	# Be safe with tokenizers threads when forking
	# setdefault only fills in a value when the variable is not already set, so an
	# explicit setting exported by the caller's environment is left untouched.
	os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
	# NOTE(review): the DS_BUILD_* variables are presumably read by DeepSpeed to skip
	# building optional ops - confirm against the DeepSpeed version in use.
	os.environ.setdefault("DS_BUILD_AIO", "0")
	os.environ.setdefault("DS_BUILD_OPS", "0")


	def _dtype_kwargs(from_pretrained_fn, dtype: torch.dtype) -> dict:
	try:
	params = inspect.signature(from_pretrained_fn).parameters
	except Exception:
	return {"torch_dtype": dtype}
	if "torch_dtype" in params:
	return {"torch_dtype": dtype}
	if "dtype" in params:
	return {"dtype": dtype}
	# With **kwargs-only signatures, prefer torch_dtype. Passing raw `dtype`
	# can leak into GenerationConfig kwargs on some transformers versions and
	# trigger JSON serialization errors for torch.dtype.
	if any(p.kind == inspect.Parameter.VAR_KEYWORD for p in params.values()):
	return {"torch_dtype": dtype}
	return {"torch_dtype": dtype}


	def _tokenizer_kwargs(from_pretrained_fn) -> dict:
	kwargs: dict = {}
	try:
	params = inspect.signature(from_pretrained_fn).parameters
	if "fix_mistral_regex" in params:
	kwargs["fix_mistral_regex"] = True
	elif any(p.kind == inspect.Parameter.VAR_KEYWORD for p in params.values()):
	# Older/newer tokenizer loaders often hide optional args behind **kwargs.
	kwargs["fix_mistral_regex"] = True
	except Exception:
	pass
	return kwargs


	def _normalize_dtype_arg(value) -> Optional[str]:
	if value is None:
	return None
	if isinstance(value, str):
	s = value.strip().lower()
	elif value is torch.bfloat16:
	s = "bfloat16"
	elif value is torch.float16:
	s = "float16"
	elif value is torch.float32:
	s = "float32"
	else:
	return None
	mapping = {
	"auto": "auto",
	"bf16": "bf16",
	"bfloat16": "bf16",
	"fp16": "fp16",
	"float16": "fp16",
	"half": "fp16",
	"fp32": "fp32",
	"float32": "fp32",
	}
	return mapping.get(s)

	# Special assistant-start token for Mistral-style templates
	MISTRAL_ASSIST_START = "<Mistral_start>"

	# Instruction appended after the question in prompts (requested).
	# NOTE(review): as written, each "\\\\" below evaluates to TWO backslash characters,
	# so the prompt text contains a doubled backslash before boxed{}. Confirm whether a
	# single LaTeX-style backslash was intended before changing it, since this text is
	# part of the prompts sent to the model.
	BOXED_ANSWER_INSTRUCTION = (
	"Provide the answer and finish with the short answer inside \\\\boxed{}."
	)
	# Instruction used for the "math" task prompts.
	MATH_STEP_BY_STEP_BOXED_INSTRUCTION = (
	"Solve the problem step by step, and put the final answer inside \\\\boxed{}."
	)

	# -----------------------------
	# Custom modeling loader (TRI)
	# -----------------------------
	def load_custom_modeling(modeling_path: str, model_family: str):
	    """Load local tri_*_modeling.py and register it as the HF module.

	    The file at ``modeling_path`` is executed under the dotted name of the
	    stock transformers modeling module for ``model_family`` and stored in
	    ``sys.modules`` under that name.

	    Args:
	        modeling_path: Path of the custom modeling source file.
	        model_family: One of ``"llama"``, ``"qwen3"`` or ``"mistral"``.

	    Returns:
	        The freshly executed module object.

	    Raises:
	        ValueError: ``model_family`` is not one of the supported families.
	        RuntimeError: No import spec could be built for ``modeling_path``, or
	            the executed module lacks one of the expected classes.

	    If loading fails for any reason, ``sys.modules`` is put back the way it
	    was before the call, so a failed attempt does not leave the modeling
	    module missing or half-initialised.
	    """
	    import importlib
	    import importlib.util
	    import sys

	    families = {
	        "llama": ("transformers.models.llama", "modeling_llama", ("LlamaModel", "LlamaForCausalLM")),
	        "qwen3": ("transformers.models.qwen3", "modeling_qwen3", ("Qwen3Model", "Qwen3ForCausalLM")),
	        "mistral": ("transformers.models.mistral", "modeling_mistral", ("MistralModel", "MistralForCausalLM")),
	    }
	    if model_family not in families:
	        raise ValueError(f"Unknown model_family: {model_family}")
	    package_name, module_stem, expected = families[model_family]

	    importlib.import_module(package_name)  # ensure package exists
	    target_name = f"{package_name}.{module_stem}"

	    # Remember what was registered so it can be restored if loading fails.
	    previous = sys.modules.pop(target_name, None)
	    try:
	        spec = importlib.util.spec_from_file_location(target_name, str(modeling_path))
	        if spec is None or spec.loader is None:
	            raise RuntimeError(f"Failed to load spec for {modeling_path}")
	        module = importlib.util.module_from_spec(spec)
	        # Register before executing so imports inside the file can resolve it.
	        sys.modules[target_name] = module
	        spec.loader.exec_module(module)
	        for klass in expected:
	            if not hasattr(module, klass):
	                raise RuntimeError(f"{modeling_path} does not define {klass}")
	    except BaseException:
	        if previous is not None:
	            sys.modules[target_name] = previous
	        else:
	            sys.modules.pop(target_name, None)
	        raise
	    return module


	def infer_model_family(model_name: str, model_family_arg: str) -> str:
	    """Resolve the model family from an explicit argument or the model name.

	    A non-empty ``model_family_arg`` other than ``"auto"`` is returned as-is.
	    Otherwise the lower-cased ``model_name`` is searched for ``qwen``,
	    ``llama`` and ``mistral`` in that order; ``"llama"`` is the fallback.
	    """
	    if model_family_arg and model_family_arg != "auto":
	        return model_family_arg
	    lowered = (model_name or "").lower()
	    for needle, family in (("qwen", "qwen3"), ("llama", "llama"), ("mistral", "mistral")):
	        if needle in lowered:
	            return family
	    return "llama"

	# -----------------------------
	# Template helpers
	# -----------------------------
	def _is_mistral_template(tokenizer) -> bool:
	tmpl = getattr(tokenizer, "chat_template", "") or ""
	name = getattr(getattr(tokenizer, "init_kwargs", {}), "get", lambda k, d=None: d)("name_or_path", "")
	return ("[INST]" in tmpl) or ("mistral" in str(name).lower()) or ("mistral" in tmpl.lower())

	def _is_qwen3_tokenizer(tokenizer) -> bool:
	name = getattr(tokenizer, "name_or_path", "") or ""
	cls = tokenizer.__class__.__name__.lower()
	return ("qwen3" in name.lower()) or ("qwen3" in cls)

	def ensure_mistral_special_token(tokenizer, model=None):
	    """Register the custom assistant-start token on Mistral-style tokenizers.

	    Returns ``True`` only when the token was newly added. When ``model`` is
	    given, its embeddings are resized to ``len(tokenizer)`` on a best-effort
	    basis: a failure during resizing is suppressed.
	    """
	    if not _is_mistral_template(tokenizer):
	        return False
	    known_tokens = set(tokenizer.get_vocab().keys())
	    if MISTRAL_ASSIST_START in known_tokens:
	        return False
	    already_registered = tokenizer.special_tokens_map_extended.get("additional_special_tokens", [])
	    tokenizer.add_special_tokens({
	        "additional_special_tokens": already_registered + [MISTRAL_ASSIST_START]
	    })
	    if model is not None:
	        with contextlib.suppress(Exception):
	            model.resize_token_embeddings(len(tokenizer))
	    return True

	def build_prompt_parts(task: str, question: str, document: str) -> Tuple[str, str]:
	    """Build the ``(document_text, query_text)`` pair for a task.

	    ``task`` defaults to ``"qa_doc"`` when empty; all three inputs are
	    stripped. ``summary`` pairs a ``Passage:`` block with an instruction,
	    ``code`` and ``math`` return an empty document part, and any other task
	    yields a ``Document:`` block plus a ``Question:`` line followed by the
	    boxed-answer instruction.
	    """
	    kind = (task or "qa_doc").strip()
	    query = (question or "").strip()
	    doc = (document or "").strip()

	    if kind == "summary":
	        passage = f"Passage:\n{doc}" if doc else ""
	        return passage, (query if query else "Summarize the passage.")

	    if kind == "code":
	        problem = query if query else "Solve the following problem."
	        return "", f"Write code to solve the following problem:\n{problem}"

	    if kind == "math":
	        # The document doubles as the problem statement when no question is given.
	        statement = (query if query else doc).strip()
	        if not statement:
	            return "", MATH_STEP_BY_STEP_BOXED_INSTRUCTION
	        return "", f"{statement}\n{MATH_STEP_BY_STEP_BOXED_INSTRUCTION}"

	    doc_text = f"Document:\n{doc}" if doc else ""
	    query_text = f"Question: {query}\n{BOXED_ANSWER_INSTRUCTION}" if query else ""
	    return doc_text, query_text

	def build_user_content(task: str, question: str, document: str) -> str:
	    """Join the non-empty prompt parts for a task with a blank line between them."""
	    pieces = [piece for piece in build_prompt_parts(task, question, document) if piece]
	    return "\n\n".join(pieces)

	def build_messages(system: str, document: str, question: str,
	                   include_query: bool = True, task: str = "qa_doc"):
	    """Build a two-message chat (system, user) for the given task.

	    With ``include_query=False`` the question is treated as empty. If the
	    task-specific user content comes out empty, the raw question or document
	    (or an empty string) is used instead.
	    """
	    effective_question = question if include_query else ""
	    user_turn = build_user_content(task, effective_question, document)
	    if not user_turn:
	        user_turn = effective_question or document or ""
	    system_message = {"role": "system", "content": system}
	    user_message = {"role": "user", "content": user_turn}
	    return [system_message, user_message]

	def apply_chat_template(tokenizer, messages, add_generation_prompt: bool):
	    """Render chat messages to text with a fallback for older tokenizer signatures.

	    - Prefer ``tokenizer.apply_chat_template(..., add_generation_prompt=...)``.
	    - If that call raises ``TypeError``, render without the flag and, when a
	      generation prompt was requested, inspect the template text:
	        * Llama-3 style (contains ``<|start_header_id|>``): append the
	          assistant header tokens.
	        * Mistral/INST style or unknown: append nothing.

	    ``enable_thinking`` is passed only when the tokenizer carries a
	    ``_force_enable_thinking`` attribute or is detected as Qwen3 (in which
	    case thinking is disabled).

	    Args:
	        tokenizer: Object exposing ``apply_chat_template``.
	        messages: Chat messages to render.
	        add_generation_prompt: Whether the assistant header should follow the
	            rendered messages.

	    Returns:
	        The rendered prompt string.
	    """
	    force_thinking = getattr(tokenizer, "_force_enable_thinking", None)
	    if force_thinking is None and _is_qwen3_tokenizer(tokenizer):
	        force_thinking = False
	    thinking_kwargs = {} if force_thinking is None else {"enable_thinking": bool(force_thinking)}

	    try:
	        return tokenizer.apply_chat_template(
	            messages,
	            tokenize=False,
	            add_generation_prompt=add_generation_prompt,
	            **thinking_kwargs,
	        )
	    except TypeError:
	        tmpl = getattr(tokenizer, "chat_template", "") or ""
	        try:
	            s = tokenizer.apply_chat_template(messages, tokenize=False, **thinking_kwargs)
	        except TypeError:
	            # Last resort: the tokenizer accepts neither optional keyword.
	            s = tokenizer.apply_chat_template(messages, tokenize=False)
	        # The marker must be the literal special-token text (no backslashes),
	        # otherwise it can never match a real Llama-3 template.
	        if add_generation_prompt and "<|start_header_id|>" in tmpl:  # Llama 3 style
	            s += "<|start_header_id|>assistant<|end_header_id|>\n\n"
	        # Mistral-style and unknown templates have no explicit assistant header,
	        # so nothing is appended for them.
	        return s

	def tokens_from_messages(tokenizer, messages, device, add_generation_prompt=False):
	    """Render ``messages`` with the chat template and return token ids on ``device``.

	    The rendered text is tokenized with ``add_special_tokens=False`` and
	    ``return_tensors="pt"``.
	    """
	    rendered = apply_chat_template(tokenizer, messages, add_generation_prompt)
	    encoded = tokenizer(rendered, add_special_tokens=False, return_tensors="pt")
	    # Appending the custom Mistral assistant-start token here is intentionally
	    # disabled; the ids are returned exactly as tokenized.
	    return encoded.input_ids.to(device)


	def _build_llopa_inputs(tokenizer,
	system: str,
	document: str,
	question: str,
	*,
	task: str = "qa_doc",
	device: str):
	"""Tokenize one system+user prompt and slice it into the segments returned below.

	Returns a dict with:
	- ``prompt_ids``: system+user messages rendered with the generation prompt.
	- ``system_ids``: the system message rendered on its own.
	- ``system_user_ids``: system+user tokens without the trailing assistant header.
	- ``user_ids_full``: ``system_user_ids`` past the length of ``system_ids``.
	- ``user_doc_ids`` / ``user_q_ids``: ``user_ids_full`` split at ``doc_end``.
	- ``hdr_tail``: the assistant header tokens at the end of ``prompt_ids``.

	Raises ``ValueError`` when the system-only tokenization is longer than the
	system+user tokenization.

	NOTE(review): ``_assistant_header_ids`` and ``lcp_len`` are defined outside this
	excerpt; presumably they return the assistant header token ids and a
	longest-common-prefix length - confirm.
	"""
	doc_text, query_text = build_prompt_parts(task, question, document)
	msgs = build_messages(system, document, question, include_query=True, task=task)
	ids_sys = tokens_from_messages(
	tokenizer, [{"role": "system", "content": system}], device, add_generation_prompt=False
	)
	ids_hdr = tokens_from_messages(tokenizer, msgs, device, add_generation_prompt=True)
	hdr_tail = _assistant_header_ids(tokenizer, device)
	ids_sys_user = None
	# Preferred path: if the prompt ends with the known header ids, cut them off.
	if hdr_tail is not None and hdr_tail.numel() > 0:
	tail_len = hdr_tail.size(1)
	if ids_hdr.size(1) >= tail_len and torch.equal(ids_hdr[:, -tail_len:], hdr_tail):
	ids_sys_user = ids_hdr[:, :-tail_len]
	# Fallback: re-render without the generation prompt and treat the remainder of
	# ids_hdr as the header tail.
	if ids_sys_user is None:
	ids_sys_user = tokens_from_messages(tokenizer, msgs, device, add_generation_prompt=False)
	hdr_tail = ids_hdr[:, ids_sys_user.size(1):]
	if ids_sys_user.size(1) < ids_sys.size(1):
	raise ValueError("System-only tokens longer than system+user tokens.")
	user_ids_full = ids_sys_user[:, ids_sys.size(1):]

	user_content_full = msgs[-1]["content"]
	# Text whose token count marks where the document part of the user turn ends.
	if query_text:
	doc_prefix_text = f"{doc_text}\n\n" if doc_text else ""
	else:
	doc_prefix_text = user_content_full
	# Zero-width slices keep dtype/device while representing "no tokens".
	user_doc_ids = user_ids_full[:, 0:0]
	user_q_ids = user_ids_full[:, 0:0]
	if user_ids_full.size(1) > 0:
	# An empty user message is rendered to measure the user-turn header length.
	msgs_empty = [{"role": "system", "content": system}, {"role": "user", "content": ""}]
	ids_empty = tokens_from_messages(tokenizer, msgs_empty, device, add_generation_prompt=False)
	header_prefix_len = lcp_len(ids_empty, ids_sys_user)
	user_header_len = max(0, header_prefix_len - ids_sys.size(1))
	doc_prefix_len = 0
	if doc_prefix_text:
	# NOTE(review): the prefix is tokenized on its own, so this length assumes token
	# boundaries match the in-context tokenization - confirm for the tokenizer in use.
	doc_prefix_len = len(tokenizer(doc_prefix_text, add_special_tokens=False).input_ids)
	doc_end = min(user_ids_full.size(1), user_header_len + doc_prefix_len)
	user_doc_ids = user_ids_full[:, :doc_end]
	user_q_ids = user_ids_full[:, doc_end:]

	return {
	"prompt_ids": ids_hdr,
	"system_ids": ids_sys,
	"system_user_ids": ids_sys_user,
	"user_ids_full": user_ids_full,
	"user_doc_ids": user_doc_ids,
	"user_q_ids": user_q_ids,
	"hdr_tail": hdr_tail,
	}


	def build_messages_for_llopa(tokenizer,
	                             system: str,
	                             document: str,
	                             question: str,
	                             *,
	                             task: str = "qa_doc",
	                             device: str):
	    """Public alias of ``_build_llopa_inputs``: forwards every argument unchanged."""
	    forwarded = {
	        "system": system,
	        "document": document,
	        "question": question,
	        "task": task,
	        "device": device,
	    }
	    return _build_llopa_inputs(tokenizer, **forwarded)


	def _normalize_prompt_messages(messages):
	"""Return a cleaned list of ``{"role", "content"}`` dicts built from ``messages``.

	Non-list input yields an empty list and non-dict entries are skipped. Roles are
	lower-cased and anything other than system/user/assistant becomes ``"user"``.
	Content is coerced to ``str``; it is stripped for non-assistant roles, while
	assistant content keeps its whitespace.

	NOTE(review): indentation was lost in this copy, so it is unclear whether the
	empty-content ``continue`` applies to every role or only to non-assistant
	roles - confirm against the original file.
	"""
	out = []
	if not isinstance(messages, list):
	return out
	for msg in messages:
	if not isinstance(msg, dict):
	continue
	# A missing or empty role defaults to "user".
	role = str(msg.get("role") or "user").strip().lower()
	if role not in {"system", "user", "assistant"}:
	role = "user"
	# None (or any falsy) content becomes the empty string.
	content = str(msg.get("content") or "")
	if role != "assistant":
	content = content.strip()
	if not content:
	continue
	out.append({"role": role, "content": content})
	return out


	def _assistant_header_starts_from_messages(
	    tokenizer,
	    prompt_messages,
	    *,
	    prompt_add_generation_prompt: bool,
	    device,
	):
	    """Return token offsets at which each assistant turn begins.

	    For every assistant message the offset is the token length of all messages
	    before it. When a generation prompt is requested and the last message is
	    not an assistant turn, the length of the whole conversation is appended.

	    Returns ``(starts, mask)`` as ``(1, k)`` long and bool tensors, or
	    ``(None, None)`` when there are no messages or no offsets.
	    """
	    normalized = _normalize_prompt_messages(prompt_messages)
	    if not normalized:
	        return None, None

	    def _prefix_len(upto: int) -> int:
	        rendered = tokens_from_messages(tokenizer, normalized[:upto], device, add_generation_prompt=False)
	        return int(rendered.size(1))

	    offsets: list[int] = [
	        _prefix_len(position)
	        for position, turn in enumerate(normalized)
	        if turn["role"] == "assistant"
	    ]
	    if bool(prompt_add_generation_prompt) and normalized[-1]["role"] != "assistant":
	        offsets.append(_prefix_len(len(normalized)))

	    if not offsets:
	        return None, None

	    offsets_tensor = torch.tensor([offsets], device=device, dtype=torch.long)
	    all_valid = torch.ones((1, len(offsets)), device=device, dtype=torch.bool)
	    return offsets_tensor, all_valid


	def _assistant_turn_boundaries_from_messages(
	    tokenizer,
	    prompt_messages,
	    *,
	    prompt_add_generation_prompt: bool,
	    device,
	):
	    """Return start/end token offsets for each assistant turn.

	    A turn starts at the token length of the messages before it and ends at
	    the token length of the messages through it. When a generation prompt is
	    requested and the conversation does not end with an assistant turn, one
	    more span is added: from the full conversation length to the length with
	    the generation prompt appended.

	    Returns ``(starts, ends, mask)`` as ``(1, k)`` tensors, or three ``None``
	    values when there are no messages or no spans.
	    """
	    normalized = _normalize_prompt_messages(prompt_messages)
	    if not normalized:
	        return None, None, None

	    def _length(msgs, with_generation_prompt: bool) -> int:
	        rendered = tokens_from_messages(
	            tokenizer, msgs, device, add_generation_prompt=with_generation_prompt
	        )
	        return int(rendered.size(1))

	    spans = []
	    for position, turn in enumerate(normalized):
	        if turn["role"] == "assistant":
	            begin = _length(normalized[:position], False)
	            finish = _length(normalized[: position + 1], False)
	            spans.append((begin, finish))

	    if bool(prompt_add_generation_prompt) and normalized[-1]["role"] != "assistant":
	        begin = _length(normalized, False)
	        finish = _length(normalized, True)
	        spans.append((begin, finish))

	    if not spans:
	        return None, None, None

	    begins = [begin for begin, _ in spans]
	    finishes = [finish for _, finish in spans]
	    begins_tensor = torch.tensor([begins], device=device, dtype=torch.long)
	    finishes_tensor = torch.tensor([finishes], device=device, dtype=torch.long)
	    all_valid = torch.ones((1, len(begins)), device=device, dtype=torch.bool)
	    return begins_tensor, finishes_tensor, all_valid


	def _assistant_content_delta_from_messages(tokenizer, prefix_messages, assistant_text: str, su_gen, device):
	    """Return the tokens an assistant reply adds beyond ``su_gen``.

	    The prefix messages plus an assistant message holding ``assistant_text``
	    are tokenized, and everything past ``su_gen.size(1)`` columns is returned.
	    An empty slice is returned when the result is not longer than ``su_gen``.
	    """
	    conversation = [*prefix_messages, {"role": "assistant", "content": assistant_text}]
	    rendered_ids = tokens_from_messages(tokenizer, conversation, device, add_generation_prompt=False)
	    consumed = su_gen.size(1)
	    if rendered_ids.size(1) > consumed:
	        return rendered_ids[:, consumed:]
	    return rendered_ids[:, :0]


	def _strip_trailing_assistant_stop_tokens(tokenizer, token_ids: torch.Tensor) -> torch.Tensor:
	if not isinstance(token_ids, torch.Tensor) or token_ids.numel() == 0:
	return token_ids
	stop_ids = set()
	eos_id = getattr(tokenizer, "eos_token_id", None)
	if eos_id is not None:
	stop_ids.add(int(eos_id))
	with contextlib.suppress(Exception):
	eot_id = tokenizer.convert_tokens_to_ids("<\|eot_id\|>")
	if eot_id is not None and eot_id != tokenizer.unk_token_id:
	stop_ids.add(int(eot_id))
	if not stop_ids:
	return token_ids
	trimmed = token_ids
	while trimmed.size(1) > 0 and int(trimmed[0, -1].item()) in stop_ids:
	trimmed = trimmed[:, :-1]
	return trimmed


	def _resolve_user_replay_layout_from_messages(
	tokenizer,
	prefix_messages,
	*,
	system_len: int,
	user_len: int,
	device,
	):
	system_len = max(int(system_len), 0)
	user_len = max(int(user_len), 0)
	if user_len <= 0:
	return 0, 0, 0

	user_indices = [idx for idx, msg in enumerate(prefix_messages) if msg["role"] == "user"]
	if not user_indices:
	return 0, 0, user_len

	def _token_len(msgs) -> int:
	try:
	return int(tokens_from_messages(tokenizer, msgs, device, add_generation_prompt=False).size(1))
	except Exception:
	return 0

	first_user_idx = int(user_indices[0])
	latest_user_idx = int(user_indices[-1])

	prefix_before_first_user_len = _token_len(prefix_messages[:first_user_idx])
	prefix_before_latest_user_len = _token_len(prefix_messages[:latest_user_idx])
	prefix_through_latest_user_len = _token_len(prefix_messages[: latest_user_idx + 1])

	user_prefix_keep_len = max(prefix_before_first_user_len - system_len, 0)
	user_prefix_keep_len = min(user_prefix_keep_len, user_len)

	latest_user_start = max(prefix_before_latest_user_len - system_len, user_prefix_keep_len)
	latest_user_start = min(latest_user_start, user_len)

	latest_user_end = max(prefix_through_latest_user_len - system_len, latest_user_start)
	latest_user_end = min(latest_user_end, user_len)

	latest_user_len = max(latest_user_end - latest_user_start, 0)
	if latest_user_len <= 0 and user_len > user_prefix_keep_len:
	latest_user_start = int(user_prefix_keep_len)
	latest_user_len = int(user_len - user_prefix_keep_len)

	return int(user_prefix_keep_len), int(latest_user_start), int(latest_user_len)


	def _build_structured_prompt_segments(tokenizer, prompt_messages, *, prompt_add_generation_prompt: bool, device):
	    """Tokenize a conversation and split it into prefix/system/user/assistant-prefill ids.

	    With ``prompt_add_generation_prompt=True`` the messages must not end with
	    an assistant turn; the assistant prefill is whatever the generation prompt
	    adds. With ``False`` the messages must end with an assistant prefix that
	    follows at least one other message; the prefill is the assistant header
	    plus the prefix content with trailing stop tokens removed.

	    Returns a dict of token tensors, replay-layout integers and assistant turn
	    boundary tensors. Raises ``ValueError`` for empty input, for a message
	    layout that does not match the flag, or for an empty assistant prefill.
	    """
	    normalized = _normalize_prompt_messages(prompt_messages)
	    if not normalized:
	        raise ValueError("prompt_messages must contain at least one non-empty message.")

	    wants_generation_prompt = bool(prompt_add_generation_prompt)
	    ends_with_assistant = normalized[-1]["role"] == "assistant"

	    def _encode(msgs, with_generation_prompt: bool):
	        return tokens_from_messages(tokenizer, msgs, device, add_generation_prompt=with_generation_prompt)

	    if wants_generation_prompt:
	        if ends_with_assistant:
	            raise ValueError("prompt_add_generation_prompt=True requires prompt_messages to end with a non-assistant role.")
	        prefix_messages = normalized
	        prefix_ids = _encode(prefix_messages, False)
	        prompt_ids = _encode(prefix_messages, True)
	        assistant_prefill_ids = prompt_ids[:, prefix_ids.size(1):]
	    else:
	        if not ends_with_assistant:
	            raise ValueError("prompt_add_generation_prompt=False requires prompt_messages to end with an assistant prefix.")
	        if len(normalized) < 2:
	            raise ValueError("assistant-prefix prompts require at least one preceding non-assistant message.")
	        prefix_messages = normalized[:-1]
	        assistant_text = str(normalized[-1]["content"] or "")
	        prefix_ids = _encode(prefix_messages, False)
	        header_prompt_ids = _encode(prefix_messages, True)
	        content_ids = _assistant_content_delta_from_messages(
	            tokenizer,
	            prefix_messages,
	            assistant_text,
	            header_prompt_ids,
	            device,
	        )
	        content_ids = _strip_trailing_assistant_stop_tokens(tokenizer, content_ids)
	        header_ids = header_prompt_ids[:, prefix_ids.size(1):]
	        assistant_prefill_ids = torch.cat([header_ids, content_ids], dim=1)
	        prompt_ids = torch.cat([prefix_ids, assistant_prefill_ids], dim=1)

	    if assistant_prefill_ids.size(1) <= 0:
	        raise ValueError("Structured direct LLoPA prompt produced an empty assistant prefill segment.")

	    # Everything after a leading system message counts as the user segment.
	    has_system_head = bool(prefix_messages) and prefix_messages[0]["role"] == "system"
	    if has_system_head:
	        system_ids = _encode([prefix_messages[0]], False)
	        user_ids = prefix_ids[:, system_ids.size(1):]
	    else:
	        system_ids = prefix_ids[:, :0]
	        user_ids = prefix_ids

	    keep_len, replay_start, replay_len = _resolve_user_replay_layout_from_messages(
	        tokenizer,
	        prefix_messages,
	        system_len=int(system_ids.size(1)),
	        user_len=int(user_ids.size(1)),
	        device=device,
	    )

	    header_starts, turn_ends, header_mask = _assistant_turn_boundaries_from_messages(
	        tokenizer,
	        normalized,
	        prompt_add_generation_prompt=wants_generation_prompt,
	        device=device,
	    )

	    segments = {
	        "prefix_ids": prefix_ids,
	        "prompt_ids": prompt_ids,
	        "system_ids": system_ids,
	        "user_ids": user_ids,
	        "assistant_prefill_ids": assistant_prefill_ids,
	        "replay_user_prefix_keep_len": int(keep_len),
	        "replay_user_start": int(replay_start),
	        "replay_user_len": int(replay_len),
	        "assistant_header_starts": header_starts,
	        "assistant_turn_ends": turn_ends,
	        "assistant_header_start_mask": header_mask,
	    }
	    return segments


	def _build_unified_prefill_lower_prompt_bundle(
	tokenizer,
	*,
	prompt_messages,
	prompt_add_generation_prompt: bool,
	structured_prompt_segments=None,
	device,
	):
	"""Assemble the prompt bundle dict for a structured prompt.

	``structured_prompt_segments`` is used when it is a dict; otherwise the segments
	are built with ``_build_structured_prompt_segments``. The returned dict holds the
	segments, ``prompt_ids``, an all-ones ``attention_mask``, the resolved
	``assistant_header_start`` (as an int and as a one-element long tensor under
	``prefill_lower_split_start``), the assistant header/turn tensors (with
	single-entry fallbacks when they are missing or empty) and one-element long
	tensors for the system length and the replay-user layout.

	NOTE(review): ``_assistant_header_ids`` and ``_find_last_subsequence_start`` are
	defined outside this excerpt - confirm their return contracts.
	"""
	segments = structured_prompt_segments if isinstance(structured_prompt_segments, dict) else None
	if segments is None:
	segments = _build_structured_prompt_segments(
	tokenizer,
	prompt_messages,
	prompt_add_generation_prompt=bool(prompt_add_generation_prompt),
	device=device,
	)

	prompt_ids = segments["prompt_ids"]
	system_ids = segments["system_ids"]
	prefix_ids = segments.get("prefix_ids")
	header_starts = segments.get("assistant_header_starts")
	assistant_turn_ends = segments.get("assistant_turn_ends")
	header_start_mask = segments.get("assistant_header_start_mask")
	# ``or 0`` turns a stored None into 0 before the int() conversion.
	replay_user_prefix_keep_len = int(segments.get("replay_user_prefix_keep_len", 0) or 0)
	replay_user_start = int(segments.get("replay_user_start", 0) or 0)
	replay_user_len = int(segments.get("replay_user_len", 0) or 0)

	# Resolve where the final assistant header begins, trying several sources.
	assistant_header_start: Optional[int] = None
	if (
	isinstance(header_starts, torch.Tensor)
	and header_starts.ndim == 2
	and header_starts.size(0) == 1
	and header_starts.numel() > 0
	):
	valid_mask = header_start_mask
	# Without a usable mask, non-negative entries are treated as valid.
	if not isinstance(valid_mask, torch.Tensor) or valid_mask.shape != header_starts.shape:
	valid_mask = header_starts >= 0
	valid_starts = header_starts[0][valid_mask[0]]
	if valid_starts.numel() > 0:
	assistant_header_start = int(valid_starts[-1].item())
	# NOTE(review): indentation was lost in this copy; as written this reads as
	# replacing the value derived from header_starts whenever prefix_ids is a
	# single-row 2-D tensor - confirm the intended nesting.
	if isinstance(prefix_ids, torch.Tensor) and prefix_ids.ndim == 2 and prefix_ids.size(0) == 1:
	assistant_header_start = int(prefix_ids.size(1))
	if assistant_header_start is None:
	header_ids = _assistant_header_ids(tokenizer, device=device)
	if isinstance(header_ids, torch.Tensor) and header_ids.ndim == 2 and header_ids.size(0) == 1:
	assistant_header_start = _find_last_subsequence_start(prompt_ids, header_ids)
	# Last resort: split just before the final prompt token.
	if assistant_header_start is None:
	assistant_header_start = max(int(prompt_ids.size(1) - 1), 0)

	return {
	"segments": segments,
	"prompt_ids": prompt_ids,
	"attention_mask": torch.ones_like(prompt_ids, device=prompt_ids.device),
	"assistant_header_start": int(assistant_header_start),
	"prefill_lower_split_start": torch.tensor(
	[int(assistant_header_start)],
	device=prompt_ids.device,
	dtype=torch.long,
	),
	# Each tensor below falls back to a single-entry default when the segment
	# value is missing or empty.
	"assistant_header_starts": (
	header_starts.to(device=prompt_ids.device, dtype=torch.long)
	if isinstance(header_starts, torch.Tensor) and header_starts.numel() > 0
	else torch.tensor(
	[[int(assistant_header_start)]],
	device=prompt_ids.device,
	dtype=torch.long,
	)
	),
	"assistant_turn_ends": (
	assistant_turn_ends.to(device=prompt_ids.device, dtype=torch.long)
	if isinstance(assistant_turn_ends, torch.Tensor) and assistant_turn_ends.numel() > 0
	else torch.tensor(
	[[int(prompt_ids.size(1))]],
	device=prompt_ids.device,
	dtype=torch.long,
	)
	),
	"assistant_header_start_mask": (
	header_start_mask.to(device=prompt_ids.device, dtype=torch.bool)
	if isinstance(header_start_mask, torch.Tensor) and header_start_mask.numel() > 0
	else torch.ones((1, 1), device=prompt_ids.device, dtype=torch.bool)
	),
	"prefill_lower_system_len": torch.tensor(
	[int(system_ids.size(1))],
	device=prompt_ids.device,
	dtype=torch.long,
	),
	"prefill_lower_replay_user_prefix_keep_len": torch.tensor(
	[int(replay_user_prefix_keep_len)],
	device=prompt_ids.device,
	dtype=torch.long,
	),
	"prefill_lower_replay_user_start": torch.tensor(
	[int(replay_user_start)],
	device=prompt_ids.device,
	dtype=torch.long,
	),
	"prefill_lower_replay_user_len": torch.tensor(
	[int(replay_user_len)],
	device=prompt_ids.device,
	dtype=torch.long,
	),
	}


	def _prompt_bundle_has_past_assistant_history(prompt_bundle) -> bool:
	if not isinstance(prompt_bundle, dict):
	return False

	header_starts = prompt_bundle.get("assistant_header_starts")
	turn_ends = prompt_bundle.get("assistant_turn_ends")
	if not isinstance(header_starts, torch.Tensor) or not isinstance(turn_ends, torch.Tensor):
	return False
	if header_starts.numel() == 0 or turn_ends.numel() == 0:
	return False
	if header_starts.dim() == 1:
	header_starts = header_starts.view(1, -1)
	if turn_ends.dim() == 1:
	turn_ends = turn_ends.view(1, -1)
	if header_starts.dim() != 2 or turn_ends.dim() != 2:
	return False

	header_mask = prompt_bundle.get("assistant_header_start_mask")
	if isinstance(header_mask, torch.Tensor) and header_mask.numel() > 0:
	if header_mask.dim() == 1:
	header_mask = header_mask.view(1, -1)
	if header_mask.dim() != 2 or header_mask.shape != header_starts.shape:
	header_mask = header_starts >= 0
	else:
	header_mask = header_mask.to(device=header_starts.device, dtype=torch.bool)
	else:
	header_mask = header_starts >= 0

	split_starts = prompt_bundle.get("effective_prefill_lower_split_start")
	if not isinstance(split_starts, torch.Tensor) or split_starts.numel() == 0:
	split_starts = prompt_bundle.get("prefill_lower_split_start")
	if isinstance(split_starts, torch.Tensor) and split_starts.numel() > 0:
	split_starts = split_starts.flatten().to(device=header_starts.device, dtype=torch.long)
	else:
	split_starts = torch.tensor(
	[int(prompt_bundle.get("assistant_header_start", 0) or 0)],
	device=header_starts.device,
	dtype=torch.long,
	)

	rows = min(int(header_starts.size(0)), int(turn_ends.size(0)))
	cols = min(int(header_starts.size(1)), int(turn_ends.size(1)))
	if rows <= 0 or cols <= 0:
	return False
	for row in range(rows):
	split_idx = min(row, int(split_starts.numel()) - 1)
	split_start = int(split_starts[split_idx].item())
	for col in range(cols):
	if not bool(header_mask[row, col].item()):
	continue
	turn_start = int(header_starts[row, col].item())
	turn_end = int(turn_ends[row, col].item())
	if turn_end <= turn_start or turn_start >= split_start:
	continue
	if min(turn_end, split_start) > turn_start:
	return True
	return False


	def _direct_prefill_lower_cache_and_logits(
	model,
	*,
	prompt_bundle,
	lower_k: int,
	prefill_attn: str,
	system_prefill: str,
	no_upper_attn: bool,
	see_past_assistant: bool = False,
	replay_module: str = "none",
	replay_per_layers: int = -1,
	last_layer_module: Optional[str] = None,
	seed_mode: str = "auto",
	):
	"""Produce a prefill seed for a structured prompt bundle, trying several strategies.

	Order of attempts as written:
	1. When ``see_past_assistant`` is false, the matched in-band seed helper.
	2. The model's full-prompt seed function, when ``seed_mode`` is ``"auto"`` or
	   past assistant history must be visible.
	3. ``model.llopa_reference_prefill_seed`` built from the bundle's segments.

	Returns whatever the first successful strategy returns, or ``None`` when the
	bundle has no ``segments`` dict, no reference seed function exists, or the
	reference seed call raises.

	Raises ``RuntimeError`` when past assistant turns must be included but the
	full-prompt seed is unavailable, raised, or returned ``None``.

	Side effect: writes ``effective_prompt_ids``, ``effective_prompt_attention_mask``
	and ``effective_prefill_lower_split_start`` into ``prompt_bundle``.

	NOTE(review): the ``_normalize_*`` helpers, ``_matched_inband_prefill_cache_and_logits``
	and ``_get_llopa_full_prompt_seed`` are defined outside this excerpt; the
	structure and type of the returned seed cannot be determined from here.
	"""
	# ``last_layer_module`` only takes effect when ``replay_module`` normalizes to "none".
	if last_layer_module is not None and _normalize_replay_module_value(replay_module) == "none":
	replay_module = last_layer_module
	replay_module = _normalize_replay_module_value(replay_module)
	replay_per_layers = _normalize_replay_per_layers_value(replay_per_layers)
	seed_mode = _normalize_structured_llopa_seed_mode(seed_mode)
	if isinstance(prompt_bundle, dict):
	prompt_ids = prompt_bundle.get("prompt_ids")
	if isinstance(prompt_ids, torch.Tensor):
	attention_mask = prompt_bundle.get("attention_mask")
	# A missing or mis-shaped mask is replaced by all ones.
	if not isinstance(attention_mask, torch.Tensor) or attention_mask.shape != prompt_ids.shape:
	attention_mask = torch.ones_like(prompt_ids, device=prompt_ids.device, dtype=torch.long)
	else:
	attention_mask = attention_mask.to(device=prompt_ids.device, dtype=torch.long)

	effective_prompt_ids = prompt_ids
	effective_prompt_attention_mask = attention_mask
	split_starts = prompt_bundle.get("prefill_lower_split_start")
	if isinstance(split_starts, torch.Tensor):
	split_starts = split_starts.to(device=prompt_ids.device, dtype=torch.long)
	else:
	split_starts = torch.tensor(
	[int(prompt_bundle.get("assistant_header_start", max(int(prompt_ids.size(1) - 1), 0)))],
	device=prompt_ids.device,
	dtype=torch.long,
	)

	# For a single-row mask the valid length is the number of ones.
	# NOTE(review): truncating to the first ``valid_len`` columns presumes any padding
	# sits on the right - confirm.
	valid_len = (
	int(effective_prompt_attention_mask[0].sum().item())
	if effective_prompt_attention_mask.ndim == 2 and effective_prompt_attention_mask.size(0) == 1
	else int(effective_prompt_ids.size(1))
	)
	if valid_len > 0:
	effective_prompt_ids = effective_prompt_ids[:, :valid_len]
	effective_prompt_attention_mask = effective_prompt_attention_mask[:, :valid_len]

	# The effective values are stored on the caller's bundle for the steps below.
	prompt_bundle["effective_prompt_ids"] = effective_prompt_ids
	prompt_bundle["effective_prompt_attention_mask"] = effective_prompt_attention_mask
	prompt_bundle["effective_prefill_lower_split_start"] = split_starts

	segments = prompt_bundle.get("segments") if isinstance(prompt_bundle, dict) else None
	if not isinstance(segments, dict):
	return None

	if not bool(see_past_assistant):
	# NOTE(review): replay_module / replay_per_layers are not forwarded to the
	# matched in-band helper here - confirm this is intentional.
	matched_inband_seed = _matched_inband_prefill_cache_and_logits(
	model,
	prompt_bundle=prompt_bundle,
	lower_k=lower_k,
	prefill_attn=prefill_attn,
	system_prefill=system_prefill,
	no_upper_attn=no_upper_attn,
	)
	if matched_inband_seed is not None:
	return matched_inband_seed

	llopa_full_prompt_seed_fn = _get_llopa_full_prompt_seed(model)
	effective_prompt_ids = prompt_bundle.get("effective_prompt_ids")
	effective_prompt_attention_mask = prompt_bundle.get("effective_prompt_attention_mask")
	split_starts = prompt_bundle.get("effective_prefill_lower_split_start")
	system_lens = prompt_bundle.get("prefill_lower_system_len")
	replay_user_prefix_keep_lens = prompt_bundle.get("prefill_lower_replay_user_prefix_keep_len")
	replay_user_starts = prompt_bundle.get("prefill_lower_replay_user_start")
	replay_user_lens = prompt_bundle.get("prefill_lower_replay_user_len")
	assistant_header_starts = prompt_bundle.get("assistant_header_starts")
	assistant_turn_ends = prompt_bundle.get("assistant_turn_ends")
	assistant_header_start_mask = prompt_bundle.get("assistant_header_start_mask")
	# Past-assistant seeding is required only if requested AND the bundle has such turns.
	needs_past_assistant_seed = bool(see_past_assistant) and _prompt_bundle_has_past_assistant_history(prompt_bundle)
	should_try_full_prompt_seed = seed_mode == "auto" or bool(needs_past_assistant_seed)
	# Kept so a later RuntimeError can chain the original failure.
	full_prompt_seed_error = None
	if should_try_full_prompt_seed and callable(llopa_full_prompt_seed_fn) and isinstance(effective_prompt_ids, torch.Tensor):
	if not isinstance(effective_prompt_attention_mask, torch.Tensor):
	effective_prompt_attention_mask = torch.ones_like(
	effective_prompt_ids,
	device=effective_prompt_ids.device,
	dtype=torch.long,
	)
	if not isinstance(split_starts, torch.Tensor):
	split_starts = prompt_bundle.get("prefill_lower_split_start")
	if not isinstance(system_lens, torch.Tensor):
	system_lens = torch.zeros(
	(effective_prompt_ids.size(0),),
	device=effective_prompt_ids.device,
	dtype=torch.long,
	)
	try:
	full_prompt_seed = llopa_full_prompt_seed_fn(
	input_ids=effective_prompt_ids,
	attention_mask=effective_prompt_attention_mask,
	use_cache=True,
	logits_to_keep=1,
	lower_k=int(lower_k),
	prefill_attn=str(prefill_attn),
	system_prefill=str(system_prefill),
	no_upper_attn=bool(no_upper_attn),
	prefill_lower_split_start=split_starts,
	prefill_lower_system_len=system_lens,
	prefill_lower_replay_user_prefix_keep_len=replay_user_prefix_keep_lens,
	prefill_lower_replay_user_start=replay_user_starts,
	prefill_lower_replay_user_len=replay_user_lens,
	assistant_header_starts=assistant_header_starts,
	assistant_turn_ends=assistant_turn_ends,
	assistant_header_start_mask=assistant_header_start_mask,
	prefill_lower_see_past_assistant=bool(see_past_assistant),
	replay_module=str(replay_module),
	replay_per_layers=int(replay_per_layers),
	)
	if full_prompt_seed is not None:
	return full_prompt_seed
	except Exception as exc:
	full_prompt_seed_error = exc

	# When past assistant turns must be visible there is no fallback: fail loudly.
	if bool(needs_past_assistant_seed):
	if not callable(llopa_full_prompt_seed_fn):
	raise RuntimeError(
	"LLOPA_SEE_PAST_ASSISTANT=1 requires llopa_full_prompt_prefill_seed "
	"for prefill_header prompts with previous assistant turns."
	)
	if full_prompt_seed_error is not None:
	raise RuntimeError(
	"LLOPA_SEE_PAST_ASSISTANT=1 failed in llopa_full_prompt_prefill_seed "
	"for a prefill_header prompt with previous assistant turns."
	) from full_prompt_seed_error
	raise RuntimeError(
	"LLOPA_SEE_PAST_ASSISTANT=1 could not build a prefill_header seed "
	"that includes previous assistant turns."
	)

	# Final fallback: the segment-based reference seed on the model, if it exists.
	seed_fn = getattr(model, "llopa_reference_prefill_seed", None)
	if not callable(seed_fn):
	return None
	try:
	return seed_fn(
	system_ids=segments["system_ids"],
	user_ids=segments["user_ids"],
	assistant_ids=segments["assistant_prefill_ids"],
	lower_k=int(lower_k),
	prefill_attn=str(prefill_attn),
	system_prefill=str(system_prefill),
	no_upper_attn=bool(no_upper_attn),
	replay_module=str(replay_module),
	replay_per_layers=int(replay_per_layers),
	replay_user_prefix_keep_len=int(segments.get("replay_user_prefix_keep_len", 0) or 0),
	replay_user_start=int(segments.get("replay_user_start", 0) or 0),
	replay_user_len=int(segments.get("replay_user_len", 0) or 0),
	)
	# Any failure in the reference seed is reported to the caller as "no seed".
	except Exception:
	return None


	def _optimized_prefill_lower_cache_and_logits(
	model,
	*,
	prompt_bundle,
	lower_k: int,
	prefill_attn: str,
	system_prefill: str,
	no_upper_attn: bool,
	see_past_assistant: bool = False,
	replay_module: str = "none",
	replay_per_layers: int = -1,
	last_layer_module: Optional[str] = None,
	seed_mode: str = "auto",
	):
	"""Build the prefill seed for the optimized LLoPA path.

	Seed strategies are attempted in a fixed order, gated by the normalized
	``seed_mode``:

	1. ``matched`` / ``auto``: ``_matched_inband_prefill_cache_and_logits``.
	2. ``tri`` / ``auto``: the model's full-prompt seed hook (as returned by
	   ``_get_llopa_full_prompt_seed``), when it is callable and
	   ``prompt_bundle`` is a dict. Exceptions raised by the hook are swallowed.
	3. ``stable`` / ``auto`` / ``matched``: ``_direct_prefill_lower_cache_and_logits``.

	``last_layer_module`` is used as ``replay_module`` only when the given
	``replay_module`` normalizes to ``"none"``.

	Returns:
	    The first non-None seed produced by strategy 1 or 2, otherwise whatever
	    strategy 3 returns, otherwise ``None`` (reached when ``seed_mode`` is
	    ``"tri"`` and the hook produced nothing).
	"""
	# ``last_layer_module`` only takes effect when no replay module was requested.
	if last_layer_module is not None and _normalize_replay_module_value(replay_module) == "none":
	replay_module = last_layer_module
	replay_module = _normalize_replay_module_value(replay_module)
	replay_per_layers = _normalize_replay_per_layers_value(replay_per_layers)
	seed_mode = _normalize_optimized_llopa_seed_mode(seed_mode)
	# Strategy 1: matched in-band seed.
	if seed_mode in {"matched", "auto"}:
	matched_seed = _matched_inband_prefill_cache_and_logits(
	model,
	prompt_bundle=prompt_bundle,
	lower_k=lower_k,
	prefill_attn=prefill_attn,
	system_prefill=system_prefill,
	no_upper_attn=no_upper_attn,
	replay_module=replay_module,
	replay_per_layers=replay_per_layers,
	)
	if matched_seed is not None:
	return matched_seed

	# Strategy 2: full-prompt seed hook exposed by the model.
	llopa_full_prompt_seed_fn = _get_llopa_full_prompt_seed(model)
	if seed_mode in {"tri", "auto"} and callable(llopa_full_prompt_seed_fn) and isinstance(prompt_bundle, dict):
	effective_prompt_ids = prompt_bundle.get("effective_prompt_ids")
	effective_prompt_attention_mask = prompt_bundle.get("effective_prompt_attention_mask")
	split_starts = prompt_bundle.get("effective_prefill_lower_split_start")
	system_lens = prompt_bundle.get("prefill_lower_system_len")
	replay_user_prefix_keep_lens = prompt_bundle.get("prefill_lower_replay_user_prefix_keep_len")
	replay_user_starts = prompt_bundle.get("prefill_lower_replay_user_start")
	replay_user_lens = prompt_bundle.get("prefill_lower_replay_user_len")
	assistant_header_starts = prompt_bundle.get("assistant_header_starts")
	assistant_header_start_mask = prompt_bundle.get("assistant_header_start_mask")
	if isinstance(effective_prompt_ids, torch.Tensor):
	# Any bundle entry that is not a tensor is replaced by a default below.
	if not isinstance(effective_prompt_attention_mask, torch.Tensor):
	effective_prompt_attention_mask = torch.ones_like(
	effective_prompt_ids,
	device=effective_prompt_ids.device,
	dtype=torch.long,
	)
	# Split start: "effective" key, then the plain key, then the scalar
	# "assistant_header_start" (defaulting to the last prompt index).
	if not isinstance(split_starts, torch.Tensor):
	split_starts = prompt_bundle.get("prefill_lower_split_start")
	if not isinstance(split_starts, torch.Tensor):
	split_starts = torch.tensor(
	[int(prompt_bundle.get("assistant_header_start", max(int(effective_prompt_ids.size(1) - 1), 0)))],
	device=effective_prompt_ids.device,
	dtype=torch.long,
	)
	if not isinstance(system_lens, torch.Tensor):
	system_lens = torch.zeros(
	(effective_prompt_ids.size(0),),
	device=effective_prompt_ids.device,
	dtype=torch.long,
	)
	if not isinstance(replay_user_prefix_keep_lens, torch.Tensor):
	replay_user_prefix_keep_lens = torch.zeros(
	(effective_prompt_ids.size(0),),
	device=effective_prompt_ids.device,
	dtype=torch.long,
	)
	# Missing replay starts default to a copy of the prefix-keep lengths.
	if not isinstance(replay_user_starts, torch.Tensor):
	replay_user_starts = replay_user_prefix_keep_lens.clone()
	if not isinstance(replay_user_lens, torch.Tensor):
	replay_user_lens = torch.zeros(
	(effective_prompt_ids.size(0),),
	device=effective_prompt_ids.device,
	dtype=torch.long,
	)

	# Best effort: any exception from the hook is ignored so the remaining
	# strategy can still run.
	try:
	full_prompt_seed = llopa_full_prompt_seed_fn(
	input_ids=effective_prompt_ids,
	attention_mask=effective_prompt_attention_mask,
	use_cache=True,
	logits_to_keep=1,
	lower_k=int(lower_k),
	prefill_attn=str(prefill_attn),
	system_prefill=str(system_prefill),
	no_upper_attn=bool(no_upper_attn),
	replay_module=str(replay_module),
	replay_per_layers=int(replay_per_layers),
	prefill_lower_split_start=split_starts,
	prefill_lower_system_len=system_lens,
	prefill_lower_replay_user_prefix_keep_len=replay_user_prefix_keep_lens,
	prefill_lower_replay_user_start=replay_user_starts,
	prefill_lower_replay_user_len=replay_user_lens,
	assistant_header_starts=assistant_header_starts,
	assistant_turn_ends=prompt_bundle.get("assistant_turn_ends"),
	assistant_header_start_mask=assistant_header_start_mask,
	prefill_lower_see_past_assistant=bool(see_past_assistant),
	)
	if full_prompt_seed is not None:
	return full_prompt_seed
	except Exception:
	pass

	# Strategy 3: direct prefill seed.
	if seed_mode in {"stable", "auto", "matched"}:
	return _direct_prefill_lower_cache_and_logits(
	model,
	prompt_bundle=prompt_bundle,
	lower_k=lower_k,
	prefill_attn=prefill_attn,
	system_prefill=system_prefill,
	no_upper_attn=no_upper_attn,
	see_past_assistant=bool(see_past_assistant),
	replay_module=replay_module,
	replay_per_layers=replay_per_layers,
	)

	return None


	def _normalize_structured_llopa_seed_mode(seed_mode: Optional[str]) -> str:
	normalized = str(seed_mode or "auto").strip().lower()
	aliases = {
	"": "auto",
	"default": "auto",
	"tri": "auto",
	"tri_auto": "auto",
	"reference": "prefill_header",
	"reference_only": "prefill_header",
	"prefill-header": "prefill_header",
	"prefill_header_seed": "prefill_header",
	"prefill_header_only": "prefill_header",
	}
	normalized = aliases.get(normalized, normalized)
	if normalized not in {"auto", "prefill_header"}:
	normalized = "auto"
	return normalized


	def _llopa_modeling_module(model) -> Optional[Any]:
	    """Return the Python module that defines the LLoPA core's class.

	    Yields None when the model has no LLoPA core or when the module lookup
	    raises.
	    """
	    core = _get_llopa_core(model)
	    if core is None:
	        return None
	    try:
	        return inspect.getmodule(core.__class__)
	    except Exception:
	        return None


	def _matched_inband_prefill_cache_and_logits(
	model,
	*,
	prompt_bundle,
	lower_k: int,
	prefill_attn: str,
	system_prefill: str,
	no_upper_attn: bool,
	):
	if bool(no_upper_attn):
	return None

	llopa_core = _get_llopa_core(model)
	output_head = _get_output_head(model)
	llopa_mod = _llopa_modeling_module(model)
	if llopa_core is None or output_head is None or llopa_mod is None:
	return None

	fusion_mode = str(
	getattr(getattr(llopa_core, "config", None), "capsule_fusion_mode", "upper_only") or "upper_only"
	).strip().lower()
	if fusion_mode != "inband":
	return None

	attn = (prefill_attn or "causal").strip().lower()
	if attn == "prefix_full":
	attn = "full"
	if attn != "causal":
	return None

	required_names = (
	"_safe_dynamic_cache",
	"_tri_arange",
	"_llopa_position_ids_from_mask",
	"_resolve_attn_impl",
	"_can_use_implicit_causal_mask",
	"_llopa_mask_is_all_ones",
	"_build_tri_mask_local",
	"_tri_insert_suffix_specials_inband",
	"_tri_effective_suffix_special_token_ids",
	"_tri_build_prefill_lower_upper_index_batch",
	"_tri_pack_indexed_tensor",
	"create_causal_mask",
	)
	if any(not hasattr(llopa_mod, name) for name in required_names):
	return None

	prompt_ids = prompt_bundle["prompt_ids"]
	attention_mask = prompt_bundle["attention_mask"].to(device=prompt_ids.device, dtype=torch.long)
	system_len = int(prompt_bundle["prefill_lower_system_len"][0].item()) if prompt_bundle["prefill_lower_system_len"].numel() > 0 else 0
	full_ids = prompt_ids
	full_attention_mask = attention_mask
	split_starts = prompt_bundle.get("prefill_lower_split_start")
	if isinstance(split_starts, torch.Tensor):
	split_starts = split_starts.to(device=prompt_ids.device, dtype=torch.long)
	else:
	split_starts = torch.tensor(
	[int(prompt_bundle.get("assistant_header_start", max(int(prompt_ids.size(1) - 1), 0)))],
	device=prompt_ids.device,
	dtype=torch.long,
	)

	token_ids = list(getattr(llopa_mod, "_tri_effective_suffix_special_token_ids")(model) or [])
	if token_ids:
	header_starts = prompt_bundle.get("assistant_header_starts")
	header_start_mask = prompt_bundle.get("assistant_header_start_mask")
	(
	full_ids,
	full_attention_mask,
	_,
	remapped_split_starts,
	_,
	_,
	) = getattr(llopa_mod, "_tri_insert_suffix_specials_inband")(
	token_ids=token_ids,
	input_ids=full_ids,
	attention_mask=full_attention_mask,
	labels=None,
	split_starts=split_starts,
	assistant_header_starts=header_starts,
	assistant_header_start_mask=header_start_mask,
	)
	if isinstance(remapped_split_starts, torch.Tensor) and remapped_split_starts.numel() > 0:
	split_starts = remapped_split_starts.to(device=prompt_ids.device, dtype=torch.long)

	valid_len = int(full_attention_mask[0].sum().item())
	if valid_len <= 0:
	return None
	full_ids = full_ids[:, :valid_len]
	full_attention_mask = full_attention_mask[:, :valid_len]
	split_start = max(0, min(int(split_starts[0].item()) if split_starts.numel() > 0 else int(valid_len - 1), valid_len))

	try:
	lower_k = int(lower_k)
	except Exception:
	return None
	n_layers = len(getattr(llopa_core, "layers", []))
	if lower_k <= 0 or n_layers <= 0:
	return None
	lower_k = max(0, min(lower_k, n_layers))

	device = full_ids.device
	pkv = getattr(llopa_mod, "_safe_dynamic_cache")(llopa_core.config)
	inputs_embeds = llopa_core.embed_tokens(full_ids)
	cache_position = getattr(llopa_mod, "_tri_arange")(0, inputs_embeds.shape[1], device)
	position_ids = getattr(llopa_mod, "_llopa_position_ids_from_mask")(full_attention_mask)
	attn_impl = getattr(llopa_mod, "_resolve_attn_impl")(llopa_core.config)
	if attn_impl == "flash_attention_2":
	lower_mask = None if getattr(llopa_mod, "_llopa_mask_is_all_ones")(full_attention_mask) else full_attention_mask
	elif getattr(llopa_mod, "_can_use_implicit_causal_mask")(llopa_core.config) and getattr(llopa_mod, "_llopa_mask_is_all_ones")(full_attention_mask):
	lower_mask = None
	else:
	lower_mask = getattr(llopa_mod, "create_causal_mask")(
	config=llopa_core.config,
	input_embeds=inputs_embeds,
	attention_mask=full_attention_mask,
	cache_position=cache_position,
	past_key_values=None,
	position_ids=position_ids,
	)

	hidden_states = inputs_embeds
	position_embeddings = llopa_core.rotary_emb(hidden_states, position_ids)
	for li in range(lower_k):
	layer = llopa_core.layers[li]
	hidden_states = layer(
	hidden_states,
	attention_mask=lower_mask,
	position_ids=position_ids,
	past_key_values=pkv,
	use_cache=True,
	cache_position=cache_position,
	position_embeddings=position_embeddings,
	)

	split_starts = torch.tensor([split_start], device=hidden_states.device, dtype=torch.long)
	valid_lens = torch.tensor([valid_len], device=hidden_states.device, dtype=torch.long)
	system_lens = torch.tensor([system_len], device=hidden_states.device, dtype=torch.long)
	upper_gather_idx, upper_valid_mask, upper_lens = getattr(llopa_mod, "_tri_build_prefill_lower_upper_index_batch")(
	split_starts=split_starts,
	valid_lens=valid_lens,
	system_lens=system_lens,
	system_prefill=str(system_prefill),
	device=hidden_states.device,
	)
	upper_hidden, _ = getattr(llopa_mod, "_tri_pack_indexed_tensor")(
	hidden_states,
	gather_idx=upper_gather_idx,
	valid_mask=upper_valid_mask,
	pad_value=0.0,
	)
	upper_position_ids_src = position_ids.to(device=hidden_states.device, dtype=torch.long)
	upper_position_ids, _ = getattr(llopa_mod, "_tri_pack_indexed_tensor")(
	upper_position_ids_src,
	gather_idx=upper_gather_idx,
	valid_mask=upper_valid_mask,
	pad_value=0,
	)

	upper_len = int(upper_lens[0].item()) if upper_lens.numel() > 0 else 0
	if upper_len <= 0:
	return None
	upper_hidden = upper_hidden[:, :upper_len, :]
	upper_position_ids = upper_position_ids[:, :upper_len]
	upper_cache_position = upper_position_ids[0]

	if lower_k < n_layers:
	if attn_impl == "flash_attention_2" or getattr(llopa_mod, "_can_use_implicit_causal_mask")(llopa_core.config):
	upper_mask = None
	else:
	upper_mask = getattr(llopa_mod, "_build_tri_mask_local")(
	1,
	upper_len,
	0,
	upper_hidden.device,
	upper_hidden.dtype,
	)
	upper_pos_emb = llopa_core.rotary_emb(upper_hidden, upper_position_ids)
	for li in range(lower_k, n_layers):
	layer = llopa_core.layers[li]
	upper_hidden = layer(
	upper_hidden,
	attention_mask=upper_mask,
	position_ids=upper_position_ids,
	past_key_values=pkv,
	use_cache=True,
	cache_position=upper_cache_position,
	position_embeddings=upper_pos_emb,
	)

	upper_hidden = llopa_core.norm(upper_hidden)
	initial_logits = output_head(upper_hidden[:, -1:, :])[:, -1, :].to(torch.float32)

	sys_mode = str(system_prefill or "full").strip().lower()
	if sys_mode == "full":
	visible_prefix_len = system_len
	elif sys_mode == "no_system":
	visible_prefix_len = min(system_len, 1)
	else:
	visible_prefix_len = 0
	return pkv, int(visible_prefix_len), max(int(split_start) - int(visible_prefix_len), 0), initial_logits


	def _coerce_llopa_inputs(tokenizer,
	system: str,
	document: str,
	question: str,
	*,
	task: str,
	device: str,
	input_ids):
	if not isinstance(input_ids, dict):
	return _build_llopa_inputs(
	tokenizer,
	system=system,
	document=document,
	question=question,
	task=task,
	device=device,
	)

	def _get_first(mapping, *keys):
	for k in keys:
	if k in mapping and mapping[k] is not None:
	return mapping[k]
	return None

	ids_sys = _get_first(input_ids, "system_ids", "ids_sys")
	ids_sys_user = _get_first(input_ids, "system_user_ids", "ids_sys_user")
	user_doc_ids = _get_first(input_ids, "user_doc_ids")
	user_q_ids = _get_first(input_ids, "user_q_ids")
	hdr_tail = _get_first(input_ids, "hdr_tail")
	prompt_ids = _get_first(input_ids, "prompt_ids", "ids_hdr")

	if (ids_sys is None or ids_sys_user is None or user_doc_ids is None or
	user_q_ids is None or hdr_tail is None):
	built = _build_llopa_inputs(
	tokenizer,
	system=system,
	document=document,
	question=question,
	task=task,
	device=device,
	)
	if ids_sys is None:
	ids_sys = built["system_ids"]
	if ids_sys_user is None:
	ids_sys_user = built["system_user_ids"]
	if user_doc_ids is None:
	user_doc_ids = built["user_doc_ids"]
	if user_q_ids is None:
	user_q_ids = built["user_q_ids"]
	if hdr_tail is None:
	hdr_tail = built["hdr_tail"]
	if prompt_ids is None:
	prompt_ids = built.get("prompt_ids")

	if prompt_ids is None and ids_sys_user is not None and hdr_tail is not None:
	try:
	prompt_ids = torch.cat([ids_sys_user, hdr_tail], dim=1)
	except Exception:
	prompt_ids = None

	return {
	"prompt_ids": prompt_ids,
	"system_ids": ids_sys,
	"system_user_ids": ids_sys_user,
	"user_doc_ids": user_doc_ids,
	"user_q_ids": user_q_ids,
	"hdr_tail": hdr_tail,
	}

	def lcp_len(a: torch.Tensor, b: torch.Tensor) -> int:
	    """Return the length of the longest common prefix of row 0 of ``a`` and ``b``.

	    Both tensors are indexed as ``[0, :n]``; only the first row is compared.
	    """
	    overlap = min(a.size(1), b.size(1))
	    mismatches = (a[0, :overlap] != b[0, :overlap]).nonzero(as_tuple=False)
	    if mismatches.numel() == 0:
	        return overlap
	    return int(mismatches[0, 0])


	def _assistant_header_ids_from_chat_template(tokenizer, device):
	    """Infer assistant header ids by diffing chat-template renders.

	    Each probe conversation is rendered with and without the generation
	    prompt. The text added by the generation prompt is tokenized first; if
	    that yields nothing, the token-level difference between the two renders
	    is used instead. Returns None when no probe produces a non-empty header.
	    """
	    probes = (
	        [{"role": "system", "content": "system"}, {"role": "user", "content": "user"}],
	        [{"role": "user", "content": "user"}],
	    )

	    for probe in probes:
	        try:
	            text_plain = apply_chat_template(tokenizer, probe, add_generation_prompt=False)
	            text_prompted = apply_chat_template(tokenizer, probe, add_generation_prompt=True)
	        except Exception:
	            continue

	        if not (isinstance(text_plain, str) and isinstance(text_prompted, str)):
	            continue
	        if not text_prompted or text_prompted == text_plain:
	            continue

	        # Text-level diff: the generation prompt is a pure suffix.
	        if text_prompted.startswith(text_plain):
	            added_text = text_prompted[len(text_plain):]
	            if added_text:
	                try:
	                    added_ids = tokenizer(
	                        added_text,
	                        add_special_tokens=False,
	                        return_tensors="pt",
	                    ).input_ids.to(device)
	                    if added_ids.numel() > 0:
	                        return added_ids
	                except Exception:
	                    pass

	        # Token-level diff: drop the longest common prefix of both renders.
	        try:
	            ids_plain = tokens_from_messages(tokenizer, probe, device, add_generation_prompt=False)
	            ids_prompted = tokens_from_messages(tokenizer, probe, device, add_generation_prompt=True)
	        except Exception:
	            continue

	        if ids_prompted.numel() == 0 or ids_prompted.size(1) <= ids_plain.size(1):
	            continue

	        shared = lcp_len(ids_plain, ids_prompted)
	        if shared < ids_prompted.size(1):
	            header_ids = ids_prompted[:, shared:]
	            if header_ids.numel() > 0:
	                return header_ids

	    return None

	def _assistant_header_ids(tokenizer, device):
	    """Best-effort header ids appended by add_generation_prompt for common templates.

	    Resolution order:
	      1. Mistral-style templates: the single ``MISTRAL_ASSIST_START`` token,
	         or None when the tokenizer maps it to nothing or to the unknown token.
	      2. Templates that mention ``<|start_header_id|>``: the tokenized
	         ``<|start_header_id|>assistant<|end_header_id|>`` header followed
	         by a blank line.
	      3. Anything else: inferred by diffing chat-template renders.

	    Returns a ``(1, n)`` long tensor on ``device``, or None when the header
	    cannot be determined.
	    """
	    if _is_mistral_template(tokenizer):
	        try:
	            tok_id = tokenizer.convert_tokens_to_ids(MISTRAL_ASSIST_START)
	            if tok_id is None or tok_id == tokenizer.unk_token_id:
	                return None
	            return torch.tensor([[int(tok_id)]], device=device, dtype=torch.long)
	        except Exception:
	            return None
	    tmpl = getattr(tokenizer, "chat_template", "") or ""
	    # The marker is written without backslashes: "\|" is not a valid escape,
	    # so the backslash would stay in the literal and never match a template.
	    if "<|start_header_id|>" in tmpl:
	        header = "<|start_header_id|>assistant<|end_header_id|>\n\n"
	        try:
	            return tokenizer(header, add_special_tokens=False, return_tensors="pt").input_ids.to(device)
	        except Exception:
	            return None
	    return _assistant_header_ids_from_chat_template(tokenizer, device)


	def split_system_user_ids(tokenizer, system_text: str, user_text: str, device):
	    """Tokenize a system-only and a system+user conversation and split them.

	    Returns ``(ids_sys, user_ids, ids_sys_user)`` where ``user_ids`` is the
	    part of the system+user token sequence that lies beyond the length of
	    the system-only sequence.

	    Raises:
	        ValueError: If the system-only render is longer than the combined one.
	    """
	    system_only = [{"role": "system", "content": system_text}]
	    system_and_user = [
	        {"role": "system", "content": system_text},
	        {"role": "user", "content": user_text},
	    ]
	    ids_sys = tokens_from_messages(tokenizer, system_only, device, add_generation_prompt=False)
	    ids_sys_user = tokens_from_messages(tokenizer, system_and_user, device, add_generation_prompt=False)
	    sys_len = ids_sys.size(1)
	    if ids_sys_user.size(1) < sys_len:
	        raise ValueError("System-only tokens longer than system+user tokens.")
	    return ids_sys, ids_sys_user[:, sys_len:], ids_sys_user

	# -----------------------------
	# DynamicCache helpers
	# -----------------------------
	def pkv_len(pkv) -> int:
	    """Return the number of per-layer entries held by a KV cache.

	    Uses ``pkv.layers`` when present, then ``pkv.key_cache``, and otherwise
	    treats ``pkv`` itself as a sized sequence.
	    """
	    for container_attr in ("layers", "key_cache"):
	        if hasattr(pkv, container_attr):
	            return len(getattr(pkv, container_attr))
	    return len(pkv)

	def pkv_get(pkv, idx: int):
	    """Return the ``(keys, values)`` pair stored for layer ``idx``.

	    Handles caches exposing ``layers`` (entries with ``keys``/``values``),
	    caches exposing ``key_cache``/``value_cache`` lists, and plain indexable
	    containers (whose entry is returned as-is).
	    """
	    if hasattr(pkv, "layers"):
	        entry = pkv.layers[idx]
	        return entry.keys, entry.values
	    if not hasattr(pkv, "key_cache"):
	        return pkv[idx]
	    return pkv.key_cache[idx], pkv.value_cache[idx]

	def dc_from_subset(pkv_src, idxs: List[int]) -> DynamicCache:
	    """Copy the selected layers of ``pkv_src`` into a new DynamicCache.

	    Each selected layer's keys/values are passed to ``DynamicCache.update``
	    under the layer's original index.
	    """
	    subset = DynamicCache()
	    for layer_idx in idxs:
	        keys, values = pkv_get(pkv_src, layer_idx)
	        subset.update(keys, values, layer_idx)
	    return subset


	def _safe_dynamic_cache(config=None) -> DynamicCache:
	    """Create a DynamicCache, tolerating constructors that reject ``config``.

	    A TypeError whose message mentions ``max_cache_len`` triggers a retry
	    with no arguments; any other TypeError is re-raised.
	    """
	    try:
	        cache = DynamicCache(config=config)
	    except TypeError as err:
	        if "max_cache_len" not in str(err):
	            raise
	        cache = DynamicCache()
	    return cache

	def _get_inner_model(m):
	"""Return the decoder backbone that owns `.layers` (robust across wrappers)."""
	# unwrap DDP/Accelerate
	if hasattr(m, "module"):
	m = m.module
	# unwrap PEFT
	try:
	from peft import PeftModel
	if isinstance(m, PeftModel):
	try:
	m = m.get_base_model()
	except Exception:
	m = getattr(m, "base_model", m)
	except Exception:
	pass

	for attr in ("model", "transformer", "backbone", "base_model", "language_model"):
	if hasattr(m, attr):
	cand = getattr(m, attr)
	if hasattr(cand, "layers") and isinstance(getattr(cand, "layers", None), nn.ModuleList):
	return cand
	if hasattr(cand, "decoder") and hasattr(cand.decoder, "layers") and isinstance(cand.decoder.layers, nn.ModuleList):
	return cand.decoder
	if hasattr(m, "layers") and isinstance(getattr(m, "layers", None), nn.ModuleList):
	return m
	for child in m.modules():
	if child is m:
	continue
	if hasattr(child, "layers") and isinstance(getattr(child, "layers", None), nn.ModuleList):
	return child
	raise AttributeError("Could not locate inner base model with a .layers attribute")

	def _get_llopa_core(model):
	    """Return the decoder object that owns the LLoPA prefill/cache hooks.

	    The inner model and then its ``.model`` attribute are probed, first for
	    ``llopa_prefill_cache`` and then for the ``tri_build_caches`` name.
	    Returns None when neither object exposes either hook.
	    """
	    inner = _get_inner_model(model)
	    for hook_name in ("llopa_prefill_cache", "tri_build_caches"):
	        if hasattr(inner, hook_name):
	            return inner
	        if hasattr(inner, "model") and hasattr(inner.model, hook_name):
	            return inner.model
	    return None

	def _get_llopa_decode_step(model):
	"""Return the cached LLoPA decode-step callable if present."""
	for name in ("llopa_decode_step_logits", "tri_step_logits"):
	if hasattr(model, name):
	return getattr(model, name)
	try:
	from peft import PeftModel
	if isinstance(model, PeftModel):
	try:
	base = model.get_base_model()
	except Exception:
	base = getattr(model, "base_model", None)
	if base is not None:
	for name in ("llopa_decode_step_logits", "tri_step_logits"):
	if hasattr(base, name):
	return getattr(base, name)
	except Exception:
	pass
	return None


	def _get_llopa_full_prompt_seed(model):
	"""Return the full-prompt seed callable used for past-assistant contexts."""
	for name in ("llopa_full_prompt_prefill_seed", "tri_reference_prefill_seed"):
	if hasattr(model, name):
	return getattr(model, name)
	try:
	from peft import PeftModel
	if isinstance(model, PeftModel):
	try:
	base = model.get_base_model()
	except Exception:
	base = getattr(model, "base_model", None)
	if base is not None:
	for name in ("llopa_full_prompt_prefill_seed", "tri_reference_prefill_seed"):
	if hasattr(base, name):
	return getattr(base, name)
	except Exception:
	pass
	return None


	def _has_active_llopa_runtime(model) -> bool:
	    """Return True when LLoPA hooks are present on the loaded model.

	    Both an LLoPA core and a decode-step callable must be found.
	    """
	    if _get_llopa_core(model) is None:
	        return False
	    return _get_llopa_decode_step(model) is not None


	def _round_up_to_multiple(value: int, multiple: int) -> int:
	value = int(value)
	multiple = int(multiple)
	if multiple <= 0:
	return value
	return ((value + multiple - 1) // multiple) * multiple


	_OPTIMIZED_LLOPA_VARIANT_PRESETS = {
	"baseline": {
	"seed_mode": "auto",
	"upper_prepare_mode": "exact",
	"upper_bucket_multiple": 0,
	"seq_bucket_multiple": 256,
	},
	"upper_ws_auto": {
	"seed_mode": "auto",
	"upper_prepare_mode": "bucketed_workspace",
	"upper_bucket_multiple": 256,
	"seq_bucket_multiple": 256,
	},
	"upper_ws_auto_128": {
	"seed_mode": "auto",
	"upper_prepare_mode": "bucketed_workspace",
	"upper_bucket_multiple": 128,
	"seq_bucket_multiple": 128,
	},
	"upper_ws_tri": {
	"seed_mode": "tri",
	"upper_prepare_mode": "bucketed_workspace",
	"upper_bucket_multiple": 256,
	"seq_bucket_multiple": 256,
	},
	"upper_ws_stable": {
	"seed_mode": "stable",
	"upper_prepare_mode": "bucketed_workspace",
	"upper_bucket_multiple": 256,
	"seq_bucket_multiple": 256,
	},
	"upper_ws_matched": {
	"seed_mode": "matched",
	"upper_prepare_mode": "bucketed_workspace",
	"upper_bucket_multiple": 256,
	"seq_bucket_multiple": 256,
	},
	}


	def _normalize_optimized_llopa_seed_mode(seed_mode: Optional[str]) -> str:
	raw = str(seed_mode or "auto").strip().lower()
	if raw in {"", "default"}:
	raw = "auto"
	if raw not in {"auto", "tri", "stable", "matched"}:
	raw = "auto"
	return raw


	def _normalize_optimized_llopa_upper_prepare_mode(mode: Optional[str]) -> str:
	raw = str(mode or "exact").strip().lower()
	if raw in {"", "default"}:
	raw = "exact"
	if raw not in {"exact", "bucketed_workspace"}:
	raw = "exact"
	return raw


	def _resolve_optimized_llopa_settings(
	    *,
	    variant: Optional[str],
	    seed_mode: Optional[str],
	    upper_prepare_mode: Optional[str],
	    upper_bucket_multiple: Optional[int],
	    seq_bucket_multiple: Optional[int],
	):
	    """Merge explicit overrides with a named preset into concrete settings.

	    ``variant`` selects a preset ("", "default" and "auto" mean
	    "upper_ws_auto"; unknown names also fall back to that preset's values).
	    Every other argument overrides the preset when it is not None.

	    Returns a dict with keys ``variant``, ``seed_mode``,
	    ``upper_prepare_mode``, ``upper_bucket_multiple`` and
	    ``seq_bucket_multiple``. The upper bucket multiple is forced to 0 unless
	    the mode is "bucketed_workspace" (and is never negative); a non-positive
	    sequence bucket multiple becomes 256.
	    """
	    default_variant = "upper_ws_auto"
	    preset_name = str(variant or default_variant).strip().lower()
	    if preset_name in {"", "default", "auto"}:
	        preset_name = default_variant
	    preset = _OPTIMIZED_LLOPA_VARIANT_PRESETS.get(
	        preset_name,
	        _OPTIMIZED_LLOPA_VARIANT_PRESETS[default_variant],
	    )

	    if seed_mode is None:
	        seed_mode = preset.get("seed_mode")
	    final_seed_mode = _normalize_optimized_llopa_seed_mode(seed_mode)

	    if upper_prepare_mode is None:
	        upper_prepare_mode = preset.get("upper_prepare_mode")
	    final_upper_mode = _normalize_optimized_llopa_upper_prepare_mode(upper_prepare_mode)

	    if upper_bucket_multiple is None:
	        final_upper_bucket = int(preset.get("upper_bucket_multiple", 0) or 0)
	    else:
	        final_upper_bucket = int(upper_bucket_multiple)

	    if seq_bucket_multiple is None:
	        final_seq_bucket = int(preset.get("seq_bucket_multiple", 256) or 256)
	    else:
	        final_seq_bucket = int(seq_bucket_multiple)

	    # Upper bucketing only applies to the bucketed workspace mode.
	    if final_upper_mode != "bucketed_workspace" or final_upper_bucket < 0:
	        final_upper_bucket = 0
	    if final_seq_bucket <= 0:
	        final_seq_bucket = 256

	    return {
	        "variant": preset_name,
	        "seed_mode": final_seed_mode,
	        "upper_prepare_mode": final_upper_mode,
	        "upper_bucket_multiple": int(final_upper_bucket),
	        "seq_bucket_multiple": int(final_seq_bucket),
	    }


	@contextlib.contextmanager
	def _temporary_model_attrs(model, **updates):
	sentinel = object()
	prior = {}
	try:
	for key, value in updates.items():
	prior[key] = getattr(model, key, sentinel)
	setattr(model, key, value)
	yield
	finally:
	for key, old_value in prior.items():
	if old_value is sentinel:
	with contextlib.suppress(Exception):
	delattr(model, key)
	else:
	with contextlib.suppress(Exception):
	setattr(model, key, old_value)


	def _acquire_bucketed_sequence_workspace(
	    model,
	    *,
	    reference_ids: torch.Tensor,
	    batch_size: int,
	    total_len: int,
	    bucket_multiple: int,
	):
	    """Return a reusable ``(batch_size, bucketed_len)`` buffer and its length.

	    ``total_len`` is rounded up to ``bucket_multiple``. Buffers are kept in
	    a dict stored on ``model`` under
	    ``_optimized_llopa_sequence_workspace_cache``, keyed by device, dtype,
	    batch size and bucketed length, and take their dtype/device from
	    ``reference_ids``. The buffer is created with ``torch.empty`` and is not
	    cleared between uses.
	    """
	    cache_attr = "_optimized_llopa_sequence_workspace_cache"

	    padded_len = _round_up_to_multiple(total_len, bucket_multiple)
	    if padded_len <= 0:
	        padded_len = int(total_len)

	    buf_dtype = reference_ids.dtype
	    buf_device = reference_ids.device
	    wanted_shape = (int(batch_size), int(padded_len))
	    cache_key = (str(buf_device), str(buf_dtype), wanted_shape[0], wanted_shape[1])

	    pool = getattr(model, cache_attr, None)
	    if not isinstance(pool, dict):
	        pool = {}

	    buffer = pool.get(cache_key)
	    reusable = (
	        isinstance(buffer, torch.Tensor)
	        and buffer.device == buf_device
	        and buffer.dtype == buf_dtype
	        and buffer.shape == wanted_shape
	    )
	    if not reusable:
	        buffer = torch.empty(wanted_shape, dtype=buf_dtype, device=buf_device)
	        pool[cache_key] = buffer
	    # Best effort: models that reject new attributes simply do not cache.
	    try:
	        setattr(model, cache_attr, pool)
	    except Exception:
	        pass
	    return buffer, int(padded_len)

	def _kv_meta_from_model(model_like):
	"""Return (num_kv_heads, head_dim, dtype)."""
	try:
	cfg = getattr(model_like, "config", None) or getattr(_get_inner_model(model_like), "config", None)
	except Exception:
	cfg = getattr(_get_inner_model(model_like), "config", None)
	num_heads = getattr(cfg, "num_attention_heads", None)
	num_kv = getattr(cfg, "num_key_value_heads", None) or num_heads
	hidden = getattr(cfg, "hidden_size", None)
	head_dim = (hidden // num_heads) if (hidden and num_heads) else None
	try:
	dtype = next(_get_inner_model(model_like).parameters()).dtype
	except Exception:
	dtype = torch.float16 if torch.cuda.is_available() else torch.float32
	return int(num_kv), int(head_dim), dtype

	def _make_empty_kv(batch: int, num_kv: int, head_dim: int, device, dtype):
	shape = (batch, num_kv, 0, head_dim)
	k = torch.empty(shape, device=device, dtype=dtype)
	v = torch.empty(shape, device=device, dtype=dtype)
	return k.contiguous(), v.contiguous()

	# -----------------------------
	# LLOPA helpers (encapsulated)
	# -----------------------------
	def _llopa_split_system(system_ids: torch.Tensor, system_prefill: str):
	"""Return (system_upper, system_lower_extra) based on system_prefill."""
	sys_prefill = (system_prefill or "full").strip().lower()
	if sys_prefill not in {"full", "no_system", "no_bos_system"}:
	sys_prefill = "full"
	if sys_prefill == "full":
	return system_ids, system_ids[:, :0]
	if sys_prefill == "no_system":
	if system_ids.size(1) < 1:
	return system_ids[:, :0], system_ids[:, :0]
	return system_ids[:, :1], system_ids[:, 1:]
	# no_bos_system
	return system_ids[:, :0], system_ids


	def _llopa_merge_replay_user_span(
	    system_ids: torch.Tensor,
	    *,
	    system_prefill: str,
	    replay_user_prefix_keep_len: int,
	    replay_user_start: Optional[int],
	    replay_user_len: Optional[int],
	):
	    """Shift replay-span offsets by the system tokens that move to the lower side.

	    The number of lower-extra system tokens (per ``_llopa_split_system``) is
	    added to the prefix-keep length and to the user start. Negative inputs
	    are clamped to 0; a None start or length stays None.

	    Returns ``(prefix_keep_len, user_start_or_None, user_len_or_None)``.
	    """
	    _, lower_extra = _llopa_split_system(system_ids, system_prefill)
	    shift = int(lower_extra.size(1))

	    keep_len = shift + max(int(replay_user_prefix_keep_len or 0), 0)
	    if replay_user_start is None:
	        user_start = None
	    else:
	        user_start = shift + max(int(replay_user_start), 0)
	    if replay_user_len is None:
	        user_len = None
	    else:
	        user_len = max(int(replay_user_len), 0)
	    return int(keep_len), user_start, user_len


	def _llopa_merge_user(system_ids: torch.Tensor, user_ids: torch.Tensor, system_prefill: str):
	    """Return ``(system_upper, user_ids_for_lower)`` for the given prefill mode.

	    System tokens that ``_llopa_split_system`` assigns to the lower side are
	    prepended to ``user_ids``; when either part is empty the other is
	    returned without concatenation.
	    """
	    upper, lower_extra = _llopa_split_system(system_ids, system_prefill)
	    if lower_extra.numel() == 0:
	        merged = user_ids
	    elif user_ids.numel() == 0:
	        merged = lower_extra
	    else:
	        merged = torch.cat([lower_extra, user_ids], dim=1)
	    return upper, merged


	def _llopa_prefill_cache(llopa_core,
	system_ids: torch.Tensor,
	user_ids: torch.Tensor,
	assistant_ids: torch.Tensor,
	*,
	lower_k: int,
	prefill_mode: str,
	prefill_attn: str,
	system_prefill: str,
	return_last_assistant_hidden: bool = False,
	replay_user_prefix_keep_len: int = 0,
	replay_user_start: Optional[int] = None,
	replay_user_len: Optional[int] = None):
	prefill_mode = (prefill_mode or "lower").strip().lower()
	prefill_attn = (prefill_attn or "causal").strip().lower()
	if prefill_attn == "prefix_full":
	prefill_attn = "full"
	if prefill_mode != "lower":
	raise ValueError("llopa_prefill requires prefill_mode='lower'.")
	if prefill_attn not in {"causal", "full"}:
	raise ValueError("llopa_prefill requires prefill_attn in {'causal','full'}.")
	llopa_fn = getattr(llopa_core, "llopa_prefill_cache", None)
	if llopa_fn is None:
	raise RuntimeError("llopa_prefill_cache not found. Check LLoPA modeling patch.")
	sys_upper, user_llopa = _llopa_merge_user(system_ids, user_ids, system_prefill)
	merged_replay_prefix_keep_len, merged_replay_user_start, merged_replay_user_len = _llopa_merge_replay_user_span(
	system_ids,
	system_prefill=system_prefill,
	replay_user_prefix_keep_len=int(replay_user_prefix_keep_len or 0),
	replay_user_start=replay_user_start,
	replay_user_len=replay_user_len,
	)
	prefill_out = llopa_fn(
	system_ids=sys_upper,
	user_ids=user_llopa,
	assistant_ids=assistant_ids,
	lower_k=lower_k,
	prefill_mode=prefill_mode,
	prefill_attn=prefill_attn,
	return_last_assistant_hidden=bool(return_last_assistant_hidden),
	replay_user_prefix_keep_len=merged_replay_prefix_keep_len,
	replay_user_start=merged_replay_user_start,
	replay_user_len=merged_replay_user_len,
	)
	if bool(return_last_assistant_hidden):
	if not isinstance(prefill_out, tuple):
	raise RuntimeError("llopa_prefill_cache did not return the requested last assistant hidden state.")
	pkv, last_hidden = prefill_out
	return pkv, sys_upper.size(1), user_llopa.size(1), last_hidden
	pkv = prefill_out
	return pkv, sys_upper.size(1), user_llopa.size(1)

	# ---------------------------------------------------------------------------
	# LoPA per-layer cache_position/position_ids adjustment
	# Aligns per-layer positions when lower layers have prefill past and upper
	# layers start from zero. Mirrors the trainer's runtime patch.
	# ---------------------------------------------------------------------------
	import contextlib

	@contextlib.contextmanager
	def lopa_cache_position_patch(model, past_key_values):
	    """
	    Match trainer's dynamic position alignment.

	    While the context is active, every decoder layer gets a forward pre-hook.
	    The hook reads the first element of ``cache_position`` (or, failing
	    that, of ``position_ids``) and subtracts ``first - past_len`` from both
	    tensors, where ``past_len`` is that layer's cached sequence length taken
	    from ``past_key_values`` when the context was entered. Hooks and the
	    temporary ``_lopa_past`` attribute are removed on exit.
	    """
	    backbone = _get_inner_model(model)

	    def _cached_len(layer_idx: int) -> int:
	        # Per-layer past length from the provided cache snapshot.
	        if hasattr(past_key_values, "key_cache") and hasattr(past_key_values, "value_cache"):
	            return int(past_key_values.key_cache[layer_idx].shape[2])
	        if hasattr(past_key_values, "layers"):
	            return int(past_key_values.layers[layer_idx].keys.shape[2])
	        return int(past_key_values[layer_idx][0].shape[2])

	    def _realign(module, args, kwargs):
	        layer_past = getattr(module, "_lopa_past", 0)
	        cache_pos = kwargs.get("cache_position", None)
	        pos_ids = kwargs.get("position_ids", None)
	        cache_pos_is_tensor = isinstance(cache_pos, torch.Tensor)
	        pos_ids_is_tensor = isinstance(pos_ids, torch.Tensor)

	        if cache_pos_is_tensor and cache_pos.numel() > 0:
	            first = int(cache_pos.view(-1)[0].item())
	        elif pos_ids_is_tensor and pos_ids.numel() > 0:
	            first = int(pos_ids.view(-1)[0].item())
	        else:
	            # No usable position information: leave the call untouched.
	            return args, kwargs

	        shift = first - layer_past
	        if shift != 0:
	            if cache_pos_is_tensor:
	                kwargs["cache_position"] = cache_pos - shift
	            if pos_ids_is_tensor:
	                kwargs["position_ids"] = pos_ids - shift
	        return args, kwargs

	    layer_count = len(backbone.layers)
	    cached_lens = [_cached_len(idx) for idx in range(layer_count)]

	    hook_handles = []
	    for layer, layer_past in zip(backbone.layers, cached_lens):
	        layer._lopa_past = int(layer_past)
	        hook_handles.append(layer.register_forward_pre_hook(_realign, with_kwargs=True))

	    try:
	        yield
	    finally:
	        for handle in hook_handles:
	            handle.remove()
	        for layer in backbone.layers:
	            if hasattr(layer, "_lopa_past"):
	                delattr(layer, "_lopa_past")

	# -----------------------------
	# TRI inference core
	# -----------------------------
	@torch.inference_mode()

	def _get_peft_wrapper(m):
	try:
	from peft import PeftModel
	except Exception:
	return None
	if hasattr(m, "module"):
	m = m.module
	return m if isinstance(m, PeftModel) else None


	def _set_prefill_adapter(model, enabled: bool) -> None:
	if not bool(getattr(model, "_prefill_adapter_only", False)):
	return
	peft_model = _get_peft_wrapper(model)
	if peft_model is None:
	return
	base = getattr(peft_model, "base_model", None)
	if base is None:
	return
	try:
	if enabled:
	base.enable_adapter_layers()
	else:
	base.disable_adapter_layers()
	except Exception:
	return


	def _get_output_head(model):
	getter = getattr(model, "get_output_embeddings", None)
	if callable(getter):
	head = getter()
	if head is not None:
	return head
	return getattr(model, "lm_head", None)


	def _env_flag_enabled(name: str, default: str = "1") -> bool:
	raw = os.environ.get(name, default).strip().lower()
	return raw not in {"0", "false", "no", "off"}

	@torch.inference_mode()
	def lopa_generate(
	    model,
	    tokenizer,
	    system: str,
	    document: str,
	    question: str,
	    *,
	    task: str = "qa_doc",
	    K: int,
	    prefill_mode: str = "lower",
	    prefill_attn: str = "causal",
	    system_prefill: str = "full",
	    user_prefill: str = "full",
	    device: str,
	    input_ids: Optional[dict] = None,
	    max_new_tokens: int = 256,
	    min_length: int = 16,
	    temperature: float = 0.7,
	    top_p: float = 0.9,
	    top_k: Optional[int] = None,
	    do_sample: bool = True,
	    math_force_final_hash_rule: bool = False,
	    log_cuda_mem: bool = False,
	    log_cuda_tag: Optional[str] = None,
	    debug: bool = False,
	    debug_dir: Optional[Path] = None,
	    llopa_prefill: bool = False,
	    no_upper_attn: Optional[bool] = None,
	    return_tokens: bool = False,
	) -> str | Tuple[str, int]:
	    """Generate one completion with the custom LLoPA prefill/decode API.

	    The function (1) tokenizes the prompt parts via ``_coerce_llopa_inputs``,
	    (2) builds the prefill cache either through ``_llopa_prefill_cache``
	    (``llopa_prefill=True``) or through the core's ``tri_*`` prefill methods,
	    (3) pushes the assistant header (or the last prompt token) through the
	    decode step when ``llopa_prefill`` is off, and (4) decodes token by token
	    until a stop token or ``max_new_tokens``.

	    Args:
	        model: Model exposing the LLoPA core and decode step.
	        tokenizer: Tokenizer used for input coercion and final decoding.
	        system, document, question: Prompt parts.
	        task: Task name forwarded to input coercion; ``"math"`` enables the
	            optional '####' instruction below.
	        K: Number of lower layers, forwarded as ``lower_k``.
	        prefill_mode: Forwarded unchanged to the prefill/decode calls.
	        prefill_attn: ``"causal"`` or ``"full"`` (``"prefix_full"`` is an alias
	            for ``"full"``).
	        system_prefill: ``"full"``, ``"no_system"`` or ``"no_bos_system"``;
	            anything else falls back to ``"full"``.
	        user_prefill: ``"full"`` or ``"no_question"``; anything else falls back
	            to ``"full"``. With ``"no_question"`` the question ids are moved
	            from the prefill into the assistant prefix.
	        device: Forwarded to ``_coerce_llopa_inputs``.
	        input_ids: Optional pre-tokenized inputs forwarded to
	            ``_coerce_llopa_inputs``.
	        max_new_tokens: Upper bound on generated tokens.
	        min_length: Stop tokens are masked out while fewer than this many
	            tokens have been generated.
	        temperature, top_p, top_k, do_sample: Sampling controls; with
	            ``do_sample=False`` decoding is greedy (argmax).
	        math_force_final_hash_rule: For ``task == "math"`` without
	            ``input_ids``, append the '#### {numeric answer}' instruction to
	            the system prompt if it does not already contain ``"####"``.
	        log_cuda_mem, log_cuda_tag: Print CUDA memory statistics at phase
	            boundaries, optionally tagged.
	        debug, debug_dir: Print debug information and optionally dump rendered
	            prompts / generated text into ``debug_dir``.
	        llopa_prefill: Use ``_llopa_prefill_cache`` for the prefill.
	        no_upper_attn: Defaults to ``model._no_upper_attn``; only honoured when
	            ``llopa_prefill`` is on and the decode step accepts the argument.
	        return_tokens: Also return the number of generated tokens.

	    Returns:
	        The decoded text, or ``(text, n_tokens)`` when ``return_tokens`` is set.

	    Raises:
	        ValueError: On an invalid ``prefill_attn`` or an empty prompt after
	            LLoPA prefill.
	        RuntimeError: If the custom LLoPA modeling API is not available.
	    """
	    # Build ids
	    if input_ids is None and task == "math" and math_force_final_hash_rule and "####" not in (system or ""):
	        # `system or ""` mirrors the guard above so a None system cannot crash here.
	        system = (
	            (system or "").rstrip()
	            + " Conclude your explanation with the answer in a '#### {numeric answer}' format, "
	            + "where the answer is solely a number."
	        )
	    user_prefill = (user_prefill or "full").strip().lower()
	    if user_prefill not in {"full", "no_question"}:
	        user_prefill = "full"
	    prefill_attn = (prefill_attn or "causal").strip().lower()
	    if prefill_attn == "prefix_full":
	        prefill_attn = "full"
	    if prefill_attn not in {"causal", "full"}:
	        raise ValueError("prefill_attn must be one of: causal | full")

	    llopa_inputs = _coerce_llopa_inputs(
	        tokenizer,
	        system=system,
	        document=document,
	        question=question,
	        task=task,
	        device=device,
	        input_ids=input_ids,
	    )
	    ids_sys = llopa_inputs["system_ids"]
	    ids_sys_user = llopa_inputs["system_user_ids"]
	    user_doc_ids = llopa_inputs["user_doc_ids"]
	    user_q_ids = llopa_inputs["user_q_ids"]
	    hdr_tail = llopa_inputs["hdr_tail"]
	    ids_hdr = llopa_inputs.get("prompt_ids", None)

	    if debug:
	        try:
	            msgs = build_messages(system, document, question, include_query=True, task=task)
	            s_no_hdr = apply_chat_template(tokenizer, msgs, add_generation_prompt=False)
	            s_with_hdr = apply_chat_template(tokenizer, msgs, add_generation_prompt=True)
	            print(f"[debug] render lengths (chars): no_hdr={len(s_no_hdr)}, with_hdr={len(s_with_hdr)}")
	            if debug_dir is not None:
	                debug_dir.mkdir(parents=True, exist_ok=True)
	                (debug_dir / "infer_render_no_header.txt").write_text(s_no_hdr, encoding="utf-8")
	                (debug_dir / "infer_render_with_header.txt").write_text(s_with_hdr, encoding="utf-8")
	        except Exception as e:
	            print(f"[debug] render dump failed: {e}")

	    # Assistant header tokens
	    if ids_hdr is None:
	        ids_hdr = torch.cat([ids_sys_user, hdr_tail], dim=1) if hdr_tail is not None else ids_sys_user
	    # NOTE(review): re-derived unconditionally so hdr_tail is always a tensor
	    # (possibly empty) — confirm this statement was not nested under the `if`.
	    hdr_tail = ids_hdr[:, ids_sys_user.size(1):]
	    if user_prefill == "no_question":
	        prefix_full = torch.cat([user_q_ids, hdr_tail], dim=1)
	    else:
	        prefix_full = hdr_tail

	    # Require LLoPA runtime API
	    llopa_core = _get_llopa_core(model)
	    llopa_step = _get_llopa_decode_step(model)
	    if llopa_core is None or llopa_step is None:
	        raise RuntimeError("Custom LLoPA modeling not active. Check --lopa_modeling_path/--modeling_family.")
	    if no_upper_attn is None:
	        no_upper_attn = bool(getattr(model, "_no_upper_attn", False))
	    no_upper_attn = bool(no_upper_attn)
	    if no_upper_attn and (not bool(llopa_prefill)):
	        print("[infer][warn] no_upper_attn is ignored unless llopa_prefill=True.")
	    effective_no_upper_attn = bool(no_upper_attn and bool(llopa_prefill))
	    llopa_step_accepts_no_upper_attn = False
	    try:
	        llopa_step_params = inspect.signature(llopa_step).parameters
	        llopa_step_accepts_no_upper_attn = ("no_upper_attn" in llopa_step_params)
	    except Exception:
	        llopa_step_accepts_no_upper_attn = False
	    if effective_no_upper_attn and (not llopa_step_accepts_no_upper_attn):
	        print("[infer][warn] no_upper_attn requested but llopa_decode_step_logits does not support it; ignoring.")
	        effective_no_upper_attn = False

	    def _log_mem(tag: str) -> None:
	        """Print current/peak CUDA memory in GiB when memory logging is enabled."""
	        if not log_cuda_mem or not torch.cuda.is_available():
	            return
	        try:
	            torch.cuda.synchronize()
	        except Exception:
	            pass
	        alloc = torch.cuda.memory_allocated() / (1024 ** 3)
	        reserved = torch.cuda.memory_reserved() / (1024 ** 3)
	        max_alloc = torch.cuda.max_memory_allocated() / (1024 ** 3)
	        max_reserved = torch.cuda.max_memory_reserved() / (1024 ** 3)
	        prefix = "[mem]"
	        if log_cuda_tag:
	            prefix = f"{prefix}[{log_cuda_tag}]"
	        print(f"{prefix} {tag} | alloc={alloc:.2f}GiB reserved={reserved:.2f}GiB "
	              f"max_alloc={max_alloc:.2f}GiB max_reserved={max_reserved:.2f}GiB")

	    def _reset_peak_mem() -> None:
	        """Reset CUDA peak-memory counters when memory logging is enabled."""
	        if log_cuda_mem and torch.cuda.is_available():
	            try:
	                torch.cuda.reset_peak_memory_stats()
	            except Exception:
	                pass

	    _reset_peak_mem()
	    _log_mem("start")

	    # 1) TRI prefill: system all + user lower-K
	    _set_prefill_adapter(model, True)
	    lower_k = int(K)
	    sys_prefill = (system_prefill or "full").strip().lower()
	    if sys_prefill not in {"full", "no_system", "no_bos_system"}:
	        sys_prefill = "full"
	    if user_prefill == "no_question":
	        user_prefill_ids = user_doc_ids
	    else:
	        user_prefill_ids = torch.cat([user_doc_ids, user_q_ids], dim=1)

	    llopa_last_hidden = None
	    use_fused_first_token = False
	    if llopa_prefill:
	        # The fused path reuses the last assistant hidden state from prefill to
	        # produce the first-token logits without an extra decode step.
	        use_fused_first_token = bool(
	            prefix_full.numel() > 0
	            and not effective_no_upper_attn
	            and not bool(getattr(model, "_prefill_adapter_only", False))
	        )
	        if use_fused_first_token:
	            pkv, S, U, llopa_last_hidden = _llopa_prefill_cache(
	                llopa_core,
	                ids_sys,
	                user_prefill_ids,
	                prefix_full,
	                lower_k=lower_k,
	                prefill_mode=prefill_mode,
	                prefill_attn=prefill_attn,
	                system_prefill=sys_prefill,
	                return_last_assistant_hidden=True,
	            )
	        else:
	            pkv, S, U = _llopa_prefill_cache(
	                llopa_core,
	                ids_sys,
	                user_prefill_ids,
	                prefix_full,
	                lower_k=lower_k,
	                prefill_mode=prefill_mode,
	                prefill_attn=prefill_attn,
	                system_prefill=sys_prefill,
	            )
	    elif sys_prefill == "full":
	        pkv, S, U = llopa_core.tri_build_caches(
	            system_ids=ids_sys,
	            user_ids=user_prefill_ids,
	            lower_k=lower_k,
	            prefill_mode=prefill_mode,
	            prefill_attn=prefill_attn,
	        )
	    elif sys_prefill == "no_system":
	        # Only the first system token goes through the all-layer system
	        # prefill; the remaining system tokens use the lower-K user prefill.
	        if ids_sys.size(1) < 1:
	            pkv = _safe_dynamic_cache(getattr(llopa_core, "config", None))
	        else:
	            bos_ids = ids_sys[:, :1]
	            rest_ids = ids_sys[:, 1:]
	            out = llopa_core.tri_prefill_system_all(
	                bos_ids,
	                past_key_values=None,
	                prefill_attn=prefill_attn,
	            )
	            pkv = out.past_key_values
	            if rest_ids.size(1) > 0:
	                _ = llopa_core.tri_prefill_user_lower(
	                    rest_ids,
	                    lower_k=lower_k,
	                    past_key_values=pkv,
	                    prefill_mode=prefill_mode,
	                    prefill_attn=prefill_attn,
	                )
	        _ = llopa_core.tri_prefill_user_lower(
	            user_prefill_ids,
	            lower_k=lower_k,
	            past_key_values=pkv,
	            prefill_mode=prefill_mode,
	            prefill_attn=prefill_attn,
	        )
	        S, U = ids_sys.size(1), user_prefill_ids.size(1)
	    else:
	        # "no_bos_system": every system token uses the lower-K user prefill.
	        pkv = _safe_dynamic_cache(getattr(llopa_core, "config", None))
	        if ids_sys.size(1) > 0:
	            _ = llopa_core.tri_prefill_user_lower(
	                ids_sys,
	                lower_k=lower_k,
	                past_key_values=pkv,
	                prefill_mode=prefill_mode,
	                prefill_attn=prefill_attn,
	            )
	        _ = llopa_core.tri_prefill_user_lower(
	            user_prefill_ids,
	            lower_k=lower_k,
	            past_key_values=pkv,
	            prefill_mode=prefill_mode,
	            prefill_attn=prefill_attn,
	        )
	        S, U = ids_sys.size(1), user_prefill_ids.size(1)

	    _set_prefill_adapter(model, False)
	    _log_mem("prefill_end")
	    initial_logits = None
	    if use_fused_first_token:
	        output_head = _get_output_head(model)
	        if output_head is not None and isinstance(llopa_last_hidden, torch.Tensor) and llopa_last_hidden.numel() > 0:
	            initial_logits = output_head(llopa_last_hidden)[:, -1, :].to(torch.float32)

	    def _decode_step(ids, cache, logits_to_keep: int):
	        """Run one LLoPA decode step over ``ids`` against ``cache``."""
	        step_kwargs = dict(
	            assistant_ids=ids,
	            lower_k=lower_k,
	            pkv=cache,
	            S=S,
	            U=U,
	            logits_to_keep=logits_to_keep,
	            labels=None,
	            prefill_mode=prefill_mode,
	        )
	        if effective_no_upper_attn and llopa_step_accepts_no_upper_attn:
	            step_kwargs["no_upper_attn"] = True
	        return llopa_step(**step_kwargs)

	    # 2) Push assistant header if present (or fallback to last user token)
	    if llopa_prefill:
	        if prefix_full.numel() > 0:
	            last_pushed = prefix_full[:, -1:]
	        elif ids_sys_user.numel() > 0:
	            last_pushed = ids_sys_user[:, -1:]
	        else:
	            raise ValueError("Empty prompt after LLOPA prefill; cannot start decoding.")
	    else:
	        seed_ids = prefix_full if prefix_full.numel() > 0 else ids_sys_user[:, -1:]
	        out_seed = _decode_step(seed_ids, pkv, 0)
	        pkv = out_seed.past_key_values or pkv
	        last_pushed = seed_ids[:, -1:]

	    _reset_peak_mem()
	    _log_mem("decode_start")

	    # 3) decoding
	    from transformers.generation import LogitsProcessorList
	    from transformers.generation.logits_process import TemperatureLogitsWarper, TopPLogitsWarper, TopKLogitsWarper

	    eos_id = tokenizer.eos_token_id
	    try:
	        eot_id = tokenizer.convert_tokens_to_ids("<|eot_id|>")
	    except Exception:
	        eot_id = None
	    stop_ids = set()
	    if eos_id is not None:
	        stop_ids.add(int(eos_id))
	    # A tokenizer without this token maps it to the unknown id; skip it then.
	    if eot_id is not None and eot_id != tokenizer.unk_token_id:
	        stop_ids.add(int(eot_id))

	    procs = None
	    if do_sample:
	        procs = LogitsProcessorList()
	        if temperature and temperature != 1.0:
	            procs.append(TemperatureLogitsWarper(temperature=float(temperature)))
	        if top_p and top_p < 1.0:
	            procs.append(TopPLogitsWarper(top_p=float(top_p), min_tokens_to_keep=1))
	        if top_k is not None and top_k > 0:
	            procs.append(TopKLogitsWarper(top_k=int(top_k), filter_value=-float("inf")))

	    device_t = last_pushed.device
	    last = last_pushed
	    # The output buffer has a single row; `next_tok.item()` below relies on that.
	    generated = torch.empty((1, max_new_tokens), dtype=torch.long, device=device_t)
	    cur = 0
	    stop_reason = None
	    pending_logits = initial_logits

	    while cur < max_new_tokens:
	        if pending_logits is None:
	            out = _decode_step(last, pkv, 1)
	            pkv = out.past_key_values or pkv
	            logits = out.logits[:, -1, :]
	        else:
	            # First token on the fused path: logits were computed from prefill.
	            logits = pending_logits
	            pending_logits = None

	        # force min_length
	        if stop_ids and cur < min_length:
	            for sid in stop_ids:
	                logits[:, sid] = -float("inf")

	        if procs is not None:
	            inp_for_proc = generated[:, :cur]
	            if logits.dtype != torch.float32:
	                logits = logits.float()
	            logits = procs(inp_for_proc, logits)

	        if do_sample:
	            probs = torch.softmax(logits, dim=-1)
	            next_tok = torch.multinomial(probs, num_samples=1)
	        else:
	            next_tok = torch.argmax(logits, dim=-1, keepdim=True)

	        generated[:, cur:cur + 1] = next_tok
	        last = next_tok

	        if stop_ids and cur >= min_length:
	            tok_id = int(next_tok.item())
	            if tok_id in stop_ids:
	                stop_reason = f"stop_token:{tok_id}"
	                cur += 1  # the stop token itself is counted
	                break

	        cur += 1

	    if stop_reason is None and cur >= max_new_tokens:
	        stop_reason = "max_new_tokens"

	    gen_ids = generated[:, :cur]
	    text = tokenizer.decode(gen_ids[0].tolist(), skip_special_tokens=True)
	    _log_mem("decode_end")
	    if debug:
	        print(f"[debug] finished | tokens={cur} | reason={stop_reason}")
	        # NOTE(review): kept under `debug`, matching the render dump above —
	        # confirm the dump should not also run when only debug_dir is given.
	        if debug_dir is not None:
	            debug_dir.mkdir(parents=True, exist_ok=True)
	            (debug_dir / "infer_generated.txt").write_text(text, encoding="utf-8")
	    if return_tokens:
	        return text, int(cur)
	    return text

	# -----------------------------
	# LLOPA helpers (Capsule interface + HF packaging)
	# -----------------------------

	def llopa_generate(*args, **kwargs):
	    """Capsule interface: identical behavior to lopa_generate.

	    All positional and keyword arguments are forwarded to ``lopa_generate``.
	    When ``device`` is missing or ``None`` and at least one positional
	    argument is given, the first positional argument is treated as the model
	    and its device is filled in: the input-embedding weight's device, else
	    the device of the first parameter, else ``"cuda"`` if available, else
	    ``"cpu"``.
	    """
	    if kwargs.get("device") is None and len(args) >= 1:
	        model = args[0]
	        try:
	            dev = model.get_input_embeddings().weight.device
	        except Exception:
	            try:
	                dev = next(model.parameters()).device
	            except Exception:
	                dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	        kwargs["device"] = str(dev)
	    return lopa_generate(*args, **kwargs)


	def _read_kv_file(path: Path) -> dict[str, str]:
	info: dict[str, str] = {}
	try:
	lines = path.read_text(encoding="utf-8").splitlines()
	except Exception:
	return info
	for raw in lines:
	line = raw.strip()
	if not line or line.startswith("#"):
	continue
	if "=" not in line:
	continue
	k, v = line.split("=", 1)
	k = k.strip()
	v = v.strip()
	if k:
	info[k] = v
	return info


	def _read_adapter_backbone_ref(repo_path: Path) -> str:
	adapter_cfg = repo_path / "adapter_config.json"
	if not adapter_cfg.is_file():
	return ""
	try:
	data = json.loads(adapter_cfg.read_text(encoding="utf-8"))
	except Exception:
	return ""
	val = data.get("base_model_name_or_path")
	if isinstance(val, str):
	return val.strip()
	return ""


	def _resolve_repo_path(model_repo: str, cache_dir: Optional[str] = None,
	revision: Optional[str] = None, token: Optional[str] = None,
	local_files_only: bool = False) -> Path:
	repo = Path(model_repo)
	if repo.exists():
	return repo
	try:
	from huggingface_hub import snapshot_download
	except Exception as exc:
	raise RuntimeError("huggingface_hub is required to load remote repos") from exc
	path = snapshot_download(
	repo_id=model_repo,
	cache_dir=cache_dir,
	revision=revision,
	token=token,
	local_files_only=local_files_only,
	)
	return Path(path)


	def _repo_has_pretrained_weights(repo_path: Path) -> bool:
	weight_files = (
	"pytorch_model.bin",
	"pytorch_model.bin.index.json",
	"model.safetensors",
	"model.safetensors.index.json",
	)
	if any((repo_path / name).is_file() for name in weight_files):
	return True
	return any(repo_path.glob("pytorch_model--of-.bin")) or any(repo_path.glob("model--of-.safetensors"))


	def _pick_vocab_weight_key(keys) -> Optional[str]:
	preferred = (
	"model.embed_tokens.weight",
	"embed_tokens.weight",
	"model.decoder.embed_tokens.weight",
	"transformer.wte.weight",
	"lm_head.weight",
	"model.lm_head.weight",
	)
	key_list = list(keys)
	for cand in preferred:
	if cand in key_list:
	return cand
	suffixes = (
	"embed_tokens.weight",
	"decoder.embed_tokens.weight",
	"wte.weight",
	"lm_head.weight",
	)
	for suffix in suffixes:
	for key in key_list:
	if str(key).endswith(suffix):
	return str(key)
	return None


	def _load_state_dict_meta(path: Path):
	try:
	return torch.load(path, map_location="meta", weights_only=True)
	except TypeError:
	return torch.load(path, map_location="meta")


	def _weight_rows_from_file(path: Path, tensor_key: Optional[str] = None) -> Optional[int]:
	    """Return the first-dimension size of a vocab-like tensor in a weight file.

	    Files whose last suffix is ``.safetensors`` are inspected through
	    ``safetensors.safe_open`` (shape only); everything else is loaded as a
	    torch state dict on the meta device. ``tensor_key`` selects the tensor;
	    otherwise one is chosen by ``_pick_vocab_weight_key``. Returns ``None``
	    when no suitable tensor with at least two dimensions is found or on any
	    error.
	    """
	    try:
	        file_suffixes = path.suffixes
	    except Exception:
	        file_suffixes = []

	    # A trailing [".model", ".safetensors"] pair is already covered by the
	    # last-suffix check, so one comparison is sufficient.
	    is_safetensors = file_suffixes[-1:] == [".safetensors"]

	    try:
	        if is_safetensors:
	            from safetensors import safe_open

	            with safe_open(str(path), framework="pt", device="cpu") as reader:
	                selected = tensor_key or _pick_vocab_weight_key(reader.keys())
	                if not selected:
	                    return None
	                dims = reader.get_slice(selected).get_shape()
	                return int(dims[0]) if len(dims) >= 2 else None

	        weights = _load_state_dict_meta(path)
	        if not isinstance(weights, dict):
	            return None
	        if tensor_key in weights:
	            selected = tensor_key
	        else:
	            selected = _pick_vocab_weight_key(weights.keys())
	        if not selected:
	            return None
	        candidate = weights.get(selected)
	        if isinstance(candidate, torch.Tensor) and candidate.ndim >= 2:
	            return int(candidate.shape[0])
	        return None
	    except Exception:
	        return None


	def _infer_checkpoint_vocab_size(repo_path: Path) -> Optional[int]:
	index_files = (
	"pytorch_model.bin.index.json",
	"model.safetensors.index.json",
	)
	single_weight_files = (
	"pytorch_model.bin",
	"model.safetensors",
	)

	for name in index_files:
	index_path = repo_path / name
	if not index_path.is_file():
	continue
	try:
	data = json.loads(index_path.read_text(encoding="utf-8"))
	except Exception:
	continue
	weight_map = data.get("weight_map")
	if not isinstance(weight_map, dict):
	continue
	tensor_key = _pick_vocab_weight_key(weight_map.keys())
	if not tensor_key:
	continue
	shard_rel = weight_map.get(tensor_key)
	if not shard_rel:
	continue
	rows = _weight_rows_from_file(repo_path / shard_rel, tensor_key=tensor_key)
	if rows is not None and rows > 0:
	return rows

	for name in single_weight_files:
	weight_path = repo_path / name
	if not weight_path.is_file():
	continue
	rows = _weight_rows_from_file(weight_path)
	if rows is not None and rows > 0:
	return rows

	for weight_path in sorted(repo_path.glob("pytorch_model--of-.bin")):
	rows = _weight_rows_from_file(weight_path)
	if rows is not None and rows > 0:
	return rows
	for weight_path in sorted(repo_path.glob("model--of-.safetensors")):
	rows = _weight_rows_from_file(weight_path)
	if rows is not None and rows > 0:
	return rows
	return None


	def _expand_tokenizer_placeholders(tokenizer, target_vocab_size: int) -> int:
	try:
	current_vocab_size = int(len(tokenizer))
	except Exception:
	return 0
	if target_vocab_size <= current_vocab_size:
	return 0
	missing = int(target_vocab_size - current_vocab_size)
	placeholder_tokens = [f"<\|capsule_missing_token_{idx}\|>" for idx in range(missing)]
	try:
	return int(tokenizer.add_tokens(placeholder_tokens, special_tokens=True) or 0)
	except Exception:
	return 0


	def _normalize_special_token_values(raw_value: Any) -> list[str]:
	if not isinstance(raw_value, list):
	return []
	normalized: list[str] = []
	for item in raw_value:
	token = ""
	if isinstance(item, str):
	token = item
	elif isinstance(item, dict):
	for key in ("content", "token", "text"):
	value = item.get(key)
	if isinstance(value, str) and value:
	token = value
	break
	if token:
	normalized.append(token)
	return normalized


	def _checkpoint_special_token_candidates(checkpoint_repo_path: Path) -> list[str]:
	candidates: list[str] = []
	for filename in ("config.json", "tokenizer_config.json", "special_tokens_map.json"):
	path = checkpoint_repo_path / filename
	if not path.is_file():
	continue
	try:
	payload = json.loads(path.read_text(encoding="utf-8"))
	except Exception:
	continue
	if not isinstance(payload, dict):
	continue
	candidates.extend(_normalize_special_token_values(payload.get("capsule_suffix_special_tokens")))
	candidates.extend(_normalize_special_token_values(payload.get("additional_special_tokens")))
	raw_decoder = payload.get("added_tokens_decoder")
	if isinstance(raw_decoder, dict):
	for _, value in sorted(
	raw_decoder.items(),
	key=lambda item: int(item[0]) if str(item[0]).isdigit() else str(item[0]),
	):
	candidates.extend(_normalize_special_token_values([value]))
	deduped: list[str] = []
	seen: set[str] = set()
	for token in candidates:
	if not token or token in seen:
	continue
	seen.add(token)
	deduped.append(token)
	return deduped


	def _align_tokenizer_with_checkpoint_vocab(
	tokenizer,
	checkpoint_repo_path: Path,
	target_vocab_size: int,
	) -> dict[str, int]:
	report = {
	"checkpoint_vocab_size": int(target_vocab_size or 0),
	"tokenizer_vocab_size_before": 0,
	"checkpoint_special_candidate_count": 0,
	"checkpoint_specials_missing_before": 0,
	"added_checkpoint_specials": 0,
	"tokenizer_vocab_size_after_specials": 0,
	"padding_gap_before_placeholders": 0,
	"added_placeholders": 0,
	"tokenizer_vocab_size_after": 0,
	}
	try:
	current_vocab_size = int(len(tokenizer))
	except Exception:
	return report
	report["tokenizer_vocab_size_before"] = current_vocab_size
	report["tokenizer_vocab_size_after_specials"] = current_vocab_size
	report["tokenizer_vocab_size_after"] = current_vocab_size

	missing = int(target_vocab_size - current_vocab_size)
	if missing <= 0:
	return report

	checkpoint_specials = _checkpoint_special_token_candidates(checkpoint_repo_path)
	report["checkpoint_special_candidate_count"] = len(checkpoint_specials)
	try:
	existing_vocab = set(tokenizer.get_vocab().keys())
	except Exception:
	existing_vocab = set()

	missing_checkpoint_specials = [token for token in checkpoint_specials if token not in existing_vocab]
	report["checkpoint_specials_missing_before"] = len(missing_checkpoint_specials)

	tokens_to_add: list[str] = []
	for token in missing_checkpoint_specials:
	tokens_to_add.append(token)
	existing_vocab.add(token)
	if len(tokens_to_add) >= missing:
	break

	added_checkpoint_specials = 0
	if tokens_to_add:
	try:
	current_specials = tokenizer.special_tokens_map_extended.get("additional_special_tokens", []) or []
	added_checkpoint_specials = int(
	tokenizer.add_special_tokens(
	{"additional_special_tokens": list(current_specials) + tokens_to_add}
	)
	or 0
	)
	except Exception:
	try:
	added_checkpoint_specials = int(tokenizer.add_tokens(tokens_to_add, special_tokens=True) or 0)
	except Exception:
	added_checkpoint_specials = 0
	report["added_checkpoint_specials"] = added_checkpoint_specials

	try:
	post_special_vocab_size = int(len(tokenizer))
	except Exception:
	post_special_vocab_size = current_vocab_size + added_checkpoint_specials
	report["tokenizer_vocab_size_after_specials"] = post_special_vocab_size
	report["padding_gap_before_placeholders"] = max(0, int(target_vocab_size - post_special_vocab_size))

	added_placeholders = 0
	if target_vocab_size > post_special_vocab_size:
	added_placeholders = _expand_tokenizer_placeholders(tokenizer, target_vocab_size)
	report["added_placeholders"] = added_placeholders
	try:
	report["tokenizer_vocab_size_after"] = int(len(tokenizer))
	except Exception:
	report["tokenizer_vocab_size_after"] = post_special_vocab_size + added_placeholders
	return report


	def _log_tokenizer_checkpoint_alignment(
	log_prefix: str,
	report: dict[str, int],
	*,
	print_fn=print,
	) -> None:
	added_specials = int(report.get("added_checkpoint_specials", 0) or 0)
	added_placeholders = int(report.get("added_placeholders", 0) or 0)
	if added_specials <= 0 and added_placeholders <= 0:
	return

	vocab_before = int(report.get("tokenizer_vocab_size_before", 0) or 0)
	vocab_after_specials = int(report.get("tokenizer_vocab_size_after_specials", vocab_before) or vocab_before)
	vocab_after = int(report.get("tokenizer_vocab_size_after", vocab_after_specials) or vocab_after_specials)
	missing_specials_before = int(report.get("checkpoint_specials_missing_before", 0) or 0)

	if added_specials > 0:
	print_fn(
	f"{log_prefix}[info] tokenizer vocab is smaller than checkpoint embeddings; "
	f"recovered {added_specials} checkpoint special tokens "
	f"({vocab_before} -> {vocab_after_specials})."
	)
	if added_placeholders > 0:
	if missing_specials_before > 0:
	print_fn(
	f"{log_prefix}[warn] checkpoint embeddings still exceed tokenizer vocab after recovering "
	f"checkpoint special tokens; added {added_placeholders} placeholder special tokens "
	f"to preserve id alignment ({vocab_after_specials} -> {vocab_after})."
	)
	else:
	print_fn(
	f"{log_prefix}[info] checkpoint embeddings include {added_placeholders} padded rows "
	f"beyond tokenizer vocab; added {added_placeholders} placeholder special tokens "
	f"to preserve id alignment ({vocab_after_specials} -> {vocab_after})."
	)


	def _log_config_checkpoint_vocab_alignment(
	log_prefix: str,
	*,
	config_vocab_size: int,
	checkpoint_vocab_size: int,
	tokenizer_vocab_size: int,
	alignment_report: Optional[dict[str, int]] = None,
	print_fn=print,
	) -> None:
	if checkpoint_vocab_size <= 0 or checkpoint_vocab_size == config_vocab_size:
	return

	final_tokenizer_vocab = int(tokenizer_vocab_size or 0)
	if alignment_report is not None:
	final_tokenizer_vocab = int(
	alignment_report.get("tokenizer_vocab_size_after", final_tokenizer_vocab) or final_tokenizer_vocab
	)
	missing_specials_before = 0
	if alignment_report is not None:
	missing_specials_before = int(alignment_report.get("checkpoint_specials_missing_before", 0) or 0)

	if final_tokenizer_vocab >= checkpoint_vocab_size and missing_specials_before <= 0:
	print_fn(
	f"{log_prefix}[info] config vocab_size ({config_vocab_size}) lags tokenizer/checkpoint embeddings "
	f"({checkpoint_vocab_size}); using checkpoint size."
	)
	return

	padding_rows = max(0, checkpoint_vocab_size - final_tokenizer_vocab)
	if padding_rows > 0 and missing_specials_before <= 0:
	print_fn(
	f"{log_prefix}[info] config vocab_size ({config_vocab_size}) lags checkpoint embeddings "
	f"({checkpoint_vocab_size}); using checkpoint size. tokenizer vocab is {final_tokenizer_vocab}, "
	f"so {padding_rows} rows are embedding padding/alignment."
	)
	return

	print_fn(
	f"{log_prefix}[warn] config vocab_size ({config_vocab_size}) does not match checkpoint embeddings "
	f"({checkpoint_vocab_size}); using checkpoint size."
	)


	def _expand_tokenizer_with_checkpoint_specials(
	    tokenizer,
	    checkpoint_repo_path: Path,
	    target_vocab_size: int,
	) -> tuple[int, int]:
	    """Align the tokenizer with the checkpoint vocab and summarise the result.

	    Thin wrapper around ``_align_tokenizer_with_checkpoint_vocab``.

	    Returns:
	        ``(added_checkpoint_specials, added_placeholders)`` taken from the
	        alignment report.
	    """
	    report = _align_tokenizer_with_checkpoint_vocab(
	        tokenizer, checkpoint_repo_path, target_vocab_size
	    )
	    added_specials = int(report.get("added_checkpoint_specials", 0) or 0)
	    added_placeholders = int(report.get("added_placeholders", 0) or 0)
	    return added_specials, added_placeholders


	def _resolve_modeling_path(repo_path: Path, user_path: Optional[str], model_family: str) -> Optional[str]:
	if user_path:
	cand = Path(user_path)
	if not cand.is_file():
	cand = repo_path / user_path
	return str(cand) if cand.is_file() else None
	tri_info = repo_path / "tri_info.txt"
	if tri_info.is_file():
	info = _read_kv_file(tri_info)
	tri_file = info.get("lopa_modeling_path") or ""
	if tri_file:
	cand = repo_path / tri_file
	if cand.is_file():
	return str(cand)
	default_name = {
	"llama": "tri_llama3_modeling.py",
	"qwen3": "tri_qwen3_modeling.py",
	"mistral": "tri_mistral_modeling.py",
	}.get(model_family, "tri_llama3_modeling.py")
	cand = repo_path / default_name
	if cand.is_file():
	return str(cand)
	return None


	def _attach_llopa_generate(model):
	try:
	import types
	def _llopa_generate(self, tokenizer, system: Optional[str] = None, document: Optional[str] = None,
	question: Optional[str] = None, **kwargs):
	if system is None:
	system = kwargs.pop("system", "")
	if document is None:
	document = kwargs.pop("document", "")
	if question is None:
	question = kwargs.pop("question", "")
	if "device" not in kwargs or kwargs.get("device") is None:
	try:
	dev = self.get_input_embeddings().weight.device
	except Exception:
	try:
	dev = next(self.parameters()).device
	except Exception:
	dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	kwargs["device"] = str(dev)
	return lopa_generate(self, tokenizer, system=system, document=document, question=question, **kwargs)
	model.llopa_generate = types.MethodType(_llopa_generate, model)
	except Exception:
	pass


	def _maybe_attach_llopa_generate(model):
	if not hasattr(model, "llopa_generate"):
	_attach_llopa_generate(model)


	def _attach_prefill_lower_generate(
	model,
	*,
	lower_k: int,
	prefill_attn: str = "causal",
	system_prefill: str = "no_bos_system",
	no_upper_attn: bool = False,
	) -> None:
	"""Store runtime prefill-lower settings on ``model`` and wrap its
	``prepare_inputs_for_generation`` so they are added to the model inputs.

	``lower_k`` is coerced to ``int`` (0 on failure), ``prefill_attn`` is
	normalized (``"prefix_full"`` becomes ``"full"``) and an unknown
	``system_prefill`` falls back to ``"no_bos_system"``. Nothing is attached
	when ``lower_k <= 0`` or the attention mode is not ``"causal"``/``"full"``.

	The settings are written to ``_runtime_prefill_lower_*`` attributes on every
	call; the wrapper itself is installed only once, guarded by
	``_runtime_prefill_generate_attached``. All failures are silent.
	"""
	try:
	import types
	except Exception:
	return

	# Normalize the requested settings.
	try:
	lower_k = int(lower_k)
	except Exception:
	lower_k = 0
	attn = (prefill_attn or "causal").strip().lower()
	if attn == "prefix_full":
	attn = "full"
	sys_prefill = (system_prefill or "no_bos_system").strip().lower()
	if sys_prefill not in {"full", "no_system", "no_bos_system"}:
	sys_prefill = "no_bos_system"
	if lower_k <= 0 or attn not in {"causal", "full"}:
	return

	# Defaults read back by the wrapper below when a call does not override them.
	try:
	setattr(model, "_runtime_prefill_lower_layers", int(lower_k))
	setattr(model, "_runtime_prefill_lower_attn", attn)
	setattr(model, "_runtime_prefill_lower_system_prefill", sys_prefill)
	setattr(model, "_runtime_prefill_lower_no_upper_attn", bool(no_upper_attn))
	except Exception:
	return

	# The wrapper is already installed; the refreshed attributes above suffice.
	if getattr(model, "_runtime_prefill_generate_attached", False):
	return

	# Captured (bound) original; the wrapper delegates to it first.
	orig_prepare = getattr(model, "prepare_inputs_for_generation", None)
	if orig_prepare is None:
	return

	def _runtime_prepare_inputs_for_generation(
	self,
	input_ids,
	past_key_values=None,
	attention_mask=None,
	inputs_embeds=None,
	cache_position=None,
	prefill_lower_layers=None,
	prefill_lower_attn=None,
	prefill_lower_system_prefill=None,
	prefill_lower_no_upper_attn=None,
	prefill_lower_split_start=None,
	prefill_lower_system_len=None,
	prefill_lower_replay_user_prefix_keep_len=None,
	prefill_lower_replay_user_start=None,
	prefill_lower_replay_user_len=None,
	assistant_header_start=None,
	assistant_header_starts=None,
	assistant_header_start_mask=None,
	**kwargs,
	):
	# The prefill_lower_* / assistant_header_* arguments are named parameters,
	# so they are not part of **kwargs and are not passed to the original.
	model_inputs = orig_prepare(
	input_ids,
	past_key_values=past_key_values,
	attention_mask=attention_mask,
	inputs_embeds=inputs_embeds,
	cache_position=cache_position,
	**kwargs,
	)
	# Per-call values win; otherwise fall back to the attributes stored on the model.
	if prefill_lower_layers is None:
	prefill_lower_layers = int(getattr(self, "_runtime_prefill_lower_layers", 0) or 0)
	if prefill_lower_attn is None:
	prefill_lower_attn = str(getattr(self, "_runtime_prefill_lower_attn", "causal") or "causal")
	if prefill_lower_system_prefill is None:
	prefill_lower_system_prefill = str(
	getattr(self, "_runtime_prefill_lower_system_prefill", "no_bos_system") or "no_bos_system"
	)
	if prefill_lower_no_upper_attn is None:
	prefill_lower_no_upper_attn = bool(
	getattr(self, "_runtime_prefill_lower_no_upper_attn", False)
	)
	# NOTE(review): indentation is lost in this view — confirm which of the
	# optional pass-through keys below are nested under this `> 0` check.
	if int(prefill_lower_layers or 0) > 0:
	model_inputs["prefill_lower_layers"] = int(prefill_lower_layers)
	model_inputs["prefill_lower_attn"] = str(prefill_lower_attn)
	model_inputs["prefill_lower_system_prefill"] = str(prefill_lower_system_prefill)
	if bool(prefill_lower_no_upper_attn):
	model_inputs["prefill_lower_no_upper_attn"] = True
	# Optional values are copied into model_inputs only when the caller supplied them.
	if prefill_lower_split_start is not None:
	model_inputs["prefill_lower_split_start"] = prefill_lower_split_start
	if prefill_lower_system_len is not None:
	model_inputs["prefill_lower_system_len"] = prefill_lower_system_len
	if prefill_lower_replay_user_prefix_keep_len is not None:
	model_inputs["prefill_lower_replay_user_prefix_keep_len"] = prefill_lower_replay_user_prefix_keep_len
	if prefill_lower_replay_user_start is not None:
	model_inputs["prefill_lower_replay_user_start"] = prefill_lower_replay_user_start
	if prefill_lower_replay_user_len is not None:
	model_inputs["prefill_lower_replay_user_len"] = prefill_lower_replay_user_len
	if assistant_header_start is not None:
	model_inputs["assistant_header_start"] = assistant_header_start
	if assistant_header_starts is not None:
	model_inputs["assistant_header_starts"] = assistant_header_starts
	if assistant_header_start_mask is not None:
	model_inputs["assistant_header_start_mask"] = assistant_header_start_mask
	return model_inputs

	# Install the wrapper as a bound method and mark the model as patched.
	try:
	model.prepare_inputs_for_generation = types.MethodType(
	_runtime_prepare_inputs_for_generation,
	model,
	)
	setattr(model, "_runtime_prefill_generate_attached", True)
	except Exception:
	pass


	def _attach_prefill_lower_freeze_generate(
	model,
	*,
	tokenizer=None,
	lower_k: int,
	prefill_attn: str = "causal",
	system_prefill: str = "no_bos_system",
	) -> None:
	try:
	import types
	except Exception:
	return

	try:
	lower_k = int(lower_k)
	except Exception:
	lower_k = 0
	attn = (prefill_attn or "causal").strip().lower()
	if attn == "prefix_full":
	attn = "full"
	sys_prefill = (system_prefill or "no_bos_system").strip().lower()
	if sys_prefill not in {"full", "no_system", "no_bos_system"}:
	sys_prefill = "no_bos_system"
	if lower_k <= 0 or attn not in {"causal", "full"}:
	return

	try:
	setattr(model, "_runtime_prefill_freeze_layers", int(lower_k))
	setattr(model, "_runtime_prefill_freeze_attn", attn)
	setattr(model, "_runtime_prefill_freeze_system_prefill", sys_prefill)
	setattr(model, "_runtime_structured_freeze_generate_default", True)
	except Exception:
	return

	if tokenizer is not None:
	_attach_structured_llopa_generate(model, tokenizer)


	def _attach_prefill_lower_solo_generate(
	model,
	*,
	tokenizer=None,
	lower_k: int,
	prefill_attn: str = "causal",
	system_prefill: str = "no_bos_system",
	) -> None:
	try:
	import types
	except Exception:
	return

	try:
	lower_k = int(lower_k)
	except Exception:
	lower_k = 0
	attn = (prefill_attn or "causal").strip().lower()
	if attn == "prefix_full":
	attn = "full"
	sys_prefill = (system_prefill or "no_bos_system").strip().lower()
	if sys_prefill not in {"full", "no_system", "no_bos_system"}:
	sys_prefill = "no_bos_system"
	if lower_k <= 0 or attn not in {"causal", "full"}:
	return

	try:
	setattr(model, "_runtime_prefill_solo_layers", int(lower_k))
	setattr(model, "_runtime_prefill_solo_attn", attn)
	setattr(model, "_runtime_prefill_solo_system_prefill", sys_prefill)
	setattr(model, "_runtime_structured_solo_generate_default", True)
	except Exception:
	return

	if tokenizer is not None:
	_attach_structured_llopa_generate(model, tokenizer)


	def _attach_prefill_lower_solo_v2_generate(
	model,
	*,
	tokenizer=None,
	lower_k: int,
	prefill_attn: str = "causal",
	system_prefill: str = "no_bos_system",
	with_bos: bool = False,
	) -> None:
	try:
	lower_k = int(lower_k)
	except Exception:
	lower_k = 0
	attn = (prefill_attn or "causal").strip().lower()
	if attn == "prefix_full":
	attn = "full"
	sys_prefill = (system_prefill or "no_bos_system").strip().lower()
	if sys_prefill not in {"full", "no_system", "no_bos_system"}:
	sys_prefill = "no_bos_system"
	if lower_k <= 0 or attn not in {"causal", "full"}:
	return

	try:
	setattr(model, "_runtime_prefill_solo_v2_layers", int(lower_k))
	setattr(model, "_runtime_prefill_solo_v2_attn", attn)
	setattr(model, "_runtime_prefill_solo_v2_system_prefill", sys_prefill)
	setattr(model, "_runtime_prefill_solo_v2_with_bos", bool(with_bos))
	setattr(model, "_runtime_structured_solo_v2_generate_default", True)
	except Exception:
	return

	if tokenizer is not None:
	_attach_structured_llopa_generate(model, tokenizer)


	# Single-sequence decode loop for the prefill-lower "freeze" runtime.
	#
	# The prompt is rebuilt from `prompt_messages` with
	# `_build_structured_prompt_segments`; `input_ids` is only used to validate
	# the batch shape (2-D, batch size 1) and to pick the device.  Every step
	# re-runs the model on prompt + generated tokens with `use_cache=False`.
	# `attention_mask`, `pad_token_id` and `use_cache` are accepted but not
	# referenced in this body.
	#
	# Returns None when `lower_k` is not a positive int, when `input_ids` has an
	# unsupported shape, or when the model output carries no logits tensor.
	# Otherwise returns the prompt+generated id tensor, or (when
	# `return_dict_in_generate` is truthy) a dict with "sequences" and "scores".
	@torch.inference_mode()
	def _direct_freeze_generate_impl(
	model,
	tokenizer,
	*,
	prompt_messages,
	prompt_add_generation_prompt: bool,
	input_ids: Optional[torch.LongTensor],
	attention_mask: Optional[torch.Tensor],
	lower_k: int,
	prefill_attn: str,
	system_prefill: str,
	max_length=None,
	max_new_tokens=None,
	min_length=None,
	min_new_tokens=None,
	do_sample=None,
	temperature=None,
	top_p=None,
	top_k=None,
	stopping_criteria=None,
	pad_token_id=None,
	eos_token_id=None,
	output_scores: bool = False,
	return_dict_in_generate: bool = False,
	use_cache: Optional[bool] = None,
	):
	# Unparseable or non-positive layer counts disable this path (caller gets None).
	try:
	lower_k = int(lower_k)
	except Exception:
	lower_k = 0
	if lower_k <= 0:
	return None

	# "prefix_full" is an alias of "full"; unknown modes fall back to "causal".
	attn = (prefill_attn or "causal").strip().lower()
	if attn == "prefix_full":
	attn = "full"
	if attn not in {"causal", "full"}:
	attn = "causal"

	# Unknown system-prefill modes fall back to "no_bos_system".
	sys_prefill = (system_prefill or "no_bos_system").strip().lower()
	if sys_prefill not in {"full", "no_system", "no_bos_system"}:
	sys_prefill = "no_bos_system"

	# Device comes from input_ids when given, else from the first model
	# parameter, else "cpu".
	device = None
	if isinstance(input_ids, torch.Tensor):
	if input_ids.dim() != 2 or input_ids.size(0) != 1:
	return None
	device = input_ids.device
	if device is None:
	try:
	device = next(model.parameters()).device
	except Exception:
	device = "cpu"

	# The helper's result is indexed by "prompt_ids", "system_ids" and "user_ids".
	segments = _build_structured_prompt_segments(
	tokenizer,
	prompt_messages,
	prompt_add_generation_prompt=bool(prompt_add_generation_prompt),
	device=device,
	)
	prompt_ids = segments["prompt_ids"]
	system_ids = segments["system_ids"]
	user_ids = segments["user_ids"]
	canonical_input_ids = prompt_ids
	# split_start is the combined system + user length along dim 1.
	split_start = int(system_ids.size(1) + user_ids.size(1))
	system_len = int(system_ids.size(1))
	total_prompt_len = int(canonical_input_ids.size(1))

	# Token budget: explicit max_new_tokens wins, then max_length minus the
	# prompt length (clamped at 0), then a default of 256.
	if max_new_tokens is None:
	if max_length is None:
	max_new_tokens = 256
	else:
	max_new_tokens = max(0, int(max_length) - total_prompt_len)
	else:
	max_new_tokens = int(max_new_tokens)
	# Same precedence for the minimum, with a default of 0.
	if min_new_tokens is None:
	if min_length is None:
	min_new_tokens = 0
	else:
	min_new_tokens = max(0, int(min_length) - total_prompt_len)
	else:
	min_new_tokens = int(min_new_tokens)

	# When do_sample is not given, sampling is enabled iff temperature is non-zero.
	raw_temp = 0.0 if temperature is None else float(temperature)
	if do_sample is None:
	do_sample = bool(raw_temp != 0.0)
	do_sample = bool(do_sample)
	# A zero temperature (or greedy decoding) is replaced by 1.0 for the warpers.
	sample_temp = 1.0 if (not do_sample or raw_temp == 0.0) else float(raw_temp)
	top_p = 1.0 if top_p is None else float(top_p)
	top_k = None if top_k is None else int(top_k)

	# Stop ids come from eos_token_id, falling back to the tokenizer's eos id.
	stop_ids = set(_normalize_eos_token_ids(eos_token_id))
	if not stop_ids:
	tok_eos = getattr(tokenizer, "eos_token_id", None)
	if tok_eos is not None:
	stop_ids.add(int(tok_eos))
	# Best-effort lookup of an extra end-of-turn stop token; any tokenizer
	# error is ignored.
	# NOTE(review): if the literal really contains backslashes before the
	# pipes it will not match a `<|eot_id|>` vocabulary entry -- confirm the
	# intended token string.
	with contextlib.suppress(Exception):
	eot_id = tokenizer.convert_tokens_to_ids("<\|eot_id\|>")
	if eot_id is not None and eot_id != tokenizer.unk_token_id:
	stop_ids.add(int(eot_id))
	logits_warpers = _build_sampling_warpers(do_sample, sample_temp, top_p, top_k)

	# Pre-allocated buffer for generated ids; only the first `cur` columns are valid.
	generated = torch.empty((1, int(max_new_tokens)), dtype=torch.long, device=canonical_input_ids.device)
	score_list: list[torch.Tensor] = []
	cur = 0

	while cur < int(max_new_tokens):
	# Full re-forward over prompt + generated tokens each step (no KV cache).
	current_ids = torch.cat([canonical_input_ids, generated[:, :cur]], dim=1)
	current_attn = torch.ones_like(current_ids, device=current_ids.device)
	out = model(
	input_ids=current_ids,
	attention_mask=current_attn,
	use_cache=False,
	logits_to_keep=1,
	prefill_lower_layers=int(lower_k),
	prefill_lower_attn=str(attn),
	prefill_lower_freeze_runtime=True,
	prefill_lower_split_start=int(split_start),
	prefill_lower_system_len=int(system_len),
	prefill_lower_system_prefill=str(sys_prefill),
	)
	if out is None or not isinstance(getattr(out, "logits", None), torch.Tensor):
	return None
	logits = out.logits[:, -1, :].to(torch.float32)

	# Mask stop tokens until min_new_tokens have been produced.
	if stop_ids and cur < int(min_new_tokens):
	for sid in stop_ids:
	logits[:, sid] = -float("inf")
	# Warpers are given only the generated tokens, not the prompt.
	if logits_warpers is not None:
	logits = logits_warpers(generated[:, :cur], logits)
	# Recorded scores are the logits after stop masking and warping.
	if bool(output_scores):
	score_list.append(logits.detach().clone())

	if do_sample:
	probs = torch.softmax(logits, dim=-1)
	next_tok = torch.multinomial(probs, num_samples=1)
	else:
	next_tok = torch.argmax(logits, dim=-1, keepdim=True)

	generated[:, cur : cur + 1] = next_tok
	cur += 1

	sequences_now = torch.cat([canonical_input_ids, generated[:, :cur]], dim=1)
	should_stop = False
	tok_id = int(next_tok.item())
	if tok_id in stop_ids and cur >= int(min_new_tokens):
	should_stop = True
	# User stopping criteria are consulted only when no stop token fired;
	# on TypeError the call is retried with None in place of the logits.
	if (not should_stop) and stopping_criteria is not None:
	try:
	should_stop = bool(stopping_criteria(sequences_now, logits))
	except TypeError:
	should_stop = bool(stopping_criteria(sequences_now, None))
	if should_stop:
	break

	# The stop token (if any) is part of the returned sequence.
	sequences = torch.cat([canonical_input_ids, generated[:, :cur]], dim=1)
	if not bool(return_dict_in_generate):
	return sequences
	return {
	"sequences": sequences,
	"scores": tuple(score_list) if bool(output_scores) else tuple(),
	}


	# Single-sequence decode loop for the prefill-lower "solo" attention runtime.
	#
	# The prompt is rebuilt from `prompt_messages` with
	# `_build_structured_prompt_segments`; `input_ids` is only used to validate
	# the batch shape (2-D, batch size 1) and to pick the device.  Every step
	# re-runs the model on prompt + generated tokens with `use_cache=False`.
	# `solo_v2` selects which of the two solo-attention flags is passed to the
	# model; `with_bos` only takes effect together with `solo_v2`.
	# `attention_mask`, `pad_token_id` and `use_cache` are accepted but not
	# referenced in this body.
	#
	# Returns None when `lower_k` is not a positive int, when `input_ids` has an
	# unsupported shape, or when the model output carries no logits tensor.
	# Otherwise returns the prompt+generated id tensor, or (when
	# `return_dict_in_generate` is truthy) a dict with "sequences" and "scores".
	@torch.inference_mode()
	def _direct_solo_generate_impl(
	model,
	tokenizer,
	*,
	prompt_messages,
	prompt_add_generation_prompt: bool,
	input_ids: Optional[torch.LongTensor],
	attention_mask: Optional[torch.Tensor],
	lower_k: int,
	prefill_attn: str,
	system_prefill: str,
	max_length=None,
	max_new_tokens=None,
	min_length=None,
	min_new_tokens=None,
	do_sample=None,
	temperature=None,
	top_p=None,
	top_k=None,
	stopping_criteria=None,
	pad_token_id=None,
	eos_token_id=None,
	output_scores: bool = False,
	return_dict_in_generate: bool = False,
	use_cache: Optional[bool] = None,
	solo_v2: bool = False,
	with_bos: bool = False,
	):
	# Unparseable or non-positive layer counts disable this path (caller gets None).
	try:
	lower_k = int(lower_k)
	except Exception:
	lower_k = 0
	if lower_k <= 0:
	return None

	# "prefix_full" is an alias of "full"; unknown modes fall back to "causal".
	attn = (prefill_attn or "causal").strip().lower()
	if attn == "prefix_full":
	attn = "full"
	if attn not in {"causal", "full"}:
	attn = "causal"

	# Unknown system-prefill modes fall back to "no_bos_system".
	sys_prefill = (system_prefill or "no_bos_system").strip().lower()
	if sys_prefill not in {"full", "no_system", "no_bos_system"}:
	sys_prefill = "no_bos_system"

	# Device comes from input_ids when given, else from the first model
	# parameter, else "cpu".
	device = None
	if isinstance(input_ids, torch.Tensor):
	if input_ids.dim() != 2 or input_ids.size(0) != 1:
	return None
	device = input_ids.device
	if device is None:
	try:
	device = next(model.parameters()).device
	except Exception:
	device = "cpu"

	# The helper's result is indexed by "prompt_ids", "system_ids" and "user_ids".
	segments = _build_structured_prompt_segments(
	tokenizer,
	prompt_messages,
	prompt_add_generation_prompt=bool(prompt_add_generation_prompt),
	device=device,
	)
	prompt_ids = segments["prompt_ids"]
	system_ids = segments["system_ids"]
	user_ids = segments["user_ids"]
	canonical_input_ids = prompt_ids
	# split_start is the combined system + user length along dim 1.
	split_start = int(system_ids.size(1) + user_ids.size(1))
	system_len = int(system_ids.size(1))
	total_prompt_len = int(canonical_input_ids.size(1))

	# Token budget: explicit max_new_tokens wins, then max_length minus the
	# prompt length (clamped at 0), then a default of 256.
	if max_new_tokens is None:
	if max_length is None:
	max_new_tokens = 256
	else:
	max_new_tokens = max(0, int(max_length) - total_prompt_len)
	else:
	max_new_tokens = int(max_new_tokens)
	# Same precedence for the minimum, with a default of 0.
	if min_new_tokens is None:
	if min_length is None:
	min_new_tokens = 0
	else:
	min_new_tokens = max(0, int(min_length) - total_prompt_len)
	else:
	min_new_tokens = int(min_new_tokens)

	# When do_sample is not given, sampling is enabled iff temperature is non-zero.
	raw_temp = 0.0 if temperature is None else float(temperature)
	if do_sample is None:
	do_sample = bool(raw_temp != 0.0)
	do_sample = bool(do_sample)
	# A zero temperature (or greedy decoding) is replaced by 1.0 for the warpers.
	sample_temp = 1.0 if (not do_sample or raw_temp == 0.0) else float(raw_temp)
	top_p = 1.0 if top_p is None else float(top_p)
	top_k = None if top_k is None else int(top_k)

	# Stop ids come from eos_token_id, falling back to the tokenizer's eos id.
	stop_ids = set(_normalize_eos_token_ids(eos_token_id))
	if not stop_ids:
	tok_eos = getattr(tokenizer, "eos_token_id", None)
	if tok_eos is not None:
	stop_ids.add(int(tok_eos))
	# Best-effort lookup of an extra end-of-turn stop token; any tokenizer
	# error is ignored.
	# NOTE(review): if the literal really contains backslashes before the
	# pipes it will not match a `<|eot_id|>` vocabulary entry -- confirm the
	# intended token string.
	with contextlib.suppress(Exception):
	eot_id = tokenizer.convert_tokens_to_ids("<\|eot_id\|>")
	if eot_id is not None and eot_id != tokenizer.unk_token_id:
	stop_ids.add(int(eot_id))
	logits_warpers = _build_sampling_warpers(do_sample, sample_temp, top_p, top_k)

	# Pre-allocated buffer for generated ids; only the first `cur` columns are valid.
	generated = torch.empty((1, int(max_new_tokens)), dtype=torch.long, device=canonical_input_ids.device)
	score_list: list[torch.Tensor] = []
	cur = 0

	while cur < int(max_new_tokens):
	# Full re-forward over prompt + generated tokens each step (no KV cache).
	current_ids = torch.cat([canonical_input_ids, generated[:, :cur]], dim=1)
	current_attn = torch.ones_like(current_ids, device=current_ids.device)
	out = model(
	input_ids=current_ids,
	attention_mask=current_attn,
	use_cache=False,
	logits_to_keep=1,
	prefill_lower_layers=int(lower_k),
	prefill_lower_attn=str(attn),
	prefill_lower_split_start=int(split_start),
	prefill_lower_system_len=int(system_len),
	prefill_lower_system_prefill=str(sys_prefill),
	# Exactly one of the two solo flags is True; the BOS variant needs solo_v2.
	prefill_lower_solo_attention=bool(not solo_v2),
	prefill_lower_solo_attention_v2=bool(solo_v2),
	prefill_lower_solo_attention_v2_with_bos=bool(solo_v2 and with_bos),
	)
	if out is None or not isinstance(getattr(out, "logits", None), torch.Tensor):
	return None
	logits = out.logits[:, -1, :].to(torch.float32)

	# Mask stop tokens until min_new_tokens have been produced.
	if stop_ids and cur < int(min_new_tokens):
	for sid in stop_ids:
	logits[:, sid] = -float("inf")
	# Warpers are given only the generated tokens, not the prompt.
	if logits_warpers is not None:
	logits = logits_warpers(generated[:, :cur], logits)
	# Recorded scores are the logits after stop masking and warping.
	if bool(output_scores):
	score_list.append(logits.detach().clone())

	if do_sample:
	probs = torch.softmax(logits, dim=-1)
	next_tok = torch.multinomial(probs, num_samples=1)
	else:
	next_tok = torch.argmax(logits, dim=-1, keepdim=True)

	generated[:, cur : cur + 1] = next_tok
	cur += 1

	sequences_now = torch.cat([canonical_input_ids, generated[:, :cur]], dim=1)
	should_stop = False
	tok_id = int(next_tok.item())
	if tok_id in stop_ids and cur >= int(min_new_tokens):
	should_stop = True
	# User stopping criteria are consulted only when no stop token fired;
	# on TypeError the call is retried with None in place of the logits.
	if (not should_stop) and stopping_criteria is not None:
	try:
	should_stop = bool(stopping_criteria(sequences_now, logits))
	except TypeError:
	should_stop = bool(stopping_criteria(sequences_now, None))
	if should_stop:
	break

	# The stop token (if any) is part of the returned sequence.
	sequences = torch.cat([canonical_input_ids, generated[:, :cur]], dim=1)
	if not bool(return_dict_in_generate):
	return sequences
	return {
	"sequences": sequences,
	"scores": tuple(score_list) if bool(output_scores) else tuple(),
	}


	def _attach_runtime_llopa_generate(
	model,
	*,
	header_ids: torch.Tensor,
	lower_k: int,
	prefill_attn: str = "causal",
	no_upper_attn: bool = False,
	) -> None:
	try:
	import types
	except Exception:
	return

	try:
	lower_k = int(lower_k)
	except Exception:
	lower_k = 0
	attn = (prefill_attn or "causal").strip().lower()
	if attn == "prefix_full":
	attn = "full"
	if (
	lower_k <= 0
	or attn not in {"causal", "full"}
	or not isinstance(header_ids, torch.Tensor)
	or header_ids.numel() == 0
	):
	return

	try:
	setattr(model, "_runtime_llopa_header_ids", header_ids.detach().to(device="cpu", dtype=torch.long))
	setattr(model, "_runtime_llopa_layers", int(lower_k))
	setattr(model, "_runtime_llopa_attn", attn)
	setattr(model, "_runtime_llopa_no_upper_attn", bool(no_upper_attn))
	except Exception:
	return

	if getattr(model, "_runtime_llopa_generate_attached", False):
	return

	orig_prepare = getattr(model, "prepare_inputs_for_generation", None)
	if orig_prepare is None:
	return

	def _runtime_prepare_inputs_for_generation(
	self,
	input_ids,
	past_key_values=None,
	attention_mask=None,
	inputs_embeds=None,
	cache_position=None,
	runtime_llopa_prefill=None,
	runtime_llopa_layers=None,
	runtime_llopa_attn=None,
	runtime_llopa_no_upper_attn=None,
	**kwargs,
	):
	model_inputs = orig_prepare(
	input_ids,
	past_key_values=past_key_values,
	attention_mask=attention_mask,
	inputs_embeds=inputs_embeds,
	cache_position=cache_position,
	**kwargs,
	)
	if runtime_llopa_prefill is None:
	runtime_llopa_prefill = True
	if runtime_llopa_layers is None:
	runtime_llopa_layers = int(getattr(self, "_runtime_llopa_layers", 0) or 0)
	if runtime_llopa_attn is None:
	runtime_llopa_attn = str(getattr(self, "_runtime_llopa_attn", "causal") or "causal")
	if runtime_llopa_no_upper_attn is None:
	runtime_llopa_no_upper_attn = bool(getattr(self, "_runtime_llopa_no_upper_attn", False))
	if bool(runtime_llopa_prefill) and int(runtime_llopa_layers or 0) > 0:
	model_inputs["runtime_llopa_prefill"] = True
	model_inputs["runtime_llopa_layers"] = int(runtime_llopa_layers)
	model_inputs["runtime_llopa_attn"] = str(runtime_llopa_attn)
	if bool(runtime_llopa_no_upper_attn):
	model_inputs["runtime_llopa_no_upper_attn"] = True
	return model_inputs

	try:
	model.prepare_inputs_for_generation = types.MethodType(
	_runtime_prepare_inputs_for_generation,
	model,
	)
	setattr(model, "_runtime_llopa_generate_attached", True)
	except Exception:
	pass


	@torch.inference_mode()
	def _runtime_llopa_fast_generate_mode(
	    model,
	    input_ids: torch.LongTensor,
	    logits_processor,
	    stopping_criteria,
	    generation_config,
	    synced_gpus: bool = False,
	    streamer=None,
	    **model_kwargs,
	):
	    """Greedy / multinomial decode loop driven by the model's generation hooks.

	    Each step calls ``model.prepare_inputs_for_generation``, runs the model,
	    updates the generation kwargs with
	    ``model._update_model_kwargs_for_generation`` and picks the next token
	    from the processed last-position logits (sampled when
	    ``generation_config.do_sample`` is truthy, arg-max otherwise).

	    Returns:
	        ``None`` for encoder-decoder models.  Otherwise the extended
	        ``input_ids`` tensor, or a ``GenerateDecoderOnlyOutput`` when
	        ``generation_config.return_dict_in_generate`` is truthy.
	    """
	    from transformers.generation.utils import GenerateDecoderOnlyOutput

	    if model.config.is_encoder_decoder:
	        return None

	    cfg = generation_config
	    pad_fill = cfg._pad_token_tensor
	    want_attentions = cfg.output_attentions
	    want_hidden = cfg.output_hidden_states
	    want_scores = cfg.output_scores
	    want_raw_logits = cfg.output_logits
	    as_dict = cfg.return_dict_in_generate
	    eos_aware = any(hasattr(criterion, "eos_token_id") for criterion in stopping_criteria)
	    sampling = cfg.do_sample

	    def _collector(enabled):
	        # A tuple accumulator when the field was requested, otherwise None.
	        return () if (as_dict and enabled) else None

	    scores = _collector(want_scores)
	    raw_logits = _collector(want_raw_logits)
	    attention_trace = _collector(want_attentions)
	    hidden_trace = _collector(want_hidden)

	    batch_size, cur_len = input_ids.shape[:2]
	    peer_done = False
	    alive = torch.ones(batch_size, dtype=torch.long, device=input_ids.device)
	    model_kwargs = model._get_initial_cache_position(cur_len, input_ids.device, model_kwargs)

	    while model._has_unfinished_sequences(peer_done, synced_gpus, device=input_ids.device):
	        step_inputs = model.prepare_inputs_for_generation(input_ids, **model_kwargs)
	        outputs = model(**step_inputs, return_dict=True)

	        model_kwargs = model._update_model_kwargs_for_generation(
	            outputs,
	            model_kwargs,
	            is_encoder_decoder=False,
	        )
	        # A finished peer still runs the forward pass but skips token selection.
	        if synced_gpus and peer_done:
	            continue

	        step_logits = outputs.logits[:, -1, :].to(dtype=torch.float32)
	        logits_snapshot = step_logits.clone() if (as_dict and want_raw_logits) else None
	        step_scores = logits_processor(input_ids, step_logits)

	        if as_dict:
	            if want_scores:
	                scores += (step_scores,)
	            if want_raw_logits:
	                raw_logits += (logits_snapshot,)
	            if want_attentions:
	                attention_trace += (outputs.attentions,)
	            if want_hidden:
	                hidden_trace += (outputs.hidden_states,)

	        if sampling:
	            step_probs = nn.functional.softmax(step_scores, dim=-1)
	            next_tokens = torch.multinomial(step_probs, num_samples=1).squeeze(1)
	        else:
	            next_tokens = torch.argmax(step_scores, dim=-1)

	        # Finished rows keep emitting the pad token.
	        if eos_aware:
	            next_tokens = next_tokens * alive + pad_fill * (1 - alive)

	        input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
	        if streamer is not None:
	            streamer.put(next_tokens.cpu())

	        alive = alive & ~stopping_criteria(input_ids, scores)
	        peer_done = alive.max() == 0
	        cur_len += 1

	        del outputs

	    if streamer is not None:
	        streamer.end()

	    if not as_dict:
	        return input_ids
	    return GenerateDecoderOnlyOutput(
	        sequences=input_ids,
	        scores=scores,
	        logits=raw_logits,
	        attentions=attention_trace,
	        hidden_states=hidden_trace,
	        past_key_values=model_kwargs.get("past_key_values"),
	    )


	# Single-sequence decode loop that continues from an existing prefill.
	#
	# The first token is chosen from `initial_logits` without calling the model;
	# every later step goes through `model.prepare_inputs_for_generation`, the
	# model forward and `model._update_model_kwargs_for_generation`, always
	# passing the `llopa_v2_decode*` kwargs built below.
	#
	# Returns None when `canonical_input_ids` is not a 2-D tensor with batch
	# size 1 or `initial_logits` is not a tensor.  With `max_new_tokens <= 0` it
	# returns `canonical_input_ids` unchanged, or None when
	# `return_dict_in_generate` is truthy.  Otherwise it returns the
	# prompt+generated id tensor, or a `GenerateDecoderOnlyOutput` when
	# `return_dict_in_generate` is truthy.
	@torch.inference_mode()
	def _llopa_v2_generation_mixin_decode_loop(
	model,
	*,
	canonical_input_ids: torch.LongTensor,
	attention_mask: Optional[torch.Tensor],
	past_key_values,
	initial_logits: torch.Tensor,
	lower_k: int,
	no_upper_attn: bool,
	replay_module: str,
	replay_per_layers: int,
	max_new_tokens: int,
	min_new_tokens: int,
	do_sample: bool,
	logits_warpers,
	stop_ids: set[int],
	stop_token_ids: Optional[torch.Tensor],
	stopping_criteria,
	output_scores: bool,
	compact_scores: bool,
	return_dict_in_generate: bool,
	):
	if not isinstance(canonical_input_ids, torch.Tensor) or canonical_input_ids.dim() != 2:
	return None
	if canonical_input_ids.size(0) != 1:
	return None
	if not isinstance(initial_logits, torch.Tensor):
	return None
	if int(max_new_tokens) <= 0:
	return canonical_input_ids if not bool(return_dict_in_generate) else None

	from transformers.generation.utils import GenerateDecoderOnlyOutput

	device = canonical_input_ids.device
	total_prompt_len = int(canonical_input_ids.size(1))
	max_new_tokens = int(max_new_tokens)
	min_new_tokens = int(min_new_tokens)
	# One buffer holds prompt + up to max_new_tokens generated ids; `generated`
	# is a slice of it, so writes to `generated` land in `sequences`.
	sequences = torch.empty(
	(canonical_input_ids.size(0), total_prompt_len + max_new_tokens),
	dtype=canonical_input_ids.dtype,
	device=device,
	)
	sequences[:, :total_prompt_len] = canonical_input_ids
	generated = sequences[:, total_prompt_len:]

	# Use the caller's 2-D mask when its batch size matches (keeping only the
	# last total_prompt_len columns if the length differs); otherwise all ones.
	if isinstance(attention_mask, torch.Tensor) and attention_mask.dim() == 2:
	prompt_attention_mask = attention_mask.to(device=device)
	if prompt_attention_mask.size(0) != canonical_input_ids.size(0):
	prompt_attention_mask = torch.ones_like(canonical_input_ids, dtype=torch.long, device=device)
	elif prompt_attention_mask.size(1) != total_prompt_len:
	prompt_attention_mask = prompt_attention_mask[:, -total_prompt_len:]
	else:
	prompt_attention_mask = torch.ones_like(canonical_input_ids, dtype=torch.long, device=device)

	# Keyword arguments threaded through prepare_inputs_for_generation each step.
	model_kwargs = {
	"attention_mask": prompt_attention_mask,
	"past_key_values": past_key_values,
	"use_cache": True,
	"llopa_v2_decode": True,
	"llopa_v2_decode_layers": int(lower_k),
	"llopa_v2_decode_no_upper_attn": bool(no_upper_attn),
	"llopa_v2_decode_replay_module": str(replay_module),
	"llopa_v2_decode_replay_per_layers": int(replay_per_layers),
	}

	# Compact mode keeps only the chosen token's log-probability per step
	# instead of the full score row.
	record_scores = bool(output_scores)
	record_compact_scores = record_scores and bool(compact_scores)
	score_list: list[torch.Tensor] = []
	score_list_append = score_list.append
	compact_logprob_list: list[torch.Tensor] = []
	compact_logprob_append = compact_logprob_list.append
	cur = 0
	pending_logits = initial_logits
	should_apply_stopping_criteria = stopping_criteria is not None

	while cur < max_new_tokens:
	outputs = None
	# pending_logits is set only before the first step, so the first token
	# is drawn from the prefill logits without a model call.
	used_prefill_logits = pending_logits is not None
	if used_prefill_logits:
	logits = pending_logits.to(dtype=torch.float32, device=device, copy=True)
	pending_logits = None
	else:
	current_input_ids = sequences[:, : total_prompt_len + cur]
	# Seed cache_position with the index of the newest token when the
	# kwargs do not already carry a tensor for it.
	if not isinstance(model_kwargs.get("cache_position"), torch.Tensor):
	model_kwargs["cache_position"] = torch.arange(
	total_prompt_len + cur - 1,
	total_prompt_len + cur,
	device=device,
	dtype=torch.long,
	)
	model_inputs = model.prepare_inputs_for_generation(current_input_ids, **model_kwargs)
	outputs = model(**model_inputs, return_dict=True)
	model_kwargs = model._update_model_kwargs_for_generation(
	outputs,
	model_kwargs,
	is_encoder_decoder=False,
	)
	logits = outputs.logits[:, -1, :].to(dtype=torch.float32, device=device, copy=True)

	# Mask stop tokens until min_new_tokens have been produced.
	if stop_token_ids is not None and cur < min_new_tokens:
	logits.index_fill_(1, stop_token_ids, -float("inf"))
	# Warpers are given only the generated tokens, not the prompt.
	if logits_warpers is not None:
	logits = logits_warpers(generated[:, :cur], logits)
	if record_scores and not record_compact_scores:
	score_list_append(logits.detach().clone())

	if do_sample:
	probs = torch.softmax(logits, dim=-1)
	next_tok = torch.multinomial(probs, num_samples=1)
	else:
	next_tok = torch.argmax(logits, dim=-1, keepdim=True)
	# Log-probability of the chosen token: its logit minus logsumexp of the row.
	if record_compact_scores:
	next_logit = torch.gather(logits, 1, next_tok)
	next_logprob = next_logit - torch.logsumexp(logits, dim=-1, keepdim=True)
	compact_logprob_append(next_logprob.squeeze(-1).detach())

	generated[:, cur : cur + 1] = next_tok
	cur += 1

	# After the prefill-logits step no model call updated the kwargs, so the
	# mask is extended by one column and cache_position is set to the first
	# generated position by hand.
	if used_prefill_logits:
	one_mask = torch.ones(
	(prompt_attention_mask.size(0), 1),
	dtype=prompt_attention_mask.dtype,
	device=device,
	)
	model_kwargs["attention_mask"] = torch.cat([model_kwargs["attention_mask"], one_mask], dim=-1)
	model_kwargs["cache_position"] = torch.arange(
	total_prompt_len,
	total_prompt_len + 1,
	device=device,
	dtype=torch.long,
	)

	should_stop = False
	tok_id = int(next_tok.item())
	if tok_id in stop_ids and cur >= min_new_tokens:
	should_stop = True
	# User stopping criteria are consulted only when no stop token fired;
	# on TypeError the call is retried with None in place of the logits.
	if (not should_stop) and should_apply_stopping_criteria:
	sequences_now = sequences[:, : total_prompt_len + cur]
	try:
	should_stop = bool(stopping_criteria(sequences_now, logits))
	except TypeError:
	should_stop = bool(stopping_criteria(sequences_now, None))
	if outputs is not None:
	del outputs
	if should_stop:
	break

	# Trim the unused tail of the pre-allocated buffer.
	sequences = sequences[:, : total_prompt_len + cur]
	if not bool(return_dict_in_generate):
	return sequences

	output = GenerateDecoderOnlyOutput(
	sequences=sequences,
	scores=tuple(score_list) if record_scores and not record_compact_scores else None,
	past_key_values=model_kwargs.get("past_key_values", past_key_values),
	)
	# In compact mode the per-token log-probs are attached as an extra
	# attribute; an empty float32 tensor is used when nothing was recorded.
	if record_compact_scores:
	if compact_logprob_list:
	compact_logprobs = torch.stack(compact_logprob_list, dim=1)
	else:
	compact_logprobs = torch.empty(
	(sequences.size(0), 0),
	dtype=torch.float32,
	device=sequences.device,
	)
	setattr(output, "generated_token_logprobs", compact_logprobs)
	return output


	def _attach_runtime_llopa_fast_generate(
	model,
	*,
	lower_k: int,
	prefill_attn: str = "causal",
	no_upper_attn: bool = False,
	) -> None:
	try:
	import types
	except Exception:
	return

	try:
	lower_k = int(lower_k)
	except Exception:
	lower_k = 0
	attn = (prefill_attn or "causal").strip().lower()
	if attn == "prefix_full":
	attn = "full"
	if lower_k <= 0 or attn not in {"causal", "full"}:
	return

	try:
	setattr(model, "_runtime_llopa_fast_layers", int(lower_k))
	setattr(model, "_runtime_llopa_fast_attn", attn)
	setattr(model, "_runtime_llopa_fast_no_upper_attn", bool(no_upper_attn))
	except Exception:
	return

	if getattr(model, "_runtime_llopa_fast_generate_attached", False):
	return

	orig_generate = getattr(model, "generate", None)
	if not callable(orig_generate):
	return

	def _runtime_llopa_fast_generate(self, args, *kwargs):
	if bool(getattr(self.config, "is_encoder_decoder", False)):
	return orig_generate(args, *kwargs)

	if kwargs.get("custom_generate") is not None or kwargs.get("assistant_model") is not None:
	return orig_generate(args, *kwargs)

	if kwargs.get("inputs_embeds") is not None:
	return orig_generate(args, *kwargs)

	num_beams = kwargs.get("num_beams", None)
	if num_beams is None:
	gen_cfg = kwargs.get("generation_config")
	num_beams = getattr(gen_cfg, "num_beams", 1) if gen_cfg is not None else 1
	if int(num_beams or 1) != 1:
	_warn_once(
	self,
	"_warned_runtime_llopa_fast_num_beams",
	"[load_llopa_model][warn] runtime_llopa_fast_generate currently supports only num_beams=1; falling back to model.generate().",
	)
	return orig_generate(args, *kwargs)

	fast_enabled = kwargs.pop("runtime_llopa_fast_generate", None)
	if fast_enabled is None:
	fast_enabled = True
	if not bool(fast_enabled):
	return orig_generate(args, *kwargs)

	runtime_enabled = kwargs.get("runtime_llopa_prefill", None)
	if runtime_enabled is None:
	runtime_enabled = True
	if not bool(runtime_enabled):
	return orig_generate(args, *kwargs)

	if kwargs.get("runtime_llopa_layers") is None:
	kwargs["runtime_llopa_layers"] = int(getattr(self, "_runtime_llopa_fast_layers", 0) or 0)
	if kwargs.get("runtime_llopa_attn") is None:
	kwargs["runtime_llopa_attn"] = str(getattr(self, "_runtime_llopa_fast_attn", "causal") or "causal")
	if kwargs.get("runtime_llopa_no_upper_attn") is None:
	kwargs["runtime_llopa_no_upper_attn"] = bool(
	getattr(self, "_runtime_llopa_fast_no_upper_attn", False)
	)
	kwargs["runtime_llopa_prefill"] = True

	return orig_generate(
	*args,
	custom_generate=_runtime_llopa_fast_generate_mode,
	**kwargs,
	)

	try:
	model.generate = types.MethodType(_runtime_llopa_fast_generate, model)
	setattr(model, "_runtime_llopa_fast_generate_attached", True)
	except Exception:
	pass


	def _supports_prefill_lower_runtime(model) -> bool:
	try:
	inner = _get_inner_model(model)
	except Exception:
	inner = None
	return bool(
	hasattr(model, "tri_vanilla_prefill_decode_forward")
	and inner is not None
	and hasattr(inner, "tri_prefill_lower_cache")
	)


	def _supports_runtime_llopa_prompt_prefill(model) -> bool:
	try:
	inner = _get_inner_model(model)
	except Exception:
	inner = None
	return bool(
	hasattr(model, "tri_runtime_llopa_prompt_prefill_forward")
	and _get_llopa_decode_step(model) is not None
	and inner is not None
	and hasattr(inner, "llopa_prefill_cache")
	)


	def _supports_direct_llopa_generate(model) -> bool:
	    """Report whether ``model`` can run the direct LLoPA generation path.

	    True only when ``_get_llopa_decode_step(model)`` is not ``None`` and the
	    inner model returned by ``_get_inner_model`` has ``llopa_prefill_cache``.
	    A failing inner-model lookup counts as "not supported".
	    """
	    inner = None
	    with contextlib.suppress(Exception):
	        inner = _get_inner_model(model)

	    # Checks run in the same short-circuit order as a single `and` chain.
	    if _get_llopa_decode_step(model) is None:
	        return False
	    if inner is None:
	        return False
	    return hasattr(inner, "llopa_prefill_cache")


	def _warn_once(model, flag: str, msg: str) -> None:
	if getattr(model, flag, False):
	return
	try:
	setattr(model, flag, True)
	except Exception:
	pass
	print(msg)


	def _normalize_eos_token_ids(eos_token_id) -> list[int]:
	if eos_token_id is None:
	return []
	if isinstance(eos_token_id, torch.Tensor):
	eos_token_id = eos_token_id.flatten().tolist()
	if isinstance(eos_token_id, (list, tuple, set)):
	out: list[int] = []
	for item in eos_token_id:
	try:
	out.append(int(item))
	except Exception:
	continue
	return out
	try:
	return [int(eos_token_id)]
	except Exception:
	return []


	def _prepare_stop_token_tensor(stop_ids, device) -> Optional[torch.LongTensor]:
	if not stop_ids:
	return None
	try:
	ordered = [int(tok_id) for tok_id in stop_ids]
	except Exception:
	return None
	if not ordered:
	return None
	return torch.tensor(ordered, device=device, dtype=torch.long)


	def _find_last_subsequence_start(input_ids: torch.Tensor, pattern: torch.Tensor) -> Optional[int]:
	if not isinstance(input_ids, torch.Tensor) or not isinstance(pattern, torch.Tensor):
	return None
	if input_ids.dim() == 2:
	if input_ids.size(0) != 1:
	return None
	seq = input_ids[0]
	elif input_ids.dim() == 1:
	seq = input_ids
	else:
	return None
	if pattern.dim() == 2:
	if pattern.size(0) != 1:
	return None
	pat = pattern[0]
	elif pattern.dim() == 1:
	pat = pattern
	else:
	return None

	pat_len = int(pat.numel())
	seq_len = int(seq.numel())
	if pat_len <= 0 or seq_len < pat_len:
	return None

	windows = seq.unfold(0, pat_len, 1)
	matches = (windows == pat).all(dim=-1).nonzero(as_tuple=False)
	if matches.numel() == 0:
	return None
	return int(matches[-1].item())


	def _build_sampling_warpers(do_sample: bool, temperature, top_p, top_k):
	if not do_sample:
	return None
	from transformers.generation import LogitsProcessorList
	from transformers.generation.logits_process import (
	TemperatureLogitsWarper,
	TopKLogitsWarper,
	TopPLogitsWarper,
	)

	procs = LogitsProcessorList()
	temp = float(temperature)
	if temp != 1.0:
	procs.append(TemperatureLogitsWarper(temp))
	if top_p is not None and float(top_p) < 1.0:
	procs.append(TopPLogitsWarper(float(top_p), min_tokens_to_keep=1))
	if top_k is not None and int(top_k) > 0:
	procs.append(TopKLogitsWarper(int(top_k), filter_value=-float("inf")))
	return procs


	@torch.inference_mode()
	def _legacy_direct_llopa_generate_impl(
	model,
	tokenizer,
	*,
	input_ids: Optional[torch.LongTensor],
	attention_mask: Optional[torch.Tensor],
	lower_k: int,
	prefill_attn: str,
	max_length=None,
	max_new_tokens=None,
	min_length=None,
	min_new_tokens=None,
	do_sample=None,
	temperature=None,
	top_p=None,
	top_k=None,
	stopping_criteria=None,
	pad_token_id=None,
	eos_token_id=None,
	output_scores: bool = False,
	return_dict_in_generate: bool = False,
	use_cache: Optional[bool] = None,
	):
	if input_ids is None or input_ids.dim() != 2 or input_ids.size(0) != 1:
	return None
	if use_cache is False:
	return None

	valid_len = int(input_ids.size(1))
	if attention_mask is not None:
	if attention_mask.dim() != 2 or attention_mask.size(0) != input_ids.size(0):
	return None
	valid_len = int(attention_mask[0].sum().item())
	if valid_len <= 0:
	return None

	trimmed_input_ids = input_ids[:, -valid_len:]
	header_ids = getattr(model, "_direct_llopa_header_ids", None)
	if not isinstance(header_ids, torch.Tensor) or header_ids.numel() == 0:
	return None
	hdr = header_ids.to(device=trimmed_input_ids.device, dtype=trimmed_input_ids.dtype)
	assistant_start = _find_last_subsequence_start(trimmed_input_ids, hdr)
	if assistant_start is None:
	return None
	prefix_ids = trimmed_input_ids[:, :assistant_start]
	assistant_ids = trimmed_input_ids[:, assistant_start:]
	if assistant_ids.numel() == 0:
	return None
	try:
	lower_k = int(lower_k)
	except Exception:
	lower_k = 0
	if lower_k <= 0:
	return None

	attn = (prefill_attn or "causal").strip().lower()
	if attn == "prefix_full":
	attn = "full"
	if attn not in {"causal", "full"}:
	attn = "causal"

	total_prompt_len = int(input_ids.size(1))
	if max_new_tokens is None:
	if max_length is None:
	max_new_tokens = 256
	else:
	max_new_tokens = max(0, int(max_length) - total_prompt_len)
	else:
	max_new_tokens = int(max_new_tokens)
	if min_new_tokens is None:
	if min_length is None:
	min_new_tokens = 0
	else:
	min_new_tokens = max(0, int(min_length) - total_prompt_len)
	else:
	min_new_tokens = int(min_new_tokens)

	raw_temp = 0.0 if temperature is None else float(temperature)
	if do_sample is None:
	do_sample = bool(raw_temp != 0.0)
	do_sample = bool(do_sample)
	sample_temp = 1.0 if (not do_sample or raw_temp == 0.0) else float(raw_temp)
	top_p = 1.0 if top_p is None else float(top_p)
	top_k = None if top_k is None else int(top_k)

	llopa_core = _get_llopa_core(model)
	llopa_step = _get_llopa_decode_step(model)
	if llopa_core is None or llopa_step is None:
	return None
	llopa_fn = getattr(llopa_core, "llopa_prefill_cache", None)
	if llopa_fn is None:
	return None

	output_head = _get_output_head(model)
	prefill_out = llopa_fn(
	system_ids=prefix_ids[:, :0],
	user_ids=prefix_ids,
	assistant_ids=assistant_ids,
	lower_k=lower_k,
	prefill_mode="lower",
	prefill_attn=attn,
	return_last_assistant_hidden=bool(output_head is not None),
	)
	initial_logits = None
	if isinstance(prefill_out, tuple):
	pkv, last_hidden = prefill_out
	if output_head is not None and isinstance(last_hidden, torch.Tensor) and last_hidden.numel() > 0:
	initial_logits = output_head(last_hidden)[:, -1, :].to(torch.float32)
	else:
	pkv = prefill_out

	S = 0
	U = int(prefix_ids.size(1))
	last = assistant_ids[:, -1:]
	stop_ids = set(_normalize_eos_token_ids(eos_token_id))
	if not stop_ids:
	tok_eos = getattr(tokenizer, "eos_token_id", None)
	if tok_eos is not None:
	stop_ids.add(int(tok_eos))
	stop_token_ids = _prepare_stop_token_tensor(stop_ids, last.device)
	logits_warpers = _build_sampling_warpers(do_sample, sample_temp, top_p, top_k)
	max_new_tokens = int(max_new_tokens)
	min_new_tokens = int(min_new_tokens)
	total_prompt_len = int(input_ids.size(1))
	record_scores = bool(output_scores)
	should_apply_stopping_criteria = stopping_criteria is not None

	sequences = torch.empty(
	(input_ids.size(0), total_prompt_len + max_new_tokens),
	dtype=input_ids.dtype,
	device=input_ids.device,
	)
	sequences[:, :total_prompt_len] = input_ids
	generated = sequences[:, total_prompt_len:]
	score_list: list[torch.Tensor] = []
	score_list_append = score_list.append
	cur = 0
	pending_logits = initial_logits

	while cur < max_new_tokens:
	out = None
	if pending_logits is None:
	out = llopa_step(
	assistant_ids=last,
	lower_k=lower_k,
	pkv=pkv,
	S=S,
	U=U,
	logits_to_keep=1,
	labels=None,
	prefill_mode="lower",
	)
	pkv = out.past_key_values or pkv
	logits = out.logits[:, -1, :].to(dtype=torch.float32, device=last.device, copy=True)
	else:
	logits = pending_logits
	pending_logits = None

	if stop_token_ids is not None and cur < min_new_tokens:
	logits.index_fill_(1, stop_token_ids, -float("inf"))
	if logits_warpers is not None:
	logits = logits_warpers(generated[:, :cur], logits)
	if record_scores:
	score_list_append(logits.detach().clone())

	if do_sample:
	probs = torch.softmax(logits, dim=-1)
	next_tok = torch.multinomial(probs, num_samples=1)
	else:
	next_tok = torch.argmax(logits, dim=-1, keepdim=True)

	generated[:, cur : cur + 1] = next_tok
	cur += 1

	should_stop = False
	tok_id = int(next_tok.item())
	if tok_id in stop_ids and cur >= min_new_tokens:
	should_stop = True
	if (not should_stop) and should_apply_stopping_criteria:
	sequences_now = sequences[:, : total_prompt_len + cur]
	try:
	should_stop = bool(stopping_criteria(sequences_now, logits))
	except TypeError:
	should_stop = bool(stopping_criteria(sequences_now, None))
	if out is not None:
	del out
	if should_stop:
	break
	last = next_tok

	sequences = sequences[:, : total_prompt_len + cur]
	if not bool(return_dict_in_generate):
	return sequences
	from transformers.generation.utils import GenerateDecoderOnlyOutput

	return GenerateDecoderOnlyOutput(
	sequences=sequences,
	scores=tuple(score_list) if bool(output_scores) else None,
	past_key_values=pkv,
	)


	# Single-prompt LLoPA generation: seed a KV cache from the structured prompt,
	# then decode one token per iteration until a stop token, a stopping
	# criterion, or max_new_tokens is reached.
	#
	# Returns None when this path cannot be used: use_cache is False, lower_k
	# cannot be parsed or is <= 0, the LLoPA core / decode step is unavailable,
	# input_ids is a tensor that is not 2-D with exactly one row, or the
	# assistant prefill segment is empty.
	# Raises ValueError when user_prefill normalizes to anything but "full".
	# Otherwise returns the sequences tensor (prompt followed by generated ids),
	# or a GenerateDecoderOnlyOutput when return_dict_in_generate is truthy.
	# NOTE: pad_token_id is accepted but never read inside this function.
	@torch.inference_mode()
	def _direct_llopa_generate_impl(
	model,
	tokenizer,
	*,
	prompt_messages,
	prompt_add_generation_prompt: bool,
	structured_prompt_segments=None,
	input_ids: Optional[torch.LongTensor],
	attention_mask: Optional[torch.Tensor],
	lower_k: int,
	prefill_attn: str,
	system_prefill: str,
	user_prefill: str,
	no_upper_attn: bool,
	see_past_assistant: bool = False,
	replay_module: str = "none",
	replay_per_layers: int = -1,
	last_layer_module: Optional[str] = None,
	seed_mode: str = "auto",
	max_length=None,
	max_new_tokens=None,
	min_length=None,
	min_new_tokens=None,
	do_sample=None,
	temperature=None,
	top_p=None,
	top_k=None,
	stopping_criteria=None,
	pad_token_id=None,
	eos_token_id=None,
	output_scores: bool = False,
	compact_scores: bool = False,
	return_dict_in_generate: bool = False,
	use_cache: Optional[bool] = None,
	):
	# last_layer_module is a fallback: it is only used when replay_module
	# itself normalizes to "none".
	if last_layer_module is not None and _normalize_replay_module_value(replay_module) == "none":
	replay_module = last_layer_module
	replay_module = _normalize_replay_module_value(replay_module)
	replay_per_layers = _normalize_replay_per_layers_value(replay_per_layers)
	# This path always decodes from a cache, so an explicit use_cache=False opts out.
	if use_cache is False:
	return None

	# An unparsable lower_k is treated like 0, i.e. "not applicable".
	try:
	lower_k = int(lower_k)
	except Exception:
	lower_k = 0
	if lower_k <= 0:
	return None

	# "prefix_full" is an alias for "full"; unknown values fall back to "causal".
	attn = (prefill_attn or "causal").strip().lower()
	if attn == "prefix_full":
	attn = "full"
	if attn not in {"causal", "full"}:
	attn = "causal"

	# Unknown system_prefill values fall back to "full".
	sys_prefill = (system_prefill or "full").strip().lower()
	if sys_prefill not in {"full", "no_system", "no_bos_system"}:
	sys_prefill = "full"

	user_prefill_norm = (user_prefill or "full").strip().lower()
	if user_prefill_norm != "full":
	raise ValueError("Unified direct LLoPA currently supports only user_prefill='full'.")

	llopa_core = _get_llopa_core(model)
	llopa_step = _get_llopa_decode_step(model)
	if llopa_core is None or llopa_step is None:
	return None
	llopa_forward_assistant = getattr(llopa_core, "tri_forward_assistant", None)
	decode_output_head = _get_output_head(model)
	# The direct decode step needs the env flag, a callable
	# tri_forward_assistant and an output head; otherwise llopa_step is used.
	# NOTE(review): the "1" passed to _env_flag_enabled is presumably the
	# default value of the flag — confirm against that helper.
	use_direct_decode_step = (
	_env_flag_enabled("CAPSULE_LLOPA_DIRECT_DECODE_STEP", "1")
	and callable(llopa_forward_assistant)
	and decode_output_head is not None
	)

	# Device resolution: input_ids' device, else the first model parameter's
	# device, else "cpu".
	device = None
	if isinstance(input_ids, torch.Tensor):
	if input_ids.dim() != 2 or input_ids.size(0) != 1:
	return None
	device = input_ids.device
	if device is None:
	try:
	device = next(model.parameters()).device
	except Exception:
	device = "cpu"

	# Reuse caller-supplied segments when they are a dict; otherwise build them
	# from prompt_messages.
	segments = structured_prompt_segments if isinstance(structured_prompt_segments, dict) else None
	if segments is None:
	segments = _build_structured_prompt_segments(
	tokenizer,
	prompt_messages,
	prompt_add_generation_prompt=bool(prompt_add_generation_prompt),
	device=device,
	)
	prompt_ids = segments["prompt_ids"]
	system_ids = segments["system_ids"]
	user_ids = segments["user_ids"]
	assistant_prefill_ids = segments["assistant_prefill_ids"]
	# Missing or falsy replay metadata is read as 0.
	replay_user_prefix_keep_len = int(segments.get("replay_user_prefix_keep_len", 0) or 0)
	replay_user_start = int(segments.get("replay_user_start", 0) or 0)
	replay_user_len = int(segments.get("replay_user_len", 0) or 0)
	if assistant_prefill_ids.numel() == 0:
	return None

	# Sampling setup: temperature=None counts as 0.0, and when do_sample is not
	# given it is inferred from temperature != 0. A zero temperature (or greedy
	# decoding) uses a neutral sampling temperature of 1.0.
	raw_temp = 0.0 if temperature is None else float(temperature)
	if do_sample is None:
	do_sample = bool(raw_temp != 0.0)
	do_sample = bool(do_sample)
	sample_temp = 1.0 if (not do_sample or raw_temp == 0.0) else float(raw_temp)
	top_p = 1.0 if top_p is None else float(top_p)
	top_k = None if top_k is None else int(top_k)

	initial_logits = None
	prompt_bundle = _build_unified_prefill_lower_prompt_bundle(
	tokenizer,
	prompt_messages=prompt_messages,
	prompt_add_generation_prompt=bool(prompt_add_generation_prompt),
	structured_prompt_segments=segments,
	device=device,
	)
	# Preferred seeding path; a None result triggers the _llopa_prefill_cache
	# fallback further down.
	reference_seed = _direct_prefill_lower_cache_and_logits(
	model,
	prompt_bundle=prompt_bundle,
	lower_k=lower_k,
	prefill_attn=attn,
	system_prefill=sys_prefill,
	no_upper_attn=bool(no_upper_attn),
	see_past_assistant=bool(see_past_assistant),
	replay_module=str(replay_module),
	replay_per_layers=int(replay_per_layers),
	seed_mode=str(seed_mode or "auto"),
	)
	# canonical_input_ids is the prompt that prefixes the returned sequences:
	# the last valid_len columns of input_ids (valid_len = sum of the attention
	# mask's first row when a mask is given), else the bundle's
	# "effective_prompt_ids", else the segment prompt_ids.
	# NOTE(review): taking the *last* valid_len columns assumes left padding — confirm.
	canonical_input_ids = None
	if isinstance(input_ids, torch.Tensor) and input_ids.dim() == 2 and input_ids.size(0) == 1:
	valid_len = int(input_ids.size(1))
	if isinstance(attention_mask, torch.Tensor) and attention_mask.dim() == 2 and attention_mask.size(0) == 1:
	valid_len = int(attention_mask[0].sum().item())
	if valid_len > 0:
	canonical_input_ids = input_ids[:, -valid_len:]
	if not isinstance(canonical_input_ids, torch.Tensor):
	canonical_input_ids = prompt_bundle.get("effective_prompt_ids")
	if not isinstance(canonical_input_ids, torch.Tensor):
	canonical_input_ids = prompt_ids
	total_prompt_len = int(canonical_input_ids.size(1))
	# Token budgets: explicit *_new_tokens win; otherwise they are derived from
	# max_length / min_length minus the prompt length (never below 0); with
	# neither given the defaults are 256 new tokens and a minimum of 0.
	if max_new_tokens is None:
	if max_length is None:
	max_new_tokens = 256
	else:
	max_new_tokens = max(0, int(max_length) - total_prompt_len)
	else:
	max_new_tokens = int(max_new_tokens)
	if min_new_tokens is None:
	if min_length is None:
	min_new_tokens = 0
	else:
	min_new_tokens = max(0, int(min_length) - total_prompt_len)
	else:
	min_new_tokens = int(min_new_tokens)
	if reference_seed is not None:
	pkv, S, U, initial_logits = reference_seed
	else:
	# Fallback seeding through _llopa_prefill_cache. The no_upper_attn variant
	# returns no hidden state, so initial_logits stays None there and the first
	# loop iteration runs a decode step instead.
	output_head = _get_output_head(model)
	if bool(no_upper_attn):
	pkv, S, U = _llopa_prefill_cache(
	llopa_core,
	system_ids,
	user_ids,
	assistant_prefill_ids,
	lower_k=lower_k,
	prefill_mode="lower",
	prefill_attn=attn,
	system_prefill=sys_prefill,
	replay_user_prefix_keep_len=replay_user_prefix_keep_len,
	replay_user_start=replay_user_start,
	replay_user_len=replay_user_len,
	)
	else:
	pkv, S, U, last_hidden = _llopa_prefill_cache(
	llopa_core,
	system_ids,
	user_ids,
	assistant_prefill_ids,
	lower_k=lower_k,
	prefill_mode="lower",
	prefill_attn=attn,
	system_prefill=sys_prefill,
	return_last_assistant_hidden=bool(output_head is not None),
	replay_user_prefix_keep_len=replay_user_prefix_keep_len,
	replay_user_start=replay_user_start,
	replay_user_len=replay_user_len,
	)
	if output_head is not None and isinstance(last_hidden, torch.Tensor) and last_hidden.numel() > 0:
	initial_logits = output_head(last_hidden)[:, -1, :].to(torch.float32)

	# Stop ids come from eos_token_id; the tokenizer's eos id is the fallback
	# when none were supplied.
	stop_ids = set(_normalize_eos_token_ids(eos_token_id))
	if not stop_ids:
	tok_eos = getattr(tokenizer, "eos_token_id", None)
	if tok_eos is not None:
	stop_ids.add(int(tok_eos))
	# Best-effort lookup of an end-of-turn token; any failure is ignored.
	# NOTE(review): the literal below contains backslashes ("<\|eot_id\|>"). If
	# the intended token is "<|eot_id|>", this lookup will not find it — confirm
	# against the upstream file (the backslashes look like an escaping artefact).
	with contextlib.suppress(Exception):
	eot_id = tokenizer.convert_tokens_to_ids("<\|eot_id\|>")
	if eot_id is not None and eot_id != tokenizer.unk_token_id:
	stop_ids.add(int(eot_id))

	# The first decode step (if one is needed) is fed the last prefill token.
	last = assistant_prefill_ids[:, -1:]
	stop_token_ids = _prepare_stop_token_tensor(stop_ids, last.device)
	logits_warpers = _build_sampling_warpers(do_sample, sample_temp, top_p, top_k)
	# Normalize loop invariants once, outside the decode loop.
	max_new_tokens = int(max_new_tokens)
	min_new_tokens = int(min_new_tokens)
	lower_k = int(lower_k)
	no_upper_attn_bool = bool(no_upper_attn)
	replay_module_str = str(replay_module)
	replay_per_layers_int = int(replay_per_layers)
	record_scores = bool(output_scores)
	record_compact_scores = bool(output_scores) and bool(compact_scores)
	should_apply_stopping_criteria = stopping_criteria is not None
	# A non-None model attribute overrides the environment flag for choosing
	# the generation-mixin decode loop.
	mixin_decode_default = getattr(model, "_llopa_v2_generation_mixin_decode", None)
	use_mixin_decode = (
	bool(mixin_decode_default)
	if mixin_decode_default is not None
	else _env_flag_enabled("CAPSULE_LLOPA_GENERATION_MIXIN_DECODE", "0")
	)
	# The mixin loop requires prefill logits; a None result from it falls
	# through to the manual loop below.
	if use_mixin_decode and isinstance(initial_logits, torch.Tensor):
	mixin_output = _llopa_v2_generation_mixin_decode_loop(
	model,
	canonical_input_ids=canonical_input_ids,
	attention_mask=attention_mask,
	past_key_values=pkv,
	initial_logits=initial_logits,
	lower_k=lower_k,
	no_upper_attn=no_upper_attn_bool,
	replay_module=replay_module_str,
	replay_per_layers=replay_per_layers_int,
	max_new_tokens=max_new_tokens,
	min_new_tokens=min_new_tokens,
	do_sample=do_sample,
	logits_warpers=logits_warpers,
	stop_ids=stop_ids,
	stop_token_ids=stop_token_ids,
	stopping_criteria=stopping_criteria,
	output_scores=output_scores,
	compact_scores=compact_scores,
	return_dict_in_generate=return_dict_in_generate,
	)
	if mixin_output is not None:
	return mixin_output
	# Preallocate prompt + max_new_tokens columns. `generated` is a slice of
	# `sequences`, so writing tokens into it fills the output buffer directly.
	sequences = torch.empty(
	(canonical_input_ids.size(0), total_prompt_len + max_new_tokens),
	dtype=canonical_input_ids.dtype,
	device=canonical_input_ids.device,
	)
	sequences[:, :total_prompt_len] = canonical_input_ids
	generated = sequences[:, total_prompt_len:]
	score_list: list[torch.Tensor] = []
	score_list_append = score_list.append
	compact_logprob_list: list[torch.Tensor] = []
	compact_logprob_append = compact_logprob_list.append
	cur = 0
	# Logits from the prefill (if any) are consumed by the first iteration in
	# place of a decode step.
	pending_logits = initial_logits

	while cur < max_new_tokens:
	out = None
	if pending_logits is None:
	if use_direct_decode_step:
	out = llopa_forward_assistant(
	assistant_ids=last,
	lower_k=lower_k,
	pkv=pkv,
	S=S,
	U=U,
	write_cache=True,
	prefill_mode="lower",
	no_upper_attn=no_upper_attn_bool,
	align_cache_position_to_layer_past=False,
	replay_module=replay_module_str,
	replay_per_layers=replay_per_layers_int,
	)
	# Keep the previous cache when the step returns a falsy past_key_values.
	pkv = out.past_key_values or pkv
	logits = decode_output_head(out.last_hidden_state[:, -1, :])
	# Some heads keep a sequence axis; reduce to the last position.
	if logits.dim() == 3:
	logits = logits[:, -1, :]
	else:
	out = llopa_step(
	assistant_ids=last,
	lower_k=lower_k,
	pkv=pkv,
	S=S,
	U=U,
	logits_to_keep=1,
	labels=None,
	prefill_mode="lower",
	no_upper_attn=no_upper_attn_bool,
	align_cache_position_to_layer_past=False,
	replay_module=replay_module_str,
	replay_per_layers=replay_per_layers_int,
	)
	pkv = out.past_key_values or pkv
	logits = out.logits[:, -1, :]
	# copy=True yields a private float32 tensor, so the in-place masking below
	# cannot touch the step output.
	logits = logits.to(dtype=torch.float32, device=last.device, copy=True)
	else:
	logits = pending_logits
	pending_logits = None

	# Stop tokens are masked (in place) until min_new_tokens have been produced.
	if stop_token_ids is not None and cur < min_new_tokens:
	logits.index_fill_(1, stop_token_ids, -float("inf"))
	if logits_warpers is not None:
	logits = logits_warpers(generated[:, :cur], logits)
	# Full per-step scores are only stored when compact scores are not requested.
	if record_scores and not record_compact_scores:
	score_list_append(logits.detach().clone())

	if do_sample:
	probs = torch.softmax(logits, dim=-1)
	next_tok = torch.multinomial(probs, num_samples=1)
	else:
	next_tok = torch.argmax(logits, dim=-1, keepdim=True)
	# Compact scores keep just the chosen token's log-probability, computed
	# from the processed logits as logit - logsumexp(logits).
	if record_compact_scores:
	next_logit = torch.gather(logits, 1, next_tok)
	next_logprob = next_logit - torch.logsumexp(logits, dim=-1, keepdim=True)
	compact_logprob_append(next_logprob.squeeze(-1).detach())

	generated[:, cur : cur + 1] = next_tok
	cur += 1

	should_stop = False
	# .item() requires a one-element tensor, i.e. a single sequence.
	tok_id = int(next_tok.item())
	if tok_id in stop_ids and cur >= min_new_tokens:
	should_stop = True
	if (not should_stop) and should_apply_stopping_criteria:
	sequences_now = sequences[:, : total_prompt_len + cur]
	# Retry with scores=None if the criteria raises TypeError on the logits.
	try:
	should_stop = bool(stopping_criteria(sequences_now, logits))
	except TypeError:
	should_stop = bool(stopping_criteria(sequences_now, None))
	# Drop the reference to the step output before the next iteration.
	if out is not None:
	del out
	if should_stop:
	break
	last = next_tok

	# Trim the unused tail of the preallocated buffer.
	sequences = sequences[:, : total_prompt_len + cur]
	if not bool(return_dict_in_generate):
	return sequences
	from transformers.generation.utils import GenerateDecoderOnlyOutput

	output = GenerateDecoderOnlyOutput(
	sequences=sequences,
	scores=tuple(score_list) if bool(output_scores) and not record_compact_scores else None,
	past_key_values=pkv,
	)
	# In compact mode the per-token log-probabilities are attached as the extra
	# attribute `generated_token_logprobs`: one column per generated token, or an
	# empty (rows, 0) float32 tensor when nothing was generated.
	if record_compact_scores:
	if compact_logprob_list:
	compact_logprobs = torch.stack(compact_logprob_list, dim=1)
	else:
	compact_logprobs = torch.empty(
	(sequences.size(0), 0),
	dtype=torch.float32,
	device=sequences.device,
	)
	setattr(output, "generated_token_logprobs", compact_logprobs)
	return output


	def _is_prompt_messages_batch(prompt_messages) -> bool:
	if not isinstance(prompt_messages, (list, tuple)) or not prompt_messages:
	return False
	if all(isinstance(item, dict) for item in prompt_messages):
	return False
	return all(isinstance(item, (list, tuple)) for item in prompt_messages)


	def _is_structured_segments_batch(structured_prompt_segments) -> bool:
	return (
	isinstance(structured_prompt_segments, (list, tuple))
	and not isinstance(structured_prompt_segments, dict)
	and all(isinstance(item, dict) for item in structured_prompt_segments)
	)


	def _batch_prompt_add_generation_flags(value, batch_size: int) -> list[bool]:
	if isinstance(value, (list, tuple)):
	if len(value) != int(batch_size):
	raise ValueError(
	f"prompt_add_generation_prompt batch size mismatch: {len(value)} != {int(batch_size)}"
	)
	return [bool(item) for item in value]
	return [bool(value) for _ in range(int(batch_size))]


	def _as_1d_long_tensor(value: torch.Tensor, *, device) -> torch.LongTensor:
	value = value.to(device=device, dtype=torch.long)
	if value.dim() == 2:
	if value.size(0) != 1:
	raise ValueError("Expected a single-row prompt segment tensor.")
	value = value[0]
	elif value.dim() != 1:
	value = value.reshape(-1)
	return value


	def _pad_1d_rows(
	rows: list[torch.Tensor],
	*,
	pad_value: int,
	device,
	dtype: torch.dtype = torch.long,
	) -> torch.Tensor:
	batch_size = len(rows)
	max_len = max((int(row.numel()) for row in rows), default=0)
	out = torch.full((batch_size, max_len), int(pad_value), device=device, dtype=dtype)
	for row_idx, row in enumerate(rows):
	row = row.to(device=device, dtype=dtype).reshape(-1)
	width = int(row.numel())
	if width > 0:
	out[row_idx, :width] = row
	return out


	def _pad_prompt_metadata_rows(
	rows: list[Optional[torch.Tensor]],
	*,
	fill_value: int,
	device,
	dtype: torch.dtype,
	) -> torch.Tensor:
	batch_size = len(rows)
	max_len = 0
	flat_rows: list[torch.Tensor] = []
	for row in rows:
	if isinstance(row, torch.Tensor) and row.numel() > 0:
	flat = row.to(device=device, dtype=dtype)
	if flat.dim() == 2:
	flat = flat[0]
	else:
	flat = flat.reshape(-1)
	else:
	flat = torch.empty((0,), device=device, dtype=dtype)
	flat_rows.append(flat)
	max_len = max(max_len, int(flat.numel()))
	if max_len <= 0:
	return torch.full((batch_size, 1), int(fill_value), device=device, dtype=dtype)
	out = torch.full((batch_size, max_len), int(fill_value), device=device, dtype=dtype)
	for row_idx, flat in enumerate(flat_rows):
	width = int(flat.numel())
	if width > 0:
	out[row_idx, :width] = flat
	return out


	def _build_batched_structured_prompt_segments(
	    tokenizer,
	    *,
	    prompt_messages,
	    prompt_add_generation_prompt,
	    structured_prompt_segments,
	    device,
	) -> list[dict]:
	    """Return one structured-prompt segment dict per batch row.

	    Resolution order:
	      1. ``structured_prompt_segments`` that already is a list/tuple of
	         dicts is returned as a new list.
	      2. A single segments dict is wrapped in a one-element list.
	      3. A batch of conversations in ``prompt_messages`` is built row by
	         row; the generation-prompt flag of a row is dropped when its last
	         normalized message has the "assistant" role.
	      4. Any other non-None ``prompt_messages`` is built as one row.
	      5. Otherwise an empty list is returned.

	    Raises:
	        ValueError: in case 4 when ``prompt_add_generation_prompt`` is None,
	            or in case 3 when the flag list length does not match the batch.
	    """
	    if _is_structured_segments_batch(structured_prompt_segments):
	        return list(structured_prompt_segments)
	    if isinstance(structured_prompt_segments, dict):
	        return [structured_prompt_segments]

	    if not _is_prompt_messages_batch(prompt_messages):
	        if prompt_messages is None:
	            return []
	        if prompt_add_generation_prompt is None:
	            raise ValueError("llopa_v2_batch_generate requires prompt_add_generation_prompt for prompt_messages.")
	        single_row = _build_structured_prompt_segments(
	            tokenizer,
	            prompt_messages,
	            prompt_add_generation_prompt=bool(prompt_add_generation_prompt),
	            device=device,
	        )
	        return [single_row]

	    row_flags = _batch_prompt_add_generation_flags(prompt_add_generation_prompt, len(prompt_messages))
	    built_rows = []
	    for messages, row_flag in zip(prompt_messages, row_flags):
	        row_messages = list(messages)
	        wants_generation_prompt = bool(row_flag)
	        normalized = _normalize_prompt_messages(row_messages)
	        # No generation prompt is added after a trailing assistant message.
	        if wants_generation_prompt and normalized and normalized[-1]["role"] == "assistant":
	            wants_generation_prompt = False
	        built_rows.append(
	            _build_structured_prompt_segments(
	                tokenizer,
	                row_messages,
	                prompt_add_generation_prompt=wants_generation_prompt,
	                device=device,
	            )
	        )
	    return built_rows


	def _batched_segments_to_tensors(segments: list[dict], *, device, pad_token_id: int) -> dict:
	    """Collate per-row structured prompt segments into padded batch tensors.

	    For every segment the prompt and assistant-prefill ids are flattened to
	    1-D long rows. The split start is the length of ``prefix_ids`` when that
	    tensor is present and non-empty, otherwise prompt length minus assistant
	    length; it is clamped to ``[0, prompt_len - 1]``. The system length is
	    clamped to ``[0, split_start]``. Missing assistant header/turn metadata
	    defaults to a single span from the split start to the prompt end with an
	    all-True mask.

	    Returns:
	        A dict holding the unpadded ``prompt_rows``, right-padded
	        ``prompt_ids`` with a matching 0/1 ``prompt_attention_mask``,
	        per-row long tensors (``prompt_lens``, ``split_starts``,
	        ``system_lens`` and the three ``replay_user_*`` columns) and the
	        padded assistant header/turn metadata tensors.

	    Raises:
	        ValueError: if a segment lacks non-empty ``prompt_ids`` or
	            ``assistant_prefill_ids`` (or a helper rejects a tensor shape).
	    """
	    prompt_rows: list[torch.Tensor] = []
	    int_columns: dict[str, list[int]] = {
	        "prompt_lens": [],
	        "split_starts": [],
	        "system_lens": [],
	        "replay_user_prefix_keep_lens": [],
	        "replay_user_starts": [],
	        "replay_user_lens": [],
	    }
	    # (output column, segment key) pairs for the optional replay metadata.
	    replay_fields = (
	        ("replay_user_prefix_keep_lens", "replay_user_prefix_keep_len"),
	        ("replay_user_starts", "replay_user_start"),
	        ("replay_user_lens", "replay_user_len"),
	    )
	    header_start_rows: list[Optional[torch.Tensor]] = []
	    turn_end_rows: list[Optional[torch.Tensor]] = []
	    header_mask_rows: list[Optional[torch.Tensor]] = []

	    for seg in segments:
	        raw_prompt = seg.get("prompt_ids")
	        raw_assistant = seg.get("assistant_prefill_ids")
	        if not isinstance(raw_prompt, torch.Tensor) or raw_prompt.numel() == 0:
	            raise ValueError("llopa_v2_batch_generate requires non-empty prompt_ids in each segment.")
	        if not isinstance(raw_assistant, torch.Tensor) or raw_assistant.numel() == 0:
	            raise ValueError("llopa_v2_batch_generate requires non-empty assistant_prefill_ids in each segment.")
	        prompt_row = _as_1d_long_tensor(raw_prompt, device=device)
	        assistant_row = _as_1d_long_tensor(raw_assistant, device=device)
	        prompt_len = int(prompt_row.numel())

	        # An explicit prefix overrides the length-difference estimate.
	        prefix_ids = seg.get("prefix_ids")
	        if isinstance(prefix_ids, torch.Tensor) and prefix_ids.numel() > 0:
	            split_start = int(_as_1d_long_tensor(prefix_ids, device=device).numel())
	        else:
	            split_start = int(prompt_len - int(assistant_row.numel()))
	        split_start = max(0, min(split_start, prompt_len - 1))

	        system_ids = seg.get("system_ids")
	        if isinstance(system_ids, torch.Tensor):
	            system_len = int(_as_1d_long_tensor(system_ids, device=device).numel())
	        else:
	            system_len = 0
	        system_len = max(0, min(system_len, split_start))

	        prompt_rows.append(prompt_row)
	        int_columns["prompt_lens"].append(prompt_len)
	        int_columns["split_starts"].append(split_start)
	        int_columns["system_lens"].append(system_len)
	        for column, seg_key in replay_fields:
	            int_columns[column].append(int(seg.get(seg_key, 0) or 0))

	        header_starts = seg.get("assistant_header_starts")
	        if not isinstance(header_starts, torch.Tensor) or header_starts.numel() == 0:
	            header_starts = torch.tensor([[split_start]], device=device, dtype=torch.long)
	        turn_ends = seg.get("assistant_turn_ends")
	        if not isinstance(turn_ends, torch.Tensor) or turn_ends.numel() == 0:
	            turn_ends = torch.tensor([[prompt_len]], device=device, dtype=torch.long)
	        header_mask = seg.get("assistant_header_start_mask")
	        if not isinstance(header_mask, torch.Tensor) or header_mask.numel() == 0:
	            header_mask = torch.ones_like(header_starts, device=device, dtype=torch.bool)
	        header_start_rows.append(header_starts)
	        turn_end_rows.append(turn_ends)
	        header_mask_rows.append(header_mask)

	    def _long_column(name: str) -> torch.Tensor:
	        # One long tensor per integer column, on the target device.
	        return torch.tensor(int_columns[name], device=device, dtype=torch.long)

	    prompt_ids = _pad_1d_rows(prompt_rows, pad_value=int(pad_token_id), device=device, dtype=torch.long)
	    prompt_lens_tensor = _long_column("prompt_lens")
	    # Position p of row r is valid when p < prompt_lens[r].
	    positions = torch.arange(prompt_ids.size(1), device=device, dtype=torch.long).unsqueeze(0)
	    prompt_attention_mask = (positions < prompt_lens_tensor.unsqueeze(1)).to(dtype=torch.long)

	    header_starts_padded = _pad_prompt_metadata_rows(
	        header_start_rows,
	        fill_value=-1,
	        device=device,
	        dtype=torch.long,
	    )
	    turn_ends_padded = _pad_prompt_metadata_rows(
	        turn_end_rows,
	        fill_value=-1,
	        device=device,
	        dtype=torch.long,
	    )
	    header_mask_padded = _pad_prompt_metadata_rows(
	        header_mask_rows,
	        fill_value=0,
	        device=device,
	        dtype=torch.bool,
	    ).to(dtype=torch.bool)

	    return {
	        "prompt_rows": prompt_rows,
	        "prompt_ids": prompt_ids,
	        "prompt_attention_mask": prompt_attention_mask,
	        "prompt_lens": prompt_lens_tensor,
	        "split_starts": _long_column("split_starts"),
	        "system_lens": _long_column("system_lens"),
	        "replay_user_prefix_keep_lens": _long_column("replay_user_prefix_keep_lens"),
	        "replay_user_starts": _long_column("replay_user_starts"),
	        "replay_user_lens": _long_column("replay_user_lens"),
	        "assistant_header_starts": header_starts_padded,
	        "assistant_turn_ends": turn_ends_padded,
	        "assistant_header_start_mask": header_mask_padded,
	    }


	def _cache_layer_seq_len(pkv, layer_idx: int) -> int:
	    """Return the size of dim -2 of the key tensor cached for *layer_idx*.

	    Yields 0 when the layer cannot be read from *pkv* or when the key is not
	    a tensor with at least three dimensions.
	    """
	    try:
	        key_states, _unused_values = pkv_get(pkv, int(layer_idx))
	    except Exception:
	        return 0
	    if isinstance(key_states, torch.Tensor) and key_states.dim() >= 3:
	        return int(key_states.shape[-2])
	    return 0


	def _merge_optional_batch_sequence_attr(row_pkvs: list, attr_name: str, *, device, dtype=None):
	rows = []
	trailing_shape = None
	for pkv in row_pkvs:
	value = getattr(pkv, attr_name, None)
	if isinstance(value, torch.Tensor) and value.numel() > 0:
	value = value.to(device=device)
	if dtype is not None:
	value = value.to(dtype=dtype)
	if value.dim() == 1:
	value = value.view(1, -1)
	elif value.dim() >= 2 and value.size(0) != 1:
	value = value[:1]
	trailing_shape = tuple(value.shape[2:])
	else:
	value = None
	rows.append(value)
	if trailing_shape is None:
	return None
	max_len = max((int(row.shape[1]) for row in rows if isinstance(row, torch.Tensor)), default=0)
	if max_len <= 0:
	return None
	ref = next(row for row in rows if isinstance(row, torch.Tensor))
	out_shape = (len(rows), max_len) + trailing_shape
	out = torch.zeros(out_shape, device=device, dtype=ref.dtype)
	for row_idx, row in enumerate(rows):
	if not isinstance(row, torch.Tensor):
	continue
	width = int(row.shape[1])
	if width > 0:
	out[row_idx : row_idx + 1, :width, ...] = row[:, :width, ...]
	return out


	# Merge per-row KV caches into a single batched DynamicCache.
	# For every layer, each row's 4-D key/value tensors (leading dim trimmed to
	# one entry) are copied into zero-initialized tensors whose dim -2 is sized
	# to the longest row, and a boolean mask records which positions hold real
	# data. The per-layer masks are stored on the merged cache as
	# `_llopa_batch_layer_valid_masks`.
	# Returns None when row_pkvs is empty, the layer count cannot be read or is
	# <= 0, or some layer has no usable key/value pair in any row.
	def _merge_llopa_batch_row_caches(row_pkvs: list, *, device) -> Optional[DynamicCache]:
	if not row_pkvs:
	return None
	# Use the largest layer count reported by any row cache.
	try:
	n_layers = max(int(pkv_len(pkv)) for pkv in row_pkvs)
	except Exception:
	return None
	if n_layers <= 0:
	return None

	merged_pairs = []
	layer_valid_masks: list[torch.Tensor] = []
	for layer_idx in range(n_layers):
	row_kvs = []
	max_len = 0
	ref_k = None
	ref_v = None
	for pkv in row_pkvs:
	# A row whose layer cannot be read is recorded as (None, None).
	try:
	k, v = pkv_get(pkv, layer_idx)
	except Exception:
	k = None
	v = None
	if isinstance(k, torch.Tensor) and isinstance(v, torch.Tensor) and k.dim() == 4 and v.dim() == 4:
	k = k.to(device=device)
	v = v.to(device=device)
	# Only the first entry along dim 0 is kept for each row cache.
	if k.size(0) != 1:
	k = k[:1]
	v = v[:1]
	# The first usable row provides the reference shape and dtype.
	ref_k = k if ref_k is None else ref_k
	ref_v = v if ref_v is None else ref_v
	max_len = max(max_len, int(k.shape[-2]))
	row_kvs.append((k, v))
	else:
	row_kvs.append((None, None))
	if ref_k is None or ref_v is None:
	return None
	B = len(row_pkvs)
	# Dims 1 and -1 are taken from the reference row.
	# NOTE(review): assumes every row shares those sizes (presumably head count
	# and head dim) — confirm.
	merged_k = torch.zeros(
	(B, int(ref_k.shape[1]), max_len, int(ref_k.shape[-1])),
	device=device,
	dtype=ref_k.dtype,
	)
	merged_v = torch.zeros(
	(B, int(ref_v.shape[1]), max_len, int(ref_v.shape[-1])),
	device=device,
	dtype=ref_v.dtype,
	)
	valid_mask = torch.zeros((B, max_len), device=device, dtype=torch.bool)
	for row_idx, (k, v) in enumerate(row_kvs):
	if not isinstance(k, torch.Tensor) or not isinstance(v, torch.Tensor):
	continue
	# Rows are written at the start of dim -2; the tail stays zero/invalid.
	width = int(k.shape[-2])
	if width > 0:
	merged_k[row_idx : row_idx + 1, :, :width, :] = k[:, :, :width, :]
	merged_v[row_idx : row_idx + 1, :, :width, :] = v[:, :, :width, :]
	valid_mask[row_idx, :width] = True
	merged_pairs.append((merged_k, merged_v))
	layer_valid_masks.append(valid_mask)

	# Try the `ddp_cache_data` constructor argument first; if that raises,
	# presumably the fallback is an empty cache filled layer by layer through
	# update().
	try:
	merged = DynamicCache(ddp_cache_data=merged_pairs)
	except Exception:
	merged = DynamicCache()
	for layer_idx, (k, v) in enumerate(merged_pairs):
	merged.update(k, v, layer_idx)

	# Best effort: attaching the masks may fail on cache types that reject
	# new attributes.
	try:
	setattr(merged, "_llopa_batch_layer_valid_masks", layer_valid_masks)
	except Exception:
	pass

	# Optional replay memory carried as attributes on the row caches is merged
	# with the same pad-to-longest scheme.
	replay_hidden = _merge_optional_batch_sequence_attr(
	row_pkvs,
	"_tri_last_layer_memory_hidden",
	device=device,
	)
	replay_pos = _merge_optional_batch_sequence_attr(
	row_pkvs,
	"_tri_last_layer_memory_position_ids",
	device=device,
	dtype=torch.long,
	)
	replay_valid = _merge_optional_batch_sequence_attr(
	row_pkvs,
	"_tri_last_layer_memory_valid_mask",
	device=device,
	dtype=torch.bool,
	)
	# Replay attributes are copied over only when both hidden states and
	# position ids were found; the module type and per-layer setting are read
	# from the first row cache.
	if isinstance(replay_hidden, torch.Tensor) and isinstance(replay_pos, torch.Tensor):
	with contextlib.suppress(Exception):
	setattr(merged, "_tri_last_layer_memory_hidden", replay_hidden)
	setattr(merged, "_tri_last_layer_memory_position_ids", replay_pos)
	if isinstance(replay_valid, torch.Tensor):
	setattr(merged, "_tri_last_layer_memory_valid_mask", replay_valid.to(dtype=torch.bool))
	module_type = getattr(row_pkvs[0], "_tri_replay_module", getattr(row_pkvs[0], "_tri_last_layer_module", "none"))
	setattr(merged, "_tri_replay_module", str(module_type or "none"))
	setattr(merged, "_tri_last_layer_module", str(module_type or "none"))
	setattr(merged, "_tri_replay_per_layers", int(getattr(row_pkvs[0], "_tri_replay_per_layers", -1) or -1))
	return merged


	# Re-sync the per-layer valid masks stored on `pkv` (attribute
	# `_llopa_batch_layer_valid_masks`) with each layer's current cache length.
	# A mask shorter than its layer is extended with `input_valid` (one value
	# per row, repeated across the new columns); a longer mask is truncated.
	# Does nothing when the attribute is not a list. Mutates `pkv` only.
	def _append_llopa_batch_cache_valid_masks(pkv, input_valid: torch.Tensor) -> None:
	masks = getattr(pkv, "_llopa_batch_layer_valid_masks", None)
	if not isinstance(masks, list):
	return
	input_valid = input_valid.to(dtype=torch.bool)
	new_masks = []
	for layer_idx, mask in enumerate(masks):
	# Entries that are not 2-D tensors are passed through unchanged.
	if not isinstance(mask, torch.Tensor) or mask.dim() != 2:
	new_masks.append(mask)
	continue
	layer_len = _cache_layer_seq_len(pkv, layer_idx)
	cur_len = int(mask.size(1))
	if layer_len > cur_len:
	add_width = int(layer_len - cur_len)
	# Broadcast each row's validity flag over every newly added column.
	add = input_valid.to(device=mask.device).view(-1, 1).expand(mask.size(0), add_width)
	mask = torch.cat([mask, add], dim=1)
	elif layer_len < cur_len:
	mask = mask[:, :layer_len]
	new_masks.append(mask)
	# Best effort: attribute writes are skipped silently if the cache object
	# rejects them. `_tri_past_len_cache` is reset to None.
	# NOTE(review): presumably a cached past length that the new masks
	# invalidate — confirm.
	with contextlib.suppress(Exception):
	setattr(pkv, "_llopa_batch_layer_valid_masks", new_masks)
	setattr(pkv, "_tri_past_len_cache", None)


	def _direct_llopa_batch_prefill_cache_and_logits(
	model,
	*,
	segments: list[dict],
	pad_id: int,
	lower_k: int,
	prefill_attn: str,
	system_prefill: str,
	no_upper_attn: bool,
	see_past_assistant: bool,
	replay_module: str,
	replay_per_layers: int,
	device,
	):
	if not segments:
	return None
	if _normalize_replay_module_value(replay_module) != "none":
	return None
	seed_fn = _get_llopa_full_prompt_seed(model)
	if not callable(seed_fn):
	return None
	try:
	batch = _batched_segments_to_tensors(segments, device=device, pad_token_id=int(pad_id))
	except Exception:
	return None
	try:
	seed = seed_fn(
	input_ids=batch["prompt_ids"],
	attention_mask=batch["prompt_attention_mask"],
	use_cache=True,
	logits_to_keep=1,
	lower_k=int(lower_k),
	prefill_attn=str(prefill_attn),
	system_prefill=str(system_prefill),
	no_upper_attn=bool(no_upper_attn),
	prefill_lower_split_start=batch["split_starts"],
	prefill_lower_system_len=batch["system_lens"],
	prefill_lower_replay_user_prefix_keep_len=batch["replay_user_prefix_keep_lens"],
	prefill_lower_replay_user_start=batch["replay_user_starts"],
	prefill_lower_replay_user_len=batch["replay_user_lens"],
	assistant_header_starts=batch["assistant_header_starts"],
	assistant_turn_ends=batch["assistant_turn_ends"],
	assistant_header_start_mask=batch["assistant_header_start_mask"],
	prefill_lower_see_past_assistant=bool(see_past_assistant),
	replay_module=str(replay_module),
	replay_per_layers=int(replay_per_layers),
	)
	except Exception:
	return None
	if not isinstance(seed, tuple) or len(seed) != 4:
	return None
	pkv, S, U, logits = seed
	if pkv is None or not isinstance(logits, torch.Tensor) or logits.numel() == 0:
	return None
	if logits.dim() == 3:
	logits = logits[:, -1, :]
	if logits.dim() != 2 or int(logits.size(0)) != len(segments):
	return None
	return pkv, S, U, logits.to(device=device, dtype=torch.float32)


	def _direct_llopa_batch_generate_cached_impl(
	    model,
	    tokenizer,
	    *,
	    segments: list[dict],
	    canonical_input_ids: torch.Tensor,
	    context_width: int,
	    batch_size: int,
	    lower_k: int,
	    prefill_attn: str,
	    system_prefill: str,
	    no_upper_attn: bool,
	    see_past_assistant: bool,
	    replay_module: str,
	    replay_per_layers: int,
	    last_layer_module: Optional[str],
	    max_new_tokens: int,
	    min_new_tokens: int,
	    do_sample: bool,
	    logits_warpers,
	    stopping_criteria,
	    stop_ids: set[int],
	    stop_token_ids: Optional[torch.Tensor],
	    pad_id: int,
	    output_scores: bool,
	    compact_scores: bool,
	    return_dict_in_generate: bool,
	    device,
	):
	    """Decode a batch token by token on top of a prefilled LLoPA cache.

	    Seeding: for ``batch_size > 1`` the cache and first-step logits come from
	    ``_direct_llopa_batch_prefill_cache_and_logits``. When that returns
	    ``None`` (or the batch has a single row) every row is prefilled on its
	    own with ``_direct_prefill_lower_cache_and_logits`` and the row caches
	    are merged by ``_merge_llopa_batch_row_caches``. For ``batch_size > 1``
	    the per-row route is refused when the replay module is not ``"none"`` or
	    when ``see_past_assistant`` is set and a prompt bundle has past assistant
	    history.

	    Decoding: each step takes the pending (seed) logits or runs one decode
	    step on the previously chosen tokens, masks stop tokens while fewer than
	    ``min_new_tokens`` were produced, applies ``logits_warpers``, then samples
	    or takes the argmax. Rows that already finished receive ``pad_id``.

	    Args:
	        segments: One structured prompt segment dict per batch row.
	        canonical_input_ids: Prompt ids copied into the first
	            ``context_width`` columns of the returned sequences.
	        stop_ids: Token ids that finish a row once ``min_new_tokens`` is
	            reached.
	        stop_token_ids: Tensor form of the stop ids used to mask logits
	            before ``min_new_tokens``; ``None`` disables the masking.
	        output_scores / compact_scores: Record per-step logits, or (compact)
	            only the log-probability of each chosen token.

	    Returns:
	        ``None`` when this path cannot serve the request (no LLoPA core or
	        decode step, failed seeding, failed cache merge). Otherwise the
	        token tensor, or, if ``return_dict_in_generate`` is true, a
	        ``GenerateDecoderOnlyOutput`` with ``sequences``, ``past_key_values``
	        and either ``scores`` or a ``generated_token_logprobs`` attribute.
	    """
	    llopa_core = _get_llopa_core(model)
	    llopa_step = _get_llopa_decode_step(model)
	    if llopa_core is None or llopa_step is None:
	        return None
	    llopa_forward_assistant = getattr(llopa_core, "tri_forward_assistant", None)
	    decode_output_head = _get_output_head(model)
	    # The direct step calls the core forward and the output head ourselves;
	    # it is only usable when both exist and the env flag has not disabled it.
	    use_direct_decode_step = (
	        _env_flag_enabled("CAPSULE_LLOPA_DIRECT_DECODE_STEP", "1")
	        and callable(llopa_forward_assistant)
	        and decode_output_head is not None
	    )

	    prompt_bundles = [
	        _build_unified_prefill_lower_prompt_bundle(
	            tokenizer,
	            prompt_messages=None,
	            prompt_add_generation_prompt=True,
	            structured_prompt_segments=seg,
	            device=device,
	        )
	        for seg in segments
	    ]

	    has_past_assistant_history = bool(see_past_assistant) and any(
	        _prompt_bundle_has_past_assistant_history(bundle) for bundle in prompt_bundles
	    )
	    is_multi_row = int(batch_size) > 1
	    batch_seed = None
	    if is_multi_row:
	        batch_seed = _direct_llopa_batch_prefill_cache_and_logits(
	            model,
	            segments=segments,
	            pad_id=int(pad_id),
	            lower_k=int(lower_k),
	            prefill_attn=str(prefill_attn),
	            system_prefill=str(system_prefill),
	            no_upper_attn=bool(no_upper_attn),
	            see_past_assistant=bool(see_past_assistant),
	            replay_module=str(replay_module),
	            replay_per_layers=int(replay_per_layers),
	            device=device,
	        )

	    if batch_seed is not None:
	        pkv, _S, _U, pending_logits = batch_seed
	        pending_logits = pending_logits.to(device=device, dtype=torch.float32)
	    else:
	        # Per-row prefill + merge is not attempted for real batches that use
	        # a replay module or expose past assistant history.
	        if is_multi_row and _normalize_replay_module_value(replay_module) != "none":
	            return None
	        if is_multi_row and bool(has_past_assistant_history):
	            return None

	        row_pkvs = []
	        initial_logits_rows = []
	        for prompt_bundle in prompt_bundles:
	            seed = _direct_prefill_lower_cache_and_logits(
	                model,
	                prompt_bundle=prompt_bundle,
	                lower_k=int(lower_k),
	                prefill_attn=str(prefill_attn),
	                system_prefill=str(system_prefill),
	                no_upper_attn=bool(no_upper_attn),
	                see_past_assistant=bool(see_past_assistant),
	                replay_module=str(replay_module),
	                replay_per_layers=int(replay_per_layers),
	                last_layer_module=last_layer_module,
	                seed_mode="prefill_header",
	            )
	            if seed is None:
	                return None
	            pkv_row, _S, _U, initial_logits = seed
	            if not isinstance(initial_logits, torch.Tensor) or initial_logits.numel() == 0:
	                return None
	            row_pkvs.append(pkv_row)
	            initial_logits_rows.append(initial_logits.to(device=device, dtype=torch.float32))

	        pkv = _merge_llopa_batch_row_caches(row_pkvs, device=device)
	        if pkv is None:
	            return None
	        pending_logits = torch.cat(initial_logits_rows, dim=0)
	        # Drop the per-row references before the decode loop starts.
	        del row_pkvs, initial_logits_rows
	    del prompt_bundles

	    prompt_width = int(context_width)
	    max_steps = int(max_new_tokens)
	    min_steps = int(min_new_tokens)
	    pad_value = int(pad_id)

	    sequences = torch.full(
	        (batch_size, prompt_width + max_steps),
	        pad_value,
	        dtype=canonical_input_ids.dtype,
	        device=device,
	    )
	    sequences[:, :prompt_width] = canonical_input_ids.to(device=device)
	    # `generated` is a view: writing into it fills `sequences` in place.
	    generated = sequences[:, prompt_width:]
	    score_list: list[torch.Tensor] = []
	    score_list_append = score_list.append
	    record_scores = bool(output_scores)
	    record_compact_scores = record_scores and bool(compact_scores)
	    compact_logprob_list: list[torch.Tensor] = []
	    compact_logprob_append = compact_logprob_list.append
	    unfinished = torch.ones((batch_size,), device=device, dtype=torch.bool)
	    last = None
	    last_valid = torch.ones((batch_size,), device=device, dtype=torch.bool)
	    cur = 0

	    while cur < max_steps:
	        if pending_logits is None:
	            if not isinstance(last, torch.Tensor):
	                return None
	            if use_direct_decode_step:
	                out = llopa_forward_assistant(
	                    assistant_ids=last,
	                    lower_k=int(lower_k),
	                    pkv=pkv,
	                    S=0,
	                    U=0,
	                    write_cache=True,
	                    prefill_mode="lower",
	                    no_upper_attn=bool(no_upper_attn),
	                    align_cache_position_to_layer_past=True,
	                    replay_module=str(replay_module),
	                    replay_per_layers=int(replay_per_layers),
	                )
	                pkv = out.past_key_values or pkv
	                # Passes along which rows fed a real (non-pad) token this step.
	                _append_llopa_batch_cache_valid_masks(pkv, last_valid)
	                logits = decode_output_head(out.last_hidden_state[:, -1, :])
	                if logits.dim() == 3:
	                    logits = logits[:, -1, :]
	            else:
	                out = llopa_step(
	                    assistant_ids=last,
	                    lower_k=int(lower_k),
	                    pkv=pkv,
	                    S=0,
	                    U=0,
	                    logits_to_keep=1,
	                    labels=None,
	                    prefill_mode="lower",
	                    no_upper_attn=bool(no_upper_attn),
	                    align_cache_position_to_layer_past=True,
	                    replay_module=str(replay_module),
	                    replay_per_layers=int(replay_per_layers),
	                )
	                pkv = out.past_key_values or pkv
	                _append_llopa_batch_cache_valid_masks(pkv, last_valid)
	                logits = out.logits[:, -1, :]
	            # Private float32 copy: the in-place stop-token masking below
	            # must not touch the model's own output buffer.
	            logits = logits.to(dtype=torch.float32, device=device, copy=True)
	        else:
	            # First step: consume the logits produced by the prefill seed.
	            logits = pending_logits
	            pending_logits = None

	        if stop_token_ids is not None and cur < min_steps:
	            logits.index_fill_(1, stop_token_ids.to(device=logits.device), -float("inf"))
	        if logits_warpers is not None:
	            logits = logits_warpers(generated[:, :cur], logits)
	        if record_scores and not record_compact_scores:
	            score_list_append(logits.detach().clone())

	        if bool(do_sample):
	            probs = torch.softmax(logits, dim=-1)
	            next_tok = torch.multinomial(probs, num_samples=1)
	        else:
	            next_tok = torch.argmax(logits, dim=-1, keepdim=True)
	        if record_compact_scores:
	            next_logit = torch.gather(logits, 1, next_tok)
	            next_logprob = next_logit - torch.logsumexp(logits, dim=-1, keepdim=True)
	            compact_logprob_append(next_logprob.squeeze(-1).detach())

	        # Snapshot taken before this step's stop checks: a row's stop token
	        # still counts as a valid token.
	        token_valid = unfinished.clone()
	        if not bool(unfinished.all().item()):
	            next_tok = torch.where(
	                unfinished.view(-1, 1),
	                next_tok,
	                torch.full_like(next_tok, pad_value),
	            )

	        generated[:, cur : cur + 1] = next_tok.to(dtype=generated.dtype)
	        cur += 1

	        if stop_ids and cur >= min_steps:
	            flat_tok = next_tok.squeeze(1)
	            stop_mask = torch.zeros_like(unfinished)
	            for stop_id in stop_ids:
	                stop_mask |= flat_tok.eq(int(stop_id))
	            unfinished = unfinished & ~stop_mask

	        if stopping_criteria is not None:
	            sequences_now = sequences[:, : prompt_width + cur]
	            try:
	                stop_result = stopping_criteria(sequences_now, logits)
	            except TypeError:
	                # Retry for criteria that reject the scores argument.
	                stop_result = stopping_criteria(sequences_now, None)
	            if isinstance(stop_result, torch.Tensor):
	                stop_result = stop_result.to(device=device, dtype=torch.bool).reshape(-1)
	                if stop_result.numel() == 1:
	                    if bool(stop_result.item()):
	                        unfinished.zero_()
	                elif stop_result.numel() == batch_size:
	                    unfinished = unfinished & ~stop_result
	                elif bool(stop_result.all().item()):
	                    unfinished.zero_()
	            elif bool(stop_result):
	                unfinished.zero_()

	        if not bool(unfinished.any().item()):
	            break
	        last = next_tok
	        last_valid = token_valid

	    sequences = sequences[:, : prompt_width + cur]
	    if not bool(return_dict_in_generate):
	        return sequences
	    from transformers.generation.utils import GenerateDecoderOnlyOutput

	    output = GenerateDecoderOnlyOutput(
	        sequences=sequences,
	        scores=tuple(score_list) if record_scores and not record_compact_scores else None,
	        past_key_values=pkv,
	    )
	    if record_compact_scores:
	        if compact_logprob_list:
	            compact_logprobs = torch.stack(compact_logprob_list, dim=1)
	        else:
	            compact_logprobs = torch.empty(
	                (sequences.size(0), 0),
	                dtype=torch.float32,
	                device=sequences.device,
	            )
	        setattr(output, "generated_token_logprobs", compact_logprobs)
	    return output


	def _direct_llopa_batch_generate_serial_cached_fallback_impl(
	model,
	tokenizer,
	*,
	segments: list[dict],
	canonical_input_ids: torch.Tensor,
	context_width: int,
	batch_size: int,
	lower_k: int,
	prefill_attn: str,
	system_prefill: str,
	user_prefill: str,
	no_upper_attn: bool,
	see_past_assistant: bool,
	replay_module: str,
	replay_per_layers: int,
	last_layer_module: Optional[str],
	seed_mode: str,
	max_new_tokens: int,
	min_new_tokens: int,
	do_sample: bool,
	temperature,
	top_p,
	top_k,
	stopping_criteria,
	pad_id: int,
	eos_token_id,
	output_scores: bool,
	compact_scores: bool,
	return_dict_in_generate: bool,
	device,
	):
	if int(batch_size) <= 1:
	return None
	row_sequences: list[torch.Tensor] = []
	row_score_tuples: list[tuple] = []
	row_compact_tensors: list[torch.Tensor] = []
	score_template = None
	max_gen_len = 0
	record_scores = bool(output_scores)
	record_compact_scores = record_scores and bool(compact_scores)
	for row_idx, seg in enumerate(segments):
	prompt_ids = seg.get("prompt_ids") if isinstance(seg, dict) else None
	if not isinstance(prompt_ids, torch.Tensor) or prompt_ids.numel() == 0:
	return None
	row_prompt = prompt_ids.to(device=device, dtype=torch.long)
	if row_prompt.dim() == 1:
	row_prompt = row_prompt.unsqueeze(0)
	elif row_prompt.dim() == 2:
	row_prompt = row_prompt[:1]
	else:
	row_prompt = row_prompt.reshape(1, -1)
	row_prompt_len = int(row_prompt.size(1))
	row_out = _direct_llopa_generate_impl(
	model,
	tokenizer,
	prompt_messages=None,
	prompt_add_generation_prompt=True,
	structured_prompt_segments=seg,
	input_ids=row_prompt,
	attention_mask=torch.ones_like(row_prompt, device=device, dtype=torch.long),
	lower_k=int(lower_k),
	prefill_attn=str(prefill_attn),
	system_prefill=str(system_prefill),
	user_prefill=str(user_prefill),
	no_upper_attn=bool(no_upper_attn),
	see_past_assistant=bool(see_past_assistant),
	replay_module=str(replay_module),
	replay_per_layers=int(replay_per_layers),
	last_layer_module=last_layer_module,
	seed_mode=str(seed_mode or "prefill_header"),
	max_new_tokens=int(max_new_tokens),
	min_new_tokens=int(min_new_tokens),
	do_sample=bool(do_sample),
	temperature=temperature,
	top_p=top_p,
	top_k=top_k,
	stopping_criteria=stopping_criteria,
	pad_token_id=int(pad_id),
	eos_token_id=eos_token_id,
	output_scores=record_scores,
	compact_scores=record_compact_scores,
	return_dict_in_generate=True,
	use_cache=True,
	)
	if row_out is None:
	return None
	row_seq = getattr(row_out, "sequences", None)
	if not isinstance(row_seq, torch.Tensor) or row_seq.dim() != 2 or row_seq.size(0) < 1:
	return None
	row_gen = row_seq[:1, row_prompt_len:].to(device=device, dtype=canonical_input_ids.dtype)
	row_sequences.append(row_gen)
	max_gen_len = max(max_gen_len, int(row_gen.size(1)))
	if record_compact_scores:
	compact = getattr(row_out, "generated_token_logprobs", None)
	if isinstance(compact, torch.Tensor):
	row_compact_tensors.append(compact[:1].to(device=device, dtype=torch.float32))
	else:
	row_compact_tensors.append(torch.empty((1, 0), device=device, dtype=torch.float32))
	elif record_scores:
	scores = getattr(row_out, "scores", None)
	if isinstance(scores, tuple):
	row_score_tuples.append(scores)
	if score_template is None and len(scores) > 0 and isinstance(scores[0], torch.Tensor):
	score_template = scores[0][:1].to(device=device)
	else:
	row_score_tuples.append(())

	sequences = torch.full(
	(int(batch_size), int(context_width) + int(max_gen_len)),
	int(pad_id),
	dtype=canonical_input_ids.dtype,
	device=device,
	)
	sequences[:, : int(context_width)] = canonical_input_ids.to(device=device)
	for row_idx, row_gen in enumerate(row_sequences):
	width = int(row_gen.size(1))
	if width > 0:
	sequences[row_idx : row_idx + 1, int(context_width) : int(context_width) + width] = row_gen[:, :width]

	compact_logprobs = None
	score_list = None
	if record_compact_scores:
	compact_logprobs = torch.zeros(
	(int(batch_size), int(max_gen_len)),
	device=device,
	dtype=torch.float32,
	)
	for row_idx, compact in enumerate(row_compact_tensors):
	width = min(int(compact.size(1)), int(max_gen_len))
	if width > 0:
	compact_logprobs[row_idx : row_idx + 1, :width] = compact[:, :width]
	elif record_scores:
	if score_template is None:
	score_list = []
	else:
	score_list = []
	for step_idx in range(int(max_gen_len)):
	step_scores = []
	for scores in row_score_tuples:
	if step_idx < len(scores) and isinstance(scores[step_idx], torch.Tensor):
	step_scores.append(scores[step_idx][:1].to(device=device))
	else:
	step_scores.append(torch.zeros_like(score_template, device=device))
	score_list.append(torch.cat(step_scores, dim=0))

	if not bool(return_dict_in_generate):
	return sequences
	from transformers.generation.utils import GenerateDecoderOnlyOutput

	output = GenerateDecoderOnlyOutput(
	sequences=sequences,
	scores=tuple(score_list) if record_scores and not record_compact_scores and score_list is not None else None,
	past_key_values=None,
	)
	if compact_logprobs is not None:
	setattr(output, "generated_token_logprobs", compact_logprobs)
	return output


	@torch.inference_mode()
	def _direct_llopa_batch_generate_impl(
	    model,
	    tokenizer,
	    *,
	    prompt_messages,
	    prompt_add_generation_prompt: bool,
	    structured_prompt_segments=None,
	    input_ids: Optional[torch.LongTensor],
	    attention_mask: Optional[torch.Tensor],
	    lower_k: int,
	    prefill_attn: str,
	    system_prefill: str,
	    user_prefill: str,
	    no_upper_attn: bool,
	    see_past_assistant: bool = False,
	    replay_module: str = "none",
	    replay_per_layers: int = -1,
	    last_layer_module: Optional[str] = None,
	    seed_mode: str = "prefill_header",
	    max_length=None,
	    max_new_tokens=None,
	    min_length=None,
	    min_new_tokens=None,
	    do_sample=None,
	    temperature=None,
	    top_p=None,
	    top_k=None,
	    stopping_criteria=None,
	    pad_token_id=None,
	    eos_token_id=None,
	    output_scores: bool = False,
	    compact_scores: bool = False,
	    return_dict_in_generate: bool = False,
	    use_cache: Optional[bool] = None,
	):
	    """Batched LLoPA generation entry point with three execution strategies.

	    Unless ``use_cache`` is ``False`` the cached strategies are tried first:

	    1. ``_direct_llopa_batch_generate_cached_impl`` (shared batch cache);
	    2. ``_direct_llopa_batch_generate_serial_cached_fallback_impl`` (one
	       cached generation per row), only when ``stopping_criteria`` is
	       ``None``.

	    If neither returns a result, an uncached loop re-runs the full model on
	    ``prompt + generated-so-far`` for every new token.

	    The batch size is the number of structured segments, else the number of
	    prompt message lists, else ``input_ids.size(0)``, else 1.
	    ``attention_mask`` is accepted for signature compatibility but is not
	    read by this function.

	    Returns:
	        ``None`` when ``lower_k`` is not a positive int, when ``input_ids``
	        is not ``(batch_size, T)``, or when an uncached forward yields no
	        logits. Otherwise the token tensor, or, if ``return_dict_in_generate``
	        is true, a ``GenerateDecoderOnlyOutput`` with either ``scores`` or a
	        ``generated_token_logprobs`` attribute (compact mode).

	    Raises:
	        ValueError: if ``user_prefill`` is not ``"full"`` or the number of
	            built prompt segments differs from the batch size.
	    """
	    allow_cached_paths = use_cache is not False
	    batch_size = int(input_ids.size(0)) if isinstance(input_ids, torch.Tensor) and input_ids.dim() == 2 else 0
	    if _is_structured_segments_batch(structured_prompt_segments):
	        batch_size = len(structured_prompt_segments)
	    elif _is_prompt_messages_batch(prompt_messages):
	        batch_size = len(prompt_messages)
	    elif batch_size <= 0:
	        batch_size = 1

	    # `last_layer_module` only takes effect when no replay module was chosen.
	    if last_layer_module is not None and _normalize_replay_module_value(replay_module) == "none":
	        replay_module = last_layer_module
	    replay_module = _normalize_replay_module_value(replay_module)
	    replay_per_layers = _normalize_replay_per_layers_value(replay_per_layers)

	    try:
	        lower_k = int(lower_k)
	    except Exception:
	        lower_k = 0
	    if lower_k <= 0:
	        return None

	    # Unknown option strings silently fall back to the defaults.
	    attn = (prefill_attn or "causal").strip().lower()
	    if attn == "prefix_full":
	        attn = "full"
	    if attn not in {"causal", "full"}:
	        attn = "causal"

	    sys_prefill = (system_prefill or "full").strip().lower()
	    if sys_prefill not in {"full", "no_system", "no_bos_system"}:
	        sys_prefill = "full"

	    user_prefill_norm = (user_prefill or "full").strip().lower()
	    if user_prefill_norm != "full":
	        raise ValueError("llopa_v2_batch_generate currently supports only user_prefill='full'.")

	    if isinstance(input_ids, torch.Tensor):
	        if input_ids.dim() != 2 or input_ids.size(0) != batch_size:
	            return None
	        device = input_ids.device
	    else:
	        try:
	            device = next(model.parameters()).device
	        except Exception:
	            device = "cpu"

	    # Pad id preference: explicit argument, tokenizer pad, tokenizer eos, 0.
	    pad_id = pad_token_id
	    if pad_id is None:
	        pad_id = getattr(tokenizer, "pad_token_id", None)
	    if pad_id is None:
	        pad_id = getattr(tokenizer, "eos_token_id", None)
	    if pad_id is None:
	        pad_id = 0
	    pad_id = int(pad_id)

	    segments = _build_batched_structured_prompt_segments(
	        tokenizer,
	        prompt_messages=prompt_messages,
	        prompt_add_generation_prompt=prompt_add_generation_prompt,
	        structured_prompt_segments=structured_prompt_segments,
	        device=device,
	    )
	    if len(segments) != batch_size:
	        raise ValueError(
	            f"llopa_v2_batch_generate prompt metadata batch size mismatch: {len(segments)} != {batch_size}"
	        )
	    batch = _batched_segments_to_tensors(segments, device=device, pad_token_id=pad_id)

	    if isinstance(input_ids, torch.Tensor):
	        canonical_input_ids = input_ids.to(device=device)
	    else:
	        canonical_input_ids = batch["prompt_ids"]
	    context_width = int(canonical_input_ids.size(1))

	    # With no explicit `do_sample`, a non-zero temperature enables sampling.
	    raw_temp = 0.0 if temperature is None else float(temperature)
	    if do_sample is None:
	        do_sample = bool(raw_temp != 0.0)
	    do_sample = bool(do_sample)
	    sample_temp = 1.0 if (not do_sample or raw_temp == 0.0) else float(raw_temp)
	    top_p = 1.0 if top_p is None else float(top_p)
	    top_k = None if top_k is None else int(top_k)

	    # Token budgets: explicit *_new_tokens wins, then *_length minus the
	    # prompt width, then the defaults (256 / 0).
	    if max_new_tokens is None:
	        if max_length is None:
	            max_new_tokens = 256
	        else:
	            max_new_tokens = max(0, int(max_length) - int(context_width))
	    else:
	        max_new_tokens = int(max_new_tokens)
	    if min_new_tokens is None:
	        if min_length is None:
	            min_new_tokens = 0
	        else:
	            min_new_tokens = max(0, int(min_length) - int(context_width))
	    else:
	        min_new_tokens = int(min_new_tokens)
	    max_new_tokens = int(max_new_tokens)
	    min_new_tokens = int(min_new_tokens)

	    stop_ids = set(_normalize_eos_token_ids(eos_token_id))
	    if not stop_ids:
	        tok_eos = getattr(tokenizer, "eos_token_id", None)
	        if tok_eos is not None:
	            stop_ids.add(int(tok_eos))
	    # NOTE(review): the end-of-turn token is added here even when the caller
	    # supplied explicit eos ids — confirm this is the intended nesting.
	    with contextlib.suppress(Exception):
	        eot_id = tokenizer.convert_tokens_to_ids("<|eot_id|>")
	        if eot_id is not None and eot_id != tokenizer.unk_token_id:
	            stop_ids.add(int(eot_id))

	    stop_token_ids = _prepare_stop_token_tensor(stop_ids, device)
	    logits_warpers = _build_sampling_warpers(do_sample, sample_temp, top_p, top_k)
	    record_scores = bool(output_scores)
	    record_compact_scores = record_scores and bool(compact_scores)
	    should_apply_stopping_criteria = stopping_criteria is not None

	    cached_out = None
	    if bool(allow_cached_paths):
	        cached_out = _direct_llopa_batch_generate_cached_impl(
	            model,
	            tokenizer,
	            segments=segments,
	            canonical_input_ids=canonical_input_ids,
	            context_width=int(context_width),
	            batch_size=int(batch_size),
	            lower_k=int(lower_k),
	            prefill_attn=str(attn),
	            system_prefill=str(sys_prefill),
	            no_upper_attn=bool(no_upper_attn),
	            see_past_assistant=bool(see_past_assistant),
	            replay_module=str(replay_module),
	            replay_per_layers=int(replay_per_layers),
	            last_layer_module=last_layer_module,
	            max_new_tokens=int(max_new_tokens),
	            min_new_tokens=int(min_new_tokens),
	            do_sample=bool(do_sample),
	            logits_warpers=logits_warpers,
	            stopping_criteria=stopping_criteria,
	            stop_ids=stop_ids,
	            stop_token_ids=stop_token_ids,
	            pad_id=int(pad_id),
	            output_scores=record_scores,
	            compact_scores=record_compact_scores,
	            return_dict_in_generate=bool(return_dict_in_generate),
	            device=device,
	        )
	    if cached_out is not None:
	        return cached_out

	    serial_cached_out = None
	    # The serial fallback is only tried when no stopping criteria were given.
	    if bool(allow_cached_paths) and stopping_criteria is None:
	        serial_cached_out = _direct_llopa_batch_generate_serial_cached_fallback_impl(
	            model,
	            tokenizer,
	            segments=segments,
	            canonical_input_ids=canonical_input_ids,
	            context_width=int(context_width),
	            batch_size=int(batch_size),
	            lower_k=int(lower_k),
	            prefill_attn=str(attn),
	            system_prefill=str(sys_prefill),
	            user_prefill=str(user_prefill_norm),
	            no_upper_attn=bool(no_upper_attn),
	            see_past_assistant=bool(see_past_assistant),
	            replay_module=str(replay_module),
	            replay_per_layers=int(replay_per_layers),
	            last_layer_module=last_layer_module,
	            seed_mode=str(seed_mode or "prefill_header"),
	            max_new_tokens=int(max_new_tokens),
	            min_new_tokens=int(min_new_tokens),
	            do_sample=bool(do_sample),
	            temperature=temperature,
	            top_p=top_p,
	            top_k=top_k,
	            stopping_criteria=stopping_criteria,
	            pad_id=int(pad_id),
	            eos_token_id=eos_token_id,
	            output_scores=record_scores,
	            compact_scores=record_compact_scores,
	            return_dict_in_generate=bool(return_dict_in_generate),
	            device=device,
	        )
	    if serial_cached_out is not None:
	        return serial_cached_out

	    # ---- Uncached strategy: full re-forward for every generated token. ----
	    sequences = torch.full(
	        (batch_size, context_width + max_new_tokens),
	        pad_id,
	        dtype=canonical_input_ids.dtype,
	        device=device,
	    )
	    sequences[:, :context_width] = canonical_input_ids
	    # `generated` is a view: writing into it fills `sequences` in place.
	    generated = sequences[:, context_width:]
	    score_list: list[torch.Tensor] = []
	    score_list_append = score_list.append
	    compact_logprob_list: list[torch.Tensor] = []
	    compact_logprob_append = compact_logprob_list.append

	    split_starts = batch["split_starts"]
	    system_lens = batch["system_lens"]
	    # Step-invariant data, converted once instead of on every decode step.
	    prompt_rows: list[torch.Tensor] = [
	        row.to(device=device, dtype=torch.long) for row in batch["prompt_rows"]
	    ]
	    split_start_ints = [int(split_starts[row_idx].item()) for row_idx in range(len(prompt_rows))]
	    unfinished = torch.ones((batch_size,), device=device, dtype=torch.bool)
	    finish_steps = torch.full((batch_size,), max_new_tokens, device=device, dtype=torch.long)

	    def _build_step_tensors(cur_tokens: int) -> tuple[torch.LongTensor, torch.Tensor, torch.LongTensor]:
	        """Build padded ids, attention mask and labels for one decode step."""
	        cur_tokens = int(cur_tokens)
	        # One host transfer per tensor instead of one `.item()` per row.
	        alive = unfinished.tolist()
	        done_at = finish_steps.tolist()
	        rows = []
	        for row_idx, prompt_row in enumerate(prompt_rows):
	            # Finished rows stay frozen at the length they had when they stopped.
	            gen_len = cur_tokens if alive[row_idx] else min(cur_tokens, int(done_at[row_idx]))
	            gen_prefix = generated[row_idx, :gen_len].to(device=device, dtype=torch.long)
	            rows.append(torch.cat([prompt_row, gen_prefix], dim=0))
	        active_lens = [int(row.numel()) for row in rows]
	        step_ids = _pad_1d_rows(rows, pad_value=pad_id, device=device, dtype=torch.long)
	        active_lens_t = torch.tensor(active_lens, device=device, dtype=torch.long)
	        step_mask = (
	            torch.arange(step_ids.size(1), device=device, dtype=torch.long).unsqueeze(0)
	            < active_lens_t.unsqueeze(1)
	        ).to(dtype=torch.long)
	        # Labels are -100 everywhere except [split_start, active_len) per row.
	        labels = torch.full_like(step_ids, -100)
	        for row_idx, active_len in enumerate(active_lens):
	            start = split_start_ints[row_idx]
	            if active_len > start:
	                labels[row_idx, start:active_len] = step_ids[row_idx, start:active_len]
	        return step_ids, step_mask, labels

	    cur = 0
	    while cur < max_new_tokens:
	        step_ids, step_mask, labels = _build_step_tensors(cur)
	        out = model(
	            input_ids=step_ids,
	            attention_mask=step_mask,
	            labels=labels,
	            use_cache=False,
	            logits_to_keep=1,
	            prefill_lower_layers=int(lower_k),
	            prefill_lower_attn=str(attn),
	            prefill_lower_system_prefill=str(sys_prefill),
	            prefill_lower_no_upper_attn=bool(no_upper_attn),
	            prefill_lower_split_start=split_starts,
	            prefill_lower_system_len=system_lens,
	            prefill_lower_replay_user_prefix_keep_len=batch["replay_user_prefix_keep_lens"],
	            prefill_lower_replay_user_start=batch["replay_user_starts"],
	            prefill_lower_replay_user_len=batch["replay_user_lens"],
	            assistant_header_starts=batch["assistant_header_starts"],
	            assistant_turn_ends=batch["assistant_turn_ends"],
	            assistant_header_start_mask=batch["assistant_header_start_mask"],
	            prefill_lower_see_past_assistant=bool(see_past_assistant),
	            prefill_lower_replay_module=str(replay_module),
	            prefill_lower_replay_per_layers=int(replay_per_layers),
	        )
	        if out is None or not isinstance(getattr(out, "logits", None), torch.Tensor):
	            return None
	        # Private float32 copy so the in-place masking below is safe.
	        logits = out.logits[:, -1, :].to(dtype=torch.float32, device=device, copy=True)

	        if stop_token_ids is not None and cur < min_new_tokens:
	            logits.index_fill_(1, stop_token_ids, -float("inf"))
	        if logits_warpers is not None:
	            logits = logits_warpers(generated[:, :cur], logits)
	        if record_scores and not record_compact_scores:
	            score_list_append(logits.detach().clone())

	        if do_sample:
	            probs = torch.softmax(logits, dim=-1)
	            next_tok = torch.multinomial(probs, num_samples=1)
	        else:
	            next_tok = torch.argmax(logits, dim=-1, keepdim=True)
	        if record_compact_scores:
	            next_logit = torch.gather(logits, 1, next_tok)
	            next_logprob = next_logit - torch.logsumexp(logits, dim=-1, keepdim=True)
	            compact_logprob_append(next_logprob.squeeze(-1).detach())
	        if not bool(unfinished.all().item()):
	            # Finished rows keep emitting the pad token.
	            next_tok = torch.where(
	                unfinished.view(-1, 1),
	                next_tok,
	                torch.full_like(next_tok, pad_id),
	            )

	        generated[:, cur : cur + 1] = next_tok.to(dtype=generated.dtype)
	        cur += 1

	        if stop_ids and cur >= min_new_tokens:
	            flat_tok = next_tok.squeeze(1)
	            stop_mask = torch.zeros_like(unfinished)
	            for stop_id in stop_ids:
	                stop_mask |= flat_tok.eq(int(stop_id))
	            # An all-False mask leaves `finish_steps` untouched, so no
	            # host-side `.any()` check is needed before the update.
	            finish_steps = torch.where(
	                unfinished & stop_mask,
	                torch.full_like(finish_steps, int(cur)),
	                finish_steps,
	            )
	            unfinished = unfinished & ~stop_mask

	        if should_apply_stopping_criteria:
	            sequences_now = sequences[:, : context_width + cur]
	            try:
	                stop_result = stopping_criteria(sequences_now, logits)
	            except TypeError:
	                # Retry for criteria that reject the scores argument.
	                stop_result = stopping_criteria(sequences_now, None)
	            if isinstance(stop_result, torch.Tensor):
	                stop_result = stop_result.to(device=device, dtype=torch.bool).reshape(-1)
	                if stop_result.numel() == 1:
	                    if bool(stop_result.item()):
	                        unfinished.zero_()
	                elif stop_result.numel() == batch_size:
	                    finish_steps = torch.where(
	                        unfinished & stop_result,
	                        torch.full_like(finish_steps, int(cur)),
	                        finish_steps,
	                    )
	                    unfinished = unfinished & ~stop_result
	                elif bool(stop_result.all().item()):
	                    unfinished.zero_()
	            elif bool(stop_result):
	                unfinished.zero_()

	        if not bool(unfinished.any().item()):
	            break

	    sequences = sequences[:, : context_width + cur]
	    if not bool(return_dict_in_generate):
	        return sequences
	    from transformers.generation.utils import GenerateDecoderOnlyOutput

	    output = GenerateDecoderOnlyOutput(
	        sequences=sequences,
	        scores=tuple(score_list) if record_scores and not record_compact_scores else None,
	        past_key_values=None,
	    )
	    if record_compact_scores:
	        if compact_logprob_list:
	            compact_logprobs = torch.stack(compact_logprob_list, dim=1)
	        else:
	            compact_logprobs = torch.empty(
	                (sequences.size(0), 0),
	                dtype=torch.float32,
	                device=sequences.device,
	            )
	        setattr(output, "generated_token_logprobs", compact_logprobs)
	    return output


	def _direct_optimized_llopa_generate_impl(
	model,
	tokenizer,
	*,
	prompt_messages,
	prompt_add_generation_prompt: bool,
	structured_prompt_segments=None,
	input_ids: Optional[torch.LongTensor],
	attention_mask: Optional[torch.Tensor],
	lower_k: int,
	prefill_attn: str,
	system_prefill: str,
	user_prefill: str,
	no_upper_attn: bool,
	see_past_assistant: bool = False,
	replay_module: str = "none",
	replay_per_layers: int = -1,
	last_layer_module: Optional[str] = None,
	optimized_variant: Optional[str] = None,
	optimized_seed_mode: Optional[str] = None,
	optimized_upper_prepare_mode: Optional[str] = None,
	optimized_upper_bucket_multiple: Optional[int] = None,
	optimized_seq_bucket_multiple: Optional[int] = None,
	max_length=None,
	max_new_tokens=None,
	min_length=None,
	min_new_tokens=None,
	do_sample=None,
	temperature=None,
	top_p=None,
	top_k=None,
	stopping_criteria=None,
	pad_token_id=None,
	eos_token_id=None,
	output_scores: bool = False,
	compact_scores: bool = False,
	return_dict_in_generate: bool = False,
	use_cache: Optional[bool] = None,
	):
	if last_layer_module is not None and _normalize_replay_module_value(replay_module) == "none":
	replay_module = last_layer_module
	replay_module = _normalize_replay_module_value(replay_module)
	replay_per_layers = _normalize_replay_per_layers_value(replay_per_layers)
	if use_cache is False:
	return None

	try:
	lower_k = int(lower_k)
	except Exception:
	lower_k = 0
	if lower_k <= 0:
	return None

	attn = (prefill_attn or "causal").strip().lower()
	if attn == "prefix_full":
	attn = "full"
	if attn not in {"causal", "full"}:
	attn = "causal"

	sys_prefill = (system_prefill or "full").strip().lower()
	if sys_prefill not in {"full", "no_system", "no_bos_system"}:
	sys_prefill = "full"

	user_prefill_norm = (user_prefill or "full").strip().lower()
	if user_prefill_norm != "full":
	raise ValueError("Optimized LLoPA currently supports only user_prefill='full'.")

	optimized_settings = _resolve_optimized_llopa_settings(
	variant=optimized_variant,
	seed_mode=optimized_seed_mode,
	upper_prepare_mode=optimized_upper_prepare_mode,
	upper_bucket_multiple=optimized_upper_bucket_multiple,
	seq_bucket_multiple=optimized_seq_bucket_multiple,
	)

	llopa_core = _get_llopa_core(model)
	llopa_step = _get_llopa_decode_step(model)
	if llopa_core is None or llopa_step is None:
	return None
	llopa_forward_assistant = getattr(llopa_core, "tri_forward_assistant", None)
	decode_output_head = _get_output_head(model)
	use_direct_decode_step = (
	_env_flag_enabled("CAPSULE_LLOPA_DIRECT_DECODE_STEP", "1")
	and callable(llopa_forward_assistant)
	and decode_output_head is not None
	)

	device = None
	if isinstance(input_ids, torch.Tensor):
	if input_ids.dim() != 2 or input_ids.size(0) != 1:
	return None
	device = input_ids.device
	if device is None:
	try:
	device = next(model.parameters()).device
	except Exception:
	device = "cpu"

	segments = structured_prompt_segments if isinstance(structured_prompt_segments, dict) else None
	if segments is None:
	segments = _build_structured_prompt_segments(
	tokenizer,
	prompt_messages,
	prompt_add_generation_prompt=bool(prompt_add_generation_prompt),
	device=device,
	)
	prompt_ids = segments["prompt_ids"]
	system_ids = segments["system_ids"]
	user_ids = segments["user_ids"]
	assistant_prefill_ids = segments["assistant_prefill_ids"]
	replay_user_prefix_keep_len = int(segments.get("replay_user_prefix_keep_len", 0) or 0)
	replay_user_start = int(segments.get("replay_user_start", 0) or 0)
	replay_user_len = int(segments.get("replay_user_len", 0) or 0)
	if assistant_prefill_ids.numel() == 0:
	return None

	raw_temp = 0.0 if temperature is None else float(temperature)
	if do_sample is None:
	do_sample = bool(raw_temp != 0.0)
	do_sample = bool(do_sample)
	sample_temp = 1.0 if (not do_sample or raw_temp == 0.0) else float(raw_temp)
	top_p = 1.0 if top_p is None else float(top_p)
	top_k = None if top_k is None else int(top_k)

	initial_logits = None
	prompt_bundle = _build_unified_prefill_lower_prompt_bundle(
	tokenizer,
	prompt_messages=prompt_messages,
	prompt_add_generation_prompt=bool(prompt_add_generation_prompt),
	structured_prompt_segments=segments,
	device=device,
	)
	with _temporary_model_attrs(
	model,
	_optimized_llopa_variant=optimized_settings["variant"],
	_optimized_llopa_seed_mode=optimized_settings["seed_mode"],
	_optimized_llopa_upper_prepare_mode=optimized_settings["upper_prepare_mode"],
	_optimized_llopa_upper_bucket_multiple=int(optimized_settings["upper_bucket_multiple"]),
	_optimized_llopa_seq_bucket_multiple=int(optimized_settings["seq_bucket_multiple"]),
	):
	reference_seed = _optimized_prefill_lower_cache_and_logits(
	model,
	prompt_bundle=prompt_bundle,
	lower_k=lower_k,
	prefill_attn=attn,
	system_prefill=sys_prefill,
	no_upper_attn=bool(no_upper_attn),
	see_past_assistant=bool(see_past_assistant),
	replay_module=str(replay_module),
	replay_per_layers=int(replay_per_layers),
	seed_mode=optimized_settings["seed_mode"],
	)
	canonical_input_ids = None
	if isinstance(input_ids, torch.Tensor) and input_ids.dim() == 2 and input_ids.size(0) == 1:
	valid_len = int(input_ids.size(1))
	if isinstance(attention_mask, torch.Tensor) and attention_mask.dim() == 2 and attention_mask.size(0) == 1:
	valid_len = int(attention_mask[0].sum().item())
	if valid_len > 0:
	canonical_input_ids = input_ids[:, -valid_len:]
	if not isinstance(canonical_input_ids, torch.Tensor):
	canonical_input_ids = prompt_bundle.get("effective_prompt_ids")
	if not isinstance(canonical_input_ids, torch.Tensor):
	canonical_input_ids = prompt_ids
	total_prompt_len = int(canonical_input_ids.size(1))
	if max_new_tokens is None:
	if max_length is None:
	max_new_tokens = 256
	else:
	max_new_tokens = max(0, int(max_length) - total_prompt_len)
	else:
	max_new_tokens = int(max_new_tokens)
	if min_new_tokens is None:
	if min_length is None:
	min_new_tokens = 0
	else:
	min_new_tokens = max(0, int(min_length) - total_prompt_len)
	else:
	min_new_tokens = int(min_new_tokens)
	if reference_seed is not None:
	pkv, S, U, initial_logits = reference_seed
	else:
	output_head = _get_output_head(model)
	if bool(no_upper_attn):
	pkv, S, U = _llopa_prefill_cache(
	llopa_core,
	system_ids,
	user_ids,
	assistant_prefill_ids,
	lower_k=lower_k,
	prefill_mode="lower",
	prefill_attn=attn,
	system_prefill=sys_prefill,
	replay_user_prefix_keep_len=replay_user_prefix_keep_len,
	replay_user_start=replay_user_start,
	replay_user_len=replay_user_len,
	)
	else:
	pkv, S, U, last_hidden = _llopa_prefill_cache(
	llopa_core,
	system_ids,
	user_ids,
	assistant_prefill_ids,
	lower_k=lower_k,
	prefill_mode="lower",
	prefill_attn=attn,
	system_prefill=sys_prefill,
	return_last_assistant_hidden=bool(output_head is not None),
	replay_user_prefix_keep_len=replay_user_prefix_keep_len,
	replay_user_start=replay_user_start,
	replay_user_len=replay_user_len,
	)
	if output_head is not None and isinstance(last_hidden, torch.Tensor) and last_hidden.numel() > 0:
	initial_logits = output_head(last_hidden)[:, -1, :].to(torch.float32)

	stop_ids = set(_normalize_eos_token_ids(eos_token_id))
	if not stop_ids:
	tok_eos = getattr(tokenizer, "eos_token_id", None)
	if tok_eos is not None:
	stop_ids.add(int(tok_eos))
	with contextlib.suppress(Exception):
	eot_id = tokenizer.convert_tokens_to_ids("<\|eot_id\|>")
	if eot_id is not None and eot_id != tokenizer.unk_token_id:
	stop_ids.add(int(eot_id))

	last = assistant_prefill_ids[:, -1:]
	stop_token_ids = _prepare_stop_token_tensor(stop_ids, last.device)
	logits_warpers = _build_sampling_warpers(do_sample, sample_temp, top_p, top_k)
	max_new_tokens = int(max_new_tokens)
	min_new_tokens = int(min_new_tokens)
	lower_k = int(lower_k)
	no_upper_attn_bool = bool(no_upper_attn)
	replay_module_str = str(replay_module)
	replay_per_layers_int = int(replay_per_layers)
	record_scores = bool(output_scores)
	record_compact_scores = bool(output_scores) and bool(compact_scores)
	should_apply_stopping_criteria = stopping_criteria is not None
	bucket_multiple = int(optimized_settings["seq_bucket_multiple"] or 256)
	sequences_full, _ = _acquire_bucketed_sequence_workspace(
	model,
	reference_ids=canonical_input_ids,
	batch_size=int(canonical_input_ids.size(0)),
	total_len=int(total_prompt_len + max_new_tokens),
	bucket_multiple=bucket_multiple,
	)
	sequences_full[:, :total_prompt_len] = canonical_input_ids
	sequences = sequences_full[:, : total_prompt_len + max_new_tokens]
	generated = sequences[:, total_prompt_len:]
	score_list: list[torch.Tensor] = []
	score_list_append = score_list.append
	compact_logprob_list: list[torch.Tensor] = []
	compact_logprob_append = compact_logprob_list.append
	cur = 0
	pending_logits = initial_logits

	while cur < max_new_tokens:
	out = None
	if pending_logits is None:
	if use_direct_decode_step:
	out = llopa_forward_assistant(
	assistant_ids=last,
	lower_k=lower_k,
	pkv=pkv,
	S=S,
	U=U,
	write_cache=True,
	prefill_mode="lower",
	no_upper_attn=no_upper_attn_bool,
	align_cache_position_to_layer_past=False,
	replay_module=replay_module_str,
	replay_per_layers=replay_per_layers_int,
	)
	pkv = out.past_key_values or pkv
	logits = decode_output_head(out.last_hidden_state[:, -1, :])
	if logits.dim() == 3:
	logits = logits[:, -1, :]
	else:
	out = llopa_step(
	assistant_ids=last,
	lower_k=lower_k,
	pkv=pkv,
	S=S,
	U=U,
	logits_to_keep=1,
	labels=None,
	prefill_mode="lower",
	no_upper_attn=no_upper_attn_bool,
	align_cache_position_to_layer_past=False,
	replay_module=replay_module_str,
	replay_per_layers=replay_per_layers_int,
	)
	pkv = out.past_key_values or pkv
	logits = out.logits[:, -1, :]
	logits = logits.to(dtype=torch.float32, device=last.device, copy=True)
	else:
	logits = pending_logits
	pending_logits = None

	if stop_token_ids is not None and cur < min_new_tokens:
	logits.index_fill_(1, stop_token_ids, -float("inf"))
	if logits_warpers is not None:
	logits = logits_warpers(generated[:, :cur], logits)
	if record_scores and not record_compact_scores:
	score_list_append(logits.detach().clone())

	if do_sample:
	probs = torch.softmax(logits, dim=-1)
	next_tok = torch.multinomial(probs, num_samples=1)
	else:
	next_tok = torch.argmax(logits, dim=-1, keepdim=True)
	if record_compact_scores:
	next_logit = torch.gather(logits, 1, next_tok)
	next_logprob = next_logit - torch.logsumexp(logits, dim=-1, keepdim=True)
	compact_logprob_append(next_logprob.squeeze(-1).detach())

	generated[:, cur : cur + 1] = next_tok
	cur += 1

	should_stop = False
	tok_id = int(next_tok.item())
	if tok_id in stop_ids and cur >= min_new_tokens:
	should_stop = True
	if (not should_stop) and should_apply_stopping_criteria:
	sequences_now = sequences[:, : total_prompt_len + cur]
	try:
	should_stop = bool(stopping_criteria(sequences_now, logits))
	except TypeError:
	should_stop = bool(stopping_criteria(sequences_now, None))
	if out is not None:
	del out
	if should_stop:
	break
	last = next_tok

	sequences = sequences[:, : total_prompt_len + cur]
	if not bool(return_dict_in_generate):
	return sequences
	from transformers.generation.utils import GenerateDecoderOnlyOutput

	output = GenerateDecoderOnlyOutput(
	sequences=sequences,
	scores=tuple(score_list) if bool(output_scores) and not record_compact_scores else None,
	past_key_values=pkv,
	)
	if record_compact_scores:
	if compact_logprob_list:
	compact_logprobs = torch.stack(compact_logprob_list, dim=1)
	else:
	compact_logprobs = torch.empty(
	(sequences.size(0), 0),
	dtype=torch.float32,
	device=sequences.device,
	)
	setattr(output, "generated_token_logprobs", compact_logprobs)
	return output


	def _normalize_replay_module_value(value: Optional[str]) -> str:
	raw = str(value or "none").strip().lower()
	aliases = {
	"": "none",
	"off": "none",
	"disabled": "none",
	"disable": "none",
	"self-attention": "self",
	"self_attention": "self",
	"selfattn": "self",
	"self_attn": "self",
	"cross-attention": "cross",
	"cross_attention": "cross",
	"crossattn": "cross",
	"cross_attn": "cross",
	}
	raw = aliases.get(raw, raw)
	if raw in {"none", "self", "cross"}:
	return raw
	return "none"


	def _normalize_replay_per_layers_value(value) -> int:
	try:
	normalized = int(value)
	except Exception:
	return -1
	if normalized == -1 or normalized >= 1:
	return normalized
	return -1


	def _normalize_structured_llopa_runtime(
	    lower_k: int,
	    *,
	    prefill_attn: str = "causal",
	    system_prefill: str = "full",
	    user_prefill: str = "full",
	    no_upper_attn: bool = False,
	    see_past_assistant: bool = False,
	    replay_module: str = "none",
	    replay_per_layers: int = -1,
	    last_layer_module: Optional[str] = None,
	) -> Optional[Tuple[int, str, str, str, bool, str, int]]:
	    """Validate and canonicalize the structured LLoPA runtime settings.

	    Args:
	        lower_k: Layer count; values that ``int()`` rejects are treated as 0.
	        prefill_attn: ``"causal"`` or ``"full"``; ``"prefix_full"`` is an
	            alias for ``"full"``.
	        system_prefill: ``"full"``, ``"no_system"`` or ``"no_bos_system"``;
	            any other value is replaced by ``"full"``.
	        user_prefill: Stripped and lower-cased, otherwise passed through.
	        no_upper_attn: Coerced to ``bool``.
	        see_past_assistant: Accepted for signature parity with the attach
	            helpers; not read by this function.
	        replay_module: Replay-module setting, canonicalized by
	            ``_normalize_replay_module_value``.
	        replay_per_layers: Canonicalized by
	            ``_normalize_replay_per_layers_value``.
	        last_layer_module: Used in place of ``replay_module`` when that one
	            normalizes to ``"none"``.

	    Returns:
	        ``(lower_k, attn, system_prefill, user_prefill, no_upper_attn,
	        replay_module, replay_per_layers)``, or ``None`` when ``lower_k`` is
	        not positive or the attention mode is not ``"causal"``/``"full"``.
	    """
	    try:
	        lower_k = int(lower_k)
	    except Exception:
	        lower_k = 0

	    attn = (prefill_attn or "causal").strip().lower()
	    if attn == "prefix_full":
	        attn = "full"

	    sys_prefill = (system_prefill or "full").strip().lower()
	    if sys_prefill not in {"full", "no_system", "no_bos_system"}:
	        sys_prefill = "full"

	    user_prefill_norm = (user_prefill or "full").strip().lower()

	    # An explicit replay_module wins; last_layer_module only fills in when
	    # replay_module is effectively "none".
	    if last_layer_module is not None and _normalize_replay_module_value(replay_module) == "none":
	        replay_module = last_layer_module
	    replay_module_norm = _normalize_replay_module_value(replay_module)
	    replay_per_layers_norm = _normalize_replay_per_layers_value(replay_per_layers)

	    if lower_k <= 0 or attn not in {"causal", "full"}:
	        return None
	    return (
	        int(lower_k),
	        attn,
	        sys_prefill,
	        user_prefill_norm,
	        bool(no_upper_attn),
	        replay_module_norm,
	        replay_per_layers_norm,
	    )


	def _attach_structured_llopa_generate(model, tokenizer) -> None:
	try:
	import types
	except Exception:
	return

	if getattr(model, "_structured_llopa_generate_attached", False):
	return

	orig_generate = getattr(model, "generate", None)
	if not callable(orig_generate):
	return

	def _structured_llopa_generate(self, args, *kwargs):
	if args:
	if "input_ids" not in kwargs:
	kwargs["input_ids"] = args[0]
	args = args[1:]
	if args:
	return orig_generate(args, *kwargs)

	optimized_enabled = kwargs.pop("optimized_llopa_generate", None)
	llopa_v2_batch_enabled = kwargs.pop("llopa_v2_batch_generate", None)
	llopa_v2_enabled = kwargs.pop("llopa_v2_generate", None)
	llopa_v3_enabled = kwargs.pop("llopa_v3_generate", None)
	runtime_solo_enabled = kwargs.pop("runtime_solo_generate", None)
	runtime_solo_v2_enabled = kwargs.pop("runtime_solo_v2_generate", None)
	unified_enabled = kwargs.pop("unified_llopa_generate", None)
	direct_enabled = kwargs.pop("direct_llopa_generate", None)
	legacy_search = bool(kwargs.pop("direct_llopa_legacy_search", False))
	prompt_messages = kwargs.pop("prompt_messages", None)
	prompt_add_generation_prompt = kwargs.pop("prompt_add_generation_prompt", None)
	structured_prompt_segments = kwargs.pop("structured_prompt_segments", None)
	compact_scores = bool(kwargs.pop("capsule_compact_scores", False))

	mode = None
	if optimized_enabled is not None:
	if bool(optimized_enabled):
	mode = "optimized"
	elif llopa_v2_batch_enabled is None and llopa_v2_enabled is None and llopa_v3_enabled is None and runtime_solo_enabled is None and runtime_solo_v2_enabled is None and unified_enabled is None and direct_enabled is None:
	return orig_generate(**kwargs)
	if llopa_v2_batch_enabled is not None:
	if bool(llopa_v2_batch_enabled):
	mode = "llopa_v2_batch"
	elif llopa_v2_enabled is None and llopa_v3_enabled is None and runtime_solo_enabled is None and runtime_solo_v2_enabled is None and unified_enabled is None and direct_enabled is None:
	return orig_generate(**kwargs)
	if llopa_v2_enabled is not None:
	if bool(llopa_v2_enabled):
	mode = "llopa_v2"
	elif llopa_v3_enabled is None and runtime_solo_enabled is None and runtime_solo_v2_enabled is None and unified_enabled is None and direct_enabled is None:
	return orig_generate(**kwargs)
	if llopa_v3_enabled is not None:
	if bool(llopa_v3_enabled):
	mode = "llopa_v3"
	elif runtime_solo_enabled is None and runtime_solo_v2_enabled is None and unified_enabled is None and direct_enabled is None:
	return orig_generate(**kwargs)
	if runtime_solo_v2_enabled is not None:
	if bool(runtime_solo_v2_enabled):
	mode = "solo_v2"
	elif runtime_solo_enabled is None and unified_enabled is None and direct_enabled is None:
	return orig_generate(**kwargs)
	if runtime_solo_enabled is not None:
	if bool(runtime_solo_enabled):
	mode = "solo"
	elif unified_enabled is None and direct_enabled is None:
	return orig_generate(**kwargs)
	if unified_enabled is not None:
	if bool(unified_enabled):
	mode = "unified"
	elif direct_enabled is None:
	return orig_generate(**kwargs)
	if mode is None and direct_enabled is not None:
	if bool(direct_enabled):
	mode = "direct"
	else:
	return orig_generate(**kwargs)
	if mode is None:
	if bool(getattr(self, "_optimized_llopa_generate_default", False)):
	mode = "optimized"
	elif bool(getattr(self, "_llopa_v2_batch_generate_default", False)):
	mode = "llopa_v2_batch"
	elif bool(getattr(self, "_llopa_v2_generate_default", False)):
	mode = "llopa_v3" if str(getattr(self, "_capsule_inference_path", "") or "") == "llopa_v3" else "llopa_v2"
	elif bool(getattr(self, "_unified_llopa_generate_default", False)):
	mode = "unified"
	elif bool(getattr(self, "_runtime_structured_freeze_generate_default", False)):
	mode = "freeze"
	elif bool(getattr(self, "_runtime_structured_solo_v2_generate_default", False)):
	mode = "solo_v2"
	elif bool(getattr(self, "_runtime_structured_solo_generate_default", False)):
	mode = "solo"
	elif bool(getattr(self, "_direct_llopa_generate_default", False)):
	mode = "direct"
	else:
	return orig_generate(**kwargs)

	if kwargs.get("inputs_embeds") is not None:
	return orig_generate(**kwargs)

	if mode == "optimized":
	mode_label = "optimized_llopa_generate"
	elif mode == "llopa_v2_batch":
	mode_label = "llopa_v2_batch_generate"
	elif mode == "llopa_v2":
	mode_label = "llopa_v2_generate"
	elif mode == "llopa_v3":
	mode_label = "llopa_v3_generate"
	elif mode == "unified":
	mode_label = "unified_llopa_generate"
	elif mode == "freeze":
	mode_label = "runtime_freeze_generate"
	elif mode == "solo_v2":
	mode_label = "runtime_solo_v2_generate"
	elif mode == "solo":
	mode_label = "runtime_solo_generate"
	else:
	mode_label = "direct_llopa_generate"
	if int(kwargs.get("num_beams", 1) or 1) != 1:
	_warn_once(
	self,
	f"_warned_{mode_label}_num_beams",
	f"[load_llopa_model][warn] {mode_label} currently supports only num_beams=1; falling back to model.generate().",
	)
	return orig_generate(**kwargs)
	if int(kwargs.get("num_return_sequences", 1) or 1) != 1:
	_warn_once(
	self,
	f"_warned_{mode_label}_num_return_sequences",
	f"[load_llopa_model][warn] {mode_label} currently supports only num_return_sequences=1; falling back to model.generate().",
	)
	return orig_generate(**kwargs)

	if mode in {"unified", "optimized", "llopa_v2", "llopa_v3", "llopa_v2_batch"}:
	attr_prefix = "_optimized_llopa" if mode == "optimized" else "_llopa_v2" if mode in {"llopa_v2", "llopa_v3", "llopa_v2_batch"} else "_unified_llopa"
	kw_prefix = "optimized_llopa" if mode == "optimized" else "llopa_v2" if mode in {"llopa_v2", "llopa_v3", "llopa_v2_batch"} else "unified_llopa"
	lower_k_attr = f"{attr_prefix}_layers"
	attn_attr = f"{attr_prefix}_attn"
	system_attr = f"{attr_prefix}_system_prefill"
	user_attr = f"{attr_prefix}_user_prefill"
	no_upper_attr = f"{attr_prefix}_no_upper_attn"
	see_past_assistant_attr = f"{attr_prefix}_see_past_assistant"
	replay_module_attr = f"{attr_prefix}_replay_module"
	replay_per_layers_attr = f"{attr_prefix}_replay_per_layers"
	last_layer_attr = f"{attr_prefix}_last_layer_module"
	variant_attr = "_optimized_llopa_variant"
	seed_attr = "_optimized_llopa_seed_mode"
	upper_prepare_attr = "_optimized_llopa_upper_prepare_mode"
	upper_bucket_attr = "_optimized_llopa_upper_bucket_multiple"
	seq_bucket_attr = "_optimized_llopa_seq_bucket_multiple"
	lower_k_local = kwargs.pop(f"{kw_prefix}_layers", None)
	if lower_k_local is None:
	lower_k_local = int(getattr(self, lower_k_attr, 0) or 0)
	attn_local = kwargs.pop(f"{kw_prefix}_attn", None)
	if attn_local is None:
	attn_local = str(getattr(self, attn_attr, "causal") or "causal")
	system_prefill_local = kwargs.pop(f"{kw_prefix}_system_prefill", None)
	if system_prefill_local is None:
	system_prefill_local = str(getattr(self, system_attr, "full") or "full")
	user_prefill_local = kwargs.pop(f"{kw_prefix}_user_prefill", None)
	if user_prefill_local is None:
	user_prefill_local = str(getattr(self, user_attr, "full") or "full")
	no_upper_attn_local = kwargs.pop(f"{kw_prefix}_no_upper_attn", None)
	if no_upper_attn_local is None:
	no_upper_attn_local = bool(getattr(self, no_upper_attr, False))
	see_past_assistant_local = kwargs.pop(f"{kw_prefix}_see_past_assistant", None)
	if see_past_assistant_local is None:
	see_past_assistant_local = bool(getattr(self, see_past_assistant_attr, False))
	replay_module_local = kwargs.pop(f"{kw_prefix}_replay_module", None)
	if replay_module_local is None:
	replay_module_local = kwargs.pop(f"{kw_prefix}_last_layer_module", None)
	if replay_module_local is None:
	replay_module_local = getattr(self, replay_module_attr, None)
	if replay_module_local is None:
	replay_module_local = str(getattr(self, last_layer_attr, "none") or "none")
	replay_per_layers_local = kwargs.pop(f"{kw_prefix}_replay_per_layers", None)
	if replay_per_layers_local is None:
	replay_per_layers_local = getattr(self, replay_per_layers_attr, -1)
	structured_seed_mode_local = "auto"
	if mode in {"llopa_v2", "llopa_v3", "llopa_v2_batch"}:
	kwargs.pop("llopa_v2_seed_mode", None)
	structured_seed_mode_local = "prefill_header"
	optimized_variant_local = None
	optimized_seed_mode_local = None
	optimized_upper_prepare_mode_local = None
	optimized_upper_bucket_multiple_local = None
	optimized_seq_bucket_multiple_local = None
	if mode == "optimized":
	optimized_variant_local = kwargs.pop("optimized_llopa_variant", None)
	if optimized_variant_local is None:
	optimized_variant_local = getattr(self, variant_attr, "upper_ws_auto")
	optimized_seed_mode_local = kwargs.pop("optimized_llopa_seed_mode", None)
	if optimized_seed_mode_local is None:
	optimized_seed_mode_local = getattr(self, seed_attr, "auto")
	optimized_upper_prepare_mode_local = kwargs.pop("optimized_llopa_upper_prepare_mode", None)
	if optimized_upper_prepare_mode_local is None:
	optimized_upper_prepare_mode_local = getattr(self, upper_prepare_attr, "bucketed_workspace")
	optimized_upper_bucket_multiple_local = kwargs.pop("optimized_llopa_upper_bucket_multiple", None)
	if optimized_upper_bucket_multiple_local is None:
	optimized_upper_bucket_multiple_local = getattr(self, upper_bucket_attr, 256)
	optimized_seq_bucket_multiple_local = kwargs.pop("optimized_llopa_seq_bucket_multiple", None)
	if optimized_seq_bucket_multiple_local is None:
	optimized_seq_bucket_multiple_local = getattr(self, seq_bucket_attr, 256)
	if prompt_messages is None and structured_prompt_segments is None:
	_warn_once(
	self,
	f"_warned_{mode_label}_missing_prompt_metadata",
	f"[load_llopa_model][warn] {mode_label} requested without structured prompt metadata; falling back to model.generate().",
	)
	return orig_generate(**kwargs)
	if prompt_add_generation_prompt is None and structured_prompt_segments is None:
	raise ValueError(f"{mode_label} requires prompt_add_generation_prompt when prompt_messages are provided.")
	generate_impl = (
	_direct_optimized_llopa_generate_impl
	if mode == "optimized"
	else _direct_llopa_batch_generate_impl
	if mode == "llopa_v2_batch"
	else _direct_llopa_generate_impl
	)
	generate_kwargs = dict(
	prompt_messages=prompt_messages,
	prompt_add_generation_prompt=bool(prompt_add_generation_prompt),
	structured_prompt_segments=structured_prompt_segments,
	input_ids=kwargs.get("input_ids"),
	attention_mask=kwargs.get("attention_mask"),
	lower_k=int(lower_k_local),
	prefill_attn=str(attn_local),
	system_prefill=str(system_prefill_local),
	user_prefill=str(user_prefill_local),
	no_upper_attn=bool(no_upper_attn_local),
	see_past_assistant=bool(see_past_assistant_local),
	replay_module=str(replay_module_local),
	replay_per_layers=int(replay_per_layers_local or -1),
	max_length=kwargs.get("max_length"),
	max_new_tokens=kwargs.get("max_new_tokens"),
	min_length=kwargs.get("min_length"),
	min_new_tokens=kwargs.get("min_new_tokens"),
	do_sample=kwargs.get("do_sample"),
	temperature=kwargs.get("temperature"),
	top_p=kwargs.get("top_p"),
	top_k=kwargs.get("top_k"),
	stopping_criteria=kwargs.get("stopping_criteria"),
	pad_token_id=kwargs.get("pad_token_id"),
	eos_token_id=kwargs.get("eos_token_id"),
	output_scores=bool(kwargs.get("output_scores", False)),
	return_dict_in_generate=bool(kwargs.get("return_dict_in_generate", False)),
	use_cache=kwargs.get("use_cache"),
	)
	generate_kwargs["compact_scores"] = bool(compact_scores)
	if mode in {"llopa_v2", "llopa_v3", "llopa_v2_batch"}:
	generate_kwargs["seed_mode"] = str(structured_seed_mode_local)
	if mode == "optimized":
	generate_kwargs.update(
	optimized_variant=optimized_variant_local,
	optimized_seed_mode=optimized_seed_mode_local,
	optimized_upper_prepare_mode=optimized_upper_prepare_mode_local,
	optimized_upper_bucket_multiple=optimized_upper_bucket_multiple_local,
	optimized_seq_bucket_multiple=optimized_seq_bucket_multiple_local,
	)
	with torch.inference_mode():
	previous_mixin_decode = getattr(self, "_llopa_v2_generation_mixin_decode", None)
	try:
	if mode == "llopa_v2":
	setattr(self, "_llopa_v2_generation_mixin_decode", False)
	elif mode == "llopa_v3":
	setattr(self, "_llopa_v2_generation_mixin_decode", True)
	unified_out = generate_impl(
	self,
	tokenizer,
	**generate_kwargs,
	)
	finally:
	if previous_mixin_decode is None:
	with contextlib.suppress(Exception):
	delattr(self, "_llopa_v2_generation_mixin_decode")
	else:
	setattr(self, "_llopa_v2_generation_mixin_decode", previous_mixin_decode)
	if unified_out is not None:
	return unified_out
	raise RuntimeError(f"Structured {mode} LLoPA failed unexpectedly for the current prompt.")

	if mode == "freeze":
	lower_k_local = kwargs.pop("runtime_prefill_freeze_layers", None)
	if lower_k_local is None:
	lower_k_local = int(getattr(self, "_runtime_prefill_freeze_layers", 0) or 0)
	attn_local = kwargs.pop("runtime_prefill_freeze_attn", None)
	if attn_local is None:
	attn_local = str(getattr(self, "_runtime_prefill_freeze_attn", "causal") or "causal")
	system_prefill_local = kwargs.pop("runtime_prefill_freeze_system_prefill", None)
	if system_prefill_local is None:
	system_prefill_local = str(getattr(self, "_runtime_prefill_freeze_system_prefill", "no_bos_system") or "no_bos_system")
	if prompt_messages is None:
	_warn_once(
	self,
	"_warned_runtime_freeze_missing_prompt_metadata",
	"[load_llopa_model][warn] runtime_freeze_generate requested without structured prompt metadata; falling back to model.generate().",
	)
	return orig_generate(**kwargs)
	if prompt_add_generation_prompt is None:
	raise ValueError("runtime_freeze_generate requires prompt_add_generation_prompt when prompt_messages are provided.")

	freeze_out = _direct_freeze_generate_impl(
	self,
	tokenizer,
	prompt_messages=prompt_messages,
	prompt_add_generation_prompt=bool(prompt_add_generation_prompt),
	input_ids=kwargs.get("input_ids"),
	attention_mask=kwargs.get("attention_mask"),
	lower_k=int(lower_k_local),
	prefill_attn=str(attn_local),
	system_prefill=str(system_prefill_local),
	max_length=kwargs.get("max_length"),
	max_new_tokens=kwargs.get("max_new_tokens"),
	min_length=kwargs.get("min_length"),
	min_new_tokens=kwargs.get("min_new_tokens"),
	do_sample=kwargs.get("do_sample"),
	temperature=kwargs.get("temperature"),
	top_p=kwargs.get("top_p"),
	top_k=kwargs.get("top_k"),
	stopping_criteria=kwargs.get("stopping_criteria"),
	pad_token_id=kwargs.get("pad_token_id"),
	eos_token_id=kwargs.get("eos_token_id"),
	output_scores=bool(kwargs.get("output_scores", False)),
	return_dict_in_generate=bool(kwargs.get("return_dict_in_generate", False)),
	use_cache=kwargs.get("use_cache"),
	)
	if freeze_out is not None:
	return freeze_out
	raise RuntimeError("Structured runtime freeze failed unexpectedly for the current prompt.")

	if mode == "solo":
	lower_k_local = kwargs.pop("runtime_prefill_solo_layers", None)
	if lower_k_local is None:
	lower_k_local = int(getattr(self, "_runtime_prefill_solo_layers", 0) or 0)
	attn_local = kwargs.pop("runtime_prefill_solo_attn", None)
	if attn_local is None:
	attn_local = str(getattr(self, "_runtime_prefill_solo_attn", "causal") or "causal")
	system_prefill_local = kwargs.pop("runtime_prefill_solo_system_prefill", None)
	if system_prefill_local is None:
	system_prefill_local = str(getattr(self, "_runtime_prefill_solo_system_prefill", "no_bos_system") or "no_bos_system")
	if prompt_messages is None:
	_warn_once(
	self,
	"_warned_runtime_solo_missing_prompt_metadata",
	"[load_llopa_model][warn] runtime_solo_generate requested without structured prompt metadata; falling back to model.generate().",
	)
	return orig_generate(**kwargs)
	if prompt_add_generation_prompt is None:
	raise ValueError("runtime_solo_generate requires prompt_add_generation_prompt when prompt_messages are provided.")

	solo_out = _direct_solo_generate_impl(
	self,
	tokenizer,
	prompt_messages=prompt_messages,
	prompt_add_generation_prompt=bool(prompt_add_generation_prompt),
	input_ids=kwargs.get("input_ids"),
	attention_mask=kwargs.get("attention_mask"),
	lower_k=int(lower_k_local),
	prefill_attn=str(attn_local),
	system_prefill=str(system_prefill_local),
	max_length=kwargs.get("max_length"),
	max_new_tokens=kwargs.get("max_new_tokens"),
	min_length=kwargs.get("min_length"),
	min_new_tokens=kwargs.get("min_new_tokens"),
	do_sample=kwargs.get("do_sample"),
	temperature=kwargs.get("temperature"),
	top_p=kwargs.get("top_p"),
	top_k=kwargs.get("top_k"),
	stopping_criteria=kwargs.get("stopping_criteria"),
	pad_token_id=kwargs.get("pad_token_id"),
	eos_token_id=kwargs.get("eos_token_id"),
	output_scores=bool(kwargs.get("output_scores", False)),
	return_dict_in_generate=bool(kwargs.get("return_dict_in_generate", False)),
	use_cache=kwargs.get("use_cache"),
	)
	if solo_out is not None:
	return solo_out
	raise RuntimeError("Structured runtime solo-attn failed unexpectedly for the current prompt.")

	if mode == "solo_v2":
	lower_k_local = kwargs.pop("runtime_prefill_solo_v2_layers", None)
	if lower_k_local is None:
	lower_k_local = int(getattr(self, "_runtime_prefill_solo_v2_layers", 0) or 0)
	attn_local = kwargs.pop("runtime_prefill_solo_v2_attn", None)
	if attn_local is None:
	attn_local = str(getattr(self, "_runtime_prefill_solo_v2_attn", "causal") or "causal")
	system_prefill_local = kwargs.pop("runtime_prefill_solo_v2_system_prefill", None)
	if system_prefill_local is None:
	system_prefill_local = str(getattr(self, "_runtime_prefill_solo_v2_system_prefill", "no_bos_system") or "no_bos_system")
	with_bos_local = kwargs.pop("runtime_prefill_solo_v2_with_bos", None)
	if with_bos_local is None:
	with_bos_local = bool(getattr(self, "_runtime_prefill_solo_v2_with_bos", False))
	if prompt_messages is None:
	_warn_once(
	self,
	"_warned_runtime_solo_v2_missing_prompt_metadata",
	"[load_llopa_model][warn] runtime_solo_v2_generate requested without structured prompt metadata; falling back to model.generate().",
	)
	return orig_generate(**kwargs)
	if prompt_add_generation_prompt is None:
	raise ValueError("runtime_solo_v2_generate requires prompt_add_generation_prompt when prompt_messages are provided.")

	solo_out = _direct_solo_generate_impl(
	self,
	tokenizer,
	prompt_messages=prompt_messages,
	prompt_add_generation_prompt=bool(prompt_add_generation_prompt),
	input_ids=kwargs.get("input_ids"),
	attention_mask=kwargs.get("attention_mask"),
	lower_k=int(lower_k_local),
	prefill_attn=str(attn_local),
	system_prefill=str(system_prefill_local),
	max_length=kwargs.get("max_length"),
	max_new_tokens=kwargs.get("max_new_tokens"),
	min_length=kwargs.get("min_length"),
	min_new_tokens=kwargs.get("min_new_tokens"),
	do_sample=kwargs.get("do_sample"),
	temperature=kwargs.get("temperature"),
	top_p=kwargs.get("top_p"),
	top_k=kwargs.get("top_k"),
	stopping_criteria=kwargs.get("stopping_criteria"),
	pad_token_id=kwargs.get("pad_token_id"),
	eos_token_id=kwargs.get("eos_token_id"),
	output_scores=bool(kwargs.get("output_scores", False)),
	return_dict_in_generate=bool(kwargs.get("return_dict_in_generate", False)),
	use_cache=kwargs.get("use_cache"),
	solo_v2=True,
	with_bos=bool(with_bos_local),
	)
	if solo_out is not None:
	return solo_out
	raise RuntimeError("Structured runtime solo-attn-v2 failed unexpectedly for the current prompt.")

	_warn_once(
	self,
	"_warned_deprecated_direct_llopa",
	"[load_llopa_model][warn] direct_llopa_* is deprecated; use unified_llopa_* or INFERENCE_PATH=unified_llopa. Legacy users can keep existing envs unchanged.",
	)
	lower_k_local = kwargs.pop("direct_llopa_layers", None)
	if lower_k_local is None:
	lower_k_local = int(getattr(self, "_direct_llopa_layers", 0) or 0)
	attn_local = kwargs.pop("direct_llopa_attn", None)
	if attn_local is None:
	attn_local = str(getattr(self, "_direct_llopa_attn", "causal") or "causal")
	system_prefill_local = kwargs.pop("direct_llopa_system_prefill", None)
	if system_prefill_local is None:
	system_prefill_local = str(getattr(self, "_direct_llopa_system_prefill", "full") or "full")
	user_prefill_local = kwargs.pop("direct_llopa_user_prefill", None)
	if user_prefill_local is None:
	user_prefill_local = str(getattr(self, "_direct_llopa_user_prefill", "full") or "full")
	no_upper_attn_local = kwargs.pop("direct_llopa_no_upper_attn", None)
	if no_upper_attn_local is None:
	no_upper_attn_local = bool(getattr(self, "_direct_llopa_no_upper_attn", False))

	if not legacy_search and prompt_messages is None and structured_prompt_segments is None:
	raise ValueError(
	"direct_llopa_generate now requires prompt_messages and prompt_add_generation_prompt. "
	"Use direct_llopa_legacy_search=True only for legacy prompt scanning."
	)
	if not legacy_search and prompt_add_generation_prompt is None and structured_prompt_segments is None:
	raise ValueError("direct_llopa_generate requires prompt_add_generation_prompt when prompt_messages are provided.")

	if legacy_search:
	with torch.inference_mode():
	direct_out = _legacy_direct_llopa_generate_impl(
	self,
	tokenizer,
	input_ids=kwargs.get("input_ids"),
	attention_mask=kwargs.get("attention_mask"),
	lower_k=int(lower_k_local),
	prefill_attn=str(attn_local),
	max_length=kwargs.get("max_length"),
	max_new_tokens=kwargs.get("max_new_tokens"),
	min_length=kwargs.get("min_length"),
	min_new_tokens=kwargs.get("min_new_tokens"),
	do_sample=kwargs.get("do_sample"),
	temperature=kwargs.get("temperature"),
	top_p=kwargs.get("top_p"),
	top_k=kwargs.get("top_k"),
	stopping_criteria=kwargs.get("stopping_criteria"),
	pad_token_id=kwargs.get("pad_token_id"),
	eos_token_id=kwargs.get("eos_token_id"),
	output_scores=bool(kwargs.get("output_scores", False)),
	return_dict_in_generate=bool(kwargs.get("return_dict_in_generate", False)),
	use_cache=kwargs.get("use_cache"),
	)
	else:
	with torch.inference_mode():
	direct_out = _direct_llopa_generate_impl(
	self,
	tokenizer,
	prompt_messages=prompt_messages,
	prompt_add_generation_prompt=bool(prompt_add_generation_prompt),
	structured_prompt_segments=structured_prompt_segments,
	input_ids=kwargs.get("input_ids"),
	attention_mask=kwargs.get("attention_mask"),
	lower_k=int(lower_k_local),
	prefill_attn=str(attn_local),
	system_prefill=str(system_prefill_local),
	user_prefill=str(user_prefill_local),
	no_upper_attn=bool(no_upper_attn_local),
	max_length=kwargs.get("max_length"),
	max_new_tokens=kwargs.get("max_new_tokens"),
	min_length=kwargs.get("min_length"),
	min_new_tokens=kwargs.get("min_new_tokens"),
	do_sample=kwargs.get("do_sample"),
	temperature=kwargs.get("temperature"),
	top_p=kwargs.get("top_p"),
	top_k=kwargs.get("top_k"),
	stopping_criteria=kwargs.get("stopping_criteria"),
	pad_token_id=kwargs.get("pad_token_id"),
	eos_token_id=kwargs.get("eos_token_id"),
	output_scores=bool(kwargs.get("output_scores", False)),
	return_dict_in_generate=bool(kwargs.get("return_dict_in_generate", False)),
	use_cache=kwargs.get("use_cache"),
	)
	if direct_out is not None:
	return direct_out
	if not legacy_search:
	raise RuntimeError("Structured direct LLoPA failed unexpectedly for the current prompt.")
	_warn_once(
	self,
	"_warned_direct_llopa_fallback",
	"[load_llopa_model][warn] direct_llopa_generate could not use the current prompt/generation settings; falling back to model.generate().",
	)
	return orig_generate(**kwargs)

	try:
	model.generate = types.MethodType(_structured_llopa_generate, model)
	setattr(model, "_structured_llopa_generate_attached", True)
	except Exception:
	pass


	def _attach_unified_llopa_generate(
	    model,
	    tokenizer,
	    *,
	    lower_k: int,
	    prefill_attn: str = "causal",
	    system_prefill: str = "full",
	    user_prefill: str = "full",
	    no_upper_attn: bool = False,
	    see_past_assistant: bool = False,
	    replay_module: str = "none",
	    replay_per_layers: int = -1,
	    last_layer_module: Optional[str] = None,
	) -> None:
	    """Configure ``model`` for unified LLoPA generation and attach the dispatcher.

	    The settings are normalized with ``_normalize_structured_llopa_runtime``;
	    when that returns ``None`` the model is left untouched. Otherwise the
	    normalized values are stored as ``_unified_llopa_*`` attributes,
	    ``_unified_llopa_generate_default`` is set, ``_capsule_inference_path``
	    becomes ``"unified_llopa"`` and ``_attach_structured_llopa_generate`` is
	    called. Attribute writes are best-effort: a failure aborts silently.

	    Args:
	        model: Model to configure in place.
	        tokenizer: Tokenizer handed to the structured generate wrapper.
	        lower_k: Layer count; must normalize to a positive integer.
	        prefill_attn: Prefill attention mode.
	        system_prefill: System-segment prefill mode.
	        user_prefill: User-segment prefill mode.
	        no_upper_attn: Stored as ``_unified_llopa_no_upper_attn``.
	        see_past_assistant: Stored as ``_unified_llopa_see_past_assistant``.
	        replay_module: Replay-module setting.
	        replay_per_layers: Replay-per-layers setting.
	        last_layer_module: Fallback for ``replay_module`` when that is "none".
	    """
	    normalized = _normalize_structured_llopa_runtime(
	        lower_k,
	        prefill_attn=prefill_attn,
	        system_prefill=system_prefill,
	        user_prefill=user_prefill,
	        no_upper_attn=no_upper_attn,
	        replay_module=replay_module,
	        replay_per_layers=replay_per_layers,
	        last_layer_module=last_layer_module,
	    )
	    if normalized is None:
	        return
	    lower_k, attn, sys_prefill, user_prefill_norm, no_upper_attn, replay_module, replay_per_layers = normalized

	    try:
	        setattr(model, "_unified_llopa_layers", int(lower_k))
	        setattr(model, "_unified_llopa_attn", attn)
	        setattr(model, "_unified_llopa_system_prefill", sys_prefill)
	        setattr(model, "_unified_llopa_user_prefill", user_prefill_norm)
	        setattr(model, "_unified_llopa_no_upper_attn", bool(no_upper_attn))
	        setattr(model, "_unified_llopa_see_past_assistant", bool(see_past_assistant))
	        # The normalized replay module is stored under both attribute names.
	        setattr(model, "_unified_llopa_replay_module", str(replay_module))
	        setattr(model, "_unified_llopa_last_layer_module", str(replay_module))
	        setattr(model, "_unified_llopa_replay_per_layers", int(replay_per_layers))
	        setattr(model, "_unified_llopa_generate_default", True)
	        setattr(model, "_capsule_inference_path", "unified_llopa")
	    except Exception:
	        return

	    _attach_structured_llopa_generate(model, tokenizer)
	    try:
	        setattr(model, "_unified_llopa_generate_attached", True)
	    except Exception:
	        pass


	def _attach_llopa_v2_generate(
	    model,
	    tokenizer,
	    *,
	    lower_k: int,
	    prefill_attn: str = "causal",
	    system_prefill: str = "full",
	    user_prefill: str = "full",
	    no_upper_attn: bool = False,
	    see_past_assistant: bool = False,
	    replay_module: str = "none",
	    replay_per_layers: int = -1,
	    last_layer_module: Optional[str] = None,
	    seed_mode: str = "prefill_header",
	    generation_mixin_decode: bool = False,
	    capsule_inference_path: str = "llopa_v2",
	) -> None:
	    """Configure ``model`` for LLoPA v2 generation and attach the dispatcher.

	    The settings are normalized with ``_normalize_structured_llopa_runtime``;
	    when that returns ``None`` the model is left untouched. Otherwise the
	    normalized values are stored as ``_llopa_v2_*`` attributes,
	    ``_llopa_v2_generate_default`` is set and
	    ``_attach_structured_llopa_generate`` is called. Attribute writes are
	    best-effort: a failure aborts silently.

	    Args:
	        model: Model to configure in place.
	        tokenizer: Tokenizer handed to the structured generate wrapper.
	        lower_k: Layer count; must normalize to a positive integer.
	        prefill_attn: Prefill attention mode.
	        system_prefill: System-segment prefill mode.
	        user_prefill: User-segment prefill mode.
	        no_upper_attn: Stored as ``_llopa_v2_no_upper_attn``.
	        see_past_assistant: Stored as ``_llopa_v2_see_past_assistant``.
	        replay_module: Replay-module setting.
	        replay_per_layers: Replay-per-layers setting.
	        last_layer_module: Fallback for ``replay_module`` when that is "none".
	        seed_mode: Passed through ``_normalize_structured_llopa_seed_mode``;
	            the stored value is always ``"prefill_header"``.
	        generation_mixin_decode: Stored as
	            ``_llopa_v2_generation_mixin_decode``.
	        capsule_inference_path: Stored as ``_capsule_inference_path``; a
	            falsy value is replaced by ``"llopa_v2"``.
	    """
	    normalized = _normalize_structured_llopa_runtime(
	        lower_k,
	        prefill_attn=prefill_attn,
	        system_prefill=system_prefill,
	        user_prefill=user_prefill,
	        no_upper_attn=no_upper_attn,
	        replay_module=replay_module,
	        replay_per_layers=replay_per_layers,
	        last_layer_module=last_layer_module,
	    )
	    if normalized is None:
	        return
	    lower_k, attn, sys_prefill, user_prefill_norm, no_upper_attn, replay_module, replay_per_layers = normalized

	    # Whatever the normalizer returns, only "prefill_header" is stored.
	    normalized_seed_mode = _normalize_structured_llopa_seed_mode(seed_mode)
	    if normalized_seed_mode != "prefill_header":
	        normalized_seed_mode = "prefill_header"

	    try:
	        setattr(model, "_llopa_v2_layers", int(lower_k))
	        setattr(model, "_llopa_v2_attn", attn)
	        setattr(model, "_llopa_v2_system_prefill", sys_prefill)
	        setattr(model, "_llopa_v2_user_prefill", user_prefill_norm)
	        setattr(model, "_llopa_v2_no_upper_attn", bool(no_upper_attn))
	        setattr(model, "_llopa_v2_see_past_assistant", bool(see_past_assistant))
	        # The normalized replay module is stored under both attribute names.
	        setattr(model, "_llopa_v2_replay_module", str(replay_module))
	        setattr(model, "_llopa_v2_last_layer_module", str(replay_module))
	        setattr(model, "_llopa_v2_replay_per_layers", int(replay_per_layers))
	        setattr(model, "_llopa_v2_seed_mode", str(normalized_seed_mode))
	        setattr(model, "_llopa_v2_generation_mixin_decode", bool(generation_mixin_decode))
	        setattr(model, "_llopa_v2_generate_default", True)
	        setattr(model, "_capsule_inference_path", str(capsule_inference_path or "llopa_v2"))
	    except Exception:
	        return

	    _attach_structured_llopa_generate(model, tokenizer)
	    try:
	        setattr(model, "_llopa_v2_generate_attached", True)
	    except Exception:
	        pass


	def _attach_llopa_v2_batch_generate(
	    model,
	    tokenizer,
	    *,
	    lower_k: int,
	    prefill_attn: str = "causal",
	    system_prefill: str = "full",
	    user_prefill: str = "full",
	    no_upper_attn: bool = False,
	    see_past_assistant: bool = False,
	    replay_module: str = "none",
	    replay_per_layers: int = -1,
	    last_layer_module: Optional[str] = None,
	    seed_mode: str = "prefill_header",
	) -> None:
	    """Stamp batched LLoPA-v2 settings onto ``model`` and attach the structured generate hook.

	    The raw options go through ``_normalize_structured_llopa_runtime``; when
	    that helper returns ``None`` the model is left untouched. Otherwise the
	    normalized values are written as ``_llopa_v2_*`` attributes,
	    ``_llopa_v2_batch_generate_default`` is set to ``True``,
	    ``_capsule_inference_path`` is set to ``"llopa_v2_batch"``, and
	    ``_attach_structured_llopa_generate(model, tokenizer)`` is called.

	    ``last_layer_module`` is only forwarded to the normalizer; the stored
	    ``_llopa_v2_last_layer_module`` attribute mirrors the normalized replay
	    module. The stored seed mode is always ``"prefill_header"``.

	    Any exception raised while writing the settings aborts silently before
	    the hook is attached. Returns ``None`` in every case.
	    """
	    runtime = _normalize_structured_llopa_runtime(
	        lower_k,
	        prefill_attn=prefill_attn,
	        system_prefill=system_prefill,
	        user_prefill=user_prefill,
	        no_upper_attn=no_upper_attn,
	        replay_module=replay_module,
	        replay_per_layers=replay_per_layers,
	        last_layer_module=last_layer_module,
	    )
	    if runtime is None:
	        return
	    (
	        layers,
	        attn_mode,
	        system_mode,
	        user_mode,
	        skip_upper,
	        replay_mod,
	        replay_stride,
	    ) = runtime

	    # The normalizer is still invoked, but every outcome collapses to
	    # "prefill_header".
	    seed = _normalize_structured_llopa_seed_mode(seed_mode)
	    seed = "prefill_header" if seed != "prefill_header" else seed

	    try:
	        model._llopa_v2_layers = int(layers)
	        model._llopa_v2_attn = attn_mode
	        model._llopa_v2_system_prefill = system_mode
	        model._llopa_v2_user_prefill = user_mode
	        model._llopa_v2_no_upper_attn = bool(skip_upper)
	        model._llopa_v2_see_past_assistant = bool(see_past_assistant)
	        model._llopa_v2_replay_module = str(replay_mod)
	        # Alias: the last-layer module attribute carries the replay module value.
	        model._llopa_v2_last_layer_module = str(replay_mod)
	        model._llopa_v2_replay_per_layers = int(replay_stride)
	        model._llopa_v2_seed_mode = str(seed)
	        model._llopa_v2_batch_generate_default = True
	        model._capsule_inference_path = "llopa_v2_batch"
	    except Exception:
	        # Best effort: if the settings cannot be stored, do not attach the hook.
	        return

	    _attach_structured_llopa_generate(model, tokenizer)
	    with contextlib.suppress(Exception):
	        model._llopa_v2_batch_generate_attached = True


	def _attach_optimized_llopa_generate(
	    model,
	    tokenizer,
	    *,
	    lower_k: int,
	    prefill_attn: str = "causal",
	    system_prefill: str = "full",
	    user_prefill: str = "full",
	    no_upper_attn: bool = False,
	    see_past_assistant: bool = False,
	    replay_module: str = "none",
	    replay_per_layers: int = -1,
	    last_layer_module: Optional[str] = None,
	    optimized_variant: Optional[str] = None,
	    optimized_seed_mode: Optional[str] = None,
	    optimized_upper_prepare_mode: Optional[str] = None,
	    optimized_upper_bucket_multiple: Optional[int] = None,
	    optimized_seq_bucket_multiple: Optional[int] = None,
	) -> None:
	    """Stamp optimized-LLoPA settings onto ``model`` and attach the structured generate hook.

	    The raw options go through ``_normalize_structured_llopa_runtime``; when
	    that helper returns ``None`` the model is left untouched. The
	    ``optimized_*`` options are resolved by
	    ``_resolve_optimized_llopa_settings``, whose result is read by the keys
	    ``variant``, ``seed_mode``, ``upper_prepare_mode``,
	    ``upper_bucket_multiple`` and ``seq_bucket_multiple``.

	    All values are written as ``_optimized_llopa_*`` attributes,
	    ``_capsule_inference_path`` is set to ``"optimized_llopa"``, and
	    ``_attach_structured_llopa_generate(model, tokenizer)`` is called.
	    ``last_layer_module`` is only forwarded to the normalizer; the stored
	    ``_optimized_llopa_last_layer_module`` attribute mirrors the normalized
	    replay module.

	    Any exception raised while writing the settings (including a failed
	    lookup in the resolved settings) aborts silently before the hook is
	    attached. Returns ``None`` in every case.
	    """
	    runtime = _normalize_structured_llopa_runtime(
	        lower_k,
	        prefill_attn=prefill_attn,
	        system_prefill=system_prefill,
	        user_prefill=user_prefill,
	        no_upper_attn=no_upper_attn,
	        replay_module=replay_module,
	        replay_per_layers=replay_per_layers,
	        last_layer_module=last_layer_module,
	    )
	    if runtime is None:
	        return
	    (
	        layers,
	        attn_mode,
	        system_mode,
	        user_mode,
	        skip_upper,
	        replay_mod,
	        replay_stride,
	    ) = runtime

	    tuned = _resolve_optimized_llopa_settings(
	        variant=optimized_variant,
	        seed_mode=optimized_seed_mode,
	        upper_prepare_mode=optimized_upper_prepare_mode,
	        upper_bucket_multiple=optimized_upper_bucket_multiple,
	        seq_bucket_multiple=optimized_seq_bucket_multiple,
	    )

	    try:
	        model._optimized_llopa_layers = int(layers)
	        model._optimized_llopa_attn = attn_mode
	        model._optimized_llopa_system_prefill = system_mode
	        model._optimized_llopa_user_prefill = user_mode
	        model._optimized_llopa_no_upper_attn = bool(skip_upper)
	        model._optimized_llopa_see_past_assistant = bool(see_past_assistant)
	        model._optimized_llopa_replay_module = str(replay_mod)
	        # Alias: the last-layer module attribute carries the replay module value.
	        model._optimized_llopa_last_layer_module = str(replay_mod)
	        model._optimized_llopa_replay_per_layers = int(replay_stride)
	        model._optimized_llopa_variant = str(tuned["variant"])
	        model._optimized_llopa_seed_mode = str(tuned["seed_mode"])
	        model._optimized_llopa_upper_prepare_mode = str(tuned["upper_prepare_mode"])
	        model._optimized_llopa_upper_bucket_multiple = int(tuned["upper_bucket_multiple"])
	        model._optimized_llopa_seq_bucket_multiple = int(tuned["seq_bucket_multiple"])
	        model._optimized_llopa_generate_default = True
	        model._capsule_inference_path = "optimized_llopa"
	    except Exception:
	        # Best effort: if the settings cannot be stored, do not attach the hook.
	        return

	    _attach_structured_llopa_generate(model, tokenizer)
	    with contextlib.suppress(Exception):
	        model._optimized_llopa_generate_attached = True


	def _attach_direct_llopa_generate(
	    model,
	    tokenizer,
	    *,
	    lower_k: int,
	    prefill_attn: str = "causal",
	    system_prefill: str = "full",
	    user_prefill: str = "full",
	    no_upper_attn: bool = False,
	) -> None:
	    """Stamp direct-LLoPA settings onto ``model`` and attach the structured generate hook.

	    The raw options go through ``_normalize_structured_llopa_runtime``; when
	    that helper returns ``None`` the model is left untouched. The last two
	    entries of the normalized result are ignored here.

	    The normalized values are written as ``_direct_llopa_*`` attributes and
	    ``_capsule_inference_path`` is set to ``"legacy_llopa"``. When
	    ``_assistant_header_ids(tokenizer, "cpu")`` yields a non-empty tensor, a
	    detached CPU ``torch.long`` copy is stored as ``_direct_llopa_header_ids``.
	    Finally ``_attach_structured_llopa_generate(model, tokenizer)`` is called.

	    Any exception raised while writing the settings aborts silently before
	    the hook is attached. Returns ``None`` in every case.
	    """
	    runtime = _normalize_structured_llopa_runtime(
	        lower_k,
	        prefill_attn=prefill_attn,
	        system_prefill=system_prefill,
	        user_prefill=user_prefill,
	        no_upper_attn=no_upper_attn,
	    )
	    if runtime is None:
	        return
	    (
	        layers,
	        attn_mode,
	        system_mode,
	        user_mode,
	        skip_upper,
	        _replay_mod,
	        _replay_stride,
	    ) = runtime

	    # Looked up outside the guarded section, so a failure here propagates.
	    assistant_header = _assistant_header_ids(tokenizer, "cpu")

	    try:
	        model._direct_llopa_layers = int(layers)
	        model._direct_llopa_attn = attn_mode
	        model._direct_llopa_system_prefill = system_mode
	        model._direct_llopa_user_prefill = user_mode
	        model._direct_llopa_no_upper_attn = bool(skip_upper)
	        model._direct_llopa_generate_default = True
	        model._capsule_inference_path = "legacy_llopa"
	        if isinstance(assistant_header, torch.Tensor):
	            if assistant_header.numel() > 0:
	                model._direct_llopa_header_ids = assistant_header.detach().to(
	                    device="cpu", dtype=torch.long
	                )
	    except Exception:
	        # Best effort: if the settings cannot be stored, do not attach the hook.
	        return

	    _attach_structured_llopa_generate(model, tokenizer)
	    with contextlib.suppress(Exception):
	        model._direct_llopa_generate_attached = True


	def load_llopa_model(model_repo: str,
	*,
	model_name: str = "",
	tokenizer_name: str = "",
	num_specials: Optional[int] = None,
	backbone_dir: str = "",
	lopa_modeling_path: str = "",
	modeling_family: str = "auto",
	dtype: str = "auto",
	torch_dtype=None,
	device: str = "",
	device_map: Optional[str] = None,
	attn_impl: str = "auto",
	attn_implementation: Optional[str] = None,
	_attn_implementation: Optional[str] = None,
	trust_remote_code: bool = False,
	cache_dir: Optional[str] = None,
	revision: Optional[str] = None,
	token: Optional[str] = None,
	local_files_only: bool = False,
	force_custom_modeling: bool = False,
	number_of_lora: int = 1,
	use_lora: bool = True,
	merge_on_cpu: bool = True,
	enable_thinking: Optional[bool] = None,
	no_upper_attn: Optional[bool] = None,
	runtime_prefill_lower: Optional[bool] = None,
	runtime_prefill_layers: Optional[int] = None,
	runtime_prefill_attn: Optional[str] = None,
	runtime_prefill_system_prefill: Optional[str] = None,
	runtime_prefill_freeze: Optional[bool] = None,
	runtime_prefill_freeze_layers: Optional[int] = None,
	runtime_prefill_freeze_attn: Optional[str] = None,
	runtime_prefill_freeze_system_prefill: Optional[str] = None,
	runtime_prefill_solo: Optional[bool] = None,
	runtime_prefill_solo_layers: Optional[int] = None,
	runtime_prefill_solo_attn: Optional[str] = None,
	runtime_prefill_solo_system_prefill: Optional[str] = None,
	runtime_prefill_solo_v2: Optional[bool] = None,
	runtime_prefill_solo_v2_layers: Optional[int] = None,
	runtime_prefill_solo_v2_attn: Optional[str] = None,
	runtime_prefill_solo_v2_system_prefill: Optional[str] = None,
	runtime_prefill_solo_v2_with_bos: Optional[bool] = None,
	runtime_llopa_prefill: Optional[bool] = None,
	runtime_llopa_layers: Optional[int] = None,
	runtime_llopa_attn: Optional[str] = None,
	runtime_llopa_no_upper_attn: Optional[bool] = None,
	unified_llopa_generate: Optional[bool] = None,
	unified_llopa_layers: Optional[int] = None,
	unified_llopa_attn: Optional[str] = None,
	unified_llopa_system_prefill: Optional[str] = None,
	unified_llopa_user_prefill: Optional[str] = None,
	unified_llopa_no_upper_attn: Optional[bool] = None,
	unified_llopa_see_past_assistant: Optional[bool] = None,
	unified_llopa_replay_module: Optional[str] = None,
	unified_llopa_replay_per_layers: Optional[int] = None,
	unified_llopa_last_layer_module: Optional[str] = None,
	llopa_v2_batch_generate: Optional[bool] = None,
	llopa_v2_generate: Optional[bool] = None,
	llopa_v3_generate: Optional[bool] = None,
	llopa_v2_layers: Optional[int] = None,
	llopa_v2_attn: Optional[str] = None,
	llopa_v2_system_prefill: Optional[str] = None,
	llopa_v2_user_prefill: Optional[str] = None,
	llopa_v2_no_upper_attn: Optional[bool] = None,
	llopa_v2_see_past_assistant: Optional[bool] = None,
	llopa_v2_replay_module: Optional[str] = None,
	llopa_v2_replay_per_layers: Optional[int] = None,
	llopa_v2_last_layer_module: Optional[str] = None,
	llopa_v2_seed_mode: Optional[str] = None,
	optimized_llopa_generate: Optional[bool] = None,
	optimized_llopa_layers: Optional[int] = None,
	optimized_llopa_attn: Optional[str] = None,
	optimized_llopa_system_prefill: Optional[str] = None,
	optimized_llopa_user_prefill: Optional[str] = None,
	optimized_llopa_no_upper_attn: Optional[bool] = None,
	optimized_llopa_see_past_assistant: Optional[bool] = None,
	optimized_llopa_replay_module: Optional[str] = None,
	optimized_llopa_replay_per_layers: Optional[int] = None,
	optimized_llopa_last_layer_module: Optional[str] = None,
	optimized_llopa_variant: Optional[str] = None,
	optimized_llopa_seed_mode: Optional[str] = None,
	optimized_llopa_upper_prepare_mode: Optional[str] = None,
	optimized_llopa_upper_bucket_multiple: Optional[int] = None,
	optimized_llopa_seq_bucket_multiple: Optional[int] = None,
	runtime_llopa_fast_generate: Optional[bool] = None,
	direct_llopa_generate: Optional[bool] = None,
	direct_llopa_layers: Optional[int] = None,
	direct_llopa_attn: Optional[str] = None,
	direct_llopa_system_prefill: Optional[str] = None,
	direct_llopa_user_prefill: Optional[str] = None,
	direct_llopa_no_upper_attn: Optional[bool] = None):
	repo_path = _resolve_repo_path(model_repo, cache_dir=cache_dir, revision=revision,
	token=token, local_files_only=local_files_only)

	info: dict[str, str] = {}
	tri_info = repo_path / "tri_info.txt"
	if tri_info.is_file():
	info = _read_kv_file(tri_info)
	if not model_name:
	model_name = info.get("model_name", "")
	if num_specials is None:
	try:
	num_specials = int(info.get("num_specials", "") or 0)
	except Exception:
	num_specials = 0
	if runtime_prefill_lower is None:
	runtime_prefill_lower = False
	if runtime_prefill_freeze is None:
	runtime_prefill_freeze = False
	if runtime_prefill_solo is None:
	runtime_prefill_solo = False
	if runtime_prefill_solo_v2 is None:
	runtime_prefill_solo_v2 = False
	if runtime_llopa_prefill is None:
	runtime_llopa_prefill = False
	if unified_llopa_generate is None:
	unified_llopa_generate = False
	if llopa_v2_batch_generate is None:
	llopa_v2_batch_generate = False
	if llopa_v2_generate is None:
	llopa_v2_generate = False
	if llopa_v3_generate is None:
	llopa_v3_generate = False
	if bool(llopa_v3_generate):
	llopa_v2_generate = True
	if optimized_llopa_generate is None:
	optimized_llopa_generate = False
	if runtime_llopa_fast_generate is None:
	runtime_llopa_fast_generate = False
	if direct_llopa_generate is None:
	direct_llopa_generate = False

	backbone_ref = (
	backbone_dir
	or read_backbone_ref(repo_path)
	or _read_adapter_backbone_ref(repo_path)
	or model_name
	or model_repo
	)
	num_specials_arg = num_specials

	config = None
	config_source = str(repo_path) if (repo_path / "config.json").is_file() else backbone_ref
	try:
	config = AutoConfig.from_pretrained(
	config_source,
	cache_dir=cache_dir,
	revision=revision,
	token=token,
	local_files_only=local_files_only,
	)
	if num_specials_arg is not None:
	config.llopa_num_specials = int(num_specials_arg)
	except Exception:
	config = None
	if num_specials_arg is None:
	num_specials = int(getattr(config, "llopa_num_specials", 0) or 0) if config is not None else 0
	else:
	num_specials = int(num_specials_arg)

	def _config_str(name: str) -> str:
	if config is None:
	return ""
	raw = getattr(config, name, "")
	return str(raw or "").strip()

	def _config_bool(name: str):
	if config is None:
	return None
	value = getattr(config, name, None)
	if value is None:
	return None
	return bool(value)

	def _normalize_attention_gate_mode(mode: str) -> str:
	normalized = str(mode or "off").strip().lower()
	aliases = {
	"": "off",
	"none": "off",
	"disabled": "off",
	"disable": "off",
	"false": "off",
	"0": "off",
	"paper": "sdpa_sigmoid",
	"sdpa_gate": "sdpa_sigmoid",
	"sdpa-gate": "sdpa_sigmoid",
	"sigmoid_after_sdpa": "sdpa_sigmoid",
	"sdpa_elementwise_sigmoid": "sdpa_sigmoid",
	}
	normalized = aliases.get(normalized, normalized)
	if normalized not in {"off", "sdpa_sigmoid"}:
	normalized = "off"
	return normalized

	attention_gate_mode = _normalize_attention_gate_mode(
	info.get("attention_gate_mode") or _config_str("capsule_attention_gate_mode") or "off"
	)
	if config is not None:
	with contextlib.suppress(Exception):
	setattr(config, "capsule_attention_gate_mode", attention_gate_mode)

	if no_upper_attn is None:
	raw = (info.get("no_upper_attn") or _config_str("capsule_no_upper_attn")).strip().lower()
	if raw in {"1", "true", "yes", "on"}:
	no_upper_attn = True
	elif raw in {"0", "false", "no", "off"}:
	no_upper_attn = False
	else:
	cfg_bool = _config_bool("capsule_no_upper_attn")
	if cfg_bool is not None:
	no_upper_attn = cfg_bool
	if runtime_prefill_layers is None:
	raw = (info.get("lower_k") or _config_str("capsule_lower_layers")).strip()
	if raw:
	with contextlib.suppress(Exception):
	runtime_prefill_layers = int(raw)
	if runtime_prefill_attn is None:
	runtime_prefill_attn = (
	(info.get("prefill_attn") or _config_str("capsule_prefill_attn") or "causal").strip().lower() or "causal"
	)
	if runtime_prefill_system_prefill is None:
	raw_system_prefill = (info.get("system_prefill") or _config_str("capsule_system_prefill")).strip().lower()
	if raw_system_prefill in {"full", "no_system", "no_bos_system"}:
	runtime_prefill_system_prefill = raw_system_prefill
	if runtime_llopa_layers is None:
	runtime_llopa_layers = runtime_prefill_layers
	if runtime_llopa_attn is None:
	runtime_llopa_attn = runtime_prefill_attn
	if runtime_llopa_no_upper_attn is None:
	runtime_llopa_no_upper_attn = bool(no_upper_attn) if no_upper_attn is not None else False
	if unified_llopa_layers is None:
	unified_llopa_layers = runtime_prefill_layers
	if unified_llopa_attn is None:
	unified_llopa_attn = runtime_prefill_attn
	if unified_llopa_system_prefill is None:
	unified_llopa_system_prefill = runtime_prefill_system_prefill
	if unified_llopa_user_prefill is None:
	raw_user_prefill = (info.get("user_prefill") or _config_str("capsule_user_prefill") or "full").strip().lower()
	if raw_user_prefill:
	unified_llopa_user_prefill = raw_user_prefill
	if unified_llopa_no_upper_attn is None:
	unified_llopa_no_upper_attn = bool(no_upper_attn) if no_upper_attn is not None else False
	if unified_llopa_see_past_assistant is None:
	unified_llopa_see_past_assistant = False
	if unified_llopa_replay_module is None:
	unified_llopa_replay_module = unified_llopa_last_layer_module
	if unified_llopa_replay_module is None:
	unified_llopa_replay_module = (
	info.get("replay_module")
	or _config_str("capsule_replay_module")
	or info.get("last_layer_module")
	or _config_str("capsule_last_layer_module")
	)
	unified_llopa_replay_module = _normalize_replay_module_value(unified_llopa_replay_module)
	unified_llopa_last_layer_module = str(unified_llopa_replay_module)
	if unified_llopa_replay_per_layers is None:
	unified_llopa_replay_per_layers = info.get("replay_per_layers") or _config_str("capsule_replay_per_layers") or -1
	unified_llopa_replay_per_layers = _normalize_replay_per_layers_value(unified_llopa_replay_per_layers)
	if llopa_v2_layers is None:
	llopa_v2_layers = runtime_prefill_layers
	if llopa_v2_attn is None:
	llopa_v2_attn = runtime_prefill_attn
	if llopa_v2_system_prefill is None:
	llopa_v2_system_prefill = runtime_prefill_system_prefill
	if llopa_v2_user_prefill is None:
	raw_user_prefill = (info.get("user_prefill") or _config_str("capsule_user_prefill") or "full").strip().lower()
	if raw_user_prefill:
	llopa_v2_user_prefill = raw_user_prefill
	if llopa_v2_no_upper_attn is None:
	llopa_v2_no_upper_attn = bool(no_upper_attn) if no_upper_attn is not None else False
	if llopa_v2_see_past_assistant is None:
	llopa_v2_see_past_assistant = False
	if llopa_v2_replay_module is None:
	llopa_v2_replay_module = llopa_v2_last_layer_module
	if llopa_v2_replay_module is None:
	llopa_v2_replay_module = (
	info.get("replay_module")
	or _config_str("capsule_replay_module")
	or info.get("last_layer_module")
	or _config_str("capsule_last_layer_module")
	)
	llopa_v2_replay_module = _normalize_replay_module_value(llopa_v2_replay_module)
	llopa_v2_last_layer_module = str(llopa_v2_replay_module)
	if llopa_v2_replay_per_layers is None:
	llopa_v2_replay_per_layers = info.get("replay_per_layers") or _config_str("capsule_replay_per_layers") or -1
	llopa_v2_replay_per_layers = _normalize_replay_per_layers_value(llopa_v2_replay_per_layers)
	llopa_v2_seed_mode = "prefill_header"
	if optimized_llopa_layers is None:
	optimized_llopa_layers = runtime_prefill_layers
	if optimized_llopa_attn is None:
	optimized_llopa_attn = runtime_prefill_attn
	if optimized_llopa_system_prefill is None:
	optimized_llopa_system_prefill = runtime_prefill_system_prefill
	if optimized_llopa_user_prefill is None:
	raw_user_prefill = (info.get("user_prefill") or _config_str("capsule_user_prefill") or "full").strip().lower()
	if raw_user_prefill:
	optimized_llopa_user_prefill = raw_user_prefill
	if optimized_llopa_no_upper_attn is None:
	optimized_llopa_no_upper_attn = bool(no_upper_attn) if no_upper_attn is not None else False
	if optimized_llopa_see_past_assistant is None:
	optimized_llopa_see_past_assistant = False
	if optimized_llopa_replay_module is None:
	optimized_llopa_replay_module = optimized_llopa_last_layer_module
	if optimized_llopa_replay_module is None:
	optimized_llopa_replay_module = (
	info.get("replay_module")
	or _config_str("capsule_replay_module")
	or info.get("last_layer_module")
	or _config_str("capsule_last_layer_module")
	)
	optimized_llopa_replay_module = _normalize_replay_module_value(optimized_llopa_replay_module)
	optimized_llopa_last_layer_module = str(optimized_llopa_replay_module)
	if optimized_llopa_replay_per_layers is None:
	optimized_llopa_replay_per_layers = info.get("replay_per_layers") or _config_str("capsule_replay_per_layers") or -1
	optimized_llopa_replay_per_layers = _normalize_replay_per_layers_value(optimized_llopa_replay_per_layers)
	optimized_settings = _resolve_optimized_llopa_settings(
	variant=optimized_llopa_variant,
	seed_mode=optimized_llopa_seed_mode,
	upper_prepare_mode=optimized_llopa_upper_prepare_mode,
	upper_bucket_multiple=optimized_llopa_upper_bucket_multiple,
	seq_bucket_multiple=optimized_llopa_seq_bucket_multiple,
	)
	if direct_llopa_layers is None:
	direct_llopa_layers = runtime_prefill_layers
	if direct_llopa_attn is None:
	direct_llopa_attn = runtime_prefill_attn
	if direct_llopa_system_prefill is None:
	direct_llopa_system_prefill = runtime_prefill_system_prefill
	if direct_llopa_user_prefill is None:
	raw_user_prefill = (info.get("user_prefill") or _config_str("capsule_user_prefill") or "full").strip().lower()
	if raw_user_prefill:
	direct_llopa_user_prefill = raw_user_prefill
	if direct_llopa_no_upper_attn is None:
	direct_llopa_no_upper_attn = bool(no_upper_attn) if no_upper_attn is not None else False
	if runtime_prefill_solo_layers is None:
	runtime_prefill_solo_layers = runtime_prefill_layers
	if runtime_prefill_solo_attn is None:
	runtime_prefill_solo_attn = runtime_prefill_attn
	if runtime_prefill_solo_system_prefill is None:
	runtime_prefill_solo_system_prefill = runtime_prefill_system_prefill
	if runtime_prefill_solo_v2_layers is None:
	runtime_prefill_solo_v2_layers = runtime_prefill_layers
	if runtime_prefill_solo_v2_attn is None:
	runtime_prefill_solo_v2_attn = runtime_prefill_attn
	if runtime_prefill_solo_v2_system_prefill is None:
	runtime_prefill_solo_v2_system_prefill = runtime_prefill_system_prefill
	if runtime_prefill_solo_v2_with_bos is None:
	runtime_prefill_solo_v2_with_bos = False

	config_kwargs = {"config": config} if config is not None else {}

	dtype_norm = _normalize_dtype_arg(dtype) or "auto"
	torch_dtype_norm = _normalize_dtype_arg(torch_dtype)
	if dtype_norm == "auto" and torch_dtype_norm is not None:
	dtype_norm = torch_dtype_norm

	if dtype_norm == "fp32":
	torch_dtype = torch.float32
	elif dtype_norm == "bf16":
	torch_dtype = torch.bfloat16
	elif dtype_norm == "fp16":
	torch_dtype = torch.float16
	else:
	if torch.cuda.is_available() and torch.cuda.is_bf16_supported():
	torch_dtype = torch.bfloat16
	elif torch.cuda.is_available():
	torch_dtype = torch.float16
	else:
	torch_dtype = torch.float32

	# Accept HF-style aliases so AutoModel.from_pretrained(...) kwargs work as-is.
	for cand in (attn_implementation, _attn_implementation):
	if cand not in (None, "", "auto"):
	attn_impl = str(cand)
	break
	if attn_impl != "auto" and config is not None:
	for k in ("attn_implementation", "_attn_implementation"):
	with contextlib.suppress(Exception):
	setattr(config, k, attn_impl)

	# `device_map="cuda:0"` is a single-device placement request, not a sharding map.
	if isinstance(device_map, str):
	dm = device_map.strip()
	if dm in ("", "none", "None", "null", "NULL"):
	device_map = None
	elif dm in ("cuda", "cpu", "mps") or dm.startswith(("cuda:", "xpu:", "npu:")):
	if not device:
	device = dm
	device_map = None

	if device_map is None and not device:
	device = "cuda" if torch.cuda.is_available() else "cpu"

	if tokenizer_name:
	tok_src = tokenizer_name
	else:
	tok_src = str(repo_path) if (repo_path / "tokenizer.json").is_file() else backbone_ref
	tokenizer = AutoTokenizer.from_pretrained(
	tok_src,
	use_fast=True,
	cache_dir=cache_dir,
	revision=revision,
	token=token,
	local_files_only=local_files_only,
	**_tokenizer_kwargs(AutoTokenizer.from_pretrained),
	)
	if tokenizer.pad_token is None:
	tokenizer.pad_token = tokenizer.eos_token
	tokenizer.padding_side = "right"
	checkpoint_repo_path = repo_path
	if not _repo_has_pretrained_weights(checkpoint_repo_path):
	with contextlib.suppress(Exception):
	resolved_backbone = _resolve_repo_path(
	backbone_ref,
	cache_dir=cache_dir,
	revision=revision,
	token=token,
	local_files_only=local_files_only,
	)
	if _repo_has_pretrained_weights(resolved_backbone):
	checkpoint_repo_path = resolved_backbone
	checkpoint_vocab_size = 0
	alignment_report = None
	with contextlib.suppress(Exception):
	checkpoint_vocab_size = int(_infer_checkpoint_vocab_size(checkpoint_repo_path) or 0)
	if checkpoint_vocab_size > 0:
	try:
	tokenizer_vocab_size = int(len(tokenizer))
	except Exception:
	tokenizer_vocab_size = 0
	if tokenizer_vocab_size > 0 and checkpoint_vocab_size > tokenizer_vocab_size:
	alignment_report = _align_tokenizer_with_checkpoint_vocab(
	tokenizer,
	checkpoint_repo_path,
	checkpoint_vocab_size,
	)
	_log_tokenizer_checkpoint_alignment("[load_llopa_model]", alignment_report)
	if config is not None:
	try:
	tokenizer_vocab_size = int(len(tokenizer))
	except Exception:
	tokenizer_vocab_size = 0
	config_vocab_size = 0
	try:
	config_vocab_size = int(getattr(config, "vocab_size", 0) or 0)
	except Exception:
	config_vocab_size = 0
	target_vocab_size = max(config_vocab_size, tokenizer_vocab_size, int(checkpoint_vocab_size or 0))
	if checkpoint_vocab_size > 0 and checkpoint_vocab_size != config_vocab_size:
	_log_config_checkpoint_vocab_alignment(
	"[load_llopa_model]",
	config_vocab_size=config_vocab_size,
	checkpoint_vocab_size=checkpoint_vocab_size,
	tokenizer_vocab_size=tokenizer_vocab_size,
	alignment_report=alignment_report,
	)
	if target_vocab_size > 0 and target_vocab_size != config_vocab_size:
	with contextlib.suppress(Exception):
	config.vocab_size = int(target_vocab_size)
	if enable_thinking is not None:
	if hasattr(tokenizer, "enable_thinking"):
	try:
	tokenizer.enable_thinking(enable_thinking)
	except Exception:
	try:
	tokenizer.enable_thinking = enable_thinking
	except Exception:
	pass
	elif hasattr(tokenizer, "set_enable_thinking"):
	try:
	tokenizer.set_enable_thinking(enable_thinking)
	except Exception:
	pass
	try:
	setattr(tokenizer, "_force_enable_thinking", enable_thinking)
	except Exception:
	pass

	model_family = infer_model_family(backbone_ref, modeling_family)
	modeling_path = _resolve_modeling_path(repo_path, lopa_modeling_path, model_family)
	lora_path = repo_path / "lora"
	has_lora = bool(use_lora and lora_path.exists() and any(lora_path.iterdir()))
	if not has_lora and bool(use_lora):
	top_level_adapter = (
	(repo_path / "adapter_config.json").is_file()
	and (
	(repo_path / "adapter_model.safetensors").is_file()
	or (repo_path / "adapter_model.bin").is_file()
	)
	)
	if top_level_adapter:
	lora_path = repo_path
	has_lora = True
	load_device_map = device_map
	merge_on_cpu_active = bool(merge_on_cpu and has_lora and device_map is None)
	if merge_on_cpu and has_lora and device_map is not None:
	print("[LoRA] merge_on_cpu ignored because sharded device_map is requested.")
	if merge_on_cpu_active:
	print("[LoRA] Merging adapters on CPU to reduce CUDA peak memory.")
	load_device_map = None

	custom_mod = None
	if modeling_path:
	try:
	custom_mod = load_custom_modeling(modeling_path, model_family=model_family)
	except Exception:
	custom_mod = None

	base = None
	custom_load_exc = None
	if custom_mod is not None:
	try:
	if model_family == "qwen3":
	base = custom_mod.Qwen3ForCausalLM.from_pretrained(
	backbone_ref,
	**_dtype_kwargs(custom_mod.Qwen3ForCausalLM.from_pretrained, torch_dtype),
	trust_remote_code=trust_remote_code,
	cache_dir=cache_dir,
	revision=revision,
	token=token,
	local_files_only=local_files_only,
	device_map=load_device_map,
	**config_kwargs,
	)
	elif model_family == "mistral":
	base = custom_mod.MistralForCausalLM.from_pretrained(
	backbone_ref,
	**_dtype_kwargs(custom_mod.MistralForCausalLM.from_pretrained, torch_dtype),
	trust_remote_code=trust_remote_code,
	cache_dir=cache_dir,
	revision=revision,
	token=token,
	local_files_only=local_files_only,
	device_map=load_device_map,
	**config_kwargs,
	)
	else:
	base = custom_mod.LlamaForCausalLM.from_pretrained(
	backbone_ref,
	**_dtype_kwargs(custom_mod.LlamaForCausalLM.from_pretrained, torch_dtype),
	trust_remote_code=trust_remote_code,
	cache_dir=cache_dir,
	revision=revision,
	token=token,
	local_files_only=local_files_only,
	device_map=load_device_map,
	**config_kwargs,
	)
	except Exception as exc:
	custom_load_exc = exc
	base = None

	if base is None and force_custom_modeling and custom_mod is not None and custom_load_exc is not None:
	raise RuntimeError(
	f"Failed to load base model with custom LLoPA modeling from {backbone_ref}"
	) from custom_load_exc

	if base is None:
	base = AutoModelForCausalLM.from_pretrained(
	backbone_ref,
	trust_remote_code=trust_remote_code,
	**_dtype_kwargs(AutoModelForCausalLM.from_pretrained, torch_dtype),
	cache_dir=cache_dir,
	revision=revision,
	token=token,
	local_files_only=local_files_only,
	device_map=load_device_map,
	**config_kwargs,
	)

	ensure_mistral_special_token(tokenizer, base)
	load_embedding_layer(base, repo_path)
	if merge_on_cpu_active:
	try:
	base = base.to("cpu")
	except Exception:
	pass

	model = None
	if has_lora:
	num_lora = int(number_of_lora or 1)
	if num_lora == 2:
	gen_dir = lora_path / "gen"
	prefill_dir = lora_path / "prefill"
	if gen_dir.is_dir() and prefill_dir.is_dir():
	try:
	from peft import PeftModel
	peft_gen = PeftModel.from_pretrained(base, str(gen_dir), adapter_name="gen")
	try:
	peft_gen.set_adapter("gen")
	except Exception:
	pass
	merged_base = peft_gen.merge_and_unload()
	peft_prefill = PeftModel.from_pretrained(merged_base, str(prefill_dir), adapter_name="prefill")
	model = peft_prefill
	setattr(model, "_prefill_adapter_only", True)
	except Exception:
	model = base
	else:
	model = base
	if model is None or model is base:
	try:
	from peft import PeftModel
	peft = PeftModel.from_pretrained(base, str(lora_path))
	try:
	model = peft.merge_and_unload()
	except Exception:
	model = peft
	except Exception:
	model = base
	else:
	model = base

	if num_specials > 0:
	if not load_llopa_specials(model, repo_path):
	print("[Warn] Failed to load LLOPA specials.")

	try:
	p0 = next(model.parameters())
	print(f"[load_llopa_model] model dtype={p0.dtype}, on={p0.device}")
	except Exception:
	pass

	if device_map is None:
	model = model.to(device).eval()
	else:
	model = model.eval()
	if no_upper_attn is not None:
	try:
	setattr(model, "_no_upper_attn", bool(no_upper_attn))
	except Exception:
	pass
	if bool(runtime_prefill_lower):
	if _supports_prefill_lower_runtime(model):
	_attach_prefill_lower_generate(
	model,
	lower_k=int(runtime_prefill_layers or 0),
	prefill_attn=str(runtime_prefill_attn or "causal"),
	system_prefill=str(runtime_prefill_system_prefill or "no_bos_system"),
	)
	try:
	print(
	f"[load_llopa_model] standard generate runtime enabled "
	f"(prefill_lower_layers={int(runtime_prefill_layers or 0)}, "
	f"prefill_attn={str(runtime_prefill_attn or 'causal')})"
	)
	except Exception:
	pass
	try:
	setattr(model, "_capsule_inference_path", "runtime_lower")
	except Exception:
	pass
	else:
	print("[load_llopa_model][warn] runtime_prefill_lower requested but TRI prefill-lower runtime is unavailable.")
	if bool(runtime_prefill_freeze):
	if _supports_prefill_lower_runtime(model):
	_attach_prefill_lower_freeze_generate(
	model,
	tokenizer=tokenizer,
	lower_k=int(runtime_prefill_freeze_layers or 0),
	prefill_attn=str(runtime_prefill_freeze_attn or "causal"),
	system_prefill=str(runtime_prefill_freeze_system_prefill or "no_bos_system"),
	)
	try:
	print(
	f"[load_llopa_model] freeze-faithful generate runtime enabled "
	f"(prefill_lower_layers={int(runtime_prefill_freeze_layers or 0)}, "
	f"prefill_attn={str(runtime_prefill_freeze_attn or 'causal')}, "
	f"system_prefill={str(runtime_prefill_freeze_system_prefill or 'no_bos_system')})"
	)
	except Exception:
	pass
	try:
	setattr(model, "_capsule_inference_path", "runtime_freeze")
	except Exception:
	pass
	else:
	print("[load_llopa_model][warn] runtime_prefill_freeze requested but TRI prefill-freeze runtime is unavailable.")
	if bool(runtime_prefill_solo):
	if _supports_prefill_lower_runtime(model):
	_attach_prefill_lower_solo_generate(
	model,
	tokenizer=tokenizer,
	lower_k=int(runtime_prefill_solo_layers or 0),
	prefill_attn=str(runtime_prefill_solo_attn or "causal"),
	system_prefill=str(runtime_prefill_solo_system_prefill or "no_bos_system"),
	)
	try:
	print(
	f"[load_llopa_model] solo-attn generate runtime enabled "
	f"(prefill_lower_layers={int(runtime_prefill_solo_layers or 0)}, "
	f"prefill_attn={str(runtime_prefill_solo_attn or 'causal')}, "
	f"system_prefill={str(runtime_prefill_solo_system_prefill or 'no_bos_system')})"
	)
	except Exception:
	pass
	try:
	setattr(model, "_capsule_inference_path", "runtime_solo")
	except Exception:
	pass
	else:
	print("[load_llopa_model][warn] runtime_prefill_solo requested but TRI prefill-solo runtime is unavailable.")
	if bool(runtime_prefill_solo_v2):
	if _supports_prefill_lower_runtime(model):
	_attach_prefill_lower_solo_v2_generate(
	model,
	tokenizer=tokenizer,
	lower_k=int(runtime_prefill_solo_v2_layers or 0),
	prefill_attn=str(runtime_prefill_solo_v2_attn or "causal"),
	system_prefill=str(runtime_prefill_solo_v2_system_prefill or "no_bos_system"),
	with_bos=bool(runtime_prefill_solo_v2_with_bos),
	)
	try:
	print(
	f"[load_llopa_model] solo-attn-v2 generate runtime enabled "
	f"(prefill_lower_layers={int(runtime_prefill_solo_v2_layers or 0)}, "
	f"prefill_attn={str(runtime_prefill_solo_v2_attn or 'causal')}, "
	f"system_prefill={str(runtime_prefill_solo_v2_system_prefill or 'no_bos_system')}, "
	f"with_bos={int(bool(runtime_prefill_solo_v2_with_bos))})"
	)
	except Exception:
	pass
	try:
	setattr(model, "_capsule_inference_path", "runtime_solo_v2")
	except Exception:
	pass
	else:
	print("[load_llopa_model][warn] runtime_prefill_solo_v2 requested but TRI prefill-solo-v2 runtime is unavailable.")
	if bool(runtime_llopa_prefill):
	if _supports_runtime_llopa_prompt_prefill(model):
	header_ids = _assistant_header_ids(tokenizer, "cpu")
	if isinstance(header_ids, torch.Tensor) and header_ids.numel() > 0:
	_attach_runtime_llopa_generate(
	model,
	header_ids=header_ids,
	lower_k=int(runtime_llopa_layers or 0),
	prefill_attn=str(runtime_llopa_attn or "causal"),
	no_upper_attn=bool(runtime_llopa_no_upper_attn),
	)
	if bool(runtime_llopa_fast_generate):
	_attach_runtime_llopa_fast_generate(
	model,
	lower_k=int(runtime_llopa_layers or 0),
	prefill_attn=str(runtime_llopa_attn or "causal"),
	no_upper_attn=bool(runtime_llopa_no_upper_attn),
	)
	try:
	print(
	f"[load_llopa_model] standard generate runtime enabled "
	f"(llopa_prefill_layers={int(runtime_llopa_layers or 0)}, "
	f"prefill_attn={str(runtime_llopa_attn or 'causal')})"
	)
	except Exception:
	pass
	try:
	setattr(model, "_capsule_inference_path", "legacy_llopa")
	except Exception:
	pass
	if bool(runtime_llopa_fast_generate):
	try:
	print("[load_llopa_model] runtime_llopa_fast_generate enabled")
	except Exception:
	pass
	try:
	setattr(model, "_capsule_inference_path", "legacy_llopa")
	except Exception:
	pass
	else:
	print("[load_llopa_model][warn] runtime_llopa_prefill requested but assistant header ids are unavailable.")
	else:
	print("[load_llopa_model][warn] runtime_llopa_prefill requested but TRI LLoPA runtime is unavailable.")
	if bool(unified_llopa_generate):
	if _supports_direct_llopa_generate(model):
	_attach_unified_llopa_generate(
	model,
	tokenizer,
	lower_k=int(unified_llopa_layers or 0),
	prefill_attn=str(unified_llopa_attn or "causal"),
	system_prefill=str(unified_llopa_system_prefill or "full"),
	user_prefill=str(unified_llopa_user_prefill or "full"),
	no_upper_attn=bool(unified_llopa_no_upper_attn),
	see_past_assistant=bool(unified_llopa_see_past_assistant),
	replay_module=str(unified_llopa_replay_module or "none"),
	replay_per_layers=int(unified_llopa_replay_per_layers or -1),
	)
	try:
	print(
	f"[load_llopa_model] unified_llopa generate enabled "
	f"(llopa_prefill_layers={int(unified_llopa_layers or 0)}, "
	f"prefill_attn={str(unified_llopa_attn or 'causal')}, "
	f"replay_module={str(unified_llopa_replay_module or 'none')}, "
	f"replay_per_layers={int(unified_llopa_replay_per_layers or -1)})"
	)
	except Exception:
	pass
	else:
	print("[load_llopa_model][warn] unified_llopa_generate requested but LLoPA direct prompt prefill is unavailable.")
	if bool(llopa_v2_batch_generate):
	if _supports_direct_llopa_generate(model):
	_attach_llopa_v2_batch_generate(
	model,
	tokenizer,
	lower_k=int(llopa_v2_layers or 0),
	prefill_attn=str(llopa_v2_attn or "causal"),
	system_prefill=str(llopa_v2_system_prefill or "full"),
	user_prefill=str(llopa_v2_user_prefill or "full"),
	no_upper_attn=bool(llopa_v2_no_upper_attn),
	see_past_assistant=bool(llopa_v2_see_past_assistant),
	replay_module=str(llopa_v2_replay_module or "none"),
	replay_per_layers=int(llopa_v2_replay_per_layers or -1),
	seed_mode=str(llopa_v2_seed_mode or "prefill_header"),
	)
	try:
	print(
	f"[load_llopa_model] llopa_v2_batch generate enabled "
	f"(llopa_prefill_layers={int(llopa_v2_layers or 0)}, "
	f"prefill_attn={str(llopa_v2_attn or 'causal')}, "
	f"replay_module={str(llopa_v2_replay_module or 'none')}, "
	f"replay_per_layers={int(llopa_v2_replay_per_layers or -1)}, "
	f"seed_mode={str(llopa_v2_seed_mode or 'prefill_header')})"
	)
	except Exception:
	pass
	else:
	print("[load_llopa_model][warn] llopa_v2_batch_generate requested but LLoPA direct prompt prefill is unavailable.")
	if bool(llopa_v2_generate):
	if _supports_direct_llopa_generate(model):
	llopa_v3_active = bool(llopa_v3_generate)
	_attach_llopa_v2_generate(
	model,
	tokenizer,
	lower_k=int(llopa_v2_layers or 0),
	prefill_attn=str(llopa_v2_attn or "causal"),
	system_prefill=str(llopa_v2_system_prefill or "full"),
	user_prefill=str(llopa_v2_user_prefill or "full"),
	no_upper_attn=bool(llopa_v2_no_upper_attn),
	see_past_assistant=bool(llopa_v2_see_past_assistant),
	replay_module=str(llopa_v2_replay_module or "none"),
	replay_per_layers=int(llopa_v2_replay_per_layers or -1),
	seed_mode=str(llopa_v2_seed_mode or "prefill_header"),
	generation_mixin_decode=llopa_v3_active,
	capsule_inference_path="llopa_v3" if llopa_v3_active else "llopa_v2",
	)
	try:
	label = "llopa_v3" if llopa_v3_active else "llopa_v2"
	print(
	f"[load_llopa_model] {label} generate enabled "
	f"(llopa_prefill_layers={int(llopa_v2_layers or 0)}, "
	f"prefill_attn={str(llopa_v2_attn or 'causal')}, "
	f"replay_module={str(llopa_v2_replay_module or 'none')}, "
	f"replay_per_layers={int(llopa_v2_replay_per_layers or -1)}, "
	f"seed_mode={str(llopa_v2_seed_mode or 'prefill_header')}, "
	f"generation_mixin_decode={int(llopa_v3_active)})"
	)
	except Exception:
	pass
	else:
	label = "llopa_v3_generate" if bool(llopa_v3_generate) else "llopa_v2_generate"
	print(f"[load_llopa_model][warn] {label} requested but LLoPA direct prompt prefill is unavailable.")
	if bool(optimized_llopa_generate):
	if _supports_direct_llopa_generate(model):
	_attach_optimized_llopa_generate(
	model,
	tokenizer,
	lower_k=int(optimized_llopa_layers or 0),
	prefill_attn=str(optimized_llopa_attn or "causal"),
	system_prefill=str(optimized_llopa_system_prefill or "full"),
	user_prefill=str(optimized_llopa_user_prefill or "full"),
	no_upper_attn=bool(optimized_llopa_no_upper_attn),
	see_past_assistant=bool(optimized_llopa_see_past_assistant),
	replay_module=str(optimized_llopa_replay_module or "none"),
	replay_per_layers=int(optimized_llopa_replay_per_layers or -1),
	optimized_variant=str(optimized_settings["variant"]),
	optimized_seed_mode=str(optimized_settings["seed_mode"]),
	optimized_upper_prepare_mode=str(optimized_settings["upper_prepare_mode"]),
	optimized_upper_bucket_multiple=int(optimized_settings["upper_bucket_multiple"]),
	optimized_seq_bucket_multiple=int(optimized_settings["seq_bucket_multiple"]),
	)
	try:
	print(
	f"[load_llopa_model] optimized_llopa generate enabled "
	f"(llopa_prefill_layers={int(optimized_llopa_layers or 0)}, "
	f"prefill_attn={str(optimized_llopa_attn or 'causal')}, "
	f"replay_module={str(optimized_llopa_replay_module or 'none')}, "
	f"replay_per_layers={int(optimized_llopa_replay_per_layers or -1)}, "
	f"variant={str(optimized_settings['variant'])})"
	)
	except Exception:
	pass
	else:
	print("[load_llopa_model][warn] optimized_llopa_generate requested but LLoPA direct prompt prefill is unavailable.")
	if bool(direct_llopa_generate):
	print(
	"[load_llopa_model][warn] direct_llopa_* is deprecated; "
	"use unified_llopa_* or INFERENCE_PATH=unified_llopa. Legacy users can keep existing envs unchanged."
	)
	if _supports_direct_llopa_generate(model):
	_attach_direct_llopa_generate(
	model,
	tokenizer,
	lower_k=int(direct_llopa_layers or 0),
	prefill_attn=str(direct_llopa_attn or "causal"),
	system_prefill=str(direct_llopa_system_prefill or "full"),
	user_prefill=str(direct_llopa_user_prefill or "full"),
	no_upper_attn=bool(direct_llopa_no_upper_attn),
	)
	try:
	print(
	f"[load_llopa_model] direct_llopa generate enabled "
	f"(llopa_prefill_layers={int(direct_llopa_layers or 0)}, "
	f"prefill_attn={str(direct_llopa_attn or 'causal')})"
	)
	except Exception:
	pass
	else:
	print("[load_llopa_model][warn] direct_llopa_generate requested but LLoPA direct prompt prefill is unavailable.")

	if force_custom_modeling:
	try:
	has_llopa = _has_active_llopa_runtime(model)
	except Exception:
	has_llopa = False
	if not has_llopa:
	raise RuntimeError("Custom LLoPA modeling not active at inference. Check --lopa_modeling_path.")

	if attn_impl != "auto":
	impl = attn_impl
	for k in ("attn_implementation", "_attn_implementation"):
	try:
	setattr(model.config, k, impl)
	inner = getattr(model, "model", None) or getattr(model, "transformer", None)
	if inner is not None and hasattr(inner, "config"):
	setattr(inner.config, k, impl)
	except Exception:
	pass

	_attach_llopa_generate(model)

	return model, tokenizer


	# -----------------------------
	# CLI
	# -----------------------------
	def main():
	    """Command-line entry point for single-prompt TRI / LLoPA inference.

	    Parses the CLI arguments, loads the tokenizer and backbone (optionally via
	    the custom LLoPA modeling file), applies the saved embedding layer, LoRA
	    adapter and LLoPA specials found under ``--best_dir``, then calls
	    ``lopa_generate`` once and prints the generated text.

	    Source resolution order:
	      * tokenizer: ``--tokenizer_name`` -> ``best_dir`` (if it contains
	        ``tokenizer.json``) -> ``--model_name``.
	      * backbone: ``--backbone_dir`` -> ``read_backbone_ref(best_dir)`` ->
	        ``best_dir/base`` (if non-empty) -> ``--model_name``.

	    Raises:
	        RuntimeError: if ``--force_custom_modeling`` is set but
	            ``_has_active_llopa_runtime`` reports no active LLoPA runtime.
	    """
	    ap = argparse.ArgumentParser("TRI inference helper")
	    ap.add_argument("--best_dir", type=str, required=True, help="Path to best/ folder produced by training")
	    ap.add_argument("--backbone_dir", type=str, default=None,
	                    help="Optional backbone path/ID (overrides best_dir/backbone.json and --model_name).")
	    ap.add_argument("--model_name", type=str, default="meta-llama/Llama-3.1-8B-Instruct")
	    ap.add_argument("--tokenizer_name", type=str, default="", help="Optional tokenizer name or path")
	    ap.add_argument("--prefill_layers", type=int, default=4)
	    ap.add_argument("--prefill_mode", type=str, choices=["lower", "periodic"], default="lower",
	                    help="Prefill mode for user tokens: 'lower' uses first K layers, 'periodic' uses every K-th layer.")
	    ap.add_argument("--prefill_attn", type=str, choices=["causal", "full"], default="causal",
	                    help="Prefill attention for system/user tokens (training must match).")
	    ap.add_argument("--system_prefill", type=str, choices=["full", "no_system", "no_bos_system"], default="full",
	                    help="System prefill mode (must match training).")
	    ap.add_argument("--user_prefill", type=str, choices=["full", "no_question"], default="full",
	                    help="User prefill ablation: full=doc+question, no_question=doc-only (question runs in full layers).")
	    ap.add_argument("--llopa_prefill", action="store_true",
	                    help="Use single-forward LLOPA prefill (causal + lower only).")
	    ap.add_argument("--no_upper_attn", action="store_true",
	                    help="Skip upper-layer attention during decode (effective only with --llopa_prefill).")
	    ap.add_argument("--lopa_modeling_path", type=str, default="tri_llama3_modeling.py",
	                    help="Path to custom LLoPA modeling file used in training")
	    ap.add_argument("--modeling_family", type=str, choices=["auto", "llama", "qwen3", "mistral"], default="auto",
	                    help="Model family for custom modeling injection (auto detects from --model_name)")
	    ap.add_argument("--force_custom_modeling", action="store_true",
	                    help="Require custom LLoPA modeling to be active; error if not.")
	    ap.add_argument("--system", type=str, default="You are a helpful assistant that answers questions based on the given document. ")
	    # Plain '|' separators: a backslash before '|' is an invalid escape sequence.
	    ap.add_argument("--task", type=str, default="qa_doc",
	                    help="Prompt template task: qa_doc | math | summary | code")
	    ap.add_argument("--math_force_final_hash_rule", action="store_true",
	                    help="If set and task=math, append the #### answer rule to the system prompt.")
	    ap.add_argument("--document", type=str, required=True)
	    ap.add_argument("--question", type=str, required=True)
	    ap.add_argument("--max_new_tokens", type=int, default=256)
	    ap.add_argument("--min_length", type=int, default=16)
	    ap.add_argument("--temperature", type=float, default=0.7)
	    ap.add_argument("--top_p", type=float, default=0.9)
	    ap.add_argument("--top_k", type=int, default=None)
	    ap.add_argument("--do_sample", action="store_true")
	    # numeric controls for reproducibility
	    ap.add_argument("--dtype", type=str, choices=["auto", "bf16", "fp16", "fp32"], default="auto")
	    ap.add_argument("--no_tf32", action="store_true")
	    ap.add_argument("--sdpa_math_only", action="store_true")
	    ap.add_argument("--debug", action="store_true")
	    ap.add_argument("--attn_impl", type=str, choices=["sdpa", "eager", "auto"], default="sdpa",
	                    help="Attention implementation override (auto keeps model default).")
	    args = ap.parse_args()

	    device = "cuda" if torch.cuda.is_available() else "cpu"
	    if args.dtype == "fp32":
	        dtype = torch.float32
	    elif args.dtype == "bf16":
	        dtype = torch.bfloat16
	    elif args.dtype == "fp16":
	        dtype = torch.float16
	    elif device == "cuda":
	        # auto on GPU: bf16 when supported, otherwise fp16
	        dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
	    else:
	        # auto on CPU
	        dtype = torch.float32

	    # global numeric toggles (best-effort: older torch builds may lack these knobs)
	    if args.no_tf32 and torch.cuda.is_available():
	        try:
	            torch.backends.cuda.matmul.allow_tf32 = False
	            torch.backends.cudnn.allow_tf32 = False
	        except Exception:
	            pass
	    if args.sdpa_math_only and torch.cuda.is_available():
	        try:
	            torch.backends.cuda.enable_flash_sdp(False)
	            torch.backends.cuda.enable_mem_efficient_sdp(False)
	            torch.backends.cuda.enable_math_sdp(True)
	        except Exception:
	            pass

	    best_dir = Path(args.best_dir)
	    if args.tokenizer_name:
	        tok_src = args.tokenizer_name
	    elif (best_dir / "tokenizer.json").is_file():
	        tok_src = str(best_dir)
	    else:
	        tok_src = args.model_name
	    tokenizer = AutoTokenizer.from_pretrained(
	        tok_src,
	        use_fast=True,
	        **_tokenizer_kwargs(AutoTokenizer.from_pretrained),
	    )
	    if tokenizer.pad_token is None:
	        tokenizer.pad_token = tokenizer.eos_token
	    tokenizer.padding_side = "right"

	    # Backbone source: an explicit --backbone_dir wins, then the reference
	    # recorded for best_dir, then a saved copy under best_dir/base, and
	    # finally --model_name.
	    backbone_ref = args.backbone_dir or read_backbone_ref(best_dir)
	    base_path = best_dir / "base"
	    if backbone_ref:
	        base_load_src = backbone_ref
	    elif base_path.exists() and any(base_path.iterdir()):
	        base_load_src = str(base_path)
	    else:
	        base_load_src = args.model_name

	    # Optional training metadata stored next to the checkpoint.
	    num_specials = None
	    tri_no_upper_attn = None
	    tri_info = best_dir / "tri_info.txt"
	    if tri_info.is_file():
	        info = _read_kv_file(tri_info)
	        try:
	            num_specials = int(info.get("num_specials", "") or 0)
	        except Exception:
	            num_specials = None
	        raw_no_upper = (info.get("no_upper_attn") or "").strip().lower()
	        if raw_no_upper in {"1", "true", "yes", "on"}:
	            tri_no_upper_attn = True
	        elif raw_no_upper in {"0", "false", "no", "off"}:
	            tri_no_upper_attn = False
	    # The recorded value only applies when --no_upper_attn was not passed.
	    if (not args.no_upper_attn) and (tri_no_upper_attn is not None):
	        args.no_upper_attn = bool(tri_no_upper_attn)

	    config = None
	    try:
	        config = AutoConfig.from_pretrained(base_load_src)
	        if num_specials is not None:
	            config.llopa_num_specials = int(num_specials)
	    except Exception:
	        config = None
	    if num_specials is None:
	        num_specials = int(getattr(config, "llopa_num_specials", 0) or 0) if config is not None else 0
	    config_kwargs = {"config": config} if config is not None else {}

	    # Try loading custom LLoPA modeling before model load
	    model_family = infer_model_family(args.model_name, args.modeling_family)
	    try:
	        custom_mod = load_custom_modeling(args.lopa_modeling_path, model_family=model_family)
	    except Exception:
	        custom_mod = None

	    base = None
	    if custom_mod is not None:
	        try:
	            # Prefer explicit class if available (ensures we really use custom class)
	            class_name = {
	                "qwen3": "Qwen3ForCausalLM",
	                "mistral": "MistralForCausalLM",
	            }.get(model_family, "LlamaForCausalLM")
	            model_cls = getattr(custom_mod, class_name)
	            base = model_cls.from_pretrained(
	                base_load_src,
	                **_dtype_kwargs(model_cls.from_pretrained, dtype),
	                **config_kwargs,
	            )
	        except Exception:
	            # Fall through to the stock AutoModel loader below.
	            base = None
	    if base is None:
	        base = AutoModelForCausalLM.from_pretrained(
	            base_load_src,
	            trust_remote_code=False,
	            **_dtype_kwargs(AutoModelForCausalLM.from_pretrained, dtype),
	            **config_kwargs,
	        )
	    # Ensure special token availability for Mistral
	    ensure_mistral_special_token(tokenizer, base)
	    # Apply saved embedding layer (special tokens) if present.
	    loaded_emb = load_embedding_layer(base, best_dir)

	    # attach LoRA if exists
	    lora_path = best_dir / "lora"
	    model = base
	    merged_lora = False
	    if lora_path.exists() and any(lora_path.iterdir()):
	        try:
	            from peft import PeftModel
	            peft = PeftModel.from_pretrained(base, str(lora_path))
	        except Exception as exc:
	            # Keep the best-effort fallback, but never drop the adapter silently:
	            # the outputs would then come from the bare backbone.
	            print(f"[Warn] Failed to load LoRA adapter from {lora_path}: {exc!r}; using base model without LoRA.")
	            model = base
	        else:
	            try:
	                model = peft.merge_and_unload()
	                merged_lora = True
	            except Exception as exc:
	                # fallback: keep PEFT wrapper without merge
	                print(f"[Warn] LoRA merge failed ({exc!r}); keeping unmerged PEFT wrapper.")
	                model = peft

	    if num_specials > 0:
	        if not load_llopa_specials(model, best_dir):
	            print("[Warn] Failed to load LLOPA specials.")

	    # device & eval
	    model = model.to(device).eval()
	    try:
	        setattr(model, "_no_upper_attn", bool(args.no_upper_attn))
	    except Exception:
	        pass

	    # Validate custom modeling presence if requested
	    if args.force_custom_modeling:
	        has_llopa = _has_active_llopa_runtime(model)
	        if not has_llopa:
	            raise RuntimeError("Custom LLoPA modeling not active at inference. Check --lopa_modeling_path.")

	    # Allow sdpa/eager; avoid flash_attention_2 for LoPA masks.
	    if args.attn_impl != "auto":
	        impl = args.attn_impl
	        for k in ("attn_implementation", "_attn_implementation"):
	            try:
	                setattr(model.config, k, impl)
	                inner = getattr(model, "model", None) or getattr(model, "transformer", None)
	                if inner is not None and hasattr(inner, "config"):
	                    setattr(inner.config, k, impl)
	            except Exception:
	                pass
	        print(f"[infer] Forcing attn_implementation='{impl}' for all models.")
	    else:
	        print("[infer] Using model default attn_implementation (auto).")
	    if args.debug:
	        print(f"[debug] load base from: {base_load_src}")
	        if loaded_emb:
	            print("[debug] loaded embedding layer from best_dir")
	        print(f"[debug] lora path: {lora_path} | merged={merged_lora}")
	        tmpl = getattr(tokenizer, "chat_template", "") or ""
	        # Evaluate the membership tests outside the f-string: backslashes inside
	        # a replacement field are a SyntaxError before Python 3.12, and the
	        # marker must be the literal '<|start_header_id|>' to ever match.
	        has_llama3_header = "<|start_header_id|>" in tmpl
	        has_mistral_inst = "[INST]" in tmpl
	        print(f"[debug] template contains Llama3 header? {has_llama3_header} | Mistral? {has_mistral_inst}")

	    # debug dir under best_dir
	    dbg_dir = (best_dir / "debug_infer") if args.debug else None
	    text = lopa_generate(
	        model, tokenizer,
	        system=args.system, document=args.document, question=args.question,
	        task=str(args.task),
	        K=int(args.prefill_layers),
	        prefill_mode=str(args.prefill_mode),
	        prefill_attn=str(args.prefill_attn),
	        system_prefill=str(args.system_prefill),
	        user_prefill=str(args.user_prefill),
	        device=device,
	        max_new_tokens=args.max_new_tokens, min_length=args.min_length,
	        temperature=args.temperature, top_p=args.top_p, top_k=args.top_k,
	        do_sample=bool(args.do_sample),
	        math_force_final_hash_rule=bool(args.math_force_final_hash_rule),
	        llopa_prefill=bool(args.llopa_prefill),
	        no_upper_attn=bool(args.no_upper_attn),
	        debug=bool(args.debug), debug_dir=dbg_dir,
	    )
	    print(text)

	# Script entry point: run the CLI only when this file is executed directly,
	# not when it is imported as a module.
	if __name__ == "__main__":
	    main()