Image-Text-to-Text
Transformers
Safetensors
fast_d_drive
feature-extraction
block-diffusion
vision-language-action
autonomous-driving
qwen2.5-vl
conversational
custom_code
Instructions to use xiwenyoumu/Fast-dDrive with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use xiwenyoumu/Fast-dDrive with Transformers:
# Use a pipeline as a high-level helper from transformers import pipeline pipe = pipeline("image-text-to-text", model="xiwenyoumu/Fast-dDrive", trust_remote_code=True) messages = [ { "role": "user", "content": [ {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"}, {"type": "text", "text": "What animal is on the candy?"} ] }, ] pipe(text=messages)# Load model directly from transformers import AutoModel model = AutoModel.from_pretrained("xiwenyoumu/Fast-dDrive", trust_remote_code=True, dtype="auto") - Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use xiwenyoumu/Fast-dDrive with vLLM:
Install from pip and serve model
# Install vLLM from pip: pip install vllm # Start the vLLM server: vllm serve "xiwenyoumu/Fast-dDrive" # Call the server using curl (OpenAI-compatible API): curl -X POST "http://localhost:8000/v1/chat/completions" \ -H "Content-Type: application/json" \ --data '{ "model": "xiwenyoumu/Fast-dDrive", "messages": [ { "role": "user", "content": [ { "type": "text", "text": "Describe this image in one sentence." }, { "type": "image_url", "image_url": { "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" } } ] } ] }'Use Docker
docker model run hf.co/xiwenyoumu/Fast-dDrive
- SGLang
How to use xiwenyoumu/Fast-dDrive with SGLang:
Install from pip and serve model
# Install SGLang from pip: pip install sglang # Start the SGLang server: python3 -m sglang.launch_server \ --model-path "xiwenyoumu/Fast-dDrive" \ --host 0.0.0.0 \ --port 30000 # Call the server using curl (OpenAI-compatible API): curl -X POST "http://localhost:30000/v1/chat/completions" \ -H "Content-Type: application/json" \ --data '{ "model": "xiwenyoumu/Fast-dDrive", "messages": [ { "role": "user", "content": [ { "type": "text", "text": "Describe this image in one sentence." }, { "type": "image_url", "image_url": { "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" } } ] } ] }'Use Docker images
docker run --gpus all \ --shm-size 32g \ -p 30000:30000 \ -v ~/.cache/huggingface:/root/.cache/huggingface \ --env "HF_TOKEN=<secret>" \ --ipc=host \ lmsysorg/sglang:latest \ python3 -m sglang.launch_server \ --model-path "xiwenyoumu/Fast-dDrive" \ --host 0.0.0.0 \ --port 30000 # Call the server using curl (OpenAI-compatible API): curl -X POST "http://localhost:30000/v1/chat/completions" \ -H "Content-Type: application/json" \ --data '{ "model": "xiwenyoumu/Fast-dDrive", "messages": [ { "role": "user", "content": [ { "type": "text", "text": "Describe this image in one sentence." }, { "type": "image_url", "image_url": { "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" } } ] } ] }' - Docker Model Runner
How to use xiwenyoumu/Fast-dDrive with Docker Model Runner:
docker model run hf.co/xiwenyoumu/Fast-dDrive
File size: 34,310 Bytes
5e9a603 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 | """
Section-aware block scheduling for JSON structured output.
Inspired by the S3 (Self-adaptive Schema Scaffolding) paper (arXiv:2507.04504),
this module provides utilities to:
1. Parse tokenized JSON output into sections (critical_objects, explanation, etc.)
2. Assign section-aware block indices for variable block sizes per section
3. Build JSON scaffolds for inference (pre-fill structural tokens)
The DVLM-AD output schema has 4 sections:
- critical_objects: ~88 tokens (12 yes/no fields, nearly constant)
- explanation: ~114 tokens (variable, 72-172)
- future_meta_behavior: ~40 tokens (nearly constant)
- trajectory: ~80 tokens (nearly constant)
"""
import torch
from typing import Dict, List, Optional, Tuple
import math
# Ordered list of section keys as they appear in the JSON output
SECTION_KEYS = [
"critical_objects",
"explanation",
"future_meta_behavior",
"trajectory",
]
# Default token budgets per section (based on training data analysis)
DEFAULT_TOKEN_BUDGETS = {
"critical_objects": 88,
"explanation": 128,
"future_meta_behavior": 40,
"trajectory": 80,
}
# Default steps per section
DEFAULT_SECTION_STEPS = {
"critical_objects": 1,
"explanation": 3,
"future_meta_behavior": 1,
"trajectory": 1,
}
def _v1_removed_parse_json_sections(*args, **kwargs):
raise NotImplementedError("DS v1 parse_json_sections has been removed. Use deep scaffold v2.")
def _v1_removed_compute_section_block_idx(*args, **kwargs):
raise NotImplementedError("DS v1 compute_section_block_idx has been removed. Use compute_section_block_idx_deep_static.")
def _v1_removed_build_json_scaffold(*args, **kwargs):
raise NotImplementedError("DS v1 build_json_scaffold has been removed. Use build_deep_json_scaffold.")
def _v1_removed_compute_section_block_sizes(*args, **kwargs):
raise NotImplementedError("DS v1 compute_section_block_sizes has been removed.")
def build_static_scaffold_sequences(tokenizer) -> Dict[str, List[int]]:
"""Pre-compute token sequences for top-level JSON boundary matching.
Used internally by :func:`build_deep_scaffold_sequences`.
"""
return {
"prefix": tokenizer.encode('{"critical_objects":', add_special_tokens=False),
"between_co_exp": tokenizer.encode(' "explanation":', add_special_tokens=False),
"between_exp_fmb": tokenizer.encode(' "future_meta_behavior":', add_special_tokens=False),
"between_fmb_traj": tokenizer.encode(' "trajectory":', add_special_tokens=False),
}
def _v1_removed_compute_section_block_idx_static(*args, **kwargs):
raise NotImplementedError("DS v1 compute_section_block_idx_static has been removed. Use compute_section_block_idx_deep_static.")
# Backward-compatible aliases so stale imports produce clear errors
parse_json_sections = _v1_removed_parse_json_sections
compute_section_block_idx = _v1_removed_compute_section_block_idx
build_json_scaffold = _v1_removed_build_json_scaffold
compute_section_block_sizes = _v1_removed_compute_section_block_sizes
compute_section_block_idx_static = _v1_removed_compute_section_block_idx_static
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# Deep scaffold v2: constants and utilities
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
NULL_TOKEN_ID = 151666
# critical_objects: 12 sub-keys, each value is exactly 1 token (yes=9693 / no=2152)
CRITICAL_OBJECTS_SUBKEYS = [
"nearby_vehicle", "pedestrian", "cyclist", "construction",
"traffic_element", "weather_condition", "road_hazard",
"emergency_vehicle", "animal", "special_vehicle",
"conflicting_vehicle", "door_opening_vehicle",
]
# future_meta_behavior: each sub-key value is exactly 3 tokens
# (e.g., "keep speed" β [4867, 4732, 151667] or "go straight" β [2849, 7833, 151667])
FMB_VALUE_BUDGET = 3
def build_deep_json_scaffold(
tokenizer,
section_token_budgets: Optional[Dict[str, int]] = None,
mask_id: Optional[int] = None,
null_id: Optional[int] = None,
explanation_block_size: int = 32,
explanation_max_blocks: int = 6,
) -> Tuple[List[int], Dict[str, Tuple[int, int]], List[int]]:
"""Build a deep JSON scaffold for inference (v2).
Constructs a template response by building a Python dict and
processing it through the **exact same pipeline** as the training
dataloader (``multi_modal_dataset.py``):
1. Build a realistic dict with placeholder values.
2. Pad explanation with ``<|NULL|>`` to ``exp_budget`` tokens.
3. Pad FMB values with ``<|NULL|>`` to 3 tokens each.
4. Normalize trajectory to ``+XXX.XX`` format with spaces.
5. Serialize with ``json.dumps(obj, ensure_ascii=False)``.
6. Tokenize the whole string as one piece.
7. Run ``compute_section_block_idx_deep_static`` to get scaffold/value.
8. Replace value positions with MASK tokens.
This guarantees identical BPE tokenization as training data.
Returns
-------
scaffold_tokens : list[int]
Token IDs with MASK at value positions.
section_ranges : dict
Section name -> (start, end) within scaffold_tokens.
scaffold_mask : list[int]
0 = value (to denoise), 1 = scaffold (frozen).
"""
import torch as _torch
import json as _json
import re as _re
if mask_id is None:
mask_tok = tokenizer.encode("|<MASK>|", add_special_tokens=False)
mask_id = mask_tok[0] if len(mask_tok) == 1 else 151665
if null_id is None:
null_id = NULL_TOKEN_ID
exp_budget = explanation_block_size * explanation_max_blocks # default 192
# ββ Step 1: Build a Python dict matching training data structure ββ
# Placeholder explanation text (will be replaced with MASK anyway).
filler_explanation = (
"The ego vehicle is driving forward on the road. "
"There are nearby vehicles ahead that may affect the path. "
"No pedestrians or cyclists are detected in the immediate area. "
"The road conditions appear normal with no hazards present. "
"Speed adjustment may be needed based on the traffic ahead. "
"No lateral maneuvering is required at this time."
)
def _build_template(n_exp_nulls: int) -> str:
"""Build template via json.dumps β identical to dataloader output."""
null_pad = "<|NULL|>" * n_exp_nulls
data_obj = {
"critical_objects": {
"nearby_vehicle": "no", "pedestrian": "no", "cyclist": "no",
"construction": "no", "traffic_element": "no",
"weather_condition": "no", "road_hazard": "no",
"emergency_vehicle": "no", "animal": "no",
"special_vehicle": "no", "conflicting_vehicle": "no",
"door_opening_vehicle": "no",
},
"explanation": filler_explanation + null_pad,
"future_meta_behavior": {
"longitudinal": "come to stop",
"lateral": "go straight<|NULL|>",
},
# Raw trajectory β will be normalized below
"trajectory": "[[+14.70,-00.04], [+29.55,-00.21], [+44.51,-00.56], [+59.50,-01.06], [+74.39,-01.69]]",
}
# Apply exact same trajectory normalization as dataloader (lines 851-863)
traj = data_obj["trajectory"]
def _fmt_coord(m):
raw = m.group(0)
sign = raw[0]
num = float(raw[1:])
return f"{sign}{num:06.2f}"
traj = _re.sub(r'[+-]\d+\.\d+', _fmt_coord, traj)
traj = _re.sub(r',([+-])', r', \1', traj)
traj = _re.sub(r'\[([+-])', r'[ \1', traj)
data_obj["trajectory"] = traj
# Serialize with json.dumps β identical to dataloader line 865
return _json.dumps(data_obj, ensure_ascii=False)
# ββ Step 2: Iteratively adjust NULL count for exp_budget ββ
deep_seqs = build_deep_scaffold_sequences(tokenizer)
top_seqs = deep_seqs["top"]
def _count_exp_value_tokens(tok_list):
"""Count explanation VALUE tokens (between boundary patterns)."""
co_exp_pat = top_seqs["between_co_exp"]
exp_fmb_pat = top_seqs["between_exp_fmb"]
co_exp_pos = _find_subseq(tok_list, co_exp_pat, 0)
if co_exp_pos < 0:
return None
exp_start = co_exp_pos + len(co_exp_pat)
exp_fmb_pos = _find_subseq(tok_list, exp_fmb_pat, exp_start)
if exp_fmb_pos < 0:
return None
# exp_start..exp_fmb_pos includes opening/closing quotes (scaffold)
# value tokens = total - 2 (quotes)
return (exp_fmb_pos - exp_start) - 2
# Measure base explanation tokens (no NULLs)
toks_0 = tokenizer.encode(_build_template(0), add_special_tokens=False)
base_exp = _count_exp_value_tokens(toks_0)
if base_exp is not None:
needed_nulls = max(0, exp_budget - base_exp)
else:
needed_nulls = exp_budget // 2 # fallback
# Build and measure, adjust once
template = _build_template(needed_nulls)
template_tokens = tokenizer.encode(template, add_special_tokens=False)
actual_exp = _count_exp_value_tokens(template_tokens)
if actual_exp is not None and actual_exp != exp_budget:
needed_nulls = max(0, needed_nulls + (exp_budget - actual_exp))
template = _build_template(needed_nulls)
template_tokens = tokenizer.encode(template, add_special_tokens=False)
# ββ Step 3: Run training scaffold detection ββ
prompt_len = 10
all_tokens = [1] * prompt_len + template_tokens
labels_list = [-100] * prompt_len + template_tokens
labels = _torch.tensor([labels_list])
token_ids = _torch.tensor([all_tokens])
_, _, _, scaffold_mask_tensor, _ = compute_section_block_idx_deep_static(
labels, token_ids, deep_seqs, fallback_block_size=32,
)
# ββ Step 4: Extract scaffold/value and replace value with MASK ββ
scaffold_tokens = list(template_tokens)
scaffold_mask_list: List[int] = []
for i in range(len(template_tokens)):
abs_pos = prompt_len + i
is_scaffold = scaffold_mask_tensor[abs_pos].item()
scaffold_mask_list.append(1 if is_scaffold else 0)
for i in range(len(scaffold_tokens)):
if scaffold_mask_list[i] == 0:
scaffold_tokens[i] = mask_id
# ββ Step 5: Compute section ranges ββ
section_ranges: Dict[str, Tuple[int, int]] = {}
boundary_order = [
("prefix", "critical_objects"),
("between_co_exp", "explanation"),
("between_exp_fmb", "future_meta_behavior"),
("between_fmb_traj", "trajectory"),
]
search_from = 0
prev_section_name = None
prev_value_start = None
for boundary_key, section_name in boundary_order:
pattern = top_seqs.get(boundary_key)
if pattern is None:
continue
pos = _find_subseq(template_tokens, pattern, search_from)
if pos < 0:
continue
if prev_section_name is not None and prev_value_start is not None:
section_ranges[prev_section_name] = (prev_value_start, pos)
value_start = pos + len(pattern)
prev_section_name = section_name
prev_value_start = value_start
search_from = value_start
if prev_section_name is not None and prev_value_start is not None:
section_ranges[prev_section_name] = (prev_value_start, len(template_tokens))
return scaffold_tokens, section_ranges, scaffold_mask_list
def _find_subseq(seq: List[int], pattern: List[int], start: int = 0) -> int:
"""Find first occurrence of *pattern* in *seq* starting at *start*. Returns -1 if not found."""
n = len(pattern)
for i in range(start, len(seq) - n + 1):
if seq[i : i + n] == pattern:
return i
return -1
def build_deep_scaffold_sequences(tokenizer) -> Dict[str, object]:
"""
Pre-compute token sequences for deep scaffold matching.
Returns a dict with:
- Top-level boundary patterns (same as build_static_scaffold_sequences)
- Sub-key patterns for critical_objects, future_meta_behavior, trajectory
"""
seqs: Dict[str, object] = {}
# ββ Top-level boundaries (reuse existing) ββ
seqs["top"] = build_static_scaffold_sequences(tokenizer)
# ββ critical_objects sub-key patterns ββ
# In context, CO value starts with ' {"nearby_vehicle": "yes", ...'
# Token 5212 = ' {"' merges space+brace+quote in context
# First entry: ' {"key": "'
# Subsequent: '", "key": "' (token 497='","' merges quote+comma)
co_patterns = []
for i, key in enumerate(CRITICAL_OBJECTS_SUBKEYS):
if i == 0:
pattern = tokenizer.encode(' {"' + key + '": "', add_special_tokens=False)
else:
pattern = tokenizer.encode('", "' + key + '": "', add_special_tokens=False)
co_patterns.append({"key": key, "pattern": pattern, "index": i})
seqs["co_subkeys"] = co_patterns
seqs["co_closing"] = tokenizer.encode('"}', add_special_tokens=False)
# json.dumps produces "}," which may merge into a single token
seqs["co_closing_comma"] = tokenizer.encode('"},', add_special_tokens=False)
# ββ future_meta_behavior sub-key patterns ββ
# After dataloader processing (mdm markers removed, NULLs cleaned):
# ' {"longitudinal": "keep speed", "lateral": "go straight"}'
# Scaffold = everything except the value content between quotes.
seqs["fmb_prefix"] = tokenizer.encode(' {"longitudinal": "', add_special_tokens=False)
seqs["fmb_closing"] = tokenizer.encode('"}', add_special_tokens=False)
seqs["fmb_closing_comma"] = tokenizer.encode('"},', add_special_tokens=False)
# Between longitudinal value and lateral value: '", "lateral": "'
seqs["fmb_between"] = tokenizer.encode('", "lateral": "', add_special_tokens=False)
# ββ trajectory structure patterns ββ
# After dataloader processing (no mdm markers), traj is:
# ' "[[+14.70,-00.04], [+29.55,-00.21], ...]"'
seqs["traj_open"] = tokenizer.encode(' "[[', add_special_tokens=False)
# After dataloader inserts spaces (e.g. [+14.70,-00.04] β [ +14.70, -00.04]),
# tokens split cleanly: '],'(1125), ' ['(508), ','(11) are all independent.
seqs["traj_wp_sep"] = tokenizer.encode('],', add_special_tokens=False) # [1125]
seqs["traj_wp_open"] = tokenizer.encode(' [', add_special_tokens=False) # [508]
seqs["traj_coord_comma"] = tokenizer.encode(',', add_special_tokens=False) # [11]
seqs["traj_close"] = tokenizer.encode(']]"}', add_special_tokens=False)
seqs["traj_close_split"] = tokenizer.encode(']]"', add_special_tokens=False)
seqs["traj_close_split2"] = tokenizer.encode(']]', add_special_tokens=False)
# Trajectory-only output support, e.g. {"trajectory": "..."}.
seqs["traj_only_boundaries"] = [
tokenizer.encode('{"trajectory":', add_special_tokens=False),
tokenizer.encode(' {"trajectory":', add_special_tokens=False),
tokenizer.encode('"trajectory":', add_special_tokens=False),
tokenizer.encode(' "trajectory":', add_special_tokens=False),
]
return seqs
def _mark_scaffold_range(scaffold_positions: List[int], start: int, length: int):
"""Add positions [start, start+length) to scaffold_positions."""
for i in range(length):
scaffold_positions.append(start + i)
def compute_section_block_idx_deep_static(
labels: torch.Tensor,
token_ids: torch.Tensor,
deep_scaffold_sequences: Dict[str, object],
fallback_block_size: int = 32,
) -> Tuple[torch.Tensor, torch.Tensor, int, torch.Tensor]:
"""
Deep-scaffold v2 block index computation.
Freezes sub-keys within sections:
- critical_objects: only yes/no values are denoised
- future_meta_behavior: only value tokens are denoised
- trajectory: only coordinate digits are denoised
- explanation: all content is denoised
Block count per section is computed dynamically:
``n_blocks = ceil(num_value_tokens / fallback_block_size)``.
Args:
labels: [B, seq_len]
token_ids: [B, seq_len]
deep_scaffold_sequences: output of ``build_deep_scaffold_sequences``
fallback_block_size: block size (bd_size), default 32
Returns:
response_block_idx, turn_idx, n_blocks, scaffold_mask
"""
labels_single = labels[0]
token_list = token_ids[0].tolist()
seq_len = labels_single.shape[0]
device = labels.device
response_mask = (labels_single != -100)
response_block_idx = torch.full((seq_len,), -1, device=device, dtype=torch.int64)
turn_idx = torch.zeros((seq_len,), device=device, dtype=torch.int64)
scaffold_mask = torch.zeros((seq_len,), device=device, dtype=torch.bool)
response_positions = response_mask.nonzero(as_tuple=True)[0]
if len(response_positions) == 0:
return response_block_idx, turn_idx, 0, scaffold_mask
resp_start = response_positions[0].item()
resp_end = response_positions[-1].item() + 1
effective_resp_end = resp_end
resp_tokens = token_list[resp_start:resp_end]
top_seqs = deep_scaffold_sequences["top"]
# ββ Step 1: Find top-level section boundaries (same as static version) ββ
boundary_order = [
("prefix", "critical_objects"),
("between_co_exp", "explanation"),
("between_exp_fmb", "future_meta_behavior"),
("between_fmb_traj", "trajectory"),
]
sections: Dict[str, Tuple[int, int]] = {}
scaffold_positions: List[int] = []
# Top-level boundary scaffold tokens should belong to the *following*
# section's first block (e.g. `"explanation":` -> explanation block 0).
boundary_scaffold_to_section: Dict[str, List[int]] = {}
search_from = 0
prev_section_name: Optional[str] = None
prev_value_start: Optional[int] = None
for boundary_key, section_name in boundary_order:
pattern = top_seqs.get(boundary_key)
if pattern is None:
continue
pos = _find_subseq(resp_tokens, pattern, search_from)
if pos < 0:
continue
if prev_section_name is not None and prev_value_start is not None:
sections[prev_section_name] = (prev_value_start, pos)
_mark_scaffold_range(scaffold_positions, pos, len(pattern))
boundary_scaffold_to_section.setdefault(section_name, []).extend(
list(range(pos, pos + len(pattern)))
)
value_start = pos + len(pattern)
prev_section_name = section_name
prev_value_start = value_start
search_from = value_start
if prev_section_name is not None and prev_value_start is not None:
sections[prev_section_name] = (prev_value_start, len(resp_tokens))
# New dataset compatibility: response may contain only trajectory.
# If the 4-section boundaries are not found, try direct trajectory key match.
if "trajectory" not in sections:
traj_only_patterns = deep_scaffold_sequences.get("traj_only_boundaries", [])
# Reuse legacy boundary pattern as additional fallback (contains
# `"trajectory":` in old-format responses).
between_fmb_traj = top_seqs.get("between_fmb_traj")
if between_fmb_traj:
traj_only_patterns = list(traj_only_patterns) + [between_fmb_traj]
traj_pos = -1
traj_pat: Optional[List[int]] = None
for pat in traj_only_patterns:
if not pat:
continue
pos = _find_subseq(resp_tokens, pat, 0)
if pos >= 0:
traj_pos = pos
traj_pat = pat
break
if traj_pos >= 0 and traj_pat is not None:
_mark_scaffold_range(scaffold_positions, traj_pos, len(traj_pat))
boundary_scaffold_to_section.setdefault("trajectory", []).extend(
list(range(traj_pos, traj_pos + len(traj_pat)))
)
sections["trajectory"] = (traj_pos + len(traj_pat), len(resp_tokens))
# print(f"sections: {sections}")
# ββ Step 2: Deep scaffold within critical_objects ββ
if "critical_objects" in sections:
co_start, co_end = sections["critical_objects"]
co_tokens = resp_tokens[co_start:co_end]
co_search = 0
for entry in deep_scaffold_sequences["co_subkeys"]:
pattern = entry["pattern"]
pos = _find_subseq(co_tokens, pattern, co_search)
if pos < 0:
continue
_mark_scaffold_range(scaffold_positions, co_start + pos, len(pattern))
# The single value token is right after the pattern β skip it
co_search = pos + len(pattern) + 1
# Mark closing '"}' or "}," as scaffold
co_close = deep_scaffold_sequences["co_closing"]
close_pos = _find_subseq(co_tokens, co_close,
max(0, len(co_tokens) - len(co_close) - 2))
if close_pos >= 0:
_mark_scaffold_range(scaffold_positions, co_start + close_pos, len(co_close))
else:
# json.dumps may produce "}," as a single token
co_close_comma = deep_scaffold_sequences.get("co_closing_comma")
if co_close_comma:
close_pos = _find_subseq(co_tokens, co_close_comma,
max(0, len(co_tokens) - len(co_close_comma) - 2))
if close_pos >= 0:
_mark_scaffold_range(scaffold_positions, co_start + close_pos, len(co_close_comma))
# ββ Step 2b: Explanation opening/closing quotes as scaffold ββ
# Explanation content is all VALUE, but the surrounding quotes must be
# SCAFFOLD so that VALUE tokens are exactly block-aligned (multiple of bd_size).
if "explanation" in sections:
exp_start, exp_end = sections["explanation"]
if exp_start < exp_end:
# Opening quote: first token of explanation section (e.g. ' "')
scaffold_positions.append(exp_start)
# Closing quote+comma: last token (e.g. '",')
scaffold_positions.append(exp_start + (exp_end - exp_start) - 1)
# ββ Step 3: Deep scaffold within future_meta_behavior ββ
# After dataloader processing, FMB has no <|mdm_start|>/<|mdm_end|> markers.
# Format: ' {"longitudinal": "keep speed", "lateral": "go straight"}'
# Strategy: use fmb_prefix to find start, fmb_between to split long/lat values,
# and fmb_closing to find end. Everything except value content is scaffold.
if "future_meta_behavior" in sections:
fmb_start, fmb_end = sections["future_meta_behavior"]
fmb_tokens = resp_tokens[fmb_start:fmb_end]
fmb_scaffold_positions = set()
# 1. Mark fmb_prefix as scaffold: ' {"longitudinal": "'
fmb_prefix = deep_scaffold_sequences["fmb_prefix"]
prefix_pos = _find_subseq(fmb_tokens, fmb_prefix, 0)
if prefix_pos >= 0:
for i in range(prefix_pos, prefix_pos + len(fmb_prefix)):
fmb_scaffold_positions.add(i)
long_value_start = prefix_pos + len(fmb_prefix)
# 2. Mark fmb_between as scaffold: '", "lateral": "'
fmb_between = deep_scaffold_sequences.get("fmb_between")
if fmb_between:
between_pos = _find_subseq(fmb_tokens, fmb_between, long_value_start)
if between_pos >= 0:
for i in range(between_pos, between_pos + len(fmb_between)):
fmb_scaffold_positions.add(i)
lat_value_start = between_pos + len(fmb_between)
# 3. Mark closing '"}' or "}," as scaffold
fmb_close = deep_scaffold_sequences["fmb_closing"]
close_pos = _find_subseq(fmb_tokens, fmb_close,
max(0, len(fmb_tokens) - len(fmb_close) - 2))
if close_pos < 0:
fmb_close_comma = deep_scaffold_sequences.get("fmb_closing_comma")
if fmb_close_comma:
close_pos = _find_subseq(fmb_tokens, fmb_close_comma,
max(0, len(fmb_tokens) - len(fmb_close_comma) - 2))
if close_pos >= 0:
fmb_close = fmb_close_comma
if close_pos >= 0:
for i in range(close_pos, close_pos + len(fmb_close)):
fmb_scaffold_positions.add(i)
for i in fmb_scaffold_positions:
scaffold_positions.append(fmb_start + i)
# ββ Step 4: Deep scaffold within trajectory ββ
# After dataloader processing (no mdm markers), trajectory is:
# ' "[[+14.70,-00.04], [+29.55,-00.21], ...]"'
if "trajectory" in sections:
traj_start, traj_end = sections["trajectory"]
traj_tokens = resp_tokens[traj_start:traj_end]
# Opening "[[
traj_open = deep_scaffold_sequences["traj_open"]
open_pos = _find_subseq(traj_tokens, traj_open, 0)
if open_pos >= 0:
_mark_scaffold_range(scaffold_positions, traj_start + open_pos, len(traj_open))
# Waypoint separators ], (4 of them between 5 waypoints)
traj_wp_sep = deep_scaffold_sequences["traj_wp_sep"]
sep_search = 0
for _ in range(4):
sep_pos = _find_subseq(traj_tokens, traj_wp_sep, sep_search)
if sep_pos < 0:
break
_mark_scaffold_range(scaffold_positions, traj_start + sep_pos, len(traj_wp_sep))
sep_search = sep_pos + len(traj_wp_sep)
# Intermediate waypoint opening ' [' (4 of them, between 5 waypoints)
traj_wp_open = deep_scaffold_sequences.get("traj_wp_open")
if traj_wp_open:
wo_search = 0
for _ in range(4):
wo_pos = _find_subseq(traj_tokens, traj_wp_open, wo_search)
if wo_pos < 0:
break
_mark_scaffold_range(scaffold_positions, traj_start + wo_pos, len(traj_wp_open))
wo_search = wo_pos + len(traj_wp_open)
# Coordinate comma ',' between x and y within each waypoint (5 of them)
traj_coord_comma = deep_scaffold_sequences.get("traj_coord_comma")
if traj_coord_comma:
cc_search = 0
for _ in range(5):
cc_pos = _find_subseq(traj_tokens, traj_coord_comma, cc_search)
if cc_pos < 0:
break
_mark_scaffold_range(scaffold_positions, traj_start + cc_pos, len(traj_coord_comma))
cc_search = cc_pos + len(traj_coord_comma)
# Closing ]]" or just ]]
traj_close = deep_scaffold_sequences["traj_close"]
close_pos = _find_subseq(traj_tokens, traj_close,
max(0, len(traj_tokens) - len(traj_close) - 6))
if close_pos < 0:
for split_key in ["traj_close_split", "traj_close_split2"]:
tcs = deep_scaffold_sequences.get(split_key)
if tcs:
close_pos = _find_subseq(traj_tokens, tcs,
max(0, len(traj_tokens) - len(tcs) - 6))
if close_pos >= 0:
traj_close = tcs
break
if close_pos >= 0:
_mark_scaffold_range(scaffold_positions, traj_start + close_pos, len(traj_close))
# Align training with inference scaffold: exclude trailing tokens
# after the JSON closing of trajectory (e.g. "<|im_end|>\n") from
# section/block scheduling.
effective_resp_end = min(
effective_resp_end,
resp_start + traj_start + close_pos + len(traj_close),
)
# Opening quote " (first token of traj value)
if len(traj_tokens) > 0:
scaffold_positions.append(traj_start)
# ββ Mark scaffold mask (absolute positions) ββ
scaffold_positions_set = set(scaffold_positions)
for sp in scaffold_positions_set:
abs_pos = resp_start + sp
if abs_pos < seq_len:
scaffold_mask[abs_pos] = True
# ββ Assign block indices per section ββ
current_block = 0
assigned = set()
block_to_section = {} # block_idx -> section_name (for SASD compatibility)
section_first_block: Dict[str, int] = {}
for section_name in SECTION_KEYS:
if section_name not in sections:
continue
rel_start, rel_end = sections[section_name]
abs_start = resp_start + rel_start
abs_end = resp_start + rel_end
abs_start = max(abs_start, resp_start)
abs_end = min(abs_end, effective_resp_end)
num_tokens = abs_end - abs_start
if num_tokens <= 0:
continue
# Count only non-scaffold tokens for block sizing
value_positions = [p for p in range(abs_start, abs_end)
if response_mask[p] and (p - resp_start) not in scaffold_positions_set]
num_value_tokens = len(value_positions)
if num_value_tokens <= 0:
section_first_block[section_name] = current_block
block_to_section[current_block] = section_name
current_block += 1
continue
# Use fixed block size (bd_size) and compute number of blocks dynamically
tokens_per_step = fallback_block_size
n_steps = max(1, math.ceil(num_value_tokens / tokens_per_step))
for b in range(n_steps):
block_to_section[current_block + b] = section_name
section_first_block[section_name] = current_block
for vi, pos in enumerate(value_positions):
block_in_section = min(vi // tokens_per_step, n_steps - 1)
response_block_idx[pos] = current_block + block_in_section
assigned.add(pos)
current_block += n_steps
# Assign scaffold tokens within each section to the nearest value token
# in the SAME section. This keeps section-closing tokens such as `"},`
# with their section instead of drifting to the next section.
for section_name in SECTION_KEYS:
if section_name not in sections:
continue
rel_start, rel_end = sections[section_name]
abs_start = max(resp_start + rel_start, resp_start)
abs_end = min(resp_start + rel_end, resp_end)
if abs_end <= abs_start:
continue
for abs_pos in range(abs_start, abs_end):
rel_pos = abs_pos - resp_start
if (
abs_pos >= seq_len
or not response_mask[abs_pos]
or abs_pos in assigned
or rel_pos not in scaffold_positions_set
):
continue
best_block = -1
max_delta = max(1, abs_end - abs_start)
for delta in range(1, max_delta + 1):
# Prefer left first so closing punctuation tends to stay with
# the preceding content in the same section.
for cand in [abs_pos - delta, abs_pos + delta]:
if abs_start <= cand < abs_end and cand in assigned:
best_block = response_block_idx[cand].item()
break
if best_block >= 0:
break
if best_block < 0:
best_block = section_first_block.get(section_name, -1)
if best_block >= 0:
response_block_idx[abs_pos] = best_block
assigned.add(abs_pos)
# Top-level boundary tokens are explicitly attached to the following
# section's first block, instead of nearest-neighbor assignment.
for section_name, rel_positions in boundary_scaffold_to_section.items():
first_block = section_first_block.get(section_name)
if first_block is None:
continue
for rel_pos in rel_positions:
abs_pos = resp_start + rel_pos
if abs_pos >= seq_len or not response_mask[abs_pos]:
continue
response_block_idx[abs_pos] = first_block
assigned.add(abs_pos)
# Scaffold tokens β block index of nearest assigned neighbour
for sp in scaffold_positions_set:
abs_pos = resp_start + sp
if (
abs_pos >= seq_len
or abs_pos >= effective_resp_end
or not response_mask[abs_pos]
or abs_pos in assigned
):
continue
best_block = -1
for delta in range(1, seq_len):
for cand in [abs_pos + delta, abs_pos - delta]:
if 0 <= cand < seq_len and cand in assigned:
best_block = response_block_idx[cand].item()
break
if best_block >= 0:
break
if best_block >= 0:
response_block_idx[abs_pos] = best_block
assigned.add(abs_pos)
# Fallback for unassigned response tokens
for pos in range(resp_start, effective_resp_end):
if response_mask[pos] and pos not in assigned:
offset = pos - resp_start
response_block_idx[pos] = current_block + offset // fallback_block_size
assigned.add(pos)
fallback_positions = [p for p in range(resp_start, effective_resp_end)
if response_mask[p] and response_block_idx[p].item() >= current_block]
if fallback_positions:
current_block = max(response_block_idx[p].item() for p in fallback_positions) + 1
n_blocks = current_block
# Turn index
for i in range(1, seq_len):
if response_block_idx[i] != response_block_idx[i - 1]:
turn_idx[i] = turn_idx[i - 1] + 1
else:
turn_idx[i] = turn_idx[i - 1]
return response_block_idx, turn_idx, n_blocks, scaffold_mask, block_to_section
|