| """Generate dataset statistics figure from the currently-available annotations. |
| |
| Panels (3): |
| (a) Recording duration distribution per scene (boxplot) |
| (b) Segment length distribution (histogram) |
| (c) Top-20 manipulated objects by segment count |
| |
| Note: panel for motor-primitive frequency is deferred until the 18-primitive |
| annotation pipeline (anno.py) is rerun across all recordings. |
| """ |
import json
import math
import os
import re
from collections import Counter
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
|
|
# Input and output locations, both rooted at the PULSE_ROOT environment
# variable.  pathlib never expands shell-style ``${VAR}`` references, so the
# placeholder is expanded explicitly.  os.path.expandvars() leaves an unset
# variable untouched, so without PULSE_ROOT the paths are exactly the literal
# strings written here.
ANNO_DIR = Path(os.path.expandvars("${PULSE_ROOT}/annotations_by_scene"))
OUT = Path(os.path.expandvars("${PULSE_ROOT}/paper/figures/dataset_stats.pdf"))
|
|
| |
# Chinese object name -> English label used on the figure.
# Lookups elsewhere in this script go through ``OBJ_EN.get(name, name)``, so a
# name missing from this table is shown untranslated rather than dropped.
OBJ_EN = {
    "笔记本电脑": "laptop",
    "有线鼠标": "wired mouse",
    "有线键盘": "wired keyboard",
    "马克笔": "marker",
    "胶带": "tape",
    "笔记本电源": "laptop power",
    "折叠伞": "umbrella",
    "剪刀": "scissors",
    "钱包": "wallet",
    "纸": "paper",
    "订书机": "stapler",
    "纸箱": "box",
    "文件": "document",
    "架子": "rack",
    "桌布": "tablecloth",
    "罐子": "jar",
    "调料瓶": "seasoning bottle",
    "密封罐": "sealed jar",
    "厨房纸巾": "kitchen paper",
    "抹布": "cloth",
    "茶包": "tea bag",
    "饭碗": "rice bowl",
    "菜盘": "plate",
    "菜锅": "pot",
    "勺子": "spoon",
    "水杯": "water cup",
    "茶杯": "tea cup",
    "茶壶": "teapot",
    "食物残渣": "food residue",
    "垃圾桶": "trash bin",
    "纸巾": "tissue",
    "餐垫": "placemat",
    "托盘": "tray",
    "清洁喷雾": "spray",
    "食物": "food",
    "电源": "power adapter",
    "移动硬盘": "HDD",
    "鼠标": "mouse",
    "笔记本充电器": "laptop charger",
    "转换插头": "plug adapter",
    "插线板": "power strip",
    "线材收纳包": "cable organizer",
    "衬衫": "shirt",
    "裤子": "pants",
    "牙膏": "toothpaste",
    "牙刷": "toothbrush",
    "牙刷盒": "toothbrush case",
    "剃须刀": "razor",
    "毛巾": "towel",
    "皮鞋": "shoes",
    "鞋袋": "shoe bag",
    "耳机": "headphones",
    "护照套": "passport holder",
    "证件夹": "ID holder",
    "纸巾包": "tissue pack",
    "行李箱": "suitcase",
    "马克杯": "mug",
    "调料罐": "seasoning jar",
    "茶罐": "tea canister",
    "外套": "coat",
    "围巾": "scarf",
    "衣架": "hanger",
}
|
|
|
|
def parse_t(ts: str) -> float:
    """Convert an ``MM:SS`` or ``HH:MM:SS`` timestamp to seconds.

    Args:
        ts: Colon-separated timestamp.  The hour and minute fields must be
            integers; the seconds field may carry a fractional part
            (``"01:02.5"``).  Whitespace around the fields is ignored.

    Returns:
        The total number of seconds, always as a float so the value matches
        the annotated return type.

    Raises:
        ValueError: If ``ts`` does not have two or three fields, a field is
            not numeric, or the seconds field is negative or not finite.
    """
    parts = ts.strip().split(":")
    if len(parts) not in (2, 3):
        raise ValueError(f"expected MM:SS or HH:MM:SS, got {ts!r}")

    *leading, sec_field = parts
    seconds = float(sec_field)
    # float() also accepts "nan"/"inf"; integer parsing never did, so keep
    # rejecting such values instead of letting them poison later arithmetic.
    if not math.isfinite(seconds) or seconds < 0:
        raise ValueError(f"invalid seconds field in {ts!r}")

    # Fold hours (if present) and minutes into a whole number of minutes.
    whole_minutes = 0
    for field in leading:
        whole_minutes = whole_minutes * 60 + int(field)
    return whole_minutes * 60 + seconds
|
|
|
|
# Accumulators filled by the scan below and read by the summary prints and the
# figure panels further down.
durations = {f"S{i}": [] for i in range(1, 9)}  # scene id -> recording lengths in minutes
seg_lengths = []  # one entry per parsed segment, in seconds
objects = Counter()  # object label (English where mapped) -> segment mentions

# Walk ANNO_DIR/v*/s*.json.  The upper-cased file stem ("s3" -> "S3") is the
# scene id; files whose stem is not one of S1..S8 still contribute segments
# and objects but no recording duration.
for v_dir in sorted(ANNO_DIR.glob("v*")):
    for jf in sorted(v_dir.glob("s*.json")):
        scene = jf.stem.upper()
        try:
            # Parse from bytes so json detects the encoding (UTF-8/16/32, with
            # or without BOM) itself instead of depending on the locale's
            # default text encoding; the annotations contain Chinese names.
            data = json.loads(jf.read_bytes())
        except (OSError, ValueError) as err:
            # ValueError covers both JSONDecodeError and UnicodeDecodeError.
            # Stay best-effort, but say which file was dropped.
            print(f"Skipping unreadable annotation file {jf}: {err}")
            continue

        segs = data.get("segments") if isinstance(data, dict) else None
        if not isinstance(segs, list) or not segs:
            continue

        # The recording length is taken as the latest segment end time.
        max_end = 0
        for seg in segs:
            if not isinstance(seg, dict):
                continue
            ts = seg.get("timestamp", "")
            if not isinstance(ts, str) or "-" not in ts:
                continue
            try:
                # Exactly one "-" is expected; anything else fails to unpack.
                start, end = ts.split("-")
                s_sec, e_sec = parse_t(start), parse_t(end)
            except ValueError:
                # Malformed timestamp: ignore this segment entirely.
                continue

            seg_lengths.append(e_sec - s_sec)
            max_end = max(max_end, e_sec)

            seg_objects = seg.get("objects")
            if not isinstance(seg_objects, (list, tuple)):
                continue
            for o in seg_objects:
                # Entries are either {"name": ...} dicts or bare strings.
                nm = o.get("name") if isinstance(o, dict) else o
                if isinstance(nm, str) and nm:
                    objects[OBJ_EN.get(nm, nm)] += 1

        if max_end > 0 and scene in durations:
            durations[scene].append(max_end / 60.0)
|
|
# Console summary of what the scan collected, printed before any plotting.
recordings_per_scene = {name: len(mins) for name, mins in durations.items()}
print(f"Per-scene durations: {recordings_per_scene}")

n_segments = len(seg_lengths)
print(f"Total segments: {n_segments}")

n_unique_objects = len(objects)
print(f"Unique objects: {n_unique_objects}")

# Only the five most frequent labels are echoed here; the figure shows 20.
top_obj = objects.most_common(5)
print(f"Top objects: {top_obj}")
|
|
# Three panels side by side; written as a PDF plus a PNG with the same stem.
fig, axes = plt.subplots(1, 3, figsize=(12, 3.5))

# (a) Recording duration per scene, one box per scene id, outliers hidden.
ax = axes[0]
scene_order = [f"S{i}" for i in range(1, 9)]
data = [durations[s] for s in scene_order]
ax.boxplot(data, tick_labels=scene_order, showfliers=False, patch_artist=True,
           boxprops=dict(facecolor="#b3cde3"))
ax.set_ylabel("Recording duration (min)")
ax.set_title("(a) Recording duration per scene")
ax.grid(axis="y", alpha=0.3)

# (b) Segment length histogram, restricted to segments of at most 10 s.
# The n in the title counts every parsed segment, including longer ones that
# are not drawn.
ax = axes[1]
seg_arr = np.array(seg_lengths)
seg_arr = seg_arr[seg_arr <= 10]
# Unit-wide bins centred on 0..10, i.e. edges from -0.5 to 10.5.  Edges that
# stop at 9.5 would silently drop the 10 s segments kept by the filter above
# and labelled on the x axis below.
ax.hist(seg_arr, bins=np.arange(0, 12) - 0.5, color="#8c96c6", edgecolor="black")
ax.set_xlabel("Segment length (s)")
ax.set_ylabel("Segment count")
ax.set_title(f"(b) Segment length (n={len(seg_lengths)})")
ax.set_xticks(range(0, 11))
ax.grid(axis="y", alpha=0.3)

# (c) The 20 most frequent objects, most frequent at the top of the chart.
ax = axes[2]
top_objects = objects.most_common(20)
if top_objects:
    # zip(*[]) yields nothing to unpack, so only draw bars when there is at
    # least one object; otherwise the panel stays empty but labelled.
    objs, ocounts = zip(*top_objects)
    # barh draws the first entry at the bottom, hence the reversal.
    ax.barh(objs[::-1], ocounts[::-1], color="#74c476")
ax.set_xlabel("Segment count")
ax.set_title("(c) Top-20 manipulated objects")
ax.tick_params(axis="y", labelsize=8)
ax.grid(axis="x", alpha=0.3)

fig.tight_layout()
fig.savefig(OUT, bbox_inches="tight")
# with_suffix swaps only the final extension; a plain string replace would
# also rewrite any ".pdf" that happens to occur in a directory name.
fig.savefig(OUT.with_suffix(".png"), dpi=140, bbox_inches="tight")
print(f"Saved: {OUT}")
|
|