| |
| """ |
| Rebuild the frozen taxonomy JSON from the current annotations_v3/ state. |
| |
| Run this *once* after annotation is complete to lock the 28+ noun list. Later |
| experiments load the frozen list via taxonomy.py, so class indices don't |
| drift if more annotations are ever added. |
| |
| Usage: |
| python3 experiments/build_taxonomy.py |
| python3 experiments/build_taxonomy.py --threshold 50 --out experiments/taxonomy_v3.json |
| """ |
|
|
| import argparse |
| import glob |
| import json |
| import os |
| from collections import Counter |
| from pathlib import Path |
|
|
# Repository root: two directory levels above this script's resolved location.
# Used below as the base for the default --annotations_dir / --out paths and
# as the sys.path entry for importing the `experiments` package.
_THIS_FILE = Path(__file__).resolve()
REPO = _THIS_FILE.parents[1]
|
|
|
|
def _tally_annotations(paths, known_verbs, known_hands, canonical_noun):
    """Count verb, noun and hand labels across the given annotation files.

    A segment is considered only when its ``action_annotation`` carries a
    truthy ``action_name``, ``object_name`` and ``hand_type``. Segments whose
    verb or hand is not in the known sets are tallied as dropped and excluded
    from the counters. Files that cannot be read or parsed are reported with
    a warning and skipped.

    Args:
        paths: Annotation JSON file paths to scan.
        known_verbs: Container of accepted ``action_name`` values.
        known_hands: Container of accepted ``hand_type`` values.
        canonical_noun: Callable applied to each ``object_name`` before it is
            counted.

    Returns:
        Tuple ``(verbs, nouns, hands, total, dropped_unknown_verb,
        dropped_unknown_hand)`` where the first three are ``Counter`` objects
        and the rest are ints. ``total`` includes the dropped segments.
    """
    verbs, nouns, hands = Counter(), Counter(), Counter()
    total = 0
    dropped_unknown_verb = 0
    dropped_unknown_hand = 0
    for path in paths:
        try:
            # JSON is read as UTF-8 explicitly so non-ASCII labels do not
            # depend on the platform's locale encoding.
            with open(path, encoding="utf-8") as f:
                doc = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            print(f" WARN: could not parse {path}: {e}")
            continue
        for seg in doc.get("segments", []):
            ann = seg.get("action_annotation", {})
            verb = ann.get("action_name")
            noun = ann.get("object_name")
            hand = ann.get("hand_type")
            if not (verb and noun and hand):
                continue
            total += 1
            if verb not in known_verbs:
                dropped_unknown_verb += 1
                continue
            if hand not in known_hands:
                dropped_unknown_hand += 1
                continue
            verbs[verb] += 1
            nouns[canonical_noun(noun)] += 1
            hands[hand] += 1
    return verbs, nouns, hands, total, dropped_unknown_verb, dropped_unknown_hand


def main():
    """Scan the annotation files and write the frozen taxonomy JSON.

    Command-line options:
        --annotations_dir: directory searched for ``v*/s*.json`` files.
        --threshold: minimum noun frequency for a noun to be kept.
        --out: path of the taxonomy JSON to write.

    Raises:
        SystemExit: if no annotation files match under ``--annotations_dir``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument(
        "--annotations_dir",
        default=str(REPO / "annotations_v3"),
        help="Directory containing v*/s*.json annotation files",
    )
    ap.add_argument("--threshold", type=int, default=50,
                    help="Minimum noun frequency to keep (Strategy A drops the rest)")
    ap.add_argument(
        "--out",
        default=str(REPO / "experiments" / "taxonomy_v3.json"),
        help="Output frozen taxonomy JSON",
    )
    args = ap.parse_args()

    # The repo root is put on sys.path before the import so that the
    # `experiments` package resolves from REPO.
    import sys
    sys.path.insert(0, str(REPO))
    from experiments.taxonomy import (
        VERB_FINE, VERB_COMPOSITE, HAND, canonical_noun,
    )

    paths = sorted(glob.glob(os.path.join(args.annotations_dir, "v*", "s*.json")))
    if not paths:
        raise SystemExit(f"No json files under {args.annotations_dir}")

    (verbs, nouns, hands, total,
     dropped_unknown_verb, dropped_unknown_hand) = _tally_annotations(
        paths, VERB_FINE, HAND, canonical_noun)

    # Keep nouns at or above the threshold, ordered by descending count with
    # the noun itself as tie-breaker, so the resulting index order is
    # deterministic.
    kept = sorted(
        (n for n, c in nouns.items() if c >= args.threshold),
        key=lambda n: (-nouns[n], n),
    )

    # `nouns` only counts segments that already passed the verb and hand
    # filters, so the segments surviving the noun threshold are exactly the
    # summed counts of the kept nouns. No second pass over the files is
    # needed, and a file that failed to parse above cannot crash the run here.
    surviving_segs = sum(nouns[n] for n in kept)

    out = {
        "threshold": args.threshold,
        "annotation_file_count": len(paths),
        "total_segments": total,
        "dropped_unknown_verb": dropped_unknown_verb,
        "dropped_unknown_hand": dropped_unknown_hand,
        "surviving_segments": surviving_segs,
        "verbs": VERB_FINE,
        "verb_composite": VERB_COMPOSITE,
        "hand": HAND,
        "nouns": kept,
        "noun_counts": {n: nouns[n] for n in kept},
        "verb_counts": dict(verbs),
        "hand_counts": dict(hands),
    }
    Path(args.out).parent.mkdir(parents=True, exist_ok=True)
    # ensure_ascii=False emits non-ASCII labels verbatim, so the file must be
    # opened as UTF-8 rather than with the locale default.
    with open(args.out, "w", encoding="utf-8") as f:
        json.dump(out, f, ensure_ascii=False, indent=2)

    print(f"Scanned {len(paths)} files, {total} segments")
    print(f"Dropped (unknown verb / hand): {dropped_unknown_verb} / "
          f"{dropped_unknown_hand}")
    print(f"Kept {len(kept)} nouns (>= {args.threshold}):")
    for n in kept:
        print(f" {n}: {nouns[n]}")
    print(f"Surviving segments (Strategy A): "
          f"{surviving_segs} / {total} "
          f"({100 * surviving_segs / max(1, total):.1f}%)")
    print(f"Wrote {args.out}")
|
|
|
|
# Script entry point: build the taxonomy only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()
|
|