File size: 4,553 Bytes

b4b2877

#!/usr/bin/env python3
"""
Rebuild the frozen taxonomy JSON from the current annotations_v3/ state.

Run this *once* after annotation is complete to lock the 28+ noun list. Later
experiments load the frozen list via taxonomy.py, so class indices don't
drift if more annotations are ever added.

Usage:
    python3 experiments/build_taxonomy.py
    python3 experiments/build_taxonomy.py --threshold 50 --out experiments/taxonomy_v3.json
"""

import argparse
import glob
import json
import os
from collections import Counter
from pathlib import Path

# Two levels above this file's resolved path, i.e. the parent of the directory
# that contains this script. Used for the default input/output locations and
# to make the ``experiments`` package importable.
REPO = Path(__file__).resolve().parents[1]


def _iter_labelled_segments(paths):
    """Yield ``(verb, noun, hand)`` for each fully-labelled segment in *paths*.

    A file that cannot be read or parsed is reported with a ``WARN`` line on
    stdout and skipped. Segments missing (or having an empty value for) any of
    ``action_name``, ``object_name`` or ``hand_type`` are skipped as well.
    """
    for path in paths:
        try:
            # JSON is UTF-8 by specification; do not depend on the locale.
            with open(path, encoding="utf-8") as f:
                doc = json.load(f)
        except (OSError, ValueError) as e:
            # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors.
            print(f"  WARN: could not parse {path}: {e}")
            continue
        # ``or`` also covers an explicit JSON null for either field.
        for seg in doc.get("segments") or []:
            ann = seg.get("action_annotation") or {}
            verb = ann.get("action_name")
            noun = ann.get("object_name")
            hand = ann.get("hand_type")
            if verb and noun and hand:
                yield verb, noun, hand


def main():
    """Count annotation labels and write the frozen taxonomy JSON.

    Command-line options:
        --annotations_dir  directory searched for ``v*/s*.json`` files
        --threshold        minimum canonical-noun count for a noun to be kept
        --out              path of the JSON file to write

    Every annotation file is read exactly once. Unreadable files are warned
    about and skipped; they still count towards ``annotation_file_count``.

    Raises:
        SystemExit: if no file matches ``v*/s*.json`` under ``--annotations_dir``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument(
        "--annotations_dir",
        default=str(REPO / "annotations_v3"),
        help="Directory containing v*/s*.json annotation files",
    )
    ap.add_argument("--threshold", type=int, default=50,
                    help="Minimum noun frequency to keep (Strategy A drops the rest)")
    ap.add_argument(
        "--out",
        default=str(REPO / "experiments" / "taxonomy_v3.json"),
        help="Output frozen taxonomy JSON",
    )
    args = ap.parse_args()

    # Late import so building the list doesn't depend on the frozen file
    # being present yet.
    import sys
    sys.path.insert(0, str(REPO))
    from experiments.taxonomy import (
        VERB_FINE, VERB_COMPOSITE, HAND, canonical_noun,
    )

    paths = sorted(glob.glob(os.path.join(args.annotations_dir, "v*", "s*.json")))
    if not paths:
        raise SystemExit(f"No json files under {args.annotations_dir}")

    verbs, nouns, hands = Counter(), Counter(), Counter()
    total = 0
    dropped_unknown_verb = 0
    dropped_unknown_hand = 0
    for v, n, h in _iter_labelled_segments(paths):
        total += 1
        # The verb check comes first, so a segment with both an unknown verb
        # and an unknown hand is counted as an unknown-verb drop only.
        if v not in VERB_FINE:
            dropped_unknown_verb += 1
            continue
        if h not in HAND:
            dropped_unknown_hand += 1
            continue
        verbs[v] += 1
        nouns[canonical_noun(n)] += 1
        hands[h] += 1

    # Most frequent noun first; equal counts are ordered alphabetically so the
    # resulting class indices are deterministic for a given set of counts.
    kept = sorted(
        (n for n, c in nouns.items() if c >= args.threshold),
        key=lambda n: (-nouns[n], n),
    )

    # ``nouns`` only counts segments that passed the verb and hand filters, so
    # the segments surviving the noun threshold are exactly the kept counts.
    # No second scan of the annotation files is needed.
    surviving_segs = sum(nouns[n] for n in kept)

    out = {
        "threshold":             args.threshold,
        "annotation_file_count": len(paths),
        "total_segments":        total,
        "dropped_unknown_verb":  dropped_unknown_verb,
        "dropped_unknown_hand":  dropped_unknown_hand,
        "surviving_segments":    surviving_segs,
        "verbs":                 VERB_FINE,
        "verb_composite":        VERB_COMPOSITE,
        "hand":                  HAND,
        "nouns":                 kept,
        "noun_counts":           {n: nouns[n] for n in kept},
        "verb_counts":           dict(verbs),
        "hand_counts":           dict(hands),
    }
    Path(args.out).parent.mkdir(parents=True, exist_ok=True)
    # ensure_ascii=False emits non-ASCII labels verbatim, so the file must be
    # written as UTF-8 regardless of the platform default.
    with open(args.out, "w", encoding="utf-8") as f:
        json.dump(out, f, ensure_ascii=False, indent=2)

    print(f"Scanned {len(paths)} files, {total} segments")
    print(f"Dropped (unknown verb / hand): {dropped_unknown_verb} / "
          f"{dropped_unknown_hand}")
    print(f"Kept {len(kept)} nouns (>= {args.threshold}):")
    for n in kept:
        print(f"  {n}: {nouns[n]}")
    print(f"Surviving segments (Strategy A): "
          f"{surviving_segs} / {total}  "
          f"({100 * surviving_segs / max(1, total):.1f}%)")
    print(f"Wrote {args.out}")


# Run the build only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()