PULSE-code / experiments /analysis /build_taxonomy.py
velvet-pine-22's picture
Upload folder using huggingface_hub
b4b2877 verified
#!/usr/bin/env python3
"""
Rebuild the frozen taxonomy JSON from the current annotations_v3/ state.

Run this *once* after annotation is complete to lock the 28+ noun list. Later
experiments load the frozen list via taxonomy.py, so class indices don't
drift if more annotations are ever added.

Usage:
    python3 experiments/build_taxonomy.py
    python3 experiments/build_taxonomy.py --threshold 50 --out experiments/taxonomy_v3.json
"""
import argparse
import glob
import json
import os
from collections import Counter
from pathlib import Path
# Base directory for the default --annotations_dir and --out paths: the parent
# of the directory that contains this script. main() also puts it on sys.path
# before importing experiments.taxonomy.
REPO = Path(__file__).resolve().parents[1]
def main():
    """Scan the annotation files and write the frozen taxonomy JSON.

    Command-line options:
        --annotations_dir: directory searched for ``v*/s*.json`` files.
        --threshold: minimum frequency a canonical noun needs to be kept.
        --out: path of the JSON file to write (parent dirs are created).

    A segment is counted in ``total_segments`` when its ``action_name``,
    ``object_name`` and ``hand_type`` are all non-empty. Segments whose verb
    is not in ``VERB_FINE`` or whose hand is not in ``HAND`` are tallied as
    dropped; the remaining ones feed the verb / noun / hand counters. Nouns
    are counted after being passed through ``canonical_noun``.

    Files that cannot be read or parsed are reported with a warning and
    skipped; they still count towards ``annotation_file_count``.

    Raises:
        SystemExit: if no annotation file is found under ``--annotations_dir``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument(
        "--annotations_dir",
        default=str(REPO / "annotations_v3"),
        help="Directory containing v*/s*.json annotation files",
    )
    ap.add_argument("--threshold", type=int, default=50,
                    help="Minimum noun frequency to keep (Strategy A drops the rest)")
    ap.add_argument(
        "--out",
        default=str(REPO / "experiments" / "taxonomy_v3.json"),
        help="Output frozen taxonomy JSON",
    )
    args = ap.parse_args()

    # Late import so building the list doesn't depend on the frozen file
    # being present yet.
    import sys
    sys.path.insert(0, str(REPO))
    from experiments.taxonomy import (
        VERB_FINE, VERB_COMPOSITE, HAND, canonical_noun,
    )

    paths = sorted(glob.glob(os.path.join(args.annotations_dir, "v*", "s*.json")))
    if not paths:
        raise SystemExit(f"No json files under {args.annotations_dir}")

    verbs, nouns, hands = Counter(), Counter(), Counter()
    total = 0
    dropped_unknown_verb = 0
    dropped_unknown_hand = 0
    for p in paths:
        try:
            # JSON is UTF-8 by specification; don't depend on the locale.
            with open(p, encoding="utf-8") as f:
                d = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            print(f" WARN: could not parse {p}: {e}")
            continue
        if not isinstance(d, dict):
            print(f" WARN: could not parse {p}: top-level JSON value is not an object")
            continue
        # `or` guards against explicit nulls, which .get() defaults don't cover.
        for s in d.get("segments") or []:
            a = s.get("action_annotation") or {}
            v = a.get("action_name")
            n = a.get("object_name")
            h = a.get("hand_type")
            if not (v and n and h):
                continue
            total += 1
            if v not in VERB_FINE:
                dropped_unknown_verb += 1
                continue
            if h not in HAND:
                dropped_unknown_hand += 1
                continue
            verbs[v] += 1
            nouns[canonical_noun(n)] += 1
            hands[h] += 1

    # Most frequent noun first; ties are broken alphabetically so that re-runs
    # with two equally frequent classes don't flip their indices.
    kept = sorted(
        (n for n, c in nouns.items() if c >= args.threshold),
        key=lambda n: (-nouns[n], n),
    )

    # `nouns` only counts segments that already passed the completeness, verb
    # and hand filters, so the segments surviving the noun cut are exactly the
    # summed counts of the kept nouns. No second pass over the files is needed
    # (and a file that failed to parse above cannot crash the run later).
    surviving_segs = sum(nouns[n] for n in kept)

    out = {
        "threshold": args.threshold,
        "annotation_file_count": len(paths),
        "total_segments": total,
        "dropped_unknown_verb": dropped_unknown_verb,
        "dropped_unknown_hand": dropped_unknown_hand,
        "surviving_segments": surviving_segs,
        "verbs": VERB_FINE,
        "verb_composite": VERB_COMPOSITE,
        "hand": HAND,
        "nouns": kept,
        "noun_counts": {n: nouns[n] for n in kept},
        "verb_counts": dict(verbs),
        "hand_counts": dict(hands),
    }
    Path(args.out).parent.mkdir(parents=True, exist_ok=True)
    # ensure_ascii=False emits raw non-ASCII characters, so pin the encoding.
    with open(args.out, "w", encoding="utf-8") as f:
        json.dump(out, f, ensure_ascii=False, indent=2)

    print(f"Scanned {len(paths)} files, {total} segments")
    print(f"Dropped (unknown verb / hand): {dropped_unknown_verb} / "
          f"{dropped_unknown_hand}")
    print(f"Kept {len(kept)} nouns (>= {args.threshold}):")
    for n in kept:
        print(f" {n}: {nouns[n]}")
    print(f"Surviving segments (Strategy A): "
          f"{surviving_segs} / {total} "
          f"({100 * surviving_segs / max(1, total):.1f}%)")
    print(f"Wrote {args.out}")
# Run the taxonomy build only when executed as a script, not when imported.
if __name__ == "__main__":
    main()