#!/usr/bin/env python3
"""
Rebuild the frozen taxonomy JSON from the current annotations_v3/ state.
Run this *once* after annotation is complete to lock the 28+ noun list. Later
experiments load the frozen list via taxonomy.py, so class indices don't
drift if more annotations are ever added.
Usage:
    python3 experiments/build_taxonomy.py
    python3 experiments/build_taxonomy.py --threshold 50 --out experiments/taxonomy_v3.json
"""
import argparse
import glob
import json
import os
from collections import Counter
from pathlib import Path
# Parent of the directory that contains this script (the repository root when
# the script lives in experiments/, as in the usage examples). It anchors the
# default --annotations_dir and --out paths and is put on sys.path in main().
REPO = Path(__file__).resolve().parents[1]
def main():
    """Build the frozen taxonomy JSON from the annotation files.

    Scans every ``v*/s*.json`` file under ``--annotations_dir`` and, for each
    segment that carries an action name, object name and hand type, counts
    the verb, the canonical noun and the hand. Segments whose verb is not in
    ``VERB_FINE`` or whose hand is not in ``HAND`` are tallied as dropped.
    Nouns seen at least ``--threshold`` times are kept, and the resulting
    lists and counts are written to ``--out`` as JSON. A summary is printed
    to stdout.

    Files that cannot be read or parsed are reported with a warning and
    skipped; they still count towards ``annotation_file_count``.

    Raises:
        SystemExit: if no annotation file is found under ``--annotations_dir``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument(
        "--annotations_dir",
        default=str(REPO / "annotations_v3"),
        help="Directory containing v*/s*.json annotation files",
    )
    ap.add_argument("--threshold", type=int, default=50,
                    help="Minimum noun frequency to keep (Strategy A drops the rest)")
    ap.add_argument(
        "--out",
        default=str(REPO / "experiments" / "taxonomy_v3.json"),
        help="Output frozen taxonomy JSON",
    )
    args = ap.parse_args()

    # Late import so building the list doesn't depend on the frozen file
    # being present yet.
    import sys
    sys.path.insert(0, str(REPO))
    from experiments.taxonomy import (
        VERB_FINE, VERB_COMPOSITE, HAND, canonical_noun,
    )

    paths = sorted(glob.glob(os.path.join(args.annotations_dir, "v*", "s*.json")))
    if not paths:
        raise SystemExit(f"No json files under {args.annotations_dir}")

    verbs, nouns, hands = Counter(), Counter(), Counter()
    total = 0
    dropped_unknown_verb = 0
    dropped_unknown_hand = 0
    for p in paths:
        try:
            with open(p, encoding="utf-8") as f:
                d = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers both json.JSONDecodeError and decode errors.
            print(f" WARN: could not parse {p}: {e}")
            continue
        if not isinstance(d, dict):
            print(f" WARN: could not parse {p}: top-level JSON value is not an object")
            continue
        # `or` guards against explicit nulls, which .get(key, default) lets through.
        for s in d.get("segments") or []:
            a = s.get("action_annotation") or {}
            v = a.get("action_name")
            n = a.get("object_name")
            h = a.get("hand_type")
            if not (v and n and h):
                continue
            total += 1
            if v not in VERB_FINE:
                dropped_unknown_verb += 1
                continue
            if h not in HAND:
                dropped_unknown_hand += 1
                continue
            verbs[v] += 1
            nouns[canonical_noun(n)] += 1
            hands[h] += 1

    # Most frequent first, alphabetical among equal counts, so the class
    # indices are deterministic across re-runs on the same annotations.
    kept = sorted(
        (n for n, c in nouns.items() if c >= args.threshold),
        key=lambda n: (-nouns[n], n),
    )

    # `nouns` only counts segments that already passed the verb and hand
    # filters, so the segments surviving the noun cut are exactly the kept
    # nouns' counts; no second scan of the files is needed.
    surviving_segs = sum(nouns[n] for n in kept)

    out = {
        "threshold": args.threshold,
        "annotation_file_count": len(paths),
        "total_segments": total,
        "dropped_unknown_verb": dropped_unknown_verb,
        "dropped_unknown_hand": dropped_unknown_hand,
        "surviving_segments": surviving_segs,
        "verbs": VERB_FINE,
        "verb_composite": VERB_COMPOSITE,
        "hand": HAND,
        "nouns": kept,
        "noun_counts": {n: nouns[n] for n in kept},
        "verb_counts": dict(verbs),
        "hand_counts": dict(hands),
    }
    Path(args.out).parent.mkdir(parents=True, exist_ok=True)
    # ensure_ascii=False emits non-ASCII characters verbatim, so the file
    # encoding must be pinned rather than left to the locale default.
    with open(args.out, "w", encoding="utf-8") as f:
        json.dump(out, f, ensure_ascii=False, indent=2)

    print(f"Scanned {len(paths)} files, {total} segments")
    print(f"Dropped (unknown verb / hand): {dropped_unknown_verb} / "
          f"{dropped_unknown_hand}")
    print(f"Kept {len(kept)} nouns (>= {args.threshold}):")
    for n in kept:
        print(f" {n}: {nouns[n]}")
    print(f"Surviving segments (Strategy A): "
          f"{surviving_segs} / {total} "
          f"({100 * surviving_segs / max(1, total):.1f}%)")
    print(f"Wrote {args.out}")
# Run the build only when executed as a script, not when imported.
if __name__ == "__main__":
    main()