File size: 4,553 Bytes
b4b2877
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
#!/usr/bin/env python3
"""
Rebuild the frozen taxonomy JSON from the current annotations_v3/ state.

Run this *once* after annotation is complete to lock the 28+ noun list. Later
experiments load the frozen list via taxonomy.py, so class indices don't
drift if more annotations are ever added.

Usage:
    python3 experiments/build_taxonomy.py
    python3 experiments/build_taxonomy.py --threshold 50 --out experiments/taxonomy_v3.json
"""

import argparse
import glob
import json
import os
from collections import Counter
from pathlib import Path

# Directory one level above the folder that contains this script, computed
# from the resolved (absolute, symlink-free) path of this file. Used below as
# the base for the default input/output locations and added to sys.path.
REPO = Path(__file__).resolve().parents[1]


def _iter_segment_labels(paths):
    """Yield ``(verb, noun, hand)`` for every fully-labelled segment in *paths*.

    Files that cannot be read or decoded, or whose top-level JSON value is not
    an object, are reported with a ``WARN`` line on stdout and skipped.
    Segments missing any of the three labels (or whose annotation is absent or
    malformed) are skipped silently.
    """
    for path in paths:
        try:
            # JSON is UTF-8 by specification; do not rely on the locale default.
            with open(path, encoding="utf-8") as f:
                doc = json.load(f)
        except (OSError, ValueError, RecursionError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            print(f"  WARN: could not parse {path}: {e}")
            continue
        if not isinstance(doc, dict):
            print(f"  WARN: could not parse {path}: "
                  f"top-level JSON value is not an object")
            continue
        # ``or []`` tolerates an explicit ``"segments": null``.
        for seg in doc.get("segments") or []:
            ann = seg.get("action_annotation") if isinstance(seg, dict) else None
            if not isinstance(ann, dict):
                continue
            verb = ann.get("action_name")
            noun = ann.get("object_name")
            hand = ann.get("hand_type")
            if verb and noun and hand:
                yield verb, noun, hand


def main():
    """Scan the annotation files and write the frozen taxonomy JSON.

    Command-line options:
        --annotations_dir  directory searched for ``v*/s*.json`` files.
        --threshold        minimum canonical-noun frequency required to keep
                           a noun.
        --out              path of the JSON file to write.

    Every segment carrying a verb, an object name and a hand type is counted
    in ``total_segments``. Segments whose verb is not in ``VERB_FINE`` or whose
    hand is not in ``HAND`` are tallied as dropped and excluded from the
    verb/noun/hand counters. Nouns are canonicalised with ``canonical_noun``
    before counting.

    Raises:
        SystemExit: if no annotation file matches under ``--annotations_dir``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument(
        "--annotations_dir",
        default=str(REPO / "annotations_v3"),
        help="Directory containing v*/s*.json annotation files",
    )
    ap.add_argument("--threshold", type=int, default=50,
                    help="Minimum noun frequency to keep (Strategy A drops the rest)")
    ap.add_argument(
        "--out",
        default=str(REPO / "experiments" / "taxonomy_v3.json"),
        help="Output frozen taxonomy JSON",
    )
    args = ap.parse_args()

    # Imported here rather than at module top: REPO has to be put on sys.path
    # before the ``experiments`` package can be resolved.
    import sys
    sys.path.insert(0, str(REPO))
    from experiments.taxonomy import (
        VERB_FINE, VERB_COMPOSITE, HAND, canonical_noun,
    )

    paths = sorted(glob.glob(os.path.join(args.annotations_dir, "v*", "s*.json")))
    if not paths:
        raise SystemExit(f"No json files under {args.annotations_dir}")

    verbs, nouns, hands = Counter(), Counter(), Counter()
    total = 0
    dropped_unknown_verb = 0
    dropped_unknown_hand = 0
    for v, n, h in _iter_segment_labels(paths):
        total += 1
        if v not in VERB_FINE:
            dropped_unknown_verb += 1
            continue
        if h not in HAND:
            dropped_unknown_hand += 1
            continue
        verbs[v] += 1
        nouns[canonical_noun(n)] += 1
        hands[h] += 1

    # Deterministic class order: most frequent noun first, ties broken
    # alphabetically, so equal-count nouns never swap indices between runs.
    kept = sorted(
        (n for n, c in nouns.items() if c >= args.threshold),
        key=lambda n: (-nouns[n], n),
    )

    # ``nouns`` only counts segments that already passed the verb and hand
    # filters, so the segments surviving the noun threshold are exactly the
    # occurrences of the kept nouns; no second scan of the files is needed.
    surviving_segs = sum(nouns[n] for n in kept)

    out = {
        "threshold":             args.threshold,
        "annotation_file_count": len(paths),
        "total_segments":        total,
        "dropped_unknown_verb":  dropped_unknown_verb,
        "dropped_unknown_hand":  dropped_unknown_hand,
        "surviving_segments":    surviving_segs,
        "verbs":                 VERB_FINE,
        "verb_composite":        VERB_COMPOSITE,
        "hand":                  HAND,
        "nouns":                 kept,
        "noun_counts":           {n: nouns[n] for n in kept},
        "verb_counts":           dict(verbs),
        "hand_counts":           dict(hands),
    }
    Path(args.out).parent.mkdir(parents=True, exist_ok=True)
    # ensure_ascii=False emits raw non-ASCII characters, so the encoding must
    # be pinned instead of inherited from the platform locale.
    with open(args.out, "w", encoding="utf-8") as f:
        json.dump(out, f, ensure_ascii=False, indent=2)

    print(f"Scanned {len(paths)} files, {total} segments")
    print(f"Dropped (unknown verb / hand): {dropped_unknown_verb} / "
          f"{dropped_unknown_hand}")
    print(f"Kept {len(kept)} nouns (>= {args.threshold}):")
    for n in kept:
        print(f"  {n}: {nouns[n]}")
    print(f"Surviving segments (Strategy A): "
          f"{surviving_segs} / {total}  "
          f"({100 * surviving_segs / max(1, total):.1f}%)")
    print(f"Wrote {args.out}")


# Run the taxonomy build only when this file is executed as a script, not
# when it is imported as a module.
if __name__ == "__main__":
    main()