Ashira Pitchayapakayakul committed on
Commit
c0c6fe0
·
1 Parent(s): 836b22a

feat: expand-role-keywords daemon — LLM-expanded SDLC keywords

Browse files

User: 'เธอไป research job description ทุก role มา แล้วดูว่าเค้าต้องหา
ความรู้อะไรบ้าง' (Thai: 'Go research job descriptions for every role and
see what knowledge they look for'). Implemented as a daily 06:00 UTC cron
that for each
of the 17 SDLC roles in role-knowledge-map.json:

1. Reads existing core + adjacent skills
2. Sends them to Cerebras (qwen-3-235b) with the prompt
'You are a senior tech recruiter who reads thousands of job
descriptions. Output 80 keyword phrases this role's JD would
mention.'
3. Cleans + dedups the response
4. Merges into role.expanded list

Discoverer reads three lists per role now (core / adjacent / expanded).
17 roles x 80 expanded keywords = up to 1,360 new search queries the
discoverer will fire next cycle, each landing on a fresh slice of HF
hub that we hadn't searched before.

Falls through Cerebras → Groq → OpenRouter on a per-role basis. Failure
of one role doesn't block the others — bad responses just leave the
'expanded' list empty for that role until the next run.

Combined effect of round-5 + cursor + expand-keywords:
- 30+ new dataset entries in static list (round-5)
- Cursor service stops re-pulling row 0 (stamp-and-move)
- Discoverer auto-finds 1.3K+ new role-specific datasets weekly

bin/expand-role-keywords.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ One-shot keyword expander — uses Cerebras (or fallback) to expand each
4
+ SDLC role's core/adjacent skills into 100+ specific HF dataset search
5
+ keywords. Output is written back to role-knowledge-map.json under a new
6
+ "expanded" key per role.
7
+
8
+ Idempotent — re-running just refreshes "expanded" keywords. Existing
9
+ core/adjacent are untouched.
10
+
11
+ Run from cron weekly (or manually). Discoverer auto-reads the map on
12
+ its next cycle and fires search queries for the expanded list.
13
+
14
+ Usage: python expand-role-keywords.py
15
+ """
16
+ from __future__ import annotations
17
+
18
+ import json
19
+ import os
20
+ import sys
21
+ import time
22
+ import urllib.request
23
+ import urllib.error
24
+ from pathlib import Path
25
+
26
# Location of the role → skills map this script reads and rewrites.
ROLE_MAP_PATH = Path.home() / ".surrogate/agents/role-knowledge-map.json"

# OpenAI-compatible chat-completion endpoints, tried strictly in order.
# A provider is only attempted when its API key is present in the env.
PROVIDERS = [
    {
        "name": "cerebras",
        "url": "https://api.cerebras.ai/v1/chat/completions",
        "key_env": "CEREBRAS_API_KEY",
        "model": "qwen-3-235b-a22b-instruct-2507",
    },
    {
        "name": "groq",
        "url": "https://api.groq.com/openai/v1/chat/completions",
        "key_env": "GROQ_API_KEY",
        "model": "llama-3.3-70b-versatile",
    },
    {
        "name": "openrouter",
        "url": "https://openrouter.ai/api/v1/chat/completions",
        "key_env": "OPENROUTER_API_KEY",
        "model": "tencent/hy3-preview:free",
    },
]


def call_llm(prompt: str, timeout: int = 90) -> str | None:
    """Send *prompt* to the first provider that answers.

    Iterates PROVIDERS in order; providers without an API key in the
    environment are skipped, and any request/parse failure falls through
    to the next one. Returns the assistant message content on success,
    or None when every provider fails (caller treats that as a no-op).
    """
    for provider in PROVIDERS:
        api_key = os.environ.get(provider["key_env"], "").strip()
        if not api_key:
            continue
        payload = {
            "model": provider["model"],
            "messages": [
                {"role": "system",
                 "content": "You are a senior tech recruiter who reads thousands of job descriptions. Output clean comma-separated keyword lists, no prose."},
                {"role": "user", "content": prompt},
            ],
            "max_tokens": 1500,
            "temperature": 0.4,
        }
        request = urllib.request.Request(
            provider["url"],
            data=json.dumps(payload).encode(),
            headers={
                "Authorization": f"Bearer {api_key}",
                "Content-Type": "application/json",
                "User-Agent": "Mozilla/5.0 surrogate-1/expand-keywords",
            },
            method="POST",
        )
        try:
            with urllib.request.urlopen(request, timeout=timeout) as resp:
                parsed = json.loads(resp.read())
            # Defensive: tolerate an empty/missing "choices" array.
            first_choice = (parsed.get("choices") or [{}])[0]
            text = first_choice.get("message", {}).get("content", "").strip()
            if text:
                print(f" [{provider['name']}] ok ({len(text)} chars)", flush=True)
                return text
        except Exception as exc:  # any failure → try next provider
            print(f" [{provider['name']}] err: {type(exc).__name__}: {str(exc)[:80]}", flush=True)
            continue
    return None
86
+
87
+
88
+ def expand_role(role_name: str, role_def: dict) -> list[str]:
89
+ core = role_def.get("core", [])
90
+ adjacent = role_def.get("adjacent", [])
91
+ prompt = f"""Role: {role_name}
92
+
93
+ Existing core skills: {', '.join(core)}
94
+ Adjacent skills: {', '.join(adjacent)}
95
+
96
+ Task: Output exactly 80 highly specific keyword phrases (3-6 words each) that this role's job description would mention. Focus on:
97
+ - specific frameworks, tools, libraries by name
98
+ - concrete certifications and standards (CKA, AWS SAA, ISO 27001, etc.)
99
+ - specific design patterns and methodologies
100
+ - production-grade vocabulary used by senior engineers
101
+ - emerging 2025-2026 tech in this domain
102
+
103
+ Output: comma-separated list. NO numbering. NO categories. NO explanatory text. Just keywords."""
104
+
105
+ response = call_llm(prompt)
106
+ if not response:
107
+ return []
108
+
109
+ # Parse comma-separated keywords, strip noise
110
+ kws = []
111
+ for piece in response.replace(";", ",").split(","):
112
+ kw = piece.strip().strip(".\"'`*-•").strip()
113
+ # remove leading numbers like "1. " or "1) "
114
+ if kw and kw[0].isdigit():
115
+ for sep in (". ", ") ", "- "):
116
+ if sep in kw[:5]:
117
+ kw = kw.split(sep, 1)[1].strip()
118
+ break
119
+ if 3 <= len(kw) <= 80 and any(c.isalpha() for c in kw):
120
+ kws.append(kw.lower())
121
+
122
+ # Dedup keep order
123
+ seen = set()
124
+ deduped = []
125
+ for k in kws:
126
+ if k not in seen:
127
+ seen.add(k)
128
+ deduped.append(k)
129
+ return deduped[:80]
130
+
131
+
132
def main() -> None:
    """Refresh the "expanded" keyword list for every role in the map.

    Loads role-knowledge-map.json, asks the LLM for extra keywords per
    role, merges them as a sorted union with any previous "expanded"
    list (so re-runs only refresh, never lose, keywords), and writes
    the whole map back in place. Roles whose expansion fails are left
    untouched and do not block the others.
    """
    if not ROLE_MAP_PATH.exists():
        sys.exit(f"role-knowledge-map.json not found at {ROLE_MAP_PATH}")

    data = json.loads(ROLE_MAP_PATH.read_text())
    roles = data.get("roles", {})
    if not roles:
        sys.exit("no roles in map")

    total_added = 0
    for role_name, role_def in roles.items():
        prev = role_def.get("expanded", [])
        n_core = len(role_def.get("core", []))
        n_adj = len(role_def.get("adjacent", []))
        print(f"\n▶ {role_name} (existing core={n_core} adjacent={n_adj} expanded={len(prev)})", flush=True)

        fresh = expand_role(role_name, role_def)
        if not fresh:
            print(" (no expansion — all providers failed)", flush=True)
            continue

        # Union with whatever was already expanded; sorted for stable diffs.
        role_def["expanded"] = sorted(set(prev) | set(fresh))
        gained = len(role_def["expanded"]) - len(prev)
        total_added += gained
        print(f" +{gained} keywords (total expanded={len(role_def['expanded'])})", flush=True)
        time.sleep(2)  # gentle rate-limit between roles

    # Persist the updated map in place.
    ROLE_MAP_PATH.write_text(json.dumps(data, indent=2, ensure_ascii=False))
    print(f"\n✅ wrote {ROLE_MAP_PATH} — added {total_added} new keywords across {len(roles)} roles")


if __name__ == "__main__":
    main()
bin/hermes-status-server.py CHANGED
@@ -167,7 +167,7 @@ def log_tail(name: str, lines: int = 100) -> PlainTextResponse:
167
  "auto-orchestrate-loop", "training-push", "ollama", "discord-bot",
168
  "hermes-discord-bot", "surrogate-research-loop", "surrogate-research-apply",
169
  "surrogate-dev-loop", "domain-scrape-loop", "github-domain-scrape",
170
- "qwen-coder", "git-clone", "git-pull", "redis", "parquet-direct-ingest", "bulk-ingest-parallel", "rag-vector-builder", "auto-orchestrate-continuous", "dataset-enrich", "hf-dataset-discoverer", "dedup-bootstrap", "github-agentic-crawler", "ollama-pull-granite", "synthetic-data", "self-ingest", "scrape-sre-postmortems", "refresh-cve-feed", "self-heal-watchdog", "gh-actions-ticker", "llm-burst-generator",
171
  "ollama-pull-coder", "ollama-pull-devstral", "ollama-pull-fallback",
172
  "ollama-pull-yicoder", "ollama-pull-embed", "ollama-pull-light",
173
  }
 
167
  "auto-orchestrate-loop", "training-push", "ollama", "discord-bot",
168
  "hermes-discord-bot", "surrogate-research-loop", "surrogate-research-apply",
169
  "surrogate-dev-loop", "domain-scrape-loop", "github-domain-scrape",
170
+ "qwen-coder", "git-clone", "git-pull", "redis", "parquet-direct-ingest", "bulk-ingest-parallel", "rag-vector-builder", "auto-orchestrate-continuous", "dataset-enrich", "hf-dataset-discoverer", "dedup-bootstrap", "github-agentic-crawler", "ollama-pull-granite", "synthetic-data", "self-ingest", "scrape-sre-postmortems", "refresh-cve-feed", "self-heal-watchdog", "gh-actions-ticker", "llm-burst-generator", "expand-role-keywords",
171
  "ollama-pull-coder", "ollama-pull-devstral", "ollama-pull-fallback",
172
  "ollama-pull-yicoder", "ollama-pull-embed", "ollama-pull-light",
173
  }
bin/hf-dataset-discoverer.py CHANGED
@@ -48,6 +48,11 @@ def _load_role_queries() -> list[tuple[str, str]]:
48
  queries.append((q, f"{role}-core"))
49
  for q in (skills.get("adjacent") or []):
50
  queries.append((q, f"{role}-adj"))
 
 
 
 
 
51
  for q in data.get("cross_cutting_topics") or []:
52
  queries.append((q, "cross-cutting"))
53
  # Plus baseline queries (NEVER static — discoverer must keep finding)
 
48
  queries.append((q, f"{role}-core"))
49
  for q in (skills.get("adjacent") or []):
50
  queries.append((q, f"{role}-adj"))
51
+ # NEW: LLM-expanded keywords from real job-description research
52
+ # (filled by expand-role-keywords.py running weekly via cron).
53
+ # 80 keywords per role x 17 roles = up to 1,360 extra search terms.
54
+ for q in (skills.get("expanded") or []):
55
+ queries.append((q, f"{role}-exp"))
56
  for q in data.get("cross_cutting_topics") or []:
57
  queries.append((q, "cross-cutting"))
58
  # Plus baseline queries (NEVER static — discoverer must keep finding)
start.sh CHANGED
@@ -333,6 +333,10 @@ while true; do
333
  [[ $((M % 1440)) -eq 240 ]] && bash ~/.surrogate/bin/refresh-cve-feed.sh >> "$LOG" 2>&1 &
334
  # Daily 05:00 UTC: scrape SRE postmortems (danluu list + awesome-tech-postmortems)
335
  [[ $((M % 1440)) -eq 300 ]] && bash ~/.surrogate/bin/scrape-sre-postmortems.sh >> "$LOG" 2>&1 &
 
 
 
 
336
  sleep 60
337
  done
338
  CRONSH
 
333
  [[ $((M % 1440)) -eq 240 ]] && bash ~/.surrogate/bin/refresh-cve-feed.sh >> "$LOG" 2>&1 &
334
  # Daily 05:00 UTC: scrape SRE postmortems (danluu list + awesome-tech-postmortems)
335
  [[ $((M % 1440)) -eq 300 ]] && bash ~/.surrogate/bin/scrape-sre-postmortems.sh >> "$LOG" 2>&1 &
336
+ # Daily 06:00 UTC: LLM-expand role keywords (sends each role's skills to
337
+ # Cerebras/Groq → +80 specific job-description-style search terms each).
338
+ # Discoverer auto-uses the expanded list on its next cycle.
339
+ [[ $((M % 1440)) -eq 360 ]] && python3 ~/.surrogate/bin/expand-role-keywords.py >> "$LOG_DIR/expand-role-keywords.log" 2>&1 &
340
  sleep 60
341
  done
342
  CRONSH