gaurv007 committed on
Commit
8bc677b
·
verified ·
1 Parent(s): ba25095

fix: hypothesis_hunter v2 — full field metadata to LLM, strict field constraint

Browse files
alpha_factory/personas/hypothesis_hunter.py CHANGED
@@ -1,7 +1,6 @@
1
  """
2
  Hypothesis Hunter v2 — Persona 1 (Microfish)
3
- Now passes FULL field metadata (ID, description, sign, coverage) to LLM
4
- so it never invents fields or uses wrong signs.
5
  """
6
  from ..infra.llm_client import LLMClient
7
  from ..schemas import Blueprint, AnomalyTag
@@ -9,63 +8,41 @@ from ..deterministic.theme_sampler import THEME_FIELDS, PROVEN_ARCHETYPES, THEME
9
  from ..data.brain_fields import FIELD_INDEX, SignConvention
10
 
11
 
12
- def _format_fields_for_llm(field_ids: list[str]) -> str:
13
- """Format field metadata so LLM knows exact IDs, signs, and descriptions."""
14
  lines = []
15
  for fid in field_ids:
16
  if fid in FIELD_INDEX:
17
  f = FIELD_INDEX[fid]
18
- sign_str = {
19
- SignConvention.LONG_HIGH: "long_high (higherbuy)",
20
- SignConvention.LONG_LOW: "long_low (lowerbuy, negate in expression)",
21
- SignConvention.CONTRARIAN: "contrarian (already inverted, use as-is)",
22
- SignConvention.AMBIGUOUS: "ambiguous (check context)",
23
- }[f.sign]
24
- lines.append(f" {f.id} | coverage={f.coverage:.0%} | sign={sign_str} | {f.description}")
25
  else:
26
- lines.append(f" {fid} | (metadata not available)")
27
  return "\n".join(lines)
28
 
29
 
30
- SYSTEM_PROMPT = """You are a senior quantitative researcher at a systematic hedge fund.
31
- Your job is to propose novel cross-sectional equity factor hypotheses for the WorldQuant BRAIN platform.
32
 
33
- CRITICAL RULES:
34
- 1. You MUST use ONLY the exact field IDs listed below. Do NOT invent field names.
35
- Copy the field ID exactly as written (case-sensitive, underscores included).
 
 
 
 
36
 
37
- 2. Every component MUST specify the cross-sectional sign direction:
38
- - "long_high": higher values → long (yield, quality, growth)
39
- - "long_low": lower values → long (risk, distress, illiquidity — will be negated)
40
-
41
- 3. The sign_direction MUST match the field's documented sign convention.
42
- If a field is labeled "long_low", your component must also say "long_low".
43
-
44
- 4. Prefer PROVEN ARCHETYPES for the archetype field:
45
- {archetypes}
46
-
47
- Recommended archetype for this theme: {recommended_archetype}
48
-
49
- 5. Max 3 components. Simpler is better — 1-2 fields dominate performance.
50
-
51
- 6. Every ts_* operator needs a lookback window (days). Suggested: 5, 10, 21, 42, 63, 126, 252.
52
-
53
- 7. For fields with coverage < 70%, the expression compiler will auto-apply ts_backfill(field, 30).
54
-
55
- AVAILABLE FIELDS (use ONLY these exact IDs):
56
  {field_details}
57
 
58
- NEUTRALIZATION options: sector, industry, subindustry
59
  """
60
 
61
 
62
- async def generate_hypothesis(
63
- llm: LLMClient,
64
- theme: str,
65
- retrieved_papers: list[str],
66
- existing_anomaly_tags: list[str],
67
- ) -> Blueprint:
68
- """Generate a novel alpha hypothesis blueprint."""
69
  fields = THEME_FIELDS.get(theme, [])
70
  archetypes_str = ", ".join(PROVEN_ARCHETYPES)
71
  recommended = THEME_TO_ARCHETYPE.get(theme, "novel")
@@ -78,35 +55,25 @@ async def generate_hypothesis(
78
  )
79
 
80
  papers_context = "\n\n".join([
81
- f"PAPER {i+1}:\n{paper}" for i, paper in enumerate(retrieved_papers[:3])
82
- ]) if retrieved_papers else "No papers retrieved. Use your domain knowledge but mark academic_anchor as null."
83
 
84
- saturated = [tag for tag in set(existing_anomaly_tags) if existing_anomaly_tags.count(tag) >= 3]
85
- saturated_str = ", ".join(saturated) if saturated else "none"
86
 
87
- user_prompt = f"""Generate a novel cross-sectional equity alpha hypothesis for theme: "{theme}".
88
 
89
- ACADEMIC CONTEXT:
90
- {papers_context}
91
 
92
  CONSTRAINTS:
93
- - USE ONLY these field IDs: {', '.join(fields)}
94
- - Saturated anomaly tags to AVOID: {saturated_str}
95
  - Universe: USA TOP3000, delay=1
96
- - Must work cross-sectionally (rank stocks, not predict market)
97
- - Recommended archetype: {recommended}
98
- - Set decay between 5-10 (controls turnover)
99
-
100
- Output a complete Blueprint JSON.
101
- The 'fields' array in each component MUST contain exact field IDs from the list above.
102
- Do NOT invent or modify field names."""
103
-
104
- blueprint = await llm.generate_json(
105
- prompt=user_prompt,
106
- schema=Blueprint,
107
- tier="microfish",
108
- temperature=0.7,
109
- system_prompt=system,
110
- )
111
 
112
- return blueprint
 
 
 
 
 
 
1
  """
2
  Hypothesis Hunter v2 — Persona 1 (Microfish)
3
+ Passes FULL field metadata to LLM so it never invents fields.
 
4
  """
5
  from ..infra.llm_client import LLMClient
6
  from ..schemas import Blueprint, AnomalyTag
 
8
  from ..data.brain_fields import FIELD_INDEX, SignConvention
9
 
10
 
11
def _format_fields_for_llm(field_ids: list[str]) -> str:
    """Render one prompt line per field ID for the LLM's field list.

    Args:
        field_ids: Field IDs to describe, in the order they should appear.

    Returns:
        The per-field lines joined with newlines (empty string for empty
        input). An ID found in ``FIELD_INDEX`` is rendered with its coverage
        as a whole percentage, its sign label, and its description; an ID
        not found there is rendered as the bare ID only.

    Raises:
        KeyError: If an indexed field's ``sign`` is not one of the four
            conventions mapped below.
    """
    # The labels do not depend on the field, so build the table once per call
    # instead of once per loop iteration.
    sign_labels = {
        SignConvention.LONG_HIGH: "long_high (higher->buy)",
        SignConvention.LONG_LOW: "long_low (lower->buy,negate)",
        SignConvention.CONTRARIAN: "contrarian (already inverted)",
        SignConvention.AMBIGUOUS: "ambiguous",
    }

    lines = []
    for fid in field_ids:
        if fid not in FIELD_INDEX:
            # No metadata available: still list the ID so the prompt stays
            # in sync with the caller's field list.
            lines.append(f" - {fid}")
            continue
        f = FIELD_INDEX[fid]
        lines.append(
            f" - {f.id} | cov={f.coverage:.0%} | sign={sign_labels[f.sign]} | {f.description}"
        )
    return "\n".join(lines)
26
 
27
 
28
+ SYSTEM_PROMPT = """You are a senior quant researcher. Propose cross-sectional equity factor hypotheses for WorldQuant BRAIN.
 
29
 
30
+ RULES:
31
+ 1. Use ONLY exact field IDs listed below. Do NOT invent names.
32
+ 2. sign_direction MUST match field's documented sign.
33
+ 3. Prefer PROVEN ARCHETYPES: {archetypes}
34
+ Recommended: {recommended_archetype}
35
+ 4. Max 3 components. Set decay 5-10.
36
+ 5. ts_* operators need (field, days). Windows: 5,10,21,42,63,126,252.
37
 
38
+ AVAILABLE FIELDS (ONLY these):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  {field_details}
40
 
41
+ NEUTRALIZATION: sector, industry, subindustry
42
  """
43
 
44
 
45
+ async def generate_hypothesis(llm, theme, retrieved_papers, existing_anomaly_tags):
 
 
 
 
 
 
46
  fields = THEME_FIELDS.get(theme, [])
47
  archetypes_str = ", ".join(PROVEN_ARCHETYPES)
48
  recommended = THEME_TO_ARCHETYPE.get(theme, "novel")
 
55
  )
56
 
57
  papers_context = "\n\n".join([
58
+ f"PAPER {i+1}:\n{p}" for i, p in enumerate(retrieved_papers[:3])
59
+ ]) if retrieved_papers else "No papers. Use domain knowledge, set academic_anchor=null."
60
 
61
+ saturated = [t for t in set(existing_anomaly_tags) if existing_anomaly_tags.count(t) >= 3]
 
62
 
63
+ user_prompt = f"""Generate alpha hypothesis for theme: "{theme}".
64
 
65
+ CONTEXT: {papers_context}
 
66
 
67
  CONSTRAINTS:
68
+ - ONLY these fields: {', '.join(fields)}
69
+ - Avoid saturated tags: {', '.join(saturated) or 'none'}
70
  - Universe: USA TOP3000, delay=1
71
+ - Archetype: {recommended}
72
+ - Decay: 5-10
 
 
 
 
 
 
 
 
 
 
 
 
 
73
 
74
+ Output Blueprint JSON. fields array MUST use exact IDs from list above."""
75
+
76
+ return await llm.generate_json(
77
+ prompt=user_prompt, schema=Blueprint, tier="microfish",
78
+ temperature=0.7, system_prompt=system,
79
+ )