gaurv007 committed on
Commit
ba25095
·
verified ·
1 Parent(s): ede9d13

fix: hypothesis_hunter v2 — pass field descriptions, signs, arities + strict constraint to use ONLY listed fields

Browse files
alpha_factory/personas/hypothesis_hunter.py CHANGED
@@ -1,36 +1,61 @@
1
  """
2
- Hypothesis Hunter — Persona 1 (Microfish)
3
- Generates novel factor blueprints grounded in academic research.
 
4
  """
5
  from ..infra.llm_client import LLMClient
6
  from ..schemas import Blueprint, AnomalyTag
7
- from ..deterministic.theme_sampler import THEME_FIELDS, PROVEN_ARCHETYPES
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
 
10
  SYSTEM_PROMPT = """You are a senior quantitative researcher at a systematic hedge fund.
11
  Your job is to propose novel cross-sectional equity factor hypotheses for the WorldQuant BRAIN platform.
12
 
13
  CRITICAL RULES:
14
- 1. You MUST distinguish between AGGREGATE-MARKET anomalies and CROSS-SECTIONAL anomalies.
15
- - Aggregate: signals about the overall market (VIX, aggregate PCR). Signs apply to SPY/market.
16
- - Cross-sectional: signals per-stock. Signs determine which stocks to long/short.
17
- - If your academic anchor studies aggregate returns, you CANNOT reuse that sign cross-sectionally without separate evidence.
18
 
19
- 2. Every hypothesis MUST cite at least one academic paper (arXiv ID or DOI).
 
 
20
 
21
- 3. Every component MUST specify the expected cross-sectional sign direction:
22
- - "long_high": higher values → long (e.g., value: high book-to-price → long)
23
- - "long_low": lower values → long (e.g., reversal: low recent returns → long/mean-revert)
24
 
25
- 4. Prefer PROVEN ARCHETYPES unless you have strong paper-backed reason for a novel structure.
26
- Known archetypes: {archetypes}
 
 
27
 
28
- 5. Novelty claim must explain WHY this is different from existing alphas.
29
 
30
- 6. Max 5 components. Fewer is better — concentrate on 1-2 dominant signals with support.
31
 
32
- Available themes for this round: {theme}
33
- Available fields in this theme: {fields}
 
 
 
 
34
  """
35
 
36
 
@@ -40,17 +65,16 @@ async def generate_hypothesis(
40
  retrieved_papers: list[str],
41
  existing_anomaly_tags: list[str],
42
  ) -> Blueprint:
43
- """
44
- Generate a novel alpha hypothesis blueprint.
45
- Uses tier="microfish" — ModelManager resolves to the user's selected model.
46
- """
47
  fields = THEME_FIELDS.get(theme, [])
48
  archetypes_str = ", ".join(PROVEN_ARCHETYPES)
 
 
49
 
50
  system = SYSTEM_PROMPT.format(
51
  archetypes=archetypes_str,
52
- theme=theme,
53
- fields=", ".join(fields),
54
  )
55
 
56
  papers_context = "\n\n".join([
@@ -60,20 +84,22 @@ async def generate_hypothesis(
60
  saturated = [tag for tag in set(existing_anomaly_tags) if existing_anomaly_tags.count(tag) >= 3]
61
  saturated_str = ", ".join(saturated) if saturated else "none"
62
 
63
- user_prompt = f"""Generate a novel cross-sectional equity alpha hypothesis for the theme: "{theme}".
64
 
65
- CONTEXT — Retrieved academic papers:
66
  {papers_context}
67
 
68
  CONSTRAINTS:
69
- - Fields available: {', '.join(fields)}
70
- - Saturated anomaly tags (AVOID these): {saturated_str}
71
- - Target universe: USA TOP3000, daily frequency
72
- - Must work cross-sectionally (rank stocks, not predict market direction)
73
-
74
- Output a complete Blueprint JSON with all required fields.
75
- Ensure novelty_claim is specific (not "this is novel because it's different").
76
- Ensure each component has a clear sign_direction with academic justification."""
 
 
77
 
78
  blueprint = await llm.generate_json(
79
  prompt=user_prompt,
 
1
  """
2
+ Hypothesis Hunter v2 — Persona 1 (Microfish)
3
+ Now passes FULL field metadata (ID, description, sign, coverage) to LLM
4
+ so it never invents fields or uses wrong signs.
5
  """
6
  from ..infra.llm_client import LLMClient
7
  from ..schemas import Blueprint, AnomalyTag
8
+ from ..deterministic.theme_sampler import THEME_FIELDS, PROVEN_ARCHETYPES, THEME_TO_ARCHETYPE
9
+ from ..data.brain_fields import FIELD_INDEX, SignConvention
10
+
11
+
12
+ def _format_fields_for_llm(field_ids: list[str]) -> str:
13
+ """Format field metadata so LLM knows exact IDs, signs, and descriptions."""
14
+ lines = []
15
+ for fid in field_ids:
16
+ if fid in FIELD_INDEX:
17
+ f = FIELD_INDEX[fid]
18
+ sign_str = {
19
+ SignConvention.LONG_HIGH: "long_high (higher → buy)",
20
+ SignConvention.LONG_LOW: "long_low (lower → buy, negate in expression)",
21
+ SignConvention.CONTRARIAN: "contrarian (already inverted, use as-is)",
22
+ SignConvention.AMBIGUOUS: "ambiguous (check context)",
23
+ }[f.sign]
24
+ lines.append(f" • {f.id} | coverage={f.coverage:.0%} | sign={sign_str} | {f.description}")
25
+ else:
26
+ lines.append(f" • {fid} | (metadata not available)")
27
+ return "\n".join(lines)
28
 
29
 
30
  SYSTEM_PROMPT = """You are a senior quantitative researcher at a systematic hedge fund.
31
  Your job is to propose novel cross-sectional equity factor hypotheses for the WorldQuant BRAIN platform.
32
 
33
  CRITICAL RULES:
34
+ 1. You MUST use ONLY the exact field IDs listed below. Do NOT invent field names.
35
+ Copy the field ID exactly as written (case-sensitive, underscores included).
 
 
36
 
37
+ 2. Every component MUST specify the cross-sectional sign direction:
38
+ - "long_high": higher values → long (yield, quality, growth)
39
+ - "long_low": lower values → long (risk, distress, illiquidity — will be negated)
40
 
41
+ 3. The sign_direction MUST match the field's documented sign convention.
42
+ If a field is labeled "long_low", your component must also say "long_low".
 
43
 
44
+ 4. Prefer PROVEN ARCHETYPES for the archetype field:
45
+ {archetypes}
46
+
47
+ Recommended archetype for this theme: {recommended_archetype}
48
 
49
+ 5. Max 3 components. Simpler is better — 1-2 fields dominate performance.
50
 
51
+ 6. Every ts_* operator needs a lookback window (days). Suggested: 5, 10, 21, 42, 63, 126, 252.
52
 
53
+ 7. For fields with coverage < 70%, the expression compiler will auto-apply ts_backfill(field, 30).
54
+
55
+ AVAILABLE FIELDS (use ONLY these exact IDs):
56
+ {field_details}
57
+
58
+ NEUTRALIZATION options: sector, industry, subindustry
59
  """
60
 
61
 
 
65
  retrieved_papers: list[str],
66
  existing_anomaly_tags: list[str],
67
  ) -> Blueprint:
68
+ """Generate a novel alpha hypothesis blueprint."""
 
 
 
69
  fields = THEME_FIELDS.get(theme, [])
70
  archetypes_str = ", ".join(PROVEN_ARCHETYPES)
71
+ recommended = THEME_TO_ARCHETYPE.get(theme, "novel")
72
+ field_details = _format_fields_for_llm(fields)
73
 
74
  system = SYSTEM_PROMPT.format(
75
  archetypes=archetypes_str,
76
+ recommended_archetype=recommended,
77
+ field_details=field_details,
78
  )
79
 
80
  papers_context = "\n\n".join([
 
84
  saturated = [tag for tag in set(existing_anomaly_tags) if existing_anomaly_tags.count(tag) >= 3]
85
  saturated_str = ", ".join(saturated) if saturated else "none"
86
 
87
+ user_prompt = f"""Generate a novel cross-sectional equity alpha hypothesis for theme: "{theme}".
88
 
89
+ ACADEMIC CONTEXT:
90
  {papers_context}
91
 
92
  CONSTRAINTS:
93
+ - USE ONLY these field IDs: {', '.join(fields)}
94
+ - Saturated anomaly tags to AVOID: {saturated_str}
95
+ - Universe: USA TOP3000, delay=1
96
+ - Must work cross-sectionally (rank stocks, not predict market)
97
+ - Recommended archetype: {recommended}
98
+ - Set decay between 5-10 (controls turnover)
99
+
100
+ Output a complete Blueprint JSON.
101
+ The 'fields' array in each component MUST contain exact field IDs from the list above.
102
+ Do NOT invent or modify field names."""
103
 
104
  blueprint = await llm.generate_json(
105
  prompt=user_prompt,