fix: hypothesis_hunter v2 — full field metadata to LLM, strict field constraint

Browse files

Files changed (1)

alpha_factory/personas/hypothesis_hunter.py +36 -69

alpha_factory/personas/hypothesis_hunter.py CHANGED

@@ -1,7 +1,6 @@
 """
 Hypothesis Hunter v2 — Persona 1 (Microfish)
-Now passes FULL field metadata (ID, description, sign, coverage) to LLM
-so it never invents fields or uses wrong signs.
 """
 from ..infra.llm_client import LLMClient
 from ..schemas import Blueprint, AnomalyTag
@@ -9,63 +8,41 @@ from ..deterministic.theme_sampler import THEME_FIELDS, PROVEN_ARCHETYPES, THEME
 from ..data.brain_fields import FIELD_INDEX, SignConvention
-def _format_fields_for_llm(field_ids: list[str]) -> str:
-    """Format field metadata so LLM knows exact IDs, signs, and descriptions."""
     lines = []
     for fid in field_ids:
         if fid in FIELD_INDEX:
             f = FIELD_INDEX[fid]
-            sign_str = {
-                SignConvention.LONG_HIGH: "long_high (higher → buy)",
-                SignConvention.LONG_LOW: "long_low (lower → buy, negate in expression)",
-                SignConvention.CONTRARIAN: "contrarian (already inverted, use as-is)",
-                SignConvention.AMBIGUOUS: "ambiguous (check context)",
-            }[f.sign]
-            lines.append(f"  • {f.id} | coverage={f.coverage:.0%} | sign={sign_str} | {f.description}")
         else:
-            lines.append(f"  • {fid} | (metadata not available)")
     return "\n".join(lines)
-SYSTEM_PROMPT = """You are a senior quantitative researcher at a systematic hedge fund.
-Your job is to propose novel cross-sectional equity factor hypotheses for the WorldQuant BRAIN platform.
-CRITICAL RULES:
-1. You MUST use ONLY the exact field IDs listed below. Do NOT invent field names.
-   Copy the field ID exactly as written (case-sensitive, underscores included).
-2. Every component MUST specify the cross-sectional sign direction:
-   - "long_high": higher values → long (yield, quality, growth)
-   - "long_low": lower values → long (risk, distress, illiquidity — will be negated)
-3. The sign_direction MUST match the field's documented sign convention.
-   If a field is labeled "long_low", your component must also say "long_low".
-4. Prefer PROVEN ARCHETYPES for the archetype field:
-   {archetypes}
-   Recommended archetype for this theme: {recommended_archetype}
-5. Max 3 components. Simpler is better — 1-2 fields dominate performance.
-6. Every ts_* operator needs a lookback window (days). Suggested: 5, 10, 21, 42, 63, 126, 252.
-7. For fields with coverage < 70%, the expression compiler will auto-apply ts_backfill(field, 30).
-AVAILABLE FIELDS (use ONLY these exact IDs):
 {field_details}
-NEUTRALIZATION options: sector, industry, subindustry
 """
-async def generate_hypothesis(
-    llm: LLMClient,
-    theme: str,
-    retrieved_papers: list[str],
-    existing_anomaly_tags: list[str],
-) -> Blueprint:
-    """Generate a novel alpha hypothesis blueprint."""
     fields = THEME_FIELDS.get(theme, [])
     archetypes_str = ", ".join(PROVEN_ARCHETYPES)
     recommended = THEME_TO_ARCHETYPE.get(theme, "novel")
@@ -78,35 +55,25 @@ async def generate_hypothesis(
     )
     papers_context = "\n\n".join([
-        f"PAPER {i+1}:\n{paper}" for i, paper in enumerate(retrieved_papers[:3])
-    ]) if retrieved_papers else "No papers retrieved. Use your domain knowledge but mark academic_anchor as null."
-    saturated = [tag for tag in set(existing_anomaly_tags) if existing_anomaly_tags.count(tag) >= 3]
-    saturated_str = ", ".join(saturated) if saturated else "none"
-    user_prompt = f"""Generate a novel cross-sectional equity alpha hypothesis for theme: "{theme}".
-ACADEMIC CONTEXT:
-{papers_context}
 CONSTRAINTS:
-- USE ONLY these field IDs: {', '.join(fields)}
-- Saturated anomaly tags to AVOID: {saturated_str}
 - Universe: USA TOP3000, delay=1
-- Must work cross-sectionally (rank stocks, not predict market)
-- Recommended archetype: {recommended}
-- Set decay between 5-10 (controls turnover)
-Output a complete Blueprint JSON.
-The 'fields' array in each component MUST contain exact field IDs from the list above.
-Do NOT invent or modify field names."""
-    blueprint = await llm.generate_json(
-        prompt=user_prompt,
-        schema=Blueprint,
-        tier="microfish",
-        temperature=0.7,
-        system_prompt=system,
-    )
-    return blueprint

 """
 Hypothesis Hunter v2 — Persona 1 (Microfish)
+Passes FULL field metadata to LLM so it never invents fields.
 """
 from ..infra.llm_client import LLMClient
 from ..schemas import Blueprint, AnomalyTag
 from ..data.brain_fields import FIELD_INDEX, SignConvention
+def _format_fields_for_llm(field_ids):
     lines = []
     for fid in field_ids:
         if fid in FIELD_INDEX:
             f = FIELD_INDEX[fid]
+            sign_map = {
+                SignConvention.LONG_HIGH: "long_high (higher->buy)",
+                SignConvention.LONG_LOW: "long_low (lower->buy,negate)",
+                SignConvention.CONTRARIAN: "contrarian (already inverted)",
+                SignConvention.AMBIGUOUS: "ambiguous",
+            }
+            lines.append(f"  - {f.id} | cov={f.coverage:.0%} | sign={sign_map[f.sign]} | {f.description}")
         else:
+            lines.append(f"  - {fid}")
     return "\n".join(lines)
+SYSTEM_PROMPT = """You are a senior quant researcher. Propose cross-sectional equity factor hypotheses for WorldQuant BRAIN.
+RULES:
+1. Use ONLY exact field IDs listed below. Do NOT invent names.
+2. sign_direction MUST match field's documented sign.
+3. Prefer PROVEN ARCHETYPES: {archetypes}
+   Recommended: {recommended_archetype}
+4. Max 3 components. Set decay 5-10.
+5. ts_* operators need (field, days). Windows: 5,10,21,42,63,126,252.
+AVAILABLE FIELDS (ONLY these):
 {field_details}
+NEUTRALIZATION: sector, industry, subindustry
 """
+async def generate_hypothesis(llm, theme, retrieved_papers, existing_anomaly_tags):
     fields = THEME_FIELDS.get(theme, [])
     archetypes_str = ", ".join(PROVEN_ARCHETYPES)
     recommended = THEME_TO_ARCHETYPE.get(theme, "novel")
     )
     papers_context = "\n\n".join([
+        f"PAPER {i+1}:\n{p}" for i, p in enumerate(retrieved_papers[:3])
+    ]) if retrieved_papers else "No papers. Use domain knowledge, set academic_anchor=null."
+    saturated = [t for t in set(existing_anomaly_tags) if existing_anomaly_tags.count(t) >= 3]
+    user_prompt = f"""Generate alpha hypothesis for theme: "{theme}".
+CONTEXT: {papers_context}
 CONSTRAINTS:
+- ONLY these fields: {', '.join(fields)}
+- Avoid saturated tags: {', '.join(saturated) or 'none'}
 - Universe: USA TOP3000, delay=1
+- Archetype: {recommended}
+- Decay: 5-10
+Output Blueprint JSON. fields array MUST use exact IDs from list above."""
+    return await llm.generate_json(
+        prompt=user_prompt, schema=Blueprint, tier="microfish",
+        temperature=0.7, system_prompt=system,
+    )