fix: hypothesis_hunter v2 — pass field descriptions, signs, arities + strict constraint to use ONLY listed fields

Browse files

Files changed (1) hide show

alpha_factory/personas/hypothesis_hunter.py +59 -33

alpha_factory/personas/hypothesis_hunter.py CHANGED Viewed

@@ -1,36 +1,61 @@
 """
-Hypothesis Hunter — Persona 1 (Microfish)
-Generates novel factor blueprints grounded in academic research.
 """
 from ..infra.llm_client import LLMClient
 from ..schemas import Blueprint, AnomalyTag
-from ..deterministic.theme_sampler import THEME_FIELDS, PROVEN_ARCHETYPES
 SYSTEM_PROMPT = """You are a senior quantitative researcher at a systematic hedge fund.
 Your job is to propose novel cross-sectional equity factor hypotheses for the WorldQuant BRAIN platform.
 CRITICAL RULES:
-1. You MUST distinguish between AGGREGATE-MARKET anomalies and CROSS-SECTIONAL anomalies.
-   - Aggregate: signals about the overall market (VIX, aggregate PCR). Signs apply to SPY/market.
-   - Cross-sectional: signals per-stock. Signs determine which stocks to long/short.
-   - If your academic anchor studies aggregate returns, you CANNOT reuse that sign cross-sectionally without separate evidence.
-2. Every hypothesis MUST cite at least one academic paper (arXiv ID or DOI).
-3. Every component MUST specify the expected cross-sectional sign direction:
-   - "long_high": higher values → long (e.g., value: high book-to-price → long)
-   - "long_low": lower values → long (e.g., reversal: low recent returns → long/mean-revert)
-4. Prefer PROVEN ARCHETYPES unless you have strong paper-backed reason for a novel structure.
-   Known archetypes: {archetypes}
-5. Novelty claim must explain WHY this is different from existing alphas.
-6. Max 5 components. Fewer is better — concentrate on 1-2 dominant signals with support.
-Available themes for this round: {theme}
-Available fields in this theme: {fields}
 """
@@ -40,17 +65,16 @@ async def generate_hypothesis(
     retrieved_papers: list[str],
     existing_anomaly_tags: list[str],
 ) -> Blueprint:
-    """
-    Generate a novel alpha hypothesis blueprint.
-    Uses tier="microfish" — ModelManager resolves to the user's selected model.
-    """
     fields = THEME_FIELDS.get(theme, [])
     archetypes_str = ", ".join(PROVEN_ARCHETYPES)
     system = SYSTEM_PROMPT.format(
         archetypes=archetypes_str,
-        theme=theme,
-        fields=", ".join(fields),
     )
     papers_context = "\n\n".join([
@@ -60,20 +84,22 @@ async def generate_hypothesis(
     saturated = [tag for tag in set(existing_anomaly_tags) if existing_anomaly_tags.count(tag) >= 3]
     saturated_str = ", ".join(saturated) if saturated else "none"
-    user_prompt = f"""Generate a novel cross-sectional equity alpha hypothesis for the theme: "{theme}".
-CONTEXT — Retrieved academic papers:
 {papers_context}
 CONSTRAINTS:
-- Fields available: {', '.join(fields)}
-- Saturated anomaly tags (AVOID these): {saturated_str}
-- Target universe: USA TOP3000, daily frequency
-- Must work cross-sectionally (rank stocks, not predict market direction)
-Output a complete Blueprint JSON with all required fields.
-Ensure novelty_claim is specific (not "this is novel because it's different").
-Ensure each component has a clear sign_direction with academic justification."""
     blueprint = await llm.generate_json(
         prompt=user_prompt,

 """
+Hypothesis Hunter v2 — Persona 1 (Microfish)
+Now passes FULL field metadata (ID, description, sign, coverage) to LLM
+so it never invents fields or uses wrong signs.
 """
 from ..infra.llm_client import LLMClient
 from ..schemas import Blueprint, AnomalyTag
+from ..deterministic.theme_sampler import THEME_FIELDS, PROVEN_ARCHETYPES, THEME_TO_ARCHETYPE
+from ..data.brain_fields import FIELD_INDEX, SignConvention
+def _format_fields_for_llm(field_ids: list[str]) -> str:
+    """Render one bullet line per field ID for inclusion in the LLM prompt.
+
+    IDs present in ``FIELD_INDEX`` are rendered with their coverage, sign
+    convention and description. IDs that are absent still get a line, marked
+    "(metadata not available)", so every requested ID appears in the output.
+
+    Args:
+        field_ids: Field IDs to describe, in the order they should be listed.
+
+    Returns:
+        The bullet lines joined with newlines; an empty string when
+        ``field_ids`` is empty.
+    """
+    # Built once per call rather than once per field: the table is loop-invariant.
+    sign_labels = {
+        SignConvention.LONG_HIGH: "long_high (higher → buy)",
+        SignConvention.LONG_LOW: "long_low (lower → buy, negate in expression)",
+        SignConvention.CONTRARIAN: "contrarian (already inverted, use as-is)",
+        SignConvention.AMBIGUOUS: "ambiguous (check context)",
+    }
+    # A sign value missing from the table is reported as ambiguous instead of
+    # raising KeyError, so one unmapped field cannot abort prompt construction.
+    fallback_label = sign_labels[SignConvention.AMBIGUOUS]
+    lines = []
+    for fid in field_ids:
+        if fid not in FIELD_INDEX:
+            lines.append(f"  • {fid} | (metadata not available)")
+            continue
+        meta = FIELD_INDEX[fid]
+        sign_str = sign_labels.get(meta.sign, fallback_label)
+        lines.append(f"  • {meta.id} | coverage={meta.coverage:.0%} | sign={sign_str} | {meta.description}")
+    return "\n".join(lines)
+# System prompt template for the hypothesis-generation call. It is rendered with
+# str.format() in generate_hypothesis, which supplies three placeholders:
+#   {archetypes}            - comma-joined PROVEN_ARCHETYPES
+#   {recommended_archetype} - THEME_TO_ARCHETYPE entry for the theme ("novel" if absent)
+#   {field_details}         - output of _format_fields_for_llm for the theme's fields
+# Because the template goes through str.format(), any literal brace added to the
+# text must be doubled ({{ or }}).
 SYSTEM_PROMPT = """You are a senior quantitative researcher at a systematic hedge fund.
 Your job is to propose novel cross-sectional equity factor hypotheses for the WorldQuant BRAIN platform.
 CRITICAL RULES:
+1. You MUST use ONLY the exact field IDs listed below. Do NOT invent field names.
+   Copy the field ID exactly as written (case-sensitive, underscores included).
+2. Every component MUST specify the cross-sectional sign direction:
+   - "long_high": higher values → long (yield, quality, growth)
+   - "long_low": lower values → long (risk, distress, illiquidity — will be negated)
+3. The sign_direction MUST match the field's documented sign convention.
+   If a field is labeled "long_low", your component must also say "long_low".
+4. Prefer PROVEN ARCHETYPES for the archetype field:
+   {archetypes}
+   Recommended archetype for this theme: {recommended_archetype}
+5. Max 3 components. Simpler is better — 1-2 fields dominate performance.
+6. Every ts_* operator needs a lookback window (days). Suggested: 5, 10, 21, 42, 63, 126, 252.
+7. For fields with coverage < 70%, the expression compiler will auto-apply ts_backfill(field, 30).
+AVAILABLE FIELDS (use ONLY these exact IDs):
+{field_details}
+NEUTRALIZATION options: sector, industry, subindustry
 """
     retrieved_papers: list[str],
     existing_anomaly_tags: list[str],
 ) -> Blueprint:
+    """Generate a novel alpha hypothesis blueprint."""
     fields = THEME_FIELDS.get(theme, [])
     archetypes_str = ", ".join(PROVEN_ARCHETYPES)
+    recommended = THEME_TO_ARCHETYPE.get(theme, "novel")
+    field_details = _format_fields_for_llm(fields)
     system = SYSTEM_PROMPT.format(
         archetypes=archetypes_str,
+        recommended_archetype=recommended,
+        field_details=field_details,
     )
     papers_context = "\n\n".join([
     saturated = [tag for tag in set(existing_anomaly_tags) if existing_anomaly_tags.count(tag) >= 3]
     saturated_str = ", ".join(saturated) if saturated else "none"
+    user_prompt = f"""Generate a novel cross-sectional equity alpha hypothesis for theme: "{theme}".
+ACADEMIC CONTEXT:
 {papers_context}
 CONSTRAINTS:
+- USE ONLY these field IDs: {', '.join(fields)}
+- Saturated anomaly tags to AVOID: {saturated_str}
+- Universe: USA TOP3000, delay=1
+- Must work cross-sectionally (rank stocks, not predict market)
+- Recommended archetype: {recommended}
+- Set decay between 5-10 (controls turnover)
+Output a complete Blueprint JSON.
+The 'fields' array in each component MUST contain exact field IDs from the list above.
+Do NOT invent or modify field names."""
     blueprint = await llm.generate_json(
         prompt=user_prompt,