fix: hypothesis_hunter v2 — pass field descriptions, signs, arities + strict constraint to use ONLY listed fields
Browse files
alpha_factory/personas/hypothesis_hunter.py
CHANGED
|
@@ -1,36 +1,61 @@
|
|
| 1 |
"""
|
| 2 |
-
Hypothesis Hunter — Persona 1 (Microfish)
|
| 3 |
-
|
|
|
|
| 4 |
"""
|
| 5 |
from ..infra.llm_client import LLMClient
|
| 6 |
from ..schemas import Blueprint, AnomalyTag
|
| 7 |
-
from ..deterministic.theme_sampler import THEME_FIELDS, PROVEN_ARCHETYPES
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
|
| 10 |
SYSTEM_PROMPT = """You are a senior quantitative researcher at a systematic hedge fund.
|
| 11 |
Your job is to propose novel cross-sectional equity factor hypotheses for the WorldQuant BRAIN platform.
|
| 12 |
|
| 13 |
CRITICAL RULES:
|
| 14 |
-
1. You MUST
|
| 15 |
-
|
| 16 |
-
- Cross-sectional: signals per-stock. Signs determine which stocks to long/short.
|
| 17 |
-
- If your academic anchor studies aggregate returns, you CANNOT reuse that sign cross-sectionally without separate evidence.
|
| 18 |
|
| 19 |
-
2. Every
|
|
|
|
|
|
|
| 20 |
|
| 21 |
-
3.
|
| 22 |
-
|
| 23 |
-
- "long_low": lower values → long (e.g., reversal: low recent returns → long/mean-revert)
|
| 24 |
|
| 25 |
-
4. Prefer PROVEN ARCHETYPES
|
| 26 |
-
|
|
|
|
|
|
|
| 27 |
|
| 28 |
-
5.
|
| 29 |
|
| 30 |
-
6.
|
| 31 |
|
| 32 |
-
|
| 33 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
"""
|
| 35 |
|
| 36 |
|
|
@@ -40,17 +65,16 @@ async def generate_hypothesis(
|
|
| 40 |
retrieved_papers: list[str],
|
| 41 |
existing_anomaly_tags: list[str],
|
| 42 |
) -> Blueprint:
|
| 43 |
-
"""
|
| 44 |
-
Generate a novel alpha hypothesis blueprint.
|
| 45 |
-
Uses tier="microfish" — ModelManager resolves to the user's selected model.
|
| 46 |
-
"""
|
| 47 |
fields = THEME_FIELDS.get(theme, [])
|
| 48 |
archetypes_str = ", ".join(PROVEN_ARCHETYPES)
|
|
|
|
|
|
|
| 49 |
|
| 50 |
system = SYSTEM_PROMPT.format(
|
| 51 |
archetypes=archetypes_str,
|
| 52 |
-
|
| 53 |
-
|
| 54 |
)
|
| 55 |
|
| 56 |
papers_context = "\n\n".join([
|
|
@@ -60,20 +84,22 @@ async def generate_hypothesis(
|
|
| 60 |
saturated = [tag for tag in set(existing_anomaly_tags) if existing_anomaly_tags.count(tag) >= 3]
|
| 61 |
saturated_str = ", ".join(saturated) if saturated else "none"
|
| 62 |
|
| 63 |
-
user_prompt = f"""Generate a novel cross-sectional equity alpha hypothesis for
|
| 64 |
|
| 65 |
-
|
| 66 |
{papers_context}
|
| 67 |
|
| 68 |
CONSTRAINTS:
|
| 69 |
-
-
|
| 70 |
-
- Saturated anomaly tags
|
| 71 |
-
-
|
| 72 |
-
- Must work cross-sectionally (rank stocks, not predict market
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
|
|
|
|
|
|
| 77 |
|
| 78 |
blueprint = await llm.generate_json(
|
| 79 |
prompt=user_prompt,
|
|
|
|
| 1 |
"""
|
| 2 |
+
Hypothesis Hunter v2 — Persona 1 (Microfish)
|
| 3 |
+
Now passes FULL field metadata (ID, description, sign, coverage) to LLM
|
| 4 |
+
so it never invents fields or uses wrong signs.
|
| 5 |
"""
|
| 6 |
from ..infra.llm_client import LLMClient
|
| 7 |
from ..schemas import Blueprint, AnomalyTag
|
| 8 |
+
from ..deterministic.theme_sampler import THEME_FIELDS, PROVEN_ARCHETYPES, THEME_TO_ARCHETYPE
|
| 9 |
+
from ..data.brain_fields import FIELD_INDEX, SignConvention
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def _format_fields_for_llm(field_ids: list[str]) -> str:
|
| 13 |
+
"""Format field metadata so LLM knows exact IDs, signs, and descriptions."""
|
| 14 |
+
lines = []
|
| 15 |
+
for fid in field_ids:
|
| 16 |
+
if fid in FIELD_INDEX:
|
| 17 |
+
f = FIELD_INDEX[fid]
|
| 18 |
+
sign_str = {
|
| 19 |
+
SignConvention.LONG_HIGH: "long_high (higher → buy)",
|
| 20 |
+
SignConvention.LONG_LOW: "long_low (lower → buy, negate in expression)",
|
| 21 |
+
SignConvention.CONTRARIAN: "contrarian (already inverted, use as-is)",
|
| 22 |
+
SignConvention.AMBIGUOUS: "ambiguous (check context)",
|
| 23 |
+
}[f.sign]
|
| 24 |
+
lines.append(f" • {f.id} | coverage={f.coverage:.0%} | sign={sign_str} | {f.description}")
|
| 25 |
+
else:
|
| 26 |
+
lines.append(f" • {fid} | (metadata not available)")
|
| 27 |
+
return "\n".join(lines)
|
| 28 |
|
| 29 |
|
| 30 |
SYSTEM_PROMPT = """You are a senior quantitative researcher at a systematic hedge fund.
|
| 31 |
Your job is to propose novel cross-sectional equity factor hypotheses for the WorldQuant BRAIN platform.
|
| 32 |
|
| 33 |
CRITICAL RULES:
|
| 34 |
+
1. You MUST use ONLY the exact field IDs listed below. Do NOT invent field names.
|
| 35 |
+
Copy the field ID exactly as written (case-sensitive, underscores included).
|
|
|
|
|
|
|
| 36 |
|
| 37 |
+
2. Every component MUST specify the cross-sectional sign direction:
|
| 38 |
+
- "long_high": higher values → long (yield, quality, growth)
|
| 39 |
+
- "long_low": lower values → long (risk, distress, illiquidity — will be negated)
|
| 40 |
|
| 41 |
+
3. The sign_direction MUST match the field's documented sign convention.
|
| 42 |
+
If a field is labeled "long_low", your component must also say "long_low".
|
|
|
|
| 43 |
|
| 44 |
+
4. Prefer PROVEN ARCHETYPES for the archetype field:
|
| 45 |
+
{archetypes}
|
| 46 |
+
|
| 47 |
+
Recommended archetype for this theme: {recommended_archetype}
|
| 48 |
|
| 49 |
+
5. Max 3 components. Simpler is better — 1-2 fields dominate performance.
|
| 50 |
|
| 51 |
+
6. Every ts_* operator needs a lookback window (days). Suggested: 5, 10, 21, 42, 63, 126, 252.
|
| 52 |
|
| 53 |
+
7. For fields with coverage < 70%, the expression compiler will auto-apply ts_backfill(field, 30).
|
| 54 |
+
|
| 55 |
+
AVAILABLE FIELDS (use ONLY these exact IDs):
|
| 56 |
+
{field_details}
|
| 57 |
+
|
| 58 |
+
NEUTRALIZATION options: sector, industry, subindustry
|
| 59 |
"""
|
| 60 |
|
| 61 |
|
|
|
|
| 65 |
retrieved_papers: list[str],
|
| 66 |
existing_anomaly_tags: list[str],
|
| 67 |
) -> Blueprint:
|
| 68 |
+
"""Generate a novel alpha hypothesis blueprint."""
|
|
|
|
|
|
|
|
|
|
| 69 |
fields = THEME_FIELDS.get(theme, [])
|
| 70 |
archetypes_str = ", ".join(PROVEN_ARCHETYPES)
|
| 71 |
+
recommended = THEME_TO_ARCHETYPE.get(theme, "novel")
|
| 72 |
+
field_details = _format_fields_for_llm(fields)
|
| 73 |
|
| 74 |
system = SYSTEM_PROMPT.format(
|
| 75 |
archetypes=archetypes_str,
|
| 76 |
+
recommended_archetype=recommended,
|
| 77 |
+
field_details=field_details,
|
| 78 |
)
|
| 79 |
|
| 80 |
papers_context = "\n\n".join([
|
|
|
|
| 84 |
saturated = [tag for tag in set(existing_anomaly_tags) if existing_anomaly_tags.count(tag) >= 3]
|
| 85 |
saturated_str = ", ".join(saturated) if saturated else "none"
|
| 86 |
|
| 87 |
+
user_prompt = f"""Generate a novel cross-sectional equity alpha hypothesis for theme: "{theme}".
|
| 88 |
|
| 89 |
+
ACADEMIC CONTEXT:
|
| 90 |
{papers_context}
|
| 91 |
|
| 92 |
CONSTRAINTS:
|
| 93 |
+
- USE ONLY these field IDs: {', '.join(fields)}
|
| 94 |
+
- Saturated anomaly tags to AVOID: {saturated_str}
|
| 95 |
+
- Universe: USA TOP3000, delay=1
|
| 96 |
+
- Must work cross-sectionally (rank stocks, not predict market)
|
| 97 |
+
- Recommended archetype: {recommended}
|
| 98 |
+
- Set decay between 5-10 (controls turnover)
|
| 99 |
+
|
| 100 |
+
Output a complete Blueprint JSON.
|
| 101 |
+
The 'fields' array in each component MUST contain exact field IDs from the list above.
|
| 102 |
+
Do NOT invent or modify field names."""
|
| 103 |
|
| 104 |
blueprint = await llm.generate_json(
|
| 105 |
prompt=user_prompt,
|