fix: hypothesis_hunter v2 — full field metadata to LLM, strict field constraint
Browse files
alpha_factory/personas/hypothesis_hunter.py
CHANGED
|
@@ -1,7 +1,6 @@
|
|
| 1 |
"""
|
| 2 |
Hypothesis Hunter v2 — Persona 1 (Microfish)
|
| 3 |
-
|
| 4 |
-
so it never invents fields or uses wrong signs.
|
| 5 |
"""
|
| 6 |
from ..infra.llm_client import LLMClient
|
| 7 |
from ..schemas import Blueprint, AnomalyTag
|
|
@@ -9,63 +8,41 @@ from ..deterministic.theme_sampler import THEME_FIELDS, PROVEN_ARCHETYPES, THEME
|
|
| 9 |
from ..data.brain_fields import FIELD_INDEX, SignConvention
|
| 10 |
|
| 11 |
|
| 12 |
-
def _format_fields_for_llm(field_ids
|
| 13 |
-
"""Format field metadata so LLM knows exact IDs, signs, and descriptions."""
|
| 14 |
lines = []
|
| 15 |
for fid in field_ids:
|
| 16 |
if fid in FIELD_INDEX:
|
| 17 |
f = FIELD_INDEX[fid]
|
| 18 |
-
|
| 19 |
-
SignConvention.LONG_HIGH: "long_high (higher
|
| 20 |
-
SignConvention.LONG_LOW: "long_low (lower
|
| 21 |
-
SignConvention.CONTRARIAN: "contrarian (already inverted
|
| 22 |
-
SignConvention.AMBIGUOUS: "ambiguous
|
| 23 |
-
}
|
| 24 |
-
lines.append(f"
|
| 25 |
else:
|
| 26 |
-
lines.append(f"
|
| 27 |
return "\n".join(lines)
|
| 28 |
|
| 29 |
|
| 30 |
-
SYSTEM_PROMPT = """You are a senior
|
| 31 |
-
Your job is to propose novel cross-sectional equity factor hypotheses for the WorldQuant BRAIN platform.
|
| 32 |
|
| 33 |
-
|
| 34 |
-
1.
|
| 35 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
|
| 37 |
-
|
| 38 |
-
- "long_high": higher values → long (yield, quality, growth)
|
| 39 |
-
- "long_low": lower values → long (risk, distress, illiquidity — will be negated)
|
| 40 |
-
|
| 41 |
-
3. The sign_direction MUST match the field's documented sign convention.
|
| 42 |
-
If a field is labeled "long_low", your component must also say "long_low".
|
| 43 |
-
|
| 44 |
-
4. Prefer PROVEN ARCHETYPES for the archetype field:
|
| 45 |
-
{archetypes}
|
| 46 |
-
|
| 47 |
-
Recommended archetype for this theme: {recommended_archetype}
|
| 48 |
-
|
| 49 |
-
5. Max 3 components. Simpler is better — 1-2 fields dominate performance.
|
| 50 |
-
|
| 51 |
-
6. Every ts_* operator needs a lookback window (days). Suggested: 5, 10, 21, 42, 63, 126, 252.
|
| 52 |
-
|
| 53 |
-
7. For fields with coverage < 70%, the expression compiler will auto-apply ts_backfill(field, 30).
|
| 54 |
-
|
| 55 |
-
AVAILABLE FIELDS (use ONLY these exact IDs):
|
| 56 |
{field_details}
|
| 57 |
|
| 58 |
-
NEUTRALIZATION
|
| 59 |
"""
|
| 60 |
|
| 61 |
|
| 62 |
-
async def generate_hypothesis(
|
| 63 |
-
llm: LLMClient,
|
| 64 |
-
theme: str,
|
| 65 |
-
retrieved_papers: list[str],
|
| 66 |
-
existing_anomaly_tags: list[str],
|
| 67 |
-
) -> Blueprint:
|
| 68 |
-
"""Generate a novel alpha hypothesis blueprint."""
|
| 69 |
fields = THEME_FIELDS.get(theme, [])
|
| 70 |
archetypes_str = ", ".join(PROVEN_ARCHETYPES)
|
| 71 |
recommended = THEME_TO_ARCHETYPE.get(theme, "novel")
|
|
@@ -78,35 +55,25 @@ async def generate_hypothesis(
|
|
| 78 |
)
|
| 79 |
|
| 80 |
papers_context = "\n\n".join([
|
| 81 |
-
f"PAPER {i+1}:\n{
|
| 82 |
-
]) if retrieved_papers else "No papers
|
| 83 |
|
| 84 |
-
saturated = [
|
| 85 |
-
saturated_str = ", ".join(saturated) if saturated else "none"
|
| 86 |
|
| 87 |
-
user_prompt = f"""Generate
|
| 88 |
|
| 89 |
-
|
| 90 |
-
{papers_context}
|
| 91 |
|
| 92 |
CONSTRAINTS:
|
| 93 |
-
-
|
| 94 |
-
-
|
| 95 |
- Universe: USA TOP3000, delay=1
|
| 96 |
-
-
|
| 97 |
-
-
|
| 98 |
-
- Set decay between 5-10 (controls turnover)
|
| 99 |
-
|
| 100 |
-
Output a complete Blueprint JSON.
|
| 101 |
-
The 'fields' array in each component MUST contain exact field IDs from the list above.
|
| 102 |
-
Do NOT invent or modify field names."""
|
| 103 |
-
|
| 104 |
-
blueprint = await llm.generate_json(
|
| 105 |
-
prompt=user_prompt,
|
| 106 |
-
schema=Blueprint,
|
| 107 |
-
tier="microfish",
|
| 108 |
-
temperature=0.7,
|
| 109 |
-
system_prompt=system,
|
| 110 |
-
)
|
| 111 |
|
| 112 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
"""
|
| 2 |
Hypothesis Hunter v2 — Persona 1 (Microfish)
|
| 3 |
+
Passes FULL field metadata to LLM so it never invents fields.
|
|
|
|
| 4 |
"""
|
| 5 |
from ..infra.llm_client import LLMClient
|
| 6 |
from ..schemas import Blueprint, AnomalyTag
|
|
|
|
| 8 |
from ..data.brain_fields import FIELD_INDEX, SignConvention
|
| 9 |
|
| 10 |
|
| 11 |
+
def _format_fields_for_llm(field_ids: list[str]) -> str:
    """Render one prompt bullet per field ID for the LLM.

    IDs present in ``FIELD_INDEX`` are rendered with their ID, coverage
    (as a whole-number percentage), sign-convention label and description.
    IDs missing from the index are rendered as a bare bullet so they are
    still listed.

    Args:
        field_ids: Field IDs to describe, in the order they should appear.

    Returns:
        The bullets joined with newlines; an empty string when ``field_ids``
        is empty.

    Raises:
        KeyError: If an indexed field's ``sign`` is not one of the four
            ``SignConvention`` members mapped below.
    """
    # Built once per call instead of once per field: the mapping does not
    # depend on which field is being formatted.
    sign_labels = {
        SignConvention.LONG_HIGH: "long_high (higher->buy)",
        SignConvention.LONG_LOW: "long_low (lower->buy,negate)",
        SignConvention.CONTRARIAN: "contrarian (already inverted)",
        SignConvention.AMBIGUOUS: "ambiguous",
    }

    lines = []
    for fid in field_ids:
        if fid not in FIELD_INDEX:
            # Unknown ID: no metadata is available, so emit only the ID.
            lines.append(f" - {fid}")
            continue
        meta = FIELD_INDEX[fid]
        lines.append(
            f" - {meta.id} | cov={meta.coverage:.0%} "
            f"| sign={sign_labels[meta.sign]} | {meta.description}"
        )
    return "\n".join(lines)
|
| 26 |
|
| 27 |
|
| 28 |
+
SYSTEM_PROMPT = """You are a senior quant researcher. Propose cross-sectional equity factor hypotheses for WorldQuant BRAIN.
|
|
|
|
| 29 |
|
| 30 |
+
RULES:
|
| 31 |
+
1. Use ONLY exact field IDs listed below. Do NOT invent names.
|
| 32 |
+
2. sign_direction MUST match field's documented sign.
|
| 33 |
+
3. Prefer PROVEN ARCHETYPES: {archetypes}
|
| 34 |
+
Recommended: {recommended_archetype}
|
| 35 |
+
4. Max 3 components. Set decay 5-10.
|
| 36 |
+
5. ts_* operators need (field, days). Windows: 5,10,21,42,63,126,252.
|
| 37 |
|
| 38 |
+
AVAILABLE FIELDS (ONLY these):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
{field_details}
|
| 40 |
|
| 41 |
+
NEUTRALIZATION: sector, industry, subindustry
|
| 42 |
"""
|
| 43 |
|
| 44 |
|
| 45 |
+
async def generate_hypothesis(llm, theme, retrieved_papers, existing_anomaly_tags):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
fields = THEME_FIELDS.get(theme, [])
|
| 47 |
archetypes_str = ", ".join(PROVEN_ARCHETYPES)
|
| 48 |
recommended = THEME_TO_ARCHETYPE.get(theme, "novel")
|
|
|
|
| 55 |
)
|
| 56 |
|
| 57 |
papers_context = "\n\n".join([
|
| 58 |
+
f"PAPER {i+1}:\n{p}" for i, p in enumerate(retrieved_papers[:3])
|
| 59 |
+
]) if retrieved_papers else "No papers. Use domain knowledge, set academic_anchor=null."
|
| 60 |
|
| 61 |
+
saturated = [t for t in set(existing_anomaly_tags) if existing_anomaly_tags.count(t) >= 3]
|
|
|
|
| 62 |
|
| 63 |
+
user_prompt = f"""Generate alpha hypothesis for theme: "{theme}".
|
| 64 |
|
| 65 |
+
CONTEXT: {papers_context}
|
|
|
|
| 66 |
|
| 67 |
CONSTRAINTS:
|
| 68 |
+
- ONLY these fields: {', '.join(fields)}
|
| 69 |
+
- Avoid saturated tags: {', '.join(saturated) or 'none'}
|
| 70 |
- Universe: USA TOP3000, delay=1
|
| 71 |
+
- Archetype: {recommended}
|
| 72 |
+
- Decay: 5-10
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
|
| 74 |
+
Output Blueprint JSON. fields array MUST use exact IDs from list above."""
|
| 75 |
+
|
| 76 |
+
return await llm.generate_json(
|
| 77 |
+
prompt=user_prompt, schema=Blueprint, tier="microfish",
|
| 78 |
+
temperature=0.7, system_prompt=system,
|
| 79 |
+
)
|