CatoG committed on
Commit ·
22173b0
1
Parent(s): 92d41c5
jkkjkj
Browse files
- app.py +77 -25
- test_workflow.py +166 -0
- workflow_helpers.py +245 -48
app.py
CHANGED
|
@@ -19,6 +19,7 @@ from workflow_helpers import (
|
|
| 19 |
PlannerState, FailureRecord,
|
| 20 |
select_relevant_roles, identify_revision_targets,
|
| 21 |
compress_final_answer, strip_internal_noise,
|
|
|
|
| 22 |
get_synthesizer_format_instruction, get_qa_format_instruction,
|
| 23 |
validate_output_format, format_violations_instruction,
|
| 24 |
parse_task_assumptions, format_assumptions_for_prompt,
|
|
@@ -615,7 +616,7 @@ class WorkflowState(TypedDict):
|
|
| 615 |
|
| 616 |
# --- Role system prompts ---
|
| 617 |
|
| 618 |
-
|
| 619 |
"You are the Planner in a strict planner–specialist–synthesizer–QA workflow.\n"
|
| 620 |
"Your ONLY job is to PLAN and DELEGATE. You do NOT write the answer.\n\n"
|
| 621 |
"Your responsibilities:\n"
|
|
@@ -623,14 +624,7 @@ _PLANNER_SYSTEM = (
|
|
| 623 |
"2. Decide which specialist to call as the PRIMARY lead.\n"
|
| 624 |
" IMPORTANT: Select the FEWEST roles necessary. Do NOT call all roles.\n"
|
| 625 |
" Available specialists:\n"
|
| 626 |
-
"
|
| 627 |
-
" - 'Technical Expert' (code, architecture, implementation)\n"
|
| 628 |
-
" - 'Research Analyst' (information gathering, literature review, fact-finding)\n"
|
| 629 |
-
" - 'Security Reviewer' (security analysis, vulnerability checks)\n"
|
| 630 |
-
" - 'Data Analyst' (data analysis, statistics, patterns)\n"
|
| 631 |
-
" - 'Labour Union Representative' (worker rights, fair wages)\n"
|
| 632 |
-
" - 'UX Designer' (user needs, usability, accessibility)\n"
|
| 633 |
-
" - 'Lawyer' (legal compliance, liability, contracts)\n"
|
| 634 |
"3. State clear success criteria.\n"
|
| 635 |
"4. Identify the required output format and brevity level.\n"
|
| 636 |
"5. Define shared assumptions that ALL specialists must use.\n"
|
|
@@ -642,6 +636,7 @@ _PLANNER_SYSTEM = (
|
|
| 642 |
"- The specialists will create the content. The Synthesizer will combine it.\n"
|
| 643 |
"- For simple questions, ONE specialist is enough.\n"
|
| 644 |
"- Never call persona/gimmick roles unless the user explicitly asks for them.\n"
|
|
|
|
| 645 |
"- QA results are BINDING — if QA says FAIL, you MUST revise, never approve.\n\n"
|
| 646 |
"Respond in this exact format:\n"
|
| 647 |
"TASK BREAKDOWN:\n<subtask list — what needs to be addressed, NOT the answers>\n\n"
|
|
@@ -652,6 +647,34 @@ _PLANNER_SYSTEM = (
|
|
| 652 |
"GUIDANCE FOR SPECIALIST:\n<delegation instructions — what to focus on, NOT answer content>"
|
| 653 |
)
|
| 654 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 655 |
_CREATIVE_SYSTEM = (
|
| 656 |
"You are the Creative Expert in a multi-role AI workflow.\n"
|
| 657 |
"You handle brainstorming, alternative ideas, framing, wording, and concept generation.\n"
|
|
@@ -680,17 +703,30 @@ _QA_SYSTEM = (
|
|
| 680 |
"output format requirements, brevity requirements, AND expert influence.\n\n"
|
| 681 |
"You MUST respond with a JSON object in this exact structure:\n"
|
| 682 |
'{\n'
|
| 683 |
-
'
|
| 684 |
-
'
|
| 685 |
-
'
|
|
|
|
| 686 |
' {\n'
|
| 687 |
-
'
|
| 688 |
-
'
|
| 689 |
-
'
|
| 690 |
' }\n'
|
| 691 |
' ],\n'
|
| 692 |
-
'
|
| 693 |
'}\n\n'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 694 |
"Validation rules:\n"
|
| 695 |
"- Check that the output DIRECTLY answers the user's question.\n"
|
| 696 |
"- Check that the output format matches what was requested (single choice, table, code, etc.).\n"
|
|
@@ -704,18 +740,19 @@ _QA_SYSTEM = (
|
|
| 704 |
" * If multiple experts contributed, their relevant points are incorporated or consciously noted.\n"
|
| 705 |
" * The answer is NOT just a paraphrase of planner text with no expert content.\n"
|
| 706 |
" * FAIL with type 'expert_influence' if expert contributions were ignored.\n"
|
| 707 |
-
"- FAIL if any
|
| 708 |
-
"-
|
|
|
|
| 709 |
)
|
| 710 |
|
| 711 |
_PLANNER_REVIEW_SYSTEM = (
|
| 712 |
"You are the Planner reviewing QA feedback.\n"
|
| 713 |
"CRITICAL RULE: QA results are BINDING.\n"
|
| 714 |
-
"- If QA status is PASS: approve the result.\n"
|
| 715 |
"- If QA status is FAIL: you MUST revise. You may NOT approve a FAIL result.\n"
|
| 716 |
"- If this is the final revision (max reached) and QA still FAIL:\n"
|
| 717 |
" you must directly fix the QA issues in your response before approving.\n\n"
|
| 718 |
-
"If QA PASSED, respond with:\n"
|
| 719 |
"DECISION: APPROVED\n"
|
| 720 |
"FINAL ANSWER:\n<the approved output, reproduced in full>\n\n"
|
| 721 |
"If QA FAILED and revisions remain, respond with:\n"
|
|
@@ -1098,7 +1135,10 @@ def _parse_qa_role_feedback(qa_text: str) -> Dict[str, str]:
|
|
| 1098 |
# Each step receives the shared state and an append-only trace list,
|
| 1099 |
# updates state in place, appends log lines, and returns updated state.
|
| 1100 |
|
| 1101 |
-
def _step_plan(
|
|
|
|
|
|
|
|
|
|
| 1102 |
"""Planner: analyse the task, produce a plan, decide which specialist to call."""
|
| 1103 |
trace.append("\n╔══ [PLANNER] Analysing task... ══╗")
|
| 1104 |
fmt = state.get("output_format", "other")
|
|
@@ -1114,7 +1154,8 @@ def _step_plan(chat_model, state: WorkflowState, trace: List[str]) -> WorkflowSt
|
|
| 1114 |
f"\nPrevious QA report:\n{state['qa_report']}"
|
| 1115 |
"\nAdjust the plan to address the QA issues."
|
| 1116 |
)
|
| 1117 |
-
|
|
|
|
| 1118 |
state["plan"] = plan_text
|
| 1119 |
state["current_role"] = _decide_role(plan_text)
|
| 1120 |
trace.append(plan_text)
|
|
@@ -1181,6 +1222,9 @@ def _step_qa(
|
|
| 1181 |
"""
|
| 1182 |
trace.append("\n╔══ [QA TESTER] Reviewing output... ══╗")
|
| 1183 |
|
|
|
|
|
|
|
|
|
|
| 1184 |
fmt = state.get("output_format", "other")
|
| 1185 |
brevity = state.get("brevity_requirement", "normal")
|
| 1186 |
format_rules = get_qa_format_instruction(fmt, brevity)
|
|
@@ -1248,8 +1292,12 @@ def _step_qa(
|
|
| 1248 |
# Also extract legacy role feedback for backward compatibility
|
| 1249 |
state["qa_role_feedback"] = _parse_qa_role_feedback(text)
|
| 1250 |
|
| 1251 |
-
result_label = "✅ PASS" if
|
|
|
|
|
|
|
| 1252 |
trace.append(text)
|
|
|
|
|
|
|
| 1253 |
if qa_result.issues:
|
| 1254 |
issues_summary = "; ".join(
|
| 1255 |
f"{i.owner}: {i.message[:60]}{'…' if len(i.message) > 60 else ''}"
|
|
@@ -2080,7 +2128,8 @@ def run_multi_role_workflow(
|
|
| 2080 |
|
| 2081 |
try:
|
| 2082 |
if planner_active:
|
| 2083 |
-
state = _step_plan(chat_model, state, trace
|
|
|
|
| 2084 |
|
| 2085 |
# Parse shared task assumptions from planner output
|
| 2086 |
assumptions = parse_task_assumptions(state["plan"])
|
|
@@ -2123,6 +2172,9 @@ def run_multi_role_workflow(
|
|
| 2123 |
f"\n[ROLE SELECTION] {len(selected_roles)} specialist(s) selected: "
|
| 2124 |
+ ", ".join(AGENT_ROLES.get(k, k) for k in selected_roles)
|
| 2125 |
)
|
|
|
|
|
|
|
|
|
|
| 2126 |
|
| 2127 |
# Step 4: Run ALL selected specialists (initial run only)
|
| 2128 |
if primary_role not in selected_roles:
|
|
@@ -2210,7 +2262,7 @@ def run_multi_role_workflow(
|
|
| 2210 |
else:
|
| 2211 |
state["qa_passed"] = True
|
| 2212 |
state["qa_report"] = "QA Tester is disabled — skipping quality review."
|
| 2213 |
-
state["qa_structured"] = {"status": "PASS", "reason": "", "issues": [], "correction_instruction": ""}
|
| 2214 |
trace.append("\n[QA Tester disabled] Skipping quality review — auto-pass.")
|
| 2215 |
|
| 2216 |
# Update planner state
|
|
|
|
| 19 |
PlannerState, FailureRecord,
|
| 20 |
select_relevant_roles, identify_revision_targets,
|
| 21 |
compress_final_answer, strip_internal_noise,
|
| 22 |
+
postprocess_format_fixes,
|
| 23 |
get_synthesizer_format_instruction, get_qa_format_instruction,
|
| 24 |
validate_output_format, format_violations_instruction,
|
| 25 |
parse_task_assumptions, format_assumptions_for_prompt,
|
|
|
|
| 616 |
|
| 617 |
# --- Role system prompts ---
|
| 618 |
|
| 619 |
+
_PLANNER_SYSTEM_BASE = (
|
| 620 |
"You are the Planner in a strict planner–specialist–synthesizer–QA workflow.\n"
|
| 621 |
"Your ONLY job is to PLAN and DELEGATE. You do NOT write the answer.\n\n"
|
| 622 |
"Your responsibilities:\n"
|
|
|
|
| 624 |
"2. Decide which specialist to call as the PRIMARY lead.\n"
|
| 625 |
" IMPORTANT: Select the FEWEST roles necessary. Do NOT call all roles.\n"
|
| 626 |
" Available specialists:\n"
|
| 627 |
+
"{specialist_list}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 628 |
"3. State clear success criteria.\n"
|
| 629 |
"4. Identify the required output format and brevity level.\n"
|
| 630 |
"5. Define shared assumptions that ALL specialists must use.\n"
|
|
|
|
| 636 |
"- The specialists will create the content. The Synthesizer will combine it.\n"
|
| 637 |
"- For simple questions, ONE specialist is enough.\n"
|
| 638 |
"- Never call persona/gimmick roles unless the user explicitly asks for them.\n"
|
| 639 |
+
"- Only select from the specialists listed above — no others are available.\n"
|
| 640 |
"- QA results are BINDING — if QA says FAIL, you MUST revise, never approve.\n\n"
|
| 641 |
"Respond in this exact format:\n"
|
| 642 |
"TASK BREAKDOWN:\n<subtask list — what needs to be addressed, NOT the answers>\n\n"
|
|
|
|
| 647 |
"GUIDANCE FOR SPECIALIST:\n<delegation instructions — what to focus on, NOT answer content>"
|
| 648 |
)
|
| 649 |
|
| 650 |
+
|
| 651 |
+
def _build_planner_system(enabled_role_keys: List[str]) -> str:
    """Render the planner system prompt for the currently enabled specialists.

    Every key in ``enabled_role_keys`` becomes one bullet line. Keys without a
    known description are listed as the quoted key itself. When no keys are
    supplied a single placeholder bullet is emitted instead. The bullet text
    is substituted for ``{specialist_list}`` in ``_PLANNER_SYSTEM_BASE``.
    """
    known_roles = {
        "creative": "'Creative Expert' (ideas, framing, wording, brainstorming)",
        "technical": "'Technical Expert' (code, architecture, implementation)",
        "research": "'Research Analyst' (information gathering, literature review, fact-finding)",
        "security": "'Security Reviewer' (security analysis, vulnerability checks)",
        "data_analyst": "'Data Analyst' (data analysis, statistics, patterns)",
        "labour_union_rep": "'Labour Union Representative' (worker rights, fair wages)",
        "ux_designer": "'UX Designer' (user needs, usability, accessibility)",
        "lawyer": "'Lawyer' (legal compliance, liability, contracts)",
        "mad_professor": "'Mad Professor' (wild ideas, provocative perspectives)",
        "accountant": "'Accountant' (cost analysis, budgeting, financial review)",
        "artist": "'Artist' (aesthetic vision, creative expression)",
        "lazy_slacker": "'Lazy Slacker' (minimal effort, simple answers)",
        "black_metal_fundamentalist": "'Black Metal Fundamentalist' (nihilistic perspective)",
        "doris": "'Doris' (practical, no-nonsense perspective)",
        "chairman_of_board": "'Chairman of the Board' (corporate strategy, governance)",
        "maga_appointee": "'MAGA Appointee' (deregulation, America-first perspective)",
    }

    bullet_lines = []
    for key in enabled_role_keys:
        # Unknown keys fall back to the bare key wrapped in single quotes.
        fallback = f"'{key}'"
        label = known_roles.get(key, fallback)
        bullet_lines.append(f" - {label}\n")

    if bullet_lines:
        rendered = "".join(bullet_lines)
    else:
        rendered = " - (no specialists enabled)\n"
    return _PLANNER_SYSTEM_BASE.format(specialist_list=rendered)
|
| 677 |
+
|
| 678 |
_CREATIVE_SYSTEM = (
|
| 679 |
"You are the Creative Expert in a multi-role AI workflow.\n"
|
| 680 |
"You handle brainstorming, alternative ideas, framing, wording, and concept generation.\n"
|
|
|
|
| 703 |
"output format requirements, brevity requirements, AND expert influence.\n\n"
|
| 704 |
"You MUST respond with a JSON object in this exact structure:\n"
|
| 705 |
'{\n'
|
| 706 |
+
' "status": "PASS" or "PASS_WITH_WARNINGS" or "FAIL",\n'
|
| 707 |
+
' "reason": "short explanation",\n'
|
| 708 |
+
' "warnings": ["optional list of minor cosmetic or stylistic notes"],\n'
|
| 709 |
+
' "issues": [\n'
|
| 710 |
' {\n'
|
| 711 |
+
' "type": "format" | "brevity" | "constraint" | "consistency" | "directness" | "evidence" | "expert_influence" | "other",\n'
|
| 712 |
+
' "message": "what is wrong",\n'
|
| 713 |
+
' "owner": "Synthesizer" | "Planner" | "Research Analyst" | "<specialist role name>"\n'
|
| 714 |
' }\n'
|
| 715 |
' ],\n'
|
| 716 |
+
' "correction_instruction": "specific minimal fix"\n'
|
| 717 |
'}\n\n'
|
| 718 |
+
"STATUS LEVELS — use the right one:\n"
|
| 719 |
+
"- PASS: The answer is correct, complete, properly formatted, and meets all criteria.\n"
|
| 720 |
+
"- PASS_WITH_WARNINGS: The answer is substantively correct and usable, but has minor\n"
|
| 721 |
+
" cosmetic or stylistic issues (e.g. slightly verbose, could be tighter, minor formatting\n"
|
| 722 |
+
" quirks). List these in the 'warnings' array. Do NOT put them in 'issues'.\n"
|
| 723 |
+
"- FAIL: The answer has substantive problems — wrong content, missing key information,\n"
|
| 724 |
+
" wrong format, ignores the question, unsupported claims, or expert contributions ignored.\n"
|
| 725 |
+
" Only FAIL triggers a revision cycle.\n\n"
|
| 726 |
+
"FOCUS ON CONTENT, NOT COSMETICS:\n"
|
| 727 |
+
"- Minor bullet formatting, heading style, or whitespace are NOT reasons to FAIL.\n"
|
| 728 |
+
"- A slightly verbose answer that correctly addresses the question is PASS_WITH_WARNINGS, not FAIL.\n"
|
| 729 |
+
"- Reserve FAIL for answers that are genuinely wrong, incomplete, or miss the point.\n\n"
|
| 730 |
"Validation rules:\n"
|
| 731 |
"- Check that the output DIRECTLY answers the user's question.\n"
|
| 732 |
"- Check that the output format matches what was requested (single choice, table, code, etc.).\n"
|
|
|
|
| 740 |
" * If multiple experts contributed, their relevant points are incorporated or consciously noted.\n"
|
| 741 |
" * The answer is NOT just a paraphrase of planner text with no expert content.\n"
|
| 742 |
" * FAIL with type 'expert_influence' if expert contributions were ignored.\n"
|
| 743 |
+
"- FAIL if any substantive check fails.\n"
|
| 744 |
+
"- PASS_WITH_WARNINGS if content is good but minor polish is needed.\n"
|
| 745 |
+
"- PASS only if ALL checks pass with no issues at all.\n"
|
| 746 |
)
|
| 747 |
|
| 748 |
_PLANNER_REVIEW_SYSTEM = (
|
| 749 |
"You are the Planner reviewing QA feedback.\n"
|
| 750 |
"CRITICAL RULE: QA results are BINDING.\n"
|
| 751 |
+
"- If QA status is PASS or PASS_WITH_WARNINGS: approve the result.\n"
|
| 752 |
"- If QA status is FAIL: you MUST revise. You may NOT approve a FAIL result.\n"
|
| 753 |
"- If this is the final revision (max reached) and QA still FAIL:\n"
|
| 754 |
" you must directly fix the QA issues in your response before approving.\n\n"
|
| 755 |
+
"If QA PASSED (or PASS_WITH_WARNINGS), respond with:\n"
|
| 756 |
"DECISION: APPROVED\n"
|
| 757 |
"FINAL ANSWER:\n<the approved output, reproduced in full>\n\n"
|
| 758 |
"If QA FAILED and revisions remain, respond with:\n"
|
|
|
|
| 1135 |
# Each step receives the shared state and an append-only trace list,
|
| 1136 |
# updates state in place, appends log lines, and returns updated state.
|
| 1137 |
|
| 1138 |
+
def _step_plan(
|
| 1139 |
+
chat_model, state: WorkflowState, trace: List[str],
|
| 1140 |
+
enabled_role_keys: Optional[List[str]] = None,
|
| 1141 |
+
) -> WorkflowState:
|
| 1142 |
"""Planner: analyse the task, produce a plan, decide which specialist to call."""
|
| 1143 |
trace.append("\n╔══ [PLANNER] Analysing task... ══╗")
|
| 1144 |
fmt = state.get("output_format", "other")
|
|
|
|
| 1154 |
f"\nPrevious QA report:\n{state['qa_report']}"
|
| 1155 |
"\nAdjust the plan to address the QA issues."
|
| 1156 |
)
|
| 1157 |
+
planner_system = _build_planner_system(enabled_role_keys or [])
|
| 1158 |
+
plan_text = _llm_call(chat_model, planner_system, content)
|
| 1159 |
state["plan"] = plan_text
|
| 1160 |
state["current_role"] = _decide_role(plan_text)
|
| 1161 |
trace.append(plan_text)
|
|
|
|
| 1222 |
"""
|
| 1223 |
trace.append("\n╔══ [QA TESTER] Reviewing output... ══╗")
|
| 1224 |
|
| 1225 |
+
# Apply post-processing format fixes before QA evaluation
|
| 1226 |
+
state["draft_output"] = postprocess_format_fixes(state["draft_output"])
|
| 1227 |
+
|
| 1228 |
fmt = state.get("output_format", "other")
|
| 1229 |
brevity = state.get("brevity_requirement", "normal")
|
| 1230 |
format_rules = get_qa_format_instruction(fmt, brevity)
|
|
|
|
| 1292 |
# Also extract legacy role feedback for backward compatibility
|
| 1293 |
state["qa_role_feedback"] = _parse_qa_role_feedback(text)
|
| 1294 |
|
| 1295 |
+
result_label = ("✅ PASS" if qa_result.status == "PASS"
|
| 1296 |
+
else "⚠ PASS_WITH_WARNINGS" if qa_result.passed_with_warnings
|
| 1297 |
+
else "❌ FAIL")
|
| 1298 |
trace.append(text)
|
| 1299 |
+
if qa_result.warnings:
|
| 1300 |
+
trace.append(f" ⚠ QA warnings: {'; '.join(qa_result.warnings)}")
|
| 1301 |
if qa_result.issues:
|
| 1302 |
issues_summary = "; ".join(
|
| 1303 |
f"{i.owner}: {i.message[:60]}{'…' if len(i.message) > 60 else ''}"
|
|
|
|
| 2128 |
|
| 2129 |
try:
|
| 2130 |
if planner_active:
|
| 2131 |
+
state = _step_plan(chat_model, state, trace,
|
| 2132 |
+
enabled_role_keys=active_specialist_keys)
|
| 2133 |
|
| 2134 |
# Parse shared task assumptions from planner output
|
| 2135 |
assumptions = parse_task_assumptions(state["plan"])
|
|
|
|
| 2172 |
f"\n[ROLE SELECTION] {len(selected_roles)} specialist(s) selected: "
|
| 2173 |
+ ", ".join(AGENT_ROLES.get(k, k) for k in selected_roles)
|
| 2174 |
)
|
| 2175 |
+
# Append detailed scoring trace when available
|
| 2176 |
+
if hasattr(selected_roles, 'format_trace'):
|
| 2177 |
+
trace.append(selected_roles.format_trace(AGENT_ROLES))
|
| 2178 |
|
| 2179 |
# Step 4: Run ALL selected specialists (initial run only)
|
| 2180 |
if primary_role not in selected_roles:
|
|
|
|
| 2262 |
else:
|
| 2263 |
state["qa_passed"] = True
|
| 2264 |
state["qa_report"] = "QA Tester is disabled — skipping quality review."
|
| 2265 |
+
state["qa_structured"] = {"status": "PASS", "reason": "", "issues": [], "warnings": [], "correction_instruction": ""}
|
| 2266 |
trace.append("\n[QA Tester disabled] Skipping quality review — auto-pass.")
|
| 2267 |
|
| 2268 |
# Update planner state
|
test_workflow.py
CHANGED
|
@@ -31,6 +31,7 @@ from workflow_helpers import (
|
|
| 31 |
identify_revision_targets,
|
| 32 |
strip_internal_noise,
|
| 33 |
compress_final_answer,
|
|
|
|
| 34 |
PlannerState,
|
| 35 |
FailureRecord,
|
| 36 |
get_synthesizer_format_instruction,
|
|
@@ -45,6 +46,10 @@ from workflow_helpers import (
|
|
| 45 |
format_contributions_for_qa,
|
| 46 |
parse_used_contributions,
|
| 47 |
check_expert_influence,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
)
|
| 49 |
from evidence import (
|
| 50 |
EvidenceItem,
|
|
@@ -244,6 +249,167 @@ class TestStructuredQAParsing(unittest.TestCase):
|
|
| 244 |
self.assertEqual(d["status"], "FAIL")
|
| 245 |
self.assertEqual(len(d["issues"]), 1)
|
| 246 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 247 |
|
| 248 |
# ============================================================
|
| 249 |
# Test: Role Selection
|
|
|
|
| 31 |
identify_revision_targets,
|
| 32 |
strip_internal_noise,
|
| 33 |
compress_final_answer,
|
| 34 |
+
postprocess_format_fixes,
|
| 35 |
PlannerState,
|
| 36 |
FailureRecord,
|
| 37 |
get_synthesizer_format_instruction,
|
|
|
|
| 46 |
format_contributions_for_qa,
|
| 47 |
parse_used_contributions,
|
| 48 |
check_expert_influence,
|
| 49 |
+
extract_task_features,
|
| 50 |
+
ROLE_CAPABILITIES,
|
| 51 |
+
RoleScore,
|
| 52 |
+
score_roles,
|
| 53 |
)
|
| 54 |
from evidence import (
|
| 55 |
EvidenceItem,
|
|
|
|
| 249 |
self.assertEqual(d["status"], "FAIL")
|
| 250 |
self.assertEqual(len(d["issues"]), 1)
|
| 251 |
|
| 252 |
+
def test_parse_json_pass_with_warnings(self):
    """A JSON report with status PASS_WITH_WARNINGS parses as passing and keeps its warnings."""
    payload = {
        "status": "PASS_WITH_WARNINGS",
        "reason": "Content is good but slightly verbose",
        "warnings": ["Could be more concise", "Minor formatting quirk"],
        "issues": [],
        "correction_instruction": ""
    }
    parsed = parse_structured_qa(json.dumps(payload))

    # Status flags
    self.assertTrue(parsed.passed)
    self.assertTrue(parsed.passed_with_warnings)
    self.assertEqual(parsed.status, "PASS_WITH_WARNINGS")

    # Warnings are carried over; issues stay empty
    self.assertEqual(len(parsed.warnings), 2)
    self.assertIn("Could be more concise", parsed.warnings)
    self.assertEqual(len(parsed.issues), 0)
|
| 268 |
+
|
| 269 |
+
def test_parse_legacy_pass_with_warnings(self):
    """A legacy text report whose RESULT line says PASS_WITH_WARNINGS parses as passing with warnings."""
    sections = [
        "REQUIREMENTS CHECKED:\n- All met\n\n",
        "ISSUES FOUND:\nNone\n\n",
        "RESULT: PASS_WITH_WARNINGS\n\n",
        "RECOMMENDED FIXES:\nNone",
    ]
    parsed = parse_structured_qa("".join(sections))
    self.assertTrue(parsed.passed)
    self.assertTrue(parsed.passed_with_warnings)
|
| 280 |
+
|
| 281 |
+
def test_pass_with_warnings_to_dict_includes_warnings(self):
    """``to_dict`` exposes both the status and the warnings list."""
    as_dict = QAResult(
        status="PASS_WITH_WARNINGS",
        reason="Minor issues",
        warnings=["Slightly verbose"],
        issues=[],
    ).to_dict()
    self.assertEqual(as_dict["status"], "PASS_WITH_WARNINGS")
    self.assertEqual(as_dict["warnings"], ["Slightly verbose"])
|
| 291 |
+
|
| 292 |
+
def test_pass_is_not_passed_with_warnings(self):
    """A plain PASS counts as passed but not as passed-with-warnings."""
    plain_pass = QAResult(status="PASS")
    self.assertTrue(plain_pass.passed)
    self.assertFalse(plain_pass.passed_with_warnings)
|
| 296 |
+
|
| 297 |
+
def test_fail_is_not_passed(self):
    """A FAIL status sets neither the passed nor the passed-with-warnings flag."""
    failed = QAResult(status="FAIL")
    self.assertFalse(failed.passed)
    self.assertFalse(failed.passed_with_warnings)
|
| 301 |
+
|
| 302 |
+
|
| 303 |
+
# ============================================================
|
| 304 |
+
# Test: Post-Processing Format Fixes
|
| 305 |
+
# ============================================================
|
| 306 |
+
|
| 307 |
+
class TestPostprocessFormatFixes(unittest.TestCase):
    """Checks ``postprocess_format_fixes`` against small hand-written text samples."""

    def test_removes_markdown_headings(self):
        """Heading markers are absent from the result while the body text remains."""
        cleaned = postprocess_format_fixes(
            "# Heading\nSome content.\n## Subheading\nMore content."
        )
        self.assertNotIn("#", cleaned)
        self.assertIn("Some content.", cleaned)
        self.assertIn("More content.", cleaned)

    def test_converts_bullets_to_sentences(self):
        """Dash bullets are gone and each item appears with a trailing period."""
        cleaned = postprocess_format_fixes(
            "- First point\n- Second point\n- Third point"
        )
        self.assertNotIn("- ", cleaned)
        self.assertIn("First point.", cleaned)
        self.assertIn("Second point.", cleaned)

    def test_collapses_blank_lines(self):
        """No run of three newlines survives; both content lines are kept."""
        cleaned = postprocess_format_fixes("Line 1\n\n\n\n\nLine 2")
        self.assertNotIn("\n\n\n", cleaned)
        self.assertIn("Line 1", cleaned)
        self.assertIn("Line 2", cleaned)

    def test_removes_json_traces(self):
        """An embedded QA-style JSON object is removed from the answer text."""
        cleaned = postprocess_format_fixes(
            'Answer here. {"status": "PASS", "reason": "ok"} End.'
        )
        self.assertNotIn('"status"', cleaned)
        self.assertIn("Answer here.", cleaned)

    def test_preserves_clean_text(self):
        """Text with nothing to fix comes back unchanged."""
        sample = "This is already clean text with no issues."
        self.assertEqual(postprocess_format_fixes(sample), sample)
|
| 340 |
+
|
| 341 |
+
|
| 342 |
+
# ============================================================
|
| 343 |
+
# Test: Task Feature Extraction and Role Scoring
|
| 344 |
+
# ============================================================
|
| 345 |
+
|
| 346 |
+
class TestTaskFeatureExtraction(unittest.TestCase):
    """Checks which feature labels ``extract_task_features`` reports for sample requests."""

    def test_extract_coding_features(self):
        """A coding request in the coding_task category yields the 'technical' feature."""
        found = extract_task_features("write Python code to parse CSV", "coding_task")
        self.assertIn("technical", found)

    def test_extract_design_features(self):
        """A UI design request yields the 'design' feature."""
        found = extract_task_features("design a user interface for the dashboard")
        self.assertIn("design", found)

    def test_extract_research_features(self):
        """A research request yields the 'research' feature."""
        found = extract_task_features("research the history of quantum computing")
        self.assertIn("research", found)

    def test_extract_multiple_features(self):
        """One request can yield several features at once."""
        found = extract_task_features(
            "analyze data trends and compare security vulnerabilities"
        )
        self.assertIn("analysis", found)
        self.assertIn("data", found)
        self.assertIn("security", found)
        self.assertIn("comparison", found)

    def test_category_adds_implied_features(self):
        """The factual_question category yields 'research' even for a vague request."""
        found = extract_task_features("do something", "factual_question")
        self.assertIn("research", found)
|
| 370 |
+
|
| 371 |
+
|
| 372 |
+
class TestRoleScoring(unittest.TestCase):
    """Checks ``score_roles`` and the scoring trace returned by ``select_relevant_roles``."""

    def setUp(self):
        # Every core and persona role key is offered to the scorer; the default
        # config for these tests is strict and disallows persona roles.
        self.all_roles = list(WorkflowConfig.CORE_ROLE_KEYS) + list(WorkflowConfig.PERSONA_ROLE_KEYS)
        self.config = WorkflowConfig(strict_mode=True, allow_persona_roles=False, max_specialists_per_task=3)

    def test_scores_are_populated(self):
        """A coding task produces scores, including one positive score for 'technical'."""
        features = extract_task_features("write Python code to build an API", "coding_task")
        scores = score_roles(features, self.all_roles, self.config, "coding_task")
        # assertGreater/assertEqual report the actual values on failure,
        # unlike assertTrue on a pre-evaluated comparison.
        self.assertGreater(len(scores), 0)
        # The technical role must appear exactly once and with a positive score.
        technical_scores = [s for s in scores if s.role_key == "technical"]
        self.assertEqual(len(technical_scores), 1)
        self.assertGreater(technical_scores[0].score, 0)

    def test_persona_roles_filtered(self):
        """With persona roles disallowed, every persona score carries a filter reason."""
        features = extract_task_features("budget analysis")
        scores = score_roles(features, self.all_roles, self.config)
        persona_scores = [s for s in scores if s.is_persona]
        for ps in persona_scores:
            self.assertTrue(ps.filtered_reason)

    def test_persona_roles_allowed(self):
        """With persona roles allowed, 'accountant' is scored once and not filtered."""
        config = WorkflowConfig(strict_mode=True, allow_persona_roles=True, max_specialists_per_task=5)
        features = extract_task_features("budget analysis")
        scores = score_roles(features, self.all_roles, config)
        accountant = [s for s in scores if s.role_key == "accountant"]
        self.assertEqual(len(accountant), 1)
        self.assertEqual(accountant[0].filtered_reason, "")

    def test_selection_result_format_trace(self):
        """The selection result offers ``format_trace`` and its output has the expected sections."""
        config = WorkflowConfig(strict_mode=True, allow_persona_roles=False, max_specialists_per_task=3)
        result = select_relevant_roles(
            "write Python code", self.all_roles, config, task_category="coding_task"
        )
        self.assertTrue(hasattr(result, 'format_trace'))
        trace = result.format_trace({"technical": "Technical Expert"})
        self.assertIn("ROLE SCORING", trace)
        self.assertIn("Task features:", trace)
        self.assertIn("SELECTED", trace)
|
| 412 |
+
|
| 413 |
|
| 414 |
# ============================================================
|
| 415 |
# Test: Role Selection
|
workflow_helpers.py
CHANGED
|
@@ -234,14 +234,19 @@ class QAIssue:
|
|
| 234 |
|
| 235 |
@dataclass
|
| 236 |
class QAResult:
|
| 237 |
-
status: str # "PASS" | "FAIL"
|
| 238 |
reason: str = ""
|
| 239 |
issues: List[QAIssue] = field(default_factory=list)
|
|
|
|
| 240 |
correction_instruction: str = ""
|
| 241 |
|
| 242 |
@property
|
| 243 |
def passed(self) -> bool:
|
| 244 |
-
return self.status
|
|
|
|
|
|
|
|
|
|
|
|
|
| 245 |
|
| 246 |
def owners(self) -> List[str]:
|
| 247 |
"""Return unique owner labels from issues."""
|
|
@@ -255,6 +260,7 @@ class QAResult:
|
|
| 255 |
{"type": i.type, "message": i.message, "owner": i.owner}
|
| 256 |
for i in self.issues
|
| 257 |
],
|
|
|
|
| 258 |
"correction_instruction": self.correction_instruction,
|
| 259 |
}
|
| 260 |
|
|
@@ -293,6 +299,7 @@ def parse_structured_qa(qa_text: str) -> QAResult:
|
|
| 293 |
status=data.get("status", "FAIL"),
|
| 294 |
reason=data.get("reason", ""),
|
| 295 |
issues=issues,
|
|
|
|
| 296 |
correction_instruction=data.get("correction_instruction", ""),
|
| 297 |
)
|
| 298 |
except (json.JSONDecodeError, KeyError):
|
|
@@ -301,7 +308,9 @@ def parse_structured_qa(qa_text: str) -> QAResult:
|
|
| 301 |
# Fallback: parse from legacy text format
|
| 302 |
status = "FAIL"
|
| 303 |
lower = qa_text.lower()
|
| 304 |
-
if "result:
|
|
|
|
|
|
|
| 305 |
status = "PASS"
|
| 306 |
|
| 307 |
reason = ""
|
|
@@ -491,72 +500,188 @@ ROLE_RELEVANCE: Dict[str, Dict[str, Any]] = {
|
|
| 491 |
}
|
| 492 |
|
| 493 |
|
| 494 |
-
|
| 495 |
-
|
| 496 |
-
|
| 497 |
-
|
| 498 |
-
|
| 499 |
-
|
| 500 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 501 |
|
| 502 |
-
|
| 503 |
-
|
| 504 |
-
config.max_specialists_per_task roles.
|
| 505 |
|
| 506 |
-
|
| 507 |
-
is factual, the research role is always included.
|
| 508 |
"""
|
| 509 |
lower = user_request.lower()
|
| 510 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 511 |
|
| 512 |
-
|
| 513 |
-
meta = ROLE_RELEVANCE.get(role_key)
|
| 514 |
-
if not meta:
|
| 515 |
-
continue
|
| 516 |
|
| 517 |
-
# Skip persona roles unless config allows them
|
| 518 |
-
if meta.get("is_persona") and not config.allow_persona_roles:
|
| 519 |
-
continue
|
| 520 |
|
| 521 |
-
|
| 522 |
-
|
| 523 |
-
|
| 524 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 525 |
|
| 526 |
-
|
| 527 |
-
|
| 528 |
-
|
| 529 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 530 |
|
| 531 |
-
# Task-category affinity bonus
|
| 532 |
role_tasks = meta.get("task_types", [])
|
| 533 |
if task_category in role_tasks:
|
| 534 |
score += 2
|
| 535 |
|
| 536 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 537 |
|
| 538 |
-
|
| 539 |
-
|
|
|
|
| 540 |
|
| 541 |
-
|
| 542 |
-
|
| 543 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 544 |
if len(selected) >= config.max_specialists_per_task:
|
| 545 |
break
|
| 546 |
-
# In strict mode, only include roles with score > 0 (
|
| 547 |
-
if config.strict_mode and score == 0 and selected:
|
| 548 |
continue
|
| 549 |
-
selected.append(role_key)
|
| 550 |
|
| 551 |
# Ensure at least one specialist is selected
|
| 552 |
-
if not selected and
|
| 553 |
-
selected.append(
|
| 554 |
|
| 555 |
-
#
|
| 556 |
if not selected:
|
| 557 |
-
for rk in
|
| 558 |
-
|
| 559 |
-
if not meta.get("is_persona"):
|
| 560 |
selected.append(rk)
|
| 561 |
break
|
| 562 |
|
|
@@ -567,7 +692,41 @@ def select_relevant_roles(
|
|
| 567 |
and "research" not in selected):
|
| 568 |
selected.append("research")
|
| 569 |
|
| 570 |
-
return selected
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 571 |
|
| 572 |
|
| 573 |
# ============================================================
|
|
@@ -717,6 +876,44 @@ def compress_final_answer(
|
|
| 717 |
return answer
|
| 718 |
|
| 719 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 720 |
# ============================================================
|
| 721 |
# Planner State
|
| 722 |
# ============================================================
|
|
|
|
| 234 |
|
| 235 |
@dataclass
|
| 236 |
class QAResult:
|
| 237 |
+
status: str # "PASS" | "PASS_WITH_WARNINGS" | "FAIL"
|
| 238 |
reason: str = ""
|
| 239 |
issues: List[QAIssue] = field(default_factory=list)
|
| 240 |
+
warnings: List[str] = field(default_factory=list)
|
| 241 |
correction_instruction: str = ""
|
| 242 |
|
| 243 |
@property
|
| 244 |
def passed(self) -> bool:
|
| 245 |
+
return self.status in ("PASS", "PASS_WITH_WARNINGS")
|
| 246 |
+
|
| 247 |
+
@property
|
| 248 |
+
def passed_with_warnings(self) -> bool:
|
| 249 |
+
return self.status == "PASS_WITH_WARNINGS"
|
| 250 |
|
| 251 |
def owners(self) -> List[str]:
|
| 252 |
"""Return unique owner labels from issues."""
|
|
|
|
| 260 |
{"type": i.type, "message": i.message, "owner": i.owner}
|
| 261 |
for i in self.issues
|
| 262 |
],
|
| 263 |
+
"warnings": self.warnings,
|
| 264 |
"correction_instruction": self.correction_instruction,
|
| 265 |
}
|
| 266 |
|
|
|
|
| 299 |
status=data.get("status", "FAIL"),
|
| 300 |
reason=data.get("reason", ""),
|
| 301 |
issues=issues,
|
| 302 |
+
warnings=[str(w) for w in data.get("warnings", [])],
|
| 303 |
correction_instruction=data.get("correction_instruction", ""),
|
| 304 |
)
|
| 305 |
except (json.JSONDecodeError, KeyError):
|
|
|
|
| 308 |
# Fallback: parse from legacy text format
|
| 309 |
status = "FAIL"
|
| 310 |
lower = qa_text.lower()
|
| 311 |
+
if "result: pass_with_warnings" in lower:
|
| 312 |
+
status = "PASS_WITH_WARNINGS"
|
| 313 |
+
elif "result: pass" in lower:
|
| 314 |
status = "PASS"
|
| 315 |
|
| 316 |
reason = ""
|
|
|
|
| 500 |
}
|
| 501 |
|
| 502 |
|
| 503 |
+
# ============================================================
|
| 504 |
+
# Role Capability Metadata and Task Feature Extraction
|
| 505 |
+
# ============================================================
|
| 506 |
+
|
| 507 |
+
# Capability tags per role — used by score_roles() for transparent scoring.
#
# A tag only adds to a role's score when it is equal to a task-feature name,
# i.e. a key of TASK_FEATURE_KEYWORDS or a category-implied feature such as
# "code".  Tags that are not feature names (e.g. "contracts", "liability")
# are descriptive only and never match.
ROLE_CAPABILITIES: Dict[str, List[str]] = {
    "creative": ["creative", "design", "ideas", "writing", "brainstorm", "opinion"],
    "technical": ["technical", "analysis", "engineering", "calculation", "code", "implementation"],
    "research": ["research", "facts", "evidence", "information", "comparison", "history"],
    "security": ["risk", "safety", "compliance", "security", "vulnerability"],
    "data_analyst": ["data", "statistics", "analysis", "metrics", "patterns"],
    "labour_union_rep": ["labor", "policy", "workplace", "rights", "fairness"],
    "ux_designer": ["design", "usability", "interface", "user_experience", "accessibility"],
    # Legal keywords in a request ("legal", "law", "regulation", "compliance")
    # surface as the "policy" feature, so the lawyer must carry that tag to be
    # scoreable on legal questions at all.
    "lawyer": ["legal", "compliance", "contracts", "liability", "regulation", "policy"],
    "mad_professor": ["persona", "speculation", "radical", "humor"],
    "accountant": ["persona", "cost", "budget", "financial"],
    "artist": ["persona", "creative", "aesthetic", "vision"],
    "lazy_slacker": ["persona", "simple_answer", "minimal"],
    "black_metal_fundamentalist": ["persona", "stylistic", "humor", "music"],
    "doris": ["persona", "humor"],
    "chairman_of_board": ["persona", "strategy", "corporate", "governance"],
    "maga_appointee": ["persona", "political", "deregulation"],
}
|
| 526 |
+
|
| 527 |
+
# Keywords in the user request that map to task features.
#
# Keys are the feature tags returned by extract_task_features(); values are
# the lower-case keywords looked for in the lower-cased request.  Entries may
# be stems (e.g. "analy") or multi-word phrases (e.g. "pros and cons").
# A keyword may appear under more than one feature ("metal" is listed for
# both "humor" and "music").  Features are reported in this dict's order.
TASK_FEATURE_KEYWORDS: Dict[str, List[str]] = {
    "analysis": ["analy", "evaluate", "assess", "review", "examine", "investigate"],
    "creative": ["creative", "brainstorm", "ideas", "imagine", "invent", "story", "write a"],
    "design": ["design", "wireframe", "prototype", "layout", "visual", "ui", "ux", "interface", "usability",
               "user experience", "accessibility", "login page", "user interface"],
    "technical": ["code", "implement", "build", "architecture", "api", "debug", "software", "program",
                  "algorithm", "system", "deploy", "python", "javascript", "rust", "java", "react",
                  "framework", "performance"],
    "research": ["research", "study", "evidence", "literature", "paper", "facts", "history",
                 "information", "find out", "look up"],
    "policy": ["policy", "regulation", "law", "compliance", "legal", "rights", "labor", "labour",
               "union", "worker", "employment", "workplace"],
    "simple_answer": ["yes or no", "pick one", "choose", "which one", "red or blue", "agree on one"],
    "opinion": ["opinion", "perspective", "viewpoint", "discuss", "debate", "pros and cons",
                "should i", "what do you think", "agree", "disagree"],
    "comparison": ["compare", "comparison", "versus", "vs", "difference", "better"],
    "data": ["data", "statistics", "metric", "trend", "pattern", "chart", "dashboard", "csv",
             "spreadsheet", "dataset"],
    "security": ["security", "vulnerability", "attack", "encryption", "password", "exploit",
                 "firewall", "gdpr", "privacy"],
    "cost": ["cost", "budget", "expense", "cheap", "price", "financial", "roi"],
    "humor": ["funny", "joke", "humorous", "kvlt", "metal", "nihil"],
    "music": ["music", "metal", "band", "song", "album", "guitar"],
}

# Generalist fallback roles used when no capability matches.
# Order is the fallback priority: select_relevant_roles() takes the first of
# these that is present in the active role keys.
_GENERALIST_ROLES = ("creative", "technical", "research")
|
| 555 |
+
|
| 556 |
|
| 557 |
+
def extract_task_features(user_request: str, task_category: str = "other") -> List[str]:
    """Derive task features from the user request and task category.

    A feature from ``TASK_FEATURE_KEYWORDS`` is reported when at least one of
    its keywords occurs in the lower-cased request *at the start of a word*.
    Anchoring at the word start keeps stem matching working ("analy" still
    hits "analysis", "implement" still hits "implementation") while avoiding
    accidental hits inside unrelated words ("ui" in "build", "rust" in
    "trust", "api" in "rapid", "law" in "flaw").

    Features implied by ``task_category`` are appended afterwards if they are
    not already present.

    Args:
        user_request: Raw request text; ``None`` or empty is treated as "".
        task_category: Task category label; unknown categories add nothing.

    Returns:
        A deduplicated list of feature tags like ``["design", "opinion"]``,
        keyword-derived features first (in ``TASK_FEATURE_KEYWORDS`` order),
        then category-implied ones.
    """
    lower = (user_request or "").lower()
    features: List[str] = []

    for feature, keywords in TASK_FEATURE_KEYWORDS.items():
        if not keywords:
            # An empty alternation would match everywhere; skip instead.
            continue
        # One alternation per feature; (?<!\w) requires the keyword to begin
        # a word without demanding that the word ends with it.
        pattern = r'(?<!\w)(?:' + '|'.join(re.escape(kw) for kw in keywords) + r')'
        if re.search(pattern, lower):
            features.append(feature)

    # Add features implied by the task category
    category_features: Dict[str, List[str]] = {
        "coding_task": ["technical", "code"],
        "creative_writing": ["creative"],
        "factual_question": ["research"],
        "comparison": ["comparison", "research"],
        "analysis": ["analysis"],
        "summarization": ["research"],
        "opinion_discussion": ["opinion"],
        "planning": ["analysis"],
    }
    for f in category_features.get(task_category, []):
        if f not in features:
            features.append(f)

    return features
|
|
|
|
|
|
|
|
|
|
| 587 |
|
|
|
|
|
|
|
|
|
|
| 588 |
|
| 589 |
+
@dataclass
class RoleScore:
    """Scoring details for a single role — used for transparent logging.

    One instance is produced per active role by ``score_roles``; filtered
    roles are kept (with ``filtered_reason`` set) so the trace can show them.
    """
    # Key identifying the role (e.g. "technical").
    role_key: str
    # Display text for the role; score_roles() fills it from the role's
    # ROLE_RELEVANCE "description", falling back to the key.
    role_label: str
    # Number of matched capabilities plus any task-category bonus.
    score: int
    # Capability tags of the role that were found among the task features.
    matched_capabilities: List[str]
    # True when ROLE_RELEVANCE marks the role as a persona.
    is_persona: bool
    filtered_reason: str = ""  # why it was excluded, if any
|
| 598 |
+
|
| 599 |
+
|
| 600 |
+
def score_roles(
    task_features: List[str],
    active_role_keys: List[str],
    config: WorkflowConfig,
    task_category: str = "other",
) -> List[RoleScore]:
    """Rate every active role against the extracted task features.

    A role earns one point per capability tag (``ROLE_CAPABILITIES``) that is
    also a task feature, plus two points when ``task_category`` is listed in
    the role's ``ROLE_RELEVANCE`` ``task_types``.  Persona roles are marked as
    filtered when ``config.allow_persona_roles`` is falsy.

    Returns one ``RoleScore`` per entry of ``active_role_keys``, in the same
    order, filtered roles included, so callers can log the full picture.
    """
    wanted = set(task_features)
    scored: List[RoleScore] = []

    for key in active_role_keys:
        relevance = ROLE_RELEVANCE.get(key, {})
        persona = relevance.get("is_persona", False)

        # Capability overlap, in the order the role declares its tags.
        hits = [tag for tag in ROLE_CAPABILITIES.get(key, []) if tag in wanted]
        # Affinity bonus for roles registered for this task category.
        bonus = 2 if task_category in relevance.get("task_types", []) else 0

        entry = RoleScore(
            role_key=key,
            role_label=relevance.get("description", key),
            score=len(hits) + bonus,
            matched_capabilities=hits,
            is_persona=persona,
        )

        # Personas stay in the result but are flagged so selection skips them.
        if persona and not config.allow_persona_roles:
            entry.filtered_reason = "persona role not allowed"

        scored.append(entry)

    return scored
|
| 644 |
+
|
| 645 |
+
|
| 646 |
+
def select_relevant_roles(
|
| 647 |
+
user_request: str,
|
| 648 |
+
active_role_keys: List[str],
|
| 649 |
+
config: WorkflowConfig,
|
| 650 |
+
task_category: str = "other",
|
| 651 |
+
) -> List[str]:
|
| 652 |
+
"""Select the most relevant specialist roles for a given request.
|
| 653 |
|
| 654 |
+
Uses capability-based scoring: extracts task features from the request,
|
| 655 |
+
scores each active role by capability overlap, and returns the top roles
|
| 656 |
+
up to ``config.max_specialists_per_task``.
|
| 657 |
|
| 658 |
+
Returns a ``_SelectionResult`` (list subclass) so callers can also access
|
| 659 |
+
``.scoring_info`` for transparent trace logging.
|
| 660 |
+
"""
|
| 661 |
+
task_features = extract_task_features(user_request, task_category)
|
| 662 |
+
role_scores = score_roles(task_features, active_role_keys, config, task_category)
|
| 663 |
+
|
| 664 |
+
# Separate eligible from filtered
|
| 665 |
+
eligible = [rs for rs in role_scores if not rs.filtered_reason]
|
| 666 |
+
eligible.sort(key=lambda rs: (-rs.score, active_role_keys.index(rs.role_key)))
|
| 667 |
+
|
| 668 |
+
selected: List[str] = []
|
| 669 |
+
for rs in eligible:
|
| 670 |
if len(selected) >= config.max_specialists_per_task:
|
| 671 |
break
|
| 672 |
+
# In strict mode, only include roles with score > 0 (unless we have none yet)
|
| 673 |
+
if config.strict_mode and rs.score == 0 and selected:
|
| 674 |
continue
|
| 675 |
+
selected.append(rs.role_key)
|
| 676 |
|
| 677 |
# Ensure at least one specialist is selected
|
| 678 |
+
if not selected and eligible:
|
| 679 |
+
selected.append(eligible[0].role_key)
|
| 680 |
|
| 681 |
+
# Generalist fallback when nothing matched
|
| 682 |
if not selected:
|
| 683 |
+
for rk in _GENERALIST_ROLES:
|
| 684 |
+
if rk in active_role_keys:
|
|
|
|
| 685 |
selected.append(rk)
|
| 686 |
break
|
| 687 |
|
|
|
|
| 692 |
and "research" not in selected):
|
| 693 |
selected.append("research")
|
| 694 |
|
| 695 |
+
return _SelectionResult(selected, role_scores, task_features)
|
| 696 |
+
|
| 697 |
+
|
| 698 |
+
class _SelectionResult(list):
    """Selected role keys plus the scoring data that produced them.

    Being a ``list`` subclass, it can be used wherever a plain ``list[str]``
    of role keys is expected; ``scoring_info`` and ``task_features`` ride
    along for trace logging.
    """

    def __init__(
        self,
        selected: List[str],
        scoring_info: List[RoleScore],
        task_features: List[str],
    ):
        super().__init__(selected)
        self.task_features = task_features
        self.scoring_info = scoring_info

    def format_trace(self, role_labels: Optional[Dict[str, str]] = None) -> str:
        """Render the ROLE SCORING block as human-readable text.

        Roles are listed by descending score (ties keep their scoring order).
        ``role_labels`` optionally maps role keys to display names; keys
        without an entry are shown as-is.
        """
        labels = role_labels or {}

        def _describe(entry: RoleScore) -> str:
            # A filter reason takes precedence over the selected/skipped state.
            if entry.filtered_reason:
                outcome = f"FILTERED ({entry.filtered_reason})"
            elif entry.role_key in self:
                outcome = "SELECTED"
            else:
                outcome = "skipped"

            if entry.matched_capabilities:
                matched = ", ".join(entry.matched_capabilities)
            else:
                matched = "none"

            name = labels.get(entry.role_key, entry.role_key)
            return f" {name}: score={entry.score} caps=[{matched}] → {outcome}"

        ranked = sorted(self.scoring_info, key=lambda entry: entry.score, reverse=True)
        return "\n".join(
            [
                "── ROLE SCORING ──",
                f"Task features: {self.task_features}",
                *(_describe(entry) for entry in ranked),
                "──────────────────",
            ]
        )
|
| 730 |
|
| 731 |
|
| 732 |
# ============================================================
|
|
|
|
| 876 |
return answer
|
| 877 |
|
| 878 |
|
| 879 |
+
def postprocess_format_fixes(text: str) -> str:
    """Apply lightweight format fixes before QA evaluation.

    Converts common formatting artefacts so QA can focus on content quality
    rather than failing for cosmetic issues:

    * Markdown heading markers (``#`` to ``######``) are removed, keeping the
      heading text.
    * Runs of two or more consecutive bullet lines (``-``, ``•`` or ``*``) are
      merged into one line of sentences; each item gets a trailing full stop
      unless it already ends in ``.``, ``!`` or ``?``.  A single bullet line
      is left untouched.
    * Runs of three or more newlines are collapsed to one blank line.
    * Brace-delimited fragments containing a ``"status":`` key are removed.

    Args:
        text: The answer text to clean; empty or ``None`` yields ``""``.

    Returns:
        The cleaned text with leading/trailing whitespace stripped.
    """
    if not text:
        return ""

    # Remove markdown heading markers.  [ \t] instead of \s keeps the match on
    # the heading's own line, so an empty heading cannot swallow the newline.
    text = re.sub(r'^#{1,6}[ \t]+', '', text, flags=re.MULTILINE)

    # Convert bullet-list lines to flowing sentences
    def _bullets_to_sentences(m: re.Match) -> str:
        block = m.group(0)
        sentences = []
        for line in block.splitlines():
            cleaned = re.sub(r'^\s*[-•*]\s+', '', line).strip()
            if cleaned:
                # Ensure it ends with a full stop
                if cleaned[-1] not in '.!?':
                    cleaned += '.'
                sentences.append(cleaned)
        # The pattern consumes the newline after the last bullet; put it back
        # so the following text is not glued onto the final sentence.
        tail = '\n' if block.endswith('\n') else ''
        return ' '.join(sentences) + tail

    # [ \t] (not \s) so the match neither starts on a preceding blank line nor
    # lets a bullet marker pair up with text on the next line.
    text = re.sub(
        r'(?:^[ \t]*[-•*][ \t]+.+\n?){2,}',
        _bullets_to_sentences,
        text,
        flags=re.MULTILINE,
    )

    # Collapse runs of 3+ newlines (two or more blank lines) into one blank line
    text = re.sub(r'\n{3,}', '\n\n', text)

    # Remove leftover JSON-like traces (e.g. {"status": ...} blocks)
    text = re.sub(r'\{[^{}]*"status"\s*:[^{}]*\}', '', text)

    return text.strip()
|
| 915 |
+
|
| 916 |
+
|
| 917 |
# ============================================================
|
| 918 |
# Planner State
|
| 919 |
# ============================================================
|