CatoG committed on
Commit
22173b0
·
1 Parent(s): 92d41c5
Files changed (3) hide show
  1. app.py +77 -25
  2. test_workflow.py +166 -0
  3. workflow_helpers.py +245 -48
app.py CHANGED
@@ -19,6 +19,7 @@ from workflow_helpers import (
19
  PlannerState, FailureRecord,
20
  select_relevant_roles, identify_revision_targets,
21
  compress_final_answer, strip_internal_noise,
 
22
  get_synthesizer_format_instruction, get_qa_format_instruction,
23
  validate_output_format, format_violations_instruction,
24
  parse_task_assumptions, format_assumptions_for_prompt,
@@ -615,7 +616,7 @@ class WorkflowState(TypedDict):
615
 
616
  # --- Role system prompts ---
617
 
618
- _PLANNER_SYSTEM = (
619
  "You are the Planner in a strict planner–specialist–synthesizer–QA workflow.\n"
620
  "Your ONLY job is to PLAN and DELEGATE. You do NOT write the answer.\n\n"
621
  "Your responsibilities:\n"
@@ -623,14 +624,7 @@ _PLANNER_SYSTEM = (
623
  "2. Decide which specialist to call as the PRIMARY lead.\n"
624
  " IMPORTANT: Select the FEWEST roles necessary. Do NOT call all roles.\n"
625
  " Available specialists:\n"
626
- " - 'Creative Expert' (ideas, framing, wording, brainstorming)\n"
627
- " - 'Technical Expert' (code, architecture, implementation)\n"
628
- " - 'Research Analyst' (information gathering, literature review, fact-finding)\n"
629
- " - 'Security Reviewer' (security analysis, vulnerability checks)\n"
630
- " - 'Data Analyst' (data analysis, statistics, patterns)\n"
631
- " - 'Labour Union Representative' (worker rights, fair wages)\n"
632
- " - 'UX Designer' (user needs, usability, accessibility)\n"
633
- " - 'Lawyer' (legal compliance, liability, contracts)\n"
634
  "3. State clear success criteria.\n"
635
  "4. Identify the required output format and brevity level.\n"
636
  "5. Define shared assumptions that ALL specialists must use.\n"
@@ -642,6 +636,7 @@ _PLANNER_SYSTEM = (
642
  "- The specialists will create the content. The Synthesizer will combine it.\n"
643
  "- For simple questions, ONE specialist is enough.\n"
644
  "- Never call persona/gimmick roles unless the user explicitly asks for them.\n"
 
645
  "- QA results are BINDING — if QA says FAIL, you MUST revise, never approve.\n\n"
646
  "Respond in this exact format:\n"
647
  "TASK BREAKDOWN:\n<subtask list — what needs to be addressed, NOT the answers>\n\n"
@@ -652,6 +647,34 @@ _PLANNER_SYSTEM = (
652
  "GUIDANCE FOR SPECIALIST:\n<delegation instructions — what to focus on, NOT answer content>"
653
  )
654
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
655
  _CREATIVE_SYSTEM = (
656
  "You are the Creative Expert in a multi-role AI workflow.\n"
657
  "You handle brainstorming, alternative ideas, framing, wording, and concept generation.\n"
@@ -680,17 +703,30 @@ _QA_SYSTEM = (
680
  "output format requirements, brevity requirements, AND expert influence.\n\n"
681
  "You MUST respond with a JSON object in this exact structure:\n"
682
  '{\n'
683
- ' \"status\": \"PASS\" or \"FAIL\",\n'
684
- ' \"reason\": \"short explanation\",\n'
685
- ' \"issues\": [\n'
 
686
  ' {\n'
687
- ' \"type\": \"format\" | \"brevity\" | \"constraint\" | \"consistency\" | \"directness\" | \"evidence\" | \"expert_influence\" | \"other\",\n'
688
- ' \"message\": \"what is wrong\",\n'
689
- ' \"owner\": \"Synthesizer\" | \"Planner\" | \"Research Analyst\" | \"<specialist role name>\"\n'
690
  ' }\n'
691
  ' ],\n'
692
- ' \"correction_instruction\": \"specific minimal fix\"\n'
693
  '}\n\n'
 
 
 
 
 
 
 
 
 
 
 
 
694
  "Validation rules:\n"
695
  "- Check that the output DIRECTLY answers the user's question.\n"
696
  "- Check that the output format matches what was requested (single choice, table, code, etc.).\n"
@@ -704,18 +740,19 @@ _QA_SYSTEM = (
704
  " * If multiple experts contributed, their relevant points are incorporated or consciously noted.\n"
705
  " * The answer is NOT just a paraphrase of planner text with no expert content.\n"
706
  " * FAIL with type 'expert_influence' if expert contributions were ignored.\n"
707
- "- FAIL if any of the above checks fail.\n"
708
- "- PASS only if ALL checks pass.\n"
 
709
  )
710
 
711
  _PLANNER_REVIEW_SYSTEM = (
712
  "You are the Planner reviewing QA feedback.\n"
713
  "CRITICAL RULE: QA results are BINDING.\n"
714
- "- If QA status is PASS: approve the result.\n"
715
  "- If QA status is FAIL: you MUST revise. You may NOT approve a FAIL result.\n"
716
  "- If this is the final revision (max reached) and QA still FAIL:\n"
717
  " you must directly fix the QA issues in your response before approving.\n\n"
718
- "If QA PASSED, respond with:\n"
719
  "DECISION: APPROVED\n"
720
  "FINAL ANSWER:\n<the approved output, reproduced in full>\n\n"
721
  "If QA FAILED and revisions remain, respond with:\n"
@@ -1098,7 +1135,10 @@ def _parse_qa_role_feedback(qa_text: str) -> Dict[str, str]:
1098
  # Each step receives the shared state and an append-only trace list,
1099
  # updates state in place, appends log lines, and returns updated state.
1100
 
1101
- def _step_plan(chat_model, state: WorkflowState, trace: List[str]) -> WorkflowState:
 
 
 
1102
  """Planner: analyse the task, produce a plan, decide which specialist to call."""
1103
  trace.append("\n╔══ [PLANNER] Analysing task... ══╗")
1104
  fmt = state.get("output_format", "other")
@@ -1114,7 +1154,8 @@ def _step_plan(chat_model, state: WorkflowState, trace: List[str]) -> WorkflowSt
1114
  f"\nPrevious QA report:\n{state['qa_report']}"
1115
  "\nAdjust the plan to address the QA issues."
1116
  )
1117
- plan_text = _llm_call(chat_model, _PLANNER_SYSTEM, content)
 
1118
  state["plan"] = plan_text
1119
  state["current_role"] = _decide_role(plan_text)
1120
  trace.append(plan_text)
@@ -1181,6 +1222,9 @@ def _step_qa(
1181
  """
1182
  trace.append("\n╔══ [QA TESTER] Reviewing output... ══╗")
1183
 
 
 
 
1184
  fmt = state.get("output_format", "other")
1185
  brevity = state.get("brevity_requirement", "normal")
1186
  format_rules = get_qa_format_instruction(fmt, brevity)
@@ -1248,8 +1292,12 @@ def _step_qa(
1248
  # Also extract legacy role feedback for backward compatibility
1249
  state["qa_role_feedback"] = _parse_qa_role_feedback(text)
1250
 
1251
- result_label = "✅ PASS" if state["qa_passed"] else "❌ FAIL"
 
 
1252
  trace.append(text)
 
 
1253
  if qa_result.issues:
1254
  issues_summary = "; ".join(
1255
  f"{i.owner}: {i.message[:60]}{'…' if len(i.message) > 60 else ''}"
@@ -2080,7 +2128,8 @@ def run_multi_role_workflow(
2080
 
2081
  try:
2082
  if planner_active:
2083
- state = _step_plan(chat_model, state, trace)
 
2084
 
2085
  # Parse shared task assumptions from planner output
2086
  assumptions = parse_task_assumptions(state["plan"])
@@ -2123,6 +2172,9 @@ def run_multi_role_workflow(
2123
  f"\n[ROLE SELECTION] {len(selected_roles)} specialist(s) selected: "
2124
  + ", ".join(AGENT_ROLES.get(k, k) for k in selected_roles)
2125
  )
 
 
 
2126
 
2127
  # Step 4: Run ALL selected specialists (initial run only)
2128
  if primary_role not in selected_roles:
@@ -2210,7 +2262,7 @@ def run_multi_role_workflow(
2210
  else:
2211
  state["qa_passed"] = True
2212
  state["qa_report"] = "QA Tester is disabled — skipping quality review."
2213
- state["qa_structured"] = {"status": "PASS", "reason": "", "issues": [], "correction_instruction": ""}
2214
  trace.append("\n[QA Tester disabled] Skipping quality review — auto-pass.")
2215
 
2216
  # Update planner state
 
19
  PlannerState, FailureRecord,
20
  select_relevant_roles, identify_revision_targets,
21
  compress_final_answer, strip_internal_noise,
22
+ postprocess_format_fixes,
23
  get_synthesizer_format_instruction, get_qa_format_instruction,
24
  validate_output_format, format_violations_instruction,
25
  parse_task_assumptions, format_assumptions_for_prompt,
 
616
 
617
  # --- Role system prompts ---
618
 
619
+ _PLANNER_SYSTEM_BASE = (
620
  "You are the Planner in a strict planner–specialist–synthesizer–QA workflow.\n"
621
  "Your ONLY job is to PLAN and DELEGATE. You do NOT write the answer.\n\n"
622
  "Your responsibilities:\n"
 
624
  "2. Decide which specialist to call as the PRIMARY lead.\n"
625
  " IMPORTANT: Select the FEWEST roles necessary. Do NOT call all roles.\n"
626
  " Available specialists:\n"
627
+ "{specialist_list}"
 
 
 
 
 
 
 
628
  "3. State clear success criteria.\n"
629
  "4. Identify the required output format and brevity level.\n"
630
  "5. Define shared assumptions that ALL specialists must use.\n"
 
636
  "- The specialists will create the content. The Synthesizer will combine it.\n"
637
  "- For simple questions, ONE specialist is enough.\n"
638
  "- Never call persona/gimmick roles unless the user explicitly asks for them.\n"
639
+ "- Only select from the specialists listed above — no others are available.\n"
640
  "- QA results are BINDING — if QA says FAIL, you MUST revise, never approve.\n\n"
641
  "Respond in this exact format:\n"
642
  "TASK BREAKDOWN:\n<subtask list — what needs to be addressed, NOT the answers>\n\n"
 
647
  "GUIDANCE FOR SPECIALIST:\n<delegation instructions — what to focus on, NOT answer content>"
648
  )
649
 
650
+
651
def _build_planner_system(enabled_role_keys: List[str]) -> str:
    """Render the planner system prompt for the roles that are actually enabled.

    Args:
        enabled_role_keys: Internal role keys (e.g. ``"technical"``) active for
            this run; their order is preserved in the rendered specialist list.

    Returns:
        The planner system prompt with the ``{specialist_list}`` placeholder
        filled in, or a "(no specialists enabled)" marker when the list is empty.
    """
    # Human-readable one-line description per known role key. Unknown keys
    # fall back to the quoted key itself so the prompt never breaks.
    role_descriptions = {
        "creative": "'Creative Expert' (ideas, framing, wording, brainstorming)",
        "technical": "'Technical Expert' (code, architecture, implementation)",
        "research": "'Research Analyst' (information gathering, literature review, fact-finding)",
        "security": "'Security Reviewer' (security analysis, vulnerability checks)",
        "data_analyst": "'Data Analyst' (data analysis, statistics, patterns)",
        "labour_union_rep": "'Labour Union Representative' (worker rights, fair wages)",
        "ux_designer": "'UX Designer' (user needs, usability, accessibility)",
        "lawyer": "'Lawyer' (legal compliance, liability, contracts)",
        "mad_professor": "'Mad Professor' (wild ideas, provocative perspectives)",
        "accountant": "'Accountant' (cost analysis, budgeting, financial review)",
        "artist": "'Artist' (aesthetic vision, creative expression)",
        "lazy_slacker": "'Lazy Slacker' (minimal effort, simple answers)",
        "black_metal_fundamentalist": "'Black Metal Fundamentalist' (nihilistic perspective)",
        "doris": "'Doris' (practical, no-nonsense perspective)",
        "chairman_of_board": "'Chairman of the Board' (corporate strategy, governance)",
        "maga_appointee": "'MAGA Appointee' (deregulation, America-first perspective)",
    }
    entries = [
        " - {}\n".format(role_descriptions.get(rk, "'" + rk + "'"))
        for rk in enabled_role_keys
    ]
    # "".join(...) is falsy when no entries were built, so the `or` picks
    # the explicit empty-list marker in that case.
    specialist_block = "".join(entries) or " - (no specialists enabled)\n"
    return _PLANNER_SYSTEM_BASE.format(specialist_list=specialist_block)
677
+
678
  _CREATIVE_SYSTEM = (
679
  "You are the Creative Expert in a multi-role AI workflow.\n"
680
  "You handle brainstorming, alternative ideas, framing, wording, and concept generation.\n"
 
703
  "output format requirements, brevity requirements, AND expert influence.\n\n"
704
  "You MUST respond with a JSON object in this exact structure:\n"
705
  '{\n'
706
+ ' "status": "PASS" or "PASS_WITH_WARNINGS" or "FAIL",\n'
707
+ ' "reason": "short explanation",\n'
708
+ ' "warnings": ["optional list of minor cosmetic or stylistic notes"],\n'
709
+ ' "issues": [\n'
710
  ' {\n'
711
+ ' "type": "format" | "brevity" | "constraint" | "consistency" | "directness" | "evidence" | "expert_influence" | "other",\n'
712
+ ' "message": "what is wrong",\n'
713
+ ' "owner": "Synthesizer" | "Planner" | "Research Analyst" | "<specialist role name>"\n'
714
  ' }\n'
715
  ' ],\n'
716
+ ' "correction_instruction": "specific minimal fix"\n'
717
  '}\n\n'
718
+ "STATUS LEVELS — use the right one:\n"
719
+ "- PASS: The answer is correct, complete, properly formatted, and meets all criteria.\n"
720
+ "- PASS_WITH_WARNINGS: The answer is substantively correct and usable, but has minor\n"
721
+ " cosmetic or stylistic issues (e.g. slightly verbose, could be tighter, minor formatting\n"
722
+ " quirks). List these in the 'warnings' array. Do NOT put them in 'issues'.\n"
723
+ "- FAIL: The answer has substantive problems — wrong content, missing key information,\n"
724
+ " wrong format, ignores the question, unsupported claims, or expert contributions ignored.\n"
725
+ " Only FAIL triggers a revision cycle.\n\n"
726
+ "FOCUS ON CONTENT, NOT COSMETICS:\n"
727
+ "- Minor bullet formatting, heading style, or whitespace are NOT reasons to FAIL.\n"
728
+ "- A slightly verbose answer that correctly addresses the question is PASS_WITH_WARNINGS, not FAIL.\n"
729
+ "- Reserve FAIL for answers that are genuinely wrong, incomplete, or miss the point.\n\n"
730
  "Validation rules:\n"
731
  "- Check that the output DIRECTLY answers the user's question.\n"
732
  "- Check that the output format matches what was requested (single choice, table, code, etc.).\n"
 
740
  " * If multiple experts contributed, their relevant points are incorporated or consciously noted.\n"
741
  " * The answer is NOT just a paraphrase of planner text with no expert content.\n"
742
  " * FAIL with type 'expert_influence' if expert contributions were ignored.\n"
743
+ "- FAIL if any substantive check fails.\n"
744
+ "- PASS_WITH_WARNINGS if content is good but minor polish is needed.\n"
745
+ "- PASS only if ALL checks pass with no issues at all.\n"
746
  )
747
 
748
  _PLANNER_REVIEW_SYSTEM = (
749
  "You are the Planner reviewing QA feedback.\n"
750
  "CRITICAL RULE: QA results are BINDING.\n"
751
+ "- If QA status is PASS or PASS_WITH_WARNINGS: approve the result.\n"
752
  "- If QA status is FAIL: you MUST revise. You may NOT approve a FAIL result.\n"
753
  "- If this is the final revision (max reached) and QA still FAIL:\n"
754
  " you must directly fix the QA issues in your response before approving.\n\n"
755
+ "If QA PASSED (or PASS_WITH_WARNINGS), respond with:\n"
756
  "DECISION: APPROVED\n"
757
  "FINAL ANSWER:\n<the approved output, reproduced in full>\n\n"
758
  "If QA FAILED and revisions remain, respond with:\n"
 
1135
  # Each step receives the shared state and an append-only trace list,
1136
  # updates state in place, appends log lines, and returns updated state.
1137
 
1138
+ def _step_plan(
1139
+ chat_model, state: WorkflowState, trace: List[str],
1140
+ enabled_role_keys: Optional[List[str]] = None,
1141
+ ) -> WorkflowState:
1142
  """Planner: analyse the task, produce a plan, decide which specialist to call."""
1143
  trace.append("\n╔══ [PLANNER] Analysing task... ══╗")
1144
  fmt = state.get("output_format", "other")
 
1154
  f"\nPrevious QA report:\n{state['qa_report']}"
1155
  "\nAdjust the plan to address the QA issues."
1156
  )
1157
+ planner_system = _build_planner_system(enabled_role_keys or [])
1158
+ plan_text = _llm_call(chat_model, planner_system, content)
1159
  state["plan"] = plan_text
1160
  state["current_role"] = _decide_role(plan_text)
1161
  trace.append(plan_text)
 
1222
  """
1223
  trace.append("\n╔══ [QA TESTER] Reviewing output... ══╗")
1224
 
1225
+ # Apply post-processing format fixes before QA evaluation
1226
+ state["draft_output"] = postprocess_format_fixes(state["draft_output"])
1227
+
1228
  fmt = state.get("output_format", "other")
1229
  brevity = state.get("brevity_requirement", "normal")
1230
  format_rules = get_qa_format_instruction(fmt, brevity)
 
1292
  # Also extract legacy role feedback for backward compatibility
1293
  state["qa_role_feedback"] = _parse_qa_role_feedback(text)
1294
 
1295
+ result_label = ("✅ PASS" if qa_result.status == "PASS"
1296
+ else "⚠ PASS_WITH_WARNINGS" if qa_result.passed_with_warnings
1297
+ else "❌ FAIL")
1298
  trace.append(text)
1299
+ if qa_result.warnings:
1300
+ trace.append(f" ⚠ QA warnings: {'; '.join(qa_result.warnings)}")
1301
  if qa_result.issues:
1302
  issues_summary = "; ".join(
1303
  f"{i.owner}: {i.message[:60]}{'…' if len(i.message) > 60 else ''}"
 
2128
 
2129
  try:
2130
  if planner_active:
2131
+ state = _step_plan(chat_model, state, trace,
2132
+ enabled_role_keys=active_specialist_keys)
2133
 
2134
  # Parse shared task assumptions from planner output
2135
  assumptions = parse_task_assumptions(state["plan"])
 
2172
  f"\n[ROLE SELECTION] {len(selected_roles)} specialist(s) selected: "
2173
  + ", ".join(AGENT_ROLES.get(k, k) for k in selected_roles)
2174
  )
2175
+ # Append detailed scoring trace when available
2176
+ if hasattr(selected_roles, 'format_trace'):
2177
+ trace.append(selected_roles.format_trace(AGENT_ROLES))
2178
 
2179
  # Step 4: Run ALL selected specialists (initial run only)
2180
  if primary_role not in selected_roles:
 
2262
  else:
2263
  state["qa_passed"] = True
2264
  state["qa_report"] = "QA Tester is disabled — skipping quality review."
2265
+ state["qa_structured"] = {"status": "PASS", "reason": "", "issues": [], "warnings": [], "correction_instruction": ""}
2266
  trace.append("\n[QA Tester disabled] Skipping quality review — auto-pass.")
2267
 
2268
  # Update planner state
test_workflow.py CHANGED
@@ -31,6 +31,7 @@ from workflow_helpers import (
31
  identify_revision_targets,
32
  strip_internal_noise,
33
  compress_final_answer,
 
34
  PlannerState,
35
  FailureRecord,
36
  get_synthesizer_format_instruction,
@@ -45,6 +46,10 @@ from workflow_helpers import (
45
  format_contributions_for_qa,
46
  parse_used_contributions,
47
  check_expert_influence,
 
 
 
 
48
  )
49
  from evidence import (
50
  EvidenceItem,
@@ -244,6 +249,167 @@ class TestStructuredQAParsing(unittest.TestCase):
244
  self.assertEqual(d["status"], "FAIL")
245
  self.assertEqual(len(d["issues"]), 1)
246
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
247
 
248
  # ============================================================
249
  # Test: Role Selection
 
31
  identify_revision_targets,
32
  strip_internal_noise,
33
  compress_final_answer,
34
+ postprocess_format_fixes,
35
  PlannerState,
36
  FailureRecord,
37
  get_synthesizer_format_instruction,
 
46
  format_contributions_for_qa,
47
  parse_used_contributions,
48
  check_expert_influence,
49
+ extract_task_features,
50
+ ROLE_CAPABILITIES,
51
+ RoleScore,
52
+ score_roles,
53
  )
54
  from evidence import (
55
  EvidenceItem,
 
249
  self.assertEqual(d["status"], "FAIL")
250
  self.assertEqual(len(d["issues"]), 1)
251
 
252
+ def test_parse_json_pass_with_warnings(self):
253
+ """PASS_WITH_WARNINGS is parsed correctly from JSON."""
254
+ qa_text = json.dumps({
255
+ "status": "PASS_WITH_WARNINGS",
256
+ "reason": "Content is good but slightly verbose",
257
+ "warnings": ["Could be more concise", "Minor formatting quirk"],
258
+ "issues": [],
259
+ "correction_instruction": ""
260
+ })
261
+ result = parse_structured_qa(qa_text)
262
+ self.assertTrue(result.passed)
263
+ self.assertTrue(result.passed_with_warnings)
264
+ self.assertEqual(result.status, "PASS_WITH_WARNINGS")
265
+ self.assertEqual(len(result.warnings), 2)
266
+ self.assertIn("Could be more concise", result.warnings)
267
+ self.assertEqual(len(result.issues), 0)
268
+
269
+ def test_parse_legacy_pass_with_warnings(self):
270
+ """PASS_WITH_WARNINGS is recognized in legacy text format."""
271
+ qa_text = (
272
+ "REQUIREMENTS CHECKED:\n- All met\n\n"
273
+ "ISSUES FOUND:\nNone\n\n"
274
+ "RESULT: PASS_WITH_WARNINGS\n\n"
275
+ "RECOMMENDED FIXES:\nNone"
276
+ )
277
+ result = parse_structured_qa(qa_text)
278
+ self.assertTrue(result.passed)
279
+ self.assertTrue(result.passed_with_warnings)
280
+
281
+ def test_pass_with_warnings_to_dict_includes_warnings(self):
282
+ result = QAResult(
283
+ status="PASS_WITH_WARNINGS",
284
+ reason="Minor issues",
285
+ warnings=["Slightly verbose"],
286
+ issues=[],
287
+ )
288
+ d = result.to_dict()
289
+ self.assertEqual(d["status"], "PASS_WITH_WARNINGS")
290
+ self.assertEqual(d["warnings"], ["Slightly verbose"])
291
+
292
+ def test_pass_is_not_passed_with_warnings(self):
293
+ result = QAResult(status="PASS")
294
+ self.assertTrue(result.passed)
295
+ self.assertFalse(result.passed_with_warnings)
296
+
297
+ def test_fail_is_not_passed(self):
298
+ result = QAResult(status="FAIL")
299
+ self.assertFalse(result.passed)
300
+ self.assertFalse(result.passed_with_warnings)
301
+
302
+
303
+ # ============================================================
304
+ # Test: Post-Processing Format Fixes
305
+ # ============================================================
306
+
307
class TestPostprocessFormatFixes(unittest.TestCase):
    """Deterministic text clean-ups applied to drafts before QA review."""

    def test_removes_markdown_headings(self):
        """Heading markers are stripped while the surrounding prose survives."""
        raw = "# Heading\nSome content.\n## Subheading\nMore content."
        cleaned = postprocess_format_fixes(raw)
        self.assertNotIn("#", cleaned)
        self.assertIn("Some content.", cleaned)
        self.assertIn("More content.", cleaned)

    def test_converts_bullets_to_sentences(self):
        """Dash bullets become sentence-style lines ending with a period."""
        raw = "- First point\n- Second point\n- Third point"
        cleaned = postprocess_format_fixes(raw)
        self.assertNotIn("- ", cleaned)
        self.assertIn("First point.", cleaned)
        self.assertIn("Second point.", cleaned)

    def test_collapses_blank_lines(self):
        """Runs of blank lines are reduced; the content lines are kept."""
        raw = "Line 1\n\n\n\n\nLine 2"
        cleaned = postprocess_format_fixes(raw)
        self.assertNotIn("\n\n\n", cleaned)
        self.assertIn("Line 1", cleaned)
        self.assertIn("Line 2", cleaned)

    def test_removes_json_traces(self):
        """Stray QA JSON fragments embedded in prose are removed."""
        raw = 'Answer here. {"status": "PASS", "reason": "ok"} End.'
        cleaned = postprocess_format_fixes(raw)
        self.assertNotIn('"status"', cleaned)
        self.assertIn("Answer here.", cleaned)

    def test_preserves_clean_text(self):
        """Already-clean text passes through unchanged."""
        raw = "This is already clean text with no issues."
        self.assertEqual(postprocess_format_fixes(raw), raw)
340
+
341
+
342
+ # ============================================================
343
+ # Test: Task Feature Extraction and Role Scoring
344
+ # ============================================================
345
+
346
class TestTaskFeatureExtraction(unittest.TestCase):
    """Feature tags derived from a request string and optional task category."""

    def test_extract_coding_features(self):
        """A coding request tags the 'technical' feature."""
        feats = extract_task_features("write Python code to parse CSV", "coding_task")
        self.assertIn("technical", feats)

    def test_extract_design_features(self):
        """UI wording maps onto the 'design' feature."""
        feats = extract_task_features("design a user interface for the dashboard")
        self.assertIn("design", feats)

    def test_extract_research_features(self):
        """History/background questions map onto 'research'."""
        feats = extract_task_features("research the history of quantum computing")
        self.assertIn("research", feats)

    def test_extract_multiple_features(self):
        """A mixed request yields every matching feature tag."""
        feats = extract_task_features(
            "analyze data trends and compare security vulnerabilities"
        )
        for expected in ("analysis", "data", "security", "comparison"):
            self.assertIn(expected, feats)

    def test_category_adds_implied_features(self):
        """The task category alone can imply features (factual -> research)."""
        feats = extract_task_features("do something", "factual_question")
        self.assertIn("research", feats)
370
+
371
+
372
class TestRoleScoring(unittest.TestCase):
    """Transparent scoring of specialist roles against extracted task features."""

    def setUp(self):
        # Score across all core + persona role keys, under a strict config
        # that disallows persona roles and caps selection at three specialists.
        self.all_roles = list(WorkflowConfig.CORE_ROLE_KEYS) + list(WorkflowConfig.PERSONA_ROLE_KEYS)
        self.config = WorkflowConfig(strict_mode=True, allow_persona_roles=False, max_specialists_per_task=3)

    def test_scores_are_populated(self):
        """A coding task produces scores, and 'technical' scores above zero."""
        features = extract_task_features("write Python code to build an API", "coding_task")
        scores = score_roles(features, self.all_roles, self.config, "coding_task")
        # assertGreater/assertEqual report the offending values on failure,
        # unlike assertTrue wrapped around a bare comparison.
        self.assertGreater(len(scores), 0)
        technical_scores = [s for s in scores if s.role_key == "technical"]
        self.assertEqual(len(technical_scores), 1)
        self.assertGreater(technical_scores[0].score, 0)

    def test_persona_roles_filtered(self):
        """With allow_persona_roles=False, every persona entry carries a filter reason."""
        features = extract_task_features("budget analysis")
        scores = score_roles(features, self.all_roles, self.config)
        persona_scores = [s for s in scores if s.is_persona]
        for ps in persona_scores:
            self.assertTrue(ps.filtered_reason)

    def test_persona_roles_allowed(self):
        """With allow_persona_roles=True, the accountant is scored unfiltered."""
        config = WorkflowConfig(strict_mode=True, allow_persona_roles=True, max_specialists_per_task=5)
        features = extract_task_features("budget analysis")
        scores = score_roles(features, self.all_roles, config)
        accountant = [s for s in scores if s.role_key == "accountant"]
        self.assertEqual(len(accountant), 1)
        self.assertEqual(accountant[0].filtered_reason, "")

    def test_selection_result_format_trace(self):
        """select_relevant_roles returns a result exposing a readable scoring trace."""
        config = WorkflowConfig(strict_mode=True, allow_persona_roles=False, max_specialists_per_task=3)
        result = select_relevant_roles(
            "write Python code", self.all_roles, config, task_category="coding_task"
        )
        self.assertTrue(hasattr(result, "format_trace"))
        trace = result.format_trace({"technical": "Technical Expert"})
        for marker in ("ROLE SCORING", "Task features:", "SELECTED"):
            self.assertIn(marker, trace)
412
+
413
 
414
  # ============================================================
415
  # Test: Role Selection
workflow_helpers.py CHANGED
@@ -234,14 +234,19 @@ class QAIssue:
234
 
235
  @dataclass
236
  class QAResult:
237
- status: str # "PASS" | "FAIL"
238
  reason: str = ""
239
  issues: List[QAIssue] = field(default_factory=list)
 
240
  correction_instruction: str = ""
241
 
242
  @property
243
  def passed(self) -> bool:
244
- return self.status == "PASS"
 
 
 
 
245
 
246
  def owners(self) -> List[str]:
247
  """Return unique owner labels from issues."""
@@ -255,6 +260,7 @@ class QAResult:
255
  {"type": i.type, "message": i.message, "owner": i.owner}
256
  for i in self.issues
257
  ],
 
258
  "correction_instruction": self.correction_instruction,
259
  }
260
 
@@ -293,6 +299,7 @@ def parse_structured_qa(qa_text: str) -> QAResult:
293
  status=data.get("status", "FAIL"),
294
  reason=data.get("reason", ""),
295
  issues=issues,
 
296
  correction_instruction=data.get("correction_instruction", ""),
297
  )
298
  except (json.JSONDecodeError, KeyError):
@@ -301,7 +308,9 @@ def parse_structured_qa(qa_text: str) -> QAResult:
301
  # Fallback: parse from legacy text format
302
  status = "FAIL"
303
  lower = qa_text.lower()
304
- if "result: pass" in lower:
 
 
305
  status = "PASS"
306
 
307
  reason = ""
@@ -491,72 +500,188 @@ ROLE_RELEVANCE: Dict[str, Dict[str, Any]] = {
491
  }
492
 
493
 
494
- def select_relevant_roles(
495
- user_request: str,
496
- active_role_keys: List[str],
497
- config: WorkflowConfig,
498
- task_category: str = "other",
499
- ) -> List[str]:
500
- """Select only the most relevant specialist roles for a given request.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
501
 
502
- Scores each active role by keyword match frequency and task-category affinity,
503
- filters persona roles based on config, and returns at most
504
- config.max_specialists_per_task roles.
505
 
506
- If config.always_include_research_for_factual_tasks is True and the task
507
- is factual, the research role is always included.
508
  """
509
  lower = user_request.lower()
510
- scored: List[Tuple[int, str]] = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
511
 
512
- for role_key in active_role_keys:
513
- meta = ROLE_RELEVANCE.get(role_key)
514
- if not meta:
515
- continue
516
 
517
- # Skip persona roles unless config allows them
518
- if meta.get("is_persona") and not config.allow_persona_roles:
519
- continue
520
 
521
- score = 0
522
- for kw in meta.get("keywords", []):
523
- if kw.lower() in lower:
524
- score += 1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
525
 
526
- # Domain affinity boost if the request touches a role's domain
527
- for domain in meta.get("domains", []):
528
- if domain.lower() in lower:
529
- score += 1
 
 
 
 
 
 
 
 
 
 
 
530
 
531
- # Task-category affinity bonus
532
  role_tasks = meta.get("task_types", [])
533
  if task_category in role_tasks:
534
  score += 2
535
 
536
- scored.append((score, role_key))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
537
 
538
- # Sort by score descending; keep deterministic order for ties
539
- scored.sort(key=lambda x: (-x[0], active_role_keys.index(x[1])))
 
540
 
541
- # Always include at least one role
542
- selected = []
543
- for score, role_key in scored:
 
 
 
 
 
 
 
 
 
544
  if len(selected) >= config.max_specialists_per_task:
545
  break
546
- # In strict mode, only include roles with score > 0 (except if we have none)
547
- if config.strict_mode and score == 0 and selected:
548
  continue
549
- selected.append(role_key)
550
 
551
  # Ensure at least one specialist is selected
552
- if not selected and scored:
553
- selected.append(scored[0][1])
554
 
555
- # Fallback: if no roles matched at all, use the first available core role
556
  if not selected:
557
- for rk in active_role_keys:
558
- meta = ROLE_RELEVANCE.get(rk, {})
559
- if not meta.get("is_persona"):
560
  selected.append(rk)
561
  break
562
 
@@ -567,7 +692,41 @@ def select_relevant_roles(
567
  and "research" not in selected):
568
  selected.append("research")
569
 
570
- return selected
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
571
 
572
 
573
  # ============================================================
@@ -717,6 +876,44 @@ def compress_final_answer(
717
  return answer
718
 
719
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
720
  # ============================================================
721
  # Planner State
722
  # ============================================================
 
234
 
235
  @dataclass
236
  class QAResult:
237
+ status: str # "PASS" | "PASS_WITH_WARNINGS" | "FAIL"
238
  reason: str = ""
239
  issues: List[QAIssue] = field(default_factory=list)
240
+ warnings: List[str] = field(default_factory=list)
241
  correction_instruction: str = ""
242
 
243
  @property
244
  def passed(self) -> bool:
245
+ return self.status in ("PASS", "PASS_WITH_WARNINGS")
246
+
247
+ @property
248
+ def passed_with_warnings(self) -> bool:
249
+ return self.status == "PASS_WITH_WARNINGS"
250
 
251
  def owners(self) -> List[str]:
252
  """Return unique owner labels from issues."""
 
260
  {"type": i.type, "message": i.message, "owner": i.owner}
261
  for i in self.issues
262
  ],
263
+ "warnings": self.warnings,
264
  "correction_instruction": self.correction_instruction,
265
  }
266
 
 
299
  status=data.get("status", "FAIL"),
300
  reason=data.get("reason", ""),
301
  issues=issues,
302
+ warnings=[str(w) for w in data.get("warnings", [])],
303
  correction_instruction=data.get("correction_instruction", ""),
304
  )
305
  except (json.JSONDecodeError, KeyError):
 
308
  # Fallback: parse from legacy text format
309
  status = "FAIL"
310
  lower = qa_text.lower()
311
+ if "result: pass_with_warnings" in lower:
312
+ status = "PASS_WITH_WARNINGS"
313
+ elif "result: pass" in lower:
314
  status = "PASS"
315
 
316
  reason = ""
 
500
  }
501
 
502
 
503
+ # ============================================================
504
+ # Role Capability Metadata and Task Feature Extraction
505
+ # ============================================================
506
+
507
+ # Simple capability tags per role — used for transparent scoring
508
+ ROLE_CAPABILITIES: Dict[str, List[str]] = {
509
+ "creative": ["creative", "design", "ideas", "writing", "brainstorm", "opinion"],
510
+ "technical": ["technical", "analysis", "engineering", "calculation", "code", "implementation"],
511
+ "research": ["research", "facts", "evidence", "information", "comparison", "history"],
512
+ "security": ["risk", "safety", "compliance", "security", "vulnerability"],
513
+ "data_analyst": ["data", "statistics", "analysis", "metrics", "patterns"],
514
+ "labour_union_rep": ["labor", "policy", "workplace", "rights", "fairness"],
515
+ "ux_designer": ["design", "usability", "interface", "user_experience", "accessibility"],
516
+ "lawyer": ["legal", "compliance", "contracts", "liability", "regulation"],
517
+ "mad_professor": ["persona", "speculation", "radical", "humor"],
518
+ "accountant": ["persona", "cost", "budget", "financial"],
519
+ "artist": ["persona", "creative", "aesthetic", "vision"],
520
+ "lazy_slacker": ["persona", "simple_answer", "minimal"],
521
+ "black_metal_fundamentalist": ["persona", "stylistic", "humor", "music"],
522
+ "doris": ["persona", "humor"],
523
+ "chairman_of_board": ["persona", "strategy", "corporate", "governance"],
524
+ "maga_appointee": ["persona", "political", "deregulation"],
525
+ }
526
+
527
+ # Keywords in the user request that map to task features
528
+ TASK_FEATURE_KEYWORDS: Dict[str, List[str]] = {
529
+ "analysis": ["analy", "evaluate", "assess", "review", "examine", "investigate"],
530
+ "creative": ["creative", "brainstorm", "ideas", "imagine", "invent", "story", "write a"],
531
+ "design": ["design", "wireframe", "prototype", "layout", "visual", "ui", "ux", "interface", "usability",
532
+ "user experience", "accessibility", "login page", "user interface"],
533
+ "technical": ["code", "implement", "build", "architecture", "api", "debug", "software", "program",
534
+ "algorithm", "system", "deploy", "python", "javascript", "rust", "java", "react",
535
+ "framework", "performance"],
536
+ "research": ["research", "study", "evidence", "literature", "paper", "facts", "history",
537
+ "information", "find out", "look up"],
538
+ "policy": ["policy", "regulation", "law", "compliance", "legal", "rights", "labor", "labour",
539
+ "union", "worker", "employment", "workplace"],
540
+ "simple_answer": ["yes or no", "pick one", "choose", "which one", "red or blue", "agree on one"],
541
+ "opinion": ["opinion", "perspective", "viewpoint", "discuss", "debate", "pros and cons",
542
+ "should i", "what do you think", "agree", "disagree"],
543
+ "comparison": ["compare", "comparison", "versus", "vs", "difference", "better"],
544
+ "data": ["data", "statistics", "metric", "trend", "pattern", "chart", "dashboard", "csv",
545
+ "spreadsheet", "dataset"],
546
+ "security": ["security", "vulnerability", "attack", "encryption", "password", "exploit",
547
+ "firewall", "gdpr", "privacy"],
548
+ "cost": ["cost", "budget", "expense", "cheap", "price", "financial", "roi"],
549
+ "humor": ["funny", "joke", "humorous", "kvlt", "metal", "nihil"],
550
+ "music": ["music", "metal", "band", "song", "album", "guitar"],
551
+ }
552
+
553
+ # Generalist fallback roles used when no capability matches
554
+ _GENERALIST_ROLES = ("creative", "technical", "research")
555
+
556
 
557
+ def extract_task_features(user_request: str, task_category: str = "other") -> List[str]:
558
+ """Derive task features from the user request and task category.
 
559
 
560
+ Returns a deduplicated list of feature tags like ["design", "opinion"].
 
561
  """
562
  lower = user_request.lower()
563
+ features: List[str] = []
564
+
565
+ for feature, keywords in TASK_FEATURE_KEYWORDS.items():
566
+ for kw in keywords:
567
+ if kw in lower:
568
+ features.append(feature)
569
+ break # one match per feature is enough
570
+
571
+ # Add features implied by the task category
572
+ category_features: Dict[str, List[str]] = {
573
+ "coding_task": ["technical", "code"],
574
+ "creative_writing": ["creative"],
575
+ "factual_question": ["research"],
576
+ "comparison": ["comparison", "research"],
577
+ "analysis": ["analysis"],
578
+ "summarization": ["research"],
579
+ "opinion_discussion": ["opinion"],
580
+ "planning": ["analysis"],
581
+ }
582
+ for f in category_features.get(task_category, []):
583
+ if f not in features:
584
+ features.append(f)
585
 
586
+ return features
 
 
 
587
 
 
 
 
588
 
589
+ @dataclass
590
+ class RoleScore:
591
+ """Scoring details for a single role — used for transparent logging."""
592
+ role_key: str
593
+ role_label: str
594
+ score: int
595
+ matched_capabilities: List[str]
596
+ is_persona: bool
597
+ filtered_reason: str = "" # why it was excluded, if any
598
+
599
+
600
+ def score_roles(
601
+ task_features: List[str],
602
+ active_role_keys: List[str],
603
+ config: WorkflowConfig,
604
+ task_category: str = "other",
605
+ ) -> List[RoleScore]:
606
+ """Score each active role by capability overlap with task features.
607
 
608
+ Returns all RoleScore objects (including filtered ones) for transparency.
609
+ """
610
+ feature_set = set(task_features)
611
+ results: List[RoleScore] = []
612
+
613
+ # Import here to avoid circular — role labels come from the caller
614
+ for role_key in active_role_keys:
615
+ capabilities = ROLE_CAPABILITIES.get(role_key, [])
616
+ meta = ROLE_RELEVANCE.get(role_key, {})
617
+ is_persona = meta.get("is_persona", False)
618
+ role_label = meta.get("description", role_key)
619
+
620
+ # Capability overlap score
621
+ matched = [cap for cap in capabilities if cap in feature_set]
622
+ score = len(matched)
623
 
624
+ # Task-category affinity bonus (from ROLE_RELEVANCE)
625
  role_tasks = meta.get("task_types", [])
626
  if task_category in role_tasks:
627
  score += 2
628
 
629
+ rs = RoleScore(
630
+ role_key=role_key,
631
+ role_label=role_label,
632
+ score=score,
633
+ matched_capabilities=matched,
634
+ is_persona=is_persona,
635
+ )
636
+
637
+ # Filter personas unless allowed
638
+ if is_persona and not config.allow_persona_roles:
639
+ rs.filtered_reason = "persona role not allowed"
640
+
641
+ results.append(rs)
642
+
643
+ return results
644
+
645
+
646
+ def select_relevant_roles(
647
+ user_request: str,
648
+ active_role_keys: List[str],
649
+ config: WorkflowConfig,
650
+ task_category: str = "other",
651
+ ) -> List[str]:
652
+ """Select the most relevant specialist roles for a given request.
653
 
654
+ Uses capability-based scoring: extracts task features from the request,
655
+ scores each active role by capability overlap, and returns the top roles
656
+ up to ``config.max_specialists_per_task``.
657
 
658
+ Returns a ``_SelectionResult`` (list subclass) so callers can also access
659
+ ``.scoring_info`` for transparent trace logging.
660
+ """
661
+ task_features = extract_task_features(user_request, task_category)
662
+ role_scores = score_roles(task_features, active_role_keys, config, task_category)
663
+
664
+ # Separate eligible from filtered
665
+ eligible = [rs for rs in role_scores if not rs.filtered_reason]
666
+ eligible.sort(key=lambda rs: (-rs.score, active_role_keys.index(rs.role_key)))
667
+
668
+ selected: List[str] = []
669
+ for rs in eligible:
670
  if len(selected) >= config.max_specialists_per_task:
671
  break
672
+ # In strict mode, only include roles with score > 0 (unless we have none yet)
673
+ if config.strict_mode and rs.score == 0 and selected:
674
  continue
675
+ selected.append(rs.role_key)
676
 
677
  # Ensure at least one specialist is selected
678
+ if not selected and eligible:
679
+ selected.append(eligible[0].role_key)
680
 
681
+ # Generalist fallback when nothing matched
682
  if not selected:
683
+ for rk in _GENERALIST_ROLES:
684
+ if rk in active_role_keys:
 
685
  selected.append(rk)
686
  break
687
 
 
692
  and "research" not in selected):
693
  selected.append("research")
694
 
695
+ return _SelectionResult(selected, role_scores, task_features)
696
+
697
+
698
+ class _SelectionResult(list):
699
+ """A list of role keys with attached scoring metadata.
700
+
701
+ Behaves exactly like a ``list[str]`` so existing code continues to work,
702
+ but also carries ``scoring_info`` and ``task_features`` for trace logging.
703
+ """
704
+
705
+ def __init__(
706
+ self,
707
+ selected: List[str],
708
+ scoring_info: List[RoleScore],
709
+ task_features: List[str],
710
+ ):
711
+ super().__init__(selected)
712
+ self.scoring_info = scoring_info
713
+ self.task_features = task_features
714
+
715
+ def format_trace(self, role_labels: Optional[Dict[str, str]] = None) -> str:
716
+ """Return a human-readable ROLE SCORING trace block."""
717
+ lines = ["── ROLE SCORING ──"]
718
+ lines.append(f"Task features: {self.task_features}")
719
+ for rs in sorted(self.scoring_info, key=lambda r: -r.score):
720
+ label = (role_labels or {}).get(rs.role_key, rs.role_key)
721
+ status = "SELECTED" if rs.role_key in self else "skipped"
722
+ if rs.filtered_reason:
723
+ status = f"FILTERED ({rs.filtered_reason})"
724
+ caps = ", ".join(rs.matched_capabilities) if rs.matched_capabilities else "none"
725
+ lines.append(
726
+ f" {label}: score={rs.score} caps=[{caps}] → {status}"
727
+ )
728
+ lines.append("──────────────────")
729
+ return "\n".join(lines)
730
 
731
 
732
  # ============================================================
 
876
  return answer
877
 
878
 
879
+ def postprocess_format_fixes(text: str) -> str:
880
+ """Apply lightweight format fixes before QA evaluation.
881
+
882
+ Converts common formatting artefacts so QA can focus on content quality
883
+ rather than failing for cosmetic issues.
884
+ """
885
+ # Remove markdown headings (# / ## / ###)
886
+ text = re.sub(r'^#{1,4}\s+', '', text, flags=re.MULTILINE)
887
+
888
+ # Convert bullet-list lines to flowing sentences
889
+ def _bullets_to_sentences(m: re.Match) -> str:
890
+ lines = m.group(0).strip().splitlines()
891
+ sentences = []
892
+ for line in lines:
893
+ cleaned = re.sub(r'^\s*[-•*]\s+', '', line).strip()
894
+ if cleaned:
895
+ # Ensure it ends with a full stop
896
+ if cleaned[-1] not in '.!?':
897
+ cleaned += '.'
898
+ sentences.append(cleaned)
899
+ return ' '.join(sentences)
900
+
901
+ text = re.sub(
902
+ r'(?:^\s*[-•*]\s+.+\n?){2,}',
903
+ _bullets_to_sentences,
904
+ text,
905
+ flags=re.MULTILINE,
906
+ )
907
+
908
+ # Collapse runs of 3+ blank lines into 2
909
+ text = re.sub(r'\n{3,}', '\n\n', text)
910
+
911
+ # Remove leftover JSON-like traces (e.g. {"status": ...} blocks)
912
+ text = re.sub(r'\{[^{}]*"status"\s*:[^{}]*\}', '', text)
913
+
914
+ return text.strip()
915
+
916
+
917
  # ============================================================
918
  # Planner State
919
  # ============================================================