CatoG committed on
Commit
22173b0
·
1 Parent(s): 92d41c5
Files changed (3) hide show
  1. app.py +77 -25
  2. test_workflow.py +166 -0
  3. workflow_helpers.py +245 -48
app.py CHANGED
@@ -19,6 +19,7 @@ from workflow_helpers import (
19
  PlannerState, FailureRecord,
20
  select_relevant_roles, identify_revision_targets,
21
  compress_final_answer, strip_internal_noise,
 
22
  get_synthesizer_format_instruction, get_qa_format_instruction,
23
  validate_output_format, format_violations_instruction,
24
  parse_task_assumptions, format_assumptions_for_prompt,
@@ -615,7 +616,7 @@ class WorkflowState(TypedDict):
615
 
616
  # --- Role system prompts ---
617
 
618
- _PLANNER_SYSTEM = (
619
  "You are the Planner in a strict planner–specialist–synthesizer–QA workflow.\n"
620
  "Your ONLY job is to PLAN and DELEGATE. You do NOT write the answer.\n\n"
621
  "Your responsibilities:\n"
@@ -623,14 +624,7 @@ _PLANNER_SYSTEM = (
623
  "2. Decide which specialist to call as the PRIMARY lead.\n"
624
  " IMPORTANT: Select the FEWEST roles necessary. Do NOT call all roles.\n"
625
  " Available specialists:\n"
626
- " - 'Creative Expert' (ideas, framing, wording, brainstorming)\n"
627
- " - 'Technical Expert' (code, architecture, implementation)\n"
628
- " - 'Research Analyst' (information gathering, literature review, fact-finding)\n"
629
- " - 'Security Reviewer' (security analysis, vulnerability checks)\n"
630
- " - 'Data Analyst' (data analysis, statistics, patterns)\n"
631
- " - 'Labour Union Representative' (worker rights, fair wages)\n"
632
- " - 'UX Designer' (user needs, usability, accessibility)\n"
633
- " - 'Lawyer' (legal compliance, liability, contracts)\n"
634
  "3. State clear success criteria.\n"
635
  "4. Identify the required output format and brevity level.\n"
636
  "5. Define shared assumptions that ALL specialists must use.\n"
@@ -642,6 +636,7 @@ _PLANNER_SYSTEM = (
642
  "- The specialists will create the content. The Synthesizer will combine it.\n"
643
  "- For simple questions, ONE specialist is enough.\n"
644
  "- Never call persona/gimmick roles unless the user explicitly asks for them.\n"
 
645
  "- QA results are BINDING — if QA says FAIL, you MUST revise, never approve.\n\n"
646
  "Respond in this exact format:\n"
647
  "TASK BREAKDOWN:\n<subtask list — what needs to be addressed, NOT the answers>\n\n"
@@ -652,6 +647,34 @@ _PLANNER_SYSTEM = (
652
  "GUIDANCE FOR SPECIALIST:\n<delegation instructions — what to focus on, NOT answer content>"
653
  )
654
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
655
  _CREATIVE_SYSTEM = (
656
  "You are the Creative Expert in a multi-role AI workflow.\n"
657
  "You handle brainstorming, alternative ideas, framing, wording, and concept generation.\n"
@@ -680,17 +703,30 @@ _QA_SYSTEM = (
680
  "output format requirements, brevity requirements, AND expert influence.\n\n"
681
  "You MUST respond with a JSON object in this exact structure:\n"
682
  '{\n'
683
- ' \"status\": \"PASS\" or \"FAIL\",\n'
684
- ' \"reason\": \"short explanation\",\n'
685
- ' \"issues\": [\n'
 
686
  ' {\n'
687
- ' \"type\": \"format\" | \"brevity\" | \"constraint\" | \"consistency\" | \"directness\" | \"evidence\" | \"expert_influence\" | \"other\",\n'
688
- ' \"message\": \"what is wrong\",\n'
689
- ' \"owner\": \"Synthesizer\" | \"Planner\" | \"Research Analyst\" | \"<specialist role name>\"\n'
690
  ' }\n'
691
  ' ],\n'
692
- ' \"correction_instruction\": \"specific minimal fix\"\n'
693
  '}\n\n'
 
 
 
 
 
 
 
 
 
 
 
 
694
  "Validation rules:\n"
695
  "- Check that the output DIRECTLY answers the user's question.\n"
696
  "- Check that the output format matches what was requested (single choice, table, code, etc.).\n"
@@ -704,18 +740,19 @@ _QA_SYSTEM = (
704
  " * If multiple experts contributed, their relevant points are incorporated or consciously noted.\n"
705
  " * The answer is NOT just a paraphrase of planner text with no expert content.\n"
706
  " * FAIL with type 'expert_influence' if expert contributions were ignored.\n"
707
- "- FAIL if any of the above checks fail.\n"
708
- "- PASS only if ALL checks pass.\n"
 
709
  )
710
 
711
  _PLANNER_REVIEW_SYSTEM = (
712
  "You are the Planner reviewing QA feedback.\n"
713
  "CRITICAL RULE: QA results are BINDING.\n"
714
- "- If QA status is PASS: approve the result.\n"
715
  "- If QA status is FAIL: you MUST revise. You may NOT approve a FAIL result.\n"
716
  "- If this is the final revision (max reached) and QA still FAIL:\n"
717
  " you must directly fix the QA issues in your response before approving.\n\n"
718
- "If QA PASSED, respond with:\n"
719
  "DECISION: APPROVED\n"
720
  "FINAL ANSWER:\n<the approved output, reproduced in full>\n\n"
721
  "If QA FAILED and revisions remain, respond with:\n"
@@ -1098,7 +1135,10 @@ def _parse_qa_role_feedback(qa_text: str) -> Dict[str, str]:
1098
  # Each step receives the shared state and an append-only trace list,
1099
  # updates state in place, appends log lines, and returns updated state.
1100
 
1101
- def _step_plan(chat_model, state: WorkflowState, trace: List[str]) -> WorkflowState:
 
 
 
1102
  """Planner: analyse the task, produce a plan, decide which specialist to call."""
1103
  trace.append("\n╔══ [PLANNER] Analysing task... ══╗")
1104
  fmt = state.get("output_format", "other")
@@ -1114,7 +1154,8 @@ def _step_plan(chat_model, state: WorkflowState, trace: List[str]) -> WorkflowSt
1114
  f"\nPrevious QA report:\n{state['qa_report']}"
1115
  "\nAdjust the plan to address the QA issues."
1116
  )
1117
- plan_text = _llm_call(chat_model, _PLANNER_SYSTEM, content)
 
1118
  state["plan"] = plan_text
1119
  state["current_role"] = _decide_role(plan_text)
1120
  trace.append(plan_text)
@@ -1181,6 +1222,9 @@ def _step_qa(
1181
  """
1182
  trace.append("\n╔══ [QA TESTER] Reviewing output... ══╗")
1183
 
 
 
 
1184
  fmt = state.get("output_format", "other")
1185
  brevity = state.get("brevity_requirement", "normal")
1186
  format_rules = get_qa_format_instruction(fmt, brevity)
@@ -1248,8 +1292,12 @@ def _step_qa(
1248
  # Also extract legacy role feedback for backward compatibility
1249
  state["qa_role_feedback"] = _parse_qa_role_feedback(text)
1250
 
1251
- result_label = "✅ PASS" if state["qa_passed"] else "❌ FAIL"
 
 
1252
  trace.append(text)
 
 
1253
  if qa_result.issues:
1254
  issues_summary = "; ".join(
1255
  f"{i.owner}: {i.message[:60]}{'…' if len(i.message) > 60 else ''}"
@@ -2080,7 +2128,8 @@ def run_multi_role_workflow(
2080
 
2081
  try:
2082
  if planner_active:
2083
- state = _step_plan(chat_model, state, trace)
 
2084
 
2085
  # Parse shared task assumptions from planner output
2086
  assumptions = parse_task_assumptions(state["plan"])
@@ -2123,6 +2172,9 @@ def run_multi_role_workflow(
2123
  f"\n[ROLE SELECTION] {len(selected_roles)} specialist(s) selected: "
2124
  + ", ".join(AGENT_ROLES.get(k, k) for k in selected_roles)
2125
  )
 
 
 
2126
 
2127
  # Step 4: Run ALL selected specialists (initial run only)
2128
  if primary_role not in selected_roles:
@@ -2210,7 +2262,7 @@ def run_multi_role_workflow(
2210
  else:
2211
  state["qa_passed"] = True
2212
  state["qa_report"] = "QA Tester is disabled — skipping quality review."
2213
- state["qa_structured"] = {"status": "PASS", "reason": "", "issues": [], "correction_instruction": ""}
2214
  trace.append("\n[QA Tester disabled] Skipping quality review — auto-pass.")
2215
 
2216
  # Update planner state
 
19
  PlannerState, FailureRecord,
20
  select_relevant_roles, identify_revision_targets,
21
  compress_final_answer, strip_internal_noise,
22
+ postprocess_format_fixes,
23
  get_synthesizer_format_instruction, get_qa_format_instruction,
24
  validate_output_format, format_violations_instruction,
25
  parse_task_assumptions, format_assumptions_for_prompt,
 
616
 
617
  # --- Role system prompts ---
618
 
619
+ _PLANNER_SYSTEM_BASE = (
620
  "You are the Planner in a strict planner–specialist–synthesizer–QA workflow.\n"
621
  "Your ONLY job is to PLAN and DELEGATE. You do NOT write the answer.\n\n"
622
  "Your responsibilities:\n"
 
624
  "2. Decide which specialist to call as the PRIMARY lead.\n"
625
  " IMPORTANT: Select the FEWEST roles necessary. Do NOT call all roles.\n"
626
  " Available specialists:\n"
627
+ "{specialist_list}"
 
 
 
 
 
 
 
628
  "3. State clear success criteria.\n"
629
  "4. Identify the required output format and brevity level.\n"
630
  "5. Define shared assumptions that ALL specialists must use.\n"
 
636
  "- The specialists will create the content. The Synthesizer will combine it.\n"
637
  "- For simple questions, ONE specialist is enough.\n"
638
  "- Never call persona/gimmick roles unless the user explicitly asks for them.\n"
639
+ "- Only select from the specialists listed above — no others are available.\n"
640
  "- QA results are BINDING — if QA says FAIL, you MUST revise, never approve.\n\n"
641
  "Respond in this exact format:\n"
642
  "TASK BREAKDOWN:\n<subtask list — what needs to be addressed, NOT the answers>\n\n"
 
647
  "GUIDANCE FOR SPECIALIST:\n<delegation instructions — what to focus on, NOT answer content>"
648
  )
649
 
650
+
651
def _build_planner_system(enabled_role_keys: List[str]) -> str:
    """Render the planner system prompt for the roles that are actually enabled.

    Args:
        enabled_role_keys: Internal role keys (e.g. ``"technical"``) active for
            this run; their order is preserved in the rendered specialist list.

    Returns:
        The planner system prompt with the ``{specialist_list}`` placeholder
        filled in, or a "(no specialists enabled)" marker when the list is empty.
    """
    # Human-readable one-line description per known role key. Unknown keys
    # fall back to the quoted key itself so the prompt never breaks.
    role_descriptions = {
        "creative": "'Creative Expert' (ideas, framing, wording, brainstorming)",
        "technical": "'Technical Expert' (code, architecture, implementation)",
        "research": "'Research Analyst' (information gathering, literature review, fact-finding)",
        "security": "'Security Reviewer' (security analysis, vulnerability checks)",
        "data_analyst": "'Data Analyst' (data analysis, statistics, patterns)",
        "labour_union_rep": "'Labour Union Representative' (worker rights, fair wages)",
        "ux_designer": "'UX Designer' (user needs, usability, accessibility)",
        "lawyer": "'Lawyer' (legal compliance, liability, contracts)",
        "mad_professor": "'Mad Professor' (wild ideas, provocative perspectives)",
        "accountant": "'Accountant' (cost analysis, budgeting, financial review)",
        "artist": "'Artist' (aesthetic vision, creative expression)",
        "lazy_slacker": "'Lazy Slacker' (minimal effort, simple answers)",
        "black_metal_fundamentalist": "'Black Metal Fundamentalist' (nihilistic perspective)",
        "doris": "'Doris' (practical, no-nonsense perspective)",
        "chairman_of_board": "'Chairman of the Board' (corporate strategy, governance)",
        "maga_appointee": "'MAGA Appointee' (deregulation, America-first perspective)",
    }
    entries = [
        " - {}\n".format(role_descriptions.get(rk, "'" + rk + "'"))
        for rk in enabled_role_keys
    ]
    # "".join(...) is falsy when no entries were built, so the `or` picks
    # the explicit empty-list marker in that case.
    specialist_block = "".join(entries) or " - (no specialists enabled)\n"
    return _PLANNER_SYSTEM_BASE.format(specialist_list=specialist_block)
677
+
678
  _CREATIVE_SYSTEM = (
679
  "You are the Creative Expert in a multi-role AI workflow.\n"
680
  "You handle brainstorming, alternative ideas, framing, wording, and concept generation.\n"
 
703
  "output format requirements, brevity requirements, AND expert influence.\n\n"
704
  "You MUST respond with a JSON object in this exact structure:\n"
705
  '{\n'
706
+ ' "status": "PASS" or "PASS_WITH_WARNINGS" or "FAIL",\n'
707
+ ' "reason": "short explanation",\n'
708
+ ' "warnings": ["optional list of minor cosmetic or stylistic notes"],\n'
709
+ ' "issues": [\n'
710
  ' {\n'
711
+ ' "type": "format" | "brevity" | "constraint" | "consistency" | "directness" | "evidence" | "expert_influence" | "other",\n'
712
+ ' "message": "what is wrong",\n'
713
+ ' "owner": "Synthesizer" | "Planner" | "Research Analyst" | "<specialist role name>"\n'
714
  ' }\n'
715
  ' ],\n'
716
+ ' "correction_instruction": "specific minimal fix"\n'
717
  '}\n\n'
718
+ "STATUS LEVELS — use the right one:\n"
719
+ "- PASS: The answer is correct, complete, properly formatted, and meets all criteria.\n"
720
+ "- PASS_WITH_WARNINGS: The answer is substantively correct and usable, but has minor\n"
721
+ " cosmetic or stylistic issues (e.g. slightly verbose, could be tighter, minor formatting\n"
722
+ " quirks). List these in the 'warnings' array. Do NOT put them in 'issues'.\n"
723
+ "- FAIL: The answer has substantive problems — wrong content, missing key information,\n"
724
+ " wrong format, ignores the question, unsupported claims, or expert contributions ignored.\n"
725
+ " Only FAIL triggers a revision cycle.\n\n"
726
+ "FOCUS ON CONTENT, NOT COSMETICS:\n"
727
+ "- Minor bullet formatting, heading style, or whitespace are NOT reasons to FAIL.\n"
728
+ "- A slightly verbose answer that correctly addresses the question is PASS_WITH_WARNINGS, not FAIL.\n"
729
+ "- Reserve FAIL for answers that are genuinely wrong, incomplete, or miss the point.\n\n"
730
  "Validation rules:\n"
731
  "- Check that the output DIRECTLY answers the user's question.\n"
732
  "- Check that the output format matches what was requested (single choice, table, code, etc.).\n"
 
740
  " * If multiple experts contributed, their relevant points are incorporated or consciously noted.\n"
741
  " * The answer is NOT just a paraphrase of planner text with no expert content.\n"
742
  " * FAIL with type 'expert_influence' if expert contributions were ignored.\n"
743
+ "- FAIL if any substantive check fails.\n"
744
+ "- PASS_WITH_WARNINGS if content is good but minor polish is needed.\n"
745
+ "- PASS only if ALL checks pass with no issues at all.\n"
746
  )
747
 
748
  _PLANNER_REVIEW_SYSTEM = (
749
  "You are the Planner reviewing QA feedback.\n"
750
  "CRITICAL RULE: QA results are BINDING.\n"
751
+ "- If QA status is PASS or PASS_WITH_WARNINGS: approve the result.\n"
752
  "- If QA status is FAIL: you MUST revise. You may NOT approve a FAIL result.\n"
753
  "- If this is the final revision (max reached) and QA still FAIL:\n"
754
  " you must directly fix the QA issues in your response before approving.\n\n"
755
+ "If QA PASSED (or PASS_WITH_WARNINGS), respond with:\n"
756
  "DECISION: APPROVED\n"
757
  "FINAL ANSWER:\n<the approved output, reproduced in full>\n\n"
758
  "If QA FAILED and revisions remain, respond with:\n"
 
1135
  # Each step receives the shared state and an append-only trace list,
1136
  # updates state in place, appends log lines, and returns updated state.
1137
 
1138
+ def _step_plan(
1139
+ chat_model, state: WorkflowState, trace: List[str],
1140
+ enabled_role_keys: Optional[List[str]] = None,
1141
+ ) -> WorkflowState:
1142
  """Planner: analyse the task, produce a plan, decide which specialist to call."""
1143
  trace.append("\n╔══ [PLANNER] Analysing task... ══╗")
1144
  fmt = state.get("output_format", "other")
 
1154
  f"\nPrevious QA report:\n{state['qa_report']}"
1155
  "\nAdjust the plan to address the QA issues."
1156
  )
1157
+ planner_system = _build_planner_system(enabled_role_keys or [])
1158
+ plan_text = _llm_call(chat_model, planner_system, content)
1159
  state["plan"] = plan_text
1160
  state["current_role"] = _decide_role(plan_text)
1161
  trace.append(plan_text)
 
1222
  """
1223
  trace.append("\n╔══ [QA TESTER] Reviewing output... ══╗")
1224
 
1225
+ # Apply post-processing format fixes before QA evaluation
1226
+ state["draft_output"] = postprocess_format_fixes(state["draft_output"])
1227
+
1228
  fmt = state.get("output_format", "other")
1229
  brevity = state.get("brevity_requirement", "normal")
1230
  format_rules = get_qa_format_instruction(fmt, brevity)
 
1292
  # Also extract legacy role feedback for backward compatibility
1293
  state["qa_role_feedback"] = _parse_qa_role_feedback(text)
1294
 
1295
+ result_label = ("✅ PASS" if qa_result.status == "PASS"
1296
+ else "⚠ PASS_WITH_WARNINGS" if qa_result.passed_with_warnings
1297
+ else "❌ FAIL")
1298
  trace.append(text)
1299
+ if qa_result.warnings:
1300
+ trace.append(f" ⚠ QA warnings: {'; '.join(qa_result.warnings)}")
1301
  if qa_result.issues:
1302
  issues_summary = "; ".join(
1303
  f"{i.owner}: {i.message[:60]}{'…' if len(i.message) > 60 else ''}"
 
2128
 
2129
  try:
2130
  if planner_active:
2131
+ state = _step_plan(chat_model, state, trace,
2132
+ enabled_role_keys=active_specialist_keys)
2133
 
2134
  # Parse shared task assumptions from planner output
2135
  assumptions = parse_task_assumptions(state["plan"])
 
2172
  f"\n[ROLE SELECTION] {len(selected_roles)} specialist(s) selected: "
2173
  + ", ".join(AGENT_ROLES.get(k, k) for k in selected_roles)
2174
  )
2175
+ # Append detailed scoring trace when available
2176
+ if hasattr(selected_roles, 'format_trace'):
2177
+ trace.append(selected_roles.format_trace(AGENT_ROLES))
2178
 
2179
  # Step 4: Run ALL selected specialists (initial run only)
2180
  if primary_role not in selected_roles:
 
2262
  else:
2263
  state["qa_passed"] = True
2264
  state["qa_report"] = "QA Tester is disabled — skipping quality review."
2265
+ state["qa_structured"] = {"status": "PASS", "reason": "", "issues": [], "warnings": [], "correction_instruction": ""}
2266
  trace.append("\n[QA Tester disabled] Skipping quality review — auto-pass.")
2267
 
2268
  # Update planner state
test_workflow.py CHANGED
@@ -31,6 +31,7 @@ from workflow_helpers import (
31
  identify_revision_targets,
32
  strip_internal_noise,
33
  compress_final_answer,
 
34
  PlannerState,
35
  FailureRecord,
36
  get_synthesizer_format_instruction,
@@ -45,6 +46,10 @@ from workflow_helpers import (
45
  format_contributions_for_qa,
46
  parse_used_contributions,
47
  check_expert_influence,
 
 
 
 
48
  )
49
  from evidence import (
50
  EvidenceItem,
@@ -244,6 +249,167 @@ class TestStructuredQAParsing(unittest.TestCase):
244
  self.assertEqual(d["status"], "FAIL")
245
  self.assertEqual(len(d["issues"]), 1)
246
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
247
 
248
  # ============================================================
249
  # Test: Role Selection
 
31
  identify_revision_targets,
32
  strip_internal_noise,
33
  compress_final_answer,
34
+ postprocess_format_fixes,
35
  PlannerState,
36
  FailureRecord,
37
  get_synthesizer_format_instruction,
 
46
  format_contributions_for_qa,
47
  parse_used_contributions,
48
  check_expert_influence,
49
+ extract_task_features,
50
+ ROLE_CAPABILITIES,
51
+ RoleScore,
52
+ score_roles,
53
  )
54
  from evidence import (
55
  EvidenceItem,
 
249
  self.assertEqual(d["status"], "FAIL")
250
  self.assertEqual(len(d["issues"]), 1)
251
 
252
+ def test_parse_json_pass_with_warnings(self):
253
+ """PASS_WITH_WARNINGS is parsed correctly from JSON."""
254
+ qa_text = json.dumps({
255
+ "status": "PASS_WITH_WARNINGS",
256
+ "reason": "Content is good but slightly verbose",
257
+ "warnings": ["Could be more concise", "Minor formatting quirk"],
258
+ "issues": [],
259
+ "correction_instruction": ""
260
+ })
261
+ result = parse_structured_qa(qa_text)
262
+ self.assertTrue(result.passed)
263
+ self.assertTrue(result.passed_with_warnings)
264
+ self.assertEqual(result.status, "PASS_WITH_WARNINGS")
265
+ self.assertEqual(len(result.warnings), 2)
266
+ self.assertIn("Could be more concise", result.warnings)
267
+ self.assertEqual(len(result.issues), 0)
268
+
269
+ def test_parse_legacy_pass_with_warnings(self):
270
+ """PASS_WITH_WARNINGS is recognized in legacy text format."""
271
+ qa_text = (
272
+ "REQUIREMENTS CHECKED:\n- All met\n\n"
273
+ "ISSUES FOUND:\nNone\n\n"
274
+ "RESULT: PASS_WITH_WARNINGS\n\n"
275
+ "RECOMMENDED FIXES:\nNone"
276
+ )
277
+ result = parse_structured_qa(qa_text)
278
+ self.assertTrue(result.passed)
279
+ self.assertTrue(result.passed_with_warnings)
280
+
281
+ def test_pass_with_warnings_to_dict_includes_warnings(self):
282
+ result = QAResult(
283
+ status="PASS_WITH_WARNINGS",
284
+ reason="Minor issues",
285
+ warnings=["Slightly verbose"],
286
+ issues=[],
287
+ )
288
+ d = result.to_dict()
289
+ self.assertEqual(d["status"], "PASS_WITH_WARNINGS")
290
+ self.assertEqual(d["warnings"], ["Slightly verbose"])
291
+
292
+ def test_pass_is_not_passed_with_warnings(self):
293
+ result = QAResult(status="PASS")
294
+ self.assertTrue(result.passed)
295
+ self.assertFalse(result.passed_with_warnings)
296
+
297
+ def test_fail_is_not_passed(self):
298
+ result = QAResult(status="FAIL")
299
+ self.assertFalse(result.passed)
300
+ self.assertFalse(result.passed_with_warnings)
301
+
302
+
303
+ # ============================================================
304
+ # Test: Post-Processing Format Fixes
305
+ # ============================================================
306
+
307
class TestPostprocessFormatFixes(unittest.TestCase):
    """Deterministic text clean-ups applied to drafts before QA review."""

    def test_removes_markdown_headings(self):
        """Heading markers are stripped while the surrounding prose survives."""
        raw = "# Heading\nSome content.\n## Subheading\nMore content."
        cleaned = postprocess_format_fixes(raw)
        self.assertNotIn("#", cleaned)
        self.assertIn("Some content.", cleaned)
        self.assertIn("More content.", cleaned)

    def test_converts_bullets_to_sentences(self):
        """Dash bullets become sentence-style lines ending with a period."""
        raw = "- First point\n- Second point\n- Third point"
        cleaned = postprocess_format_fixes(raw)
        self.assertNotIn("- ", cleaned)
        self.assertIn("First point.", cleaned)
        self.assertIn("Second point.", cleaned)

    def test_collapses_blank_lines(self):
        """Runs of blank lines are reduced; the content lines are kept."""
        raw = "Line 1\n\n\n\n\nLine 2"
        cleaned = postprocess_format_fixes(raw)
        self.assertNotIn("\n\n\n", cleaned)
        self.assertIn("Line 1", cleaned)
        self.assertIn("Line 2", cleaned)

    def test_removes_json_traces(self):
        """Stray QA JSON fragments embedded in prose are removed."""
        raw = 'Answer here. {"status": "PASS", "reason": "ok"} End.'
        cleaned = postprocess_format_fixes(raw)
        self.assertNotIn('"status"', cleaned)
        self.assertIn("Answer here.", cleaned)

    def test_preserves_clean_text(self):
        """Already-clean text passes through unchanged."""
        raw = "This is already clean text with no issues."
        self.assertEqual(postprocess_format_fixes(raw), raw)
340
+
341
+
342
+ # ============================================================
343
+ # Test: Task Feature Extraction and Role Scoring
344
+ # ============================================================
345
+
346
class TestTaskFeatureExtraction(unittest.TestCase):
    """Feature tags derived from a request string and optional task category."""

    def test_extract_coding_features(self):
        """A coding request tags the 'technical' feature."""
        feats = extract_task_features("write Python code to parse CSV", "coding_task")
        self.assertIn("technical", feats)

    def test_extract_design_features(self):
        """UI wording maps onto the 'design' feature."""
        feats = extract_task_features("design a user interface for the dashboard")
        self.assertIn("design", feats)

    def test_extract_research_features(self):
        """History/background questions map onto 'research'."""
        feats = extract_task_features("research the history of quantum computing")
        self.assertIn("research", feats)

    def test_extract_multiple_features(self):
        """A mixed request yields every matching feature tag."""
        feats = extract_task_features(
            "analyze data trends and compare security vulnerabilities"
        )
        for expected in ("analysis", "data", "security", "comparison"):
            self.assertIn(expected, feats)

    def test_category_adds_implied_features(self):
        """The task category alone can imply features (factual -> research)."""
        feats = extract_task_features("do something", "factual_question")
        self.assertIn("research", feats)
370
+
371
+
372
class TestRoleScoring(unittest.TestCase):
    """Transparent scoring of specialist roles against extracted task features."""

    def setUp(self):
        # Score across all core + persona role keys, under a strict config
        # that disallows persona roles and caps selection at three specialists.
        self.all_roles = list(WorkflowConfig.CORE_ROLE_KEYS) + list(WorkflowConfig.PERSONA_ROLE_KEYS)
        self.config = WorkflowConfig(strict_mode=True, allow_persona_roles=False, max_specialists_per_task=3)

    def test_scores_are_populated(self):
        """A coding task produces scores, and 'technical' scores above zero."""
        features = extract_task_features("write Python code to build an API", "coding_task")
        scores = score_roles(features, self.all_roles, self.config, "coding_task")
        # assertGreater/assertEqual report the offending values on failure,
        # unlike assertTrue wrapped around a bare comparison.
        self.assertGreater(len(scores), 0)
        technical_scores = [s for s in scores if s.role_key == "technical"]
        self.assertEqual(len(technical_scores), 1)
        self.assertGreater(technical_scores[0].score, 0)

    def test_persona_roles_filtered(self):
        """With allow_persona_roles=False, every persona entry carries a filter reason."""
        features = extract_task_features("budget analysis")
        scores = score_roles(features, self.all_roles, self.config)
        persona_scores = [s for s in scores if s.is_persona]
        for ps in persona_scores:
            self.assertTrue(ps.filtered_reason)

    def test_persona_roles_allowed(self):
        """With allow_persona_roles=True, the accountant is scored unfiltered."""
        config = WorkflowConfig(strict_mode=True, allow_persona_roles=True, max_specialists_per_task=5)
        features = extract_task_features("budget analysis")
        scores = score_roles(features, self.all_roles, config)
        accountant = [s for s in scores if s.role_key == "accountant"]
        self.assertEqual(len(accountant), 1)
        self.assertEqual(accountant[0].filtered_reason, "")

    def test_selection_result_format_trace(self):
        """select_relevant_roles returns a result exposing a readable scoring trace."""
        config = WorkflowConfig(strict_mode=True, allow_persona_roles=False, max_specialists_per_task=3)
        result = select_relevant_roles(
            "write Python code", self.all_roles, config, task_category="coding_task"
        )
        self.assertTrue(hasattr(result, "format_trace"))
        trace = result.format_trace({"technical": "Technical Expert"})
        for marker in ("ROLE SCORING", "Task features:", "SELECTED"):
            self.assertIn(marker, trace)
412
+
413
 
414
  # ============================================================
415
  # Test: Role Selection
workflow_helpers.py CHANGED
@@ -234,14 +234,19 @@ class QAIssue:
234
 
235
  @dataclass
236
  class QAResult:
237
- status: str # "PASS" | "FAIL"
238
  reason: str = ""
239
  issues: List[QAIssue] = field(default_factory=list)
 
240
  correction_instruction: str = ""
241
 
242
  @property
243
  def passed(self) -> bool:
244
- return self.status == "PASS"
 
 
 
 
245
 
246
  def owners(self) -> List[str]:
247
  """Return unique owner labels from issues."""
@@ -255,6 +260,7 @@ class QAResult:
255
  {"type": i.type, "message": i.message, "owner": i.owner}
256
  for i in self.issues
257
  ],
 
258
  "correction_instruction": self.correction_instruction,
259
  }
260
 
@@ -293,6 +299,7 @@ def parse_structured_qa(qa_text: str) -> QAResult:
293
  status=data.get("status", "FAIL"),
294
  reason=data.get("reason", ""),
295
  issues=issues,
 
296
  correction_instruction=data.get("correction_instruction", ""),
297
  )
298
  except (json.JSONDecodeError, KeyError):
@@ -301,7 +308,9 @@ def parse_structured_qa(qa_text: str) -> QAResult:
301
  # Fallback: parse from legacy text format
302
  status = "FAIL"
303
  lower = qa_text.lower()
304
- if "result: pass" in lower:
 
 
305
  status = "PASS"
306
 
307
  reason = ""
@@ -491,72 +500,188 @@ ROLE_RELEVANCE: Dict[str, Dict[str, Any]] = {
491
  }
492
 
493
 
494
- def select_relevant_roles(
495
- user_request: str,
496
- active_role_keys: List[str],
497
- config: WorkflowConfig,
498
- task_category: str = "other",
499
- ) -> List[str]:
500
- """Select only the most relevant specialist roles for a given request.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
501
 
502
- Scores each active role by keyword match frequency and task-category affinity,
503
- filters persona roles based on config, and returns at most
504
- config.max_specialists_per_task roles.
505
 
506
- If config.always_include_research_for_factual_tasks is True and the task
507
- is factual, the research role is always included.
508
  """
509
  lower = user_request.lower()
510
- scored: List[Tuple[int, str]] = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
511
 
512
- for role_key in active_role_keys:
513
- meta = ROLE_RELEVANCE.get(role_key)
514
- if not meta:
515
- continue
516
 
517
- # Skip persona roles unless config allows them
518
- if meta.get("is_persona") and not config.allow_persona_roles:
519
- continue
520
 
521
- score = 0
522
- for kw in meta.get("keywords", []):
523
- if kw.lower() in lower:
524
- score += 1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
525
 
526
- # Domain affinity boost if the request touches a role's domain
527
- for domain in meta.get("domains", []):
528
- if domain.lower() in lower:
529
- score += 1
 
 
 
 
 
 
 
 
 
 
 
530
 
531
- # Task-category affinity bonus
532
  role_tasks = meta.get("task_types", [])
533
  if task_category in role_tasks:
534
  score += 2
535
 
536
- scored.append((score, role_key))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
537
 
538
- # Sort by score descending; keep deterministic order for ties
539
- scored.sort(key=lambda x: (-x[0], active_role_keys.index(x[1])))
 
540
 
541
- # Always include at least one role
542
- selected = []
543
- for score, role_key in scored:
 
 
 
 
 
 
 
 
 
544
  if len(selected) >= config.max_specialists_per_task:
545
  break
546
- # In strict mode, only include roles with score > 0 (except if we have none)
547
- if config.strict_mode and score == 0 and selected:
548
  continue
549
- selected.append(role_key)
550
 
551
  # Ensure at least one specialist is selected
552
- if not selected and scored:
553
- selected.append(scored[0][1])
554
 
555
- # Fallback: if no roles matched at all, use the first available core role
556
  if not selected:
557
- for rk in active_role_keys:
558
- meta = ROLE_RELEVANCE.get(rk, {})
559
- if not meta.get("is_persona"):
560
  selected.append(rk)
561
  break
562
 
@@ -567,7 +692,41 @@ def select_relevant_roles(
567
  and "research" not in selected):
568
  selected.append("research")
569
 
570
- return selected
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
571
 
572
 
573
  # ============================================================
@@ -717,6 +876,44 @@ def compress_final_answer(
717
  return answer
718
 
719
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
720
  # ============================================================
721
  # Planner State
722
  # ============================================================
 
234
 
235
  @dataclass
236
  class QAResult:
237
+ status: str # "PASS" | "PASS_WITH_WARNINGS" | "FAIL"
238
  reason: str = ""
239
  issues: List[QAIssue] = field(default_factory=list)
240
+ warnings: List[str] = field(default_factory=list)
241
  correction_instruction: str = ""
242
 
243
  @property
244
  def passed(self) -> bool:
245
+ return self.status in ("PASS", "PASS_WITH_WARNINGS")
246
+
247
+ @property
248
+ def passed_with_warnings(self) -> bool:
249
+ return self.status == "PASS_WITH_WARNINGS"
250
 
251
  def owners(self) -> List[str]:
252
  """Return unique owner labels from issues."""
 
260
  {"type": i.type, "message": i.message, "owner": i.owner}
261
  for i in self.issues
262
  ],
263
+ "warnings": self.warnings,
264
  "correction_instruction": self.correction_instruction,
265
  }
266
 
 
299
  status=data.get("status", "FAIL"),
300
  reason=data.get("reason", ""),
301
  issues=issues,
302
+ warnings=[str(w) for w in data.get("warnings", [])],
303
  correction_instruction=data.get("correction_instruction", ""),
304
  )
305
  except (json.JSONDecodeError, KeyError):
 
308
  # Fallback: parse from legacy text format
309
  status = "FAIL"
310
  lower = qa_text.lower()
311
+ if "result: pass_with_warnings" in lower:
312
+ status = "PASS_WITH_WARNINGS"
313
+ elif "result: pass" in lower:
314
  status = "PASS"
315
 
316
  reason = ""
 
500
  }
501
 
502
 
503
+ # ============================================================
504
+ # Role Capability Metadata and Task Feature Extraction
505
+ # ============================================================
506
+
507
+ # Simple capability tags per role — used for transparent scoring
508
+ ROLE_CAPABILITIES: Dict[str, List[str]] = {
509
+ "creative": ["creative", "design", "ideas", "writing", "brainstorm", "opinion"],
510
+ "technical": ["technical", "analysis", "engineering", "calculation", "code", "implementation"],
511
+ "research": ["research", "facts", "evidence", "information", "comparison", "history"],
512
+ "security": ["risk", "safety", "compliance", "security", "vulnerability"],
513
+ "data_analyst": ["data", "statistics", "analysis", "metrics", "patterns"],
514
+ "labour_union_rep": ["labor", "policy", "workplace", "rights", "fairness"],
515
+ "ux_designer": ["design", "usability", "interface", "user_experience", "accessibility"],
516
+ "lawyer": ["legal", "compliance", "contracts", "liability", "regulation"],
517
+ "mad_professor": ["persona", "speculation", "radical", "humor"],
518
+ "accountant": ["persona", "cost", "budget", "financial"],
519
+ "artist": ["persona", "creative", "aesthetic", "vision"],
520
+ "lazy_slacker": ["persona", "simple_answer", "minimal"],
521
+ "black_metal_fundamentalist": ["persona", "stylistic", "humor", "music"],
522
+ "doris": ["persona", "humor"],
523
+ "chairman_of_board": ["persona", "strategy", "corporate", "governance"],
524
+ "maga_appointee": ["persona", "political", "deregulation"],
525
+ }
526
+
527
+ # Keywords in the user request that map to task features
528
+ TASK_FEATURE_KEYWORDS: Dict[str, List[str]] = {
529
+ "analysis": ["analy", "evaluate", "assess", "review", "examine", "investigate"],
530
+ "creative": ["creative", "brainstorm", "ideas", "imagine", "invent", "story", "write a"],
531
+ "design": ["design", "wireframe", "prototype", "layout", "visual", "ui", "ux", "interface", "usability",
532
+ "user experience", "accessibility", "login page", "user interface"],
533
+ "technical": ["code", "implement", "build", "architecture", "api", "debug", "software", "program",
534
+ "algorithm", "system", "deploy", "python", "javascript", "rust", "java", "react",
535
+ "framework", "performance"],
536
+ "research": ["research", "study", "evidence", "literature", "paper", "facts", "history",
537
+ "information", "find out", "look up"],
538
+ "policy": ["policy", "regulation", "law", "compliance", "legal", "rights", "labor", "labour",
539
+ "union", "worker", "employment", "workplace"],
540
+ "simple_answer": ["yes or no", "pick one", "choose", "which one", "red or blue", "agree on one"],
541
+ "opinion": ["opinion", "perspective", "viewpoint", "discuss", "debate", "pros and cons",
542
+ "should i", "what do you think", "agree", "disagree"],
543
+ "comparison": ["compare", "comparison", "versus", "vs", "difference", "better"],
544
+ "data": ["data", "statistics", "metric", "trend", "pattern", "chart", "dashboard", "csv",
545
+ "spreadsheet", "dataset"],
546
+ "security": ["security", "vulnerability", "attack", "encryption", "password", "exploit",
547
+ "firewall", "gdpr", "privacy"],
548
+ "cost": ["cost", "budget", "expense", "cheap", "price", "financial", "roi"],
549
+ "humor": ["funny", "joke", "humorous", "kvlt", "metal", "nihil"],
550
+ "music": ["music", "metal", "band", "song", "album", "guitar"],
551
+ }
552
+
553
+ # Generalist fallback roles used when no capability matches
554
+ _GENERALIST_ROLES = ("creative", "technical", "research")
555
+
556
 
557
+ def extract_task_features(user_request: str, task_category: str = "other") -> List[str]:
558
+ """Derive task features from the user request and task category.
 
559
 
560
+ Returns a deduplicated list of feature tags like ["design", "opinion"].
 
561
  """
562
  lower = user_request.lower()
563
+ features: List[str] = []
564
+
565
+ for feature, keywords in TASK_FEATURE_KEYWORDS.items():
566
+ for kw in keywords:
567
+ if kw in lower:
568
+ features.append(feature)
569
+ break # one match per feature is enough
570
+
571
+ # Add features implied by the task category
572
+ category_features: Dict[str, List[str]] = {
573
+ "coding_task": ["technical", "code"],
574
+ "creative_writing": ["creative"],
575
+ "factual_question": ["research"],
576
+ "comparison": ["comparison", "research"],
577
+ "analysis": ["analysis"],
578
+ "summarization": ["research"],
579
+ "opinion_discussion": ["opinion"],
580
+ "planning": ["analysis"],
581
+ }
582
+ for f in category_features.get(task_category, []):
583
+ if f not in features:
584
+ features.append(f)
585
 
586
+ return features
 
 
 
587
 
 
 
 
588
 
589
+ @dataclass
590
+ class RoleScore:
591
+ """Scoring details for a single role — used for transparent logging."""
592
+ role_key: str
593
+ role_label: str
594
+ score: int
595
+ matched_capabilities: List[str]
596
+ is_persona: bool
597
+ filtered_reason: str = "" # why it was excluded, if any
598
+
599
+
600
+ def score_roles(
601
+ task_features: List[str],
602
+ active_role_keys: List[str],
603
+ config: WorkflowConfig,
604
+ task_category: str = "other",
605
+ ) -> List[RoleScore]:
606
+ """Score each active role by capability overlap with task features.
607
 
608
+ Returns all RoleScore objects (including filtered ones) for transparency.
609
+ """
610
+ feature_set = set(task_features)
611
+ results: List[RoleScore] = []
612
+
613
+ # Import here to avoid circular — role labels come from the caller
614
+ for role_key in active_role_keys:
615
+ capabilities = ROLE_CAPABILITIES.get(role_key, [])
616
+ meta = ROLE_RELEVANCE.get(role_key, {})
617
+ is_persona = meta.get("is_persona", False)
618
+ role_label = meta.get("description", role_key)
619
+
620
+ # Capability overlap score
621
+ matched = [cap for cap in capabilities if cap in feature_set]
622
+ score = len(matched)
623
 
624
+ # Task-category affinity bonus (from ROLE_RELEVANCE)
625
  role_tasks = meta.get("task_types", [])
626
  if task_category in role_tasks:
627
  score += 2
628
 
629
+ rs = RoleScore(
630
+ role_key=role_key,
631
+ role_label=role_label,
632
+ score=score,
633
+ matched_capabilities=matched,
634
+ is_persona=is_persona,
635
+ )
636
+
637
+ # Filter personas unless allowed
638
+ if is_persona and not config.allow_persona_roles:
639
+ rs.filtered_reason = "persona role not allowed"
640
+
641
+ results.append(rs)
642
+
643
+ return results
644
+
645
+
646
+ def select_relevant_roles(
647
+ user_request: str,
648
+ active_role_keys: List[str],
649
+ config: WorkflowConfig,
650
+ task_category: str = "other",
651
+ ) -> List[str]:
652
+ """Select the most relevant specialist roles for a given request.
653
 
654
+ Uses capability-based scoring: extracts task features from the request,
655
+ scores each active role by capability overlap, and returns the top roles
656
+ up to ``config.max_specialists_per_task``.
657
 
658
+ Returns a ``_SelectionResult`` (list subclass) so callers can also access
659
+ ``.scoring_info`` for transparent trace logging.
660
+ """
661
+ task_features = extract_task_features(user_request, task_category)
662
+ role_scores = score_roles(task_features, active_role_keys, config, task_category)
663
+
664
+ # Separate eligible from filtered
665
+ eligible = [rs for rs in role_scores if not rs.filtered_reason]
666
+ eligible.sort(key=lambda rs: (-rs.score, active_role_keys.index(rs.role_key)))
667
+
668
+ selected: List[str] = []
669
+ for rs in eligible:
670
  if len(selected) >= config.max_specialists_per_task:
671
  break
672
+ # In strict mode, only include roles with score > 0 (unless we have none yet)
673
+ if config.strict_mode and rs.score == 0 and selected:
674
  continue
675
+ selected.append(rs.role_key)
676
 
677
  # Ensure at least one specialist is selected
678
+ if not selected and eligible:
679
+ selected.append(eligible[0].role_key)
680
 
681
+ # Generalist fallback when nothing matched
682
  if not selected:
683
+ for rk in _GENERALIST_ROLES:
684
+ if rk in active_role_keys:
 
685
  selected.append(rk)
686
  break
687
 
 
692
  and "research" not in selected):
693
  selected.append("research")
694
 
695
+ return _SelectionResult(selected, role_scores, task_features)
696
+
697
+
698
+ class _SelectionResult(list):
699
+ """A list of role keys with attached scoring metadata.
700
+
701
+ Behaves exactly like a ``list[str]`` so existing code continues to work,
702
+ but also carries ``scoring_info`` and ``task_features`` for trace logging.
703
+ """
704
+
705
+ def __init__(
706
+ self,
707
+ selected: List[str],
708
+ scoring_info: List[RoleScore],
709
+ task_features: List[str],
710
+ ):
711
+ super().__init__(selected)
712
+ self.scoring_info = scoring_info
713
+ self.task_features = task_features
714
+
715
+ def format_trace(self, role_labels: Optional[Dict[str, str]] = None) -> str:
716
+ """Return a human-readable ROLE SCORING trace block."""
717
+ lines = ["── ROLE SCORING ──"]
718
+ lines.append(f"Task features: {self.task_features}")
719
+ for rs in sorted(self.scoring_info, key=lambda r: -r.score):
720
+ label = (role_labels or {}).get(rs.role_key, rs.role_key)
721
+ status = "SELECTED" if rs.role_key in self else "skipped"
722
+ if rs.filtered_reason:
723
+ status = f"FILTERED ({rs.filtered_reason})"
724
+ caps = ", ".join(rs.matched_capabilities) if rs.matched_capabilities else "none"
725
+ lines.append(
726
+ f" {label}: score={rs.score} caps=[{caps}] → {status}"
727
+ )
728
+ lines.append("──────────────────")
729
+ return "\n".join(lines)
730
 
731
 
732
  # ============================================================
 
876
  return answer
877
 
878
 
879
+ def postprocess_format_fixes(text: str) -> str:
880
+ """Apply lightweight format fixes before QA evaluation.
881
+
882
+ Converts common formatting artefacts so QA can focus on content quality
883
+ rather than failing for cosmetic issues.
884
+ """
885
+ # Remove markdown headings (# / ## / ###)
886
+ text = re.sub(r'^#{1,4}\s+', '', text, flags=re.MULTILINE)
887
+
888
+ # Convert bullet-list lines to flowing sentences
889
+ def _bullets_to_sentences(m: re.Match) -> str:
890
+ lines = m.group(0).strip().splitlines()
891
+ sentences = []
892
+ for line in lines:
893
+ cleaned = re.sub(r'^\s*[-•*]\s+', '', line).strip()
894
+ if cleaned:
895
+ # Ensure it ends with a full stop
896
+ if cleaned[-1] not in '.!?':
897
+ cleaned += '.'
898
+ sentences.append(cleaned)
899
+ return ' '.join(sentences)
900
+
901
+ text = re.sub(
902
+ r'(?:^\s*[-•*]\s+.+\n?){2,}',
903
+ _bullets_to_sentences,
904
+ text,
905
+ flags=re.MULTILINE,
906
+ )
907
+
908
+ # Collapse runs of 3+ blank lines into 2
909
+ text = re.sub(r'\n{3,}', '\n\n', text)
910
+
911
+ # Remove leftover JSON-like traces (e.g. {"status": ...} blocks)
912
+ text = re.sub(r'\{[^{}]*"status"\s*:[^{}]*\}', '', text)
913
+
914
+ return text.strip()
915
+
916
+
917
  # ============================================================
918
  # Planner State
919
  # ============================================================