cesjavi committed on
Commit
aeb2234
·
1 Parent(s): ffac2f3

Harden report quality checks and fix Space version loading

Browse files
Dockerfile CHANGED
@@ -26,6 +26,7 @@ ENV PYTHONUNBUFFERED=1
26
 
27
  WORKDIR /app
28
 
 
29
  COPY backend/requirements.txt backend/requirements.txt
30
  RUN pip install --no-cache-dir -r backend/requirements.txt
31
 
 
26
 
27
  WORKDIR /app
28
 
29
+ COPY VERSION VERSION
30
  COPY backend/requirements.txt backend/requirements.txt
31
  RUN pip install --no-cache-dir -r backend/requirements.txt
32
 
backend/main.py CHANGED
@@ -7,10 +7,20 @@ from pathlib import Path
7
  from dotenv import load_dotenv
8
  import sentry_sdk
9
 
 
 
 
 
 
 
 
 
 
 
10
  # Load environment variables
11
  load_dotenv()
12
  FRONTEND_DIST = Path(__file__).resolve().parent.parent / "frontend" / "dist"
13
- APP_VERSION = (Path(__file__).resolve().parent.parent / "VERSION").read_text(encoding="utf-8").strip()
14
 
15
  # Sentry Initialization
16
  SENTRY_DSN = os.getenv("SENTRY_DSN")
 
7
  from dotenv import load_dotenv
8
  import sentry_sdk
9
 
10
+
11
+ def _load_app_version() -> str:
12
+ version_file = Path(__file__).resolve().parent.parent / "VERSION"
13
+ if version_file.exists():
14
+ value = version_file.read_text(encoding="utf-8").strip()
15
+ if value:
16
+ return value
17
+ return os.getenv("APP_VERSION", "0.7.0")
18
+
19
+
20
  # Load environment variables
21
  load_dotenv()
22
  FRONTEND_DIST = Path(__file__).resolve().parent.parent / "frontend" / "dist"
23
+ APP_VERSION = _load_app_version()
24
 
25
  # Sentry Initialization
26
  SENTRY_DSN = os.getenv("SENTRY_DSN")
backend/routers/agent_runner.py CHANGED
@@ -1,6 +1,7 @@
1
  from fastapi import APIRouter, HTTPException, BackgroundTasks
2
  from services.supabase_service import supabase
3
  from services.agent_runner_service import AgentRunnerService
 
4
  import logging
5
 
6
  router = APIRouter()
@@ -10,10 +11,15 @@ logger = logging.getLogger("uvicorn")
10
  def _assert_task_quality(task: dict):
11
  output_data = task.get("output_data") or {}
12
  if not isinstance(output_data, dict):
13
- return
 
 
 
 
 
14
  quality_review = output_data.get("quality_review")
15
  if not quality_review:
16
- return
17
  if quality_review.get("approved"):
18
  return
19
  reasons = quality_review.get("fail_reasons") or ["Task output failed quality validation."]
 
1
  from fastapi import APIRouter, HTTPException, BackgroundTasks
2
  from services.supabase_service import supabase
3
  from services.agent_runner_service import AgentRunnerService
4
+ from services.output_quality import report_text_from_output
5
  import logging
6
 
7
  router = APIRouter()
 
11
  def _assert_task_quality(task: dict):
12
  output_data = task.get("output_data") or {}
13
  if not isinstance(output_data, dict):
14
+ raise HTTPException(status_code=400, detail="Task output is missing or malformed.")
15
+ if output_data.get("error"):
16
+ raise HTTPException(status_code=400, detail=f"Task execution failed: {output_data['error']}")
17
+ rendered = report_text_from_output(output_data).strip()
18
+ if not rendered or rendered in ("{}", "[]"):
19
+ raise HTTPException(status_code=400, detail="Task has no usable output to approve.")
20
  quality_review = output_data.get("quality_review")
21
  if not quality_review:
22
+ raise HTTPException(status_code=400, detail="Task output is missing quality validation.")
23
  if quality_review.get("approved"):
24
  return
25
  reasons = quality_review.get("fail_reasons") or ["Task output failed quality validation."]
backend/services/agent_runner_service.py CHANGED
@@ -50,6 +50,18 @@ class AgentRunnerService:
50
  .execute()
51
  context = context_res.data if context_res.data else []
52
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  extra_context = ""
54
  if include_semantic_context:
55
  extra_context = await semantic_backprop.get_project_context(project_id, task_id)
@@ -99,7 +111,7 @@ class AgentRunnerService:
99
  # 4. Execute Run with timing
100
  start_time = time.time()
101
  task_instructions = task.get("description") or task["title"]
102
- task_instructions = f"{task_instructions}\n\n{build_quality_instructions(task)}"
103
  result = await agent.run(task_instructions, context, extra_context=extra_context)
104
  duration = time.time() - start_time
105
 
@@ -115,7 +127,7 @@ class AgentRunnerService:
115
  result["security_warning"] = f"Output sanitized: suspicious pattern '{pattern}' detected."
116
  # We don't block yet, but we flag it.
117
 
118
- quality_review = validate_output(task, result)
119
  result["quality_review"] = quality_review
120
 
121
  # 6. Save to Cache
 
50
  .execute()
51
  context = context_res.data if context_res.data else []
52
 
53
+ project_data = task.get("project")
54
+ if not isinstance(project_data, dict):
55
+ project_res = (
56
+ supabase.table("projects")
57
+ .select("name,description,context")
58
+ .eq("id", project_id)
59
+ .single()
60
+ .execute()
61
+ )
62
+ project_data = project_res.data if project_res and project_res.data else {}
63
+ quality_task = {**task, "project": project_data}
64
+
65
  extra_context = ""
66
  if include_semantic_context:
67
  extra_context = await semantic_backprop.get_project_context(project_id, task_id)
 
111
  # 4. Execute Run with timing
112
  start_time = time.time()
113
  task_instructions = task.get("description") or task["title"]
114
+ task_instructions = f"{task_instructions}\n\n{build_quality_instructions(quality_task)}"
115
  result = await agent.run(task_instructions, context, extra_context=extra_context)
116
  duration = time.time() - start_time
117
 
 
127
  result["security_warning"] = f"Output sanitized: suspicious pattern '{pattern}' detected."
128
  # We don't block yet, but we flag it.
129
 
130
+ quality_review = validate_output(quality_task, result)
131
  result["quality_review"] = quality_review
132
 
133
  # 6. Save to Cache
backend/services/orchestrator_service.py CHANGED
@@ -68,6 +68,18 @@ def _format_output_for_report(output_data) -> str:
68
 
69
  return clean_report_text(dedupe_lines("\n".join(_format_value_for_report(primary))))
70
 
 
 
 
 
 
 
 
 
 
 
 
 
71
  def _output_text(output_data) -> str:
72
  return _format_output_for_report(output_data).lower()
73
 
@@ -422,6 +434,12 @@ class OrchestratorService:
422
  excluded: list[dict] = []
423
  for task in tasks:
424
  output_data = task.get("output_data") or {}
 
 
 
 
 
 
425
  quality_review = output_data.get("quality_review") if isinstance(output_data, dict) else None
426
  if quality_review and not quality_review.get("approved", False):
427
  excluded.append({
@@ -502,9 +520,9 @@ class OrchestratorService:
502
  lines.extend(["## Excluded Content", ""])
503
  for excluded in excluded_tasks:
504
  lines.append(f"- Excluded task output: {excluded['title']} ({'; '.join(excluded['reasons'])})")
505
- for excluded_line in report_exclusions[:25]:
506
  if excluded_line:
507
- lines.append(f"- Removed low-quality line: {excluded_line}")
508
  lines.append("")
509
 
510
  # Final Conclusion Generation
@@ -542,7 +560,7 @@ class OrchestratorService:
542
  conclusion,
543
  "",
544
  "## Completion Status",
545
- f"All {len(tasks)} tasks are approved. {len(curated_tasks)} task outputs passed final quality validation. Project status: completed."
546
  ])
547
 
548
  supabase.table("projects").update({"status": "completed"}).eq("id", project_id).execute()
 
68
 
69
  return clean_report_text(dedupe_lines("\n".join(_format_value_for_report(primary))))
70
 
71
+
72
+ def _has_usable_output(output_data) -> bool:
73
+ if not output_data:
74
+ return False
75
+ if isinstance(output_data, dict):
76
+ if output_data.get("error"):
77
+ return False
78
+ primary = output_data.get("data")
79
+ if primary in (None, "", [], {}):
80
+ return False
81
+ return True
82
+
83
  def _output_text(output_data) -> str:
84
  return _format_output_for_report(output_data).lower()
85
 
 
434
  excluded: list[dict] = []
435
  for task in tasks:
436
  output_data = task.get("output_data") or {}
437
+ if not _has_usable_output(output_data):
438
+ excluded.append({
439
+ "title": task.get("title", "Untitled task"),
440
+ "reasons": ["Task has no usable approved output."]
441
+ })
442
+ continue
443
  quality_review = output_data.get("quality_review") if isinstance(output_data, dict) else None
444
  if quality_review and not quality_review.get("approved", False):
445
  excluded.append({
 
520
  lines.extend(["## Excluded Content", ""])
521
  for excluded in excluded_tasks:
522
  lines.append(f"- Excluded task output: {excluded['title']} ({'; '.join(excluded['reasons'])})")
523
+ for excluded_line in list(dict.fromkeys(report_exclusions))[:10]:
524
  if excluded_line:
525
+ lines.append(f"- {excluded_line}")
526
  lines.append("")
527
 
528
  # Final Conclusion Generation
 
560
  conclusion,
561
  "",
562
  "## Completion Status",
563
+ f"{len(tasks)} tasks reached done status. {len(curated_tasks)} task outputs passed final quality validation. {len(excluded_tasks)} task outputs were excluded from the final report."
564
  ])
565
 
566
  supabase.table("projects").update({"status": "completed"}).eq("id", project_id).execute()
backend/services/output_quality.py CHANGED
@@ -30,6 +30,40 @@ SENSITIVE_FACT_PATTERNS = [
30
  r"\bprofit\b",
31
  ]
32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  STRICT_TASK_PATTERNS = [
34
  r"\bresearch\b",
35
  r"\banaly[sz]e\b",
@@ -55,7 +89,8 @@ def _stringify_payload(value: Any) -> str:
55
 
56
 
57
  def build_quality_instructions(task: dict) -> str:
58
- task_text = f"{task.get('title', '')}\n{task.get('description', '')}".lower()
 
59
  strict_mode = any(re.search(pattern, task_text, re.IGNORECASE) for pattern in STRICT_TASK_PATTERNS)
60
 
61
  base = [
@@ -63,8 +98,10 @@ def build_quality_instructions(task: dict) -> str:
63
  "- Never use placeholder names like Competitor A, Dashboard B, Product C, or Our Company.",
64
  "- If a real named entity cannot be identified with confidence, return unknown instead of inventing one.",
65
  "- Keep the output strictly within the requested scope.",
 
66
  "- Do not include generic filler sections that were not requested.",
67
  "- Use clean UTF-8/ASCII friendly text. Do not output corrupted characters.",
 
68
  ]
69
 
70
  if strict_mode:
@@ -80,10 +117,51 @@ def build_quality_instructions(task: dict) -> str:
80
  return "\n".join(base)
81
 
82
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  def validate_output(task: dict, result: dict) -> dict:
84
  raw_text = _stringify_payload(result.get("raw_output"))
85
  data_text = _stringify_payload(result.get("data"))
86
  combined = "\n".join(part for part in [raw_text, data_text] if part).strip()
 
 
 
 
 
 
 
87
 
88
  fail_reasons: list[str] = []
89
  must_fix: list[str] = []
@@ -110,6 +188,14 @@ def validate_output(task: dict, result: dict) -> dict:
110
  fail_reasons.append("Output contains encoding corruption.")
111
  must_fix.append("Remove corrupted characters and normalize text encoding.")
112
 
 
 
 
 
 
 
 
 
113
  for pattern in GENERIC_FILLER_PATTERNS:
114
  if re.search(pattern, combined, re.IGNORECASE):
115
  unsupported_claims.append(pattern.replace("\\b", "").replace("?", ""))
@@ -146,6 +232,10 @@ def validate_output(task: dict, result: dict) -> dict:
146
  score = 100
147
  if placeholder_entities:
148
  score = min(score, 20)
 
 
 
 
149
  if any(item.startswith("Sensitive fact without source:") for item in unsupported_claims):
150
  score = min(score, 30)
151
  if duplicate_claims:
@@ -207,10 +297,13 @@ def filter_report_sections(text: str) -> tuple[str, list[str]]:
207
  for line in text.splitlines():
208
  lowered = line.lower()
209
  if any(re.search(pattern, lowered, re.IGNORECASE) for pattern in PLACEHOLDER_PATTERNS):
210
- excluded.append(line.strip())
211
  continue
212
  if any(re.search(pattern, lowered, re.IGNORECASE) for pattern in GENERIC_FILLER_PATTERNS):
213
- excluded.append(line.strip())
 
 
 
214
  continue
215
  kept_lines.append(line)
216
  return "\n".join(kept_lines).strip(), excluded
 
30
  r"\bprofit\b",
31
  ]
32
 
33
+ RAW_DUMP_PATTERNS = [
34
+ r"```(?:json)?",
35
+ r'"raw_text"\s*:',
36
+ r'"projectoverview"\s*:',
38
+ r'"userstories"\s*:',
39
+ r'"datamodel"\s*:',
40
+ ]
41
+
42
+ LATAM_HINTS = [
43
+ "mercadolibre",
44
+ "mercado libre",
45
+ "latam",
46
+ "latin america",
47
+ "argentina",
48
+ "mexico",
49
+ "brazil",
50
+ "brasil",
51
+ "chile",
52
+ "colombia",
53
+ "peru",
54
+ "uruguay",
55
+ ]
56
+
57
+ SEA_HINTS = [
58
+ "indonesia",
59
+ "yogyakarta",
60
+ "bali",
61
+ "southeast asia",
62
+ "tokopedia",
63
+ "shopee",
64
+ "jakarta",
65
+ ]
66
+
67
  STRICT_TASK_PATTERNS = [
68
  r"\bresearch\b",
69
  r"\banaly[sz]e\b",
 
89
 
90
 
91
  def build_quality_instructions(task: dict) -> str:
92
+ project_text = _project_text(task)
93
+ task_text = f"{task.get('title', '')}\n{task.get('description', '')}\n{project_text}".lower()
94
  strict_mode = any(re.search(pattern, task_text, re.IGNORECASE) for pattern in STRICT_TASK_PATTERNS)
95
 
96
  base = [
 
98
  "- Never use placeholder names like Competitor A, Dashboard B, Product C, or Our Company.",
99
  "- If a real named entity cannot be identified with confidence, return unknown instead of inventing one.",
100
  "- Keep the output strictly within the requested scope.",
101
+ "- Stay aligned with the project's stated geography, competitors, and market context. Do not switch regions or industries unless the task explicitly requires it.",
102
  "- Do not include generic filler sections that were not requested.",
103
  "- Use clean UTF-8/ASCII friendly text. Do not output corrupted characters.",
104
+ "- Do not return raw JSON dumps, code blocks, repository scaffolds, or intermediate planning artifacts unless the task explicitly asks for them.",
105
  ]
106
 
107
  if strict_mode:
 
117
  return "\n".join(base)
118
 
119
 
120
+ def _project_text(task: dict) -> str:
121
+ project = task.get("project")
122
+ if isinstance(project, dict):
123
+ return "\n".join(
124
+ str(project.get(key, "") or "")
125
+ for key in ("name", "description", "context")
126
+ )
127
+ return str(task.get("project_context") or "")
128
+
129
+
130
+ def _contains_any(text: str, terms: list[str]) -> bool:
131
+ lowered = text.lower()
132
+ return any(term in lowered for term in terms)
133
+
134
+
135
+ def _looks_like_raw_dump(text: str) -> bool:
136
+ if any(re.search(pattern, text, re.IGNORECASE) for pattern in RAW_DUMP_PATTERNS):
137
+ return True
138
+ stripped = text.strip()
139
+ if stripped.startswith("{") or stripped.startswith("["):
140
+ return True
141
+ return False
142
+
143
+
144
+ def _is_context_drift(task_text: str, output_text: str) -> bool:
145
+ task_lower = task_text.lower()
146
+ output_lower = output_text.lower()
147
+
148
+ if _contains_any(task_lower, LATAM_HINTS) and _contains_any(output_lower, SEA_HINTS):
149
+ return True
150
+
151
+ return False
152
+
153
+
154
  def validate_output(task: dict, result: dict) -> dict:
155
  raw_text = _stringify_payload(result.get("raw_output"))
156
  data_text = _stringify_payload(result.get("data"))
157
  combined = "\n".join(part for part in [raw_text, data_text] if part).strip()
158
+ task_text = "\n".join(
159
+ [
160
+ str(task.get("title", "") or ""),
161
+ str(task.get("description", "") or ""),
162
+ _project_text(task),
163
+ ]
164
+ )
165
 
166
  fail_reasons: list[str] = []
167
  must_fix: list[str] = []
 
188
  fail_reasons.append("Output contains encoding corruption.")
189
  must_fix.append("Remove corrupted characters and normalize text encoding.")
190
 
191
+ if _looks_like_raw_dump(combined):
192
+ fail_reasons.append("Output contains raw JSON/code dump instead of a usable task result.")
193
+ must_fix.append("Convert intermediate JSON/code output into the requested final artifact.")
194
+
195
+ if _is_context_drift(task_text, combined):
196
+ fail_reasons.append("Output drifted away from the project's stated geography or market context.")
197
+ must_fix.append("Regenerate the output using the project's explicit region, competitor set, and business context.")
198
+
199
  for pattern in GENERIC_FILLER_PATTERNS:
200
  if re.search(pattern, combined, re.IGNORECASE):
201
  unsupported_claims.append(pattern.replace("\\b", "").replace("?", ""))
 
232
  score = 100
233
  if placeholder_entities:
234
  score = min(score, 20)
235
+ if _looks_like_raw_dump(combined):
236
+ score = min(score, 20)
237
+ if _is_context_drift(task_text, combined):
238
+ score = min(score, 20)
239
  if any(item.startswith("Sensitive fact without source:") for item in unsupported_claims):
240
  score = min(score, 30)
241
  if duplicate_claims:
 
297
  for line in text.splitlines():
298
  lowered = line.lower()
299
  if any(re.search(pattern, lowered, re.IGNORECASE) for pattern in PLACEHOLDER_PATTERNS):
300
+ excluded.append("Removed placeholder content.")
301
  continue
302
  if any(re.search(pattern, lowered, re.IGNORECASE) for pattern in GENERIC_FILLER_PATTERNS):
303
+ excluded.append("Removed generic filler outside the requested scope.")
304
+ continue
305
+ if _looks_like_raw_dump(line):
306
+ excluded.append("Removed raw JSON/code dump content.")
307
  continue
308
  kept_lines.append(line)
309
  return "\n".join(kept_lines).strip(), excluded