cesjavi committed on
Commit
aeb2234
·
1 Parent(s): ffac2f3

Harden report quality checks and fix Space version loading

Browse files
Dockerfile CHANGED
@@ -26,6 +26,7 @@ ENV PYTHONUNBUFFERED=1
26
 
27
  WORKDIR /app
28
 
 
29
  COPY backend/requirements.txt backend/requirements.txt
30
  RUN pip install --no-cache-dir -r backend/requirements.txt
31
 
 
26
 
27
  WORKDIR /app
28
 
29
+ COPY VERSION VERSION
30
  COPY backend/requirements.txt backend/requirements.txt
31
  RUN pip install --no-cache-dir -r backend/requirements.txt
32
 
backend/main.py CHANGED
@@ -7,10 +7,20 @@ from pathlib import Path
7
  from dotenv import load_dotenv
8
  import sentry_sdk
9
 
 
 
 
 
 
 
 
 
 
 
10
  # Load environment variables
11
  load_dotenv()
12
  FRONTEND_DIST = Path(__file__).resolve().parent.parent / "frontend" / "dist"
13
- APP_VERSION = (Path(__file__).resolve().parent.parent / "VERSION").read_text(encoding="utf-8").strip()
14
 
15
  # Sentry Initialization
16
  SENTRY_DSN = os.getenv("SENTRY_DSN")
 
7
  from dotenv import load_dotenv
8
  import sentry_sdk
9
 
10
+
11
+ def _load_app_version() -> str:
12
+ version_file = Path(__file__).resolve().parent.parent / "VERSION"
13
+ if version_file.exists():
14
+ value = version_file.read_text(encoding="utf-8").strip()
15
+ if value:
16
+ return value
17
+ return os.getenv("APP_VERSION", "0.7.0")
18
+
19
+
20
  # Load environment variables
21
  load_dotenv()
22
  FRONTEND_DIST = Path(__file__).resolve().parent.parent / "frontend" / "dist"
23
+ APP_VERSION = _load_app_version()
24
 
25
  # Sentry Initialization
26
  SENTRY_DSN = os.getenv("SENTRY_DSN")
backend/routers/agent_runner.py CHANGED
@@ -1,6 +1,7 @@
1
  from fastapi import APIRouter, HTTPException, BackgroundTasks
2
  from services.supabase_service import supabase
3
  from services.agent_runner_service import AgentRunnerService
 
4
  import logging
5
 
6
  router = APIRouter()
@@ -10,10 +11,15 @@ logger = logging.getLogger("uvicorn")
10
  def _assert_task_quality(task: dict):
11
  output_data = task.get("output_data") or {}
12
  if not isinstance(output_data, dict):
13
- return
 
 
 
 
 
14
  quality_review = output_data.get("quality_review")
15
  if not quality_review:
16
- return
17
  if quality_review.get("approved"):
18
  return
19
  reasons = quality_review.get("fail_reasons") or ["Task output failed quality validation."]
 
1
  from fastapi import APIRouter, HTTPException, BackgroundTasks
2
  from services.supabase_service import supabase
3
  from services.agent_runner_service import AgentRunnerService
4
+ from services.output_quality import report_text_from_output
5
  import logging
6
 
7
  router = APIRouter()
 
11
  def _assert_task_quality(task: dict):
12
  output_data = task.get("output_data") or {}
13
  if not isinstance(output_data, dict):
14
+ raise HTTPException(status_code=400, detail="Task output is missing or malformed.")
15
+ if output_data.get("error"):
16
+ raise HTTPException(status_code=400, detail=f"Task execution failed: {output_data['error']}")
17
+ rendered = report_text_from_output(output_data).strip()
18
+ if not rendered or rendered in ("{}", "[]"):
19
+ raise HTTPException(status_code=400, detail="Task has no usable output to approve.")
20
  quality_review = output_data.get("quality_review")
21
  if not quality_review:
22
+ raise HTTPException(status_code=400, detail="Task output is missing quality validation.")
23
  if quality_review.get("approved"):
24
  return
25
  reasons = quality_review.get("fail_reasons") or ["Task output failed quality validation."]
backend/services/agent_runner_service.py CHANGED
@@ -50,6 +50,18 @@ class AgentRunnerService:
50
  .execute()
51
  context = context_res.data if context_res.data else []
52
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  extra_context = ""
54
  if include_semantic_context:
55
  extra_context = await semantic_backprop.get_project_context(project_id, task_id)
@@ -99,7 +111,7 @@ class AgentRunnerService:
99
  # 4. Execute Run with timing
100
  start_time = time.time()
101
  task_instructions = task.get("description") or task["title"]
102
- task_instructions = f"{task_instructions}\n\n{build_quality_instructions(task)}"
103
  result = await agent.run(task_instructions, context, extra_context=extra_context)
104
  duration = time.time() - start_time
105
 
@@ -115,7 +127,7 @@ class AgentRunnerService:
115
  result["security_warning"] = f"Output sanitized: suspicious pattern '{pattern}' detected."
116
  # We don't block yet, but we flag it.
117
 
118
- quality_review = validate_output(task, result)
119
  result["quality_review"] = quality_review
120
 
121
  # 6. Save to Cache
 
50
  .execute()
51
  context = context_res.data if context_res.data else []
52
 
53
+ project_data = task.get("project")
54
+ if not isinstance(project_data, dict):
55
+ project_res = (
56
+ supabase.table("projects")
57
+ .select("name,description,context")
58
+ .eq("id", project_id)
59
+ .single()
60
+ .execute()
61
+ )
62
+ project_data = project_res.data if project_res and project_res.data else {}
63
+ quality_task = {**task, "project": project_data}
64
+
65
  extra_context = ""
66
  if include_semantic_context:
67
  extra_context = await semantic_backprop.get_project_context(project_id, task_id)
 
111
  # 4. Execute Run with timing
112
  start_time = time.time()
113
  task_instructions = task.get("description") or task["title"]
114
+ task_instructions = f"{task_instructions}\n\n{build_quality_instructions(quality_task)}"
115
  result = await agent.run(task_instructions, context, extra_context=extra_context)
116
  duration = time.time() - start_time
117
 
 
127
  result["security_warning"] = f"Output sanitized: suspicious pattern '{pattern}' detected."
128
  # We don't block yet, but we flag it.
129
 
130
+ quality_review = validate_output(quality_task, result)
131
  result["quality_review"] = quality_review
132
 
133
  # 6. Save to Cache
backend/services/orchestrator_service.py CHANGED
@@ -68,6 +68,18 @@ def _format_output_for_report(output_data) -> str:
68
 
69
  return clean_report_text(dedupe_lines("\n".join(_format_value_for_report(primary))))
70
 
 
 
 
 
 
 
 
 
 
 
 
 
71
  def _output_text(output_data) -> str:
72
  return _format_output_for_report(output_data).lower()
73
 
@@ -422,6 +434,12 @@ class OrchestratorService:
422
  excluded: list[dict] = []
423
  for task in tasks:
424
  output_data = task.get("output_data") or {}
 
 
 
 
 
 
425
  quality_review = output_data.get("quality_review") if isinstance(output_data, dict) else None
426
  if quality_review and not quality_review.get("approved", False):
427
  excluded.append({
@@ -502,9 +520,9 @@ class OrchestratorService:
502
  lines.extend(["## Excluded Content", ""])
503
  for excluded in excluded_tasks:
504
  lines.append(f"- Excluded task output: {excluded['title']} ({'; '.join(excluded['reasons'])})")
505
- for excluded_line in report_exclusions[:25]:
506
  if excluded_line:
507
- lines.append(f"- Removed low-quality line: {excluded_line}")
508
  lines.append("")
509
 
510
  # Final Conclusion Generation
@@ -542,7 +560,7 @@ class OrchestratorService:
542
  conclusion,
543
  "",
544
  "## Completion Status",
545
- f"All {len(tasks)} tasks are approved. {len(curated_tasks)} task outputs passed final quality validation. Project status: completed."
546
  ])
547
 
548
  supabase.table("projects").update({"status": "completed"}).eq("id", project_id).execute()
 
68
 
69
  return clean_report_text(dedupe_lines("\n".join(_format_value_for_report(primary))))
70
 
71
+
72
+ def _has_usable_output(output_data) -> bool:
73
+ if not output_data:
74
+ return False
75
+ if isinstance(output_data, dict):
76
+ if output_data.get("error"):
77
+ return False
78
+ primary = output_data.get("data")
79
+ if primary in (None, "", [], {}):
80
+ return False
81
+ return True
82
+
83
  def _output_text(output_data) -> str:
84
  return _format_output_for_report(output_data).lower()
85
 
 
434
  excluded: list[dict] = []
435
  for task in tasks:
436
  output_data = task.get("output_data") or {}
437
+ if not _has_usable_output(output_data):
438
+ excluded.append({
439
+ "title": task.get("title", "Untitled task"),
440
+ "reasons": ["Task has no usable approved output."]
441
+ })
442
+ continue
443
  quality_review = output_data.get("quality_review") if isinstance(output_data, dict) else None
444
  if quality_review and not quality_review.get("approved", False):
445
  excluded.append({
 
520
  lines.extend(["## Excluded Content", ""])
521
  for excluded in excluded_tasks:
522
  lines.append(f"- Excluded task output: {excluded['title']} ({'; '.join(excluded['reasons'])})")
523
+ for excluded_line in list(dict.fromkeys(report_exclusions))[:10]:
524
  if excluded_line:
525
+ lines.append(f"- {excluded_line}")
526
  lines.append("")
527
 
528
  # Final Conclusion Generation
 
560
  conclusion,
561
  "",
562
  "## Completion Status",
563
+ f"{len(tasks)} tasks reached done status. {len(curated_tasks)} task outputs passed final quality validation. {len(excluded_tasks)} task outputs were excluded from the final report."
564
  ])
565
 
566
  supabase.table("projects").update({"status": "completed"}).eq("id", project_id).execute()
backend/services/output_quality.py CHANGED
@@ -30,6 +30,40 @@ SENSITIVE_FACT_PATTERNS = [
30
  r"\bprofit\b",
31
  ]
32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  STRICT_TASK_PATTERNS = [
34
  r"\bresearch\b",
35
  r"\banaly[sz]e\b",
@@ -55,7 +89,8 @@ def _stringify_payload(value: Any) -> str:
55
 
56
 
57
  def build_quality_instructions(task: dict) -> str:
58
- task_text = f"{task.get('title', '')}\n{task.get('description', '')}".lower()
 
59
  strict_mode = any(re.search(pattern, task_text, re.IGNORECASE) for pattern in STRICT_TASK_PATTERNS)
60
 
61
  base = [
@@ -63,8 +98,10 @@ def build_quality_instructions(task: dict) -> str:
63
  "- Never use placeholder names like Competitor A, Dashboard B, Product C, or Our Company.",
64
  "- If a real named entity cannot be identified with confidence, return unknown instead of inventing one.",
65
  "- Keep the output strictly within the requested scope.",
 
66
  "- Do not include generic filler sections that were not requested.",
67
  "- Use clean UTF-8/ASCII friendly text. Do not output corrupted characters.",
 
68
  ]
69
 
70
  if strict_mode:
@@ -80,10 +117,51 @@ def build_quality_instructions(task: dict) -> str:
80
  return "\n".join(base)
81
 
82
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  def validate_output(task: dict, result: dict) -> dict:
84
  raw_text = _stringify_payload(result.get("raw_output"))
85
  data_text = _stringify_payload(result.get("data"))
86
  combined = "\n".join(part for part in [raw_text, data_text] if part).strip()
 
 
 
 
 
 
 
87
 
88
  fail_reasons: list[str] = []
89
  must_fix: list[str] = []
@@ -110,6 +188,14 @@ def validate_output(task: dict, result: dict) -> dict:
110
  fail_reasons.append("Output contains encoding corruption.")
111
  must_fix.append("Remove corrupted characters and normalize text encoding.")
112
 
 
 
 
 
 
 
 
 
113
  for pattern in GENERIC_FILLER_PATTERNS:
114
  if re.search(pattern, combined, re.IGNORECASE):
115
  unsupported_claims.append(pattern.replace("\\b", "").replace("?", ""))
@@ -146,6 +232,10 @@ def validate_output(task: dict, result: dict) -> dict:
146
  score = 100
147
  if placeholder_entities:
148
  score = min(score, 20)
 
 
 
 
149
  if any(item.startswith("Sensitive fact without source:") for item in unsupported_claims):
150
  score = min(score, 30)
151
  if duplicate_claims:
@@ -207,10 +297,13 @@ def filter_report_sections(text: str) -> tuple[str, list[str]]:
207
  for line in text.splitlines():
208
  lowered = line.lower()
209
  if any(re.search(pattern, lowered, re.IGNORECASE) for pattern in PLACEHOLDER_PATTERNS):
210
- excluded.append(line.strip())
211
  continue
212
  if any(re.search(pattern, lowered, re.IGNORECASE) for pattern in GENERIC_FILLER_PATTERNS):
213
- excluded.append(line.strip())
 
 
 
214
  continue
215
  kept_lines.append(line)
216
  return "\n".join(kept_lines).strip(), excluded
 
30
  r"\bprofit\b",
31
  ]
32
 
33
+ RAW_DUMP_PATTERNS = [
34
+ r"```(?:json)?",
35
+ r'"raw_text"\s*:',
36
+ r'"projectoverview"\s*:',
38
+ r'"userstories"\s*:',
39
+ r'"datamodel"\s*:',
40
+ ]
41
+
42
+ LATAM_HINTS = [
43
+ "mercadolibre",
44
+ "mercado libre",
45
+ "latam",
46
+ "latin america",
47
+ "argentina",
48
+ "mexico",
49
+ "brazil",
50
+ "brasil",
51
+ "chile",
52
+ "colombia",
53
+ "peru",
54
+ "uruguay",
55
+ ]
56
+
57
+ SEA_HINTS = [
58
+ "indonesia",
59
+ "yogyakarta",
60
+ "bali",
61
+ "southeast asia",
62
+ "tokopedia",
63
+ "shopee",
64
+ "jakarta",
65
+ ]
66
+
67
  STRICT_TASK_PATTERNS = [
68
  r"\bresearch\b",
69
  r"\banaly[sz]e\b",
 
89
 
90
 
91
  def build_quality_instructions(task: dict) -> str:
92
+ project_text = _project_text(task)
93
+ task_text = f"{task.get('title', '')}\n{task.get('description', '')}\n{project_text}".lower()
94
  strict_mode = any(re.search(pattern, task_text, re.IGNORECASE) for pattern in STRICT_TASK_PATTERNS)
95
 
96
  base = [
 
98
  "- Never use placeholder names like Competitor A, Dashboard B, Product C, or Our Company.",
99
  "- If a real named entity cannot be identified with confidence, return unknown instead of inventing one.",
100
  "- Keep the output strictly within the requested scope.",
101
+ "- Stay aligned with the project's stated geography, competitors, and market context. Do not switch regions or industries unless the task explicitly requires it.",
102
  "- Do not include generic filler sections that were not requested.",
103
  "- Use clean UTF-8/ASCII friendly text. Do not output corrupted characters.",
104
+ "- Do not return raw JSON dumps, code blocks, repository scaffolds, or intermediate planning artifacts unless the task explicitly asks for them.",
105
  ]
106
 
107
  if strict_mode:
 
117
  return "\n".join(base)
118
 
119
 
120
+ def _project_text(task: dict) -> str:
121
+ project = task.get("project")
122
+ if isinstance(project, dict):
123
+ return "\n".join(
124
+ str(project.get(key, "") or "")
125
+ for key in ("name", "description", "context")
126
+ )
127
+ return str(task.get("project_context") or "")
128
+
129
+
130
+ def _contains_any(text: str, terms: list[str]) -> bool:
131
+ lowered = text.lower()
132
+ return any(term in lowered for term in terms)
133
+
134
+
135
+ def _looks_like_raw_dump(text: str) -> bool:
136
+ if any(re.search(pattern, text, re.IGNORECASE) for pattern in RAW_DUMP_PATTERNS):
137
+ return True
138
+ stripped = text.strip()
139
+ if stripped.startswith("{") or stripped.startswith("["):
140
+ return True
141
+ return False
142
+
143
+
144
+ def _is_context_drift(task_text: str, output_text: str) -> bool:
145
+ task_lower = task_text.lower()
146
+ output_lower = output_text.lower()
147
+
148
+ if _contains_any(task_lower, LATAM_HINTS) and _contains_any(output_lower, SEA_HINTS):
149
+ return True
150
+
151
+ return False
152
+
153
+
154
  def validate_output(task: dict, result: dict) -> dict:
155
  raw_text = _stringify_payload(result.get("raw_output"))
156
  data_text = _stringify_payload(result.get("data"))
157
  combined = "\n".join(part for part in [raw_text, data_text] if part).strip()
158
+ task_text = "\n".join(
159
+ [
160
+ str(task.get("title", "") or ""),
161
+ str(task.get("description", "") or ""),
162
+ _project_text(task),
163
+ ]
164
+ )
165
 
166
  fail_reasons: list[str] = []
167
  must_fix: list[str] = []
 
188
  fail_reasons.append("Output contains encoding corruption.")
189
  must_fix.append("Remove corrupted characters and normalize text encoding.")
190
 
191
+ if _looks_like_raw_dump(combined):
192
+ fail_reasons.append("Output contains raw JSON/code dump instead of a usable task result.")
193
+ must_fix.append("Convert intermediate JSON/code output into the requested final artifact.")
194
+
195
+ if _is_context_drift(task_text, combined):
196
+ fail_reasons.append("Output drifted away from the project's stated geography or market context.")
197
+ must_fix.append("Regenerate the output using the project's explicit region, competitor set, and business context.")
198
+
199
  for pattern in GENERIC_FILLER_PATTERNS:
200
  if re.search(pattern, combined, re.IGNORECASE):
201
  unsupported_claims.append(pattern.replace("\\b", "").replace("?", ""))
 
232
  score = 100
233
  if placeholder_entities:
234
  score = min(score, 20)
235
+ if _looks_like_raw_dump(combined):
236
+ score = min(score, 20)
237
+ if _is_context_drift(task_text, combined):
238
+ score = min(score, 20)
239
  if any(item.startswith("Sensitive fact without source:") for item in unsupported_claims):
240
  score = min(score, 30)
241
  if duplicate_claims:
 
297
  for line in text.splitlines():
298
  lowered = line.lower()
299
  if any(re.search(pattern, lowered, re.IGNORECASE) for pattern in PLACEHOLDER_PATTERNS):
300
+ excluded.append("Removed placeholder content.")
301
  continue
302
  if any(re.search(pattern, lowered, re.IGNORECASE) for pattern in GENERIC_FILLER_PATTERNS):
303
+ excluded.append("Removed generic filler outside the requested scope.")
304
+ continue
305
+ if _looks_like_raw_dump(line):
306
+ excluded.append("Removed raw JSON/code dump content.")
307
  continue
308
  kept_lines.append(line)
309
  return "\n".join(kept_lines).strip(), excluded