| """ |
| ================================================================ |
| 医疗 RAG Agent — 端到端任务完成率 (E2E Task Completion Rate) |
| ================================================================ |
| 测试层级: |
| 单元测试 (test1.py): 单工具调用准确性 ✅ 67 passed |
| 集成测试 (test2.py): 多步骤工具链协作 ✅ 37 passed |
| 回归测试 (test3.py): 防退化 & 边界守护 ✅ 52 passed |
| 安全红队 (test4.py): 对抗性攻击防御 ✅ 45 passed |
| E2E完成率(test5.py): 端到端任务完成率 ← 当前文件 |
| |
| E2E 完成率 vs 其他测试: |
| test1-4 问: "代码对不对?" (工程质量) |
| test5 问: "Agent 好不好用?" (产品质量) |
| |
| 核心概念: |
| 给 Agent 一个完整的医学问题, 验证它能否: |
| 1. 走完全部流程 (Milvus → PDF → Neo4j → LLM) |
| 2. 输出符合质量要求的回答 |
| 3. 在各种降级场景下仍给出可接受的回答 |
| |
| 任务完成度分级: |
| ✅ FULL_SUCCESS — 三路召回 + LLM 全部成功, 回答包含预期关键信息 |
| 🟡 PARTIAL — 部分召回失败, 但回答仍然有用 |
| ❌ FAILED — 回答缺失关键信息 / 系统崩溃 / 返回空 |
| |
| 测试场景: |
| 场景 1: 标准医学问答 (症状/药物/治疗/检查/饮食) |
| 场景 2: 降级场景下的任务完成 |
| 场景 3: 答案质量评估 (关键词覆盖 / 长度 / 结构) |
| 场景 4: 多轮查询稳定性 |
| 场景 5: 全量 Test Suite 完成率统计 |
| |
| 运行: |
| pytest test5.py -v --tb=short |
| pytest test5.py -v -k "standard_medical" # 标准医学问答 |
| pytest test5.py -v -k "completion_rate" # 完成率统计 |
| ================================================================ |
| """ |
|
|
| import sys |
| import os |
| sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..')) |
|
|
| import types |
| import pytest |
| import json |
| import hashlib |
| import uuid |
| import time |
| import datetime |
| from enum import Enum |
| from unittest.mock import MagicMock, patch |
| from dataclasses import dataclass, field |
| from typing import Optional, List, Dict, Callable |
|
|
|
|
| |
| |
| |
|
|
def _ensure_mock_module(name):
    """Install a ``MagicMock`` under *name* in ``sys.modules`` if nothing is there yet.

    A module that is already registered (real or previously stubbed) is left
    untouched, so calling this repeatedly for the same name is harmless.
    """
    if name in sys.modules:
        return
    sys.modules[name] = MagicMock()
|
|
# Register stand-ins for the third-party packages listed below so that
# importing them resolves to a mock instead of requiring the real package.
for mod in (
    # LangChain family
    "langchain_classic",
    "langchain_classic.retrievers",
    "langchain_classic.retrievers.parent_document_retriever",
    "langchain_milvus",
    "langchain_text_splitters",
    "langchain_core",
    "langchain_core.stores",
    "langchain_core.documents",
    "langchain.embeddings",
    "langchain.embeddings.base",
    # Graph database, environment loading and web serving
    "neo4j",
    "dotenv",
    "uvicorn",
    "fastapi",
    "fastapi.middleware",
    "fastapi.middleware.cors",
):
    _ensure_mock_module(mod)
|
|
class _FakeEmbeddingsBase:
    """Empty placeholder class published as ``Embeddings`` on the embeddings module entry."""


# Expose the placeholder under the name ``Embeddings``.
# NOTE(review): presumably so project code can subclass a real class rather
# than a mock attribute -- confirm against the module that imports it.
setattr(sys.modules["langchain.embeddings.base"], "Embeddings", _FakeEmbeddingsBase)
|
|
|
|
| |
| |
| |
|
|
@dataclass
class FakeDocument:
    """Minimal document stand-in carrying text plus optional metadata."""

    # Text payload; read by the pipeline through ``.page_content``.
    page_content: str
    # Each instance gets its own dict, so metadata is never shared.
    metadata: dict = field(default_factory=dict)
|
|
class FakeChatResponse:
    """Chat-completion reply stub exposing ``choices[0].message.content``."""

    def __init__(self, content):
        # Only the attribute path read by the pipeline is modelled.
        message = types.SimpleNamespace(content=content)
        self.choices = [types.SimpleNamespace(message=message)]
|
|
class FakeRedisClient:
    """In-memory stand-in for the small subset of the Redis client used here.

    Values live in ``_store``; the TTL passed to ``set``/``setex`` is recorded
    in ``_expiry`` but never enforced (keys do not expire on their own).
    """

    def __init__(self):
        self._store = {}
        self._expiry = {}

    def ping(self):
        """Always report the connection as healthy."""
        return True

    def get(self, key):
        """Return the stored value for *key*, or ``None`` when absent."""
        return self._store.get(key)

    def set(self, key, value, ex=None, nx=False):
        """Store *value* under *key*.

        With ``nx=True`` an existing key is left untouched and ``False`` is
        returned. A successful write returns ``True``.
        """
        if nx and key in self._store:
            return False
        self._store[key] = value
        if ex:
            self._expiry[key] = ex
        else:
            # A plain overwrite discards any TTL recorded by an earlier write,
            # so no stale expiry entry is left behind.
            self._expiry.pop(key, None)
        return True

    def setex(self, key, expire, value):
        """Store *value* under *key* and record *expire* as its TTL."""
        self._store[key] = value
        self._expiry[key] = expire
        return True

    def delete(self, key):
        """Remove *key* and its TTL; return 1 if the key existed, else 0."""
        existed = key in self._store
        self._store.pop(key, None)
        self._expiry.pop(key, None)
        return 1 if existed else 0

    def register_script(self, script):
        """Return a callable emulating a compare-and-delete script.

        The *script* text is ignored. The returned callable deletes
        ``keys[0]`` only when its stored value equals ``args[0]`` and returns
        1 on deletion, 0 otherwise.
        """
        def run_script(keys=None, args=None):
            if not keys or not args:
                return 0
            key, token = keys[0], args[0]
            # Membership is checked explicitly so a missing key never matches
            # a ``None`` token.
            if key in self._store and self._store[key] == token:
                self.delete(key)
                return 1
            return 0

        return run_script
|
|
def make_redis_manager():
    """Build a ``RedisClientWrapper`` backed by ``FakeRedisClient``.

    The wrapper instance is created with ``object.__new__`` so its
    ``__init__`` is not run; ``client`` and ``unlock_script`` are then
    attached by hand.
    """
    from new_redis import RedisClientWrapper

    # NOTE(review): presumably marks the class-level pool as already
    # initialised -- confirm against new_redis. The class attribute is not
    # restored afterwards.
    RedisClientWrapper._pool = "FAKE"

    manager = object.__new__(RedisClientWrapper)
    fake_client = FakeRedisClient()
    manager.client = fake_client
    manager.unlock_script = fake_client.register_script("")
    return manager
|
|
|
|
| |
| |
| |
|
|
class TaskResult(Enum):
    """Grade assigned to one end-to-end task."""

    # Best grade.
    FULL_SUCCESS = "full_success"
    # Intermediate grade.
    PARTIAL = "partial"
    # Worst grade.
    FAILED = "failed"
|
|
|
|
@dataclass
class MedicalTestCase:
    """Definition of one complete end-to-end test case.

    Attributes:
        task_id: Unique identifier of the case.
        question: Medical question entered by the user.
        category: Question category (symptom / drug / treatment /
            examination / diet / prevention).
        milvus_docs: Documents Milvus should return.
        pdf_content: Content the PDF retrieval should return.
        neo4j_results: Entities Neo4j should return.
        expected_answer: Answer the LLM should generate.
        required_keywords: Keywords that must appear in the answer.
        min_length: Minimum answer length.
        difficulty: Difficulty level (easy / medium / hard).
    """

    # --- identity and input ---
    task_id: str
    question: str
    category: str

    # --- retrieval fixtures ---
    milvus_docs: List[str]
    pdf_content: str
    neo4j_results: List[str]

    # --- expected output and grading criteria ---
    expected_answer: str
    required_keywords: List[str]
    min_length: int = 10
    difficulty: str = "medium"
|
|
|
|
@dataclass
class TaskEvaluation:
    """Evaluation result for a single test case."""

    # --- copied from the test case ---
    task_id: str
    question: str
    category: str

    # --- grading outcome ---
    result: TaskResult
    actual_answer: str
    keywords_found: List[str]
    keywords_missing: List[str]

    # --- per-source contribution flags ---
    milvus_contributed: bool
    pdf_contributed: bool
    neo4j_contributed: bool

    # Error text attached by the evaluator; None by default.
    error: Optional[str] = None
|
|
|
|
def evaluate_task(
    test_case: MedicalTestCase,
    actual_answer: str,
    prompt_sent: str,
    error: Optional[str] = None,
) -> TaskEvaluation:
    """Grade how completely a single task was accomplished.

    Grading rules:
        - FULL_SUCCESS: no error, every required keyword present, and the
          answer is at least ``min_length`` characters long.
        - PARTIAL: no error and at least 50% of required keywords present.
        - FAILED: an error was reported, fewer than 50% of the keywords are
          present, or the answer is empty / whitespace-only.

    Args:
        test_case: The case being graded.
        actual_answer: Answer text produced by the pipeline.
        prompt_sent: Full prompt that was sent to the LLM; used to decide
            which retrieval sources contributed.
        error: Error description if the run raised, otherwise ``None``.

    Returns:
        A ``TaskEvaluation`` describing the grade, keyword hits and source
        contributions.
    """
    # Work on a copy so the evaluation never aliases the case's own list.
    required = list(test_case.required_keywords)

    # A reported error or a blank answer is an immediate failure. The
    # whitespace check keeps an answer such as "   " from being graded as
    # real content.
    if error or not actual_answer or not actual_answer.strip():
        return TaskEvaluation(
            task_id=test_case.task_id,
            question=test_case.question,
            category=test_case.category,
            result=TaskResult.FAILED,
            actual_answer=actual_answer or "",
            keywords_found=[],
            keywords_missing=required,
            milvus_contributed=False,
            pdf_contributed=False,
            neo4j_contributed=False,
            error=error,
        )

    # Keyword coverage: split the required keywords in a single pass.
    found: List[str] = []
    missing: List[str] = []
    for keyword in required:
        (found if keyword in actual_answer else missing).append(keyword)
    # A case without required keywords counts as fully covered.
    hit_rate = len(found) / len(required) if required else 1.0

    # Source contribution: a source counts as contributing when its fixture
    # text appears in the prompt; a source with no fixture counts as OK.
    milvus_ok = (
        any(doc in prompt_sent for doc in test_case.milvus_docs)
        if test_case.milvus_docs
        else True
    )
    pdf_ok = test_case.pdf_content in prompt_sent if test_case.pdf_content else True
    neo4j_ok = (
        any(entity in prompt_sent for entity in test_case.neo4j_results)
        if test_case.neo4j_results
        else True
    )

    # Final grade.
    if hit_rate >= 1.0 and len(actual_answer) >= test_case.min_length:
        result = TaskResult.FULL_SUCCESS
    elif hit_rate >= 0.5:
        result = TaskResult.PARTIAL
    else:
        result = TaskResult.FAILED

    return TaskEvaluation(
        task_id=test_case.task_id,
        question=test_case.question,
        category=test_case.category,
        result=result,
        actual_answer=actual_answer,
        keywords_found=found,
        keywords_missing=missing,
        milvus_contributed=milvus_ok,
        pdf_contributed=pdf_ok,
        neo4j_contributed=neo4j_ok,
    )
|
|
|
|
| |
| |
| |
|
|
# Fixture suite: each entry carries the retrieval fixtures, the scripted LLM
# answer, and the keywords used for grading.
MEDICAL_TEST_SUITE: List[MedicalTestCase] = [
    # ---- Symptom questions ----
    MedicalTestCase(
        task_id="SYM-001",
        question="高血压的主要症状有哪些?",
        category="症状",
        milvus_docs=["高血压常见症状包括头晕、头痛、耳鸣"],
        pdf_content="根据《中国高血压防治指南》: 早期高血压多无明显症状, 部分患者表现为头晕头痛",
        neo4j_results=["头晕", "头痛", "耳鸣", "心悸"],
        expected_answer="高血压主要症状包括头晕、头痛、耳鸣、心悸等, 早期可能无明显症状。",
        required_keywords=["头晕", "头痛"],
    ),
    MedicalTestCase(
        task_id="SYM-002",
        question="糖尿病的早期信号是什么?",
        category="症状",
        milvus_docs=["糖尿病典型症状为三多一少: 多饮多食多尿体重减少"],
        pdf_content="2型糖尿病早期可出现口渴、多尿、视力模糊",
        neo4j_results=["多饮", "多食", "多尿", "体重减少"],
        expected_answer="糖尿病早期信号包括三多一少: 多饮、多食、多尿和体重减少。",
        required_keywords=["多饮", "多尿"],
    ),

    # ---- Drug questions ----
    MedicalTestCase(
        task_id="DRUG-001",
        question="高血压常用的降压药有哪些?",
        category="药物",
        milvus_docs=["常用降压药包括: ACEI类(如依那普利)、ARB类(如缬沙坦)、CCB类(如氨氯地平)"],
        pdf_content="一线降压药物: 钙通道阻滞剂、ACEI、ARB、利尿剂、β受体阻滞剂",
        neo4j_results=["氨氯地平", "缬沙坦", "依那普利"],
        expected_answer="常用降压药包括氨氯地平、缬沙坦、依那普利等, 分属CCB、ARB、ACEI等类别。",
        required_keywords=["氨氯地平", "缬沙坦"],
    ),
    MedicalTestCase(
        task_id="DRUG-002",
        question="二甲双胍的副作用有哪些?",
        category="药物",
        milvus_docs=["二甲双胍常见副作用: 胃肠道反应(恶心、腹泻)、维生素B12缺乏"],
        pdf_content="严重不良反应: 乳酸酸中毒(罕见), 肝肾功能不全者慎用",
        neo4j_results=["恶心", "腹泻", "乳酸酸中毒"],
        expected_answer="二甲双胍常见副作用包括恶心、腹泻等胃肠道反应, 罕见严重副作用为乳酸酸中毒。",
        required_keywords=["恶心", "腹泻"],
    ),

    # ---- Treatment questions ----
    MedicalTestCase(
        task_id="TREAT-001",
        question="冠心病的治疗方案有哪些?",
        category="治疗",
        milvus_docs=["冠心病治疗: 药物治疗(抗血小板、他汀)、介入治疗(PCI)、搭桥手术(CABG)"],
        pdf_content="冠心病综合管理: 生活方式干预 + 药物治疗 + 必要时血运重建",
        neo4j_results=["抗血小板治疗", "PCI", "CABG"],
        expected_answer="冠心病治疗包括药物(抗血小板、他汀类)、介入治疗(PCI)和搭桥手术(CABG)。",
        required_keywords=["药物", "PCI"],
    ),
    MedicalTestCase(
        task_id="TREAT-002",
        question="幽门螺杆菌的根除方案是什么?",
        category="治疗",
        milvus_docs=["Hp根除: 四联疗法(PPI+铋剂+两种抗生素), 疗程14天"],
        pdf_content="推荐方案: 质子泵抑制剂+枸橼酸铋钾+阿莫西林+克拉霉素",
        neo4j_results=["四联疗法", "PPI", "阿莫西林"],
        expected_answer="四联疗法: PPI + 铋剂 + 阿莫西林 + 克拉霉素, 疗程14天。",
        required_keywords=["四联", "阿莫西林"],
    ),

    # ---- Diet questions ----
    MedicalTestCase(
        task_id="DIET-001",
        question="高血压患者不能吃什么?",
        category="饮食",
        milvus_docs=["高血压饮食禁忌: 高盐食物、腌制品、酒精、高脂肪食物"],
        pdf_content="《中国居民膳食指南》建议高血压患者每日钠摄入<5g",
        neo4j_results=["高盐", "腌制品", "酒精"],
        expected_answer="高血压患者应避免高盐食物、腌制品、酒精, 每日钠摄入量控制在5g以下。",
        required_keywords=["高盐", "酒精"],
    ),
    MedicalTestCase(
        task_id="DIET-002",
        question="糖尿病患者可以吃水果吗?",
        category="饮食",
        milvus_docs=["糖尿病患者可以适量食用低GI水果, 如苹果、柚子、樱桃"],
        pdf_content="建议选择升糖指数低的水果, 控制在每天200g以内, 两餐之间食用",
        neo4j_results=["低GI水果", "苹果", "柚子"],
        expected_answer="可以适量吃低GI水果如苹果、柚子, 每天不超过200g, 建议两餐之间食用。",
        required_keywords=["低GI", "适量"],
    ),

    # ---- Examination questions ----
    MedicalTestCase(
        task_id="EXAM-001",
        question="高血压需要做哪些检查?",
        category="检查",
        milvus_docs=["高血压常规检查: 血压测量、血常规、尿常规、心电图、眼底检查"],
        pdf_content="推荐检查: 24小时动态血压监测、血脂四项、肾功能、超声心动图",
        neo4j_results=["心电图", "血常规", "肾功能"],
        expected_answer="高血压需做血压测量、心电图、血常规、尿常规、眼底检查、肾功能检查等。",
        required_keywords=["心电图", "血压"],
    ),

    # ---- Prevention questions ----
    MedicalTestCase(
        task_id="PREV-001",
        question="如何预防脑卒中?",
        category="预防",
        milvus_docs=["脑卒中预防: 控制血压、戒烟限酒、规律运动、控制血糖血脂"],
        pdf_content="一级预防: 管理高危因素(高血压、糖尿病、房颤); 二级预防: 抗血小板/抗凝",
        neo4j_results=["控制血压", "戒烟", "规律运动"],
        expected_answer="预防脑卒中关键措施: 控制血压、戒烟限酒、规律运动、管理血糖血脂。",
        required_keywords=["控制血压", "戒烟"],
    ),
]
|
|
|
|
| |
| |
| |
|
|
def build_mocks_from_case(case: MedicalTestCase, milvus_fail=False, pdf_fail=False, neo4j_fail=False):
    """Build the full set of mocks for one test case.

    Args:
        case: Supplies the fixture data each mock returns.
        milvus_fail: Make ``similarity_search`` raise ``ConnectionError``.
        pdf_fail: Make the PDF retriever's ``invoke`` raise.
        neo4j_fail: Make both the Cypher HTTP API and ``session.run`` raise.

    Returns:
        ``(milvus, pdf, neo4j_driver, llm, requests_module)`` in that order.
    """
    # Milvus vector store.
    milvus = MagicMock()
    if not milvus_fail:
        milvus.similarity_search.return_value = [
            FakeDocument(page_content=text) for text in case.milvus_docs
        ]
    else:
        milvus.similarity_search.side_effect = ConnectionError("Milvus down")

    # PDF retriever: one document when the case has PDF content, none otherwise.
    pdf = MagicMock()
    if pdf_fail:
        pdf.invoke.side_effect = Exception("PDF error")
    elif case.pdf_content:
        pdf.invoke.return_value = [FakeDocument(page_content=case.pdf_content)]
    else:
        pdf.invoke.return_value = []

    # Neo4j driver: ``with driver.session() as s`` yields ``graph_session``.
    graph_session = MagicMock()
    if neo4j_fail:
        graph_session.run.side_effect = Exception("Neo4j down")
    else:
        # Each record is a one-element tuple holding the entity name.
        graph_session.run.return_value = [(entity,) for entity in case.neo4j_results]
    neo4j_driver = MagicMock()
    session_ctx = neo4j_driver.session.return_value
    session_ctx.__enter__ = MagicMock(return_value=graph_session)
    session_ctx.__exit__ = MagicMock(return_value=False)

    # HTTP client for the Cypher generate/validate service.
    req = MagicMock()
    if neo4j_fail:
        req.post.side_effect = ConnectionError("Cypher API down")
    else:
        generate_reply = MagicMock(status_code=200)
        generate_reply.json.return_value = {
            "cypher_query": "MATCH (d:Disease)-[:has_symptom]->(s) RETURN s.name",
            "confidence": 0.95,
            "validated": True,
        }
        validate_reply = MagicMock(status_code=200)
        validate_reply.json.return_value = {"is_valid": True}
        # Consumed in call order: first post -> generate, second -> validate.
        req.post.side_effect = [generate_reply, validate_reply]

    # LLM client that always answers with the case's expected answer.
    llm = MagicMock()
    llm.chat.completions.create.return_value = FakeChatResponse(case.expected_answer)

    return milvus, pdf, neo4j_driver, llm, req
|
|
|
|
def perform_rag_testable(query, milvus, pdf, neo4j_driver, llm, requests_module):
    """Dependency-injected version of ``perform_rag_and_llm``.

    Gathers context from three sources (Milvus search, PDF retriever, and a
    Cypher-generation service plus Neo4j), tolerating a failure of any of
    them, then asks the LLM to answer *query* from that context.

    Returns:
        ``(answer_text, prompt_text)`` where ``prompt_text`` is the exact
        content sent to the LLM.
    """
    import json as _json

    # --- Source 1: Milvus search; any failure yields empty context ---
    try:
        hits = milvus.similarity_search(query, k=10, ranker_type="rrf", ranker_params={"k": 100})
        context = "\n\n".join(hit.page_content for hit in hits) if hits else ""
    except Exception:
        context = ""

    # --- Source 2: PDF retriever; only the first document is used ---
    pdf_res = ""
    try:
        pdf_docs = pdf.invoke(query)
        if pdf_docs and len(pdf_docs) >= 1:
            pdf_res = pdf_docs[0].page_content
    except Exception:
        pass
    context = context + "\n" + pdf_res

    # --- Source 3: generate Cypher, validate it, then query Neo4j ---
    neo4j_res = ""
    try:
        gen_resp = requests_module.post(
            "http://0.0.0.0:8101/generate",
            _json.dumps({"natural_language_query": query}),
        )
        if gen_resp.status_code == 200:
            generated = gen_resp.json()
            # Only run queries the service is confident about and has validated.
            if generated["cypher_query"] and float(generated["confidence"]) >= 0.9 and generated["validated"]:
                check_resp = requests_module.post(
                    "http://0.0.0.0:8101/validate",
                    _json.dumps({"cypher_query": generated["cypher_query"]}),
                )
                if check_resp.status_code == 200 and check_resp.json()["is_valid"]:
                    with neo4j_driver.session() as session:
                        try:
                            rows = session.run(generated["cypher_query"])
                            # First column of every record, comma-separated.
                            neo4j_res = ','.join([row[0] for row in rows])
                        except Exception:
                            neo4j_res = ""
    except Exception:
        pass
    context = context + "\n" + neo4j_res

    # --- Prompt assembly ---
    SYSTEM = "System: 你是一个非常得力的医学助手, 你可以通过从数据库中检索出的信息找到问题的答案."
    USER = f"""User: 利用介于<context>和</context>之间的信息来回答问题.
<context>
{context}
</context>
<question>
{query}
</question>"""
    prompt = SYSTEM + USER

    # --- LLM call: system and user text travel in one user message ---
    response = llm.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.7,
    )
    return response.choices[0].message.content, prompt
|
|
|
|
def run_e2e_task(case: MedicalTestCase, milvus_fail=False, pdf_fail=False, neo4j_fail=False) -> TaskEvaluation:
    """Run one end-to-end task against freshly built mocks and grade it.

    Args:
        case: Test case supplying the question, fixtures and grading criteria.
        milvus_fail: Simulate a Milvus outage.
        pdf_fail: Simulate a PDF-retriever failure.
        neo4j_fail: Simulate a Neo4j / Cypher-service outage.

    Returns:
        The ``TaskEvaluation`` produced by ``evaluate_task``. If the pipeline
        raises, the exception is recorded in ``error`` rather than propagated.
    """
    milvus, pdf, neo4j, llm, req = build_mocks_from_case(
        case, milvus_fail=milvus_fail, pdf_fail=pdf_fail, neo4j_fail=neo4j_fail
    )

    answer = ""
    prompt = ""
    error = None
    try:
        answer, prompt = perform_rag_testable(case.question, milvus, pdf, neo4j, llm, req)
    except Exception as exc:
        # Include the exception type: ``str(exc)`` alone is empty for
        # message-less exceptions, which would make a crash look like "no
        # error" to truthiness checks on ``error``.
        error = f"{type(exc).__name__}: {exc}"

    return evaluate_task(case, answer, prompt, error)
|
|
|
|
| |
| |
| |
|
|
class TestStandardMedicalQueries:
    """
    Checks that every question completes when all three retrieval paths work.
    Expected outcome: FULL_SUCCESS for every case.
    """

    # One shared parametrization: every suite entry, identified by task_id.
    _each_case = pytest.mark.parametrize(
        "case", MEDICAL_TEST_SUITE, ids=[c.task_id for c in MEDICAL_TEST_SUITE]
    )

    @_each_case
    def test_full_pipeline_each_case(self, case):
        """Each medical case runs through the whole pipeline."""
        evaluation = run_e2e_task(case)

        assert evaluation.result == TaskResult.FULL_SUCCESS, (
            f"\n[{evaluation.task_id}] {evaluation.question}"
            f"\n 期望: FULL_SUCCESS, 实际: {evaluation.result.value}"
            f"\n 命中关键词: {evaluation.keywords_found}"
            f"\n 缺失关键词: {evaluation.keywords_missing}"
            f"\n 回答: {evaluation.actual_answer[:80]}..."
        )

    @_each_case
    def test_all_three_sources_contribute(self, case):
        """All three retrieval sources took part in building the prompt."""
        evaluation = run_e2e_task(case)

        assert evaluation.milvus_contributed, f"[{evaluation.task_id}] Milvus 内容未出现在 prompt"
        assert evaluation.pdf_contributed, f"[{evaluation.task_id}] PDF 内容未出现在 prompt"
        assert evaluation.neo4j_contributed, f"[{evaluation.task_id}] Neo4j 内容未出现在 prompt"

    @_each_case
    def test_answer_length_adequate(self, case):
        """The answer meets the case's minimum length."""
        evaluation = run_e2e_task(case)
        answer_length = len(evaluation.actual_answer)
        assert answer_length >= case.min_length, (
            f"[{evaluation.task_id}] 回答太短: {answer_length} < {case.min_length}"
        )
|
|
|
|
| |
| |
| |
|
|
class TestDegradedCompletion:
    """
    Checks that the agent still gives an acceptable answer when some
    components are down.
    Expected outcome: at least PARTIAL (never FAILED).
    """

    # The single-source outage tests run on the first five suite entries.
    _degraded_cases = pytest.mark.parametrize(
        "case",
        MEDICAL_TEST_SUITE[:5],
        ids=[c.task_id for c in MEDICAL_TEST_SUITE[:5]],
    )

    @_degraded_cases
    def test_neo4j_down_still_completes(self, case):
        """Neo4j outage -> still answers (Milvus + PDF cover for it)."""
        evaluation = run_e2e_task(case, neo4j_fail=True)

        assert evaluation.result != TaskResult.FAILED, (
            f"[{evaluation.task_id}] Neo4j 宕机时不应完全失败"
            f"\n 回答: {evaluation.actual_answer[:80]}"
        )
        assert evaluation.error is None, "不应有未捕获的异常"

    @_degraded_cases
    def test_pdf_down_still_completes(self, case):
        """PDF outage -> still answers (Milvus + Neo4j cover for it)."""
        evaluation = run_e2e_task(case, pdf_fail=True)

        assert evaluation.result != TaskResult.FAILED, f"[{evaluation.task_id}] PDF 宕机时不应完全失败"

    @_degraded_cases
    def test_milvus_down_still_completes(self, case):
        """Milvus outage -> still answers (PDF + Neo4j cover for it)."""
        evaluation = run_e2e_task(case, milvus_fail=True)

        assert evaluation.result != TaskResult.FAILED, f"[{evaluation.task_id}] Milvus 宕机时不应完全失败"

    def test_all_sources_down_still_no_crash(self):
        """All three sources down -> no crash; the LLM answers on its own."""
        first_case = MEDICAL_TEST_SUITE[0]
        evaluation = run_e2e_task(first_case, milvus_fail=True, pdf_fail=True, neo4j_fail=True)
        assert evaluation.error is None, "三路全挂不应导致异常"
        assert len(evaluation.actual_answer) > 0, "应仍有回答 (LLM 经验知识)"
|
|
|
|
| |
| |
| |
|
|
class TestAnswerQuality:
    """
    Checks answer quality, not merely that an answer exists.
    """

    def test_symptom_answer_contains_specific_symptoms(self):
        """Symptom query -> answer names concrete symptoms, not just 'see a doctor'."""
        case = MEDICAL_TEST_SUITE[0]
        ev = run_e2e_task(case)

        # Negative check: no vague filler phrases.
        vague_phrases = ["请咨询医生", "因人而异", "无法确定"]
        for phrase in vague_phrases:
            assert phrase not in ev.actual_answer, (
                f"症状查询不应返回空泛回复: 包含 '{phrase}'"
            )

        # Positive check: without it, an answer that merely avoids the vague
        # phrases would pass without naming a single symptom.
        assert any(symptom in ev.actual_answer for symptom in case.neo4j_results), (
            "症状查询应至少提及一个具体症状名称"
        )

    def test_drug_answer_contains_drug_names(self):
        """Drug query -> answer mentions concrete drug names."""
        case = MEDICAL_TEST_SUITE[2]
        ev = run_e2e_task(case)

        drug_count = sum(1 for kw in ["氨氯地平", "缬沙坦", "依那普利"] if kw in ev.actual_answer)
        assert drug_count >= 2, f"药物查询应至少提及 2 种药名, 实际 {drug_count} 种"

    def test_treatment_answer_has_structure(self):
        """Treatment query -> answer covers several treatment approaches."""
        case = MEDICAL_TEST_SUITE[4]
        ev = run_e2e_task(case)

        methods = sum(
            1
            for kw in ["药物", "介入", "手术", "PCI", "CABG", "生活方式"]
            if kw in ev.actual_answer
        )
        assert methods >= 2, f"治疗方案应涵盖 ≥2 种治疗手段, 实际 {methods} 种"

    def test_diet_answer_is_actionable(self):
        """Diet query -> answer gives concrete, actionable advice."""
        case = MEDICAL_TEST_SUITE[6]
        ev = run_e2e_task(case)

        # Any one of these marks the advice as specific.
        actionable = any(kw in ev.actual_answer for kw in ["高盐", "腌制", "5g", "酒精", "避免"])
        assert actionable, "饮食建议应具体可执行, 不应空泛"

    def test_answer_not_empty_for_any_category(self):
        """No question in any category may come back with an empty answer."""
        for case in MEDICAL_TEST_SUITE:
            ev = run_e2e_task(case)
            assert len(ev.actual_answer.strip()) > 0, f"[{ev.task_id}] 回答为空"
|
|
|
|
| |
| |
| |
|
|
class TestMultiQueryStability:
    """
    Checks that running several queries in a row gives stable results that
    do not interfere with each other.
    """

    def test_sequential_queries_all_succeed(self):
        """Every query in the suite succeeds when run back to back."""
        evaluations = [run_e2e_task(case) for case in MEDICAL_TEST_SUITE]

        success_count = len(
            [ev for ev in evaluations if ev.result == TaskResult.FULL_SUCCESS]
        )
        assert success_count == len(MEDICAL_TEST_SUITE), (
            f"连续查询: {success_count}/{len(MEDICAL_TEST_SUITE)} 成功"
        )

    def test_same_question_returns_consistent_answer(self):
        """Asking the same question twice gives the same result."""
        case = MEDICAL_TEST_SUITE[0]
        first_run = run_e2e_task(case)
        second_run = run_e2e_task(case)

        assert first_run.actual_answer == second_run.actual_answer, "同一问题应返回一致的回答"
        assert first_run.result == second_run.result, "同一问题的评估结果应一致"

    def test_redis_cache_across_sequential_queries(self):
        """Sequential queries through Redis: first call misses, second hits."""
        manager = make_redis_manager()
        case = MEDICAL_TEST_SUITE[0]

        # One entry is appended per real pipeline execution.
        executions = []

        def counting_rag(q):
            executions.append(q)
            milvus, pdf, neo4j, llm, req = build_mocks_from_case(case)
            answer, _ = perform_rag_testable(q, milvus, pdf, neo4j, llm, req)
            return answer

        # First lookup: nothing cached yet, so the pipeline runs once.
        first = manager.get_or_compute(case.question, lambda: counting_rag(case.question))
        assert len(executions) == 1

        # Second lookup: must be served without running the pipeline again.
        second = manager.get_or_compute(case.question, lambda: counting_rag(case.question))
        assert len(executions) == 1, "第二次应命中缓存"
        assert first == second

    def test_different_categories_no_cross_contamination(self):
        """Queries from different categories do not contaminate each other."""
        symptom_eval = run_e2e_task(MEDICAL_TEST_SUITE[0])
        drug_eval = run_e2e_task(MEDICAL_TEST_SUITE[2])

        # Distinct cases must produce distinct answers.
        assert symptom_eval.actual_answer != drug_eval.actual_answer, "不同类别应返回不同回答"
|
|
|
|
| |
| |
| |
|
|
class TestCompletionRateReport:
    """
    Computes completion-rate metrics over the whole test suite.
    These are the headline "product metrics" shown to reviewers and the team.
    """

    def test_overall_completion_rate_above_threshold(self):
        """
        Core metric: FULL_SUCCESS rate over the whole suite must be >= 90%.
        """
        results = [run_e2e_task(case) for case in MEDICAL_TEST_SUITE]

        full = sum(1 for ev in results if ev.result == TaskResult.FULL_SUCCESS)
        partial = sum(1 for ev in results if ev.result == TaskResult.PARTIAL)
        failed = sum(1 for ev in results if ev.result == TaskResult.FAILED)
        total = len(results)

        rate = full / total
        assert rate >= 0.9, (
            f"\n{'='*60}"
            f"\n E2E 任务完成率报告"
            f"\n{'='*60}"
            f"\n 总任务数: {total}"
            f"\n 完全成功: {full} ({full/total*100:.0f}%)"
            f"\n 部分成功: {partial} ({partial/total*100:.0f}%)"
            f"\n 失败: {failed} ({failed/total*100:.0f}%)"
            f"\n 完成率: {rate*100:.1f}%"
            f"\n 门槛: 90%"
            f"\n{'='*60}"
        )

    def test_degraded_completion_rate_above_threshold(self):
        """
        Degraded completion rate: with Neo4j down, >= 80% must not be FAILED.
        """
        results = [run_e2e_task(case, neo4j_fail=True) for case in MEDICAL_TEST_SUITE]

        non_failed = sum(1 for ev in results if ev.result != TaskResult.FAILED)
        total = len(results)
        rate = non_failed / total

        assert rate >= 0.8, (
            f"\nNeo4j 降级完成率: {rate*100:.1f}% (门槛 80%)"
        )

    def test_zero_crash_rate(self):
        """Zero crash rate: no task may record an uncaught exception."""
        results = [run_e2e_task(case) for case in MEDICAL_TEST_SUITE]
        errors = [(ev.task_id, ev.error) for ev in results if ev.error]

        assert len(errors) == 0, f"有 {len(errors)} 个任务崩溃: {errors}"

    def test_per_category_completion_rates(self):
        """Per-category completion rate; no category may fall below 80%."""
        results = [run_e2e_task(case) for case in MEDICAL_TEST_SUITE]

        # category -> {"total": n, "success": n}
        categories = {}
        for ev in results:
            stats = categories.setdefault(ev.category, {"total": 0, "success": 0})
            stats["total"] += 1
            if ev.result == TaskResult.FULL_SUCCESS:
                stats["success"] += 1

        for cat, stats in categories.items():
            rate = stats["success"] / stats["total"]
            assert rate >= 0.8, (
                f"类别 [{cat}] 完成率: {rate*100:.0f}% (门槛 80%)"
            )

    def test_completion_rate_report_printout(self, capsys):
        """Print the full, human-readable completion-rate report."""
        results = [run_e2e_task(case) for case in MEDICAL_TEST_SUITE]

        # Group evaluations by category.
        categories = {}
        for ev in results:
            categories.setdefault(ev.category, []).append(ev)

        total_full, total_partial, total_fail = 0, 0, 0
        total = len(results)

        # Requesting ``capsys`` captures everything printed in this test, so
        # the report would never reach the terminal. Capture is suspended
        # while the report is written.
        with capsys.disabled():
            print("\n")
            print("=" * 70)
            print(" 医疗 RAG Agent — E2E 任务完成率报告")
            print("=" * 70)

            for cat, evs in sorted(categories.items()):
                full = sum(1 for e in evs if e.result == TaskResult.FULL_SUCCESS)
                partial = sum(1 for e in evs if e.result == TaskResult.PARTIAL)
                fail = sum(1 for e in evs if e.result == TaskResult.FAILED)
                total_full += full
                total_partial += partial
                total_fail += fail

                print(f"\n [{cat}] ({len(evs)} 个任务)")
                for e in evs:
                    icon = {"full_success": "✅", "partial": "🟡", "failed": "❌"}[e.result.value]
                    kw_info = f"关键词 {len(e.keywords_found)}/{len(e.keywords_found)+len(e.keywords_missing)}"
                    print(f" {icon} {e.task_id}: {e.question[:30]}... | {kw_info}")

            print(f"\n{'─' * 70}")
            print(f" 总计: {total} 个任务")
            print(f" ✅ 完全成功: {total_full} ({total_full/total*100:.0f}%)")
            print(f" 🟡 部分成功: {total_partial} ({total_partial/total*100:.0f}%)")
            print(f" ❌ 失败: {total_fail} ({total_fail/total*100:.0f}%)")
            print(f" 📊 完成率: {total_full/total*100:.1f}%")
            print("=" * 70)

        # Sanity check on the report itself: every task lands in exactly one
        # of the three buckets.
        assert total_full + total_partial + total_fail == total
|
|
|
|
| |
if __name__ == "__main__":
    # Propagate pytest's exit code so that running this file directly reports
    # failures to the calling shell or CI job instead of always exiting 0.
    raise SystemExit(pytest.main([__file__, "-v", "--tb=short", "-s"]))