# agentV2 / test / test6.py
# drewli20200316's picture
# Add test/test6.py
# 2d0bb6a verified
"""
================================================================
医疗 RAG Agent — Cost & Efficiency 评测 (成本与效率)
================================================================
测试层级:
单元测试 (test1.py): 单工具调用准确性 ✅ 67 passed
集成测试 (test2.py): 多步骤工具链协作 ✅ 37 passed
回归测试 (test3.py): 防退化 & 边界守护 ✅ 52 passed
安全红队 (test4.py): 对抗性攻击防御 ✅ 45 passed
E2E完成率(test5.py): 端到端任务完成率 ✅ 60 passed
成本效率 (test6.py): Cost & Efficiency ← 当前文件
为什么要测成本?
Agent 每回答一个问题要: 1次 Embedding + 1次 Milvus + 1次 PDF +
2次 Cypher API + 1次 Neo4j + 1次 LLM = 至少 7 次外部调用
在生产环境中, 这些调用直接关系 token 消耗和 API 费用
测试维度:
维度 1: 外部调用次数审计 (每次查询调了几次 API?)
维度 2: Token 消耗估算 (Prompt + Response 共多少 token?)
维度 3: 缓存节省量化 (Redis 命中省了多少调用?)
维度 4: 降级场景的成本影响 (组件故障时成本变化)
维度 5: 成本报告 (人类可读的费用估算)
运行:
pytest test6.py -v --tb=short -s
pytest test6.py -v -k "call_count" # 调用次数
pytest test6.py -v -k "token" # Token 消耗
pytest test6.py -v -k "cache_saving" # 缓存节省
================================================================
"""
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
import types
import pytest
import json
import hashlib
import time
from unittest.mock import MagicMock, patch, call
from dataclasses import dataclass, field
from typing import Optional, List, Dict
# ================================================================
# 前置: Mock 缺失依赖
# ================================================================
def _ensure_mock_module(name):
if name not in sys.modules:
sys.modules[name] = MagicMock()
# Module names handed to _ensure_mock_module, which stubs every one that is
# not already present in sys.modules.
_MOCKED_MODULE_NAMES = (
    "langchain_classic",
    "langchain_classic.retrievers",
    "langchain_classic.retrievers.parent_document_retriever",
    "langchain_milvus",
    "langchain_text_splitters",
    "langchain_core",
    "langchain_core.stores",
    "langchain_core.documents",
    "langchain.embeddings",
    "langchain.embeddings.base",
    "neo4j",
    "dotenv",
    "uvicorn",
    "fastapi",
    "fastapi.middleware",
    "fastapi.middleware.cors",
)
for mod in _MOCKED_MODULE_NAMES:
    _ensure_mock_module(mod)
class _FakeEmbeddingsBase:
    """Empty stand-in published as ``langchain.embeddings.base.Embeddings``."""


# Presumably needed so that code subclassing ``Embeddings`` inherits from a
# real class rather than from a MagicMock attribute -- TODO confirm.
setattr(sys.modules["langchain.embeddings.base"], "Embeddings", _FakeEmbeddingsBase)
# ================================================================
# 基础设施
# ================================================================
@dataclass
class FakeDocument:
    """Lightweight document object exposing ``page_content`` and ``metadata``."""

    # Text payload returned by the mocked retrievers.
    page_content: str
    # Each instance gets its own fresh dict.
    metadata: dict = field(default_factory=dict)
class FakeChatResponse:
    """Minimal chat-completion response: ``choices[0].message.content`` holds the text."""

    def __init__(self, content):
        # Two throwaway classes are built per response; the payload is stored
        # as a class attribute and read through the instance.
        message_cls = type('Msg', (), {'content': content})
        choice_cls = type('Choice', (), {'message': message_cls()})
        self.choices = [choice_cls()]
class FakeRedisClient:
    """In-memory stand-in for the Redis client methods used by these tests.

    Values are kept in ``_store`` and the last requested TTL per key in
    ``_expiry``. TTLs are only recorded, never enforced: ``get`` does not
    expire anything.
    """

    def __init__(self):
        self._store = {}
        self._expiry = {}

    def ping(self):
        """Always report the connection as alive."""
        return True

    def get(self, key):
        """Return the stored value, or ``None`` when the key is absent."""
        return self._store.get(key)

    def set(self, key, value, ex=None, nx=False):
        """Store ``value``; with ``nx=True`` refuse to overwrite an existing key.

        Returns ``False`` when ``nx`` blocked the write, ``True`` otherwise.
        """
        if nx and key in self._store:
            return False
        self._store[key] = value
        if ex:
            self._expiry[key] = ex
        else:
            # A plain overwrite must not inherit the TTL of the previous value.
            self._expiry.pop(key, None)
        return True

    def setex(self, key, expire, value):
        """Store ``value`` and record ``expire`` as its TTL."""
        self._store[key] = value
        self._expiry[key] = expire
        return True

    def delete(self, key):
        """Remove ``key`` and its TTL; return 1 if the key existed, else 0."""
        existed = key in self._store
        self._store.pop(key, None)
        self._expiry.pop(key, None)
        return 1 if existed else 0

    def register_script(self, script):
        """Return a callable emulating a compare-and-delete unlock script.

        ``script`` is ignored. The callable deletes ``keys[0]`` only when its
        stored value equals ``args[0]`` and returns 1 on deletion, 0 otherwise.
        """
        def run_script(keys=None, args=None):
            if not keys or not args:
                return 0
            key = keys[0]
            # Require the key to exist so a missing key never matches a None token.
            if key in self._store and self._store[key] == args[0]:
                self.delete(key)
                return 1
            return 0
        return run_script
def make_redis_manager():
    """Build a ``RedisClientWrapper`` backed by ``FakeRedisClient``.

    The instance is created with ``object.__new__`` so the wrapper's
    ``__init__`` never runs; ``client`` and ``unlock_script`` are attached by hand.
    """
    from new_redis import RedisClientWrapper

    # Class-level placeholder; presumably stops the wrapper from building a
    # real connection pool -- TODO confirm against new_redis.
    RedisClientWrapper._pool = "FAKE"

    manager = object.__new__(RedisClientWrapper)
    fake_client = FakeRedisClient()
    manager.client = fake_client
    manager.unlock_script = fake_client.register_script("")
    return manager
# ================================================================
# 成本追踪器: 记录所有外部调用和资源消耗
# ================================================================
@dataclass
class CostTracker:
    """Collects the external calls and resource usage recorded for one query."""

    # Per-component call counters
    milvus_calls: int = 0
    pdf_calls: int = 0
    cypher_generate_calls: int = 0
    cypher_validate_calls: int = 0
    neo4j_session_calls: int = 0
    llm_calls: int = 0
    redis_get_calls: int = 0
    redis_set_calls: int = 0
    # Character counts feeding the token estimate (about 1.5 characters per token)
    prompt_chars: int = 0
    response_chars: int = 0
    # Start / end timestamps in seconds
    start_time: float = 0.0
    end_time: float = 0.0

    @property
    def total_external_calls(self) -> int:
        """Sum of the six non-Redis call counters."""
        counters = (
            self.milvus_calls,
            self.pdf_calls,
            self.cypher_generate_calls,
            self.cypher_validate_calls,
            self.neo4j_session_calls,
            self.llm_calls,
        )
        return sum(counters)

    @property
    def estimated_prompt_tokens(self) -> int:
        """Rough prompt token count: characters divided by 1.5, truncated."""
        if not self.prompt_chars:
            return 0
        return int(self.prompt_chars / 1.5)

    @property
    def estimated_response_tokens(self) -> int:
        """Rough response token count: characters divided by 1.5, truncated."""
        if not self.response_chars:
            return 0
        return int(self.response_chars / 1.5)

    @property
    def estimated_total_tokens(self) -> int:
        """Prompt plus response token estimates."""
        return self.estimated_prompt_tokens + self.estimated_response_tokens

    @property
    def elapsed_ms(self) -> float:
        """Milliseconds between ``start_time`` and ``end_time``; 0 while ``end_time`` is unset."""
        if not self.end_time:
            return 0
        return (self.end_time - self.start_time) * 1000

    def estimated_cost_usd(self, model="gpt-4o-mini") -> float:
        """Estimate the API cost of the query in USD.

        Rates are USD per one million tokens:
            gpt-4o-mini: 0.15 input, 0.60 output
            gpt-4o:      2.50 input, 10.00 output
        An unknown ``model`` is priced as gpt-4o-mini. A fixed embedding
        charge (50 tokens at 0.02 per million) is added on top.
        """
        price_table = {
            "gpt-4o-mini": {"input": 0.15, "output": 0.60},
            "gpt-4o": {"input": 2.50, "output": 10.00},
        }
        rates = price_table.get(model, price_table["gpt-4o-mini"])
        prompt_cost = self.estimated_prompt_tokens * rates["input"] / 1_000_000
        completion_cost = self.estimated_response_tokens * rates["output"] / 1_000_000
        # One embedding call per query, assumed to be about 50 tokens.
        embedding_cost = 50 * 0.02 / 1_000_000
        return prompt_cost + completion_cost + embedding_cost
def build_tracked_mocks(tracker: CostTracker, neo4j_fail=False):
    """Build mock components whose calls are counted on ``tracker``.

    Args:
        tracker: Object whose counters are incremented as the mocks are used.
        neo4j_fail: When true, every Cypher API ``post`` raises
            ``ConnectionError`` before anything is counted, and
            ``session.run`` raises after counting the attempt.

    Returns:
        The tuple ``(milvus, pdf, neo4j_driver, llm, req)``.
    """
    # Milvus: one canned document per search.
    milvus = MagicMock()

    def milvus_search(*args, **kwargs):
        tracker.milvus_calls += 1
        return [FakeDocument(page_content="高血压患者应控制钠摄入量不超过5克")]

    milvus.similarity_search.side_effect = milvus_search

    # PDF retriever: one canned document per invoke.
    pdf = MagicMock()

    def pdf_invoke(*args, **kwargs):
        tracker.pdf_calls += 1
        return [FakeDocument(page_content="《中国高血压防治指南》建议低盐低脂饮食")]

    pdf.invoke.side_effect = pdf_invoke

    # Neo4j driver: ``with driver.session() as s`` yields ``sess``.
    neo4j_driver = MagicMock()
    sess = MagicMock()

    def neo4j_run(*args, **kwargs):
        tracker.neo4j_session_calls += 1
        if neo4j_fail:
            raise Exception("Neo4j down")
        return [("氨氯地平",), ("缬沙坦",)]

    sess.run.side_effect = neo4j_run
    neo4j_driver.session.return_value.__enter__ = MagicMock(return_value=sess)
    neo4j_driver.session.return_value.__exit__ = MagicMock(return_value=False)

    # Cypher API, reached through a requests-like module.
    req = MagicMock()

    def req_post(url, *args, **kwargs):
        if neo4j_fail:
            raise ConnectionError("Cypher API down")
        resp = MagicMock()
        if "/generate" in url:
            tracker.cypher_generate_calls += 1
            resp.status_code = 200
            resp.json.return_value = {
                "cypher_query": "MATCH (d:Disease)-[:has_drug]->(m) RETURN m.name",
                "confidence": 0.95, "validated": True,
            }
        elif "/validate" in url:
            tracker.cypher_validate_calls += 1
            resp.status_code = 200
            resp.json.return_value = {"is_valid": True}
        else:
            # Unknown endpoint: answer with a 404 instead of implicitly returning None.
            resp.status_code = 404
        return resp

    req.post.side_effect = req_post

    # LLM: fixed answer; character counts accumulate like the call counters do.
    llm = MagicMock()

    def llm_create(*args, **kwargs):
        tracker.llm_calls += 1
        messages = kwargs.get("messages") or []
        # Count every message so multi-message prompts are not under-reported.
        tracker.prompt_chars += sum(len(m.get("content") or "") for m in messages)
        answer = "高血压患者应避免高盐饮食, 建议每日钠摄入不超过5克, 常用药物包括氨氯地平、缬沙坦等。"
        tracker.response_chars += len(answer)
        return FakeChatResponse(answer)

    llm.chat.completions.create.side_effect = llm_create
    return milvus, pdf, neo4j_driver, llm, req
def perform_rag_tracked(query, milvus, pdf, neo4j_driver, llm, requests_module):
    """Dependency-injected version of ``perform_rag_and_llm``.

    Gathers context from three best-effort sources (vector search, PDF
    retriever, knowledge graph), builds one prompt and returns the text of
    the LLM answer. A failure in any retrieval step contributes an empty
    string instead of aborting the query.
    """
    import json as _json

    # Source 1: vector search; hits are joined with blank lines.
    context = ""
    try:
        hits = milvus.similarity_search(query, k=10, ranker_type="rrf", ranker_params={"k": 100})
        if hits:
            context = "\n\n".join(doc.page_content for doc in hits)
    except Exception:
        context = ""

    # Source 2: PDF retriever; only the first document is used.
    pdf_res = ""
    try:
        pdf_docs = pdf.invoke(query)
        if pdf_docs and len(pdf_docs) >= 1:
            pdf_res = pdf_docs[0].page_content
    except Exception:
        pass
    context += "\n" + pdf_res

    # Source 3: generate a Cypher query, validate it, then run it on Neo4j.
    neo4j_res = ""
    try:
        gen_resp = requests_module.post(
            "http://0.0.0.0:8101/generate",
            _json.dumps({"natural_language_query": query}),
        )
        if gen_resp.status_code == 200:
            generated = gen_resp.json()
            # Accept only a non-empty, validated query with confidence >= 0.9.
            if generated["cypher_query"] and float(generated["confidence"]) >= 0.9 and generated["validated"]:
                check_resp = requests_module.post(
                    "http://0.0.0.0:8101/validate",
                    _json.dumps({"cypher_query": generated["cypher_query"]}),
                )
                if check_resp.status_code == 200 and check_resp.json()["is_valid"]:
                    with neo4j_driver.session() as session:
                        try:
                            rows = session.run(generated["cypher_query"])
                            neo4j_res = ','.join([row[0] for row in rows])
                        except Exception:
                            neo4j_res = ""
    except Exception:
        pass
    context += "\n" + neo4j_res

    system_part = "System: 你是一个非常得力的医学助手, 你可以通过从数据库中检索出的信息找到问题的答案."
    user_part = f"""User: 利用介于<context>和</context>之间的信息来回答问题.
<context>
{context}
</context>
<question>
{query}
</question>"""
    completion = llm.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": system_part + user_part}],
        temperature=0.7,
    )
    return completion.choices[0].message.content
def run_tracked_query(query="高血压不能吃什么?", neo4j_fail=False) -> CostTracker:
    """Run one query through the tracked pipeline and return its cost data.

    Args:
        query: Question passed to ``perform_rag_tracked``.
        neo4j_fail: Forwarded to ``build_tracked_mocks`` to simulate an outage.

    Returns:
        The ``CostTracker`` populated by the mocks, with timing filled in.
    """
    tracker = CostTracker()
    milvus, pdf, neo4j, llm, req = build_tracked_mocks(tracker, neo4j_fail=neo4j_fail)
    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # wall-clock adjustments the way time.time() differences can.
    tracker.start_time = time.perf_counter()
    try:
        perform_rag_tracked(query, milvus, pdf, neo4j, llm, req)
    finally:
        # Record the end even when the pipeline raises, so the timing stays meaningful.
        tracker.end_time = time.perf_counter()
    return tracker
# ================================================================
# 维度 1: 外部调用次数审计
# ================================================================
class TestExternalCallCount:
    """
    Core question: how many external API calls does answering one question take?
    Every extra call adds latency, cost and one more point of failure.
    """

    def test_normal_query_call_count(self):
        """Normal query: each component is called exactly once."""
        tracker = run_tracked_query()
        per_component = [
            ("Milvus", tracker.milvus_calls),
            ("PDF", tracker.pdf_calls),
            ("Cypher /generate", tracker.cypher_generate_calls),
            ("Cypher /validate", tracker.cypher_validate_calls),
            ("Neo4j session.run", tracker.neo4j_session_calls),
            ("LLM", tracker.llm_calls),
        ]
        for label, count in per_component:
            assert count == 1, f"{label} 应调 1 次, 实际 {count}"

    def test_total_external_calls_is_six(self):
        """Normal query: six external calls in total."""
        tracker = run_tracked_query()
        detail = (
            f"总外部调用应为 6, 实际 {tracker.total_external_calls}"
            f"\n Milvus={tracker.milvus_calls}, PDF={tracker.pdf_calls},"
            f" Cypher生成={tracker.cypher_generate_calls}, Cypher校验={tracker.cypher_validate_calls},"
            f" Neo4j={tracker.neo4j_session_calls}, LLM={tracker.llm_calls}"
        )
        assert tracker.total_external_calls == 6, detail

    def test_no_duplicate_llm_calls(self):
        """The LLM, the most expensive component, is called exactly once."""
        tracker = run_tracked_query()
        assert tracker.llm_calls == 1, f"LLM 不应重复调用, 实际 {tracker.llm_calls}"

    def test_neo4j_down_reduces_calls(self):
        """Neo4j outage: generate, validate and session calls all drop out."""
        tracker = run_tracked_query(neo4j_fail=True)
        assert tracker.cypher_generate_calls == 0, "Cypher API 不可用时不应有 /generate 调用"
        assert tracker.cypher_validate_calls == 0, "Cypher API 不可用时不应有 /validate 调用"
        assert tracker.neo4j_session_calls == 0, "Cypher API 不可用时不应有 session.run"
        remaining = tracker.total_external_calls
        assert remaining == 3, (
            f"Neo4j 宕机时总调用应为 3 (Milvus+PDF+LLM), 实际 {remaining}"
        )

    def test_multiple_queries_each_has_own_calls(self):
        """Several queries: each one is counted independently."""
        results = [run_tracked_query(f"问题{i}") for i in range(5)]
        for index, tracker in enumerate(results):
            assert tracker.llm_calls == 1, f"查询 {index}: LLM 调用应为 1"
            assert tracker.total_external_calls == 6, f"查询 {index}: 总调用应为 6"

    def test_embedding_call_per_milvus_search(self):
        """
        The embedding call is treated as implicit in each Milvus search, so
        only the number of Milvus searches is verified here.
        """
        tracker = run_tracked_query()
        # One Milvus search stands for one (implicit) embedding call.
        assert tracker.milvus_calls == 1, "每次查询应只触发 1 次 Milvus 搜索 (含 1 次 Embedding)"
# ================================================================
# 维度 2: Token 消耗估算
# ================================================================
class TestTokenConsumption:
    """
    Core question: how many tokens does each query consume?
    Tokens are the direct billing unit of the LLM.
    """

    def test_prompt_token_count_reasonable(self):
        """Prompt token estimate stays within 50-2000."""
        t = run_tracked_query()
        tokens = t.estimated_prompt_tokens
        assert 50 <= tokens <= 2000, f"Prompt tokens {tokens} 超出合理范围 [50, 2000]"

    def test_response_token_count_reasonable(self):
        """Response token estimate stays within 5-500."""
        t = run_tracked_query()
        tokens = t.estimated_response_tokens
        assert 5 <= tokens <= 500, f"Response tokens {tokens} 超出合理范围 [5, 500]"

    def test_total_token_count_per_query(self):
        """Total tokens for a single query stay below 3000."""
        t = run_tracked_query()
        total = t.estimated_total_tokens
        assert total < 3000, f"单次查询 token {total} 不应超过 3000"

    def test_prompt_is_largest_cost_component(self):
        """Prompt tokens make up more than 60% of all tokens."""
        t = run_tracked_query()
        total = t.estimated_total_tokens
        # Fail loudly instead of silently skipping the ratio check when nothing was counted.
        assert total > 0, "总 token 数应 > 0"
        prompt_ratio = t.estimated_prompt_tokens / total
        assert prompt_ratio > 0.6, (
            f"Prompt 占比 {prompt_ratio:.1%}, 应 >60% (context 是大头)"
        )

    def test_longer_query_means_more_tokens(self):
        """A longer question yields a strictly larger prompt."""
        t_short = run_tracked_query("高血压")
        t_long = run_tracked_query("请详细介绍高血压的所有相关症状以及对应的治疗方案和饮食建议")
        # The query is embedded in the prompt and the mocked context is the
        # same for both runs, so the longer question must give a longer prompt.
        assert t_long.prompt_chars > t_short.prompt_chars, (
            f"长问题 prompt ({t_long.prompt_chars}) 应 > 短问题 ({t_short.prompt_chars})"
        )

    def test_context_contributes_most_tokens(self):
        """Retrieved context accounts for a meaningful share of the prompt."""
        t = run_tracked_query()
        # NOTE(review): 120 is a rough estimate of the fixed template size and
        # ignores the query length; the 30% threshold below depends on it.
        pure_template = 120
        context_chars = t.prompt_chars - pure_template
        assert context_chars > 0, "Context 应为 prompt 贡献内容"
        context_ratio = context_chars / t.prompt_chars
        assert context_ratio > 0.3, (
            f"Context 占 prompt 比例 {context_ratio:.1%}, 应 >30%"
            f"\n (Mock 数据较短; 生产环境 context 占比通常 >70%)"
        )
# ================================================================
# 维度 3: 缓存节省量化
# ================================================================
class TestCacheSavings:
    """
    Core question: how much does the Redis cache save?
    Every cache hit avoids the six external calls of a full RAG run.
    """

    @staticmethod
    def _query_through_cache(mgr, question):
        """Send ``question`` through ``mgr.get_or_compute`` with fresh tracked mocks.

        Returns the ``CostTracker`` of this attempt; it stays at zero when the
        compute callback was never invoked.
        """
        tracker = CostTracker()
        milvus, pdf, neo4j, llm, req = build_tracked_mocks(tracker)
        # The callback is created per call, so it cannot pick up variables
        # from a later loop iteration in the caller.
        mgr.get_or_compute(
            question,
            lambda: perform_rag_tracked(question, milvus, pdf, neo4j, llm, req),
        )
        return tracker

    def test_cache_hit_saves_all_external_calls(self):
        """Cache hit: zero external calls instead of six."""
        mgr = make_redis_manager()
        # First request: miss, runs the RAG pipeline.
        first_tracker = self._query_through_cache(mgr, "高血压")
        assert first_tracker.total_external_calls == 6
        # Second request: hit, the pipeline must not run.
        second_tracker = self._query_through_cache(mgr, "高血压")
        assert second_tracker.total_external_calls == 0, (
            f"缓存命中时不应有外部调用, 实际 {second_tracker.total_external_calls}"
        )

    def test_cache_saves_llm_cost(self):
        """Cache hit: the LLM call is skipped."""
        mgr = make_redis_manager()
        first_t = self._query_through_cache(mgr, "Q1")
        second_t = self._query_through_cache(mgr, "Q1")
        assert first_t.llm_calls == 1, "第一次应调 LLM"
        assert second_t.llm_calls == 0, "第二次缓存命中, 不应调 LLM"

    def test_ten_queries_same_question_only_one_rag(self):
        """Asking the same question ten times runs the pipeline once."""
        mgr = make_redis_manager()
        total_llm_calls = sum(
            self._query_through_cache(mgr, "重复问题").llm_calls for _ in range(10)
        )
        assert total_llm_calls == 1, f"10 次查询只应调 1 次 LLM, 实际 {total_llm_calls}"

    def test_cache_saving_ratio_over_batch(self):
        """Batch with 50% repeats: roughly half of the external calls are saved."""
        mgr = make_redis_manager()
        questions = ["Q1", "Q2", "Q3", "Q4", "Q5"] * 2  # 10 queries, 5 distinct questions
        total_external = sum(
            self._query_through_cache(mgr, q).total_external_calls for q in questions
        )
        # 5 unique questions x 6 calls = 30; the 5 repeats cost nothing.
        no_cache_total = len(questions) * 6  # 60 calls without a cache
        saving_ratio = 1 - (total_external / no_cache_total)
        assert saving_ratio >= 0.4, (
            f"缓存节省率 {saving_ratio:.1%}, 预期 ≥40%"
            f"\n 实际总调用: {total_external}, 无缓存总调用: {no_cache_total}"
        )

    def test_cache_saving_dollar_estimate(self):
        """Estimate the dollar amount saved by the cache."""
        t = run_tracked_query()
        cost_per_query = t.estimated_cost_usd()
        # Scenario: 1000 queries per day with a 50% hit rate.
        daily_queries = 1000
        hit_rate = 0.5
        daily_cost_no_cache = daily_queries * cost_per_query
        daily_cost_with_cache = daily_queries * (1 - hit_rate) * cost_per_query
        daily_savings = daily_cost_no_cache - daily_cost_with_cache
        # Only the arithmetic is checked.
        assert daily_savings > 0, "缓存应节省费用"
        # Compare floats with a tolerance; exact equality only holds by luck of rounding.
        assert daily_savings == pytest.approx(daily_cost_no_cache * hit_rate)
# ================================================================
# 维度 4: 降级场景的成本影响
# ================================================================
class TestDegradedCost:
    """
    A failing component affects cost as well as quality: partial degradation
    means fewer calls and a lower bill, at the price of a weaker answer.
    """

    def test_neo4j_down_saves_three_calls(self):
        """Neo4j outage: three calls fewer (generate + validate + session)."""
        healthy = run_tracked_query(neo4j_fail=False)
        degraded = run_tracked_query(neo4j_fail=True)
        saved = healthy.total_external_calls - degraded.total_external_calls
        assert saved == 3, f"Neo4j 宕机应节省 3 次调用, 实际节省 {saved}"

    def test_degraded_cost_is_lower(self):
        """Degraded run: the prompt carries no Neo4j context, so it is not longer."""
        healthy = run_tracked_query(neo4j_fail=False)
        degraded = run_tracked_query(neo4j_fail=True)
        # Without the Neo4j result the prompt cannot grow.
        message = f"降级时 prompt 应更短: 降级={degraded.prompt_chars}, 正常={healthy.prompt_chars}"
        assert degraded.prompt_chars <= healthy.prompt_chars, message

    def test_llm_still_called_once_even_when_degraded(self):
        """Degraded run: the LLM is still called exactly once."""
        degraded = run_tracked_query(neo4j_fail=True)
        assert degraded.llm_calls == 1, "降级时 LLM 仍应只调 1 次"

    def test_cost_comparison_normal_vs_degraded(self):
        """Cost of a normal run versus a degraded run."""
        cost_normal = run_tracked_query(neo4j_fail=False).estimated_cost_usd()
        cost_degraded = run_tracked_query(neo4j_fail=True).estimated_cost_usd()
        # Less context means the degraded run must not cost more.
        message = f"降级费用 ${cost_degraded:.6f} 应 ≤ 正常费用 ${cost_normal:.6f}"
        assert cost_degraded <= cost_normal, message
# ================================================================
# 维度 5: 成本效率报告
# ================================================================
class TestCostEfficiencyReport:
    """Produce a human-readable cost and efficiency report."""

    def test_single_query_cost_breakdown(self):
        """Cost breakdown of a single query."""
        t = run_tracked_query()
        assert t.total_external_calls > 0
        assert t.estimated_total_tokens > 0
        assert t.estimated_cost_usd() >= 0

    def test_batch_efficiency_metrics(self):
        """Efficiency metrics averaged over a batch of queries."""
        trackers = [run_tracked_query(f"问题{i}") for i in range(10)]
        avg_calls = sum(t.total_external_calls for t in trackers) / len(trackers)
        avg_tokens = sum(t.estimated_total_tokens for t in trackers) / len(trackers)
        avg_cost = sum(t.estimated_cost_usd() for t in trackers) / len(trackers)
        assert avg_calls == 6, f"平均调用次数应为 6, 实际 {avg_calls}"
        assert avg_tokens > 0, "平均 token 应 > 0"
        assert avg_cost > 0, "平均费用应 > 0"

    def test_model_cost_comparison(self):
        """Cost comparison between gpt-4o-mini and gpt-4o."""
        t = run_tracked_query()
        cost_mini = t.estimated_cost_usd("gpt-4o-mini")
        cost_4o = t.estimated_cost_usd("gpt-4o")
        assert cost_4o > cost_mini, "gpt-4o 应比 gpt-4o-mini 贵"
        ratio = cost_4o / cost_mini if cost_mini > 0 else float('inf')
        assert ratio > 5, f"gpt-4o 应比 mini 贵 5 倍以上, 实际 {ratio:.1f} 倍"

    def test_cost_report_printout(self, capsys):
        """Print the full report and check that its key figures are present."""
        query = "高血压不能吃什么?"
        t = run_tracked_query(query)
        print("\n")
        print("=" * 70)
        print(" 医疗 RAG Agent — Cost & Efficiency 报告")
        print("=" * 70)
        print(f"\n 📋 查询: '{query}'")
        print("\n ── 外部调用明细 ──")
        print(f" Milvus 向量搜索: {t.milvus_calls} 次")
        print(f" PDF 父子检索: {t.pdf_calls} 次")
        print(f" Cypher /generate: {t.cypher_generate_calls} 次")
        print(f" Cypher /validate: {t.cypher_validate_calls} 次")
        print(f" Neo4j session.run: {t.neo4j_session_calls} 次")
        print(f" LLM 推理: {t.llm_calls} 次")
        print(" ────────────────────────────")
        print(f" 总外部调用: {t.total_external_calls} 次")
        print("\n ── Token 消耗 ──")
        print(f" Prompt: ~{t.estimated_prompt_tokens} tokens ({t.prompt_chars} 字符)")
        print(f" Response: ~{t.estimated_response_tokens} tokens ({t.response_chars} 字符)")
        print(f" 总计: ~{t.estimated_total_tokens} tokens")
        print("\n ── 费用估算 (per query) ──")
        print(f" gpt-4o-mini: ${t.estimated_cost_usd('gpt-4o-mini'):.6f}")
        print(f" gpt-4o: ${t.estimated_cost_usd('gpt-4o'):.6f}")
        # Monthly projection
        daily = 1000
        monthly = daily * 30
        hit_rate = 0.5
        effective_queries = monthly * (1 - hit_rate)
        print(f"\n ── 月度预估 (日均 {daily} 查询, 缓存命中率 {hit_rate:.0%}) ──")
        print(f" 有效 LLM 调用: {int(effective_queries)} 次/月")
        print(f" gpt-4o-mini 月费: ${effective_queries * t.estimated_cost_usd('gpt-4o-mini'):.2f}")
        print(f" gpt-4o 月费: ${effective_queries * t.estimated_cost_usd('gpt-4o'):.2f}")
        print(f" 缓存节省: {hit_rate:.0%} ({int(monthly * hit_rate)} 次 LLM 调用)")
        # Normal versus degraded comparison
        t_deg = run_tracked_query(neo4j_fail=True)
        print("\n ── 降级场景对比 ──")
        print(f" 正常: {t.total_external_calls} 次调用, ~{t.estimated_total_tokens} tokens, ${t.estimated_cost_usd():.6f}")
        print(f" 降级: {t_deg.total_external_calls} 次调用, ~{t_deg.estimated_total_tokens} tokens, ${t_deg.estimated_cost_usd():.6f}")
        print("=" * 70)

        # readouterr() consumes the captured text, so it is printed again to
        # keep the report in the test output.
        report = capsys.readouterr().out
        print(report, end="")
        assert f"查询: '{query}'" in report
        assert f"总外部调用: {t.total_external_calls} 次" in report
        assert f"总计: ~{t.estimated_total_tokens} tokens" in report
        assert "gpt-4o-mini 月费" in report
        assert f"降级: {t_deg.total_external_calls} 次调用" in report
# ================================================================
if __name__ == "__main__":
    # Propagate pytest's exit status so a failing run ends with a non-zero exit code.
    raise SystemExit(pytest.main([__file__, "-v", "--tb=short", "-s"]))