"""
Smoke test: proves the system is dynamic by modifying a source doc,
recompiling, and verifying that skills and agent answers change.
Usage:
python scripts/smoke_test.py
Requires: backend running on http://localhost:8080
"""
import requests
import time
import sys
import os
import json
# Backend base URL and the company id sent with every request in this script.
API = "http://localhost:8080"
COMPANY = "rivanly-inc"

# Directory two levels above this file (the parent of the script's folder).
BASE_DIR = os.path.abspath(
    os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir)
)

# Source document that the dynamic-policy test rewrites and later restores.
SOP_PATH = os.path.join(
    BASE_DIR,
    "data",
    "sources",
    COMPANY,
    "notion_refund_sop.md",
)
def check_health():
    """Probe the backend's ``/health`` endpoint and print its component statuses.

    Returns:
        True when the endpoint answers with HTTP 200 and the expected keys.

    Raises:
        AssertionError: if the endpoint answers with a non-200 status.
        requests.RequestException: if the backend cannot be reached or does
            not answer within the timeout.
        KeyError: if the JSON body lacks ``status``, ``vllm`` or ``database``.
    """
    print("1. Checking API health...")
    # requests never times out by default; bound the wait so a hung backend
    # is reported as unreachable instead of blocking the smoke test forever.
    r = requests.get(f"{API}/health", timeout=10)
    # Raised explicitly (not via ``assert``) so the check survives
    # ``python -O``, while keeping the exception type callers already see.
    if r.status_code != 200:
        raise AssertionError(f"Health check failed: {r.text}")
    data = r.json()
    print(f" API: {data['status']}, vLLM: {data['vllm']}, DB: {data['database']}")
    return True
def read_sop():
    """Return the entire text of the file at ``SOP_PATH``, decoded as UTF-8."""
    with open(SOP_PATH, mode="r", encoding="utf-8") as handle:
        text = handle.read()
    return text
def write_sop(content: str):
    """Overwrite the file at ``SOP_PATH`` with ``content``, encoded as UTF-8."""
    with open(SOP_PATH, mode="w", encoding="utf-8") as handle:
        handle.write(content)
def compile_and_wait():
    """Trigger a compilation for ``COMPANY`` and poll until it finishes.

    The job status is polled every 5 seconds for at most 60 attempts. Once
    the job reports ``complete`` the skills payload is fetched and returned.
    Transient polling failures are reported and retried on the next attempt.

    Returns:
        The decoded JSON body of ``GET /skills/{COMPANY}``.

    Raises:
        AssertionError: if the compile request is not answered with HTTP 200.
        RuntimeError: if the job reports status ``error``.
        TimeoutError: if no skills payload was obtained after all attempts.
        requests.RequestException: if the initial compile request itself
            fails or times out.
    """
    # requests never times out by default; without a bound, a single hung
    # call would defeat the polling limit below.
    request_timeout = 30

    print(" Triggering compilation...")
    r = requests.post(
        f"{API}/compile",
        json={"company_id": COMPANY},
        timeout=request_timeout,
    )
    # Raised explicitly (not via ``assert``) so the check survives
    # ``python -O``, while keeping the exception type unchanged.
    if r.status_code != 200:
        raise AssertionError(f"Compile failed: {r.text}")
    job_id = r.json()["job_id"]
    print(f" Job ID: {job_id}")

    status_url = f"{API}/compile/{job_id}/status"

    for attempt in range(60):  # 60 polls, 5 seconds apart
        time.sleep(5)
        try:
            status_req = requests.get(status_url, timeout=request_timeout)
            if status_req.status_code == 200:
                job_info = status_req.json()
                if job_info.get("status") == "error":
                    print(f" [ERROR] Job failed: {job_info.get('error_detail')}")
                    raise RuntimeError(
                        f"Compilation job failed: {job_info.get('error_detail')}"
                    )
                if job_info.get("status") == "complete":
                    sk = requests.get(
                        f"{API}/skills/{COMPANY}", timeout=request_timeout
                    )
                    if sk.status_code == 200:
                        data = sk.json()
                        skills = data.get("skills", [])
                        print(f" Compilation produced {len(skills)} skills")
                        return data
                    # Previously retried silently; say why we keep waiting.
                    print(
                        f" [WARN] Job complete but skills fetch returned {sk.status_code}; retrying"
                    )
        except RuntimeError:
            # A failed job is final: never retry it.
            raise
        except Exception as e:
            # Polling is deliberately best-effort, but report the failure
            # instead of swallowing it so a stuck run can be diagnosed.
            print(f" [WARN] Status poll failed, will retry: {e}")
        print(f" Waiting... ({(attempt + 1) * 5}s)")

    # All attempts used up. Fetch the last known status for the error message.
    final_status = "Unknown"
    final_error = "None"
    try:
        status_req = requests.get(status_url, timeout=request_timeout)
        if status_req.status_code == 200:
            job_info = status_req.json()
            final_status = job_info.get("status", "Unknown")
            final_error = job_info.get("error_detail", "None")
    except Exception:
        # Purely diagnostic lookup; the TimeoutError below is the real
        # signal, so fall back to the placeholder values.
        pass
    raise TimeoutError(
        f"Compilation did not complete within 5 minutes. Final status: {final_status}, Error: {final_error}"
    )
def get_skills():
    """Fetch the skills payload for ``COMPANY`` and return the decoded JSON.

    An ``assert`` rejects any response whose status is not HTTP 200.
    """
    skills_url = f"{API}/skills/{COMPANY}"
    response = requests.get(skills_url)
    assert response.status_code == 200, f"Skills fetch failed: {response.text}"
    payload = response.json()
    return payload
def query_agent(scenario: str, context: dict = None):
    """POST a scenario to ``/agent/query`` and return the decoded JSON reply.

    Args:
        scenario: Free-text scenario, sent as ``scenario_text``.
        context: Optional extra data, sent as ``json_context``; a falsy
            value is replaced by an empty dict.

    An ``assert`` rejects any response whose status is not HTTP 200.
    """
    payload = {
        "company_id": COMPANY,
        "scenario_text": scenario,
        "json_context": context if context else {},
    }
    response = requests.post(f"{API}/agent/query", json=payload)
    assert response.status_code == 200, f"Agent query failed: {response.text}"
    return response.json()
def test_gibberish():
    """Send a nonsense scenario to the agent and report its confidence.

    Prints ``[PASS]`` when the reported confidence is below 0.4 and ``[WARN]``
    otherwise. A missing confidence counts as 1.0. Nothing is raised on a
    high confidence; the outcome is only printed.
    """
    print("\n3. Testing gibberish rejection...")
    reply = query_agent("blah blah blah fafa asdfasdf")
    score = reply.get("confidence", 1.0)
    print(f" Gibberish confidence: {score}")
    print(f" Action: {reply.get('recommended_action', 'N/A')}")

    if not score < 0.4:
        print(
            f" [WARN] Confidence {score} is higher than expected for gibberish"
        )
        return
    print(" [PASS] Low confidence for gibberish")
def test_dynamic_policy_change():
    """Modify the refund SOP, recompile, and check that the change propagates.

    Steps:
        A. Compile with the original SOP and snapshot the skills payload.
        B. Ask the agent about a refund requested after 45 days.
        C. Rewrite the SOP so the "30 day" window becomes "60 day".
        D. Recompile and snapshot the skills payload again.
        E. Ask the agent the same question again.
        F. Restore the original SOP.

    Step F runs in a ``finally`` clause, so the source document is restored
    even when a compile or agent call fails partway through. Results are
    printed as ``[PASS]`` / ``[FAIL]`` / ``[WARN]`` lines; nothing is returned.

    Raises:
        Whatever ``read_sop``, ``write_sop``, ``compile_and_wait`` or
        ``query_agent`` raise. The SOP is restored first once it has been read.
    """
    print("\n4. Testing dynamic policy change...")
    # Save original SOP
    original_sop = read_sop()
    print(f" Original SOP loaded ({len(original_sop)} chars)")

    # The same question is asked before and after the policy edit.
    refund_scenario = "Customer requesting a refund after 45 days"
    refund_context = {"plan": "annual", "days_since_purchase": 45, "tenure_months": 6}

    try:
        # Compile with original SOP (this may already be done)
        print("\n Step A: Compile with ORIGINAL policy...")
        skills_v1 = compile_and_wait()
        skills_v1_text = json.dumps(skills_v1)

        # Query the agent about refunds with original policy
        print("\n Step B: Query agent about refunds (original policy)...")
        result_v1 = query_agent(refund_scenario, dict(refund_context))
        print(f" v1 action: {result_v1.get('recommended_action')}")
        print(f" v1 rule: {result_v1.get('rule_applied', 'N/A')}")

        # Now modify the SOP - change the refund window
        print("\n Step C: Modifying SOP (changing refund window)...")
        modified_sop = (
            original_sop.replace("30 day", "60 day")
            .replace("30-day", "60-day")
            .replace("30 days", "60 days")
        )
        if modified_sop == original_sop:
            # None of the phrasings matched: fall back to replacing every "30".
            modified_sop = original_sop.replace("30", "60")
        write_sop(modified_sop)
        print(" SOP modified: 30 -> 60 days")

        # Recompile
        print("\n Step D: Recompiling with MODIFIED policy...")
        skills_v2 = compile_and_wait()
        skills_v2_text = json.dumps(skills_v2)

        # Check that skills actually changed
        changed = skills_v1_text != skills_v2_text
        print(f"\n Skills changed after recompile: {changed}")

        # Query the agent again
        print("\n Step E: Query agent about refunds (modified policy)...")
        result_v2 = query_agent(refund_scenario, dict(refund_context))
        print(f" v2 action: {result_v2.get('recommended_action')}")
        print(f" v2 rule: {result_v2.get('rule_applied', 'N/A')}")

        # Check for the policy change in v2
        v2_mentions_60 = "60" in json.dumps(result_v2)
        print(f" v2 references '60': {v2_mentions_60}")

        # Check if actions actually changed based on policy
        v1_action_lower = str(result_v1.get("recommended_action", "")).lower()
        v2_action_lower = str(result_v2.get("recommended_action", "")).lower()

        # Under 30 days limit (v1), 45 days should be denied/not allowed
        # Under 60 days limit (v2), 45 days should be approved/prorated
        v1_denied = any(
            phrase in v1_action_lower
            for phrase in ("deny", "no refund", "not eligible", "cannot")
        )
        v2_approved = any(
            phrase in v2_action_lower
            for phrase in ("approve", "prorated", "allow")
        )
        policy_executed_correctly = v1_denied and v2_approved
        print(
            f" Policy execution behavior changed appropriately (Deny -> Approve): {policy_executed_correctly}"
        )
    finally:
        # Restore original SOP. This must happen even if a step above
        # raised, otherwise the modified policy stays on disk and taints
        # every later compilation.
        print("\n Step F: Restoring original SOP...")
        write_sop(original_sop)
        print(" Original SOP restored.")

    # Final verdict
    print("\n --- RESULTS ---")
    if changed:
        print(" [PASS] Skills changed after source modification and recompile")
    else:
        print(" [FAIL] Skills did NOT change - system may still be static")
    if policy_executed_correctly:
        print(
            " [PASS] Agent correctly executed the policy change (Denied at 45 days under 30-day SOP, Approved under 60-day SOP!)"
        )
    elif v2_mentions_60:
        print(" [PASS] Agent response reflects the modified policy (60 days)")
    else:
        print(
            " [WARN] Agent response did not change behavior or mention the new policy"
        )
def test_semantic_diff():
    """Exercise the ``/diff/{v1}/{v2}`` endpoint on the first two listed versions.

    Prints ``[SKIP]`` when the version history cannot be fetched or has fewer
    than two entries, ``[FAIL]`` when the diff endpoint does not answer with
    HTTP 200, and otherwise ``[PASS]`` / ``[WARN]`` depending on whether any
    summary counter is positive.
    """
    print("\n5. Testing semantic diff engine...")

    # Get version history
    history_resp = requests.get(f"{API}/brain/versions/{COMPANY}")
    if history_resp.status_code != 200:
        print(" [SKIP] Could not fetch version history")
        return

    history = history_resp.json().get("versions", [])
    if len(history) < 2:
        print(" [SKIP] Need at least 2 compiled versions for diff")
        return

    # Entry 1 is compared against entry 0 -- presumably the list is
    # newest-first; verify against the backend.
    v1 = history[1]["version"]
    v2 = history[0]["version"]
    print(f" Comparing {v1} → {v2}")

    diff_resp = requests.get(
        f"{API}/diff/{v1}/{v2}", params={"company_id": COMPANY}
    )
    if diff_resp.status_code != 200:
        print(
            f" [FAIL] Diff endpoint returned {diff_resp.status_code}: {diff_resp.text}"
        )
        return

    summary = diff_resp.json().get("summary", {})
    added = summary.get("added_count", 0)
    deleted = summary.get("deleted_count", 0)
    modified = summary.get("modified_count", 0)
    print(f" Added: {added}, Deleted: {deleted}, Modified: {modified}")
    print(f" Confidence shifts: {summary.get('confidence_shift_count', 0)}")
    print(
        f" V1 skills: {summary.get('v1_skills', 0)} → V2 skills: {summary.get('v2_skills', 0)}"
    )

    change_counters = (
        "added_count",
        "modified_count",
        "deleted_count",
        "confidence_shift_count",
    )
    if any(summary.get(counter, 0) > 0 for counter in change_counters):
        print(" [PASS] Semantic diff detected changes between versions")
    else:
        print(
            " [WARN] Diff returned no changes — may indicate skills didn't change or diff has a bug"
        )
def main():
    """Run the smoke test end to end, printing a report to stdout.

    Order: health check, initial compilation, gibberish rejection, dynamic
    policy change, semantic diff. A failing health check or initial
    compilation exits the process with status 1. Failures in the later
    tests are printed and the run continues.

    The SOP file is snapshotted before the dynamic-policy test and written
    back if that test raises, so a failed run does not leave the modified
    policy on disk.
    """
    banner = "=" * 60
    print(banner)
    print("KERNL SMOKE TEST — Proving the system is dynamic")
    print(banner)

    try:
        check_health()
    except Exception as e:
        print(f" [FATAL] API not reachable: {e}")
        print(
            " Make sure backend is running: python -m uvicorn backend.main:app --port 8080"
        )
        sys.exit(1)

    # Test 1: Compile and get skills
    print("\n2. Initial compilation...")
    try:
        skills = compile_and_wait()
        print(f" Got {len(skills.get('skills', []))} skills")
    except Exception as e:
        print(f" [ERROR] Compilation failed: {e}")
        sys.exit(1)

    # Test 2: Gibberish rejection
    try:
        test_gibberish()
    except Exception as e:
        print(f" [ERROR] Gibberish test failed: {e}")

    # Test 3: Dynamic policy change
    original_sop = None
    try:
        # Snapshot first so the handler below has something to restore.
        original_sop = read_sop()
        test_dynamic_policy_change()
    except Exception as e:
        print(f" [ERROR] Dynamic test failed: {e}")
        # Make sure we restore the SOP. The old code only printed the
        # message below and never wrote anything back.
        if original_sop is not None:
            print(" Attempting to restore original SOP...")
            try:
                write_sop(original_sop)
                print(" Original SOP restored.")
            except OSError as restore_error:
                print(f" [ERROR] Could not restore SOP: {restore_error}")

    # Test 4: Semantic diff
    try:
        test_semantic_diff()
    except Exception as e:
        print(f" [ERROR] Diff test failed: {e}")

    print("\n" + banner)
    print("SMOKE TEST COMPLETE")
    print(banner)


if __name__ == "__main__":
    main()