File size: 11,024 Bytes
0762fba
 
 
 
 
 
 
 
 
a688aff
0762fba
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a688aff
0762fba
 
 
 
 
 
 
a688aff
 
 
0762fba
 
 
 
 
 
 
 
 
 
 
 
 
a688aff
0762fba
 
 
 
 
 
 
 
 
 
 
 
 
a688aff
 
 
0762fba
 
 
 
 
 
 
 
 
a688aff
 
 
 
 
 
 
 
0762fba
 
 
 
 
 
 
 
 
 
 
 
 
 
a688aff
 
 
0762fba
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a688aff
0762fba
 
 
 
 
 
a688aff
 
 
 
 
0762fba
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a688aff
0762fba
 
 
 
 
 
 
 
 
 
 
a688aff
0762fba
 
a688aff
 
 
 
 
 
 
 
 
 
 
 
 
0762fba
 
 
 
 
 
 
 
 
 
 
 
 
 
a688aff
 
 
0762fba
 
 
a688aff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0762fba
 
 
 
 
 
 
 
 
 
 
a688aff
 
 
0762fba
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a688aff
 
 
 
 
 
0762fba
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
"""
Smoke test: proves the system is dynamic by modifying a source doc,
recompiling, and verifying that skills and agent answers change.

Usage:
    python scripts/smoke_test.py

Requires: backend running on http://localhost:8080
"""

import requests
import time
import sys
import os
import json

# Base URL of the backend under test, and the company whose sources are compiled.
API = "http://localhost:8080"
COMPANY = "rivanly-inc"

# BASE_DIR is the parent of the directory that contains this script; the
# refund SOP below it is the source document the smoke test edits.
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
BASE_DIR = os.path.dirname(_SCRIPT_DIR)
SOP_PATH = os.path.join(
    BASE_DIR, "data", "sources", COMPANY, "notion_refund_sop.md"
)


def check_health(timeout: float = 10.0):
    """Verify that the backend's ``/health`` endpoint answers with HTTP 200.

    Prints the ``status``, ``vllm`` and ``database`` fields of the JSON reply;
    ``unknown`` is shown for any field the reply omits.

    Args:
        timeout: Seconds to wait for the HTTP response, so a hung backend
            cannot stall the smoke test indefinitely.

    Returns:
        True when the endpoint answered with status 200.

    Raises:
        AssertionError: If the endpoint answered with a non-200 status.
        requests.RequestException: If the request fails or times out.
    """
    print("1. Checking API health...")
    r = requests.get(f"{API}/health", timeout=timeout)
    # Raised explicitly (not via `assert`) so the check survives `python -O`.
    if r.status_code != 200:
        raise AssertionError(f"Health check failed: {r.text}")
    data = r.json()
    # .get() keeps a missing field from surfacing as a misleading KeyError.
    print(
        f"   API: {data.get('status', 'unknown')}, "
        f"vLLM: {data.get('vllm', 'unknown')}, "
        f"DB: {data.get('database', 'unknown')}"
    )
    return True


def read_sop():
    """Return the full text of the SOP file at ``SOP_PATH`` (decoded as UTF-8)."""
    with open(SOP_PATH, encoding="utf-8") as sop_file:
        text = sop_file.read()
    return text


def write_sop(content: str):
    """Overwrite the SOP file at ``SOP_PATH`` with *content*, encoded as UTF-8."""
    with open(SOP_PATH, mode="w", encoding="utf-8") as sop_file:
        sop_file.write(content)


def _get_json_or_none(url: str, timeout: float):
    """GET *url* and return its JSON object, or None on any failure.

    A transport error, a non-200 status, an undecodable body, or a body that
    is not a JSON object all yield None so the caller can simply retry.
    """
    try:
        resp = requests.get(url, timeout=timeout)
        if resp.status_code != 200:
            return None
        payload = resp.json()
    except (requests.RequestException, ValueError):
        return None
    return payload if isinstance(payload, dict) else None


def compile_and_wait(
    poll_interval: float = 5,
    max_attempts: int = 60,
    request_timeout: float = 30.0,
):
    """Trigger a compilation for ``COMPANY`` and poll until it completes.

    Posts to ``/compile``, then repeatedly checks
    ``/compile/{job_id}/status``. Once the job reports ``complete`` the
    current ``/skills/{COMPANY}`` payload is fetched and returned. Failed
    status or skills requests are treated as transient and polling continues.

    Args:
        poll_interval: Seconds to sleep before each status check.
        max_attempts: Maximum number of status checks before giving up.
        request_timeout: Per-request timeout in seconds; without it a single
            hung request would defeat the overall polling limit.

    Returns:
        The decoded JSON object returned by ``/skills/{COMPANY}``.

    Raises:
        AssertionError: If the ``/compile`` request returns a non-200 status.
        RuntimeError: If the job reports status ``error``.
        TimeoutError: If the job is not complete after ``max_attempts`` polls.
        requests.RequestException: If the initial ``/compile`` request fails.
    """
    print("   Triggering compilation...")
    r = requests.post(
        f"{API}/compile", json={"company_id": COMPANY}, timeout=request_timeout
    )
    # Raised explicitly (not via `assert`) so the check survives `python -O`.
    if r.status_code != 200:
        raise AssertionError(f"Compile failed: {r.text}")
    job_id = r.json()["job_id"]
    print(f"   Job ID: {job_id}")

    status_url = f"{API}/compile/{job_id}/status"
    for attempt in range(max_attempts):
        time.sleep(poll_interval)

        job_info = _get_json_or_none(status_url, request_timeout)
        status = job_info.get("status") if job_info else None

        if status == "error":
            detail = job_info.get("error_detail")
            print(f"   [ERROR] Job failed: {detail}")
            raise RuntimeError(f"Compilation job failed: {detail}")

        if status == "complete":
            data = _get_json_or_none(f"{API}/skills/{COMPANY}", request_timeout)
            # If the skills fetch failed, fall through and retry next round.
            if data is not None:
                skills = data.get("skills") or []
                print(f"   Compilation produced {len(skills)} skills")
                return data

        print(f"   Waiting... ({(attempt + 1) * poll_interval:g}s)")

    # Polling budget exhausted: grab one last status for the error message.
    job_info = _get_json_or_none(status_url, request_timeout) or {}
    final_status = job_info.get("status", "Unknown")
    final_error = job_info.get("error_detail", "None")

    raise TimeoutError(
        f"Compilation did not complete within {max_attempts * poll_interval / 60:g} minutes. "
        f"Final status: {final_status}, Error: {final_error}"
    )


def get_skills(timeout: float = 30.0):
    """Fetch the current skills payload for ``COMPANY``.

    Args:
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The decoded JSON body of ``GET /skills/{COMPANY}``.

    Raises:
        AssertionError: If the endpoint answered with a non-200 status.
        requests.RequestException: If the request fails or times out.
    """
    r = requests.get(f"{API}/skills/{COMPANY}", timeout=timeout)
    # Raised explicitly (not via `assert`) so the check survives `python -O`.
    if r.status_code != 200:
        raise AssertionError(f"Skills fetch failed: {r.text}")
    return r.json()


def query_agent(scenario: str, context: dict = None, timeout: float = 120.0):
    """Send a scenario to ``POST /agent/query`` and return the agent's reply.

    Args:
        scenario: Free-text scenario, sent as ``scenario_text``.
        context: Optional structured context, sent as ``json_context``;
            an empty dict is sent when omitted.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The decoded JSON body of the response.

    Raises:
        AssertionError: If the endpoint answered with a non-200 status.
        requests.RequestException: If the request fails or times out.
    """
    payload = {
        "company_id": COMPANY,
        "scenario_text": scenario,
        "json_context": context or {},
    }
    r = requests.post(f"{API}/agent/query", json=payload, timeout=timeout)
    # Raised explicitly (not via `assert`) so the check survives `python -O`.
    if r.status_code != 200:
        raise AssertionError(f"Agent query failed: {r.text}")
    return r.json()


def test_gibberish():
    """Send a nonsense scenario and report whether the agent's confidence is low.

    Prints the returned confidence and recommended action, then a PASS line
    when the confidence is below 0.4 and a WARN line otherwise. A reply
    without a ``confidence`` field is treated as confidence 1.0.
    """
    print("\n3. Testing gibberish rejection...")
    response = query_agent("blah blah blah fafa asdfasdf")
    score = response.get("confidence", 1.0)
    print(f"   Gibberish confidence: {score}")
    action = response.get("recommended_action", "N/A")
    print(f"   Action: {action}")

    verdict = (
        "   [PASS] Low confidence for gibberish"
        if score < 0.4
        else f"   [WARN] Confidence {score} is higher than expected for gibberish"
    )
    print(verdict)


def test_dynamic_policy_change():
    """Modify the refund SOP, recompile, and check that the change propagates.

    Steps: compile with the original SOP and query the agent about a 45-day
    refund; rewrite the SOP so "30" day windows become "60"; recompile and
    query again; then compare the two skills payloads and the two agent
    answers. The original SOP text is always written back, even when a
    compile or query step raises, so a failed run cannot leave the source
    document modified on disk.

    Results are reported via printed PASS/FAIL/WARN lines; nothing is
    returned. Exceptions from the helper calls propagate to the caller after
    the SOP has been restored.
    """
    print("\n4. Testing dynamic policy change...")

    # Save original SOP
    original_sop = read_sop()
    print(f"   Original SOP loaded ({len(original_sop)} chars)")

    scenario = "Customer requesting a refund after 45 days"
    context = {"plan": "annual", "days_since_purchase": 45, "tenure_months": 6}

    try:
        # Compile with original SOP (this may already be done)
        print("\n   Step A: Compile with ORIGINAL policy...")
        skills_v1_text = json.dumps(compile_and_wait())

        # Query the agent about refunds with original policy
        print("\n   Step B: Query agent about refunds (original policy)...")
        result_v1 = query_agent(scenario, context)
        print(f"   v1 action: {result_v1.get('recommended_action')}")
        print(f"   v1 rule: {result_v1.get('rule_applied', 'N/A')}")

        # Now modify the SOP - change the refund window.
        # "30 day" also matches "30 days", so no separate plural pass is needed.
        print("\n   Step C: Modifying SOP (changing refund window)...")
        modified_sop = original_sop.replace("30 day", "60 day").replace(
            "30-day", "60-day"
        )
        if modified_sop == original_sop:
            # Fall back to replacing every bare "30" in the document.
            modified_sop = original_sop.replace("30", "60")

        if modified_sop == original_sop:
            print("   [WARN] SOP contains no '30' to replace; policy is unchanged")
        else:
            write_sop(modified_sop)
            print("   SOP modified: 30 -> 60 days")

        # Recompile
        print("\n   Step D: Recompiling with MODIFIED policy...")
        skills_v2_text = json.dumps(compile_and_wait())

        # Check that skills actually changed
        changed = skills_v1_text != skills_v2_text
        print(f"\n   Skills changed after recompile: {changed}")

        # Query the agent again
        print("\n   Step E: Query agent about refunds (modified policy)...")
        result_v2 = query_agent(scenario, context)
        print(f"   v2 action: {result_v2.get('recommended_action')}")
        print(f"   v2 rule: {result_v2.get('rule_applied', 'N/A')}")

        # Check for the policy change in v2
        v2_mentions_60 = "60" in json.dumps(result_v2)
        print(f"   v2 references '60': {v2_mentions_60}")

        # Check if actions actually changed based on policy
        v1_action_lower = str(result_v1.get("recommended_action", "")).lower()
        v2_action_lower = str(result_v2.get("recommended_action", "")).lower()

        # Under 30 days limit (v1), 45 days should be denied/not allowed
        # Under 60 days limit (v2), 45 days should be approved/prorated
        denial_words = ("deny", "no refund", "not eligible", "cannot")
        approval_words = ("approve", "prorated", "allow")
        policy_executed_correctly = any(
            word in v1_action_lower for word in denial_words
        ) and any(word in v2_action_lower for word in approval_words)
        print(
            f"   Policy execution behavior changed appropriately (Deny -> Approve): {policy_executed_correctly}"
        )
    finally:
        # Restore original SOP no matter how the steps above ended.
        print("\n   Step F: Restoring original SOP...")
        write_sop(original_sop)
        print("   Original SOP restored.")

    # Final verdict
    print("\n   --- RESULTS ---")
    if changed:
        print("   [PASS] Skills changed after source modification and recompile")
    else:
        print("   [FAIL] Skills did NOT change - system may still be static")

    if policy_executed_correctly:
        print(
            "   [PASS] Agent correctly executed the policy change (Denied at 45 days under 30-day SOP, Approved under 60-day SOP!)"
        )
    elif v2_mentions_60:
        print("   [PASS] Agent response reflects the modified policy (60 days)")
    else:
        print(
            "   [WARN] Agent response did not change behavior or mention the new policy"
        )


def test_semantic_diff():
    """Exercise the ``/diff/{v1}/{v2}`` endpoint on the two newest versions.

    Fetches ``/brain/versions/{COMPANY}``, takes the first two entries of its
    ``versions`` list (index 1 as v1, index 0 as v2), requests their diff and
    prints the summary counts. Prints SKIP when the history cannot be fetched
    or holds fewer than two versions, FAIL when the diff endpoint does not
    return 200, PASS when any summary count is positive, and WARN otherwise.

    Raises:
        requests.RequestException: If a request fails or times out.
        KeyError: If a version entry has no ``version`` field.
    """
    print("\n5. Testing semantic diff engine...")
    request_timeout = 30  # seconds; keeps a hung backend from blocking the run

    # Get version history
    r = requests.get(f"{API}/brain/versions/{COMPANY}", timeout=request_timeout)
    if r.status_code != 200:
        print("   [SKIP] Could not fetch version history")
        return

    versions = r.json().get("versions", [])
    if len(versions) < 2:
        print("   [SKIP] Need at least 2 compiled versions for diff")
        return

    v1 = versions[1]["version"]
    v2 = versions[0]["version"]
    # Separator added: the two identifiers were previously printed fused.
    print(f"   Comparing {v1} → {v2}")

    r = requests.get(
        f"{API}/diff/{v1}/{v2}",
        params={"company_id": COMPANY},
        timeout=request_timeout,
    )
    if r.status_code != 200:
        print(f"   [FAIL] Diff endpoint returned {r.status_code}: {r.text}")
        return

    diff = r.json()
    summary = diff.get("summary", {})
    print(
        f"   Added: {summary.get('added_count', 0)}, Deleted: {summary.get('deleted_count', 0)}, Modified: {summary.get('modified_count', 0)}"
    )
    print(f"   Confidence shifts: {summary.get('confidence_shift_count', 0)}")
    print(
        f"   V1 skills: {summary.get('v1_skills', 0)} → V2 skills: {summary.get('v2_skills', 0)}"
    )

    change_counters = (
        "added_count",
        "modified_count",
        "deleted_count",
        "confidence_shift_count",
    )
    if any(summary.get(counter, 0) > 0 for counter in change_counters):
        print("   [PASS] Semantic diff detected changes between versions")
    else:
        print(
            "   [WARN] Diff returned no changes — may indicate skills didn't change or diff has a bug"
        )


def main():
    """Run the smoke test steps in order and print a report.

    Exits with status 1 when the health check or the initial compilation
    fails. The gibberish, dynamic-policy and semantic-diff tests each run
    inside their own ``try`` so one failure does not stop the others. The SOP
    text is snapshotted before the dynamic-policy test and written back if
    that test raises, so a failed run does not leave the source document
    modified.
    """
    print("=" * 60)
    print("KERNL SMOKE TEST — Proving the system is dynamic")
    print("=" * 60)

    try:
        check_health()
    except Exception as e:
        print(f"   [FATAL] API not reachable: {e}")
        print(
            "   Make sure backend is running: python -m uvicorn backend.main:app --port 8080"
        )
        sys.exit(1)

    # Test 1: Compile and get skills
    print("\n2. Initial compilation...")
    try:
        skills = compile_and_wait()
        print(f"   Got {len(skills.get('skills', []))} skills")
    except Exception as e:
        print(f"   [ERROR] Compilation failed: {e}")
        sys.exit(1)

    # Test 2: Gibberish rejection
    try:
        test_gibberish()
    except Exception as e:
        print(f"   [ERROR] Gibberish test failed: {e}")

    # Test 3: Dynamic policy change
    sop_backup = None  # stays None if the SOP could not be read at all
    try:
        sop_backup = read_sop()
        test_dynamic_policy_change()
    except Exception as e:
        print(f"   [ERROR] Dynamic test failed: {e}")
        # Make sure we restore the SOP: actually write the snapshot back
        # rather than only announcing the attempt.
        if sop_backup is not None:
            print("   Attempting to restore original SOP...")
            try:
                write_sop(sop_backup)
                print("   Original SOP restored.")
            except OSError as restore_err:
                print(f"   [ERROR] Could not restore SOP: {restore_err}")

    # Test 4: Semantic diff
    try:
        test_semantic_diff()
    except Exception as e:
        print(f"   [ERROR] Diff test failed: {e}")

    print("\n" + "=" * 60)
    print("SMOKE TEST COMPLETE")
    print("=" * 60)


if __name__ == "__main__":
    main()