# Source: Hugging Face Space "Tushar9802/sakhi" (Space status at capture: Sleeping)
# File size: 24,506 bytes
# Revision: 745f62a

"""
Hackathon-grade quality test: 15 diverse Hindi medical transcripts.

Tests form extraction + danger sign detection across all 4 visit types.
Checks: value accuracy, hallucination, false positives, false negatives,
code-switching, unlabeled audio, edge cases.

Each test uses the correct schema for its visit type.
"""
import json
import os
import re
import sys
import time

# Exported for any child processes; the running interpreter picked its stream
# encodings at startup, so stdout is switched explicitly below.
os.environ["PYTHONIOENCODING"] = "utf-8"
# Result lines can contain Devanagari text and em dashes, so stdout must be
# UTF-8. A replaced stdout (test runner, notebook) or a missing one (None)
# may not provide reconfigure(); leave those untouched instead of crashing.
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8")

import ollama

# System prompt for the form-extraction request. It instructs the model to
# fill the JSON schema (appended to the user message by run_all_tests) using
# only what the transcript states, and spells out the null-handling rules that
# the hallucination traps in TESTS check for.
FORM_SYSTEM_PROMPT = (
    "You are a clinical data extraction system for India's ASHA health worker program. "
    "Extract structured data from the Hindi/Hinglish home visit conversation into the requested JSON schema. "
    "ONLY extract information explicitly stated in the conversation. Use null for any field not mentioned.\n\n"
    "STRICT RULES:\n"
    "1. Do NOT invent names, dates, phone numbers, or addresses. If the patient is only called 'दीदी' or 'बहन', set name to null.\n"
    "2. If age is not explicitly stated as a number, set age to null. Do NOT guess from context.\n"
    "3. If blood group, HIV status, or other lab tests are not discussed, they MUST be null — never assume 'negative' or a default group.\n"
    "4. If the conversation has no speaker labels (ASHA/Patient), still extract data but be extra strict about nulls.\n"
    "5. Numbers may appear as Hindi words (e.g., 'एक सो दस बटा सत्तर' = 110/70). Convert them to digits.\n"
    "Return valid JSON only."
)

# System prompt for the danger-sign request. Rules 3-5 bias the model towards
# returning an empty danger_signs array, which is what the zero-sign cases in
# TESTS expect.
# NOTE(review): rule 5 states that a missed flag is preferable to a false
# alarm — confirm this trade-off against clinical guidance before reusing the
# prompt outside this benchmark.
DANGER_SYSTEM_PROMPT = (
    "You are a clinical danger sign detection system for India's ASHA health worker program. "
    "Analyze the Hindi/Hinglish home visit conversation for NHM-defined danger signs.\n\n"
    "STRICT RULES:\n"
    "1. ONLY flag a danger sign if the EXACT words proving it appear in the conversation.\n"
    "2. utterance_evidence MUST be a verbatim copy-paste from the conversation — do NOT paraphrase or fabricate.\n"
    "3. If a vital sign is NORMAL (e.g., BP 110/70, temperature 37°C), that is NOT a danger sign.\n"
    "4. Most routine visits have ZERO danger signs. Return an empty danger_signs array when none exist.\n"
    "5. When in doubt, do NOT flag — a missed flag is better than a false alarm.\n"
    "Return valid JSON only."
)

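# ============================================================
# 15 TEST CASES
# ============================================================
# Each case is a 9-tuple:
#   (name, visit_type, schema_name, transcript,
#    expected_form_checks, expected_danger_min, expected_danger_max,
#    expected_referral, hallucination_traps)
#
# schema_name: key into the dict returned by load_schemas()
# expected_form_checks: dict of {json_path: expected_value}
#   use dotted paths like "vitals.bp_systolic"
# expected_danger_min / expected_danger_max: inclusive bounds on the number of
#   entries in the model's "danger_signs" array
# expected_referral: expected "referral_decision.decision" value; scored by
#   group (safe vs urgent), not by exact string
# hallucination_traps: list of field paths that MUST be null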

TESTS = [
    # ── ANC CASES ──
    # 1. ANC Normal — all vitals mentioned, labeled speakers
    (
        "ANC Normal — full vitals",
        "anc_visit", "anc_visit",
        (
            "ASHA: नमस्ते, कैसे हैं आप?\n"
            "Patient: नमस्ते दीदी, मैं ठीक हूँ।\n"
            "ASHA: आपका BP 110/70 है, बिल्कुल ठीक है। वजन 58 kg है। Hb 11.5 आया था।\n"
            "ASHA: आप 24 हफ्ते की हैं। IFA रोज़ ले रही हैं? TT पहला लग गया।\n"
            "Patient: हाँ दीदी। डिलीवरी PHC में करवाएँगे।"
        ),
        {
            "vitals.bp_systolic": 110, "vitals.bp_diastolic": 70,
            "vitals.weight_kg": 58, "vitals.hemoglobin_gm_percent": 11.5,
            "pregnancy.gestational_weeks": 24,
            "pregnancy.expected_delivery_place": "PHC",
        },
        0, 0, "routine_followup",
        ["patient.name", "patient.age", "lab_results.blood_group", "lab_results.hiv_status"],
    ),
    # 2. ANC Preeclampsia — multiple danger signs
    (
        "ANC Preeclampsia — multi-danger",
        "anc_visit", "anc_visit",
        (
            "ASHA: नमस्ते दीदी, कैसे हैं?\n"
            "Patient: दीदी, बहुत सिरदर्द हो रहा है। आँखों के सामने धुंधला दिखता है।\n"
            "Patient: चेहरे पर सूजन आ गई है।\n"
            "ASHA: BP चेक करती हूँ... 155/100 आ रहा है। बहुत ज़्यादा है।\n"
            "Patient: पैरों में भी काफी सूजन है।\n"
            "ASHA: आपको तुरंत PHC जाना होगा। आप 8 महीने की हैं।"
        ),
        {"vitals.bp_systolic": 155, "vitals.bp_diastolic": 100},
        2, 5, "refer_immediately",
        ["patient.name", "lab_results.blood_group"],
    ),
    # 3. ANC Severe anemia — low Hb
    (
        "ANC Severe Anemia",
        "anc_visit", "anc_visit",
        (
            "ASHA: Hb report आया?\n"
            "Patient: हाँ, 6.5 आया है। बहुत कम है। चक्कर आते हैं। साँस लेने में तकलीफ़ होती है।\n"
            "ASHA: BP 100/60 है। वजन 45 kg। आप 20 हफ्ते की हैं।\n"
            "ASHA: आपको PHC में आयरन injection लेना होगा।"
        ),
        {
            "vitals.bp_systolic": 100, "vitals.bp_diastolic": 60,
            "vitals.weight_kg": 45, "vitals.hemoglobin_gm_percent": 6.5,
            "pregnancy.gestational_weeks": 20,
        },
        1, 3, "refer_immediately",
        ["patient.name", "lab_results.blood_group"],
    ),
    # 4. ANC — only partial info mentioned
    (
        "ANC Partial Info — sparse transcript",
        "anc_visit", "anc_visit",
        (
            "ASHA: BP ठीक है, 118/76 है।\n"
            "Patient: ठीक है दीदी।"
        ),
        {"vitals.bp_systolic": 118, "vitals.bp_diastolic": 76},
        0, 0, "routine_followup",
        ["patient.name", "patient.age", "vitals.weight_kg", "vitals.hemoglobin_gm_percent",
         "pregnancy.gestational_weeks", "lab_results.blood_group", "lab_results.hiv_status"],
    ),
    # 5. ANC Unlabeled — no speaker labels (realistic ASR output)
    (
        "ANC Unlabeled ASR output",
        "anc_visit", "anc_visit",
        (
            "नमस्ते कैसे हैं BP check करती हूँ BP 120/80 है normal है "
            "weight 55 kg है Hb test करवाया था 10.2 आया था थोड़ा low है "
            "IFA रोज़ लेना गर्भ 28 weeks का है delivery के लिए district hospital जाएँगे"
        ),
        {
            "vitals.bp_systolic": 120, "vitals.bp_diastolic": 80,
            "vitals.weight_kg": 55, "vitals.hemoglobin_gm_percent": 10.2,
            "pregnancy.gestational_weeks": 28,
        },
        0, 0, "routine_followup",
        ["patient.name", "lab_results.blood_group"],
    ),
    # 6. ANC Hinglish heavy — code-switching
    (
        "ANC Hinglish heavy code-switch",
        "anc_visit", "anc_visit",
        (
            "ASHA: Hello didi, aaj check-up hai. BP check karti hoon. 130/85 hai, thoda high.\n"
            "Patient: Koi problem hai kya?\n"
            "ASHA: Abhi nahi, but monitor karna hoga. Weight 62 kg. Hb report mein 9.8 aaya.\n"
            "ASHA: Aap 32 weeks ki hain. Baby ki movement kaisi hai?\n"
            "Patient: Bahut move karta hai.\n"
            "ASHA: Good. Delivery ke liye district hospital ready hai?"
        ),
        {
            "vitals.bp_systolic": 130, "vitals.bp_diastolic": 85,
            "vitals.weight_kg": 62, "vitals.hemoglobin_gm_percent": 9.8,
            "pregnancy.gestational_weeks": 32,
        },
        0, 1, "routine_followup",  # BP 130/85 is borderline, 0-1 flags acceptable
        ["patient.name", "lab_results.blood_group"],
    ),
    # 7. ANC with named patient — name should be extracted
    (
        "ANC with patient name Sunita",
        "anc_visit", "anc_visit",
        (
            "ASHA: नमस्ते सुनीता जी, आज का चेकअप करते हैं।\n"
            "सुनीता: नमस्ते दीदी। मेरी उम्र 25 साल है।\n"
            "ASHA: BP 116/74 है। वजन 54 kg। Hb 12.0 है। बहुत अच्छा।\n"
            "ASHA: 30 हफ्ते की हैं। सब ठीक चल रहा है।"
        ),
        {
            "patient.name": "सुनीता",
            "patient.age": 25,
            "vitals.bp_systolic": 116, "vitals.bp_diastolic": 74,
            "vitals.weight_kg": 54, "vitals.hemoglobin_gm_percent": 12.0,
            "pregnancy.gestational_weeks": 30,
        },
        0, 0, "routine_followup",
        ["lab_results.blood_group", "lab_results.hiv_status"],
    ),

    # ── PNC CASES ──
    # 8. PNC Normal — mother and baby fine
    (
        "PNC Normal — day 7",
        "pnc_visit", "pnc_visit",
        (
            "ASHA: नमस्ते दीदी। डिलीवरी को 7 दिन हो गए। आप कैसे हैं?\n"
            "Mother: मैं ठीक हूँ। बच्चा अच्छे से दूध पी रहा है।\n"
            "ASHA: बच्चे का वजन 3.1 kg है। नाभि सूखी है। तापमान सामान्य है।\n"
            "ASHA: आपका BP 118/76 है। खून बहना बंद हो गया?\n"
            "Mother: हाँ, अब बहुत कम है।"
        ),
        {
            "visit_info.visit_day": 7,
            "infant_assessment.weight_kg": 3.1,
        },
        0, 0, "routine_followup",
        [],
    ),
    # 9. PNC Danger — newborn not feeding + fever
    (
        "PNC Danger — newborn not feeding",
        "pnc_visit", "pnc_visit",
        (
            "ASHA: बच्चा कैसा है?\n"
            "Mother: दीदी, बच्चा बहुत सोता रहता है। दूध ठीक से नहीं पीता। 12 घंटे से दूध नहीं पिया।\n"
            "ASHA: बच्चे का रोना कैसा है?\n"
            "Mother: बहुत कमज़ोर आवाज़ में रोता है।\n"
            "ASHA: तापमान 100.5 डिग्री है। बुखार है। बच्चा सुस्त लग रहा है।\n"
            "ASHA: ये danger signs हैं। तुरंत PHC ले जाना होगा।"
        ),
        {"infant_assessment.temperature": 100.5},
        1, 4, "refer_immediately",
        [],
    ),
    # 10. PNC — heavy postpartum bleeding (maternal danger)
    (
        "PNC Danger — postpartum bleeding",
        "pnc_visit", "pnc_visit",
        (
            "ASHA: डिलीवरी को 3 दिन हुए। कैसे हैं?\n"
            "Mother: दीदी, बहुत ज़्यादा खून आ रहा है। pad 1 घंटे में भीग जाता है।\n"
            "Mother: चक्कर भी आ रहे हैं। बहुत कमज़ोरी है।\n"
            "ASHA: ये बहुत गंभीर है। तुरंत hospital जाना होगा।"
        ),
        {"visit_info.days_since_delivery": 3},
        1, 3, "refer_immediately",
        [],
    ),

    # ── DELIVERY CASES ──
    # 11. Delivery — normal institutional
    (
        "Delivery Normal — institutional",
        "delivery", "delivery",
        (
            "ASHA: डिलीवरी कब हुई?\n"
            "Mother: कल रात 3 बजे। लड़का हुआ है।\n"
            "ASHA: कहाँ हुई डिलीवरी?\n"
            "Mother: PHC में। normal delivery थी।\n"
            "ASHA: बच्चे का वजन?\n"
            "Mother: 2.8 kg है।\n"
            "ASHA: स्तनपान शुरू किया?\n"
            "Mother: हाँ, तुरंत शुरू किया। एक घंटे के अंदर।"
        ),
        {
            "delivery.place": "PHC",
            "delivery.type": "normal",
            "infant.sex": "male",
            "infant.birth_weight_kg": 2.8,
            "infant.breastfed_within_1hr": True,
        },
        0, 0, "routine_followup",
        [],
    ),
    # 12. Delivery — home delivery, low birth weight
    (
        "Delivery — home, LBW baby",
        "delivery", "delivery",
        (
            "ASHA: बच्चा कहाँ हुआ?\n"
            "Mother: घर पर ही हो गया। दाई ने करवाया। लड़की हुई है।\n"
            "ASHA: बच्ची का वजन बहुत कम है, 1.8 kg। ये low birth weight है।\n"
            "Mother: हाँ, बच्ची बहुत छोटी है।\n"
            "ASHA: बच्ची ने जन्म के समय रोया?\n"
            "Mother: हाँ, रोई थी।\n"
            "ASHA: बच्ची को गर्म रखना ज़रूरी है। PHC में चेकअप करवाना होगा।"
        ),
        {
            "delivery.place": "home",
            "infant.sex": "female",
            "infant.birth_weight_kg": 1.8,
            "infant.cried_at_birth": True,
        },
        1, 2, "refer_immediately",
        [],
    ),

    # ── CHILD HEALTH CASES ──
    # 13. Child health — routine, healthy
    (
        "Child Health — routine 9 months",
        "child_health", "child_health",
        (
            "ASHA: बच्चा कैसा है?\n"
            "Mother: बिल्कुल ठीक है दीदी। खूब खाता है, खेलता है।\n"
            "ASHA: वजन 8.5 kg है। 9 महीने के लिए अच्छा है।\n"
            "ASHA: Vitamin A दी थी? हाँ, 6 महीने में पहली dose दी थी।\n"
            "ASHA: टीके सब लगे हैं। बच्चा बैठता है, घुटनों पर चलता है। बढ़िया।"
        ),
        {
            "child.age_months": 9,
            "growth_assessment.weight_kg": 8.5,
            "immunization.up_to_date": True,
        },
        0, 0, "routine_followup",
        [],
    ),
    # 14. Child health — sick child, diarrhea + dehydration
    (
        "Child Health — diarrhea danger",
        "child_health", "child_health",
        (
            "ASHA: बच्चे को क्या हुआ?\n"
            "Mother: 3 दिन से दस्त लग रहे हैं। बहुत पतले पानी जैसे।\n"
            "Mother: खाना-पीना बंद कर दिया है। बहुत सुस्त हो गया है।\n"
            "ASHA: बच्चे का वजन 6.2 kg है। 12 महीने का है।\n"
            "ASHA: आँखें धँसी हुई हैं। ये dehydration के signs हैं। तुरंत PHC जाना होगा।"
        ),
        {
            "child.age_months": 12,
            "growth_assessment.weight_kg": 6.2,
            "illness_assessment.diarrhea": True,
            "illness_assessment.diarrhea_duration_days": 3,
        },
        1, 3, "refer_immediately",
        [],
    ),

    # ── EDGE CASES ──
    # 15. ANC — normal visit with ZERO concerning findings (false positive trap)
    (
        "ANC Zero Findings — false positive trap",
        "anc_visit", "anc_visit",
        (
            "ASHA: सब ठीक है दीदी?\n"
            "Patient: हाँ दीदी, बिल्कुल ठीक हूँ। कोई तकलीफ़ नहीं।\n"
            "ASHA: बहुत अच्छा। अगली बार आऊँगी। कोई तकलीफ़ हो तो फ़ोन कर दीजिए।\n"
            "Patient: ठीक है दीदी, धन्यवाद।"
        ),
        {},  # No vitals to check — nothing was measured
        0, 0, "routine_followup",
        ["patient.name", "patient.age", "vitals.bp_systolic", "vitals.weight_kg",
         "vitals.hemoglobin_gm_percent", "pregnancy.gestational_weeks",
         "lab_results.blood_group", "lab_results.hiv_status"],
    ),
]


def load_schemas():
    """Read the visit-type and danger-sign JSON schemas from ``configs/schemas``.

    Paths are relative to the current working directory.

    Returns:
        Dict mapping each schema name to its decoded JSON content.
    """
    schema_names = ("anc_visit", "pnc_visit", "delivery", "child_health", "danger_signs")

    def read_schema(schema_name):
        # UTF-8 is forced so the result does not depend on the platform locale.
        with open(f"configs/schemas/{schema_name}.json", encoding="utf-8") as handle:
            return json.load(handle)

    return {schema_name: read_schema(schema_name) for schema_name in schema_names}


def get_nested(d, path):
    """Look up a dotted path such as ``"vitals.bp_systolic"`` in nested dicts.

    Returns the value found at the end of the path, or ``None`` as soon as a
    step lands on something that is not a dict or on a missing key.
    """
    current = d
    for key in path.split("."):
        if isinstance(current, dict):
            current = current.get(key)
            continue
        # A non-dict in the middle of the path means the path cannot resolve.
        return None
    return current


def parse_json_response(raw):
    """Decode a model reply as JSON, tolerating common LLM formatting noise.

    Handles a leading BOM, Markdown code fences, prose in front of the first
    ``{`` / ``[``, any amount of text after the JSON value, and trailing
    commas before a closing bracket.

    Args:
        raw: Text returned by the model. Anything that is not a string
            (for example ``None``) is treated as unparseable.

    Returns:
        The decoded JSON value, or ``None`` when nothing could be decoded.
    """
    if not isinstance(raw, str):
        return None

    clean = raw.strip().lstrip('\ufeff')
    clean = re.sub(r'^`{3,}\s*(?:json)?\s*[\r\n]*', '', clean, flags=re.IGNORECASE)
    clean = re.sub(r'[\r\n]*`{3,}\s*$', '', clean).strip()

    # Skip any prose that precedes the first JSON container.
    starts = [pos for pos in (clean.find("{"), clean.find("[")) if pos >= 0]
    has_container = bool(starts)
    if has_container:
        clean = clean[min(starts):]

    # The trailing-comma repair is a blind regex and would also rewrite string
    # values such as "pain,]". The untouched text is therefore always tried
    # first; the repaired text is only a fallback for invalid JSON.
    repaired = re.sub(r',\s*([}\]])', r'\1', clean)
    candidates = (clean,) if repaired == clean else (clean, repaired)

    for text in candidates:
        try:
            return json.loads(text)
        except json.JSONDecodeError:
            continue

    if has_container:
        # raw_decode stops at the end of the first complete value, so trailing
        # commentary of any length after the JSON is ignored.
        decoder = json.JSONDecoder()
        for text in candidates:
            try:
                value, _end = decoder.raw_decode(text)
            except json.JSONDecodeError:
                continue
            return value
    return None


# Referral decisions that call for no urgent action. Any other non-empty
# decision string is scored as urgent.
_SAFE_REFERRALS = frozenset({"routine_followup", "continue_monitoring"})

# Separators for splitting extracted strings into whole words. This is an
# explicit list of whitespace and punctuation rather than \W so that
# Devanagari words carrying combining vowel signs stay in one piece.
_TOKEN_SPLIT = re.compile(r"[\s,;:/()\[\]_.|।-]+")


def _ask_model(model, system_prompt, user_prompt):
    """Run one system+user chat turn; return ``(parsed_json_or_None, seconds)``."""
    started = time.perf_counter()
    resp = ollama.chat(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        options={"temperature": 0.0, "num_ctx": 4096},
    )
    # Only the model call is timed, not the JSON clean-up.
    seconds = time.perf_counter() - started
    # Guard against a reply without text so it is reported as a parse failure.
    return parse_json_response(resp.message.content or ""), seconds


def _word_tokens(text):
    """Lower-case *text* and split it into non-empty words."""
    return [tok for tok in _TOKEN_SPLIT.split(str(text).lower()) if tok]


def _contains_run(haystack, needle):
    """Return True when *needle* occurs as a contiguous run inside *haystack*."""
    width = len(needle)
    if width == 0:
        return False
    return any(
        haystack[i:i + width] == needle
        for i in range(len(haystack) - width + 1)
    )


def _value_matches(got, expected_val):
    """Return True when an extracted (non-None) value satisfies *expected_val*."""
    try:
        # bool must be tested before int: bool is a subclass of int.
        if isinstance(expected_val, bool):
            return got == expected_val
        if isinstance(expected_val, (int, float)):
            # Same 0.5 absolute tolerance as before; NaN never matches.
            return abs(float(got) - float(expected_val)) <= 0.5
        if isinstance(expected_val, str):
            # Whole-word partial match in either direction ("PHC" matches
            # "PHC Rampur"). Plain substring matching is not used because it
            # lets "male" match "female" and lets "" match everything.
            got_words = _word_tokens(got)
            exp_words = _word_tokens(expected_val)
            return (_contains_run(got_words, exp_words)
                    or _contains_run(exp_words, got_words))
    except (ValueError, TypeError):
        return str(got) == str(expected_val)
    # Expected values of other types are not compared.
    return True


def _check_form(form_parsed, expected_form, must_be_null):
    """Return the list of issues found in the extracted form."""
    if not isinstance(form_parsed, dict):
        # Covers both unparseable replies and JSON that is not an object.
        return ["FORM_PARSE_FAIL"]

    found = []
    for path, expected_val in expected_form.items():
        got = get_nested(form_parsed, path)
        if got is None:
            found.append(f"MISSING {path} (expected {expected_val})")
        elif not _value_matches(got, expected_val):
            found.append(f"WRONG {path}: {got} != {expected_val}")

    # Hallucination traps: these fields were never mentioned in the transcript.
    for path in must_be_null:
        val = get_nested(form_parsed, path)
        if val is not None and str(val).lower() not in ("null", "none", ""):
            found.append(f"HALLUC {path}={val}")
    return found


def _check_danger(danger_parsed, danger_min, danger_max, expected_referral):
    """Return the list of issues found in the danger-sign reply."""
    if not isinstance(danger_parsed, dict):
        return ["DANGER_PARSE_FAIL"]

    found = []
    signs = danger_parsed.get("danger_signs", [])
    n_signs = len(signs) if isinstance(signs, list) else 0
    if n_signs < danger_min:
        found.append(f"FALSE_NEG: {n_signs} signs < {danger_min} expected")
    if n_signs > danger_max:
        found.append(f"FALSE_POS: {n_signs} signs > {danger_max} expected")

    if expected_referral:
        ref = danger_parsed.get("referral_decision")
        ref_decision = ref.get("decision") if isinstance(ref, dict) else None
        if not isinstance(ref_decision, str) or not ref_decision.strip():
            # An absent decision must not be counted as "urgent".
            found.append(f"REFERRAL: missing decision (expected {expected_referral})")
        else:
            # Equivalent decisions are grouped; only the group has to agree.
            exp_group = "safe" if expected_referral in _SAFE_REFERRALS else "urgent"
            got_group = "safe" if ref_decision in _SAFE_REFERRALS else "urgent"
            if exp_group != got_group:
                found.append(f"REFERRAL: {ref_decision} != {expected_referral}")
    return found


def run_all_tests(model):
    """Run every case in TESTS against *model* and print one line per case.

    Each case triggers two chat requests (form extraction, then danger-sign
    detection). The replies are checked against the case's expected values,
    null-only fields, danger-sign count bounds and referral group.

    Args:
        model: Ollama model tag passed to ``ollama.chat``.

    Returns:
        Tuple ``(passed, failed)`` with the number of cases in each state.
    """
    schemas = load_schemas()
    danger_schema = schemas["danger_signs"]
    total_pass = 0
    total_fail = 0
    total_time = 0.0

    for (name, visit_type, schema_name, transcript,
         expected_form, danger_min, danger_max, expected_referral,
         must_be_null) in TESTS:

        schema = schemas[schema_name]

        # ── Form extraction ──
        form_user = (
            f"Extract structured data from this ASHA home visit conversation:\n\n"
            f"{transcript}\n\n"
            f"Output JSON schema:\n{json.dumps(schema, ensure_ascii=False)}"
        )
        form_parsed, form_time = _ask_model(model, FORM_SYSTEM_PROMPT, form_user)

        # ── Danger sign detection ──
        danger_user = (
            f"Analyze this ASHA home visit conversation for danger signs.\n\n"
            f"Visit type: {visit_type}\n\n"
            f"{transcript}\n\n"
            f"Output JSON schema:\n{json.dumps(danger_schema, ensure_ascii=False)}"
        )
        danger_parsed, danger_time = _ask_model(model, DANGER_SYSTEM_PROMPT, danger_user)

        elapsed = form_time + danger_time
        total_time += elapsed

        test_issues = (
            _check_form(form_parsed, expected_form, must_be_null)
            + _check_danger(danger_parsed, danger_min, danger_max, expected_referral)
        )

        # ── Verdict ──
        if test_issues:
            status = "FAIL"
            total_fail += 1
        else:
            status = "PASS"
            total_pass += 1

        issues_str = "; ".join(test_issues) if test_issues else "all checks OK"
        print(f"  {status} [{name}] ({elapsed:.1f}s) {issues_str}")

    n_tests = total_pass + total_fail
    # With no cases there is nothing to average; avoid dividing by zero.
    avg_time = total_time / n_tests if n_tests else 0.0
    print(f"\n  Score: {total_pass}/{n_tests}, avg {avg_time:.1f}s/test")
    return total_pass, total_fail


def main():
    """Run the test suite against each listed model and print a score table."""
    models = [
        "gemma4:e4b-it-q4_K_M",
        "sakhi:latest",  # fine-tuned LoRA — 9/15 vs base 15/15, base wins
    ]

    results = {}
    for model in models:
        print(f"\n{'=' * 70}")
        print(f" {model}")
        print(f"{'=' * 70}")
        p, f = run_all_tests(model)
        results[model] = (p, f)

    print(f"\n{'=' * 70}")
    print("FINAL SCORES")
    print(f"{'=' * 70}")
    for model, (p, f) in results.items():
        total = p + f
        # A model that ran zero cases scores 0% instead of raising.
        pct = p / total * 100 if total else 0.0
        print(f"  {p}/{total} ({pct:.0f}%)  {model}")


if __name__ == "__main__":
    main()