""" Hackathon-grade quality test: 15 diverse Hindi medical transcripts. Tests form extraction + danger sign detection across all 4 visit types. Checks: value accuracy, hallucination, false positives, false negatives, code-switching, unlabeled audio, edge cases. Each test uses the correct schema for its visit type. """ import json import os import re import sys import time os.environ["PYTHONIOENCODING"] = "utf-8" sys.stdout.reconfigure(encoding="utf-8") import ollama FORM_SYSTEM_PROMPT = ( "You are a clinical data extraction system for India's ASHA health worker program. " "Extract structured data from the Hindi/Hinglish home visit conversation into the requested JSON schema. " "ONLY extract information explicitly stated in the conversation. Use null for any field not mentioned.\n\n" "STRICT RULES:\n" "1. Do NOT invent names, dates, phone numbers, or addresses. If the patient is only called 'दीदी' or 'बहन', set name to null.\n" "2. If age is not explicitly stated as a number, set age to null. Do NOT guess from context.\n" "3. If blood group, HIV status, or other lab tests are not discussed, they MUST be null — never assume 'negative' or a default group.\n" "4. If the conversation has no speaker labels (ASHA/Patient), still extract data but be extra strict about nulls.\n" "5. Numbers may appear as Hindi words (e.g., 'एक सो दस बटा सत्तर' = 110/70). Convert them to digits.\n" "Return valid JSON only." ) DANGER_SYSTEM_PROMPT = ( "You are a clinical danger sign detection system for India's ASHA health worker program. " "Analyze the Hindi/Hinglish home visit conversation for NHM-defined danger signs.\n\n" "STRICT RULES:\n" "1. ONLY flag a danger sign if the EXACT words proving it appear in the conversation.\n" "2. utterance_evidence MUST be a verbatim copy-paste from the conversation — do NOT paraphrase or fabricate.\n" "3. If a vital sign is NORMAL (e.g., BP 110/70, temperature 37°C), that is NOT a danger sign.\n" "4. Most routine visits have ZERO danger signs. Return an empty danger_signs array when none exist.\n" "5. When in doubt, do NOT flag — a missed flag is better than a false alarm.\n" "Return valid JSON only." ) # ============================================================ # 15 TEST CASES # ============================================================ # Each: (name, visit_type, schema_name, transcript, # expected_form_checks, expected_danger_min, expected_danger_max, # expected_referral, hallucination_traps) # # expected_form_checks: dict of {json_path: expected_value} # use dotted paths like "vitals.bp_systolic" # hallucination_traps: list of field paths that MUST be null TESTS = [ # ── ANC CASES ── # 1. ANC Normal — all vitals mentioned, labeled speakers ( "ANC Normal — full vitals", "anc_visit", "anc_visit", ( "ASHA: नमस्ते, कैसे हैं आप?\n" "Patient: नमस्ते दीदी, मैं ठीक हूँ।\n" "ASHA: आपका BP 110/70 है, बिल्कुल ठीक है। वजन 58 kg है। Hb 11.5 आया था।\n" "ASHA: आप 24 हफ्ते की हैं। IFA रोज़ ले रही हैं? TT पहला लग गया।\n" "Patient: हाँ दीदी। डिलीवरी PHC में करवाएँगे।" ), { "vitals.bp_systolic": 110, "vitals.bp_diastolic": 70, "vitals.weight_kg": 58, "vitals.hemoglobin_gm_percent": 11.5, "pregnancy.gestational_weeks": 24, "pregnancy.expected_delivery_place": "PHC", }, 0, 0, "routine_followup", ["patient.name", "patient.age", "lab_results.blood_group", "lab_results.hiv_status"], ), # 2. ANC Preeclampsia — multiple danger signs ( "ANC Preeclampsia — multi-danger", "anc_visit", "anc_visit", ( "ASHA: नमस्ते दीदी, कैसे हैं?\n" "Patient: दीदी, बहुत सिरदर्द हो रहा है। आँखों के सामने धुंधला दिखता है।\n" "Patient: चेहरे पर सूजन आ गई है।\n" "ASHA: BP चेक करती हूँ... 155/100 आ रहा है। बहुत ज़्यादा है।\n" "Patient: पैरों में भी काफी सूजन है।\n" "ASHA: आपको तुरंत PHC जाना होगा। आप 8 महीने की हैं।" ), {"vitals.bp_systolic": 155, "vitals.bp_diastolic": 100}, 2, 5, "refer_immediately", ["patient.name", "lab_results.blood_group"], ), # 3. ANC Severe anemia — low Hb ( "ANC Severe Anemia", "anc_visit", "anc_visit", ( "ASHA: Hb report आया?\n" "Patient: हाँ, 6.5 आया है। बहुत कम है। चक्कर आते हैं। साँस लेने में तकलीफ़ होती है।\n" "ASHA: BP 100/60 है। वजन 45 kg। आप 20 हफ्ते की हैं।\n" "ASHA: आपको PHC में आयरन injection लेना होगा।" ), { "vitals.bp_systolic": 100, "vitals.bp_diastolic": 60, "vitals.weight_kg": 45, "vitals.hemoglobin_gm_percent": 6.5, "pregnancy.gestational_weeks": 20, }, 1, 3, "refer_immediately", ["patient.name", "lab_results.blood_group"], ), # 4. ANC — only partial info mentioned ( "ANC Partial Info — sparse transcript", "anc_visit", "anc_visit", ( "ASHA: BP ठीक है, 118/76 है।\n" "Patient: ठीक है दीदी।" ), {"vitals.bp_systolic": 118, "vitals.bp_diastolic": 76}, 0, 0, "routine_followup", ["patient.name", "patient.age", "vitals.weight_kg", "vitals.hemoglobin_gm_percent", "pregnancy.gestational_weeks", "lab_results.blood_group", "lab_results.hiv_status"], ), # 5. ANC Unlabeled — no speaker labels (realistic ASR output) ( "ANC Unlabeled ASR output", "anc_visit", "anc_visit", ( "नमस्ते कैसे हैं BP check करती हूँ BP 120/80 है normal है " "weight 55 kg है Hb test करवाया था 10.2 आया था थोड़ा low है " "IFA रोज़ लेना गर्भ 28 weeks का है delivery के लिए district hospital जाएँगे" ), { "vitals.bp_systolic": 120, "vitals.bp_diastolic": 80, "vitals.weight_kg": 55, "vitals.hemoglobin_gm_percent": 10.2, "pregnancy.gestational_weeks": 28, }, 0, 0, "routine_followup", ["patient.name", "lab_results.blood_group"], ), # 6. ANC Hinglish heavy — code-switching ( "ANC Hinglish heavy code-switch", "anc_visit", "anc_visit", ( "ASHA: Hello didi, aaj check-up hai. BP check karti hoon. 130/85 hai, thoda high.\n" "Patient: Koi problem hai kya?\n" "ASHA: Abhi nahi, but monitor karna hoga. Weight 62 kg. Hb report mein 9.8 aaya.\n" "ASHA: Aap 32 weeks ki hain. Baby ki movement kaisi hai?\n" "Patient: Bahut move karta hai.\n" "ASHA: Good. Delivery ke liye district hospital ready hai?" ), { "vitals.bp_systolic": 130, "vitals.bp_diastolic": 85, "vitals.weight_kg": 62, "vitals.hemoglobin_gm_percent": 9.8, "pregnancy.gestational_weeks": 32, }, 0, 1, "routine_followup", # BP 130/85 is borderline, 0-1 flags acceptable ["patient.name", "lab_results.blood_group"], ), # 7. ANC with named patient — name should be extracted ( "ANC with patient name Sunita", "anc_visit", "anc_visit", ( "ASHA: नमस्ते सुनीता जी, आज का चेकअप करते हैं।\n" "सुनीता: नमस्ते दीदी। मेरी उम्र 25 साल है।\n" "ASHA: BP 116/74 है। वजन 54 kg। Hb 12.0 है। बहुत अच्छा।\n" "ASHA: 30 हफ्ते की हैं। सब ठीक चल रहा है।" ), { "patient.name": "सुनीता", "patient.age": 25, "vitals.bp_systolic": 116, "vitals.bp_diastolic": 74, "vitals.weight_kg": 54, "vitals.hemoglobin_gm_percent": 12.0, "pregnancy.gestational_weeks": 30, }, 0, 0, "routine_followup", ["lab_results.blood_group", "lab_results.hiv_status"], ), # ── PNC CASES ── # 8. PNC Normal — mother and baby fine ( "PNC Normal — day 7", "pnc_visit", "pnc_visit", ( "ASHA: नमस्ते दीदी। डिलीवरी को 7 दिन हो गए। आप कैसे हैं?\n" "Mother: मैं ठीक हूँ। बच्चा अच्छे से दूध पी रहा है।\n" "ASHA: बच्चे का वजन 3.1 kg है। नाभि सूखी है। तापमान सामान्य है।\n" "ASHA: आपका BP 118/76 है। खून बहना बंद हो गया?\n" "Mother: हाँ, अब बहुत कम है।" ), { "visit_info.visit_day": 7, "infant_assessment.weight_kg": 3.1, }, 0, 0, "routine_followup", [], ), # 9. PNC Danger — newborn not feeding + fever ( "PNC Danger — newborn not feeding", "pnc_visit", "pnc_visit", ( "ASHA: बच्चा कैसा है?\n" "Mother: दीदी, बच्चा बहुत सोता रहता है। दूध ठीक से नहीं पीता। 12 घंटे से दूध नहीं पिया।\n" "ASHA: बच्चे का रोना कैसा है?\n" "Mother: बहुत कमज़ोर आवाज़ में रोता है।\n" "ASHA: तापमान 100.5 डिग्री है। बुखार है। बच्चा सुस्त लग रहा है।\n" "ASHA: ये danger signs हैं। तुरंत PHC ले जाना होगा।" ), {"infant_assessment.temperature": 100.5}, 1, 4, "refer_immediately", [], ), # 10. PNC — heavy postpartum bleeding (maternal danger) ( "PNC Danger — postpartum bleeding", "pnc_visit", "pnc_visit", ( "ASHA: डिलीवरी को 3 दिन हुए। कैसे हैं?\n" "Mother: दीदी, बहुत ज़्यादा खून आ रहा है। pad 1 घंटे में भीग जाता है।\n" "Mother: चक्कर भी आ रहे हैं। बहुत कमज़ोरी है।\n" "ASHA: ये बहुत गंभीर है। तुरंत hospital जाना होगा।" ), {"visit_info.days_since_delivery": 3}, 1, 3, "refer_immediately", [], ), # ── DELIVERY CASES ── # 11. Delivery — normal institutional ( "Delivery Normal — institutional", "delivery", "delivery", ( "ASHA: डिलीवरी कब हुई?\n" "Mother: कल रात 3 बजे। लड़का हुआ है।\n" "ASHA: कहाँ हुई डिलीवरी?\n" "Mother: PHC में। normal delivery थी।\n" "ASHA: बच्चे का वजन?\n" "Mother: 2.8 kg है।\n" "ASHA: स्तनपान शुरू किया?\n" "Mother: हाँ, तुरंत शुरू किया। एक घंटे के अंदर।" ), { "delivery.place": "PHC", "delivery.type": "normal", "infant.sex": "male", "infant.birth_weight_kg": 2.8, "infant.breastfed_within_1hr": True, }, 0, 0, "routine_followup", [], ), # 12. Delivery — home delivery, low birth weight ( "Delivery — home, LBW baby", "delivery", "delivery", ( "ASHA: बच्चा कहाँ हुआ?\n" "Mother: घर पर ही हो गया। दाई ने करवाया। लड़की हुई है।\n" "ASHA: बच्ची का वजन बहुत कम है, 1.8 kg। ये low birth weight है।\n" "Mother: हाँ, बच्ची बहुत छोटी है।\n" "ASHA: बच्ची ने जन्म के समय रोया?\n" "Mother: हाँ, रोई थी।\n" "ASHA: बच्ची को गर्म रखना ज़रूरी है। PHC में चेकअप करवाना होगा।" ), { "delivery.place": "home", "infant.sex": "female", "infant.birth_weight_kg": 1.8, "infant.cried_at_birth": True, }, 1, 2, "refer_immediately", [], ), # ── CHILD HEALTH CASES ── # 13. Child health — routine, healthy ( "Child Health — routine 9 months", "child_health", "child_health", ( "ASHA: बच्चा कैसा है?\n" "Mother: बिल्कुल ठीक है दीदी। खूब खाता है, खेलता है।\n" "ASHA: वजन 8.5 kg है। 9 महीने के लिए अच्छा है।\n" "ASHA: Vitamin A दी थी? हाँ, 6 महीने में पहली dose दी थी।\n" "ASHA: टीके सब लगे हैं। बच्चा बैठता है, घुटनों पर चलता है। बढ़िया।" ), { "child.age_months": 9, "growth_assessment.weight_kg": 8.5, "immunization.up_to_date": True, }, 0, 0, "routine_followup", [], ), # 14. Child health — sick child, diarrhea + dehydration ( "Child Health — diarrhea danger", "child_health", "child_health", ( "ASHA: बच्चे को क्या हुआ?\n" "Mother: 3 दिन से दस्त लग रहे हैं। बहुत पतले पानी जैसे।\n" "Mother: खाना-पीना बंद कर दिया है। बहुत सुस्त हो गया है।\n" "ASHA: बच्चे का वजन 6.2 kg है। 12 महीने का है।\n" "ASHA: आँखें धँसी हुई हैं। ये dehydration के signs हैं। तुरंत PHC जाना होगा।" ), { "child.age_months": 12, "growth_assessment.weight_kg": 6.2, "illness_assessment.diarrhea": True, "illness_assessment.diarrhea_duration_days": 3, }, 1, 3, "refer_immediately", [], ), # ── EDGE CASES ── # 15. ANC — normal visit with ZERO concerning findings (false positive trap) ( "ANC Zero Findings — false positive trap", "anc_visit", "anc_visit", ( "ASHA: सब ठीक है दीदी?\n" "Patient: हाँ दीदी, बिल्कुल ठीक हूँ। कोई तकलीफ़ नहीं।\n" "ASHA: बहुत अच्छा। अगली बार आऊँगी। कोई तकलीफ़ हो तो फ़ोन कर दीजिए।\n" "Patient: ठीक है दीदी, धन्यवाद।" ), {}, # No vitals to check — nothing was measured 0, 0, "routine_followup", ["patient.name", "patient.age", "vitals.bp_systolic", "vitals.weight_kg", "vitals.hemoglobin_gm_percent", "pregnancy.gestational_weeks", "lab_results.blood_group", "lab_results.hiv_status"], ), ] def load_schemas(): schemas = {} for name in ["anc_visit", "pnc_visit", "delivery", "child_health", "danger_signs"]: with open(f"configs/schemas/{name}.json", encoding="utf-8") as f: schemas[name] = json.load(f) return schemas def get_nested(d, path): """Get value from dict using dotted path like 'vitals.bp_systolic'.""" parts = path.split(".") for p in parts: if not isinstance(d, dict): return None d = d.get(p) return d def parse_json_response(raw): clean = raw.strip().lstrip('\ufeff') clean = re.sub(r'^`{3,}\s*(?:json)?\s*[\r\n]*', '', clean, flags=re.IGNORECASE) clean = re.sub(r'[\r\n]*`{3,}\s*$', '', clean).strip() clean = re.sub(r',\s*([}\]])', r'\1', clean) if clean and clean[0] not in ('{', '['): idx = min( (clean.find("{") if clean.find("{") >= 0 else len(clean)), (clean.find("[") if clean.find("[") >= 0 else len(clean)), ) if idx < len(clean): clean = clean[idx:] try: return json.loads(clean) except json.JSONDecodeError: for end in range(len(clean), max(0, len(clean) - 200), -1): if clean[end - 1] in ('}', ']'): try: return json.loads(clean[:end]) except json.JSONDecodeError: continue return None def run_all_tests(model): schemas = load_schemas() total_pass = 0 total_fail = 0 total_time = 0 issues = [] for (name, visit_type, schema_name, transcript, expected_form, danger_min, danger_max, expected_referral, must_be_null) in TESTS: schema = schemas[schema_name] danger_schema = schemas["danger_signs"] # ── Form extraction ── form_user = ( f"Extract structured data from this ASHA home visit conversation:\n\n" f"{transcript}\n\n" f"Output JSON schema:\n{json.dumps(schema, ensure_ascii=False)}" ) t0 = time.time() resp = ollama.chat( model=model, messages=[ {"role": "system", "content": FORM_SYSTEM_PROMPT}, {"role": "user", "content": form_user}, ], options={"temperature": 0.0, "num_ctx": 4096}, ) form_time = time.time() - t0 form_parsed = parse_json_response(resp.message.content) # ── Danger sign detection ── danger_user = ( f"Analyze this ASHA home visit conversation for danger signs.\n\n" f"Visit type: {visit_type}\n\n" f"{transcript}\n\n" f"Output JSON schema:\n{json.dumps(danger_schema, ensure_ascii=False)}" ) t0 = time.time() resp2 = ollama.chat( model=model, messages=[ {"role": "system", "content": DANGER_SYSTEM_PROMPT}, {"role": "user", "content": danger_user}, ], options={"temperature": 0.0, "num_ctx": 4096}, ) danger_time = time.time() - t0 danger_parsed = parse_json_response(resp2.message.content) elapsed = form_time + danger_time total_time += elapsed test_issues = [] # ── Check form values ── if form_parsed is None: test_issues.append("FORM_PARSE_FAIL") else: for path, expected_val in expected_form.items(): got = get_nested(form_parsed, path) if got is None: test_issues.append(f"MISSING {path} (expected {expected_val})") else: try: if isinstance(expected_val, bool): if got != expected_val: test_issues.append(f"WRONG {path}: {got} != {expected_val}") elif isinstance(expected_val, (int, float)): if abs(float(got) - float(expected_val)) > 0.5: test_issues.append(f"WRONG {path}: {got} != {expected_val}") elif isinstance(expected_val, str): got_lower = str(got).lower().strip() exp_lower = expected_val.lower().strip() # Allow partial match for names and places if exp_lower not in got_lower and got_lower not in exp_lower: test_issues.append(f"WRONG {path}: {got} != {expected_val}") except (ValueError, TypeError): if str(got) != str(expected_val): test_issues.append(f"WRONG {path}: {got} != {expected_val}") # ── Check hallucination traps ── for path in must_be_null: val = get_nested(form_parsed, path) if val is not None and str(val).lower() not in ("null", "none", ""): test_issues.append(f"HALLUC {path}={val}") # ── Check danger signs ── if danger_parsed is None: test_issues.append("DANGER_PARSE_FAIL") else: signs = danger_parsed.get("danger_signs", []) n_signs = len(signs) if isinstance(signs, list) else 0 if n_signs < danger_min: test_issues.append(f"FALSE_NEG: {n_signs} signs < {danger_min} expected") if n_signs > danger_max: test_issues.append(f"FALSE_POS: {n_signs} signs > {danger_max} expected") # Check referral ref = danger_parsed.get("referral_decision", {}) ref_decision = ref.get("decision", "") # Group equivalent referral decisions SAFE_REFERRALS = {"routine_followup", "continue_monitoring"} URGENT_REFERRALS = {"refer_immediately", "refer_within_24h"} if expected_referral: exp_group = "safe" if expected_referral in SAFE_REFERRALS else "urgent" got_group = "safe" if ref_decision in SAFE_REFERRALS else "urgent" if exp_group != got_group: test_issues.append(f"REFERRAL: {ref_decision} != {expected_referral}") # ── Verdict ── if test_issues: status = "FAIL" total_fail += 1 else: status = "PASS" total_pass += 1 issues_str = "; ".join(test_issues) if test_issues else "all checks OK" print(f" {status} [{name}] ({elapsed:.1f}s) {issues_str}") print(f"\n Score: {total_pass}/{total_pass + total_fail}, avg {total_time / (total_pass + total_fail):.1f}s/test") return total_pass, total_fail def main(): models = [ "gemma4:e4b-it-q4_K_M", "sakhi:latest", # fine-tuned LoRA — 9/15 vs base 15/15, base wins ] results = {} for model in models: print(f"\n{'=' * 70}") print(f" {model}") print(f"{'=' * 70}") p, f = run_all_tests(model) results[model] = (p, f) print(f"\n{'=' * 70}") print("FINAL SCORES") print(f"{'=' * 70}") for model, (p, f) in results.items(): pct = p / (p + f) * 100 print(f" {p}/{p+f} ({pct:.0f}%) {model}") if __name__ == "__main__": main()