Spaces:
Sleeping
Sleeping
| """Field-level diff: base vs sakhi on the same 15 transcripts. | |
| The existing quality harness only checks `expected_form_checks` (pass/fail on | |
| specific fields). This script captures FULL form JSON from both models and | |
| diffs every leaf path, so we can identify cases where the fine-tune extracted | |
| information the base model missed (or vice versa). | |
| """ | |
| import json | |
| import os | |
| import sys | |
| import time | |
| os.environ["PYTHONIOENCODING"] = "utf-8" | |
| sys.stdout.reconfigure(encoding="utf-8") | |
| sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) | |
| from test_ollama_quality import ( | |
| DANGER_SYSTEM_PROMPT, | |
| FORM_SYSTEM_PROMPT, | |
| TESTS, | |
| load_schemas, | |
| parse_json_response, | |
| ) | |
| import ollama | |
| MODELS = ["gemma4:e4b-it-q4_K_M", "sakhi:latest"] | |
| OUT_PATH = "FIELD_COVERAGE_DIFF.md" | |
| def flatten(d, prefix=""): | |
| """Return {dotted_path: value} for all leaves.""" | |
| out = {} | |
| if isinstance(d, dict): | |
| for k, v in d.items(): | |
| p = f"{prefix}.{k}" if prefix else k | |
| out.update(flatten(v, p)) | |
| elif isinstance(d, list): | |
| for i, v in enumerate(d): | |
| out.update(flatten(v, f"{prefix}[{i}]")) | |
| else: | |
| out[prefix] = d | |
| return out | |
| def is_null(v): | |
| return v is None or (isinstance(v, str) and v.strip().lower() in ("", "null", "none")) | |
| def run_one(model, transcript, schema, danger_schema, visit_type): | |
| form_user = ( | |
| f"Extract structured data from this ASHA home visit conversation:\n\n" | |
| f"{transcript}\n\n" | |
| f"Output JSON schema:\n{json.dumps(schema, ensure_ascii=False)}" | |
| ) | |
| r1 = ollama.chat( | |
| model=model, | |
| messages=[ | |
| {"role": "system", "content": FORM_SYSTEM_PROMPT}, | |
| {"role": "user", "content": form_user}, | |
| ], | |
| options={"temperature": 0.0, "num_ctx": 4096}, | |
| ) | |
| form = parse_json_response(r1.message.content) or {} | |
| danger_user = ( | |
| f"Analyze this ASHA home visit conversation for danger signs.\n\n" | |
| f"Visit type: {visit_type}\n\n" | |
| f"{transcript}\n\n" | |
| f"Output JSON schema:\n{json.dumps(danger_schema, ensure_ascii=False)}" | |
| ) | |
| r2 = ollama.chat( | |
| model=model, | |
| messages=[ | |
| {"role": "system", "content": DANGER_SYSTEM_PROMPT}, | |
| {"role": "user", "content": danger_user}, | |
| ], | |
| options={"temperature": 0.0, "num_ctx": 4096}, | |
| ) | |
| danger = parse_json_response(r2.message.content) or {} | |
| return form, danger | |
| def main(): | |
| schemas = load_schemas() | |
| results = [] | |
| for idx, test in enumerate(TESTS, 1): | |
| (name, visit_type, schema_name, transcript, | |
| expected_form, danger_min, danger_max, expected_referral, | |
| must_be_null) = test | |
| schema = schemas[schema_name] | |
| danger_schema = schemas["danger_signs"] | |
| print(f"\n[{idx}/{len(TESTS)}] {name}") | |
| outputs = {} | |
| for model in MODELS: | |
| t0 = time.time() | |
| form, danger = run_one(model, transcript, schema, danger_schema, visit_type) | |
| outputs[model] = {"form": form, "danger": danger, "elapsed": time.time() - t0} | |
| print(f" {model}: {outputs[model]['elapsed']:.1f}s") | |
| results.append({"name": name, "outputs": outputs, "expected_form": expected_form, | |
| "must_be_null": must_be_null}) | |
| # Analyze diffs | |
| sakhi_only_count = 0 | |
| base_only_count = 0 | |
| diff_rows = [] | |
| lines = ["# Field Coverage Diff: base vs sakhi\n"] | |
| lines.append(f"Date: {time.strftime('%Y-%m-%d %H:%M')}\n") | |
| lines.append("Captures every form leaf path, filtering out fields already covered by " | |
| "the pass/fail harness (`expected_form_checks` + `hallucination_traps`).\n") | |
| for r in results: | |
| base = flatten(r["outputs"]["gemma4:e4b-it-q4_K_M"]["form"]) | |
| sakhi = flatten(r["outputs"]["sakhi:latest"]["form"]) | |
| tested_paths = set(r["expected_form"].keys()) | set(r["must_be_null"]) | |
| sakhi_only = [] | |
| base_only = [] | |
| differ = [] | |
| for path in set(base) | set(sakhi): | |
| if path in tested_paths: | |
| continue | |
| b, s = base.get(path), sakhi.get(path) | |
| if is_null(b) and not is_null(s): | |
| sakhi_only.append((path, s)) | |
| elif is_null(s) and not is_null(b): | |
| base_only.append((path, b)) | |
| elif not is_null(b) and not is_null(s) and b != s: | |
| differ.append((path, b, s)) | |
| sakhi_only_count += len(sakhi_only) | |
| base_only_count += len(base_only) | |
| if sakhi_only or base_only or differ: | |
| lines.append(f"\n## {r['name']}\n") | |
| if sakhi_only: | |
| lines.append(f"**Sakhi extracted, base returned null** ({len(sakhi_only)}):") | |
| for p, v in sorted(sakhi_only): | |
| lines.append(f"- `{p}` = `{v}`") | |
| lines.append("") | |
| if base_only: | |
| lines.append(f"**Base extracted, sakhi returned null** ({len(base_only)}):") | |
| for p, v in sorted(base_only): | |
| lines.append(f"- `{p}` = `{v}`") | |
| lines.append("") | |
| if differ: | |
| lines.append(f"**Differ** ({len(differ)}):") | |
| for p, b, s in sorted(differ): | |
| lines.append(f"- `{p}`: base=`{b}`, sakhi=`{s}`") | |
| lines.append("") | |
| summary = ( | |
| f"\n## Summary\n\n" | |
| f"- Sakhi extracted fields base left null: **{sakhi_only_count}**\n" | |
| f"- Base extracted fields sakhi left null: **{base_only_count}**\n" | |
| ) | |
| lines.insert(2, summary) | |
| with open(OUT_PATH, "w", encoding="utf-8") as f: | |
| f.write("\n".join(lines)) | |
| print(f"\nSummary: sakhi_extra={sakhi_only_count}, base_extra={base_only_count}") | |
| print(f"Written to {OUT_PATH}") | |
| if __name__ == "__main__": | |
| main() | |