Spaces:

Tushar9802
/

sakhi

Sleeping

File size: 5,872 Bytes

745f62a

"""Field-level diff: base vs sakhi on the same 15 transcripts.

The existing quality harness only checks `expected_form_checks` (pass/fail on
specific fields). This script captures FULL form JSON from both models and
diffs every leaf path, so we can identify cases where the fine-tune extracted
information the base model missed (or vice versa).
"""
import json
import os
import sys
import time

# NOTE(review): PYTHONIOENCODING is read by the interpreter at startup, so
# assigning it here should only affect child processes that inherit this
# environment; the reconfigure() call below is what switches this process's
# stdout to UTF-8.
os.environ["PYTHONIOENCODING"] = "utf-8"
sys.stdout.reconfigure(encoding="utf-8")

# Put this script's own directory first on the import path so the
# `test_ollama_quality` import below resolves regardless of the current
# working directory.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from test_ollama_quality import (
    DANGER_SYSTEM_PROMPT,
    FORM_SYSTEM_PROMPT,
    TESTS,
    load_schemas,
    parse_json_response,
)

import ollama

# Ollama model tags to compare, in order: the base model first, then the
# sakhi fine-tune. Every transcript is run through each of them.
MODELS = ["gemma4:e4b-it-q4_K_M", "sakhi:latest"]
# Path of the markdown report written by main(); relative, so it resolves
# against the current working directory.
OUT_PATH = "FIELD_COVERAGE_DIFF.md"


def flatten(d, prefix=""):
    """Collapse nested dicts/lists into a flat ``{path: leaf_value}`` mapping.

    Dict keys are joined with ``.`` and list positions are written as
    ``[i]``, e.g. ``{"a": [{"b": 1}]}`` becomes ``{"a[0].b": 1}``. Anything
    that is neither a dict nor a list counts as a leaf and is stored under
    ``prefix``. Empty dicts and empty lists contribute no entries.
    """
    if isinstance(d, dict):
        # A top-level key is used as-is; nested keys get a dotted prefix.
        children = (
            (f"{prefix}.{key}" if prefix else key, child)
            for key, child in d.items()
        )
    elif isinstance(d, list):
        children = (
            (f"{prefix}[{pos}]", child) for pos, child in enumerate(d)
        )
    else:
        return {prefix: d}

    leaves = {}
    for path, child in children:
        leaves.update(flatten(child, path))
    return leaves


def is_null(v):
    """Return True when ``v`` should be treated as "no value extracted".

    That is ``None`` itself, or a string that is empty, ``"null"`` or
    ``"none"`` once surrounding whitespace and letter case are ignored.
    Every other value (including ``0``, ``False`` and empty containers)
    counts as a real value.
    """
    if v is None:
        return True
    if not isinstance(v, str):
        return False
    normalized = v.strip().lower()
    return normalized in ("", "null", "none")


def run_one(model, transcript, schema, danger_schema, visit_type):
    """Query ``model`` twice about one transcript and return both parsed replies.

    The first request asks for form extraction against ``schema``; the second
    asks for danger-sign analysis against ``danger_schema`` and also states
    the ``visit_type``. Both use temperature 0.0 and a 4096-token context.

    Returns:
        A ``(form, danger)`` tuple. Each element is whatever
        ``parse_json_response`` produced for that reply, or ``{}`` when the
        parsed result is falsy.
    """

    def ask(system_prompt, user_prompt):
        # One system+user exchange, parsed to JSON with an empty-dict fallback.
        reply = ollama.chat(
            model=model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            options={"temperature": 0.0, "num_ctx": 4096},
        )
        return parse_json_response(reply.message.content) or {}

    form_schema_text = json.dumps(schema, ensure_ascii=False)
    form_prompt = (
        "Extract structured data from this ASHA home visit conversation:\n\n"
        f"{transcript}\n\n"
        f"Output JSON schema:\n{form_schema_text}"
    )
    form = ask(FORM_SYSTEM_PROMPT, form_prompt)

    danger_schema_text = json.dumps(danger_schema, ensure_ascii=False)
    danger_prompt = (
        "Analyze this ASHA home visit conversation for danger signs.\n\n"
        f"Visit type: {visit_type}\n\n"
        f"{transcript}\n\n"
        f"Output JSON schema:\n{danger_schema_text}"
    )
    danger = ask(DANGER_SYSTEM_PROMPT, danger_prompt)

    return form, danger


def _diff_forms(base_form, sakhi_form, tested_paths):
    """Compare two extracted forms leaf by leaf, skipping ``tested_paths``.

    Returns:
        ``(sakhi_only, base_only, differ)``, each ordered by path:
        ``sakhi_only`` / ``base_only`` hold ``(path, value)`` pairs where only
        that model produced a non-null value, and ``differ`` holds
        ``(path, base_value, sakhi_value)`` triples where both produced
        non-null but unequal values.
    """
    base = flatten(base_form)
    sakhi = flatten(sakhi_form)

    sakhi_only, base_only, differ = [], [], []
    # Iterate in sorted order so the report is deterministic across runs.
    for path in sorted(set(base) | set(sakhi)):
        if path in tested_paths:
            continue
        b, s = base.get(path), sakhi.get(path)
        b_null, s_null = is_null(b), is_null(s)
        if b_null and s_null:
            continue
        if b_null:
            sakhi_only.append((path, s))
        elif s_null:
            base_only.append((path, b))
        elif b != s:
            differ.append((path, b, s))
    return sakhi_only, base_only, differ


def _render_case(name, sakhi_only, base_only, differ):
    """Return the markdown lines describing one transcript's differences."""
    section = [f"\n## {name}\n"]
    if sakhi_only:
        section.append(f"**Sakhi extracted, base returned null** ({len(sakhi_only)}):")
        section.extend(f"- `{p}` = `{v}`" for p, v in sakhi_only)
        section.append("")
    if base_only:
        section.append(f"**Base extracted, sakhi returned null** ({len(base_only)}):")
        section.extend(f"- `{p}` = `{v}`" for p, v in base_only)
        section.append("")
    if differ:
        section.append(f"**Differ** ({len(differ)}):")
        section.extend(f"- `{p}`: base=`{b}`, sakhi=`{s}`" for p, b, s in differ)
        section.append("")
    return section


def main():
    """Run every test transcript through each model and write the diff report.

    For each entry in ``TESTS`` both models in ``MODELS`` are queried via
    ``run_one``. The form outputs are then flattened and compared per leaf
    path, ignoring paths listed in the test's expected-form keys or its
    must-be-null collection. The markdown report is written to ``OUT_PATH``
    and a short summary is printed to stdout.
    """
    # MODELS is ordered (base, fine-tune); read the tags from it rather than
    # repeating the literals, so changing MODELS cannot cause a KeyError here.
    base_model, sakhi_model = MODELS[0], MODELS[1]

    schemas = load_schemas()
    danger_schema = schemas["danger_signs"]  # same schema for every transcript
    results = []

    for idx, test in enumerate(TESTS, 1):
        (name, visit_type, schema_name, transcript,
         expected_form, _danger_min, _danger_max, _expected_referral,
         must_be_null) = test
        schema = schemas[schema_name]
        print(f"\n[{idx}/{len(TESTS)}] {name}")

        outputs = {}
        for model in MODELS:
            # perf_counter is monotonic, so elapsed time cannot go negative
            # if the wall clock is adjusted mid-run.
            t0 = time.perf_counter()
            form, danger = run_one(model, transcript, schema, danger_schema, visit_type)
            elapsed = time.perf_counter() - t0
            outputs[model] = {"form": form, "danger": danger, "elapsed": elapsed}
            print(f"  {model}: {elapsed:.1f}s")

        results.append({"name": name, "outputs": outputs, "expected_form": expected_form,
                        "must_be_null": must_be_null})

    # Analyze diffs
    sakhi_only_count = 0
    base_only_count = 0
    case_lines = []

    for r in results:
        tested_paths = set(r["expected_form"]) | set(r["must_be_null"])
        sakhi_only, base_only, differ = _diff_forms(
            r["outputs"][base_model]["form"],
            r["outputs"][sakhi_model]["form"],
            tested_paths,
        )
        sakhi_only_count += len(sakhi_only)
        base_only_count += len(base_only)

        if sakhi_only or base_only or differ:
            case_lines.extend(_render_case(r["name"], sakhi_only, base_only, differ))

    header = [
        "# Field Coverage Diff: base vs sakhi\n",
        f"Date: {time.strftime('%Y-%m-%d %H:%M')}\n",
        "Captures every form leaf path, filtering out fields already covered by "
        "the pass/fail harness (`expected_form_checks` + `hallucination_traps`).\n",
    ]
    summary = (
        f"\n## Summary\n\n"
        f"- Sakhi extracted fields base left null: **{sakhi_only_count}**\n"
        f"- Base extracted fields sakhi left null: **{base_only_count}**\n"
    )
    # The summary goes after the intro paragraph so the intro is not rendered
    # underneath the "## Summary" heading.
    lines = header + [summary] + case_lines

    with open(OUT_PATH, "w", encoding="utf-8") as f:
        f.write("\n".join(lines))

    print(f"\nSummary: sakhi_extra={sakhi_only_count}, base_extra={base_only_count}")
    print(f"Written to {OUT_PATH}")


# Run the comparison only when this file is executed as a script, not when
# it is imported.
if __name__ == "__main__":
    main()