import torch


def compute_candidate_diversity(candidates: list[dict]) -> dict:
    """Return the mean pairwise embedding distance among candidate texts.

    Each candidate's ``"text"`` value is stripped of surrounding whitespace.
    Candidates whose text is missing, ``None``, or empty after stripping are
    dropped before encoding, so ``n_candidates`` in the result is the count
    of *non-empty* texts (may be < ``len(candidates)``).

    The score is ``1 - mean(sims[i, j] for i < j)`` where
    ``sims = vecs @ vecs.T`` and ``vecs = embed_texts(texts)``. This equals
    the mean pairwise cosine *distance* provided ``embed_texts`` returns
    L2-normalised row vectors (assumption -- TODO confirm against
    ``backend.retrieval.vector_store``). Under that assumption 0.0 means
    identical paraphrases and 1.0 means the texts are, on average,
    orthogonal in embedding space; values above 1.0 are possible if
    embeddings point in opposing directions.

    Args:
        candidates: Dicts that may carry a ``"text"`` entry.

    Returns:
        ``{"candidate_diversity": float, "n_candidates": int}``. The
        diversity is rounded to 4 decimal places and is ``0.0`` when fewer
        than two non-empty texts remain (no pairs to compare).
    """
    # `or ""` covers the key being present with a None value; a plain
    # `.get("text", "")` default only applies when the key is absent.
    stripped = ((c.get("text") or "").strip() for c in candidates)
    texts = [t for t in stripped if t]
    n = len(texts)
    if n < 2:
        return {"candidate_diversity": 0.0, "n_candidates": n}

    # Imported lazily, after the early return, so the trivial path never
    # loads the vector store module.
    from backend.retrieval.vector_store import embed_texts

    vecs = embed_texts(texts)
    sims = vecs @ vecs.T
    # Strict upper triangle (offset=1): each unordered pair once, no
    # self-similarity on the diagonal.
    iu = torch.triu_indices(n, n, offset=1)
    mean_similarity = sims[iu[0], iu[1]].mean().item()
    return {
        "candidate_diversity": round(float(1.0 - mean_similarity), 4),
        "n_candidates": n,
    }