Spaces:

cmpatino
/

optimal-screening-app

Sleeping

App Files Files Community

cmpatino HF Staff committed 5 days ago

Commit

a1b4ce8

1 Parent(s): cfe9277

feat: add risk calculation gradio app

Browse files

Files changed (11) hide show

.gitignore +5 -0
README.md +10 -2
app.py +290 -0
configs/example-risk.yaml +8 -0
optimal_screening/__init__.py +3 -0
optimal_screening/analysis/__init__.py +20 -0
optimal_screening/analysis/stratified.py +674 -0
optimal_screening/cli/__init__.py +3 -0
optimal_screening/cli/calculate_risk.py +137 -0
optimal_screening/data_sources.py +36 -0
requirements.txt +5 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,5 @@

+.env
+.venv/
+__pycache__/
+*.py[cod]
+runs/

README.md CHANGED Viewed

@@ -5,9 +5,17 @@ colorFrom: yellow
 colorTo: indigo
 sdk: gradio
 sdk_version: 6.14.0
-python_version: '3.13'
 app_file: app.py
 pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 colorTo: indigo
 sdk: gradio
 sdk_version: 6.14.0
+python_version: '3.11'
 app_file: app.py
 pinned: false
 ---
+Gradio app for computing optimal screening risk curves from form inputs.
+The default form values mirror:
+```bash
+uv run calculate-risk configs/example-risk.yaml
+```
+The app accepts a Hugging Face dataset, an uploaded CSV, or pasted CSV rows. Each run writes a temporary JSON file and exposes it through the download button.

app.py ADDED Viewed

	@@ -0,0 +1,290 @@

+from __future__ import annotations
+import json
+import tempfile
+from pathlib import Path
+from typing import Any
+from uuid import uuid4
+import gradio as gr
+from optimal_screening.cli.calculate_risk import calculate_from_config
+# Directory that contains this app file.
+ROOT = Path(__file__).parent
+# Labels of the "Data source" radio choices; the same strings are compared
+# against the selected value to decide which input the calculation reads.
+SOURCE_CSV_UPLOAD = "Upload CSV"
+SOURCE_CSV_PASTE = "Paste CSV"
+SOURCE_HF_DATASET = "Hugging Face dataset"
+# Initial values shown in the form fields (dataset, split, column names, budgets).
+DEFAULT_DATASET = "cmpatino/landmine-detection"
+DEFAULT_SPLIT = "train"
+DEFAULT_OUTCOME = "mines_outcome"
+DEFAULT_STRATA = "Municipio"
+DEFAULT_BETA = 0.1
+# Kept as text because the alpha field is a free-form, comma-separated textbox.
+DEFAULT_ALPHA_VALUES = "0.0, 0.05, 0.1"
+def _result_summary(result: dict[str, Any], output_path: Path) -> str:
+    """Build the Markdown status text shown after a successful calculation.
+    Args:
+        result: Parsed result JSON; missing keys are reported as "unknown"
+            (a missing "alpha_values" counts as zero points).
+        output_path: Path of the written result file; only its name is shown.
+    Returns:
+        Markdown string summarising alpha count, beta, totals and file name.
+    """
+    beta = result.get("beta", "unknown")
+    total_positive = result.get("total_positive", "unknown")
+    total_samples = result.get("total_samples", "unknown")
+    alpha_count = len(result.get("alpha_values", []))
+    parts = [
+        f"Computed `{alpha_count}` alpha point(s) with beta `{beta}`.\n\n",
+        f"Total samples: `{total_samples}`  \n",
+        f"Total positives: `{total_positive}`  \n",
+        f"Output file: `{output_path.name}`",
+    ]
+    return "".join(parts)
+def _uploaded_path(uploaded_csv: Any) -> str | None:
+    """Normalise the file-upload value to a path string.
+    Accepts None, a list (only the first element is used), a plain string,
+    an object exposing a ``name`` attribute, or anything else (stringified).
+    Returns:
+        The path as a string, or None when nothing was uploaded
+        (None or an empty list).
+    """
+    candidate = uploaded_csv
+    if isinstance(candidate, list):
+        if len(candidate) == 0:
+            return None
+        candidate = candidate[0]
+    elif candidate is None:
+        return None
+    if isinstance(candidate, str):
+        return candidate
+    # Objects with a ``name`` attribute contribute that name; others are stringified as-is.
+    return str(getattr(candidate, "name", candidate))
+def _parse_list(value: str, field: str) -> list[str]:
+    """Split comma- and/or newline-separated text into trimmed, non-empty items.
+    Args:
+        value: Raw text from the form.
+        field: Human-readable field name used in the error message.
+    Raises:
+        ValueError: If no non-empty item remains.
+    """
+    items: list[str] = []
+    for chunk in value.replace("\n", ",").split(","):
+        cleaned = chunk.strip()
+        if cleaned:
+            items.append(cleaned)
+    if len(items) == 0:
+        raise ValueError(f"{field} must include at least one value.")
+    return items
+def _parse_optional_float_list(value: str) -> list[float] | None:
+    """Parse an optional comma/newline-separated list of floats.
+    Returns:
+        None when the text is blank, otherwise the parsed floats.
+    Raises:
+        ValueError: If the text has no items or an item is not a valid float.
+    """
+    if value.strip() == "":
+        return None
+    return list(map(float, _parse_list(value, "alpha values")))
+def _optional_text(value: str | None) -> str | None:
+    """Return the stripped text, or None when the value is missing or blank."""
+    stripped = value.strip() if value is not None else ""
+    return stripped if stripped else None
+def _result_filename(value: str | None) -> str:
+    """Derive the result file name from the user-supplied text.
+    Only the final path component is kept (directory parts are dropped) and a
+    ``.json`` suffix is appended when missing.
+    Args:
+        value: Requested file name; may be None or blank.
+    Returns:
+        A non-empty file name ending in ``.json``. Falls back to
+        ``risk-results.json`` when the input is missing, blank, or has no
+        final component (e.g. "/" or "."), which previously produced the
+        hidden file ".json".
+    """
+    default_name = "risk-results.json"
+    requested = value.strip() if value else ""
+    filename = Path(requested).name if requested else ""
+    if not filename:
+        return default_name
+    if not filename.endswith(".json"):
+        filename = f"{filename}.json"
+    return filename
+def _source_visibility(source: str) -> tuple[Any, Any, Any, Any, Any]:
+    """Compute visibility updates for the source-specific inputs.
+    Returns five ``gr.update`` values in this order: upload field, paste
+    field, then three updates that are visible only for the Hugging Face
+    dataset source.
+    """
+    show_upload = source == SOURCE_CSV_UPLOAD
+    show_paste = source == SOURCE_CSV_PASTE
+    show_hf = source == SOURCE_HF_DATASET
+    flags = (show_upload, show_paste, show_hf, show_hf, show_hf)
+    return tuple(gr.update(visible=flag) for flag in flags)
+def _build_config(
+    *,
+    data_source: str,
+    uploaded_csv: Any,
+    pasted_csv: str,
+    hf_dataset: str,
+    hf_split: str,
+    hf_revision: str,
+    outcome: str,
+    strata: str,
+    beta: float,
+    prediction_col: str,
+    risk_col: str,
+    alpha_values: str,
+    result_filename: str,
+    run_dir: Path,
+) -> dict[str, Any]:
+    """Translate the form inputs into a configuration dict for the calculation.
+    Args:
+        data_source: One of the SOURCE_* labels selecting where data comes from.
+        uploaded_csv: Upload value; used only for the upload source.
+        pasted_csv: Raw CSV text; used only for the paste source and written
+            to ``run_dir / "input.csv"``.
+        hf_dataset: Dataset identifier; used only for the dataset source.
+        hf_split: Dataset split; blank falls back to "train".
+        hf_revision: Optional dataset revision; omitted from the config when blank.
+        outcome: Outcome column name (stripped).
+        strata: Comma/newline-separated strata column names.
+        beta: Treatment budget; converted with ``float``.
+        prediction_col: Optional prediction column; omitted when blank.
+        risk_col: Optional risk column; omitted when blank.
+        alpha_values: Optional comma/newline-separated floats; omitted when blank.
+        result_filename: Requested output file name, placed inside ``run_dir``.
+        run_dir: Existing directory for this run's files.
+    Returns:
+        Config dict with "outcome", "strata", "beta", "output", the
+        source-specific keys, and any optional keys that were provided.
+    Raises:
+        ValueError: On missing beta, missing/blank required inputs, an unknown
+            data source, or unparsable alpha values.
+    """
+    outcome_name = outcome.strip()
+    strata_columns = _parse_list(strata, "strata")
+    if beta is None:
+        # Report a readable message instead of float(None)'s TypeError.
+        raise ValueError("Treatment budget beta is required.")
+    config: dict[str, Any] = {
+        "outcome": outcome_name,
+        "strata": strata_columns,
+        "beta": float(beta),
+        "output": str(run_dir / _result_filename(result_filename)),
+    }
+    if data_source == SOURCE_CSV_UPLOAD:
+        csv_path = _uploaded_path(uploaded_csv)
+        if csv_path is None:
+            raise ValueError("Upload a CSV file before calculating.")
+        config["csv"] = csv_path
+    elif data_source == SOURCE_CSV_PASTE:
+        if not pasted_csv.strip():
+            raise ValueError("Paste CSV data before calculating.")
+        pasted_csv_path = run_dir / "input.csv"
+        # Pin the encoding so non-ASCII cell values do not depend on the platform default.
+        pasted_csv_path.write_text(pasted_csv.strip() + "\n", encoding="utf-8")
+        config["csv"] = str(pasted_csv_path)
+    elif data_source == SOURCE_HF_DATASET:
+        dataset = hf_dataset.strip()
+        if not dataset:
+            raise ValueError("Hugging Face dataset is required.")
+        config["hf_dataset"] = dataset
+        config["hf_split"] = hf_split.strip() or "train"
+        revision = _optional_text(hf_revision)
+        if revision is not None:
+            config["hf_revision"] = revision
+    else:
+        raise ValueError(f"Unknown data source: {data_source}")
+    # Optional settings are only added when the user actually filled them in.
+    prediction = _optional_text(prediction_col)
+    if prediction is not None:
+        config["prediction_col"] = prediction
+    risk = _optional_text(risk_col)
+    if risk is not None:
+        config["risk_col"] = risk
+    alpha_quantiles = _parse_optional_float_list(alpha_values)
+    if alpha_quantiles is not None:
+        config["alpha_quantiles"] = alpha_quantiles
+    return config
+def calculate_risk(
+    data_source: str,
+    uploaded_csv: Any,
+    pasted_csv: str,
+    hf_dataset: str,
+    hf_split: str,
+    hf_revision: str,
+    outcome: str,
+    strata: str,
+    beta: float,
+    prediction_col: str,
+    risk_col: str,
+    alpha_values: str,
+    result_filename: str,
+) -> tuple[str, dict[str, Any] | None, Any]:
+    """Run one risk calculation from the form inputs.
+    Creates a fresh run directory under the system temp directory, writes the
+    config as JSON, calls ``calculate_from_config`` and loads the result file.
+    Returns:
+        A tuple of (Markdown status text, parsed result dict or None,
+        update for the download button). On any failure the status carries the
+        error message, the result is None and the download button is disabled.
+    """
+    import shutil  # Local import: only needed to clean up after a failed run.
+    run_dir: Path | None = None
+    try:
+        run_dir = Path(tempfile.gettempdir()) / "risk-calculation" / uuid4().hex
+        run_dir.mkdir(parents=True, exist_ok=True)
+        config = _build_config(
+            data_source=data_source,
+            uploaded_csv=uploaded_csv,
+            pasted_csv=pasted_csv,
+            hf_dataset=hf_dataset,
+            hf_split=hf_split,
+            hf_revision=hf_revision,
+            outcome=outcome,
+            strata=strata,
+            beta=beta,
+            prediction_col=prediction_col,
+            risk_col=risk_col,
+            alpha_values=alpha_values,
+            result_filename=result_filename,
+            run_dir=run_dir,
+        )
+        config_path = run_dir / "risk-config.json"
+        config_path.write_text(json.dumps(config, indent=2), encoding="utf-8")
+        calculated_output_path = calculate_from_config(config_path)
+        result = json.loads(calculated_output_path.read_text())
+        return _result_summary(result, calculated_output_path), result, gr.update(
+            value=str(calculated_output_path),
+            interactive=True,
+        )
+    except Exception as exc:  # noqa: BLE001 - show validation/runtime errors in the interface.
+        # Nothing from a failed run is offered for download, so do not leave
+        # its directory behind in the temp folder.
+        if run_dir is not None:
+            shutil.rmtree(run_dir, ignore_errors=True)
+        return f"Calculation failed: `{exc}`", None, gr.update(value=None, interactive=False)
+# Build the UI: one row with an input column (scale 2) followed by an output column (scale 3).
+with gr.Blocks(title="Risk Calculation") as demo:
+    gr.Markdown("# Risk Calculation")
+    with gr.Row():
+        with gr.Column(scale=2):
+            data_source = gr.Radio(
+                choices=[SOURCE_HF_DATASET, SOURCE_CSV_UPLOAD, SOURCE_CSV_PASTE],
+                value=SOURCE_HF_DATASET,
+                label="Data source",
+            )
+            # Upload and paste inputs start hidden because the default source is the
+            # Hugging Face dataset; the data_source.change handler below toggles them.
+            uploaded_csv = gr.File(
+                label="Upload CSV",
+                file_types=[".csv"],
+                type="filepath",
+                visible=False,
+            )
+            pasted_csv = gr.Textbox(
+                label="Paste CSV",
+                lines=8,
+                max_lines=16,
+                placeholder="risk,outcome,group\n0.9,1,a\n0.1,0,b",
+                visible=False,
+            )
+            hf_dataset = gr.Textbox(
+                value=DEFAULT_DATASET,
+                label="Hugging Face dataset",
+            )
+            with gr.Row():
+                hf_split = gr.Textbox(value=DEFAULT_SPLIT, label="Split")
+                hf_revision = gr.Textbox(value="", label="Revision")
+            outcome = gr.Textbox(value=DEFAULT_OUTCOME, label="Outcome column")
+            strata = gr.Textbox(value=DEFAULT_STRATA, label="Strata columns")
+            beta = gr.Number(
+                value=DEFAULT_BETA,
+                label="Treatment budget beta",
+                minimum=0,
+                maximum=1,
+                step=0.01,
+            )
+            prediction_col = gr.Textbox(value="probability", label="Prediction column")
+            risk_col = gr.Textbox(value="", label="Risk column")
+            alpha_values = gr.Textbox(
+                value=DEFAULT_ALPHA_VALUES,
+                label="Alpha values",
+            )
+            result_filename = gr.Textbox(value="risk-results.json", label="Result file name")
+            run_button = gr.Button("Calculate risk", variant="primary")
+        with gr.Column(scale=3):
+            status_output = gr.Markdown(label="Status")
+            # Starts disabled with no file; calculate_risk enables it on success.
+            download_output = gr.DownloadButton(
+                label="Download results JSON",
+                value=None,
+                interactive=False,
+            )
+            result_output = gr.JSON(label="Results")
+    # Show only the inputs that belong to the selected data source. The outputs
+    # list must stay in the same order as the tuple returned by _source_visibility.
+    data_source.change(
+        fn=_source_visibility,
+        inputs=data_source,
+        outputs=[uploaded_csv, pasted_csv, hf_dataset, hf_split, hf_revision],
+        show_progress="hidden",
+    )
+    # Inputs are passed positionally, so this list must match the parameter
+    # order of calculate_risk; outputs match its returned 3-tuple.
+    run_button.click(
+        fn=calculate_risk,
+        inputs=[
+            data_source,
+            uploaded_csv,
+            pasted_csv,
+            hf_dataset,
+            hf_split,
+            hf_revision,
+            outcome,
+            strata,
+            beta,
+            prediction_col,
+            risk_col,
+            alpha_values,
+            result_filename,
+        ],
+        outputs=[status_output, result_output, download_output],
+        api_name="calculate_risk",
+    )
+# Launch the app (with queuing enabled) only when this file is run as a script.
+if __name__ == "__main__":
+    demo.queue().launch()

configs/example-risk.yaml ADDED Viewed

	@@ -0,0 +1,8 @@

+hf_dataset: cmpatino/landmine-detection
+hf_split: train
+outcome: mines_outcome
+strata:
+  - Municipio
+beta: 0.1
+alpha_quantiles: [0.0, 0.05, 0.1]
+output: runs/example-risk-output.json

optimal_screening/__init__.py ADDED Viewed

	@@ -0,0 +1,3 @@


1	+ """Camera-ready optimal screening code for paper replication."""
2	+
3	+ __all__ = []

optimal_screening/analysis/__init__.py ADDED Viewed

	@@ -0,0 +1,20 @@

+from .stratified import (
+    RISK_PRESETS,
+    SIMULATION_SIZE,
+    compute_empirical_probabilities,
+    compute_intuitive_optimal_curve,
+    compute_optimal_screening_curve,
+    compute_random_screening_curve,
+    generate_simulation_data,
+)
+__all__ = [
+    "RISK_PRESETS",
+    "SIMULATION_SIZE",
+    "compute_empirical_probabilities",
+    "compute_intuitive_optimal_curve",
+    "compute_optimal_screening_curve",
+    "compute_random_screening_curve",
+    "generate_simulation_data",
+]

optimal_screening/analysis/stratified.py ADDED Viewed

	@@ -0,0 +1,674 @@

+from __future__ import annotations
+from collections import defaultdict
+from collections.abc import Sequence
+from typing import Any
+import numpy as np
+# Number of synthetic samples generated when a simulation is requested.
+SIMULATION_SIZE = 100_000
+# Named simulation presets. A tuple is the (a, b) pair of a Beta(a, b)
+# distribution for the risk scores; a bare float is a point mass, i.e. every
+# sample gets that same risk score.
+RISK_PRESETS: dict[str, tuple[float, float] | float] = {
+    "uniform": (1.0, 1.0),
+    "bimodal": (0.1, 0.1),
+    "unimodal": (10.0, 10.0),
+    "delta_half": 0.5,
+}
+def compute_empirical_probabilities(
+    rows: list[dict[str, Any]],
+    outcome_col: str,
+    strata_features: Sequence[str],
+) -> dict[tuple[Any, ...], dict[str, Any]]:
+    """Estimate P(Y=1|X) per stratum as the share of positive outcomes.
+    Rows are grouped by the tuple of their values for ``strata_features``
+    (missing features count as None). Strata appear in the result in the
+    order they are first encountered.
+    Args:
+        rows: Data rows (dicts) holding the features and the outcome.
+        outcome_col: Name of the outcome column (e.g., "PINCP > 50k").
+        strata_features: Feature names that define a stratum (e.g., ['AGEP', 'SEX']).
+    Returns:
+        Mapping from stratum key to a dict with:
+            'probability': (# Y=1) / (# rows) in the stratum,
+            'count': number of rows in the stratum,
+            'positive_count': number of Y=1 rows,
+            'features': feature name -> value, taken from the first row seen.
+    Example:
+        >>> rows = [
+        ...     {'AGEP': 35, 'SEX': 1, 'PINCP > 50k': True},
+        ...     {'AGEP': 35, 'SEX': 1, 'PINCP > 50k': False},
+        ...     {'AGEP': 35, 'SEX': 1, 'PINCP > 50k': True},
+        ... ]
+        >>> strata = compute_empirical_probabilities(rows, 'PINCP > 50k', ['AGEP', 'SEX'])
+        >>> strata[(35, 1)]['probability']
+        0.6666666666666666
+        >>> strata[(35, 1)]['count']
+        3
+    """
+    # stratum key -> [row count, positive count]
+    tallies: dict[tuple[Any, ...], list[int]] = {}
+    first_seen_features: dict[tuple[Any, ...], dict[str, Any]] = {}
+    for record in rows:
+        values = [record.get(name) for name in strata_features]
+        key = tuple(values)
+        if key not in tallies:
+            tallies[key] = [0, 0]
+            first_seen_features[key] = dict(zip(strata_features, values))
+        tally = tallies[key]
+        tally[0] += 1
+        if _is_positive_outcome(record.get(outcome_col)):
+            tally[1] += 1
+    # Every stratum holds at least one row, so the division is always defined.
+    return {
+        key: {
+            "probability": positives / total,
+            "count": total,
+            "positive_count": positives,
+            "features": first_seen_features[key],
+        }
+        for key, (total, positives) in tallies.items()
+    }
+def _is_positive_outcome(value: Any) -> bool:
+    """Decide whether an outcome value represents Y=1.
+    Rules, in order:
+    - None is negative.
+    - Booleans (Python or NumPy) are used as-is.
+    - Numbers (Python or NumPy) are positive when greater than 0.
+    - Strings are trimmed and lower-cased; "true", "1", "yes", "t", "y" are
+      positive, and any other text that parses as a number is positive when
+      greater than 0 (so "1.0" counts as positive and "0" does not).
+    - Anything else is negative.
+    """
+    if value is None:
+        return False
+    # np.bool_ and NumPy integers are not subclasses of bool/int, so they need
+    # explicit handling or they would fall through to the final "negative".
+    if isinstance(value, (bool, np.bool_)):
+        return bool(value)
+    if isinstance(value, (int, float, np.integer, np.floating)):
+        return bool(value > 0)
+    if isinstance(value, str):
+        text = value.strip().lower()
+        if text in ("true", "1", "yes", "t", "y"):
+            return True
+        try:
+            # Numeric text such as "1.0" or "2" follows the numeric rule above.
+            return float(text) > 0
+        except ValueError:
+            return False
+    return False
+def generate_simulation_data(
+    a: float | None = None,
+    b: float | None = None,
+    size: int = SIMULATION_SIZE,
+    seed: int | None = None,
+    point_mass: float | None = None,
+) -> tuple[np.ndarray, np.ndarray]:
+    """Generate synthetic risk scores and binary outcomes.
+    Supports two modes:
+    - **Beta-Binomial**: risk_scores ~ Beta(a, b), outcomes ~ Binomial(1, risk_scores).
+    - **Point mass**: all risk_scores = *point_mass*, outcomes ~ Binomial(1, point_mass).
+    Args:
+        a: Alpha parameter of the Beta distribution (ignored when *point_mass* is set).
+        b: Beta parameter of the Beta distribution (ignored when *point_mass* is set).
+        size: Number of samples to generate.
+        seed: Random seed for reproducibility.
+        point_mass: If provided, every risk score is set to this constant value.
+    Returns:
+        Tuple of (risk_scores, outcomes), each an array of length *size*.
+    Raises:
+        ValueError: If *point_mass* is not set and *a* or *b* is missing.
+    """
+    rng = np.random.default_rng(seed)
+    if point_mass is not None:
+        # Force a float array so an integer point mass (0 or 1) still yields float scores.
+        risk_scores = np.full(size, point_mass, dtype=float)
+    else:
+        if a is None or b is None:
+            # Fail with a clear message instead of NumPy's TypeError on None parameters.
+            raise ValueError("Provide both 'a' and 'b' for Beta sampling, or set 'point_mass'.")
+        risk_scores = rng.beta(a, b, size=size)
+    # Each outcome is a single Bernoulli draw with that sample's own risk score.
+    outcomes = rng.binomial(1, risk_scores)
+    return risk_scores, outcomes
+def compute_optimal_screening_curve(
+    rows: list[dict[str, Any]],
+    outcome_col: str,
+    strata_features: Sequence[str],
+    prediction_col: str = "probability",
+    beta: float = 0.5,
+    alpha_quantiles: Sequence[float] | None = None,
+    max_iterations: int = 20,
+    tolerance: float = 1e-6,
+    seed: int | None = None,
+    use_custom_risk_col: str | None = None,
+    simulation: str | tuple[float, float] | None = None,
+) -> dict[str, Any]:
+    """Compute optimal screening curve with treatment budget β and screening budget α.
+    Rows are sorted by risk (highest first) and split into four consecutive
+    bands by population mass:
+    - Band 1: Top (β - α) - Treated
+    - Band 2: Next α × (1 - avg_risk(Band 3)) - Treated
+    - Band 3: Next α - Screened (true outcomes)
+    - Band 4: Everyone else - Untreated (contributes no true positives)
+    The Band 2 size depends on the average risk of Band 3, whose position
+    depends on the Band 2 size; this is resolved by fixed-point iteration
+    starting from avg_risk = 0.
+    The reported true-positive count for each α is the number of rows with a
+    positive true outcome in Bands 1-3, forced to be non-decreasing over the
+    sequence of α values.
+    Args:
+        rows: List of data rows with features, outcome, and predictions
+            (ignored when *simulation* is given)
+        outcome_col: Name of outcome column
+        strata_features: Features defining strata for computing empirical P(Y=1|X)
+        prediction_col: Column name for model predictions; stored per row as
+            "model_prediction" (default 0.5 when missing) but not used for
+            the band computation in this function
+        beta: Treatment budget (proportion who can be treated)
+        alpha_quantiles: Screening budget levels to evaluate; defaults to 50
+            equally spaced values from 0 to beta
+        max_iterations: Maximum iterations for convergence
+        tolerance: Convergence tolerance for avg_risk
+        seed: Random seed passed to the synthetic data generator; only used
+            when *simulation* is given
+        use_custom_risk_col: If provided, use this column for risk (default 0.5
+            when missing) instead of computing empirical probabilities from
+            strata. Useful for comparing LLM predictions with empirical baselines.
+        simulation: If provided, generate synthetic data instead of using real
+            data. Pass a preset name from RISK_PRESETS or a tuple (a, b) of Beta
+            distribution parameters; a bare number is treated as a point mass.
+            Uses SIMULATION_SIZE samples.
+    Returns:
+        Dictionary with "beta", "total_positive", "total_samples" and three
+        parallel lists with one entry per α: "alpha_values", "true_positives"
+        and "band_info" (band integrals, population proportions, end indices,
+        expected negatives in Band 3 and the Band 3 average risk).
+    Raises:
+        ValueError: If *simulation* names an unknown preset.
+        AssertionError: If an α exceeds β (only while assertions are enabled).
+    """
+    if alpha_quantiles is None:
+        # Default: 50 equally spaced values from 0 to beta (inclusive)
+        alpha_quantiles = [beta * i / 49 for i in range(50)]
+    # Assign each row its risk (simulation, custom, or empirical)
+    rows_with_risk = []
+    if simulation is not None:
+        # Generate synthetic data (Beta distribution or point mass)
+        if isinstance(simulation, str):
+            if simulation not in RISK_PRESETS:
+                raise ValueError(f"Unknown simulation preset '{simulation}'. Choose from {list(RISK_PRESETS.keys())}.")
+            preset = RISK_PRESETS[simulation]
+        else:
+            preset = simulation
+        # A bare number is a point mass; anything else is unpacked as Beta (a, b)
+        if isinstance(preset, (int, float)):
+            risk_scores, outcomes = generate_simulation_data(size=SIMULATION_SIZE, seed=seed, point_mass=float(preset))
+        else:
+            a, b = preset
+            risk_scores, outcomes = generate_simulation_data(a, b, size=SIMULATION_SIZE, seed=seed)
+        for i in range(SIMULATION_SIZE):
+            rows_with_risk.append(
+                {
+                    "row": {"_sim_index": i, "_sim_feature": 0, outcome_col: bool(outcomes[i])},
+                    "empirical_risk": float(risk_scores[i]),
+                    "true_outcome": bool(outcomes[i]),
+                    "model_prediction": float(risk_scores[i]),
+                }
+            )
+    elif use_custom_risk_col is not None:
+        # Use custom risk column directly
+        for row in rows:
+            risk = row.get(use_custom_risk_col, 0.5)
+            rows_with_risk.append(
+                {
+                    "row": row,
+                    "empirical_risk": risk,
+                    "true_outcome": _is_positive_outcome(row.get(outcome_col)),
+                    "model_prediction": row.get(prediction_col, 0.5),
+                }
+            )
+    else:
+        # Compute empirical P(Y=1|X) for each stratum
+        empirical_probs = compute_empirical_probabilities(rows, outcome_col, strata_features)
+        for row in rows:
+            stratum_key = tuple(row.get(f) for f in strata_features)
+            empirical_risk = empirical_probs.get(stratum_key, {}).get("probability", 0.5)
+            rows_with_risk.append(
+                {
+                    "row": row,
+                    "empirical_risk": empirical_risk,
+                    "true_outcome": _is_positive_outcome(row.get(outcome_col)),
+                    "model_prediction": row.get(prediction_col, 0.5),
+                }
+            )
+    # Sort by risk (highest to lowest)
+    rows_with_risk.sort(key=lambda x: x["empirical_risk"], reverse=True)
+    total_positive = sum(1 for r in rows_with_risk if r["true_outcome"])
+    n = len(rows_with_risk)
+    # NOTE(review): with no rows (n == 0) the population proportions below
+    # divide by zero - confirm whether empty input should be rejected earlier.
+    # Results storage
+    results = {
+        "beta": beta,
+        "alpha_values": [],
+        "true_positives": [],
+        "band_info": [],
+        "total_positive": total_positive,
+        "total_samples": n,
+    }
+    for alpha in alpha_quantiles:
+        # NOTE(review): assert is skipped when Python runs with -O, so this
+        # check is not a reliable input validation.
+        assert alpha <= beta, f"Screening budget α={alpha} exceeds treatment budget β={beta}"
+        # Iteratively find Band 3 position.
+        prev_avg_risk = 0.0
+        for _iteration in range(max_iterations):
+            # Compute target mass: ∫ f(risk) d(risk) = target
+            # Where f(risk) is the density over risk values
+            # For discrete: sum of (count at each risk / total count) = proportion of population at that risk
+            band1_target_mass = beta - alpha
+            # Band 2 size: ∫ 1 × f(risk) d(risk) over Band 2 = ∫ (1 - risk) × f(risk) d(risk) over Band 3
+            # Since Band 3 has mass α and average risk prev_avg_risk:
+            # ∫ (1 - risk) × f(risk) d(risk) over Band 3 = α × (1 - prev_avg_risk)
+            band2_target_mass = alpha * (1 - prev_avg_risk)
+            band3_target_mass = alpha
+            # Band 1: Find index where cumulative proportion of population = band1_target_mass
+            # This is: ∫ f(risk) d(risk) from risk=1 down to some risk threshold
+            # NOTE(review): the mass is added before the comparison, so a band
+            # whose target is already met (e.g. target 0) still takes one row.
+            # The same holds for the Band 2 and Band 3 loops - confirm intended.
+            cumulative_mass = 0.0
+            band1_end_idx = 0
+            for i in range(n):
+                # Each person contributes 1/n to the density (proportion of population)
+                population_contribution = 1.0 / n
+                cumulative_mass += population_contribution
+                if cumulative_mass >= band1_target_mass:
+                    band1_end_idx = i + 1
+                    break
+            if band1_end_idx == 0 and band1_target_mass > 0:
+                band1_end_idx = 1  # At least one person
+            # Band 2: Continue from Band 1 end (cumulative_mass carries over)
+            # NOTE(review): if a loop ends without reaching its target, the band
+            # end index stays at its start, leaving that band empty.
+            target_mass_band1_plus_band2 = band1_target_mass + band2_target_mass
+            band2_end_idx = band1_end_idx
+            for i in range(band1_end_idx, n):
+                population_contribution = 1.0 / n
+                cumulative_mass += population_contribution
+                if cumulative_mass >= target_mass_band1_plus_band2:
+                    band2_end_idx = i + 1
+                    break
+            # Band 3: Continue from Band 2 end
+            target_mass_band1_plus_band2_plus_band3 = band1_target_mass + band2_target_mass + band3_target_mass
+            band3_end_idx = band2_end_idx
+            for i in range(band2_end_idx, n):
+                population_contribution = 1.0 / n
+                cumulative_mass += population_contribution
+                if cumulative_mass >= target_mass_band1_plus_band2_plus_band3:
+                    band3_end_idx = i + 1
+                    break
+            # Ensure indices are within bounds
+            band1_end_idx = min(band1_end_idx, n)
+            band2_end_idx = min(band2_end_idx, n)
+            band3_end_idx = min(band3_end_idx, n)
+            # Compute average risk of Band 3
+            if band3_end_idx > band2_end_idx:
+                band3_risks = [rows_with_risk[i]["empirical_risk"] for i in range(band2_end_idx, band3_end_idx)]
+                current_avg_risk = np.mean(band3_risks) if band3_risks else 0.0
+            else:
+                current_avg_risk = 0.0
+            # Check convergence
+            if abs(current_avg_risk - prev_avg_risk) < tolerance:
+                break
+            prev_avg_risk = current_avg_risk
+        # Final band sizes (keep the indices from the last iteration)
+        # The indices are already set from the last iteration above
+        # On convergence this is the estimate that produced those indices
+        # (it differs from the latest average by less than tolerance).
+        avg_risk_band3 = prev_avg_risk
+        # Compute integrals: ∫ risk × (1/n) dx for each band (for reporting purposes)
+        band1_integral = sum(rows_with_risk[i]["empirical_risk"] / n for i in range(0, band1_end_idx))
+        band2_integral = sum(rows_with_risk[i]["empirical_risk"] / n for i in range(band1_end_idx, band2_end_idx))
+        band3_integral = sum(rows_with_risk[i]["empirical_risk"] / n for i in range(band2_end_idx, band3_end_idx))
+        band4_integral = sum(rows_with_risk[i]["empirical_risk"] / n for i in range(band3_end_idx, n))
+        # Population proportions = ∫ f(risk) d(risk) for each band
+        # This is the "mass" used for band selection
+        band1_pop_prop = band1_end_idx / n
+        band2_pop_prop = (band2_end_idx - band1_end_idx) / n
+        band3_pop_prop = (band3_end_idx - band2_end_idx) / n
+        band4_pop_prop = (n - band3_end_idx) / n
+        # Expected negatives in Band 3: ∫ (1 - risk) × f(risk) d(risk) over Band 3
+        band3_expected_negatives = sum(
+            (1 - rows_with_risk[i]["empirical_risk"]) / n for i in range(band2_end_idx, band3_end_idx)
+        )
+        # Count true positives in each band (true outcomes are counted in Bands 1-3)
+        tp_count = 0
+        # Band 1: Treated
+        for i in range(0, band1_end_idx):
+            item = rows_with_risk[i]
+            if item["true_outcome"]:
+                tp_count += 1
+        # Band 2: Treated
+        for i in range(band1_end_idx, band2_end_idx):
+            item = rows_with_risk[i]
+            if item["true_outcome"]:
+                tp_count += 1
+        # Band 3: Screened, use true outcomes
+        for i in range(band2_end_idx, band3_end_idx):
+            item = rows_with_risk[i]
+            if item["true_outcome"]:
+                tp_count += 1
+        # Band 4: Untreated, predict 0 (no TPs)
+        # (no contribution to tp_count)
+        results["alpha_values"].append(alpha)
+        # Enforce monotonicity: TP can never decrease as screening budget grows
+        # (relative to the previous entry, i.e. in the order alpha_quantiles is given)
+        tp_count = max(tp_count, results["true_positives"][-1] if results["true_positives"] else 0)
+        results["true_positives"].append(tp_count)
+        results["band_info"].append(
+            {
+                "alpha": alpha,
+                "band1_integral": band1_integral,
+                "band2_integral": band2_integral,
+                "band3_integral": band3_integral,
+                "band4_integral": band4_integral,
+                "band1_pop_prop": band1_pop_prop,
+                "band2_pop_prop": band2_pop_prop,
+                "band3_pop_prop": band3_pop_prop,
+                "band4_pop_prop": band4_pop_prop,
+                "band3_expected_negatives": band3_expected_negatives,
+                "avg_risk_band3": avg_risk_band3,
+                "band1_end_idx": band1_end_idx,
+                "band2_end_idx": band2_end_idx,
+                "band3_end_idx": band3_end_idx,
+            }
+        )
+    return results
+def compute_random_screening_curve(
+    rows: list[dict[str, Any]],
+    outcome_col: str,
+    strata_features: Sequence[str],
+    prediction_col: str = "probability",
+    beta: float = 0.5,
+    alpha_quantiles: Sequence[float] | None = None,
+    seed: int = 42,
+    use_custom_risk_col: str | None = None,
+    simulation: str | tuple[float, float] | None = None,
+) -> dict[str, Any]:
+    """Compute random screening baseline curve.
+    This baseline screens α proportion of the population at random (instead of targeting
+    low-risk individuals). It treats:
+    1. All screened individuals with Y=1 (true positive outcome)
+    2. From unscreened, treats top (β + prop_screened_negatives - prop_screened_positives) by risk
+    The intuition: by randomly screening, we identify some negatives and don't waste treatment
+    budget on them, allowing us to treat more high-risk unscreened individuals.
+    Args:
+        rows: List of data rows with features, outcome, and predictions
+        outcome_col: Name of outcome column
+        strata_features: Features defining strata (used for risk scoring)
+        prediction_col: Column name for model predictions
+        beta: Treatment budget (proportion who can be treated)
+        alpha_quantiles: Screening budget levels to evaluate
+        seed: Random seed for reproducible random screening
+        use_custom_risk_col: If provided, use this column for risk instead of empirical
+        simulation: If provided, generate synthetic data from a Beta distribution instead
+            of using real data. Pass a preset name ('uniform', 'bimodal', 'unimodal') or
+            a tuple (a, b) of Beta distribution parameters. Uses SIMULATION_SIZE samples.
+    Returns:
+        Dictionary with screening curves
+    """
+    if alpha_quantiles is None:
+        alpha_quantiles = [beta * i / 49 for i in range(50)]
+    # Assign each row its risk (simulation, custom, or empirical)
+    rows_with_risk = []
+    if simulation is not None:
+        # Generate synthetic data from a Beta distribution
+        if isinstance(simulation, str):
+            if simulation not in RISK_PRESETS:
+                raise ValueError(f"Unknown simulation preset '{simulation}'. Choose from {list(RISK_PRESETS.keys())}.")
+            preset = RISK_PRESETS[simulation]
+        else:
+            preset = simulation
+        if isinstance(preset, (int, float)):
+            risk_scores, outcomes = generate_simulation_data(size=SIMULATION_SIZE, seed=seed, point_mass=float(preset))
+        else:
+            a, b = preset
+            risk_scores, outcomes = generate_simulation_data(a, b, size=SIMULATION_SIZE, seed=seed)
+        for i in range(SIMULATION_SIZE):
+            rows_with_risk.append(
+                {
+                    "row": {"_sim_index": i, "_sim_feature": 0, outcome_col: bool(outcomes[i])},
+                    "empirical_risk": float(risk_scores[i]),
+                    "true_outcome": bool(outcomes[i]),
+                    "model_prediction": float(risk_scores[i]),
+                }
+            )
+    elif use_custom_risk_col is not None:
+        # Use custom risk column directly
+        for row in rows:
+            risk = row.get(use_custom_risk_col, 0.5)
+            rows_with_risk.append(
+                {
+                    "row": row,
+                    "empirical_risk": risk,
+                    "true_outcome": _is_positive_outcome(row.get(outcome_col)),
+                    "model_prediction": row.get(prediction_col, 0.5),
+                }
+            )
+    else:
+        # Compute empirical P(Y=1|X) for each stratum
+        empirical_probs = compute_empirical_probabilities(rows, outcome_col, strata_features)
+        for row in rows:
+            stratum_key = tuple(row.get(f) for f in strata_features)
+            empirical_risk = empirical_probs.get(stratum_key, {}).get("probability", 0.5)
+            rows_with_risk.append(
+                {
+                    "row": row,
+                    "empirical_risk": empirical_risk,
+                    "true_outcome": _is_positive_outcome(row.get(outcome_col)),
+                    "model_prediction": row.get(prediction_col, 0.5),
+                }
+            )
+    total_positive = sum(1 for r in rows_with_risk if r["true_outcome"])
+    n = len(rows_with_risk)
+    # Results storage
+    results = {
+        "beta": beta,
+        "alpha_values": [],
+        "true_positives": [],
+        "total_positive": total_positive,
+        "total_samples": n,
+    }
+    # Set random seed for reproducibility — use a single permutation so that
+    # screened sets are nested (larger α always includes the smaller α set).
+    rng = np.random.RandomState(seed)
+    random_order = rng.permutation(n)
+    for alpha in alpha_quantiles:
+        assert alpha <= beta, f"Screening budget α={alpha} exceeds treatment budget β={beta}"
+        # Screen α proportion uniformly at random
+        n_screen = min(int(alpha * n), n)
+        n_treat = int(beta * n)
+        screened_indices = set(random_order[:n_screen])
+        # Identify screened positives (gamma mass)
+        screened_positive_indices = {idx for idx in screened_indices if rows_with_risk[idx]["true_outcome"]}
+        gamma_count = len(screened_positive_indices)
+        # Treat screened positives up to budget
+        tp_from_screening = min(gamma_count, n_treat)
+        remaining_budget = max(0, n_treat - tp_from_screening)
+        # Pool for risk-based treatment: everyone except screened positives
+        pool = [(idx, rows_with_risk[idx]) for idx in range(n) if idx not in screened_positive_indices]
+        pool.sort(key=lambda x: x[1]["empirical_risk"], reverse=True)
+        # Treat top (β - γ) mass by risk score
+        n_treat_by_risk = min(remaining_budget, len(pool))
+        tp_from_risk = sum(1 for i in range(n_treat_by_risk) if pool[i][1]["true_outcome"])
+        tp_count = tp_from_screening + tp_from_risk
+        results["alpha_values"].append(alpha)
+        results["true_positives"].append(tp_count)
+    return results
def compute_intuitive_optimal_curve(
    rows: list[dict[str, Any]],
    outcome_col: str,
    strata_features: Sequence[str],
    prediction_col: str = "probability",
    beta: float = 0.5,
    alpha_quantiles: Sequence[float] | None = None,
    seed: int | None = None,
    use_custom_risk_col: str | None = None,
    simulation: str | tuple[float, float] | None = None,
) -> dict[str, Any]:
    """Compute the intuitive-optimal screening curve.

    The population is sorted by risk (highest first) and, for every screening
    budget α, split into three adjacent bands:

      1. Band A: treat the top (β − α) mass by risk (highest risk, no screening).
      2. Band B: screen the next α mass.  Let γ ≤ α be the mass of screened
         individuals with Y=0.  Screened Y=1 are treated; screened Y=0 are not.
      3. Band C: treat the next γ mass below the screened band (replaces the
         screened negatives, preserving total treatment budget = β).

    Band sizes are derived from ``int(beta * n)`` and ``int(alpha * n)`` and
    band A is the remainder, so bands A and B together always contain exactly
    ``int(beta * n)`` individuals regardless of float rounding.

    Args:
        rows: List of data rows (ignored when *simulation* is set).
        outcome_col: Name of outcome column.
        strata_features: Features defining strata (used only when the risk is
            estimated empirically, i.e. no simulation and no custom risk column).
        prediction_col: Column name for model predictions; missing values
            default to 0.5.
        beta: Treatment budget (proportion who can be treated).
        alpha_quantiles: Screening budget levels to evaluate.  Defaults to 50
            evenly spaced values from 0 to *beta*.
        seed: Random seed forwarded to the simulation data generator; unused
            outside simulation mode.
        use_custom_risk_col: Use this column for risk instead of empirical
            (missing values default to 0.5).
        simulation: Preset name or (a, b) Beta parameters for synthetic data.
            A preset that resolves to a single number is passed to the
            generator as a point mass.

    Returns:
        Dictionary with beta, alpha_values, true_positives, total_positive,
        total_samples.

    Raises:
        ValueError: If *simulation* is a string that is not a known preset.
        AssertionError: If a screening budget α exceeds the treatment budget β.
    """
    if alpha_quantiles is None:
        alpha_quantiles = [beta * i / 49 for i in range(50)]

    # --- Build one record per individual: source row, risk, outcome, prediction ---
    rows_with_risk: list[dict[str, Any]]
    if simulation is not None:
        if isinstance(simulation, str):
            if simulation not in RISK_PRESETS:
                raise ValueError(f"Unknown simulation preset '{simulation}'. Choose from {list(RISK_PRESETS.keys())}.")
            preset = RISK_PRESETS[simulation]
        else:
            preset = simulation
        if isinstance(preset, (int, float)):
            risk_scores, outcomes = generate_simulation_data(size=SIMULATION_SIZE, seed=seed, point_mass=float(preset))
        else:
            a, b = preset
            risk_scores, outcomes = generate_simulation_data(a, b, size=SIMULATION_SIZE, seed=seed)
        rows_with_risk = [
            {
                "row": {"_sim_index": i, "_sim_feature": 0, outcome_col: bool(outcomes[i])},
                "empirical_risk": float(risk_scores[i]),
                "true_outcome": bool(outcomes[i]),
                "model_prediction": float(risk_scores[i]),
            }
            for i in range(SIMULATION_SIZE)
        ]
    elif use_custom_risk_col is not None:
        rows_with_risk = [
            {
                "row": row,
                "empirical_risk": row.get(use_custom_risk_col, 0.5),
                "true_outcome": _is_positive_outcome(row.get(outcome_col)),
                "model_prediction": row.get(prediction_col, 0.5),
            }
            for row in rows
        ]
    else:
        empirical_probs = compute_empirical_probabilities(rows, outcome_col, strata_features)
        rows_with_risk = [
            {
                "row": row,
                "empirical_risk": empirical_probs.get(tuple(row.get(f) for f in strata_features), {}).get(
                    "probability", 0.5
                ),
                "true_outcome": _is_positive_outcome(row.get(outcome_col)),
                "model_prediction": row.get(prediction_col, 0.5),
            }
            for row in rows
        ]

    # Sort by risk (highest to lowest); the sort is stable, so ties keep input order.
    rows_with_risk.sort(key=lambda item: item["empirical_risk"], reverse=True)
    n = len(rows_with_risk)

    # positives_upto[k] = number of true positives among the k highest-risk rows.
    # With this prefix sum every band count below is a constant-time difference
    # instead of a fresh scan of the population for each alpha.
    positives_upto = [0]
    for item in rows_with_risk:
        positives_upto.append(positives_upto[-1] + (1 if item["true_outcome"] else 0))
    total_positive = positives_upto[-1]

    results: dict[str, Any] = {
        "beta": beta,
        "alpha_values": [],
        "true_positives": [],
        "total_positive": total_positive,
        "total_samples": n,
    }

    for alpha in alpha_quantiles:
        # Explicit raise (not ``assert``) so the check survives ``python -O``;
        # the exception type is kept for callers that rely on it.
        if not alpha <= beta:
            raise AssertionError(f"Screening budget α={alpha} exceeds treatment budget β={beta}")

        # Truncating (beta - alpha) * n and alpha * n independently can lose one
        # unit of the treatment budget, so size band A as the remainder instead.
        n_treat = max(0, min(int(beta * n), n))
        n_screen = max(0, min(int(alpha * n), n_treat))
        band_a_end = n_treat - n_screen
        band_b_end = n_treat

        # Band A: treated by risk
        tp_band_a = positives_upto[band_a_end]
        # Band B: screened — Y=1 treated, Y=0 not treated
        tp_band_b = positives_upto[band_b_end] - positives_upto[band_a_end]
        gamma_count = n_screen - tp_band_b  # number of screened with Y=0
        # Band C: next gamma_count individuals treated by risk
        band_c_end = min(band_b_end + gamma_count, n)
        tp_band_c = positives_upto[band_c_end] - positives_upto[band_b_end]

        results["alpha_values"].append(alpha)
        results["true_positives"].append(tp_band_a + tp_band_b + tp_band_c)
    return results

optimal_screening/cli/__init__.py ADDED Viewed

	@@ -0,0 +1,3 @@


1	+ """Command-line entry points for the camera-ready paper code."""
2	+
3	+ __all__ = []

optimal_screening/cli/calculate_risk.py ADDED Viewed

	@@ -0,0 +1,137 @@

+from __future__ import annotations
+import argparse
+import json
+from pathlib import Path
+from typing import Any
+import yaml
+from optimal_screening.analysis import compute_optimal_screening_curve
+from optimal_screening.data_sources import load_dataframe
+REQUIRED_FIELDS = {"outcome", "strata", "beta"}
+def _read_config(path: Path) -> dict[str, Any]:
+    if not path.exists():
+        raise FileNotFoundError(f"Config file not found: {path}")
+    text = path.read_text()
+    if path.suffix.lower() == ".json":
+        data = json.loads(text)
+    elif path.suffix.lower() in {".yaml", ".yml"}:
+        data = yaml.safe_load(text)
+    else:
+        raise ValueError("Config file must be YAML or JSON")
+    if not isinstance(data, dict):
+        raise ValueError("Config must be a mapping")
+    return data
+def _as_float_sequence(values: Any, field: str) -> list[float] | None:
+    if values is None:
+        return None
+    if not isinstance(values, list | tuple):
+        raise ValueError(f"{field} must be a list of numbers")
+    return [float(value) for value in values]
def _validate_config(config: dict[str, Any]) -> dict[str, Any]:
    """Validate a raw config mapping and return a normalized copy.

    Checks that the required fields are present, that exactly one of ``csv``
    and ``hf_dataset`` is set, that ``strata`` is a non-empty list of strings,
    that ``beta`` lies in (0, 1], and that every ``alpha_quantiles`` entry lies
    in [0, beta].

    Args:
        config: Parsed config mapping.

    Returns:
        A new dictionary with the keys ``csv``, ``hf_dataset``, ``hf_split``,
        ``hf_revision``, ``outcome``, ``strata``, ``beta``, ``prediction_col``,
        ``risk_col``, ``alpha_quantiles`` and ``output``; optional fields are
        filled with their defaults or ``None``.

    Raises:
        ValueError: If any of the checks above fails.
    """
    missing = sorted(REQUIRED_FIELDS - set(config))
    if missing:
        raise ValueError(f"Missing required config fields: {missing}")

    has_csv = config.get("csv") is not None
    has_hf_dataset = config.get("hf_dataset") is not None
    # Equal flags mean either both sources or neither source was given.
    if has_csv == has_hf_dataset:
        raise ValueError("Config must provide exactly one data source: csv or hf_dataset")

    strata = config["strata"]
    if not isinstance(strata, list) or not strata or not all(isinstance(item, str) for item in strata):
        raise ValueError("strata must be a non-empty list of column names")

    try:
        beta = float(config["beta"])
    except (TypeError, ValueError) as err:
        raise ValueError("beta must be a number") from err
    if not 0 < beta <= 1:
        raise ValueError("beta must be in the interval (0, 1]")

    alpha_quantiles = _as_float_sequence(config.get("alpha_quantiles"), "alpha_quantiles")
    if alpha_quantiles is not None:
        # A chained comparison is False for NaN, so NaN is reported as invalid
        # (``alpha < 0 or alpha > beta`` would silently accept it).
        invalid = [alpha for alpha in alpha_quantiles if not 0 <= alpha <= beta]
        if invalid:
            raise ValueError(f"alpha_quantiles must be between 0 and beta={beta}; invalid values: {invalid}")

    return {
        "csv": str(config["csv"]) if has_csv else None,
        "hf_dataset": str(config["hf_dataset"]) if has_hf_dataset else None,
        "hf_split": str(config.get("hf_split", "train")),
        "hf_revision": str(config["hf_revision"]) if config.get("hf_revision") is not None else None,
        "outcome": str(config["outcome"]),
        # Copy so later mutation of the result cannot alter the caller's config.
        "strata": list(strata),
        "beta": beta,
        "prediction_col": str(config.get("prediction_col", "probability")),
        "risk_col": str(config["risk_col"]) if config.get("risk_col") is not None else None,
        "alpha_quantiles": alpha_quantiles,
        "output": str(config.get("output", "runs/optimal_screening_curve.json")),
    }
+def _json_safe(value: Any) -> Any:
+    if isinstance(value, dict):
+        return {key: _json_safe(item) for key, item in value.items()}
+    if isinstance(value, list | tuple):
+        return [_json_safe(item) for item in value]
+    if hasattr(value, "item"):
+        return value.item()
+    return value
def calculate_from_config(config_path: Path) -> Path:
    """Run the screening-curve computation described by a config file.

    Reads and validates the config, loads the dataset it points to, checks
    that the needed columns exist, computes the curve and writes it as JSON.

    Args:
        config_path: Path to a YAML or JSON config file.

    Returns:
        Path of the JSON file that was written.

    Raises:
        ValueError: If the outcome, strata or custom risk columns are missing
            from the loaded data. Errors from reading or validating the config
            and from loading the data propagate unchanged.
    """
    config = _validate_config(_read_config(config_path))
    df, dataset_label = load_dataframe(
        csv_path=config["csv"],
        hf_dataset=config["hf_dataset"],
        hf_split=config["hf_split"],
        hf_revision=config["hf_revision"],
    )

    # The prediction column is optional, so it is not part of this check.
    required_cols = {config["outcome"], *config["strata"]}
    # ``is not None`` mirrors how the risk column is handled elsewhere: an
    # empty column name is still a request for a custom risk column.
    if config["risk_col"] is not None:
        required_cols.add(config["risk_col"])
    missing_cols = sorted(required_cols - set(df.columns))
    if missing_cols:
        raise ValueError(f"Missing required columns in {dataset_label}: {missing_cols}")

    result = compute_optimal_screening_curve(
        rows=df.to_dict("records"),
        outcome_col=config["outcome"],
        strata_features=config["strata"],
        prediction_col=config["prediction_col"],
        beta=config["beta"],
        alpha_quantiles=config["alpha_quantiles"],
        use_custom_risk_col=config["risk_col"],
    )

    output_path = Path(config["output"])
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(json.dumps(_json_safe(result), indent=2), encoding="utf-8")
    return output_path
def main(argv: list[str] | None = None) -> None:
    """Command-line entry point: compute a screening curve from a config file.

    Args:
        argv: Argument list to parse instead of the process arguments.  The
            default ``None`` makes ``argparse`` read ``sys.argv[1:]``, so
            existing ``main()`` callers and console-script entry points behave
            as before; passing a list allows programmatic invocation.
    """
    parser = argparse.ArgumentParser(description="Compute an optimal screening curve from a YAML or JSON config")
    parser.add_argument("config", help="Path to a YAML or JSON config file")
    args = parser.parse_args(argv)
    output_path = calculate_from_config(Path(args.config))
    print(f"Wrote {output_path}")


if __name__ == "__main__":
    main()

optimal_screening/data_sources.py ADDED Viewed

	@@ -0,0 +1,36 @@

+from __future__ import annotations
+from pathlib import Path
+import pandas as pd
+from datasets import load_dataset
def load_hf_dataframe(dataset: str, split: str = "train", revision: str | None = None) -> pd.DataFrame:
    """Fetch one split of a Hugging Face dataset and return it as a pandas DataFrame.

    The ``revision`` keyword is forwarded to ``load_dataset`` only when a
    revision was actually given.
    """
    revision_args = {} if revision is None else {"revision": revision}
    hf_split = load_dataset(path=dataset, split=split, **revision_args)
    return hf_split.to_pandas()
def load_dataframe(
    *,
    csv_path: str | None = None,
    hf_dataset: str | None = None,
    hf_split: str = "train",
    hf_revision: str | None = None,
) -> tuple[pd.DataFrame, str]:
    """Read the data from a CSV file or a Hugging Face dataset, whichever one is given.

    Exactly one of *csv_path* and *hf_dataset* must be set.  The second element
    of the returned pair is a label for the source: the dataset name or the
    CSV path as a string.

    Raises:
        ValueError: If neither or both sources are given.
        FileNotFoundError: If *csv_path* does not exist.
    """
    provided = sum(1 for candidate in (csv_path, hf_dataset) if candidate is not None)
    if provided != 1:
        raise ValueError("Provide exactly one data source: csv_path or hf_dataset")

    if hf_dataset is None:
        # Only the CSV source remains at this point.
        assert csv_path is not None
        csv_file = Path(csv_path)
        if csv_file.exists():
            return pd.read_csv(csv_file), str(csv_file)
        raise FileNotFoundError(f"CSV file not found: {csv_file}")

    frame = load_hf_dataframe(hf_dataset, split=hf_split, revision=hf_revision)
    return frame, hf_dataset

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+gradio==6.14.0
+PyYAML>=6.0
+datasets>=2.18
+numpy>=1.24
+pandas>=2.0