Spaces:

cmpatino
/

optimal-screening-app

Sleeping

App Files Files Community

cmpatino HF Staff committed 3 days ago

Commit

09f8c96

1 Parent(s): 2ebff59

Update app for optimal screening CSV output

Browse files

Files changed (6) hide show

README.md +4 -4
app.py +76 -72
configs/example-risk.yaml +2 -2
optimal_screening/analysis/__init__.py +2 -0
optimal_screening/analysis/stratified.py +200 -118
optimal_screening/cli/get_optimal_screening.py +129 -0

README.md CHANGED Viewed

@@ -1,5 +1,5 @@
 ---
-title: Optimal Screening Calculation
 emoji: 🐠
 colorFrom: yellow
 colorTo: indigo
@@ -10,12 +10,12 @@ app_file: app.py
 pinned: false
 ---
-Gradio app for computing optimal screening risk curves from form inputs.
 The default form values mirror:
 ```bash
-uv run calculate-risk configs/example-risk.yaml
 ```
-The app accepts a Hugging Face dataset, an uploaded CSV, or pasted CSV rows. Each run writes a temporary JSON file and exposes it through the download button.

 ---
+title: Optimal Screening Decisions
 emoji: 🐠
 colorFrom: yellow
 colorTo: indigo
 pinned: false
 ---
+Gradio app for writing optimal screening decisions from form inputs.
 The default form values mirror:
 ```bash
+uv run get-optimal-screening configs/example-risk.yaml
 ```
+The app accepts a Hugging Face dataset, an uploaded CSV, or pasted CSV rows. Each run writes a temporary CSV file with an added screening decision column and exposes it through the download button.

app.py CHANGED Viewed

@@ -7,11 +7,11 @@ from typing import Any
 from uuid import uuid4
 import gradio as gr
-from optimal_screening.cli.calculate_risk import calculate_from_config
-ROOT = Path(__file__).parent
 SOURCE_CSV_UPLOAD = "Upload CSV"
 SOURCE_CSV_PASTE = "Paste CSV"
 SOURCE_HF_DATASET = "Hugging Face dataset"
@@ -21,21 +21,8 @@ DEFAULT_SPLIT = "train"
 DEFAULT_OUTCOME = "mines_outcome"
 DEFAULT_STRATA = "Municipio"
 DEFAULT_BETA = 0.1
-DEFAULT_ALPHA_VALUES = "0.0, 0.05, 0.1"
-def _result_summary(result: dict[str, Any], output_path: Path) -> str:
-    alpha_count = len(result.get("alpha_values", []))
-    total_samples = result.get("total_samples", "unknown")
-    total_positive = result.get("total_positive", "unknown")
-    beta = result.get("beta", "unknown")
-    return (
-        f"Computed `{alpha_count}` alpha point(s) with beta `{beta}`.\n\n"
-        f"Total samples: `{total_samples}`  \n"
-        f"Total positives: `{total_positive}`  \n"
-        f"Output file: `{output_path.name}`"
-    )
 def _uploaded_path(uploaded_csv: Any) -> str | None:
@@ -59,12 +46,6 @@ def _parse_list(value: str, field: str) -> list[str]:
     return values
-def _parse_optional_float_list(value: str) -> list[float] | None:
-    if not value.strip():
-        return None
-    return [float(item) for item in _parse_list(value, "alpha values")]
 def _optional_text(value: str | None) -> str | None:
     if value is None:
         return None
@@ -72,10 +53,10 @@ def _optional_text(value: str | None) -> str | None:
     return value or None
-def _result_filename(value: str | None) -> str:
-    filename = Path(value.strip()).name if value and value.strip() else "risk-results.json"
-    if not filename.endswith(".json"):
-        filename = f"{filename}.json"
     return filename
@@ -100,27 +81,29 @@ def _build_config(
     outcome: str,
     strata: str,
     beta: float,
     prediction_col: str,
     risk_col: str,
-    alpha_values: str,
-    result_filename: str,
     run_dir: Path,
 ) -> dict[str, Any]:
     config: dict[str, Any] = {
         "outcome": outcome.strip(),
         "strata": _parse_list(strata, "strata"),
         "beta": float(beta),
-        "output": str(run_dir / _result_filename(result_filename)),
     }
     if data_source == SOURCE_CSV_UPLOAD:
         csv_path = _uploaded_path(uploaded_csv)
         if csv_path is None:
-            raise ValueError("Upload a CSV file before calculating.")
         config["csv"] = csv_path
     elif data_source == SOURCE_CSV_PASTE:
         if not pasted_csv.strip():
-            raise ValueError("Paste CSV data before calculating.")
         pasted_csv_path = run_dir / "input.csv"
         pasted_csv_path.write_text(pasted_csv.strip() + "\n")
         config["csv"] = str(pasted_csv_path)
@@ -144,14 +127,30 @@ def _build_config(
     if risk is not None:
         config["risk_col"] = risk
-    alpha_quantiles = _parse_optional_float_list(alpha_values)
-    if alpha_quantiles is not None:
-        config["alpha_quantiles"] = alpha_quantiles
     return config
-def calculate_risk(
     data_source: str,
     uploaded_csv: Any,
     pasted_csv: str,
@@ -161,13 +160,14 @@ def calculate_risk(
     outcome: str,
     strata: str,
     beta: float,
     prediction_col: str,
     risk_col: str,
-    alpha_values: str,
-    result_filename: str,
-) -> tuple[str, dict[str, Any] | None, Any]:
     try:
-        run_dir = Path(tempfile.gettempdir()) / "risk-calculation" / uuid4().hex
         run_dir.mkdir(parents=True, exist_ok=True)
         config = _build_config(
@@ -180,28 +180,26 @@ def calculate_risk(
             outcome=outcome,
             strata=strata,
             beta=beta,
             prediction_col=prediction_col,
             risk_col=risk_col,
-            alpha_values=alpha_values,
-            result_filename=result_filename,
             run_dir=run_dir,
         )
-        config_path = run_dir / "risk-config.json"
         config_path.write_text(json.dumps(config, indent=2))
-        calculated_output_path = calculate_from_config(config_path)
-        result = json.loads(calculated_output_path.read_text())
-        return _result_summary(result, calculated_output_path), result, gr.update(
-            value=str(calculated_output_path),
-            interactive=True,
-        )
     except Exception as exc:  # noqa: BLE001 - show validation/runtime errors in the interface.
-        return f"Calculation failed: `{exc}`", None, gr.update(value=None, interactive=False)
-with gr.Blocks(title="Risk Calculation") as demo:
-    gr.Markdown("# Risk Calculation")
     with gr.Row():
         with gr.Column(scale=2):
@@ -233,30 +231,35 @@ with gr.Blocks(title="Risk Calculation") as demo:
             outcome = gr.Textbox(value=DEFAULT_OUTCOME, label="Outcome column")
             strata = gr.Textbox(value=DEFAULT_STRATA, label="Strata columns")
-            beta = gr.Number(
-                value=DEFAULT_BETA,
-                label="Treatment budget beta",
-                minimum=0,
-                maximum=1,
-                step=0.01,
-            )
             prediction_col = gr.Textbox(value="probability", label="Prediction column")
             risk_col = gr.Textbox(value="", label="Risk column")
-            alpha_values = gr.Textbox(
-                value=DEFAULT_ALPHA_VALUES,
-                label="Alpha values",
-            )
-            result_filename = gr.Textbox(value="risk-results.json", label="Result file name")
-            run_button = gr.Button("Calculate risk", variant="primary")
         with gr.Column(scale=3):
             status_output = gr.Markdown(label="Status")
             download_output = gr.DownloadButton(
-                label="Download results JSON",
                 value=None,
                 interactive=False,
             )
-            result_output = gr.JSON(label="Results")
     data_source.change(
         fn=_source_visibility,
@@ -265,7 +268,7 @@ with gr.Blocks(title="Risk Calculation") as demo:
         show_progress="hidden",
     )
     run_button.click(
-        fn=calculate_risk,
         inputs=[
             data_source,
             uploaded_csv,
@@ -276,13 +279,14 @@ with gr.Blocks(title="Risk Calculation") as demo:
             outcome,
             strata,
             beta,
             prediction_col,
             risk_col,
-            alpha_values,
-            result_filename,
         ],
-        outputs=[status_output, result_output, download_output],
-        api_name="calculate_risk",
     )

 from uuid import uuid4
 import gradio as gr
+import pandas as pd
+from optimal_screening.cli.get_optimal_screening import get_optimal_screening_from_config
 SOURCE_CSV_UPLOAD = "Upload CSV"
 SOURCE_CSV_PASTE = "Paste CSV"
 SOURCE_HF_DATASET = "Hugging Face dataset"
 DEFAULT_OUTCOME = "mines_outcome"
 DEFAULT_STRATA = "Municipio"
 DEFAULT_BETA = 0.1
+DEFAULT_ALPHA = 0.05
+DEFAULT_ACTION_COL = "screening_decision"
 def _uploaded_path(uploaded_csv: Any) -> str | None:
     return values
 def _optional_text(value: str | None) -> str | None:
     if value is None:
         return None
     return value or None
+def _csv_filename(value: str | None) -> str:
+    filename = Path(value.strip()).name if value and value.strip() else "optimal-screening.csv"
+    if not filename.endswith(".csv"):
+        filename = f"{filename}.csv"
     return filename
     outcome: str,
     strata: str,
     beta: float,
+    alpha: float,
     prediction_col: str,
     risk_col: str,
+    action_col: str,
+    output_filename: str,
     run_dir: Path,
 ) -> dict[str, Any]:
     config: dict[str, Any] = {
         "outcome": outcome.strip(),
         "strata": _parse_list(strata, "strata"),
         "beta": float(beta),
+        "alpha": float(alpha),
+        "output": str(run_dir / _csv_filename(output_filename)),
     }
     if data_source == SOURCE_CSV_UPLOAD:
         csv_path = _uploaded_path(uploaded_csv)
         if csv_path is None:
+            raise ValueError("Upload a CSV file before running.")
         config["csv"] = csv_path
     elif data_source == SOURCE_CSV_PASTE:
         if not pasted_csv.strip():
+            raise ValueError("Paste CSV data before running.")
         pasted_csv_path = run_dir / "input.csv"
         pasted_csv_path.write_text(pasted_csv.strip() + "\n")
         config["csv"] = str(pasted_csv_path)
     if risk is not None:
         config["risk_col"] = risk
+    action = _optional_text(action_col)
+    if action is not None:
+        config["action_col"] = action
     return config
def _result_summary(output_path: Path, action_col: str) -> tuple[str, pd.DataFrame]:
    """Summarise a written decisions CSV for display in the UI.

    Args:
        output_path: Location of the CSV file to read back.
        action_col: Name of the column whose values are tallied, if present.

    Returns:
        A pair ``(markdown_summary, preview)`` where ``preview`` holds at most
        the first 100 rows of the file.
    """
    frame = pd.read_csv(output_path)
    lines = [f"Wrote `{output_path.name}`.", "", f"Rows: `{len(frame)}`"]

    if action_col in frame.columns:
        # Tally each distinct value, listed in ascending order of the value.
        tally = frame[action_col].value_counts().sort_index()
        rendered = [f"{int(label)}: {int(total)}" for label, total in tally.items()]
        joined = ", ".join(rendered)
        lines.append(f"{action_col}: `{joined}`")

    return "\n".join(lines), frame.head(100)
+def get_optimal_screening(
     data_source: str,
     uploaded_csv: Any,
     pasted_csv: str,
     outcome: str,
     strata: str,
     beta: float,
+    alpha: float,
     prediction_col: str,
     risk_col: str,
+    action_col: str,
+    output_filename: str,
+) -> tuple[str, pd.DataFrame | None, Any]:
     try:
+        run_dir = Path(tempfile.gettempdir()) / "optimal-screening" / uuid4().hex
         run_dir.mkdir(parents=True, exist_ok=True)
         config = _build_config(
             outcome=outcome,
             strata=strata,
             beta=beta,
+            alpha=alpha,
             prediction_col=prediction_col,
             risk_col=risk_col,
+            action_col=action_col,
+            output_filename=output_filename,
             run_dir=run_dir,
         )
+        config_path = run_dir / "optimal-screening-config.json"
         config_path.write_text(json.dumps(config, indent=2))
+        output_path = get_optimal_screening_from_config(config_path)
+        summary, preview = _result_summary(output_path, config.get("action_col", DEFAULT_ACTION_COL))
+        return summary, preview, gr.update(value=str(output_path), interactive=True)
     except Exception as exc:  # noqa: BLE001 - show validation/runtime errors in the interface.
+        return f"Run failed: `{exc}`", None, gr.update(value=None, interactive=False)
+with gr.Blocks(title="Optimal Screening Decisions") as demo:
+    gr.Markdown("# Optimal Screening Decisions")
     with gr.Row():
         with gr.Column(scale=2):
             outcome = gr.Textbox(value=DEFAULT_OUTCOME, label="Outcome column")
             strata = gr.Textbox(value=DEFAULT_STRATA, label="Strata columns")
+            with gr.Row():
+                beta = gr.Number(
+                    value=DEFAULT_BETA,
+                    label="Treatment budget beta",
+                    minimum=0,
+                    maximum=1,
+                    step=0.01,
+                )
+                alpha = gr.Number(
+                    value=DEFAULT_ALPHA,
+                    label="Screening budget alpha",
+                    minimum=0,
+                    maximum=1,
+                    step=0.01,
+                )
             prediction_col = gr.Textbox(value="probability", label="Prediction column")
             risk_col = gr.Textbox(value="", label="Risk column")
+            action_col = gr.Textbox(value=DEFAULT_ACTION_COL, label="Action column")
+            output_filename = gr.Textbox(value="optimal-screening.csv", label="Output file name")
+            run_button = gr.Button("Run", variant="primary")
         with gr.Column(scale=3):
             status_output = gr.Markdown(label="Status")
             download_output = gr.DownloadButton(
+                label="Download CSV",
                 value=None,
                 interactive=False,
             )
+            preview_output = gr.Dataframe(label="CSV preview", interactive=False)
     data_source.change(
         fn=_source_visibility,
         show_progress="hidden",
     )
     run_button.click(
+        fn=get_optimal_screening,
         inputs=[
             data_source,
             uploaded_csv,
             outcome,
             strata,
             beta,
+            alpha,
             prediction_col,
             risk_col,
+            action_col,
+            output_filename,
         ],
+        outputs=[status_output, preview_output, download_output],
+        api_name="get_optimal_screening",
     )

configs/example-risk.yaml CHANGED Viewed

@@ -4,5 +4,5 @@ outcome: mines_outcome
 strata:
   - Municipio
 beta: 0.1
-alpha_quantiles: [0.0, 0.05, 0.1]
-output: runs/example-risk-output.json

 strata:
   - Municipio
 beta: 0.1
+alpha: 0.05
+output: runs/example-risk-output.csv

optimal_screening/analysis/__init__.py CHANGED Viewed

@@ -3,6 +3,7 @@ from .stratified import (
     SIMULATION_SIZE,
     compute_empirical_probabilities,
     compute_intuitive_optimal_curve,
     compute_optimal_screening_curve,
     compute_random_screening_curve,
     generate_simulation_data,
@@ -14,6 +15,7 @@ __all__ = [
     "SIMULATION_SIZE",
     "compute_empirical_probabilities",
     "compute_intuitive_optimal_curve",
     "compute_optimal_screening_curve",
     "compute_random_screening_curve",
     "generate_simulation_data",

     SIMULATION_SIZE,
     compute_empirical_probabilities,
     compute_intuitive_optimal_curve,
+    compute_optimal_screening_actions,
     compute_optimal_screening_curve,
     compute_random_screening_curve,
     generate_simulation_data,
     "SIMULATION_SIZE",
     "compute_empirical_probabilities",
     "compute_intuitive_optimal_curve",
+    "compute_optimal_screening_actions",
     "compute_optimal_screening_curve",
     "compute_random_screening_curve",
     "generate_simulation_data",

optimal_screening/analysis/stratified.py CHANGED Viewed

@@ -130,54 +130,15 @@ def generate_simulation_data(
     return risk_scores, outcomes
-def compute_optimal_screening_curve(
     rows: list[dict[str, Any]],
     outcome_col: str,
     strata_features: Sequence[str],
-    prediction_col: str = "probability",
-    beta: float = 0.5,
-    alpha_quantiles: Sequence[float] | None = None,
-    max_iterations: int = 20,
-    tolerance: float = 1e-6,
-    seed: int | None = None,
-    use_custom_risk_col: str | None = None,
-    simulation: str | tuple[float, float] | None = None,
-) -> dict[str, Any]:
-    """Compute optimal screening curve with treatment budget β and screening budget α.
-    Band structure (highest to lowest risk):
-    - Band 1: Top (β - α) - Treated, model predictions
-    - Band 2: Next (α - avg_risk(Band 3)) - Treated, model predictions
-    - Band 3: Next α - Screened (true outcomes)
-    - Band 4: Bottom (1 - β - α + avg_risk) - Untreated (predict 0)
-    Uses iterative method to resolve circular dependency between Band 2 and Band 3.
-    Args:
-        rows: List of data rows with features, outcome, and predictions
-        outcome_col: Name of outcome column
-        strata_features: Features defining strata for computing empirical P(Y=1|X)
-        prediction_col: Column name for model predictions
-        beta: Treatment budget (proportion who can be treated)
-        alpha_quantiles: Screening budget levels to evaluate
-        max_iterations: Maximum iterations for convergence
-        tolerance: Convergence tolerance for avg_risk
-        seed: Random seed for uniform distribution override (for debugging)
-        use_custom_risk_col: If provided, use this column for risk instead of computing
-            empirical probabilities from strata. Useful for comparing LLM predictions
-            with empirical baselines.
-        simulation: If provided, generate synthetic data from a Beta distribution instead
-            of using real data. Pass a preset name ('uniform', 'bimodal', 'unimodal') or
-            a tuple (a, b) of Beta distribution parameters. Uses SIMULATION_SIZE samples.
-    Returns:
-        Dictionary with screening curves and band information
-    """
-    if alpha_quantiles is None:
-        # Default: 10 equally spaced values from 0 to beta
-        alpha_quantiles = [beta * i / 49 for i in range(50)]
-    # Assign each row its risk (simulation, custom, or empirical)
     rows_with_risk = []
     if simulation is not None:
@@ -202,11 +163,12 @@ def compute_optimal_screening_curve(
                     "empirical_risk": float(risk_scores[i]),
                     "true_outcome": bool(outcomes[i]),
                     "model_prediction": float(risk_scores[i]),
                 }
             )
     elif use_custom_risk_col is not None:
         # Use custom risk column directly
-        for row in rows:
             risk = row.get(use_custom_risk_col, 0.5)
             rows_with_risk.append(
                 {
@@ -214,13 +176,14 @@ def compute_optimal_screening_curve(
                     "empirical_risk": risk,
                     "true_outcome": _is_positive_outcome(row.get(outcome_col)),
                     "model_prediction": row.get(prediction_col, 0.5),
                 }
             )
     else:
         # Compute empirical P(Y=1|X) for each stratum
         empirical_probs = compute_empirical_probabilities(rows, outcome_col, strata_features)
-        for row in rows:
             stratum_key = tuple(row.get(f) for f in strata_features)
             empirical_risk = empirical_probs.get(stratum_key, {}).get("probability", 0.5)
@@ -230,9 +193,192 @@ def compute_optimal_screening_curve(
                     "empirical_risk": empirical_risk,
                     "true_outcome": _is_positive_outcome(row.get(outcome_col)),
                     "model_prediction": row.get(prediction_col, 0.5),
                 }
             )
     # Sort by risk (highest to lowest)
     rows_with_risk.sort(key=lambda x: x["empirical_risk"], reverse=True)
@@ -250,77 +396,13 @@ def compute_optimal_screening_curve(
     }
     for alpha in alpha_quantiles:
-        assert alpha <= beta, f"Screening budget α={alpha} exceeds treatment budget β={beta}"
-        # Iteratively find Band 3 position.
-        prev_avg_risk = 0.0
-        for _iteration in range(max_iterations):
-            # Compute target mass: ∫ f(risk) d(risk) = target
-            # Where f(risk) is the density over risk values
-            # For discrete: sum of (count at each risk / total count) = proportion of population at that risk
-            band1_target_mass = beta - alpha
-            # Band 2 size: ∫ 1 × f(risk) d(risk) over Band 2 = ∫ (1 - risk) × f(risk) d(risk) over Band 3
-            # Since Band 3 has mass α and average risk prev_avg_risk:
-            # ∫ (1 - risk) × f(risk) d(risk) over Band 3 = α × (1 - prev_avg_risk)
-            band2_target_mass = alpha * (1 - prev_avg_risk)
-            band3_target_mass = alpha
-            # Band 1: Find index where cumulative proportion of population = band1_target_mass
-            # This is: ∫ f(risk) d(risk) from risk=1 down to some risk threshold
-            cumulative_mass = 0.0
-            band1_end_idx = 0
-            for i in range(n):
-                # Each person contributes 1/n to the density (proportion of population)
-                population_contribution = 1.0 / n
-                cumulative_mass += population_contribution
-                if cumulative_mass >= band1_target_mass:
-                    band1_end_idx = i + 1
-                    break
-            if band1_end_idx == 0 and band1_target_mass > 0:
-                band1_end_idx = 1  # At least one person
-            # Band 2: Continue from Band 1 end
-            target_mass_band1_plus_band2 = band1_target_mass + band2_target_mass
-            band2_end_idx = band1_end_idx
-            for i in range(band1_end_idx, n):
-                population_contribution = 1.0 / n
-                cumulative_mass += population_contribution
-                if cumulative_mass >= target_mass_band1_plus_band2:
-                    band2_end_idx = i + 1
-                    break
-            # Band 3: Continue from Band 2 end
-            target_mass_band1_plus_band2_plus_band3 = band1_target_mass + band2_target_mass + band3_target_mass
-            band3_end_idx = band2_end_idx
-            for i in range(band2_end_idx, n):
-                population_contribution = 1.0 / n
-                cumulative_mass += population_contribution
-                if cumulative_mass >= target_mass_band1_plus_band2_plus_band3:
-                    band3_end_idx = i + 1
-                    break
-            # Ensure indices are within bounds
-            band1_end_idx = min(band1_end_idx, n)
-            band2_end_idx = min(band2_end_idx, n)
-            band3_end_idx = min(band3_end_idx, n)
-            # Compute average risk of Band 3
-            if band3_end_idx > band2_end_idx:
-                band3_risks = [rows_with_risk[i]["empirical_risk"] for i in range(band2_end_idx, band3_end_idx)]
-                current_avg_risk = np.mean(band3_risks) if band3_risks else 0.0
-            else:
-                current_avg_risk = 0.0
-            # Check convergence
-            if abs(current_avg_risk - prev_avg_risk) < tolerance:
-                break
-            prev_avg_risk = current_avg_risk
-        # Final band sizes (keep the indices from the last iteration)
-        # The indices are already set from the converged iteration above
-        avg_risk_band3 = prev_avg_risk
         # Compute integrals: ∫ risk × (1/n) dx for each band (for reporting purposes)
         band1_integral = sum(rows_with_risk[i]["empirical_risk"] / n for i in range(0, band1_end_idx))

     return risk_scores, outcomes
+def _build_rows_with_risk(
     rows: list[dict[str, Any]],
     outcome_col: str,
     strata_features: Sequence[str],
+    prediction_col: str,
+    seed: int | None,
+    use_custom_risk_col: str | None,
+    simulation: str | tuple[float, float] | None,
+) -> list[dict[str, Any]]:
     rows_with_risk = []
     if simulation is not None:
                     "empirical_risk": float(risk_scores[i]),
                     "true_outcome": bool(outcomes[i]),
                     "model_prediction": float(risk_scores[i]),
+                    "_input_index": i,
                 }
             )
     elif use_custom_risk_col is not None:
         # Use custom risk column directly
+        for input_index, row in enumerate(rows):
             risk = row.get(use_custom_risk_col, 0.5)
             rows_with_risk.append(
                 {
                     "empirical_risk": risk,
                     "true_outcome": _is_positive_outcome(row.get(outcome_col)),
                     "model_prediction": row.get(prediction_col, 0.5),
+                    "_input_index": input_index,
                 }
             )
     else:
         # Compute empirical P(Y=1|X) for each stratum
         empirical_probs = compute_empirical_probabilities(rows, outcome_col, strata_features)
+        for input_index, row in enumerate(rows):
             stratum_key = tuple(row.get(f) for f in strata_features)
             empirical_risk = empirical_probs.get(stratum_key, {}).get("probability", 0.5)
                     "empirical_risk": empirical_risk,
                     "true_outcome": _is_positive_outcome(row.get(outcome_col)),
                     "model_prediction": row.get(prediction_col, 0.5),
+                    "_input_index": input_index,
                 }
             )
+    return rows_with_risk
def _end_index_for_target_mass(n: int, target_mass: float) -> int:
    """Return the smallest prefix length whose population share reaches ``target_mass``.

    Each of the ``n`` rows contributes ``1 / n`` of the population, so the
    result is the smallest ``k`` in ``1..n`` with ``k / n >= target_mass``.

    Args:
        n: Total number of rows.
        target_mass: Desired cumulative share of the population.

    Returns:
        ``0`` when ``n`` or ``target_mass`` is not positive, ``n`` when the
        target cannot be reached, otherwise the exclusive end index.
    """
    if n <= 0 or target_mass <= 0:
        return 0
    # Compare k / n directly instead of accumulating 1 / n: the running sum
    # drifts (ten additions of 0.1 give 0.7999999999999999 at the eighth step),
    # which pushed the boundary one row too far for targets such as 0.8.
    for k in range(1, n + 1):
        if k / n >= target_mass:
            return k
    return n
def _find_optimal_band_indices(
    rows_with_risk: list[dict[str, Any]],
    beta: float,
    alpha: float,
    max_iterations: int,
    tolerance: float,
) -> tuple[int, int, int, float]:
    """Locate the exclusive end indices of Bands 1-3 for one (beta, alpha) pair.

    The size of Band 2 depends on the average risk of Band 3, whose position
    in turn depends on the size of Band 2, so the indices are found by
    fixed-point iteration starting from an average risk of 0.

    Args:
        rows_with_risk: Rows carrying an ``"empirical_risk"`` entry. This
            function does not sort them; the order given is the order used.
        beta: Treatment budget as a share of the population.
        alpha: Screening budget as a share of the population.
        max_iterations: Upper bound on fixed-point iterations.
        tolerance: Stop once the Band 3 average risk changes by less than this.

    Returns:
        ``(band1_end_idx, band2_end_idx, band3_end_idx, avg_risk_band3)``;
        all zeros when there are no rows or ``max_iterations`` is not positive.

    Raises:
        AssertionError: If ``alpha`` is not less than or equal to ``beta``.
    """
    # Raise explicitly rather than via ``assert`` so the check survives
    # ``python -O``; AssertionError is kept so callers see the same type.
    if not alpha <= beta:
        raise AssertionError(f"Screening budget α={alpha} exceeds treatment budget β={beta}")
    n = len(rows_with_risk)
    if n == 0:
        return 0, 0, 0, 0.0

    # Band 1 and Band 3 masses do not depend on the iteration state.
    band1_target_mass = beta - alpha
    band3_target_mass = alpha

    prev_avg_risk = 0.0
    band1_end_idx = 0
    band2_end_idx = 0
    band3_end_idx = 0
    avg_risk_band3 = 0.0
    for _iteration in range(max_iterations):
        # Band 2 mass equals the expected share of Band 3 that screens negative:
        # ∫ (1 - risk) × f(risk) d(risk) over Band 3 = α × (1 - avg_risk(Band 3)).
        band2_target_mass = alpha * (1 - prev_avg_risk)

        band1_end_idx = _end_index_for_target_mass(n, band1_target_mass)
        band2_end_idx = _end_index_for_target_mass(n, band1_target_mass + band2_target_mass)
        band3_end_idx = _end_index_for_target_mass(
            n, band1_target_mass + band2_target_mass + band3_target_mass
        )

        # Ensure indices are ordered and within bounds
        band1_end_idx = min(band1_end_idx, n)
        band2_end_idx = max(band1_end_idx, min(band2_end_idx, n))
        band3_end_idx = max(band2_end_idx, min(band3_end_idx, n))

        # Average risk of Band 3 (0.0 when the band is empty)
        if band3_end_idx > band2_end_idx:
            band3_risks = [rows_with_risk[i]["empirical_risk"] for i in range(band2_end_idx, band3_end_idx)]
            current_avg_risk = float(np.mean(band3_risks))
        else:
            current_avg_risk = 0.0
        avg_risk_band3 = current_avg_risk

        # Check convergence
        if abs(current_avg_risk - prev_avg_risk) < tolerance:
            break
        prev_avg_risk = current_avg_risk

    return band1_end_idx, band2_end_idx, band3_end_idx, avg_risk_band3
def compute_optimal_screening_actions(
    rows: list[dict[str, Any]],
    outcome_col: str,
    strata_features: Sequence[str],
    prediction_col: str = "probability",
    beta: float = 0.5,
    alpha: float = 0.0,
    max_iterations: int = 20,
    tolerance: float = 1e-6,
    seed: int | None = None,
    use_custom_risk_col: str | None = None,
    simulation: str | tuple[float, float] | None = None,
) -> list[int]:
    """Assign a single screening action to every row for one (beta, alpha) pair.

    Rows are ranked by descending ``"empirical_risk"``; ranks below the Band 2
    boundary are treated, ranks below the Band 3 boundary are screened, and
    the remainder are ignored.

    Returns:
        One action code per row, in the original input order:
        ``0`` = ignore, ``1`` = treat directly, ``2`` = screen.
    """
    ranked = _build_rows_with_risk(
        rows=rows,
        outcome_col=outcome_col,
        strata_features=strata_features,
        prediction_col=prediction_col,
        seed=seed,
        use_custom_risk_col=use_custom_risk_col,
        simulation=simulation,
    )
    ranked.sort(key=lambda entry: entry["empirical_risk"], reverse=True)

    _, treat_end, screen_end, _ = _find_optimal_band_indices(
        rows_with_risk=ranked,
        beta=beta,
        alpha=alpha,
        max_iterations=max_iterations,
        tolerance=tolerance,
    )

    def _action_for(rank: int) -> int:
        # Bands 1 and 2 are both treated directly; Band 3 is screened.
        if rank < treat_end:
            return 1
        if rank < screen_end:
            return 2
        return 0

    # Map each row's original position to its action, then read back in order.
    by_input_position = {
        entry["_input_index"]: _action_for(rank) for rank, entry in enumerate(ranked)
    }
    return [by_input_position[position] for position in range(len(ranked))]
+def compute_optimal_screening_curve(
+    rows: list[dict[str, Any]],
+    outcome_col: str,
+    strata_features: Sequence[str],
+    prediction_col: str = "probability",
+    beta: float = 0.5,
+    alpha_quantiles: Sequence[float] | None = None,
+    max_iterations: int = 20,
+    tolerance: float = 1e-6,
+    seed: int | None = None,
+    use_custom_risk_col: str | None = None,
+    simulation: str | tuple[float, float] | None = None,
+) -> dict[str, Any]:
+    """Compute optimal screening curve with treatment budget β and screening budget α.
+    Band structure (highest to lowest risk):
+    - Band 1: Top (β - α) - Treated, model predictions
+    - Band 2: Next α × (1 - avg_risk(Band 3)) - Treated, model predictions
+    - Band 3: Next α - Screened (true outcomes)
+    - Band 4: Bottom (1 - β - α + α × avg_risk(Band 3)) - Untreated (predict 0)
+    Uses iterative method to resolve circular dependency between Band 2 and Band 3.
+    Args:
+        rows: List of data rows with features, outcome, and predictions
+        outcome_col: Name of outcome column
+        strata_features: Features defining strata for computing empirical P(Y=1|X)
+        prediction_col: Column name for model predictions
+        beta: Treatment budget (proportion who can be treated)
+        alpha_quantiles: Screening budget levels to evaluate
+        max_iterations: Maximum iterations for convergence
+        tolerance: Convergence tolerance for avg_risk
+        seed: Random seed for uniform distribution override (for debugging)
+        use_custom_risk_col: If provided, use this column for risk instead of computing
+            empirical probabilities from strata. Useful for comparing LLM predictions
+            with empirical baselines.
+        simulation: If provided, generate synthetic data from a Beta distribution instead
+            of using real data. Pass a preset name ('uniform', 'bimodal', 'unimodal') or
+            a tuple (a, b) of Beta distribution parameters. Uses SIMULATION_SIZE samples.
+    Returns:
+        Dictionary with screening curves and band information
+    """
+    if alpha_quantiles is None:
+        # Default: 50 equally spaced values from 0 to beta
+        alpha_quantiles = [beta * i / 49 for i in range(50)]
+    # Assign each row its risk (simulation, custom, or empirical)
+    rows_with_risk = _build_rows_with_risk(
+        rows=rows,
+        outcome_col=outcome_col,
+        strata_features=strata_features,
+        prediction_col=prediction_col,
+        seed=seed,
+        use_custom_risk_col=use_custom_risk_col,
+        simulation=simulation,
+    )
     # Sort by risk (highest to lowest)
     rows_with_risk.sort(key=lambda x: x["empirical_risk"], reverse=True)
     }
     for alpha in alpha_quantiles:
+        band1_end_idx, band2_end_idx, band3_end_idx, avg_risk_band3 = _find_optimal_band_indices(
+            rows_with_risk=rows_with_risk,
+            beta=beta,
+            alpha=alpha,
+            max_iterations=max_iterations,
+            tolerance=tolerance,
+        )
         # Compute integrals: ∫ risk × (1/n) dx for each band (for reporting purposes)
         band1_integral = sum(rows_with_risk[i]["empirical_risk"] / n for i in range(0, band1_end_idx))

optimal_screening/cli/get_optimal_screening.py ADDED Viewed

	@@ -0,0 +1,129 @@

+from __future__ import annotations
+import argparse
+import json
+from pathlib import Path
+from typing import Any
+import yaml
+from optimal_screening.analysis import compute_optimal_screening_actions
+from optimal_screening.data_sources import load_dataframe
# Config keys that every run must supply. Frozen so this module-level constant
# cannot be mutated by accident; set difference against it still works.
REQUIRED_FIELDS = frozenset({"alpha", "beta", "outcome", "strata"})
# Column name used for the screening decisions when the config sets no "action_col".
DEFAULT_ACTION_COL = "screening_decision"
+def _read_config(path: Path) -> dict[str, Any]:
+    if not path.exists():
+        raise FileNotFoundError(f"Config file not found: {path}")
+    text = path.read_text()
+    if path.suffix.lower() == ".json":
+        data = json.loads(text)
+    elif path.suffix.lower() in {".yaml", ".yml"}:
+        data = yaml.safe_load(text)
+    else:
+        raise ValueError("Config file must be YAML or JSON")
+    if not isinstance(data, dict):
+        raise ValueError("Config must be a mapping")
+    return data
def _config_number(name: str, value: Any) -> float:
    """Convert the config value for key ``name`` to ``float`` or raise ``ValueError``."""
    try:
        return float(value)
    except (TypeError, ValueError) as err:
        raise ValueError(f"{name} must be a number") from err


def _validate_config(config: dict[str, Any]) -> dict[str, Any]:
    """Validate a raw config mapping and return a normalized copy.

    Keys whose value is ``None`` (an explicit null in YAML/JSON) are treated
    the same as absent keys, so optional settings fall back to their defaults
    and required settings are reported as missing.

    Args:
        config: Parsed config mapping.

    Returns:
        A dictionary with the keys ``csv``, ``hf_dataset``, ``hf_split``,
        ``hf_revision``, ``outcome``, ``strata``, ``beta``, ``alpha``,
        ``prediction_col``, ``risk_col``, ``action_col`` and ``output``.
        ``beta`` and ``alpha`` are floats, ``strata`` is the validated list,
        and the remaining values are strings or ``None``.

    Raises:
        ValueError: If ``alpha_quantiles`` is present, a required field is
            missing, the data source is not exactly one of ``csv`` /
            ``hf_dataset``, ``strata`` is not a non-empty list of strings,
            ``beta`` or ``alpha`` is not numeric or out of range, or
            ``action_col`` is empty.
    """
    if "alpha_quantiles" in config:
        raise ValueError("Use alpha for one screening budget; alpha_quantiles is only for curve outputs")
    # Drop explicit nulls up front so every later lookup sees "unset" the same
    # way, instead of turning None into the literal string "None" via str().
    provided = {key: value for key, value in config.items() if value is not None}
    missing = sorted(REQUIRED_FIELDS - set(provided))
    if missing:
        raise ValueError(f"Missing required config fields: {missing}")
    has_csv = "csv" in provided
    has_hf_dataset = "hf_dataset" in provided
    # Equal flags mean either both sources or neither source was given.
    if has_csv == has_hf_dataset:
        raise ValueError("Config must provide exactly one data source: csv or hf_dataset")
    strata = provided["strata"]
    if not isinstance(strata, list) or not strata or not all(isinstance(item, str) for item in strata):
        raise ValueError("strata must be a non-empty list of column names")
    beta = _config_number("beta", provided["beta"])
    if not 0 < beta <= 1:
        raise ValueError("beta must be in the interval (0, 1]")
    alpha = _config_number("alpha", provided["alpha"])
    if not 0 <= alpha <= beta:
        raise ValueError(f"alpha must be between 0 and beta={beta}")
    action_col = str(provided.get("action_col", DEFAULT_ACTION_COL))
    if not action_col:
        raise ValueError("action_col must not be empty")
    return {
        "csv": str(provided["csv"]) if has_csv else None,
        "hf_dataset": str(provided["hf_dataset"]) if has_hf_dataset else None,
        "hf_split": str(provided.get("hf_split", "train")),
        "hf_revision": str(provided["hf_revision"]) if "hf_revision" in provided else None,
        "outcome": str(provided["outcome"]),
        "strata": strata,
        "beta": beta,
        "alpha": alpha,
        "prediction_col": str(provided.get("prediction_col", "probability")),
        "risk_col": str(provided["risk_col"]) if "risk_col" in provided else None,
        "action_col": action_col,
        "output": str(provided.get("output", "runs/optimal_screening.csv")),
    }
def get_optimal_screening_from_config(config_path: Path) -> Path:
    """Run the screening step described by a config file and write the result as CSV.

    The config is read and validated, the dataset is loaded through
    ``load_dataframe``, the needed columns are checked, and the values returned
    by ``compute_optimal_screening_actions`` are stored in a new column before
    the frame is written to the configured output path.

    Args:
        config_path: Path to the YAML or JSON config file.

    Returns:
        Path of the CSV file that was written.

    Raises:
        ValueError: If a needed column is absent from the dataset, or the
            action column already exists in it.
    """
    cfg = _validate_config(_read_config(config_path))
    frame, label = load_dataframe(
        csv_path=cfg["csv"],
        hf_dataset=cfg["hf_dataset"],
        hf_split=cfg["hf_split"],
        hf_revision=cfg["hf_revision"],
    )

    risk_col = cfg["risk_col"]
    prediction_col = cfg["prediction_col"]
    action_col = cfg["action_col"]

    # Outcome and strata are always needed; a configured risk column is too.
    needed = {cfg["outcome"]}
    needed.update(cfg["strata"])
    if risk_col:
        needed.add(risk_col)
    elif prediction_col in frame.columns:
        # Only added when already present, so it can never be reported as missing.
        needed.add(prediction_col)

    absent = sorted(needed.difference(frame.columns))
    if absent:
        raise ValueError(f"Missing required columns in {label}: {absent}")
    # Refuse to overwrite an existing column with the decisions.
    if action_col in frame.columns:
        raise ValueError(f"Output action column already exists in {label}: {action_col}")

    actions = compute_optimal_screening_actions(
        rows=frame.to_dict("records"),
        outcome_col=cfg["outcome"],
        strata_features=cfg["strata"],
        prediction_col=prediction_col,
        beta=cfg["beta"],
        alpha=cfg["alpha"],
        use_custom_risk_col=risk_col,
    )
    frame[action_col] = actions

    destination = Path(cfg["output"])
    # Create the output directory on demand.
    destination.parent.mkdir(parents=True, exist_ok=True)
    frame.to_csv(destination, index=False)
    return destination
def main() -> None:
    """Command-line entry point: process one config file and print the output path."""
    cli = argparse.ArgumentParser(description="Write optimal screening actions from a YAML or JSON config")
    cli.add_argument("config", help="Path to a YAML or JSON config file")
    config_path = Path(cli.parse_args().config)
    written = get_optimal_screening_from_config(config_path)
    print(f"Wrote {written}")


# Run the CLI only when this module is executed as a script.
if __name__ == "__main__":
    main()