Beam2513 commited on
Commit
798602c
·
verified ·
1 Parent(s): e53126f

Upload 127 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +2 -0
  2. .gradio/certificate.pem +31 -0
  3. __pycache__/app.cpython-313.pyc +0 -0
  4. app.py +8 -0
  5. controllers/__pycache__/data_controller.cpython-312.pyc +0 -0
  6. controllers/__pycache__/data_controller.cpython-313.pyc +0 -0
  7. controllers/__pycache__/hypothesis_controller.cpython-312.pyc +0 -0
  8. controllers/__pycache__/hypothesis_controller.cpython-313.pyc +0 -0
  9. controllers/__pycache__/linear_regression_controller.cpython-312.pyc +0 -0
  10. controllers/__pycache__/linear_regression_controller.cpython-313.pyc +0 -0
  11. controllers/data_controller.py +264 -0
  12. controllers/estimation/__init__.py +0 -0
  13. controllers/estimation/__pycache__/__init__.cpython-312.pyc +0 -0
  14. controllers/estimation/__pycache__/__init__.cpython-313.pyc +0 -0
  15. controllers/estimation/__pycache__/descriptive_controller.cpython-312.pyc +0 -0
  16. controllers/estimation/__pycache__/descriptive_controller.cpython-313.pyc +0 -0
  17. controllers/estimation/__pycache__/graphical_controller.cpython-312.pyc +0 -0
  18. controllers/estimation/__pycache__/graphical_controller.cpython-313.pyc +0 -0
  19. controllers/estimation/__pycache__/inference_controller.cpython-312.pyc +0 -0
  20. controllers/estimation/__pycache__/inference_controller.cpython-313.pyc +0 -0
  21. controllers/estimation/descriptive_controller.py +59 -0
  22. controllers/estimation/graphical_controller.py +383 -0
  23. controllers/estimation/inference_controller.py +300 -0
  24. controllers/hypothesis_controller.py +204 -0
  25. controllers/linear_regression_controller.py +160 -0
  26. controllers/utils/__pycache__/downloads.cpython-312.pyc +0 -0
  27. controllers/utils/__pycache__/downloads.cpython-313.pyc +0 -0
  28. controllers/utils/downloads.py +39 -0
  29. core/__init__.py +0 -0
  30. core/__pycache__/__init__.cpython-312.pyc +0 -0
  31. core/__pycache__/__init__.cpython-313.pyc +0 -0
  32. core/__pycache__/data_stats.cpython-312.pyc +0 -0
  33. core/__pycache__/data_stats.cpython-313.pyc +0 -0
  34. core/__pycache__/descriptive.cpython-313.pyc +0 -0
  35. core/__pycache__/hypothesis_tests.cpython-312.pyc +0 -0
  36. core/__pycache__/hypothesis_tests.cpython-313.pyc +0 -0
  37. core/__pycache__/linear_regression.cpython-312.pyc +0 -0
  38. core/__pycache__/linear_regression.cpython-313.pyc +0 -0
  39. core/__pycache__/statistic_plots.cpython-313.pyc +0 -0
  40. core/data_stats.py +150 -0
  41. core/estimation/__init__.py +0 -0
  42. core/estimation/__pycache__/__init__.cpython-312.pyc +0 -0
  43. core/estimation/__pycache__/__init__.cpython-313.pyc +0 -0
  44. core/estimation/__pycache__/descriptive.cpython-312.pyc +0 -0
  45. core/estimation/__pycache__/descriptive.cpython-313.pyc +0 -0
  46. core/estimation/__pycache__/graphical_analysis.cpython-312.pyc +0 -0
  47. core/estimation/__pycache__/graphical_analysis.cpython-313.pyc +0 -0
  48. core/estimation/descriptive.py +181 -0
  49. core/estimation/graphical_analysis.py +260 -0
  50. core/estimation/inference/__pycache__/ci.cpython-312.pyc +0 -0
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ ui/assets/logos/HimmapanLab.png filter=lfs diff=lfs merge=lfs -text
37
+ ui/assets/logos/ThotsakanStats.png filter=lfs diff=lfs merge=lfs -text
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ -----BEGIN CERTIFICATE-----
2
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
3
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
4
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
5
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
6
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
7
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
8
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
9
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
10
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
11
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
12
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
13
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
14
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
15
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
16
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
17
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
18
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
19
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
20
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
21
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
22
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
23
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
24
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
25
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
26
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
27
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
28
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
29
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
30
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
31
+ -----END CERTIFICATE-----
__pycache__/app.cpython-313.pyc ADDED
Binary file (940 Bytes). View file
 
app.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
"""Application entry point for the Gradio statistics app."""

from ui.layout import build_layout


def main():
    """Build the UI layout and launch the Gradio server (with a public share link)."""
    demo = build_layout()
    demo.launch(share=True)


if __name__ == "__main__":
    main()
controllers/__pycache__/data_controller.cpython-312.pyc ADDED
Binary file (6.96 kB). View file
 
controllers/__pycache__/data_controller.cpython-313.pyc ADDED
Binary file (6.71 kB). View file
 
controllers/__pycache__/hypothesis_controller.cpython-312.pyc ADDED
Binary file (6.3 kB). View file
 
controllers/__pycache__/hypothesis_controller.cpython-313.pyc ADDED
Binary file (6.28 kB). View file
 
controllers/__pycache__/linear_regression_controller.cpython-312.pyc ADDED
Binary file (5.56 kB). View file
 
controllers/__pycache__/linear_regression_controller.cpython-313.pyc ADDED
Binary file (5.5 kB). View file
 
controllers/data_controller.py ADDED
@@ -0,0 +1,264 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import gradio as gr

from core.data_stats import (
    load_dataset,
    dataset_summary,
    variable_types,
    infer_column_types,
    apply_category_filters,
    reclassify_as_categorical,
    reclassify_as_numeric,
)


def wire_callbacks(
    *,
    file_input,
    status_output,

    # RAW DATA
    preview_checkbox,
    overview_checkbox,
    csv_preview,
    desc_output,
    dtypes_output,

    # RECLASSIFICATION
    num_to_cat,
    cat_to_num,
    fix_to_categorical_button,
    fix_to_numeric_button,
    fix_dtype_status,

    # FILTERS
    cat_filter_cols,
    cat_val_1,
    cat_val_2,
    cat_val_3,
    apply_filter_button,
    filter_status,

    # FILTERED DATA
    preview_checkbox_filter,
    overview_checkbox_filter,
    csv_preview_filter,
    desc_output_filter,
    dtypes_output_filter,

    state,
):
    """Attach all data-tab event handlers to the given Gradio components.

    Parameters are the UI components built elsewhere (keyword-only so call
    sites stay readable) plus a mutable ``state`` object carrying the loaded
    dataframe (``state.df``), the filtered view (``state.filtered_df``),
    inferred column types, the active category filters, and dtype overrides.
    """

    # ==================================================
    # File upload
    # ==================================================
    def on_file_upload(file):
        """Load the dataset and (re)initialize every dependent widget."""
        df, status = load_dataset(file)

        if df is None:
            # Loading failed: clear previews and empty all dropdowns.
            return (
                status,
                None, None, None,
                gr.update(choices=[], value=None),
                gr.update(choices=[], value=None),
                gr.update(choices=[], value=[]),
            )

        numeric_cols, categorical_cols = infer_column_types(df)

        # Reset all session state for the freshly loaded dataset.
        state.df = df
        state.filtered_df = df
        state.numeric_cols = numeric_cols
        state.categorical_cols = categorical_cols
        state.active_filters = {}
        state.overrides = {"num_to_cat": [], "cat_to_num": []}

        return (
            status,
            df,
            dataset_summary(df),
            variable_types(df),

            # Reclassification dropdowns
            gr.update(choices=numeric_cols, value=None),
            gr.update(choices=categorical_cols, value=None),

            # Filter columns (categorical only)
            gr.update(choices=categorical_cols, value=[]),
        )

    file_input.change(
        on_file_upload,
        inputs=file_input,
        outputs=[
            status_output,
            csv_preview,
            desc_output,
            dtypes_output,
            num_to_cat,
            cat_to_num,
            cat_filter_cols,
        ],
    )

    # ==================================================
    # Category value dropdowns (Filter 1–3)
    # ==================================================
    def update_category_filters(selected_columns):
        """Show one value-picker per selected column (max 3); hide the rest."""
        df = state.df

        if df is None or not selected_columns:
            return (
                gr.update(visible=False, choices=[], value=[]),
                gr.update(visible=False, choices=[], value=[]),
                gr.update(visible=False, choices=[], value=[]),
            )

        updates = []
        for i in range(3):
            if i < len(selected_columns):
                col = selected_columns[i]
                values = sorted(df[col].dropna().unique().tolist())
                updates.append(
                    gr.update(visible=True, choices=values, value=[])
                )
            else:
                updates.append(
                    gr.update(visible=False, choices=[], value=[])
                )

        return tuple(updates)

    cat_filter_cols.change(
        update_category_filters,
        inputs=cat_filter_cols,
        outputs=[cat_val_1, cat_val_2, cat_val_3],
    )

    # ==================================================
    # Apply filters
    # ==================================================
    def on_apply_filter(cat_cols, v1, v2, v3):
        """Apply up to three category filters and remember the active set."""
        filtered_df, status = apply_category_filters(
            state.df,
            cat_cols,
            v1, v2, v3,
        )

        state.filtered_df = filtered_df
        # Only columns with at least one selected value count as active.
        state.active_filters = {
            col: vals
            for col, vals in zip(cat_cols[:3], [v1, v2, v3])
            if vals
        }

        return status

    apply_filter_button.click(
        on_apply_filter,
        inputs=[cat_filter_cols, cat_val_1, cat_val_2, cat_val_3],
        outputs=filter_status,
    )

    # ==================================================
    # RAW preview / summary
    # ==================================================
    preview_checkbox.change(
        lambda x: gr.update(visible=x),
        inputs=preview_checkbox,
        outputs=csv_preview,
    )

    overview_checkbox.change(
        lambda x: (
            gr.update(visible=x),
            gr.update(visible=x),
        ),
        inputs=overview_checkbox,
        outputs=[desc_output, dtypes_output],
    )

    # ==================================================
    # FILTERED preview / summary
    # ==================================================
    # FIX: the previous wiring listed the same component twice in `outputs`
    # and returned separate visibility/value updates for it. Gradio rejects
    # duplicate output components (and even where tolerated, the later update
    # clobbers the earlier one). A single gr.update carrying both `visible`
    # and `value` is the supported form.
    def on_toggle_filtered_preview(show):
        value = state.filtered_df if show else None
        return gr.update(visible=show, value=value)

    preview_checkbox_filter.change(
        on_toggle_filtered_preview,
        inputs=preview_checkbox_filter,
        outputs=csv_preview_filter,
    )

    def on_toggle_filtered_overview(show):
        # Guard against toggling before any dataset has been loaded.
        have_data = show and state.filtered_df is not None
        return (
            gr.update(
                visible=show,
                value=dataset_summary(state.filtered_df) if have_data else None,
            ),
            gr.update(
                visible=show,
                value=variable_types(state.filtered_df) if have_data else None,
            ),
        )

    overview_checkbox_filter.change(
        on_toggle_filtered_overview,
        inputs=overview_checkbox_filter,
        outputs=[desc_output_filter, dtypes_output_filter],
    )

    # ==================================================
    # Reclassification
    # ==================================================
    def _reclass_updates(msg):
        """Common widget refresh after a dtype reclassification.

        Refreshes the three column dropdowns from the (now-updated) state,
        reports the status message, and hides the stale per-column value
        pickers until filters are re-chosen.
        """
        return (
            gr.update(choices=state.categorical_cols, value=[]),
            gr.update(choices=state.numeric_cols, value=None),
            gr.update(choices=state.categorical_cols, value=None),
            msg,
            gr.update(visible=False),
            gr.update(visible=False),
            gr.update(visible=False),
        )

    def on_fix_to_categorical(column):
        _, msg = reclassify_as_categorical(state, column)
        return _reclass_updates(msg)

    def on_fix_to_numeric(column):
        _, msg = reclassify_as_numeric(state, column)
        return _reclass_updates(msg)

    fix_to_categorical_button.click(
        on_fix_to_categorical,
        inputs=num_to_cat,
        outputs=[
            cat_filter_cols,
            num_to_cat,
            cat_to_num,
            fix_dtype_status,
            cat_val_1,
            cat_val_2,
            cat_val_3,
        ],
    )

    fix_to_numeric_button.click(
        on_fix_to_numeric,
        inputs=cat_to_num,
        outputs=[
            cat_filter_cols,
            num_to_cat,
            cat_to_num,
            fix_dtype_status,
            cat_val_1,
            cat_val_2,
            cat_val_3,
        ],
    )
controllers/estimation/__init__.py ADDED
File without changes
controllers/estimation/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (239 Bytes). View file
 
controllers/estimation/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (206 Bytes). View file
 
controllers/estimation/__pycache__/descriptive_controller.cpython-312.pyc ADDED
Binary file (2.28 kB). View file
 
controllers/estimation/__pycache__/descriptive_controller.cpython-313.pyc ADDED
Binary file (2.3 kB). View file
 
controllers/estimation/__pycache__/graphical_controller.cpython-312.pyc ADDED
Binary file (8.93 kB). View file
 
controllers/estimation/__pycache__/graphical_controller.cpython-313.pyc ADDED
Binary file (8.87 kB). View file
 
controllers/estimation/__pycache__/inference_controller.cpython-312.pyc ADDED
Binary file (5.27 kB). View file
 
controllers/estimation/__pycache__/inference_controller.cpython-313.pyc ADDED
Binary file (5.16 kB). View file
 
controllers/estimation/descriptive_controller.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# ui/controllers/estimation/descriptive_controller.py

import pandas as pd
from core.estimation.descriptive import compute_descriptive_statistics


def run_descriptive_statistics(
    *,
    df: pd.DataFrame,
    column: str,
    quantile_probs: list[float],
    trim_alpha: float | None,
    winsor_limits: tuple[float, float] | None,
    weights_col: str | None,
    round_digits: int,
) -> pd.DataFrame:
    """Validate the inputs and compute the descriptive-statistics table.

    Args:
        df: Source dataframe (must not be None).
        column: Name of the numeric column to describe.
        quantile_probs: Quantile probabilities forwarded to the core routine.
        trim_alpha: Trimming fraction for the trimmed mean, or None.
        winsor_limits: (lower, upper) winsorizing fractions, or None.
        weights_col: Optional name of a non-negative numeric weights column.
        round_digits: Decimal places for the numeric result columns.

    Returns:
        The statistics table produced by ``compute_descriptive_statistics``,
        with its numeric columns rounded.

    Raises:
        ValueError: On missing data, unknown columns, non-numeric data,
            or invalid weights.
    """
    if df is None:
        raise ValueError("No dataset loaded.")

    if column not in df.columns:
        raise ValueError(f"Column '{column}' not found.")

    series = df[column].dropna()

    if series.empty:
        raise ValueError("Selected column has no valid data.")

    if not pd.api.types.is_numeric_dtype(series):
        raise ValueError("Selected column must be numeric.")

    weights = None

    if weights_col:
        if weights_col not in df.columns:
            raise ValueError(f"Weights column '{weights_col}' not found.")

        # Align weights with the surviving (non-NaN) rows of the series.
        weights = df.loc[series.index, weights_col]

        if not pd.api.types.is_numeric_dtype(weights):
            raise ValueError("Weights must be numeric.")

        if (weights < 0).any():
            raise ValueError("Weights must be non-negative.")

    stats_df = compute_descriptive_statistics(
        data=series.values,
        quantile_probs=quantile_probs,
        trim_alpha=trim_alpha,
        winsor_limits=winsor_limits,
        weights=weights.values if weights is not None else None,
    )

    # Round only the numeric result columns that are actually present, so a
    # schema change in compute_descriptive_statistics cannot raise KeyError.
    round_cols = [c for c in ("Value", "Bias Corrected") if c in stats_df.columns]
    if round_cols:
        stats_df[round_cols] = stats_df[round_cols].round(round_digits)

    return stats_df
controllers/estimation/graphical_controller.py ADDED
@@ -0,0 +1,383 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from typing import Optional, Tuple
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+
8
+ from core.estimation.inference.estimators import estimate_mean, estimate_sigma
9
+ from core.estimation.inference.ci import (
10
+ ci_mean_analytic,
11
+ ci_mean_bootstrap,
12
+ ci_median_analytic,
13
+ ci_median_bootstrap,
14
+ )
15
+ from core.estimation.inference.pi import (
16
+ pi_mean,
17
+ pi_median,
18
+ pi_iqr,
19
+ pi_bootstrap,
20
+ )
21
+ from core.estimation.graphical_analysis import (
22
+ plot_histogram_with_overlays,
23
+ plot_ecdf,
24
+ )
25
+
26
+
27
# ---------------------------------------------------------------------
# Utilities (aligned with inference_controller)
# ---------------------------------------------------------------------
def select_distribution(mean_estimator: str, sigma_estimator: str) -> str:
    """Pick the reference distribution for analytic intervals.

    The Student-t distribution applies only to the classic pairing of the
    sample mean with the 1-ddof sample deviation; every other estimator
    combination falls back to the normal distribution.
    """
    uses_t = (
        mean_estimator == "Sample Mean"
        and sigma_estimator == "Deviation (1 ddof)"
    )
    return "t" if uses_t else "norm"
37
+
38
+
39
def validate_deviation_estimator(*, sigma_estimator: str, n: int):
    """Reject the range-based sigma estimator for samples larger than 25.

    Raises:
        ValueError: If ``sigma_estimator`` is the bias-corrected range
            estimator and the sample size exceeds 25.
    """
    range_based = sigma_estimator == "Range (bias corrected)"
    if range_based and n > 25:
        raise ValueError(
            "Range-based confidence intervals require n ≤ 25. "
            "Use another estimator or bootstrap."
        )
45
+
46
+
47
+ def _prepare_series(
48
+ df: pd.DataFrame,
49
+ column: str,
50
+ weights_col: Optional[str],
51
+ ) -> tuple[np.ndarray, Optional[np.ndarray]]:
52
+ if df is None:
53
+ raise ValueError("No data loaded. Please load a dataset first.")
54
+
55
+ if column not in df.columns:
56
+ raise ValueError(f"Column '{column}' not found in the dataframe.")
57
+
58
+ series = df[column].dropna()
59
+ if series.empty:
60
+ raise ValueError(f"Column '{column}' has no non-missing values.")
61
+
62
+ weights = None
63
+ if weights_col is not None:
64
+ if weights_col not in df.columns:
65
+ raise ValueError(
66
+ f"Weights column '{weights_col}' not found in the dataframe."
67
+ )
68
+ weights_series = df[weights_col].reindex(series.index).dropna()
69
+ common_idx = series.index.intersection(weights_series.index)
70
+ series = series.loc[common_idx]
71
+ weights_series = weights_series.loc[common_idx]
72
+ weights = weights_series.to_numpy()
73
+
74
+ return series.to_numpy(), weights
75
+
76
+
77
def run_graphical_analysis(
    *,
    df: pd.DataFrame,
    column: str,
    graph_type: str,
    # Histogram / PMF controls
    add_kde: bool,
    add_data: bool,
    add_normal: bool,
    add_ci: bool,
    ci_choice: str,
    add_pi: bool,
    pi_choice: str,
    # Estimators
    mean_estimator: str,
    median_estimator: str,
    sigma_estimator: str,
    trim_param,
    winsor_limits,
    weights_col: Optional[str],
    # Normal μ source
    normal_mu_source: str,
    # Bootstrap options
    bootstrap_mean: bool,
    bootstrap_median: bool,
    bootstrap_sigma: bool,
    bootstrap_prediction: bool,
    bootstrap_samples: int,
    # CI/PI confidence level
    ci_pi_conf_level: float,
    # ECDF controls
    ecdf_add_conf: bool,
    ecdf_conf_level: float,
    ecdf_add_normal: bool,
):
    """Entry point for the graphical-analysis tab.

    Extracts the (optionally weighted) sample from ``df[column]`` and
    dispatches on ``graph_type`` to either the histogram/PMF renderer or the
    ECDF renderer, forwarding all relevant options.

    Raises:
        ValueError: For invalid confidence levels, unknown graph types, or
            any validation failure raised by the helpers.
    """
    data, weights = _prepare_series(df, column, weights_col)

    if not (0.0 < ci_pi_conf_level < 1.0):
        raise ValueError("Confidence level for CI/PI must be in (0, 1).")

    hist_like = ("Histogram", "Empirical Probability Mass Function")
    if graph_type in hist_like:
        return _run_hist_or_pmf(
            data=data,
            var_name=column,
            graph_type=graph_type,
            add_kde=add_kde,
            add_data=add_data,
            add_normal=add_normal,
            add_ci=add_ci,
            ci_choice=ci_choice,
            add_pi=add_pi,
            pi_choice=pi_choice,
            mean_estimator=mean_estimator,
            median_estimator=median_estimator,
            sigma_estimator=sigma_estimator,
            trim_param=trim_param,
            winsor_limits=winsor_limits,
            weights=weights,
            normal_mu_source=normal_mu_source,
            bootstrap_mean=bootstrap_mean,
            bootstrap_median=bootstrap_median,
            bootstrap_sigma=bootstrap_sigma,
            bootstrap_prediction=bootstrap_prediction,
            bootstrap_samples=bootstrap_samples,
            ci_pi_conf_level=ci_pi_conf_level,
        )

    if graph_type == "Empirical Cumulative Distribution Function (ECDF)":
        return _run_ecdf(
            data=data,
            var_name=column,
            ecdf_add_conf=ecdf_add_conf,
            ecdf_conf_level=ecdf_conf_level,
            ecdf_add_normal=ecdf_add_normal,
            mean_estimator=mean_estimator,
            sigma_estimator=sigma_estimator,
            trim_param=trim_param,
            winsor_limits=winsor_limits,
            weights=weights,
            normal_mu_source=normal_mu_source,
        )

    raise ValueError(f"Unknown graph type: {graph_type}")
160
+
161
+
162
def _run_hist_or_pmf(
    *,
    data: np.ndarray,
    var_name: str,
    graph_type: str,
    add_kde: bool,
    add_data: bool,
    add_normal: bool,
    add_ci: bool,
    ci_choice: str,
    add_pi: bool,
    pi_choice: str,
    mean_estimator: str,
    median_estimator: str,
    sigma_estimator: str,
    trim_param,
    winsor_limits,
    weights: Optional[np.ndarray],
    normal_mu_source: str,
    bootstrap_mean: bool,
    bootstrap_median: bool,
    bootstrap_sigma: bool,
    bootstrap_prediction: bool,
    bootstrap_samples: int,
    ci_pi_conf_level: float,
):
    """Render a histogram or empirical PMF with optional overlays.

    Overlays (Normal curve, mean/median confidence intervals, a prediction
    interval) are computed only when the corresponding flag is on, then
    handed to ``plot_histogram_with_overlays``.
    """
    alpha = 1.0 - ci_pi_conf_level

    validate_deviation_estimator(
        sigma_estimator=sigma_estimator,
        n=len(data),
    )

    # Overlay ingredients; each stays None unless its option is enabled.
    hat_mu = hat_sigma = None
    ci_mean_interval = ci_median_interval = pi_interval = None

    if add_ci or add_pi or add_normal:
        # --- Parameters for the Normal overlay ---
        if add_normal:
            # μ comes either from the configured mean estimator or from
            # the plain sample median, per the user's choice.
            if normal_mu_source == "Mean-based CI":
                hat_mu = estimate_mean(
                    data,
                    mean_estimator,
                    trim_param=trim_param,
                    winsor_limits=winsor_limits,
                    weights=weights,
                )
            else:
                hat_mu = float(np.median(data))

            hat_sigma = estimate_sigma(
                data=data,
                estimator=sigma_estimator,
            )

        # --- Confidence intervals ---
        if add_ci:
            ref_dist = select_distribution(mean_estimator, sigma_estimator)

            # CI for the mean: bootstrap or analytic.
            if bootstrap_mean:
                ci_mean_interval = ci_mean_bootstrap(
                    data=data,
                    estimator=mean_estimator,
                    alpha=alpha,
                    trim_param=trim_param,
                    winsor_limits=winsor_limits,
                    weights=weights,
                    B=bootstrap_samples,
                )
            else:
                ci_mean_interval = ci_mean_analytic(
                    data=data,
                    estimator=mean_estimator,
                    alpha=alpha,
                    dist=ref_dist,
                    sigma_estimator=sigma_estimator,
                    trim_param=trim_param,
                    winsor_limits=winsor_limits,
                    weights=weights,
                )

            # CI for the median: bootstrap or analytic.
            if bootstrap_median:
                ci_median_interval = ci_median_bootstrap(
                    data=data,
                    alpha=alpha,
                    B=bootstrap_samples,
                )
            else:
                ci_median_interval = ci_median_analytic(
                    data=data,
                    alpha=alpha,
                    sigma_estimator=sigma_estimator,
                )

            # Keep only the interval(s) the user asked to draw
            # (Mean / Median / Both).
            if ci_choice == "Mean":
                ci_median_interval = None
            elif ci_choice == "Median":
                ci_mean_interval = None

        # --- Prediction intervals ---
        if add_pi:
            ref_dist = select_distribution(mean_estimator, sigma_estimator)
            if pi_choice == "Mean":
                pi_interval = pi_mean(
                    data=data,
                    alpha=alpha,
                    estimator=mean_estimator,
                    dist=ref_dist,
                    sigma_estimator=sigma_estimator,
                    trim_param=trim_param,
                    winsor_limits=winsor_limits,
                    weights=weights,
                )
            elif pi_choice == "Median":
                # New API: pi_median only needs data, alpha and sigma_estimator
                pi_interval = pi_median(
                    data=data,
                    alpha=alpha,
                    sigma_estimator=sigma_estimator,
                )
            elif pi_choice == "IQR":
                pi_interval = pi_iqr(
                    data=data,
                    alpha=alpha,
                )
            elif pi_choice == "Bootstrap":
                # Guard: the bootstrap PI is opt-in via a separate toggle.
                if not bootstrap_prediction:
                    raise ValueError(
                        "To use the Bootstrap prediction interval, enable the "
                        "'Bootstrap Prediction' option in the estimator settings."
                    )
                pi_interval = pi_bootstrap(
                    data=data,
                    alpha=alpha,
                    B=bootstrap_samples,
                )
            else:
                raise ValueError(
                    f"Unknown prediction-interval choice: {pi_choice}"
                )

    return plot_histogram_with_overlays(
        data=data,
        graph_type=graph_type,
        var_name=var_name,
        add_kde=add_kde,
        add_data=add_data,
        add_normal=add_normal,
        hat_mu=hat_mu,
        hat_sigma=hat_sigma,
        ci_mean_interval=ci_mean_interval,
        ci_median_interval=ci_median_interval,
        pi_interval=pi_interval,
    )
327
+
328
+
329
def _run_ecdf(
    *,
    data: np.ndarray,
    var_name: str,
    ecdf_add_conf: bool,
    ecdf_conf_level: float,
    ecdf_add_normal: bool,
    mean_estimator: str,
    sigma_estimator: str,
    trim_param,
    winsor_limits,
    weights: Optional[np.ndarray],
    normal_mu_source: str,
):
    """Render the ECDF with an optional confidence band and Normal overlay.

    Raises:
        ValueError: If ``ecdf_conf_level`` is outside (0, 1) or the sigma
            estimator is invalid for this sample size.
    """
    if not (0.0 < ecdf_conf_level < 1.0):
        raise ValueError("ECDF confidence level must be in (0, 1).")

    validate_deviation_estimator(
        sigma_estimator=sigma_estimator,
        n=len(data),
    )

    # Normal-overlay parameters; only computed when the overlay is on.
    hat_mu = hat_sigma = None
    if ecdf_add_normal:
        # μ comes either from the configured mean estimator or from the
        # plain sample median, per the user's choice.
        if normal_mu_source == "Mean-based CI":
            hat_mu = estimate_mean(
                data,
                mean_estimator,
                trim_param=trim_param,
                winsor_limits=winsor_limits,
                weights=weights,
            )
        else:
            hat_mu = float(np.median(data))

        hat_sigma = estimate_sigma(
            data=data,
            estimator=sigma_estimator,
        )

    return plot_ecdf(
        data=data,
        var_name=var_name,
        alpha=1.0 - ecdf_conf_level,
        add_conf_band=ecdf_add_conf,
        add_normal=ecdf_add_normal,
        hat_mu=hat_mu,
        hat_sigma=hat_sigma,
    )
controllers/estimation/inference_controller.py ADDED
@@ -0,0 +1,300 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+
3
+ from core.estimation.inference.ci import (
4
+ ci_mean_analytic,
5
+ ci_median_analytic,
6
+ ci_deviation_analytic,
7
+ ci_mean_bootstrap,
8
+ ci_median_bootstrap,
9
+ ci_deviation_bootstrap,
10
+ )
11
+
12
+ from core.estimation.inference.pi import (
13
+ pi_mean,
14
+ pi_median,
15
+ pi_iqr,
16
+ pi_bootstrap,
17
+ )
18
+
19
+ from core.estimation.inference.confidence_regions import confidence_regions
20
+
21
+
22
+ # ---------------------------------------------------------------------
23
+ # Utilities
24
+ # ---------------------------------------------------------------------
25
+
26
def select_distribution(mean_estimator: str, sigma_estimator: str) -> str:
    """Return "t" for the sample-mean / 1-ddof-deviation pairing, else "norm"."""
    uses_t = (
        mean_estimator == "Sample Mean"
        and sigma_estimator == "Deviation (1 ddof)"
    )
    return "t" if uses_t else "norm"
30
+
31
+
32
def validate_deviation_estimator(*, sigma_estimator: str, n: int):
    """Raise ValueError when the range-based sigma estimator is used with n > 25."""
    range_based = sigma_estimator == "Range (bias corrected)"
    if range_based and n > 25:
        raise ValueError(
            "Range-based confidence intervals require n ≤ 25. "
            "Use another estimator or bootstrap."
        )
38
+
39
+
40
+ # ---------------------------------------------------------------------
41
+ # Confidence Intervals
42
+ # ---------------------------------------------------------------------
43
+
44
def run_confidence_intervals(
    *,
    data,
    alpha,
    mean_estimator,
    median_estimator,
    sigma_estimator,
    trim_param=None,
    winsor_limits=None,
    weights=None,
    bootstrap_mean=False,
    bootstrap_median=False,
    bootstrap_deviation=False,
    bootstrap_samples=1000,
):
    """Compute confidence intervals for the mean, median and deviation.

    Each statistic uses either its bootstrap routine or its analytic
    routine, per the corresponding ``bootstrap_*`` flag.

    Note: ``median_estimator`` is accepted for interface symmetry with the
    other controllers but is not used by any routine called here.

    Returns:
        A tuple ``(table, mean_ci, sigma_ci, median_ci)`` where ``table`` is
        a tidy DataFrame with columns
        ["Interval Type", "Statistic", "Lower", "Upper"].
    """
    validate_deviation_estimator(
        sigma_estimator=sigma_estimator,
        n=len(data),
    )

    ref_dist = select_distribution(mean_estimator, sigma_estimator)

    # ---------------- Mean ----------------
    if bootstrap_mean:
        mean_ci = ci_mean_bootstrap(
            data=data,
            estimator=mean_estimator,
            alpha=alpha,
            B=bootstrap_samples,
            trim_param=trim_param,
            winsor_limits=winsor_limits,
            weights=weights,
        )
    else:
        mean_ci = ci_mean_analytic(
            data=data,
            estimator=mean_estimator,
            alpha=alpha,
            dist=ref_dist,
            sigma_estimator=sigma_estimator,
            trim_param=trim_param,
            winsor_limits=winsor_limits,
            weights=weights,
        )

    # ---------------- Median ----------------
    if bootstrap_median:
        median_ci = ci_median_bootstrap(
            data=data,
            alpha=alpha,
            B=bootstrap_samples,
        )
    else:
        median_ci = ci_median_analytic(
            data=data,
            alpha=alpha,
            sigma_estimator=sigma_estimator,
        )

    # ---------------- Deviation ----------------
    if bootstrap_deviation:
        sigma_ci = ci_deviation_bootstrap(
            data=data,
            alpha=alpha,
            B=bootstrap_samples,
            estimator=sigma_estimator,
        )
    else:
        sigma_ci = ci_deviation_analytic(
            data=data,
            alpha=alpha,
            estimator=sigma_estimator,
        )

    summary = pd.DataFrame(
        [
            ["Confidence", "Mean", *mean_ci],
            ["Confidence", "Median", *median_ci],
            ["Confidence", "Deviation", *sigma_ci],
        ],
        columns=["Interval Type", "Statistic", "Lower", "Upper"],
    )

    return summary, mean_ci, sigma_ci, median_ci
130
+
131
+
132
+ # ---------------------------------------------------------------------
133
+ # Prediction Intervals
134
+ # ---------------------------------------------------------------------
135
+
136
def run_prediction_intervals(
    *,
    data,
    alpha,
    mean_estimator,
    median_estimator,
    sigma_estimator,
    trim_param=None,
    winsor_limits=None,
    weights=None,
    bootstrap=False,
    bootstrap_samples=1000,
):
    """
    Build a table of prediction intervals: mean-based, median-based,
    IQR-based, and optionally a bootstrap interval.

    Note: ``median_estimator`` is accepted for interface symmetry but is
    not used directly; the median-based PI relies on ``sigma_estimator``.

    Returns:
        pandas.DataFrame with columns
        ["Interval Type", "Statistic", "Lower", "Upper"].
    """
    sampling_dist = select_distribution(mean_estimator, sigma_estimator)

    records = [
        # Mean-based PI
        [
            "Prediction",
            "Mean",
            *pi_mean(
                data=data,
                alpha=alpha,
                estimator=mean_estimator,
                dist=sampling_dist,
                sigma_estimator=sigma_estimator,
                trim_param=trim_param,
                winsor_limits=winsor_limits,
                weights=weights,
            ),
        ],
        # Median-based PI (uses the same deviation estimator)
        [
            "Prediction",
            "Median",
            *pi_median(data=data, alpha=alpha, sigma_estimator=sigma_estimator),
        ],
        # IQR-based PI
        ["Prediction", "IQR", *pi_iqr(data=data, alpha=alpha)],
    ]

    # Optional bootstrap PI
    if bootstrap:
        records.append(
            [
                "Prediction",
                "Bootstrap",
                *pi_bootstrap(data=data, alpha=alpha, B=bootstrap_samples),
            ]
        )

    return pd.DataFrame(
        records,
        columns=["Interval Type", "Statistic", "Lower", "Upper"],
    )
194
+
195
+ # ---------------------------------------------------------------------
196
+ # Confidence Regions
197
+ # ---------------------------------------------------------------------
198
+
199
def run_confidence_regions(
    *,
    data,
    alpha,
    mean_estimator,
    median_estimator,
    sigma_estimator,
    trim_param,
    winsor_limits,
    weights,
    bootstrap_mean,
    bootstrap_median,
    bootstrap_deviation,
    bootstrap_samples,
    mu_ci_source,
    probs,
    eps_mu,
    eps_sigma,
    add_ci_box,
):
    """
    Build the likelihood-based confidence-region figure.

    Reuses the CI machinery to obtain intervals for the mean, median and
    deviation, selects the μ interval according to ``mu_ci_source``
    ("Median-based CI" picks the median interval, anything else the
    mean-based interval), and hands the chosen μ interval plus the σ
    interval to ``confidence_regions``.
    """
    _, mean_ci, sigma_ci, median_ci = run_confidence_intervals(
        data=data,
        alpha=alpha,
        mean_estimator=mean_estimator,
        median_estimator=median_estimator,
        sigma_estimator=sigma_estimator,
        trim_param=trim_param,
        winsor_limits=winsor_limits,
        weights=weights,
        bootstrap_mean=bootstrap_mean,
        bootstrap_median=bootstrap_median,
        bootstrap_deviation=bootstrap_deviation,
        bootstrap_samples=bootstrap_samples,
    )

    # Default is the mean-based CI; the median-based one is opt-in.
    mu_ci = median_ci if mu_ci_source == "Median-based CI" else mean_ci

    return confidence_regions(
        data=data,
        mean_ci=mu_ci,
        sigma_ci=sigma_ci,
        probs=probs,
        eps_mu=eps_mu,
        eps_sigma=eps_sigma,
        add_ci_box=add_ci_box,
    )
258
+
259
+
260
+ # ---------------------------------------------------------------------
261
+ # Combined Runner (used by UI)
262
+ # ---------------------------------------------------------------------
263
+
264
def run_intervals(
    *,
    data,
    alpha,
    mean_estimator,
    median_estimator,
    sigma_estimator,
    bootstrap_mean,
    bootstrap_median,
    bootstrap_deviation,
    bootstrap_samples,
):
    """
    Combined runner used by the UI: computes the confidence-interval table,
    the prediction-interval table, and their concatenation.

    Returns:
        (ci_table, pi_table, combined)

    Bug fix: ``run_confidence_intervals`` returns FOUR values
    (table, mean_ci, sigma_ci, median_ci); the previous three-name
    unpacking raised ``ValueError`` every time this runner was invoked.
    """
    ci_table, _mean_ci, _sigma_ci, _median_ci = run_confidence_intervals(
        data=data,
        alpha=alpha,
        mean_estimator=mean_estimator,
        median_estimator=median_estimator,
        sigma_estimator=sigma_estimator,
        bootstrap_mean=bootstrap_mean,
        bootstrap_median=bootstrap_median,
        bootstrap_deviation=bootstrap_deviation,
        bootstrap_samples=bootstrap_samples,
    )

    pi_table = run_prediction_intervals(
        data=data,
        alpha=alpha,
        mean_estimator=mean_estimator,
        median_estimator=median_estimator,
        sigma_estimator=sigma_estimator,
        # NOTE: the mean-bootstrap flag also toggles the bootstrap PI,
        # mirroring the original UI wiring.
        bootstrap=bootstrap_mean,
        bootstrap_samples=bootstrap_samples,
    )

    combined = pd.concat([ci_table, pi_table], ignore_index=True)

    return ci_table, pi_table, combined
controllers/hypothesis_controller.py ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from typing import Iterable, Tuple
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+
8
+ from core.hypothesis_tests import (
9
+ one_sample_ttest,
10
+ two_sample_ttest,
11
+ variance_test,
12
+ one_way_anova,
13
+ )
14
+
15
+ ROUND = 4
16
+
17
+
18
def _round_table(table: pd.DataFrame, decimals: int = ROUND) -> pd.DataFrame:
    """Return a copy of *table* with only its numeric columns rounded.

    Non-numeric columns are left untouched; a ``None`` input passes through.
    """
    if table is None:
        return table

    rounded = table.copy()
    numeric_columns = rounded.select_dtypes(include="number").columns
    if len(numeric_columns) > 0:
        rounded[numeric_columns] = rounded[numeric_columns].round(decimals)
    return rounded
27
+
28
+ def _ensure_numeric_series(df: pd.DataFrame, column: str) -> np.ndarray:
29
+ if df is None:
30
+ raise ValueError("No dataset loaded.")
31
+ if column not in df.columns:
32
+ raise ValueError(f"Column '{column}' not found in the dataset.")
33
+
34
+ series = df[column].dropna()
35
+ if series.empty:
36
+ raise ValueError("No valid data in the selected column.")
37
+ return series.to_numpy()
38
+
39
+
40
+ def _materialize_group(
41
+ df: pd.DataFrame,
42
+ numeric_col: str,
43
+ cat_col: str | None,
44
+ cat_vals: Iterable[str],
45
+ ) -> np.ndarray:
46
+ if cat_col is None:
47
+ raise ValueError("No categorical column selected.")
48
+
49
+ if cat_col not in df.columns:
50
+ raise ValueError(f"Categorical column '{cat_col}' not found in the dataset.")
51
+
52
+ # Cast selected values to the actual dtype of the column
53
+ if cat_vals is None:
54
+ values = []
55
+ else:
56
+ values = list(cat_vals)
57
+
58
+ if not values:
59
+ raise ValueError(f"No categories selected for column '{cat_col}'.")
60
+
61
+ cat_series = pd.Series(values).astype(df[cat_col].dtype)
62
+ mask = df[cat_col].isin(cat_series)
63
+ series = df.loc[mask, numeric_col].dropna()
64
+
65
+ if series.empty:
66
+ raise ValueError("One or more groups are empty after filtering.")
67
+ return series.to_numpy()
68
+
69
+
70
def run_hypothesis_testing(
    *,
    df: pd.DataFrame | None,
    numeric_col: str,
    hypo_test: str,
    mu0_text: str,
    alternative: str,
    include_graph: bool,
    bootstrap_samples: int,
    cat_col1: str | None,
    cat_vals1: list[str],
    name_group1: str,
    cat_col2: str | None,
    cat_vals2: list[str],
    name_group2: str,
    cat_col3: str | None,
    cat_vals3: list[str],
    plot_type: str,
    correction: bool,
    test_type: str,
) -> Tuple[pd.DataFrame, object | None]:
    """
    High-level dispatcher used by the Hypothesis Testing tab.

    Dispatches on the ``hypo_test`` display name to one of four core
    routines (one-sample t-test, two-sample t-test, two-group variance
    test, one-way ANOVA). Every result table is rounded via
    ``_round_table`` before being returned.

    Args:
        df: The loaded dataset; ``None`` raises ``ValueError``.
        numeric_col: Name of the numeric column under test.
        hypo_test: Display name selecting the test (see branches below).
        mu0_text: Raw text for μ₀ (used by the one-sample test only).
        alternative: Alternative-hypothesis spec, forwarded to the t-tests.
        include_graph: Whether the core routine should produce a figure.
        bootstrap_samples: Bootstrap resample count, forwarded to core tests.
        cat_col1 / cat_vals1 / name_group1: Column, selected categories and
            display name defining group 1 (two-sample branches).
        cat_col2 / cat_vals2 / name_group2: Same for group 2.
        cat_col3 / cat_vals3: Grouping column and categories for ANOVA.
        plot_type: Plot style, forwarded to the two-sample t-test.
        correction: Flag forwarded to ``two_sample_ttest``.
            NOTE(review): exact semantics live in core.hypothesis_tests —
            presumably a Welch/unequal-variance correction; confirm there.
        test_type: Variance-test variant, forwarded to ``variance_test``.

    Returns:
        (result_table, figure_or_none)

    Raises:
        ValueError: Missing data, non-numeric μ₀, empty groups after
            filtering, or an unrecognized ``hypo_test`` value.
    """
    if df is None:
        raise ValueError("No dataset loaded.")

    # Common numeric data check: validates that the numeric column exists
    # and has data; the returned array itself is intentionally discarded.
    _ = _ensure_numeric_series(df, numeric_col)

    # ------------------------------------------------------------
    # One-sample t-test
    # ------------------------------------------------------------
    if hypo_test == "One sample Student's t-test":
        if not mu0_text.strip():
            raise ValueError("μ₀ must be specified for the one-sample t-test.")
        try:
            mu0 = float(mu0_text)
        except Exception:
            raise ValueError("μ₀ must be a numeric value.")

        sample = df[numeric_col].dropna().to_numpy()

        table, fig = one_sample_ttest(
            sample=sample,
            mu0=mu0,
            alternative=alternative,
            numeric_col=numeric_col,
            bootstrap_samples=bootstrap_samples,
            include_graph=include_graph,
        )
        table = _round_table(table)
        return table, fig

    # ------------------------------------------------------------
    # Two-sample t-test
    # ------------------------------------------------------------
    if hypo_test == "Two samples Student's t-test":
        group1 = _materialize_group(df, numeric_col, cat_col1, cat_vals1)
        group2 = _materialize_group(df, numeric_col, cat_col2, cat_vals2)

        # If names are empty, fall back to defaults
        name1 = name_group1 or "Group 1"
        name2 = name_group2 or "Group 2"

        table, fig = two_sample_ttest(
            group1=group1,
            group2=group2,
            numeric_col=numeric_col,
            name_group1=name1,
            name_group2=name2,
            alternative=alternative,
            correction=correction,
            plot_type=plot_type,
            bootstrap_samples=bootstrap_samples,
            include_graph=include_graph,
        )
        table = _round_table(table)
        return table, fig

    # ------------------------------------------------------------
    # Equal variance between two groups
    # ------------------------------------------------------------
    if hypo_test == "Equal variance between two groups":
        group1 = _materialize_group(df, numeric_col, cat_col1, cat_vals1)
        group2 = _materialize_group(df, numeric_col, cat_col2, cat_vals2)

        name1 = name_group1 or "Group 1"
        name2 = name_group2 or "Group 2"

        table, fig = variance_test(
            group1=group1,
            group2=group2,
            name_group1=name1,
            name_group2=name2,
            test_type=test_type,
            include_graph=include_graph,
            bootstrap_samples=bootstrap_samples,
        )
        table = _round_table(table)
        return table, fig

    # ------------------------------------------------------------
    # One-way ANOVA
    # ------------------------------------------------------------
    if hypo_test == "One-way ANOVA":
        if cat_col3 is None:
            raise ValueError("A categorical column must be selected for ANOVA.")

        if cat_col3 not in df.columns:
            raise ValueError(
                f"Categorical column '{cat_col3}' not found in the dataset."
            )

        if not cat_vals3:
            raise ValueError("At least one category must be selected for ANOVA.")

        # Cast selected labels to the column's dtype so string selections
        # from the UI still match numeric categories, then keep only the
        # two columns the ANOVA routine needs.
        cat_series = pd.Series(cat_vals3).astype(df[cat_col3].dtype)
        data_group = df[df[cat_col3].isin(cat_series)][[numeric_col, cat_col3]].dropna()

        table, fig = one_way_anova(
            data_group=data_group,
            numeric_col=numeric_col,
            cat_col=cat_col3,
        )
        table = _round_table(table)
        return table, fig

    # ------------------------------------------------------------
    # Fallback
    # ------------------------------------------------------------
    raise ValueError(f"Unknown hypothesis test: {hypo_test}")
controllers/linear_regression_controller.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from typing import List, Optional, Sequence, Tuple
4
+
5
+ from matplotlib.figure import Figure
6
+ import numpy as np
7
+ import pandas as pd
8
+
9
+ from core.linear_regression import run_linear_regression as _run_linear_regression
10
+
11
+
12
+ def _select_working_dataframe(
13
+ df: Optional[pd.DataFrame],
14
+ filtered_df: Optional[pd.DataFrame],
15
+ ) -> pd.DataFrame:
16
+ """
17
+ Use the filtered dataframe if it is non-empty; otherwise fall back to the
18
+ original dataframe. This mirrors the behaviour used in other tabs.
19
+ """
20
+ if df is None:
21
+ raise ValueError("No dataset loaded.")
22
+
23
+ if filtered_df is not None and not filtered_df.empty:
24
+ return filtered_df
25
+
26
+ if df.empty:
27
+ raise ValueError("The dataset is empty.")
28
+
29
+ return df
30
+
31
+
32
+ def _parse_confidence_level(text: str) -> float:
33
+ """
34
+ Parse a confidence level like '0.95' into an alpha value for statsmodels.
35
+
36
+ Returns
37
+ -------
38
+ alpha : float
39
+ Significance level (e.g. 0.05 for a 95% confidence level).
40
+ """
41
+ s = str(text).strip()
42
+ if not s:
43
+ raise ValueError("Confidence level is required (e.g. 0.95).")
44
+ try:
45
+ level = float(s)
46
+ except ValueError as exc:
47
+ raise ValueError("Confidence level must be a numeric value between 0 and 1.") from exc
48
+
49
+ if not (0 < level < 1):
50
+ raise ValueError("Confidence level must be between 0 and 1 (e.g. 0.95).")
51
+
52
+ # statsmodels expects alpha, not the confidence level itself
53
+ return 1.0 - level
54
+
55
+
56
+ def _parse_range(text: str) -> Optional[np.ndarray]:
57
+ """
58
+ Parse a range string like '0, 10' into a numpy array suitable for predictions.
59
+
60
+ Returns
61
+ -------
62
+ np.ndarray or None
63
+ If the string is empty or only whitespace, returns None.
64
+ Otherwise returns a 1-D array of 100 evenly spaced values between
65
+ the parsed minimum and maximum.
66
+ """
67
+ s = str(text).strip()
68
+ if not s:
69
+ return None
70
+
71
+ parts = s.split(",")
72
+ if len(parts) != 2:
73
+ raise ValueError("Range must have the form 'min, max'.")
74
+
75
+ try:
76
+ lo = float(parts[0].strip())
77
+ hi = float(parts[1].strip())
78
+ except ValueError as exc:
79
+ raise ValueError("Range values must be numeric (e.g. '0, 10').") from exc
80
+
81
+ if lo >= hi:
82
+ raise ValueError("Range minimum must be strictly less than the maximum.")
83
+
84
+ return np.linspace(lo, hi, 100)
85
+
86
+
87
def run_linear_regression(
    *,
    df: Optional[pd.DataFrame],
    filtered_df: Optional[pd.DataFrame],
    formula_check: bool,
    formula_text: str,
    formula_latex: str,
    dependent_var: Optional[str],
    independent_vars: List[str],
    alpha_input: str,
    intercept: bool,
    graph_check: bool,
    graph_type: str,
    show_ci: bool,
    show_pi: bool,
    fit_to_obs: bool,
    x_range_text: str,
    round_digits: int = 4,
) -> Tuple[str, pd.DataFrame, Optional[Figure]]:
    """
    Controller entry point for the Linear Regression tab.

    Validates the raw UI input, parses the confidence level and the
    optional X range, delegates to the stats layer, and returns
    ``(summary_html, params_df_rounded, figure)``. Exceptions are expected
    to be caught by the tab layer and turned into user-facing messages.
    """
    working = _select_working_dataframe(df, filtered_df)

    if dependent_var is None or dependent_var == "":
        raise ValueError("Please select a dependent variable.")

    if not independent_vars:
        raise ValueError("Please select at least one independent variable.")

    wants_simple_plot = graph_check and graph_type == "Simple Regression"

    # The simple-regression plot only makes sense with a single predictor.
    if wants_simple_plot and len(independent_vars) != 1:
        raise ValueError(
            "The 'Simple Regression' graph is only available when exactly one "
            "independent variable is selected."
        )

    alpha = _parse_confidence_level(alpha_input)

    # An explicit X grid is only needed when plotting beyond the observed
    # range: Simple Regression + graph enabled + not fitting to observations.
    x_vector = _parse_range(x_range_text) if (wants_simple_plot and not fit_to_obs) else None

    summary_html, params_df, fig = _run_linear_regression(
        df=working,
        formula_check=formula_check,
        formula_text=formula_text,
        formula_latex=formula_latex,
        dependent_var=dependent_var,
        independent_vars=independent_vars,
        alpha=alpha,
        intercept=intercept,
        create_graph=graph_check,
        graph_type=graph_type,
        show_ci=show_ci,
        show_pi=show_pi,
        fit_to_obs=fit_to_obs,
        x_vector=x_vector,
    )

    # Rounding is a presentation concern, kept out of the stats layer.
    return summary_html, params_df.round(round_digits), fig
controllers/utils/__pycache__/downloads.cpython-312.pyc ADDED
Binary file (1.82 kB). View file
 
controllers/utils/__pycache__/downloads.cpython-313.pyc ADDED
Binary file (1.81 kB). View file
 
controllers/utils/downloads.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import tempfile
3
+ import gradio as gr
4
+
5
def sanitize_filename(name: str, default: str):
    """Strip characters that are illegal in filenames from *name*.

    Falls back to *default* when the input is missing, blank, or empty
    after cleaning.
    """
    if not name or not name.strip():
        return default
    cleaned = re.sub(r'[\\/*?:"<>|]', "", name).strip()
    return cleaned or default
10
+
11
+
12
def dataframe_to_csv(df, filename):
    """Write *df* to a temporary CSV file and return its path.

    Shows a Gradio warning and returns None when no table is available.
    The user-supplied *filename* is sanitized and used as the file prefix.
    """
    if df is None:
        gr.Warning("❌ No table available to download.")
        return None

    stem = sanitize_filename(filename, "descriptive_statistics")

    with tempfile.NamedTemporaryFile(
        mode="w",
        encoding="utf-8",
        suffix=".csv",
        prefix=stem + "_",
        delete=False,
    ) as handle:
        df.to_csv(handle.name, index=False)
        return handle.name
28
+
29
def figure_to_png(fig, filename: str):
    """Render *fig* to a temporary PNG file and return its path.

    Returns None when there is no figure. The user-supplied *filename*
    is sanitized before being used as a tempfile prefix — consistent with
    ``dataframe_to_csv`` — so names containing path separators or other
    illegal characters cannot make tempfile fail.
    """
    if fig is None:
        return None

    base = sanitize_filename(filename, "figure")

    tmp = tempfile.NamedTemporaryFile(
        delete=False,
        suffix=".png",
        prefix=base + "_",
    )
    # Close the handle before matplotlib writes to the path: avoids leaking
    # an open file descriptor and is required on Windows.
    tmp.close()
    fig.savefig(tmp.name, dpi=200, bbox_inches="tight")
    return tmp.name
core/__init__.py ADDED
File without changes
core/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (221 Bytes). View file
 
core/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (186 Bytes). View file
 
core/__pycache__/data_stats.cpython-312.pyc ADDED
Binary file (6.36 kB). View file
 
core/__pycache__/data_stats.cpython-313.pyc ADDED
Binary file (6.37 kB). View file
 
core/__pycache__/descriptive.cpython-313.pyc ADDED
Binary file (6.91 kB). View file
 
core/__pycache__/hypothesis_tests.cpython-312.pyc ADDED
Binary file (18.9 kB). View file
 
core/__pycache__/hypothesis_tests.cpython-313.pyc ADDED
Binary file (18.4 kB). View file
 
core/__pycache__/linear_regression.cpython-312.pyc ADDED
Binary file (12.4 kB). View file
 
core/__pycache__/linear_regression.cpython-313.pyc ADDED
Binary file (12.1 kB). View file
 
core/__pycache__/statistic_plots.cpython-313.pyc ADDED
Binary file (8.47 kB). View file
 
core/data_stats.py ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import gradio as gr
4
+ from pathlib import Path
5
+
6
+ ROUND = 4
7
+
8
def load_dataset(file):
    """
    Load a CSV or Excel file into a DataFrame.

    Parameters
    ----------
    file : object with a ``name`` attribute (e.g. a Gradio upload), or None.

    Returns
    -------
    (df, status_message) : tuple
        ``df`` is None on failure; ``status_message`` describes the outcome.
    """
    if file is None:
        return None, "No file uploaded."

    try:
        path = Path(file.name)
        # Compare case-insensitively so '.CSV' / '.XLSX' uploads also load.
        suffix = path.suffix.lower()

        if suffix == ".csv":
            df = pd.read_csv(path)
        elif suffix in (".xlsx", ".xls"):
            df = pd.read_excel(path)
        else:
            return None, "Unsupported file format."

        return df, f"Loaded dataset with {df.shape[0]} rows and {df.shape[1]} columns."

    except Exception as e:
        # Surface the error to the UI instead of crashing the app.
        return None, f"Error loading file: {e}"
31
+
32
+
33
def dataset_summary(df: pd.DataFrame):
    """Summarize every column of *df* as a tidy display table.

    Runs ``describe(include='all')``, adds a unique-value count for all
    columns, keeps only a fixed set of columns, and pre-formats numeric
    statistics as fixed-precision strings (ROUND decimals) for the UI.
    Returns None when no dataset is loaded.
    """
    if df is None:
        return None

    table = (
        df.describe(include="all")
        .transpose()
        .reset_index()
        .rename(columns={"index": "variable"})
    )

    # describe() reports 'unique' only for object columns; set it for all.
    table["unique"] = df.nunique(dropna=True).values

    # Restrict to the columns we display, in a fixed order.
    ordered = [
        "variable",
        "count",
        "unique",
        "mean",
        "std",
        "min",
        "25%",
        "50%",
        "75%",
        "max",
    ]
    table = table[[name for name in ordered if name in table.columns]]

    # Render numeric cells as fixed-precision strings for display; the
    # identifier/count columns stay untouched.
    untouched = ("variable", "count", "unique")
    for name in table.columns:
        if name in untouched:
            continue
        table[name] = table[name].apply(
            lambda cell: f"{cell:.{ROUND}f}" if isinstance(cell, (int, float)) else cell
        )

    return table
71
+
72
+
73
def variable_types(df):
    """Return a two-column table mapping each variable name to its dtype.

    Returns None when no dataset is loaded.
    """
    if df is None:
        return None

    dtype_table = df.dtypes.reset_index()
    return dtype_table.rename(columns={"index": "Variable", 0: "Type"})
82
+
83
+
84
def column_choices_single(cols: list[str]):
    """Refresh a single-select dropdown with *cols* and clear its value."""
    return gr.update(value=None, choices=cols)
86
+
87
+
88
def column_choices_multi(cols: list[str]):
    """Refresh a multi-select component with *cols* and clear the selection."""
    return gr.update(value=[], choices=cols)
90
+
91
+
92
def category_value_choices(df, col):
    """Populate the category-value picker for *col*.

    Hides the component when the dataset or column is unavailable;
    otherwise shows it with the column's sorted unique non-null values.
    """
    if df is None or col is None or col not in df.columns:
        return gr.update(visible=False, choices=[], value=[])

    options = sorted(df[col].dropna().unique().tolist())

    # value must be a list because the target component is a multiselect
    return gr.update(visible=True, choices=options, value=[])
103
+
104
+
105
def infer_column_types(df: pd.DataFrame):
    """Split columns into (numeric, categorical) name lists, each sorted."""
    numeric = df.select_dtypes(include=[np.number]).columns.tolist()
    categorical = df.select_dtypes(exclude=[np.number]).columns.tolist()
    return sorted(numeric), sorted(categorical)
110
+
111
+
112
def apply_category_filters(
    df,
    cat_cols,
    val1,
    val2,
    val3,
):
    """Filter *df* by up to three categorical columns.

    ``cat_cols`` pairs positionally with the three value selections; a
    column with no selected values is skipped. Returns
    ``(filtered_df, status_message)``; when nothing is selected the full
    dataset is returned unchanged (as a copy).
    """
    if df is None:
        return None, "❌ No data loaded."

    selections = [val1, val2, val3]
    if not cat_cols or not any(selections):
        return df.copy(), "⚠️ No filters selected. Using full dataset."

    result = df.copy()
    for column, chosen in zip(cat_cols[:3], selections):
        if chosen:
            result = result[result[column].isin(chosen)]

    return result, f"✅ Filter applied. Rows remaining: {len(result)}"
134
+
135
def reclassify_as_categorical(state, column):
    """Move *column* from the numeric list to the categorical list on *state*.

    Returns (ok, message); active filters are reset on success because they
    may reference the column's previous typing.
    """
    if not (column and column in state.numeric_cols):
        return False, f"Column '{column}' is not numeric."

    state.numeric_cols.remove(column)
    state.categorical_cols.append(column)
    state.active_filters = {}  # reset filters
    return True, f"Column '{column}' reclassified as categorical."
142
+
143
+
144
def reclassify_as_numeric(state, column):
    """Move *column* from the categorical list to the numeric list on *state*.

    Returns (ok, message); active filters are reset on success because they
    may reference the column's previous typing.
    """
    if not (column and column in state.categorical_cols):
        return False, f"Column '{column}' is not categorical."

    state.categorical_cols.remove(column)
    state.numeric_cols.append(column)
    state.active_filters = {}  # reset filters
    return True, f"Column '{column}' reclassified as numeric."
core/estimation/__init__.py ADDED
File without changes
core/estimation/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (232 Bytes). View file
 
core/estimation/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (196 Bytes). View file
 
core/estimation/__pycache__/descriptive.cpython-312.pyc ADDED
Binary file (7 kB). View file
 
core/estimation/__pycache__/descriptive.cpython-313.pyc ADDED
Binary file (6.92 kB). View file
 
core/estimation/__pycache__/graphical_analysis.cpython-312.pyc ADDED
Binary file (8.6 kB). View file
 
core/estimation/__pycache__/graphical_analysis.cpython-313.pyc ADDED
Binary file (8.48 kB). View file
 
core/estimation/descriptive.py ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ui/stats/estimation/descriptive.py
2
+
3
+ from functools import lru_cache
4
+ import numpy as np
5
+ import pandas as pd
6
+ from scipy.stats import (
7
+ trim_mean,
8
+ gmean,
9
+ hmean,
10
+ skew,
11
+ kurtosis,
12
+ norm
13
+ )
14
+ from scipy.special import loggamma
15
+ from scipy.integrate import quad
16
+ from scipy.stats import median_abs_deviation
17
+
18
+ # ------------------------------------------------------------------
19
+ # Bias-correction constants (user-approved implementations)
20
+ # ------------------------------------------------------------------
21
+
22
@lru_cache(maxsize=None)
def c4(n: int) -> float:
    """Bias-correction constant c4(n) for the sample standard deviation.

    c4(n) = sqrt(2/(n-1)) * Gamma(n/2) / Gamma((n-1)/2), evaluated in
    log-space via loggamma for numerical stability at large n.
    """
    log_c4 = (
        np.log(np.sqrt(2 / (n - 1)))
        + loggamma(n / 2)
        - loggamma((n - 1) / 2)
    )
    return np.exp(log_c4)
30
+
31
+
32
@lru_cache(maxsize=None)
def d2(n: int) -> float:
    """Bias-correction constant d2(n) for the sample range.

    Equals the expected range of n standard-normal observations, obtained
    by numerically integrating 1 - (1 - F(x))^n - F(x)^n over the real
    line, where F is the standard normal CDF.
    """
    def coverage(x, m):
        cdf = norm.cdf(x)
        return 1 - (1 - cdf) ** m - cdf ** m

    return quad(coverage, -np.inf, np.inf, args=(n,))[0]
37
+
38
+
39
+ # ------------------------------------------------------------------
40
+ # Main computation function
41
+ # ------------------------------------------------------------------
42
+
43
def compute_descriptive_statistics(
    data,
    *,
    quantile_probs=(0.25, 0.5, 0.75),
    trim_alpha=None,
    winsor_limits=None,
    weights=None,
):
    """
    Compute all descriptive statistics for a single numeric variable.

    Parameters
    ----------
    data : array-like
        Numeric sample; NaNs are dropped before any computation.
    quantile_probs : sequence of float
        Probabilities for the quantile rows (default: quartiles).
    trim_alpha : float or None
        If given, adds a trimmed-mean row with this trimming proportion.
    winsor_limits : pair of float or None
        If given, adds a winsorized-mean row with these limits.
    weights : array-like or None
        If given, adds a weighted-mean row (never replaces the plain mean).
        NOTE(review): the weights are aligned via ``.loc[x.index]``, which
        assumes *weights* shares the index of *data* — confirm at call sites.

    Returns
    -------
    pandas.DataFrame
        Columns: "Statistic Type", "Measure", "Value", "Bias Corrected",
        "Robust". "Robust" is a 0/1 flag; "Bias Corrected" is NaN for
        measures without a correction.
    """

    # --- preparation ------------------------------------------------
    x = pd.Series(data).dropna().astype(float)
    n = len(x)

    rows = []

    # ----------------------------------------------------------------
    # Quantiles
    # ----------------------------------------------------------------
    probs = np.atleast_1d(quantile_probs)
    q_vals = np.quantile(x, probs)
    for p, q in zip(probs, q_vals):
        rows.append([
            "Quantiles",
            f"Q{p}",
            q,
            np.nan,
            0
        ])

    # ----------------------------------------------------------------
    # Central Tendency
    # ----------------------------------------------------------------

    mean = x.mean()
    median = np.median(x)
    # 25%-trimmed mean on each side, i.e. the mean of the middle 50%.
    iq_mean = trim_mean(x, 0.25)

    rows.extend([
        ["Central Tendency", "Mean", mean, np.nan, 0],
        ["Central Tendency", "Median", median, np.nan, 1],
        ["Central Tendency", "Interquartile Mean", iq_mean, np.nan, 1],
    ])

    # Weighted mean (additional, never replaces mean)
    if weights is not None:
        # Align weights to the surviving (non-NaN) observations.
        w = pd.Series(weights).loc[x.index].astype(float)
        w_mean = np.average(x, weights=w)
        rows.append([
            "Central Tendency",
            "Weighted Mean",
            w_mean,
            np.nan,
            0
        ])

    # Trimmed mean
    if trim_alpha is not None:
        t_mean = trim_mean(x, trim_alpha)
        rows.append([
            "Central Tendency",
            f"Trimmed Mean ({trim_alpha})",
            t_mean,
            np.nan,
            1
        ])

    # Winsorized mean
    if winsor_limits is not None:
        from scipy.stats.mstats import winsorize
        xw = winsorize(x, winsor_limits)
        rows.append([
            "Central Tendency",
            f"Winsorized Mean {tuple(winsor_limits)}",
            np.mean(xw),
            np.nan,
            1
        ])

    # Geometric & harmonic means — defined only for strictly positive data.
    if np.all(x > 0):
        rows.extend([
            ["Central Tendency", "Geometric Mean", gmean(x), np.nan, 0],
            ["Central Tendency", "Harmonic Mean", hmean(x), np.nan, 0],
        ])

    # ----------------------------------------------------------------
    # Dispersion
    # ----------------------------------------------------------------

    var0 = np.var(x, ddof=0)
    var1 = np.var(x, ddof=1)  # unbiased
    std0 = np.std(x, ddof=0)
    std1 = np.std(x, ddof=1)
    rng = x.max() - x.min()
    iqr = np.subtract(*np.percentile(x, [75, 25]))
    mad = median_abs_deviation(x)
    aad = np.mean(np.abs(x - mean))

    # "Bias Corrected" column: each raw estimate paired with its
    # bias-corrected counterpart (c4/d2 corrections, and normal-theory
    # rescalings of AAD/IQR/MAD to estimate sigma).
    rows.extend([
        ["Dispersion", "Variance (ddof=0)", var0, var1, 0],
        ["Dispersion", "Variance (ddof=1)", var1, var1, 0],
        ["Dispersion", "Std (ddof=0)", std0, std0 * np.sqrt(n / (n - 1)) / c4(n), 0],
        ["Dispersion", "Std (ddof=1)", std1, std1 / c4(n), 0],
        ["Dispersion", "Range", rng, rng / d2(n), 0],
        ["Dispersion", "AAD", aad, aad * np.sqrt(np.pi / 2), 0],
        ["Dispersion", "IQR", iqr, iqr / (2 * norm.ppf(0.75)), 1],
        ["Dispersion", "MAD", mad, mad / norm.ppf(0.75), 1],
    ])

    # ----------------------------------------------------------------
    # Shape
    # ----------------------------------------------------------------

    # Both the plain central-moment estimators and the bias-adjusted
    # k-statistic variants (scipy's bias=False) are reported side by side.
    rows.extend([
        ["Shape", "Skewness (central moments)", skew(x), np.nan, 0],
        ["Shape", "Skewness (k-statistic)", skew(x, bias=False), np.nan, 0],
        ["Shape", "Kurtosis (central moments)", kurtosis(x, fisher=False), np.nan, 0],
        ["Shape", "Kurtosis (k-statistic)", kurtosis(x, fisher=False, bias=False), np.nan, 0],
        ["Shape", "Excess Kurtosis (central moments)", kurtosis(x, fisher=False) - 3, np.nan, 0],
        ["Shape", "Excess Kurtosis (k-statistic)", kurtosis(x, fisher=False, bias=False) - 3, np.nan, 0],
    ])

    # ----------------------------------------------------------------
    # Final table
    # ----------------------------------------------------------------

    return pd.DataFrame(
        rows,
        columns=[
            "Statistic Type",
            "Measure",
            "Value",
            "Bias Corrected",
            "Robust",
        ],
    )
core/estimation/graphical_analysis.py ADDED
@@ -0,0 +1,260 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from typing import Iterable, Optional, Tuple
4
+
5
+ import numpy as np
6
+ import matplotlib.pyplot as plt
7
+ import seaborn as sns
8
+ from scipy.stats import norm
9
+
10
+
11
+ Interval = Optional[Tuple[float, float]]
12
+
13
+
14
def _plot_hist_or_pmf(
    ax,
    *,
    data: np.ndarray,
    graph_type: str,
    var_name: str,
    add_kde: bool,
    add_data: bool,
):
    """Render the primary histogram or empirical PMF onto *ax*.

    Dispatches on *graph_type* ("Histogram" or
    "Empirical Probability Mass Function").  A KDE curve is overlaid when
    *add_kde* is set, and a rug of the raw observations is added at the
    bottom when *add_data* is set.

    Raises:
        ValueError: if *graph_type* is neither of the two supported kinds.
    """
    sns.set_style("whitegrid")

    if graph_type == "Histogram":
        sns.histplot(
            data,
            kde=add_kde,
            stat="density",
            color="rebeccapurple",
            alpha=0.5,
            ax=ax,
        )
        y_label = "Density"
        title = f"Distribution of {var_name}"
    elif graph_type == "Empirical Probability Mass Function":
        levels, freq = np.unique(data, return_counts=True)
        ax.stem(
            levels,
            freq / freq.sum(),
            basefmt="rebeccapurple",
            linefmt="rebeccapurple",
        )
        if add_kde:
            sns.kdeplot(data, ax=ax, color="rebeccapurple")
        y_label = "Probability"
        title = f"Empirical PMF of {var_name}"
    else:
        raise ValueError(f"Unknown graph type: {graph_type}")

    ax.set_ylabel(y_label)
    ax.set_xlabel(var_name)
    ax.set_title(title)

    if add_data:
        # NOTE(review): seaborn's rugplot `height` is an axes fraction
        # (0..1); scaling it by the ylim upper bound reproduces the
        # original behavior but can exceed 1.0 for peaked densities —
        # confirm this is intentional.
        _, y_top = ax.get_ylim()
        sns.rugplot(data, height=0.1 * y_top, ax=ax, color="black")
56
+
57
+
58
+ def _plot_normal_density(
59
+ ax,
60
+ *,
61
+ hat_mu: float,
62
+ hat_sigma: float,
63
+ color: str = "black",
64
+ ):
65
+ if hat_sigma <= 0:
66
+ return
67
+
68
+ y_vect = np.linspace(hat_mu - 3 * hat_sigma, hat_mu + 3 * hat_sigma, 200)
69
+ ax.plot(
70
+ y_vect,
71
+ norm.pdf(y_vect, hat_mu, hat_sigma),
72
+ color=color,
73
+ linestyle="--",
74
+ label="Normal density",
75
+ )
76
+ ax.legend()
77
+
78
+
79
+ def _plot_interval_band(
80
+ ax,
81
+ *,
82
+ y_val: float,
83
+ interval: Tuple[float, float],
84
+ label: str,
85
+ color: str,
86
+ ):
87
+ low, high = interval
88
+ ax.hlines(y_val, low, high, color=color, linewidth=2)
89
+ ax.scatter((low + high) / 2.0, y_val, color=color, s=30, zorder=5)
90
+ ax.text(
91
+ high,
92
+ y_val,
93
+ f" {label}",
94
+ va="center",
95
+ fontsize=9,
96
+ bbox=dict(
97
+ boxstyle="round,pad=0.2",
98
+ facecolor="whitesmoke",
99
+ edgecolor="gray",
100
+ ),
101
+ )
102
+
103
+
104
def plot_histogram_with_overlays(
    *,
    data: Iterable[float],
    graph_type: str,
    var_name: str,
    add_kde: bool,
    add_data: bool,
    add_normal: bool,
    hat_mu: Optional[float],
    hat_sigma: Optional[float],
    ci_mean_interval: Interval,
    ci_median_interval: Interval,
    pi_interval: Interval,
):
    """Build the histogram / PMF figure with its optional overlays.

    The top axis always carries the histogram (or empirical PMF) plus the
    optional KDE, rug, and fitted-Normal curves.  When at least one of the
    three intervals is supplied, a second shared-x axis is added below and
    each given interval is drawn there as a labelled horizontal band.

    Returns:
        matplotlib.figure.Figure: the assembled figure.
    """
    data = np.asarray(data)

    intervals = (ci_mean_interval, ci_median_interval, pi_interval)
    has_intervals = any(iv is not None for iv in intervals)

    # The second panel is only allocated when there is an interval to show.
    if has_intervals:
        fig, (ax_main, ax_bands) = plt.subplots(
            2,
            1,
            sharex=True,
            figsize=(8, 6),
        )
    else:
        fig, ax_main = plt.subplots(1, 1, figsize=(8, 4))
        ax_bands = None

    _plot_hist_or_pmf(
        ax_main,
        data=data,
        graph_type=graph_type,
        var_name=var_name,
        add_kde=add_kde,
        add_data=add_data,
    )

    if add_normal and hat_mu is not None and hat_sigma is not None:
        _plot_normal_density(ax_main, hat_mu=hat_mu, hat_sigma=hat_sigma)

    # Interval annotations (confidence / prediction) on the lower strip.
    if has_intervals and ax_bands is not None:
        ax_bands.set_yticks([])
        ax_bands.set_xlabel(var_name)
        ax_bands.set_ylim(0, 0.5)

        # Fixed vertical slots: CI-mean on top, CI-median just below it,
        # prediction interval near the bottom of the strip.
        ci_top = 0.4
        band_specs = [
            (ci_mean_interval, ci_top, "CI Mean", "blue"),
            (ci_median_interval, ci_top - 0.1, "CI Median", "green"),
            (pi_interval, 0.1, "Prediction Interval", "darkred"),
        ]
        for iv, height, band_label, band_color in band_specs:
            if iv is not None:
                _plot_interval_band(
                    ax_bands,
                    y_val=height,
                    interval=iv,
                    label=band_label,
                    color=band_color,
                )

    fig.tight_layout()
    return fig
187
+
188
+
189
def plot_ecdf(
    *,
    data: Iterable[float],
    var_name: str,
    alpha: float,
    add_conf_band: bool,
    add_normal: bool,
    hat_mu: Optional[float],
    hat_sigma: Optional[float],
):
    """Plot the ECDF of *data*, optionally with a DKW confidence band and a
    fitted Normal CDF overlay.

    Args:
        data: sample values.
        var_name: x-axis label.
        alpha: significance level for the DKW band (band level is 1 - alpha).
        add_conf_band: draw the Dvoretzky-Kiefer-Wolfowitz band when True.
        add_normal: overlay the Normal(hat_mu, hat_sigma) CDF when True and
            both parameters are supplied with hat_sigma > 0.

    Returns:
        matplotlib.figure.Figure: the ECDF figure.
    """
    from statsmodels.distributions.empirical_distribution import ECDF

    sample = np.asarray(data)
    emp = ECDF(sample)

    fig, ax = plt.subplots(figsize=(8, 5))

    # Step function plus the individual jump points.
    ax.step(
        emp.x,
        emp.y,
        where="post",
        color="rebeccapurple",
        linewidth=2,
        label="ECDF",
    )
    ax.scatter(emp.x, emp.y, color="rebeccapurple", s=10, alpha=0.6)

    if add_conf_band:
        # DKW inequality: P(sup |F_n - F| > eps) <= 2 exp(-2 n eps^2),
        # solved for eps at level alpha.
        n_obs = len(sample)
        eps = np.sqrt(np.log(2.0 / alpha) / (2.0 * n_obs))
        ax.fill_between(
            emp.x,
            np.clip(emp.y - eps, 0.0, 1.0),
            np.clip(emp.y + eps, 0.0, 1.0),
            step="post",
            color="plum",
            alpha=0.4,
            label="DKW CI",
        )

    overlay_normal = (
        add_normal
        and hat_mu is not None
        and hat_sigma is not None
        and hat_sigma > 0
    )
    if overlay_normal:
        grid = np.linspace(hat_mu - 3.0 * hat_sigma, hat_mu + 3.0 * hat_sigma, 200)
        ax.plot(
            grid,
            norm.cdf(grid, hat_mu, hat_sigma),
            color="black",
            linestyle="--",
            linewidth=2,
            label="Normal CDF",
        )
        # Widen the x-range so both the data and the Normal curve fit.
        ax.set_xlim(
            min(sample.min(), grid.min()) - 0.1,
            max(sample.max(), grid.max()) + 0.1,
        )
    else:
        ax.set_xlim(sample.min() - 0.1, sample.max() + 0.1)

    ax.set_title("Empirical Cumulative Distribution Function", fontsize=14)
    ax.set_xlabel(var_name, fontsize=12)
    ax.set_ylabel("ECDF", fontsize=12)
    ax.set_ylim(0, 1.05)
    ax.grid(True, linestyle="--", alpha=0.5)
    ax.legend(loc="lower right", fontsize=10)

    fig.tight_layout()
    return fig
core/estimation/inference/__pycache__/ci.cpython-312.pyc ADDED
Binary file (552 Bytes). View file