Upload 127 files
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- .gitattributes +2 -0
- .gradio/certificate.pem +31 -0
- __pycache__/app.cpython-313.pyc +0 -0
- app.py +8 -0
- controllers/__pycache__/data_controller.cpython-312.pyc +0 -0
- controllers/__pycache__/data_controller.cpython-313.pyc +0 -0
- controllers/__pycache__/hypothesis_controller.cpython-312.pyc +0 -0
- controllers/__pycache__/hypothesis_controller.cpython-313.pyc +0 -0
- controllers/__pycache__/linear_regression_controller.cpython-312.pyc +0 -0
- controllers/__pycache__/linear_regression_controller.cpython-313.pyc +0 -0
- controllers/data_controller.py +264 -0
- controllers/estimation/__init__.py +0 -0
- controllers/estimation/__pycache__/__init__.cpython-312.pyc +0 -0
- controllers/estimation/__pycache__/__init__.cpython-313.pyc +0 -0
- controllers/estimation/__pycache__/descriptive_controller.cpython-312.pyc +0 -0
- controllers/estimation/__pycache__/descriptive_controller.cpython-313.pyc +0 -0
- controllers/estimation/__pycache__/graphical_controller.cpython-312.pyc +0 -0
- controllers/estimation/__pycache__/graphical_controller.cpython-313.pyc +0 -0
- controllers/estimation/__pycache__/inference_controller.cpython-312.pyc +0 -0
- controllers/estimation/__pycache__/inference_controller.cpython-313.pyc +0 -0
- controllers/estimation/descriptive_controller.py +59 -0
- controllers/estimation/graphical_controller.py +383 -0
- controllers/estimation/inference_controller.py +300 -0
- controllers/hypothesis_controller.py +204 -0
- controllers/linear_regression_controller.py +160 -0
- controllers/utils/__pycache__/downloads.cpython-312.pyc +0 -0
- controllers/utils/__pycache__/downloads.cpython-313.pyc +0 -0
- controllers/utils/downloads.py +39 -0
- core/__init__.py +0 -0
- core/__pycache__/__init__.cpython-312.pyc +0 -0
- core/__pycache__/__init__.cpython-313.pyc +0 -0
- core/__pycache__/data_stats.cpython-312.pyc +0 -0
- core/__pycache__/data_stats.cpython-313.pyc +0 -0
- core/__pycache__/descriptive.cpython-313.pyc +0 -0
- core/__pycache__/hypothesis_tests.cpython-312.pyc +0 -0
- core/__pycache__/hypothesis_tests.cpython-313.pyc +0 -0
- core/__pycache__/linear_regression.cpython-312.pyc +0 -0
- core/__pycache__/linear_regression.cpython-313.pyc +0 -0
- core/__pycache__/statistic_plots.cpython-313.pyc +0 -0
- core/data_stats.py +150 -0
- core/estimation/__init__.py +0 -0
- core/estimation/__pycache__/__init__.cpython-312.pyc +0 -0
- core/estimation/__pycache__/__init__.cpython-313.pyc +0 -0
- core/estimation/__pycache__/descriptive.cpython-312.pyc +0 -0
- core/estimation/__pycache__/descriptive.cpython-313.pyc +0 -0
- core/estimation/__pycache__/graphical_analysis.cpython-312.pyc +0 -0
- core/estimation/__pycache__/graphical_analysis.cpython-313.pyc +0 -0
- core/estimation/descriptive.py +181 -0
- core/estimation/graphical_analysis.py +260 -0
- core/estimation/inference/__pycache__/ci.cpython-312.pyc +0 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
ui/assets/logos/HimmapanLab.png filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
ui/assets/logos/ThotsakanStats.png filter=lfs diff=lfs merge=lfs -text
|
.gradio/certificate.pem
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
-----BEGIN CERTIFICATE-----
|
| 2 |
+
MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
|
| 3 |
+
TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
|
| 4 |
+
cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
|
| 5 |
+
WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
|
| 6 |
+
ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
|
| 7 |
+
MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
|
| 8 |
+
h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
|
| 9 |
+
0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
|
| 10 |
+
A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
|
| 11 |
+
T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
|
| 12 |
+
B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
|
| 13 |
+
B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
|
| 14 |
+
KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
|
| 15 |
+
OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
|
| 16 |
+
jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
|
| 17 |
+
qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
|
| 18 |
+
rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
|
| 19 |
+
HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
|
| 20 |
+
hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
|
| 21 |
+
ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
|
| 22 |
+
3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
|
| 23 |
+
NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
|
| 24 |
+
ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
|
| 25 |
+
TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
|
| 26 |
+
jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
|
| 27 |
+
oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
|
| 28 |
+
4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
|
| 29 |
+
mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
|
| 30 |
+
emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
|
| 31 |
+
-----END CERTIFICATE-----
|
__pycache__/app.cpython-313.pyc
ADDED
|
Binary file (940 Bytes). View file
|
|
|
app.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from ui.layout import build_layout
|
| 2 |
+
|
| 3 |
+
def main():
|
| 4 |
+
app = build_layout()
|
| 5 |
+
app.launch(share=True)
|
| 6 |
+
|
| 7 |
+
if __name__ == "__main__":
|
| 8 |
+
main()
|
controllers/__pycache__/data_controller.cpython-312.pyc
ADDED
|
Binary file (6.96 kB). View file
|
|
|
controllers/__pycache__/data_controller.cpython-313.pyc
ADDED
|
Binary file (6.71 kB). View file
|
|
|
controllers/__pycache__/hypothesis_controller.cpython-312.pyc
ADDED
|
Binary file (6.3 kB). View file
|
|
|
controllers/__pycache__/hypothesis_controller.cpython-313.pyc
ADDED
|
Binary file (6.28 kB). View file
|
|
|
controllers/__pycache__/linear_regression_controller.cpython-312.pyc
ADDED
|
Binary file (5.56 kB). View file
|
|
|
controllers/__pycache__/linear_regression_controller.cpython-313.pyc
ADDED
|
Binary file (5.5 kB). View file
|
|
|
controllers/data_controller.py
ADDED
|
@@ -0,0 +1,264 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
|
| 3 |
+
from core.data_stats import (
|
| 4 |
+
load_dataset,
|
| 5 |
+
dataset_summary,
|
| 6 |
+
variable_types,
|
| 7 |
+
infer_column_types,
|
| 8 |
+
apply_category_filters,
|
| 9 |
+
reclassify_as_categorical,
|
| 10 |
+
reclassify_as_numeric,
|
| 11 |
+
)
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def wire_callbacks(
    *,
    file_input,
    status_output,

    # RAW DATA
    preview_checkbox,
    overview_checkbox,
    csv_preview,
    desc_output,
    dtypes_output,

    # RECLASSIFICATION
    num_to_cat,
    cat_to_num,
    fix_to_categorical_button,
    fix_to_numeric_button,
    fix_dtype_status,

    # FILTERS
    cat_filter_cols,
    cat_val_1,
    cat_val_2,
    cat_val_3,
    apply_filter_button,
    filter_status,

    # FILTERED DATA
    preview_checkbox_filter,
    overview_checkbox_filter,
    csv_preview_filter,
    desc_output_filter,
    dtypes_output_filter,

    state,
):
    """Attach all data-tab event handlers to the given Gradio components.

    Parameters are keyword-only Gradio components plus a mutable ``state``
    object that carries ``df``, ``filtered_df``, ``numeric_cols``,
    ``categorical_cols``, ``active_filters`` and ``overrides`` across events.
    Returns nothing; all effects happen through the registered callbacks.
    """
    # ==================================================
    # File upload
    # ==================================================
    def on_file_upload(file):
        """Load a dataset, reset state, and repopulate all dependent widgets."""
        df, status = load_dataset(file)

        if df is None:
            # Load failed: clear previews and empty every column dropdown.
            return (
                status,
                None, None, None,
                gr.update(choices=[], value=None),
                gr.update(choices=[], value=None),
                gr.update(choices=[], value=[]),
            )

        numeric_cols, categorical_cols = infer_column_types(df)

        state.df = df
        state.filtered_df = df
        state.numeric_cols = numeric_cols
        state.categorical_cols = categorical_cols
        state.active_filters = {}
        state.overrides = {"num_to_cat": [], "cat_to_num": []}

        return (
            status,
            df,
            dataset_summary(df),
            variable_types(df),

            # Reclassification dropdowns
            gr.update(choices=numeric_cols, value=None),
            gr.update(choices=categorical_cols, value=None),

            # Filter columns (categorical only)
            gr.update(choices=categorical_cols, value=[]),
        )

    file_input.change(
        on_file_upload,
        inputs=file_input,
        outputs=[
            status_output,
            csv_preview,
            desc_output,
            dtypes_output,
            num_to_cat,
            cat_to_num,
            cat_filter_cols,
        ],
    )

    # ==================================================
    # Category value dropdowns (Filter 1–3)
    # ==================================================
    def update_category_filters(selected_columns):
        """Show one value dropdown per selected column (up to three)."""
        df = state.df

        if df is None or not selected_columns:
            return (
                gr.update(visible=False, choices=[], value=[]),
                gr.update(visible=False, choices=[], value=[]),
                gr.update(visible=False, choices=[], value=[]),
            )

        updates = []
        for i in range(3):
            if i < len(selected_columns):
                col = selected_columns[i]
                values = sorted(df[col].dropna().unique().tolist())
                updates.append(
                    gr.update(
                        visible=True,
                        choices=values,
                        value=[],
                    )
                )
            else:
                updates.append(
                    gr.update(visible=False, choices=[], value=[])
                )

        return tuple(updates)

    cat_filter_cols.change(
        update_category_filters,
        inputs=cat_filter_cols,
        outputs=[cat_val_1, cat_val_2, cat_val_3],
    )

    # ==================================================
    # Apply filters
    # ==================================================
    def on_apply_filter(cat_cols, v1, v2, v3):
        """Filter the raw dataframe on up to three categorical columns."""
        filtered_df, status = apply_category_filters(
            state.df,
            cat_cols,
            v1, v2, v3,
        )

        state.filtered_df = filtered_df
        # Record only filters that actually constrain a column. Guard against
        # Gradio passing None when no filter columns are selected.
        state.active_filters = {
            col: vals
            for col, vals in zip((cat_cols or [])[:3], [v1, v2, v3])
            if vals
        }

        return status

    apply_filter_button.click(
        on_apply_filter,
        inputs=[cat_filter_cols, cat_val_1, cat_val_2, cat_val_3],
        outputs=filter_status,
    )

    # ==================================================
    # RAW preview / summary
    # ==================================================
    preview_checkbox.change(
        lambda x: gr.update(visible=x),
        inputs=preview_checkbox,
        outputs=csv_preview,
    )

    overview_checkbox.change(
        lambda x: (
            gr.update(visible=x),
            gr.update(visible=x),
        ),
        inputs=overview_checkbox,
        outputs=[desc_output, dtypes_output],
    )

    # ==================================================
    # FILTERED preview / summary
    # ==================================================
    # NOTE: visibility and value are combined into one gr.update per
    # component; the previous version listed the same component twice in
    # `outputs`, which Gradio does not support reliably.
    preview_checkbox_filter.change(
        lambda x: gr.update(
            visible=x,
            value=state.filtered_df if x else None,
        ),
        inputs=preview_checkbox_filter,
        outputs=csv_preview_filter,
    )

    overview_checkbox_filter.change(
        lambda x: (
            gr.update(
                visible=x,
                value=dataset_summary(state.filtered_df) if x else None,
            ),
            gr.update(
                visible=x,
                value=variable_types(state.filtered_df) if x else None,
            ),
        ),
        inputs=overview_checkbox_filter,
        outputs=[desc_output_filter, dtypes_output_filter],
    )

    # ==================================================
    # Reclassification
    # ==================================================
    def on_fix_to_categorical(column):
        """Move a numeric column to the categorical set; refresh dropdowns."""
        _, msg = reclassify_as_categorical(state, column)
        return (
            gr.update(choices=state.categorical_cols, value=[]),
            gr.update(choices=state.numeric_cols, value=None),
            gr.update(choices=state.categorical_cols, value=None),
            msg,
            gr.update(visible=False),
            gr.update(visible=False),
            gr.update(visible=False),
        )

    def on_fix_to_numeric(column):
        """Move a categorical column to the numeric set; refresh dropdowns."""
        _, msg = reclassify_as_numeric(state, column)
        return (
            gr.update(choices=state.categorical_cols, value=[]),
            gr.update(choices=state.numeric_cols, value=None),
            gr.update(choices=state.categorical_cols, value=None),
            msg,
            gr.update(visible=False),
            gr.update(visible=False),
            gr.update(visible=False),
        )

    fix_to_categorical_button.click(
        on_fix_to_categorical,
        inputs=num_to_cat,
        outputs=[
            cat_filter_cols,
            num_to_cat,
            cat_to_num,
            fix_dtype_status,
            cat_val_1,
            cat_val_2,
            cat_val_3,
        ],
    )

    fix_to_numeric_button.click(
        on_fix_to_numeric,
        inputs=cat_to_num,
        outputs=[
            cat_filter_cols,
            num_to_cat,
            cat_to_num,
            fix_dtype_status,
            cat_val_1,
            cat_val_2,
            cat_val_3,
        ],
    )
|
controllers/estimation/__init__.py
ADDED
|
File without changes
|
controllers/estimation/__pycache__/__init__.cpython-312.pyc
ADDED
|
Binary file (239 Bytes). View file
|
|
|
controllers/estimation/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (206 Bytes). View file
|
|
|
controllers/estimation/__pycache__/descriptive_controller.cpython-312.pyc
ADDED
|
Binary file (2.28 kB). View file
|
|
|
controllers/estimation/__pycache__/descriptive_controller.cpython-313.pyc
ADDED
|
Binary file (2.3 kB). View file
|
|
|
controllers/estimation/__pycache__/graphical_controller.cpython-312.pyc
ADDED
|
Binary file (8.93 kB). View file
|
|
|
controllers/estimation/__pycache__/graphical_controller.cpython-313.pyc
ADDED
|
Binary file (8.87 kB). View file
|
|
|
controllers/estimation/__pycache__/inference_controller.cpython-312.pyc
ADDED
|
Binary file (5.27 kB). View file
|
|
|
controllers/estimation/__pycache__/inference_controller.cpython-313.pyc
ADDED
|
Binary file (5.16 kB). View file
|
|
|
controllers/estimation/descriptive_controller.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ui/controllers/estimation/descriptive_controller.py
|
| 2 |
+
|
| 3 |
+
import pandas as pd
|
| 4 |
+
from core.estimation.descriptive import compute_descriptive_statistics
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def run_descriptive_statistics(
    *,
    df: pd.DataFrame,
    column: str,
    quantile_probs: list[float],
    trim_alpha: float | None,
    winsor_limits: tuple[float, float] | None,
    weights_col: str | None,
    round_digits: int,
) -> pd.DataFrame:
    """Validate inputs and return a rounded descriptive-statistics table.

    Raises ValueError when no dataset is loaded, the column is missing,
    empty or non-numeric, or the weights column is missing, non-numeric
    or contains negative values.
    """
    if df is None:
        raise ValueError("No dataset loaded.")
    if column not in df.columns:
        raise ValueError(f"Column '{column}' not found.")

    values = df[column].dropna()
    if values.empty:
        raise ValueError("Selected column has no valid data.")
    if not pd.api.types.is_numeric_dtype(values):
        raise ValueError("Selected column must be numeric.")

    weights = None
    if weights_col:
        if weights_col not in df.columns:
            raise ValueError(f"Weights column '{weights_col}' not found.")

        # Align weights to the rows that survived the dropna above.
        weights = df.loc[values.index, weights_col]
        if not pd.api.types.is_numeric_dtype(weights):
            raise ValueError("Weights must be numeric.")
        if (weights < 0).any():
            raise ValueError("Weights must be non-negative.")

    table = compute_descriptive_statistics(
        data=values.values,
        quantile_probs=quantile_probs,
        trim_alpha=trim_alpha,
        winsor_limits=winsor_limits,
        weights=weights.values if weights is not None else None,
    )

    # Round only the two display columns produced by the core routine.
    table[["Value", "Bias Corrected"]] = table[["Value", "Bias Corrected"]].round(round_digits)

    return table
|
controllers/estimation/graphical_controller.py
ADDED
|
@@ -0,0 +1,383 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from typing import Optional, Tuple
|
| 4 |
+
|
| 5 |
+
import numpy as np
|
| 6 |
+
import pandas as pd
|
| 7 |
+
|
| 8 |
+
from core.estimation.inference.estimators import estimate_mean, estimate_sigma
|
| 9 |
+
from core.estimation.inference.ci import (
|
| 10 |
+
ci_mean_analytic,
|
| 11 |
+
ci_mean_bootstrap,
|
| 12 |
+
ci_median_analytic,
|
| 13 |
+
ci_median_bootstrap,
|
| 14 |
+
)
|
| 15 |
+
from core.estimation.inference.pi import (
|
| 16 |
+
pi_mean,
|
| 17 |
+
pi_median,
|
| 18 |
+
pi_iqr,
|
| 19 |
+
pi_bootstrap,
|
| 20 |
+
)
|
| 21 |
+
from core.estimation.graphical_analysis import (
|
| 22 |
+
plot_histogram_with_overlays,
|
| 23 |
+
plot_ecdf,
|
| 24 |
+
)
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
# ---------------------------------------------------------------------
|
| 28 |
+
# Utilities (aligned with inference_controller)
|
| 29 |
+
# ---------------------------------------------------------------------
|
| 30 |
+
def select_distribution(mean_estimator: str, sigma_estimator: str) -> str:
    """Pick the reference distribution for analytic intervals.

    The Student-t distribution applies only to the classical pairing of the
    sample mean with the 1-ddof standard deviation; every other estimator
    combination falls back to the normal distribution.
    """
    classical_pair = (
        mean_estimator == "Sample Mean"
        and sigma_estimator == "Deviation (1 ddof)"
    )
    return "t" if classical_pair else "norm"
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def validate_deviation_estimator(*, sigma_estimator: str, n: int):
    """Reject the range-based sigma estimator on samples larger than 25.

    The bias-correction table behind the range estimator only covers small
    samples; anything else passes silently.
    """
    if n <= 25 or sigma_estimator != "Range (bias corrected)":
        return
    raise ValueError(
        "Range-based confidence intervals require n ≤ 25. "
        "Use another estimator or bootstrap."
    )
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def _prepare_series(
    df: pd.DataFrame,
    column: str,
    weights_col: Optional[str],
) -> tuple[np.ndarray, Optional[np.ndarray]]:
    """Extract a column (and optional weights) as aligned numpy arrays.

    Missing values are dropped from the data column; when a weights column
    is supplied, rows whose weight is missing are dropped too, so the two
    returned arrays stay aligned on the surviving index.

    Raises ValueError if no dataframe is loaded, the column (or weights
    column) is absent, or the column has no non-missing values.
    """
    if df is None:
        raise ValueError("No data loaded. Please load a dataset first.")
    if column not in df.columns:
        raise ValueError(f"Column '{column}' not found in the dataframe.")

    values = df[column].dropna()
    if values.empty:
        raise ValueError(f"Column '{column}' has no non-missing values.")

    if weights_col is None:
        return values.to_numpy(), None

    if weights_col not in df.columns:
        raise ValueError(
            f"Weights column '{weights_col}' not found in the dataframe."
        )

    aligned = df[weights_col].reindex(values.index).dropna()
    shared = values.index.intersection(aligned.index)
    return (
        values.loc[shared].to_numpy(),
        aligned.loc[shared].to_numpy(),
    )
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def run_graphical_analysis(
    *,
    df: pd.DataFrame,
    column: str,
    graph_type: str,
    # Histogram / PMF controls
    add_kde: bool,
    add_data: bool,
    add_normal: bool,
    add_ci: bool,
    ci_choice: str,
    add_pi: bool,
    pi_choice: str,
    # Estimators
    mean_estimator: str,
    median_estimator: str,
    sigma_estimator: str,
    trim_param,
    winsor_limits,
    weights_col: Optional[str],
    # Normal μ source
    normal_mu_source: str,
    # Bootstrap options
    bootstrap_mean: bool,
    bootstrap_median: bool,
    bootstrap_sigma: bool,
    bootstrap_prediction: bool,
    bootstrap_samples: int,
    # CI/PI confidence level
    ci_pi_conf_level: float,
    # ECDF controls
    ecdf_add_conf: bool,
    ecdf_conf_level: float,
    ecdf_add_normal: bool,
):
    """Validate inputs, then dispatch to the histogram/PMF or ECDF renderer.

    Returns the matplotlib figure produced by the chosen renderer.
    Raises ValueError for an out-of-range confidence level or an
    unrecognised graph type.
    """
    data, weights = _prepare_series(df, column, weights_col)

    if not (0.0 < ci_pi_conf_level < 1.0):
        raise ValueError("Confidence level for CI/PI must be in (0, 1).")

    hist_like = ("Histogram", "Empirical Probability Mass Function")
    if graph_type in hist_like:
        return _run_hist_or_pmf(
            data=data,
            var_name=column,
            graph_type=graph_type,
            add_kde=add_kde,
            add_data=add_data,
            add_normal=add_normal,
            add_ci=add_ci,
            ci_choice=ci_choice,
            add_pi=add_pi,
            pi_choice=pi_choice,
            mean_estimator=mean_estimator,
            median_estimator=median_estimator,
            sigma_estimator=sigma_estimator,
            trim_param=trim_param,
            winsor_limits=winsor_limits,
            weights=weights,
            normal_mu_source=normal_mu_source,
            bootstrap_mean=bootstrap_mean,
            bootstrap_median=bootstrap_median,
            bootstrap_sigma=bootstrap_sigma,
            bootstrap_prediction=bootstrap_prediction,
            bootstrap_samples=bootstrap_samples,
            ci_pi_conf_level=ci_pi_conf_level,
        )

    if graph_type == "Empirical Cumulative Distribution Function (ECDF)":
        return _run_ecdf(
            data=data,
            var_name=column,
            ecdf_add_conf=ecdf_add_conf,
            ecdf_conf_level=ecdf_conf_level,
            ecdf_add_normal=ecdf_add_normal,
            mean_estimator=mean_estimator,
            sigma_estimator=sigma_estimator,
            trim_param=trim_param,
            winsor_limits=winsor_limits,
            weights=weights,
            normal_mu_source=normal_mu_source,
        )

    raise ValueError(f"Unknown graph type: {graph_type}")
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
def _run_hist_or_pmf(
    *,
    data: np.ndarray,
    var_name: str,
    graph_type: str,
    add_kde: bool,
    add_data: bool,
    add_normal: bool,
    add_ci: bool,
    ci_choice: str,
    add_pi: bool,
    pi_choice: str,
    mean_estimator: str,
    median_estimator: str,
    sigma_estimator: str,
    trim_param,
    winsor_limits,
    weights: Optional[np.ndarray],
    normal_mu_source: str,
    bootstrap_mean: bool,
    bootstrap_median: bool,
    bootstrap_sigma: bool,
    bootstrap_prediction: bool,
    bootstrap_samples: int,
    ci_pi_conf_level: float,
):
    """Render a histogram or empirical PMF with optional overlays.

    Overlays are a fitted Normal curve, mean/median confidence intervals,
    and a prediction interval; each is computed only when its flag is set.
    Returns the figure from ``plot_histogram_with_overlays``.
    """
    alpha = 1.0 - ci_pi_conf_level

    validate_deviation_estimator(
        sigma_estimator=sigma_estimator,
        n=len(data),
    )

    mu_hat = None
    sigma_hat = None
    mean_ci = None
    median_ci = None
    pred_interval = None

    # --- Parameters for the Normal overlay ---
    if add_normal:
        if normal_mu_source == "Mean-based CI":
            mu_hat = estimate_mean(
                data,
                mean_estimator,
                trim_param=trim_param,
                winsor_limits=winsor_limits,
                weights=weights,
            )
        else:
            # Median-based centre for the Normal curve.
            mu_hat = float(np.median(data))

        sigma_hat = estimate_sigma(
            data=data,
            estimator=sigma_estimator,
        )

    # --- Confidence intervals ---
    if add_ci:
        dist = select_distribution(mean_estimator, sigma_estimator)

        # CI for the mean: bootstrap or analytic.
        if bootstrap_mean:
            mean_ci = ci_mean_bootstrap(
                data=data,
                estimator=mean_estimator,
                alpha=alpha,
                trim_param=trim_param,
                winsor_limits=winsor_limits,
                weights=weights,
                B=bootstrap_samples,
            )
        else:
            mean_ci = ci_mean_analytic(
                data=data,
                estimator=mean_estimator,
                alpha=alpha,
                dist=dist,
                sigma_estimator=sigma_estimator,
                trim_param=trim_param,
                winsor_limits=winsor_limits,
                weights=weights,
            )

        # CI for the median: bootstrap or analytic.
        if bootstrap_median:
            median_ci = ci_median_bootstrap(
                data=data,
                alpha=alpha,
                B=bootstrap_samples,
            )
        else:
            median_ci = ci_median_analytic(
                data=data,
                alpha=alpha,
                sigma_estimator=sigma_estimator,
            )

        # Keep only the interval(s) the user asked for (Mean / Median / Both).
        if ci_choice == "Mean":
            median_ci = None
        elif ci_choice == "Median":
            mean_ci = None

    # --- Prediction intervals ---
    if add_pi:
        dist = select_distribution(mean_estimator, sigma_estimator)
        if pi_choice == "Mean":
            pred_interval = pi_mean(
                data=data,
                alpha=alpha,
                estimator=mean_estimator,
                dist=dist,
                sigma_estimator=sigma_estimator,
                trim_param=trim_param,
                winsor_limits=winsor_limits,
                weights=weights,
            )
        elif pi_choice == "Median":
            # pi_median only needs data, alpha and sigma_estimator.
            pred_interval = pi_median(
                data=data,
                alpha=alpha,
                sigma_estimator=sigma_estimator,
            )
        elif pi_choice == "IQR":
            pred_interval = pi_iqr(
                data=data,
                alpha=alpha,
            )
        elif pi_choice == "Bootstrap":
            if not bootstrap_prediction:
                raise ValueError(
                    "To use the Bootstrap prediction interval, enable the "
                    "'Bootstrap Prediction' option in the estimator settings."
                )
            pred_interval = pi_bootstrap(
                data=data,
                alpha=alpha,
                B=bootstrap_samples,
            )
        else:
            raise ValueError(
                f"Unknown prediction-interval choice: {pi_choice}"
            )

    return plot_histogram_with_overlays(
        data=data,
        graph_type=graph_type,
        var_name=var_name,
        add_kde=add_kde,
        add_data=add_data,
        add_normal=add_normal,
        hat_mu=mu_hat,
        hat_sigma=sigma_hat,
        ci_mean_interval=mean_ci,
        ci_median_interval=median_ci,
        pi_interval=pred_interval,
    )
|
| 327 |
+
|
| 328 |
+
|
| 329 |
+
def _run_ecdf(
    *,
    data: np.ndarray,
    var_name: str,
    ecdf_add_conf: bool,
    ecdf_conf_level: float,
    ecdf_add_normal: bool,
    mean_estimator: str,
    sigma_estimator: str,
    trim_param,
    winsor_limits,
    weights: Optional[np.ndarray],
    normal_mu_source: str,
):
    """Build the ECDF plot, optionally with a confidence band and a fitted
    normal CDF overlay.

    Parameters
    ----------
    data : np.ndarray
        The (already cleaned) numeric sample.
    var_name : str
        Variable name used for axis labelling.
    ecdf_add_conf : bool
        Whether to draw the ECDF confidence band.
    ecdf_conf_level : float
        Confidence level in (0, 1); converted to alpha below.
    ecdf_add_normal : bool
        Whether to overlay a normal CDF with estimated (mu, sigma).
    normal_mu_source : str
        Which location estimate to use for mu: "Mean-based CI" selects
        the configured mean estimator, anything else uses the median.

    Returns
    -------
    The matplotlib figure produced by ``plot_ecdf``.

    Raises
    ------
    ValueError
        If the confidence level is outside (0, 1), or if the deviation
        estimator is invalid for this sample size.
    """
    if not (0.0 < ecdf_conf_level < 1.0):
        raise ValueError("ECDF confidence level must be in (0, 1).")

    # plot_ecdf expects a significance level, not a confidence level.
    alpha = 1.0 - ecdf_conf_level

    n = len(data)
    # Rejects estimator/sample-size combinations that are unsupported
    # (e.g. range-based deviation for large n).
    validate_deviation_estimator(
        sigma_estimator=sigma_estimator,
        n=n,
    )

    # (mu, sigma) are only needed when the normal overlay is requested.
    hat_mu = None
    hat_sigma = None

    if ecdf_add_normal:
        if normal_mu_source == "Mean-based CI":
            hat_mu = estimate_mean(
                data,
                mean_estimator,
                trim_param=trim_param,
                winsor_limits=winsor_limits,
                weights=weights,
            )
        else:
            # Median-based location estimate.
            hat_mu = float(np.median(data))

        hat_sigma = estimate_sigma(
            data=data,
            estimator=sigma_estimator,
        )

    fig = plot_ecdf(
        data=data,
        var_name=var_name,
        alpha=alpha,
        add_conf_band=ecdf_add_conf,
        add_normal=ecdf_add_normal,
        hat_mu=hat_mu,
        hat_sigma=hat_sigma,
    )
    return fig
|
controllers/estimation/inference_controller.py
ADDED
|
@@ -0,0 +1,300 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
|
| 3 |
+
from core.estimation.inference.ci import (
|
| 4 |
+
ci_mean_analytic,
|
| 5 |
+
ci_median_analytic,
|
| 6 |
+
ci_deviation_analytic,
|
| 7 |
+
ci_mean_bootstrap,
|
| 8 |
+
ci_median_bootstrap,
|
| 9 |
+
ci_deviation_bootstrap,
|
| 10 |
+
)
|
| 11 |
+
|
| 12 |
+
from core.estimation.inference.pi import (
|
| 13 |
+
pi_mean,
|
| 14 |
+
pi_median,
|
| 15 |
+
pi_iqr,
|
| 16 |
+
pi_bootstrap,
|
| 17 |
+
)
|
| 18 |
+
|
| 19 |
+
from core.estimation.inference.confidence_regions import confidence_regions
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
# ---------------------------------------------------------------------
|
| 23 |
+
# Utilities
|
| 24 |
+
# ---------------------------------------------------------------------
|
| 25 |
+
|
| 26 |
+
def select_distribution(mean_estimator: str, sigma_estimator: str) -> str:
    """Choose the sampling distribution used by the analytic intervals.

    The Student t distribution is only exact for the plain sample mean
    paired with the ddof=1 sample standard deviation; any other estimator
    combination falls back to the normal approximation.
    """
    uses_t = (
        mean_estimator == "Sample Mean"
        and sigma_estimator == "Deviation (1 ddof)"
    )
    return "t" if uses_t else "norm"
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def validate_deviation_estimator(*, sigma_estimator: str, n: int):
    """Reject estimator/sample-size combinations that are not supported.

    The bias-corrected range estimator is only reliable for small
    samples, so it is refused once n exceeds 25.

    Raises
    ------
    ValueError
        When the range-based estimator is requested with n > 25.
    """
    if sigma_estimator != "Range (bias corrected)":
        return
    if n > 25:
        raise ValueError(
            "Range-based confidence intervals require n ≤ 25. "
            "Use another estimator or bootstrap."
        )
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
# ---------------------------------------------------------------------
|
| 41 |
+
# Confidence Intervals
|
| 42 |
+
# ---------------------------------------------------------------------
|
| 43 |
+
|
| 44 |
+
def run_confidence_intervals(
    *,
    data,
    alpha,
    mean_estimator,
    median_estimator,
    sigma_estimator,
    trim_param=None,
    winsor_limits=None,
    weights=None,
    bootstrap_mean=False,
    bootstrap_median=False,
    bootstrap_deviation=False,
    bootstrap_samples=1000,
):
    """Compute confidence intervals for the mean, median and deviation.

    Each statistic can independently use either its analytic CI or a
    bootstrap CI with ``bootstrap_samples`` resamples.

    Returns
    -------
    (table, mean_ci, sigma_ci, median_ci)
        ``table`` is a DataFrame with one row per statistic; the three
        CI values are (lower, upper) pairs.

    NOTE(review): ``median_estimator`` is accepted but never used here —
    presumably kept for signature symmetry with the other runners; confirm
    against the UI wiring.
    """
    n = len(data)

    # Fail fast on estimator/sample-size combinations we cannot handle.
    validate_deviation_estimator(
        sigma_estimator=sigma_estimator,
        n=n,
    )

    # "t" only for sample mean + ddof=1 deviation, otherwise "norm".
    dist = select_distribution(mean_estimator, sigma_estimator)

    # ---------------- Mean ----------------
    if bootstrap_mean:
        mean_ci = ci_mean_bootstrap(
            data=data,
            estimator=mean_estimator,
            alpha=alpha,
            B=bootstrap_samples,
            trim_param=trim_param,
            winsor_limits=winsor_limits,
            weights=weights,
        )
    else:
        mean_ci = ci_mean_analytic(
            data=data,
            estimator=mean_estimator,
            alpha=alpha,
            dist=dist,
            sigma_estimator=sigma_estimator,
            trim_param=trim_param,
            winsor_limits=winsor_limits,
            weights=weights,
        )

    # ---------------- Median ----------------
    if bootstrap_median:
        median_ci = ci_median_bootstrap(
            data=data,
            alpha=alpha,
            B=bootstrap_samples,
        )
    else:
        median_ci = ci_median_analytic(
            data=data,
            alpha=alpha,
            sigma_estimator=sigma_estimator,
        )

    # ---------------- Deviation ----------------
    if bootstrap_deviation:
        sigma_ci = ci_deviation_bootstrap(
            data=data,
            alpha=alpha,
            B=bootstrap_samples,
            estimator=sigma_estimator,
        )
    else:
        sigma_ci = ci_deviation_analytic(
            data=data,
            alpha=alpha,
            estimator=sigma_estimator,
        )

    # One row per statistic; *ci unpacks the (lower, upper) pair.
    table = pd.DataFrame(
        [
            ["Confidence", "Mean", *mean_ci],
            ["Confidence", "Median", *median_ci],
            ["Confidence", "Deviation", *sigma_ci],
        ],
        columns=["Interval Type", "Statistic", "Lower", "Upper"],
    )

    return table, mean_ci, sigma_ci, median_ci
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
# ---------------------------------------------------------------------
|
| 133 |
+
# Prediction Intervals
|
| 134 |
+
# ---------------------------------------------------------------------
|
| 135 |
+
|
| 136 |
+
def run_prediction_intervals(
    *,
    data,
    alpha,
    mean_estimator,
    median_estimator,
    sigma_estimator,
    trim_param=None,
    winsor_limits=None,
    weights=None,
    bootstrap=False,
    bootstrap_samples=1000,
):
    """Compute mean-, median- and IQR-based prediction intervals.

    A bootstrap PI row is appended only when ``bootstrap`` is True.

    Returns
    -------
    pd.DataFrame
        Columns: Interval Type, Statistic, Lower, Upper — one row per PI.

    NOTE(review): ``median_estimator`` is accepted but unused (pi_median
    only needs the deviation estimator); confirm against the UI wiring.
    """
    dist = select_distribution(mean_estimator, sigma_estimator)

    rows = []

    # Mean-based PI
    mean_pi = pi_mean(
        data=data,
        alpha=alpha,
        estimator=mean_estimator,
        dist=dist,
        sigma_estimator=sigma_estimator,
        trim_param=trim_param,
        winsor_limits=winsor_limits,
        weights=weights,
    )
    rows.append(["Prediction", "Mean", *mean_pi])

    # Median-based PI (uses same deviation estimator)
    median_pi = pi_median(
        data=data,
        alpha=alpha,
        sigma_estimator=sigma_estimator,
    )
    rows.append(["Prediction", "Median", *median_pi])

    # IQR-based PI
    iqr_pi = pi_iqr(
        data=data,
        alpha=alpha,
    )
    rows.append(["Prediction", "IQR", *iqr_pi])

    # Optional bootstrap PI
    if bootstrap:
        boot_pi = pi_bootstrap(
            data=data,
            alpha=alpha,
            B=bootstrap_samples,
        )
        rows.append(["Prediction", "Bootstrap", *boot_pi])

    return pd.DataFrame(
        rows,
        columns=["Interval Type", "Statistic", "Lower", "Upper"],
    )
|
| 194 |
+
|
| 195 |
+
# ---------------------------------------------------------------------
|
| 196 |
+
# Confidence Regions
|
| 197 |
+
# ---------------------------------------------------------------------
|
| 198 |
+
|
| 199 |
+
def run_confidence_regions(
    *,
    data,
    alpha,
    mean_estimator,
    median_estimator,
    sigma_estimator,
    trim_param,
    winsor_limits,
    weights,
    bootstrap_mean,
    bootstrap_median,
    bootstrap_deviation,
    bootstrap_samples,
    mu_ci_source,
    probs,
    eps_mu,
    eps_sigma,
    add_ci_box,
):
    """
    Use the CI machinery to compute CIs for mean, median and deviation,
    then choose which CI to use for μ (mean-based or median-based) and
    pass that CI plus the σ CI into the likelihood-based confidence
    regions function.

    Parameters of note:
        mu_ci_source: "Median-based CI" selects the median CI for μ;
            any other value defaults to the mean CI.
        probs, eps_mu, eps_sigma, add_ci_box: forwarded verbatim to
            ``confidence_regions`` (grid/contour configuration).

    Returns the figure produced by ``confidence_regions``.
    """

    # The returned ci_table is intentionally discarded; only the raw CI
    # tuples are needed here.
    ci_table, mean_ci, sigma_ci, median_ci = run_confidence_intervals(
        data=data,
        alpha=alpha,
        mean_estimator=mean_estimator,
        median_estimator=median_estimator,
        sigma_estimator=sigma_estimator,
        trim_param=trim_param,
        winsor_limits=winsor_limits,
        weights=weights,
        bootstrap_mean=bootstrap_mean,
        bootstrap_median=bootstrap_median,
        bootstrap_deviation=bootstrap_deviation,
        bootstrap_samples=bootstrap_samples,
    )

    if mu_ci_source == "Median-based CI":
        mu_ci = median_ci
    else:
        # default: mean-based CI
        mu_ci = mean_ci

    fig = confidence_regions(
        data=data,
        mean_ci=mu_ci,
        sigma_ci=sigma_ci,
        probs=probs,
        eps_mu=eps_mu,
        eps_sigma=eps_sigma,
        add_ci_box=add_ci_box,
    )

    return fig
|
| 258 |
+
|
| 259 |
+
|
| 260 |
+
# ---------------------------------------------------------------------
|
| 261 |
+
# Combined Runner (used by UI)
|
| 262 |
+
# ---------------------------------------------------------------------
|
| 263 |
+
|
| 264 |
+
def run_intervals(
    *,
    data,
    alpha,
    mean_estimator,
    median_estimator,
    sigma_estimator,
    bootstrap_mean,
    bootstrap_median,
    bootstrap_deviation,
    bootstrap_samples,
):
    """Combined runner used by the UI: compute CI and PI tables.

    Returns
    -------
    (ci_table, pi_table, combined)
        ``combined`` is the row-wise concatenation of the two tables.
    """
    # Bug fix: run_confidence_intervals returns FOUR values
    # (table, mean_ci, sigma_ci, median_ci). The original unpacked only
    # three, which raised "too many values to unpack" on every call.
    ci_table, mean_ci, sigma_ci, median_ci = run_confidence_intervals(
        data=data,
        alpha=alpha,
        mean_estimator=mean_estimator,
        median_estimator=median_estimator,
        sigma_estimator=sigma_estimator,
        bootstrap_mean=bootstrap_mean,
        bootstrap_median=bootstrap_median,
        bootstrap_deviation=bootstrap_deviation,
        bootstrap_samples=bootstrap_samples,
    )

    # The bootstrap PI is toggled by the mean-bootstrap flag (there is no
    # dedicated UI switch for it here).
    pi_table = run_prediction_intervals(
        data=data,
        alpha=alpha,
        mean_estimator=mean_estimator,
        median_estimator=median_estimator,
        sigma_estimator=sigma_estimator,
        bootstrap=bootstrap_mean,
        bootstrap_samples=bootstrap_samples,
    )

    combined = pd.concat([ci_table, pi_table], ignore_index=True)

    return ci_table, pi_table, combined
|
controllers/hypothesis_controller.py
ADDED
|
@@ -0,0 +1,204 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from typing import Iterable, Tuple
|
| 4 |
+
|
| 5 |
+
import numpy as np
|
| 6 |
+
import pandas as pd
|
| 7 |
+
|
| 8 |
+
from core.hypothesis_tests import (
|
| 9 |
+
one_sample_ttest,
|
| 10 |
+
two_sample_ttest,
|
| 11 |
+
variance_test,
|
| 12 |
+
one_way_anova,
|
| 13 |
+
)
|
| 14 |
+
|
| 15 |
+
ROUND = 4
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def _round_table(table: pd.DataFrame, decimals: int = ROUND) -> pd.DataFrame:
    """Round only numeric columns of the result table.

    ``None`` input and non-numeric columns pass through untouched; the
    input table itself is never mutated.
    """
    if table is None:
        return table

    rounded = table.copy()
    numeric_cols = rounded.select_dtypes(include="number").columns
    for col in numeric_cols:
        rounded[col] = rounded[col].round(decimals)
    return rounded
|
| 27 |
+
|
| 28 |
+
def _ensure_numeric_series(df: pd.DataFrame, column: str) -> np.ndarray:
|
| 29 |
+
if df is None:
|
| 30 |
+
raise ValueError("No dataset loaded.")
|
| 31 |
+
if column not in df.columns:
|
| 32 |
+
raise ValueError(f"Column '{column}' not found in the dataset.")
|
| 33 |
+
|
| 34 |
+
series = df[column].dropna()
|
| 35 |
+
if series.empty:
|
| 36 |
+
raise ValueError("No valid data in the selected column.")
|
| 37 |
+
return series.to_numpy()
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def _materialize_group(
|
| 41 |
+
df: pd.DataFrame,
|
| 42 |
+
numeric_col: str,
|
| 43 |
+
cat_col: str | None,
|
| 44 |
+
cat_vals: Iterable[str],
|
| 45 |
+
) -> np.ndarray:
|
| 46 |
+
if cat_col is None:
|
| 47 |
+
raise ValueError("No categorical column selected.")
|
| 48 |
+
|
| 49 |
+
if cat_col not in df.columns:
|
| 50 |
+
raise ValueError(f"Categorical column '{cat_col}' not found in the dataset.")
|
| 51 |
+
|
| 52 |
+
# Cast selected values to the actual dtype of the column
|
| 53 |
+
if cat_vals is None:
|
| 54 |
+
values = []
|
| 55 |
+
else:
|
| 56 |
+
values = list(cat_vals)
|
| 57 |
+
|
| 58 |
+
if not values:
|
| 59 |
+
raise ValueError(f"No categories selected for column '{cat_col}'.")
|
| 60 |
+
|
| 61 |
+
cat_series = pd.Series(values).astype(df[cat_col].dtype)
|
| 62 |
+
mask = df[cat_col].isin(cat_series)
|
| 63 |
+
series = df.loc[mask, numeric_col].dropna()
|
| 64 |
+
|
| 65 |
+
if series.empty:
|
| 66 |
+
raise ValueError("One or more groups are empty after filtering.")
|
| 67 |
+
return series.to_numpy()
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
def run_hypothesis_testing(
    *,
    df: pd.DataFrame | None,
    numeric_col: str,
    hypo_test: str,
    mu0_text: str,
    alternative: str,
    include_graph: bool,
    bootstrap_samples: int,
    cat_col1: str | None,
    cat_vals1: list[str],
    name_group1: str,
    cat_col2: str | None,
    cat_vals2: list[str],
    name_group2: str,
    cat_col3: str | None,
    cat_vals3: list[str],
    plot_type: str,
    correction: bool,
    test_type: str,
) -> Tuple[pd.DataFrame, object | None]:
    """
    High-level dispatcher used by the Hypothesis Testing tab.

    Routes the UI selection in ``hypo_test`` to the matching core test,
    materialising the requested groups, then rounds the numeric columns
    of the resulting table.

    Returns:
        (result_table, figure_or_none)

    Raises:
        ValueError for missing data, invalid inputs, or an unknown test.
    """
    if df is None:
        raise ValueError("No dataset loaded.")

    # Common numeric data check — the result is discarded: this only
    # validates that the column exists and has non-NaN data.
    _ = _ensure_numeric_series(df, numeric_col)

    # ------------------------------------------------------------
    # One-sample t-test
    # ------------------------------------------------------------
    if hypo_test == "One sample Student's t-test":
        # NOTE(review): assumes mu0_text is a str — a None here would
        # raise AttributeError before the friendly message; confirm the
        # UI always passes a string.
        if not mu0_text.strip():
            raise ValueError("μ₀ must be specified for the one-sample t-test.")
        try:
            mu0 = float(mu0_text)
        except Exception:
            raise ValueError("μ₀ must be a numeric value.")

        sample = df[numeric_col].dropna().to_numpy()

        table, fig = one_sample_ttest(
            sample=sample,
            mu0=mu0,
            alternative=alternative,
            numeric_col=numeric_col,
            bootstrap_samples=bootstrap_samples,
            include_graph=include_graph,
        )
        table = _round_table(table)
        return table, fig

    # ------------------------------------------------------------
    # Two-sample t-test
    # ------------------------------------------------------------
    if hypo_test == "Two samples Student's t-test":
        group1 = _materialize_group(df, numeric_col, cat_col1, cat_vals1)
        group2 = _materialize_group(df, numeric_col, cat_col2, cat_vals2)

        # If names are empty, fall back to defaults
        name1 = name_group1 or "Group 1"
        name2 = name_group2 or "Group 2"

        table, fig = two_sample_ttest(
            group1=group1,
            group2=group2,
            numeric_col=numeric_col,
            name_group1=name1,
            name_group2=name2,
            alternative=alternative,
            correction=correction,
            plot_type=plot_type,
            bootstrap_samples=bootstrap_samples,
            include_graph=include_graph,
        )
        table = _round_table(table)
        return table, fig

    # ------------------------------------------------------------
    # Equal variance between two groups
    # ------------------------------------------------------------
    if hypo_test == "Equal variance between two groups":
        group1 = _materialize_group(df, numeric_col, cat_col1, cat_vals1)
        group2 = _materialize_group(df, numeric_col, cat_col2, cat_vals2)

        name1 = name_group1 or "Group 1"
        name2 = name_group2 or "Group 2"

        table, fig = variance_test(
            group1=group1,
            group2=group2,
            name_group1=name1,
            name_group2=name2,
            test_type=test_type,
            include_graph=include_graph,
            bootstrap_samples=bootstrap_samples,
        )
        table = _round_table(table)
        return table, fig

    # ------------------------------------------------------------
    # One-way ANOVA
    # ------------------------------------------------------------
    if hypo_test == "One-way ANOVA":
        if cat_col3 is None:
            raise ValueError("A categorical column must be selected for ANOVA.")

        if cat_col3 not in df.columns:
            raise ValueError(
                f"Categorical column '{cat_col3}' not found in the dataset."
            )

        if not cat_vals3:
            raise ValueError("At least one category must be selected for ANOVA.")

        # Cast selected values to the column dtype so isin() matches,
        # then keep only the two relevant columns without NaNs.
        cat_series = pd.Series(cat_vals3).astype(df[cat_col3].dtype)
        data_group = df[df[cat_col3].isin(cat_series)][[numeric_col, cat_col3]].dropna()

        table, fig = one_way_anova(
            data_group=data_group,
            numeric_col=numeric_col,
            cat_col=cat_col3,
        )
        table = _round_table(table)
        return table, fig

    # ------------------------------------------------------------
    # Fallback
    # ------------------------------------------------------------
    raise ValueError(f"Unknown hypothesis test: {hypo_test}")
|
controllers/linear_regression_controller.py
ADDED
|
@@ -0,0 +1,160 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from typing import List, Optional, Sequence, Tuple
|
| 4 |
+
|
| 5 |
+
from matplotlib.figure import Figure
|
| 6 |
+
import numpy as np
|
| 7 |
+
import pandas as pd
|
| 8 |
+
|
| 9 |
+
from core.linear_regression import run_linear_regression as _run_linear_regression
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def _select_working_dataframe(
|
| 13 |
+
df: Optional[pd.DataFrame],
|
| 14 |
+
filtered_df: Optional[pd.DataFrame],
|
| 15 |
+
) -> pd.DataFrame:
|
| 16 |
+
"""
|
| 17 |
+
Use the filtered dataframe if it is non-empty; otherwise fall back to the
|
| 18 |
+
original dataframe. This mirrors the behaviour used in other tabs.
|
| 19 |
+
"""
|
| 20 |
+
if df is None:
|
| 21 |
+
raise ValueError("No dataset loaded.")
|
| 22 |
+
|
| 23 |
+
if filtered_df is not None and not filtered_df.empty:
|
| 24 |
+
return filtered_df
|
| 25 |
+
|
| 26 |
+
if df.empty:
|
| 27 |
+
raise ValueError("The dataset is empty.")
|
| 28 |
+
|
| 29 |
+
return df
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def _parse_confidence_level(text: str) -> float:
|
| 33 |
+
"""
|
| 34 |
+
Parse a confidence level like '0.95' into an alpha value for statsmodels.
|
| 35 |
+
|
| 36 |
+
Returns
|
| 37 |
+
-------
|
| 38 |
+
alpha : float
|
| 39 |
+
Significance level (e.g. 0.05 for a 95% confidence level).
|
| 40 |
+
"""
|
| 41 |
+
s = str(text).strip()
|
| 42 |
+
if not s:
|
| 43 |
+
raise ValueError("Confidence level is required (e.g. 0.95).")
|
| 44 |
+
try:
|
| 45 |
+
level = float(s)
|
| 46 |
+
except ValueError as exc:
|
| 47 |
+
raise ValueError("Confidence level must be a numeric value between 0 and 1.") from exc
|
| 48 |
+
|
| 49 |
+
if not (0 < level < 1):
|
| 50 |
+
raise ValueError("Confidence level must be between 0 and 1 (e.g. 0.95).")
|
| 51 |
+
|
| 52 |
+
# statsmodels expects alpha, not the confidence level itself
|
| 53 |
+
return 1.0 - level
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def _parse_range(text: str) -> Optional[np.ndarray]:
|
| 57 |
+
"""
|
| 58 |
+
Parse a range string like '0, 10' into a numpy array suitable for predictions.
|
| 59 |
+
|
| 60 |
+
Returns
|
| 61 |
+
-------
|
| 62 |
+
np.ndarray or None
|
| 63 |
+
If the string is empty or only whitespace, returns None.
|
| 64 |
+
Otherwise returns a 1-D array of 100 evenly spaced values between
|
| 65 |
+
the parsed minimum and maximum.
|
| 66 |
+
"""
|
| 67 |
+
s = str(text).strip()
|
| 68 |
+
if not s:
|
| 69 |
+
return None
|
| 70 |
+
|
| 71 |
+
parts = s.split(",")
|
| 72 |
+
if len(parts) != 2:
|
| 73 |
+
raise ValueError("Range must have the form 'min, max'.")
|
| 74 |
+
|
| 75 |
+
try:
|
| 76 |
+
lo = float(parts[0].strip())
|
| 77 |
+
hi = float(parts[1].strip())
|
| 78 |
+
except ValueError as exc:
|
| 79 |
+
raise ValueError("Range values must be numeric (e.g. '0, 10').") from exc
|
| 80 |
+
|
| 81 |
+
if lo >= hi:
|
| 82 |
+
raise ValueError("Range minimum must be strictly less than the maximum.")
|
| 83 |
+
|
| 84 |
+
return np.linspace(lo, hi, 100)
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
def run_linear_regression(
    *,
    df: Optional[pd.DataFrame],
    filtered_df: Optional[pd.DataFrame],
    formula_check: bool,
    formula_text: str,
    formula_latex: str,
    dependent_var: Optional[str],
    independent_vars: List[str],
    alpha_input: str,
    intercept: bool,
    graph_check: bool,
    graph_type: str,
    show_ci: bool,
    show_pi: bool,
    fit_to_obs: bool,
    x_range_text: str,
    round_digits: int = 4,
) -> Tuple[str, pd.DataFrame, Optional[Figure]]:
    """
    High-level controller used by the Linear Regression tab.

    This function takes raw user input from the UI, performs validation and
    parsing, calls the stats layer, and returns a tuple:

        (summary_html, params_df_rounded, figure)

    Any exceptions should be caught in the tab layer and turned into user-
    facing error messages.

    Raises:
        ValueError for a missing dataset, missing variable selections, a
        bad confidence level / x-range string, or an invalid graph setup.
    """
    # Prefer the (non-empty) filtered dataframe over the full one.
    working_df = _select_working_dataframe(df, filtered_df)

    if dependent_var is None or dependent_var == "":
        raise ValueError("Please select a dependent variable.")

    if not independent_vars:
        raise ValueError("Please select at least one independent variable.")

    # For the "Simple Regression" graph we require exactly one independent variable.
    if graph_check and graph_type == "Simple Regression" and len(independent_vars) != 1:
        raise ValueError(
            "The 'Simple Regression' graph is only available when exactly one "
            "independent variable is selected."
        )

    # Parse confidence level (returns alpha = 1 - level for statsmodels).
    alpha = _parse_confidence_level(alpha_input)

    # Parse X range only when needed: Simple Regression + graph + not fit_to_obs
    x_vector = None
    if graph_check and graph_type == "Simple Regression" and not fit_to_obs:
        x_vector = _parse_range(x_range_text)

    summary_html, params_df, fig = _run_linear_regression(
        df=working_df,
        formula_check=formula_check,
        formula_text=formula_text,
        formula_latex=formula_latex,
        dependent_var=dependent_var,
        independent_vars=independent_vars,
        alpha=alpha,
        intercept=intercept,
        create_graph=graph_check,
        graph_type=graph_type,
        show_ci=show_ci,
        show_pi=show_pi,
        fit_to_obs=fit_to_obs,
        x_vector=x_vector,
    )

    # Rounding happens here, not in the stats layer.
    params_df_rounded = params_df.round(round_digits)

    return summary_html, params_df_rounded, fig
|
controllers/utils/__pycache__/downloads.cpython-312.pyc
ADDED
|
Binary file (1.82 kB). View file
|
|
|
controllers/utils/__pycache__/downloads.cpython-313.pyc
ADDED
|
Binary file (1.81 kB). View file
|
|
|
controllers/utils/downloads.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import tempfile
|
| 3 |
+
import gradio as gr
|
| 4 |
+
|
| 5 |
+
def sanitize_filename(name: str, default: str):
    """Strip characters that are illegal in filenames.

    Falls back to *default* when the input is empty/blank or when nothing
    survives the cleanup.
    """
    if not name or not name.strip():
        return default
    cleaned = re.sub(r'[\\/*?:"<>|]', "", name).strip()
    return cleaned or default
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def dataframe_to_csv(df, filename):
    """Write *df* to a temporary CSV file and return its path.

    Returns None (after showing a UI warning) when there is no table to
    export. The caller is responsible for eventually deleting the file.
    """
    if df is None:
        gr.Warning("❌ No table available to download.")
        return None

    base = sanitize_filename(filename, "descriptive_statistics")

    # Bug fix: the original wrote via df.to_csv(tmp.name) while the
    # NamedTemporaryFile handle was still open in "w" mode — a second,
    # concurrent open of the same path, which fails on Windows. Create
    # the file, close our handle, then let pandas write to the path.
    tmp = tempfile.NamedTemporaryFile(
        delete=False,
        mode="w",
        suffix=".csv",
        prefix=base + "_",
        encoding="utf-8",
    )
    tmp.close()
    df.to_csv(tmp.name, index=False)
    return tmp.name
|
| 28 |
+
|
| 29 |
+
def figure_to_png(fig, filename: str):
    """Save *fig* as a temporary PNG file and return its path.

    Returns None when there is no figure. The caller is responsible for
    eventually deleting the file.
    """
    if fig is None:
        return None

    tmp = tempfile.NamedTemporaryFile(
        delete=False,
        suffix=".png",
        prefix=filename + "_"
    )
    # Bug fix: release our handle before savefig (re)opens the same path —
    # keeping it open fails on Windows and leaked the descriptor elsewhere.
    tmp.close()
    fig.savefig(tmp.name, dpi=200, bbox_inches="tight")
    return tmp.name
|
core/__init__.py
ADDED
|
File without changes
|
core/__pycache__/__init__.cpython-312.pyc
ADDED
|
Binary file (221 Bytes). View file
|
|
|
core/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (186 Bytes). View file
|
|
|
core/__pycache__/data_stats.cpython-312.pyc
ADDED
|
Binary file (6.36 kB). View file
|
|
|
core/__pycache__/data_stats.cpython-313.pyc
ADDED
|
Binary file (6.37 kB). View file
|
|
|
core/__pycache__/descriptive.cpython-313.pyc
ADDED
|
Binary file (6.91 kB). View file
|
|
|
core/__pycache__/hypothesis_tests.cpython-312.pyc
ADDED
|
Binary file (18.9 kB). View file
|
|
|
core/__pycache__/hypothesis_tests.cpython-313.pyc
ADDED
|
Binary file (18.4 kB). View file
|
|
|
core/__pycache__/linear_regression.cpython-312.pyc
ADDED
|
Binary file (12.4 kB). View file
|
|
|
core/__pycache__/linear_regression.cpython-313.pyc
ADDED
|
Binary file (12.1 kB). View file
|
|
|
core/__pycache__/statistic_plots.cpython-313.pyc
ADDED
|
Binary file (8.47 kB). View file
|
|
|
core/data_stats.py
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import numpy as np
|
| 3 |
+
import gradio as gr
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
ROUND = 4
|
| 7 |
+
|
| 8 |
+
def load_dataset(file):
    """
    Load CSV or Excel file.

    Parameters
    ----------
    file : object with a ``.name`` path attribute (e.g. a Gradio upload),
        or None.

    Returns:
        df, status_message  (df is None on any failure)
    """
    if file is None:
        return None, "No file uploaded."

    try:
        path = Path(file.name)
        # Compare the extension case-insensitively so ".CSV" / ".XLSX"
        # uploads are accepted too (the original check was case-sensitive).
        suffix = path.suffix.lower()

        if suffix == ".csv":
            df = pd.read_csv(path)
        elif suffix in (".xlsx", ".xls"):
            df = pd.read_excel(path)
        else:
            return None, "Unsupported file format."

        return df, f"Loaded dataset with {df.shape[0]} rows and {df.shape[1]} columns."

    except Exception as e:
        # Surface the parse/IO failure as a status message instead of crashing the UI.
        return None, f"Error loading file: {e}"
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def dataset_summary(df: pd.DataFrame):
    """Return ``df.describe()`` as a tidy table with unique counts.

    Numeric cells are rendered as fixed-precision strings (ROUND decimals);
    returns None when no dataframe is loaded.
    """
    if df is None:
        return None

    table = df.describe(include="all").transpose()
    table = table.reset_index().rename(columns={"index": "variable"})

    # nunique() follows df's column order, matching describe()'s row order.
    table["unique"] = df.nunique(dropna=True).values

    wanted = (
        "variable",
        "count",
        "unique",
        "mean",
        "std",
        "min",
        "25%",
        "50%",
        "75%",
        "max",
    )
    table = table[[name for name in wanted if name in table.columns]]

    def _fmt(value):
        # Only genuine numbers get reformatted; NaN/strings pass through apply unchanged.
        return f"{value:.{ROUND}f}" if isinstance(value, (int, float)) else value

    for name in table.columns:
        if name not in ("variable", "count", "unique"):
            table[name] = table[name].apply(_fmt)

    return table
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def variable_types(df):
    """Return a two-column table (Variable, Type) listing df's dtypes,
    or None when no dataframe is loaded."""
    if df is None:
        return None

    dtype_table = df.dtypes.reset_index()
    # reset_index() on the unnamed dtypes Series yields columns "index" and 0.
    return dtype_table.rename(columns={"index": "Variable", 0: "Type"})
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
def column_choices_single(cols: list[str]):
    """Gradio update for a single-select dropdown: new choices, selection cleared."""
    return gr.update(choices=cols, value=None)
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
def column_choices_multi(cols: list[str]):
    """Gradio update for a multiselect: new choices, selection cleared (empty list)."""
    return gr.update(choices=cols, value=[])
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
def category_value_choices(df, col):
    """Gradio update listing the distinct non-null values of *col* in *df*.

    The component is hidden (with empty choices) when there is no data or
    the column is unknown.
    """
    usable = df is not None and col is not None and col in df.columns
    if not usable:
        return gr.update(visible=False, choices=[], value=[])

    distinct = df[col].dropna().unique().tolist()

    # value must stay a list: the target component is a multiselect.
    return gr.update(visible=True, choices=sorted(distinct), value=[])
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
def infer_column_types(df: pd.DataFrame):
    """Split df's columns into (numeric, categorical), each sorted alphabetically."""
    numeric = list(df.select_dtypes(include=[np.number]).columns)
    non_numeric = list(df.select_dtypes(exclude=[np.number]).columns)
    return sorted(numeric), sorted(non_numeric)
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
def apply_category_filters(
    df,
    cat_cols,
    val1,
    val2,
    val3,
):
    """Filter *df* by up to three categorical columns.

    val1..val3 are the accepted-value lists for the first three entries of
    cat_cols; empty selections are ignored. Returns (dataframe, message).
    """
    if df is None:
        return None, "❌ No data loaded."

    selections = [val1, val2, val3]
    if not cat_cols or not any(selections):
        return df.copy(), "⚠️ No filters selected. Using full dataset."

    result = df.copy()
    # Only the first three categorical columns can carry a selection.
    for column, accepted in zip(cat_cols[:3], selections):
        if accepted:
            result = result[result[column].isin(accepted)]

    return result, f"✅ Filter applied. Rows remaining: {len(result)}"
|
| 134 |
+
|
| 135 |
+
def reclassify_as_categorical(state, column):
    """Move *column* from state.numeric_cols into state.categorical_cols.

    Clears state.active_filters on success (stale filters may reference the
    moved column). Returns (success, message).
    """
    if not column or column not in state.numeric_cols:
        return False, f"Column '{column}' is not numeric."

    state.numeric_cols.remove(column)
    state.categorical_cols.append(column)
    state.active_filters = {}  # reset filters
    return True, f"Column '{column}' reclassified as categorical."
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
def reclassify_as_numeric(state, column):
    """Move *column* from state.categorical_cols into state.numeric_cols.

    Clears state.active_filters on success (stale filters may reference the
    moved column). Returns (success, message).
    """
    if not column or column not in state.categorical_cols:
        return False, f"Column '{column}' is not categorical."

    state.categorical_cols.remove(column)
    state.numeric_cols.append(column)
    state.active_filters = {}  # reset filters
    return True, f"Column '{column}' reclassified as numeric."
|
core/estimation/__init__.py
ADDED
|
File without changes
|
core/estimation/__pycache__/__init__.cpython-312.pyc
ADDED
|
Binary file (232 Bytes). View file
|
|
|
core/estimation/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (196 Bytes). View file
|
|
|
core/estimation/__pycache__/descriptive.cpython-312.pyc
ADDED
|
Binary file (7 kB). View file
|
|
|
core/estimation/__pycache__/descriptive.cpython-313.pyc
ADDED
|
Binary file (6.92 kB). View file
|
|
|
core/estimation/__pycache__/graphical_analysis.cpython-312.pyc
ADDED
|
Binary file (8.6 kB). View file
|
|
|
core/estimation/__pycache__/graphical_analysis.cpython-313.pyc
ADDED
|
Binary file (8.48 kB). View file
|
|
|
core/estimation/descriptive.py
ADDED
|
@@ -0,0 +1,181 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ui/stats/estimation/descriptive.py
|
| 2 |
+
|
| 3 |
+
from functools import lru_cache
|
| 4 |
+
import numpy as np
|
| 5 |
+
import pandas as pd
|
| 6 |
+
from scipy.stats import (
|
| 7 |
+
trim_mean,
|
| 8 |
+
gmean,
|
| 9 |
+
hmean,
|
| 10 |
+
skew,
|
| 11 |
+
kurtosis,
|
| 12 |
+
norm
|
| 13 |
+
)
|
| 14 |
+
from scipy.special import loggamma
|
| 15 |
+
from scipy.integrate import quad
|
| 16 |
+
from scipy.stats import median_abs_deviation
|
| 17 |
+
|
| 18 |
+
# ------------------------------------------------------------------
|
| 19 |
+
# Bias-correction constants (user-approved implementations)
|
| 20 |
+
# ------------------------------------------------------------------
|
| 21 |
+
|
| 22 |
+
@lru_cache(maxsize=None)
def c4(n: int) -> float:
    """Bias correction constant for standard deviation.

    Implements c4(n) = sqrt(2 / (n - 1)) * Gamma(n / 2) / Gamma((n - 1) / 2),
    evaluated in log space (loggamma) so the gamma ratio stays finite for
    large n. Cached because n is a small repeated sample size.
    """
    return np.exp(
        np.log(np.sqrt(2 / (n - 1)))
        + loggamma(n / 2)
        - loggamma((n - 1) / 2)
    )
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
@lru_cache(maxsize=None)
def d2(n: int) -> float:
    """Bias correction constant for the range.

    Evaluates the integral over the real line of
    ``1 - (1 - Phi(x))**n - Phi(x)**n`` where Phi is the standard normal CDF.
    Cached because n is a small repeated sample size.
    """
    # An inner def replaces the original name-bound lambda (PEP 8 E731),
    # and Phi(x) is computed once per evaluation instead of twice.
    def integrand(x: float, n: int) -> float:
        phi = norm.cdf(x)
        return 1 - (1 - phi) ** n - phi ** n

    return quad(integrand, -np.inf, np.inf, args=(n,))[0]
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
# ------------------------------------------------------------------
|
| 40 |
+
# Main computation function
|
| 41 |
+
# ------------------------------------------------------------------
|
| 42 |
+
|
| 43 |
+
def compute_descriptive_statistics(
    data,
    *,
    quantile_probs=(0.25, 0.5, 0.75),
    trim_alpha=None,
    winsor_limits=None,
    weights=None,
):
    """
    Compute all descriptive statistics for a single numeric variable.

    Parameters
    ----------
    data : array-like of numbers (NaNs are dropped before any computation).
    quantile_probs : probabilities for the quantile rows.
    trim_alpha : optional trimming fraction; adds a trimmed-mean row.
    winsor_limits : optional (lower, upper) limits; adds a winsorized-mean row.
    weights : optional weights aligned with *data*; adds a weighted-mean row.
        NOTE(review): weights are aligned via ``.loc[x.index]`` — presumably
        *weights* shares data's positional index; verify against callers.

    Returns
    -------
    pd.DataFrame with one row per statistic. Row schema:
    [Statistic Type, Measure, Value, Bias Corrected (NaN when N/A),
     Robust flag (0/1)].
    """

    # --- preparation ------------------------------------------------
    # Drop NaNs and force float so every downstream scipy/numpy call is safe.
    x = pd.Series(data).dropna().astype(float)
    n = len(x)

    rows = []

    # ----------------------------------------------------------------
    # Quantiles
    # ----------------------------------------------------------------
    probs = np.atleast_1d(quantile_probs)
    q_vals = np.quantile(x, probs)
    for p, q in zip(probs, q_vals):
        rows.append([
            "Quantiles",
            f"Q{p}",
            q,
            np.nan,
            0
        ])

    # ----------------------------------------------------------------
    # Central Tendency
    # ----------------------------------------------------------------

    mean = x.mean()
    median = np.median(x)
    # Interquartile mean = 25%-trimmed mean (average of the middle half).
    iq_mean = trim_mean(x, 0.25)

    rows.extend([
        ["Central Tendency", "Mean", mean, np.nan, 0],
        ["Central Tendency", "Median", median, np.nan, 1],
        ["Central Tendency", "Interquartile Mean", iq_mean, np.nan, 1],
    ])

    # Weighted mean (additional, never replaces mean)
    if weights is not None:
        w = pd.Series(weights).loc[x.index].astype(float)
        w_mean = np.average(x, weights=w)
        rows.append([
            "Central Tendency",
            "Weighted Mean",
            w_mean,
            np.nan,
            0
        ])

    # Trimmed mean
    if trim_alpha is not None:
        t_mean = trim_mean(x, trim_alpha)
        rows.append([
            "Central Tendency",
            f"Trimmed Mean ({trim_alpha})",
            t_mean,
            np.nan,
            1
        ])

    # Winsorized mean
    if winsor_limits is not None:
        # Imported lazily: only needed when winsorization is requested.
        from scipy.stats.mstats import winsorize
        xw = winsorize(x, winsor_limits)
        rows.append([
            "Central Tendency",
            f"Winsorized Mean {tuple(winsor_limits)}",
            np.mean(xw),
            np.nan,
            1
        ])

    # Geometric & harmonic means
    # Only defined for strictly positive data; silently skipped otherwise.
    if np.all(x > 0):
        rows.extend([
            ["Central Tendency", "Geometric Mean", gmean(x), np.nan, 0],
            ["Central Tendency", "Harmonic Mean", hmean(x), np.nan, 0],
        ])

    # ----------------------------------------------------------------
    # Dispersion
    # ----------------------------------------------------------------

    var0 = np.var(x, ddof=0)
    var1 = np.var(x, ddof=1)  # unbiased
    std0 = np.std(x, ddof=0)
    std1 = np.std(x, ddof=1)
    rng = x.max() - x.min()
    iqr = np.subtract(*np.percentile(x, [75, 25]))
    mad = median_abs_deviation(x)
    # AAD: average absolute deviation around the mean.
    aad = np.mean(np.abs(x - mean))

    # Column 4 holds the bias-corrected counterpart of each estimate:
    # stds use the c4(n) constant, the range uses d2(n), and the robust
    # scale estimates are rescaled to be consistent for a normal sample.
    rows.extend([
        ["Dispersion", "Variance (ddof=0)", var0, var1, 0],
        ["Dispersion", "Variance (ddof=1)", var1, var1, 0],
        ["Dispersion", "Std (ddof=0)", std0, std0 * np.sqrt(n / (n - 1)) / c4(n), 0],
        ["Dispersion", "Std (ddof=1)", std1, std1 / c4(n), 0],
        ["Dispersion", "Range", rng, rng / d2(n), 0],
        ["Dispersion", "AAD", aad, aad * np.sqrt(np.pi / 2), 0],
        ["Dispersion", "IQR", iqr, iqr / (2 * norm.ppf(0.75)), 1],
        ["Dispersion", "MAD", mad, mad / norm.ppf(0.75), 1],
    ])

    # ----------------------------------------------------------------
    # Shape
    # ----------------------------------------------------------------

    # Each shape statistic is reported twice: from plain central moments
    # (bias=True default) and from k-statistics (bias=False).
    rows.extend([
        ["Shape", "Skewness (central moments)", skew(x), np.nan, 0],
        ["Shape", "Skewness (k-statistic)", skew(x, bias=False), np.nan, 0],
        ["Shape", "Kurtosis (central moments)", kurtosis(x, fisher=False), np.nan, 0],
        ["Shape", "Kurtosis (k-statistic)", kurtosis(x, fisher=False, bias=False), np.nan, 0],
        ["Shape", "Excess Kurtosis (central moments)", kurtosis(x, fisher=False) - 3, np.nan, 0],
        ["Shape", "Excess Kurtosis (k-statistic)", kurtosis(x, fisher=False, bias=False) - 3, np.nan, 0],
    ])

    # ----------------------------------------------------------------
    # Final table
    # ----------------------------------------------------------------

    return pd.DataFrame(
        rows,
        columns=[
            "Statistic Type",
            "Measure",
            "Value",
            "Bias Corrected",
            "Robust",
        ],
    )
|
core/estimation/graphical_analysis.py
ADDED
|
@@ -0,0 +1,260 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from typing import Iterable, Optional, Tuple
|
| 4 |
+
|
| 5 |
+
import numpy as np
|
| 6 |
+
import matplotlib.pyplot as plt
|
| 7 |
+
import seaborn as sns
|
| 8 |
+
from scipy.stats import norm
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
Interval = Optional[Tuple[float, float]]
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def _plot_hist_or_pmf(
    ax,
    *,
    data: np.ndarray,
    graph_type: str,
    var_name: str,
    add_kde: bool,
    add_data: bool,
):
    """
    Draw the main histogram / PMF on *ax*.
    Modularized version of the monolithic PlotHistogram logic.

    graph_type selects between "Histogram" (density-scaled seaborn histplot)
    and "Empirical Probability Mass Function" (stem plot of value
    frequencies); any other value raises ValueError. add_kde overlays a KDE
    curve; add_data adds a rug of the raw observations along the x-axis.
    """
    sns.set_style("whitegrid")

    if graph_type == "Histogram":
        sns.histplot(
            data,
            kde=add_kde,
            stat="density",
            color="rebeccapurple",
            alpha=0.5,
            ax=ax,
        )
        ax.set_ylabel("Density")
        ax.set_xlabel(var_name)
        ax.set_title(f"Distribution of {var_name}")
    elif graph_type == "Empirical Probability Mass Function":
        # Empirical PMF: relative frequency of each distinct value.
        values, counts = np.unique(data, return_counts=True)
        probs = counts / counts.sum()
        ax.stem(values, probs, basefmt="rebeccapurple", linefmt="rebeccapurple")
        if add_kde:
            sns.kdeplot(data, ax=ax, color="rebeccapurple")
        ax.set_ylabel("Probability")
        ax.set_xlabel(var_name)
        ax.set_title(f"Empirical PMF of {var_name}")
    else:
        raise ValueError(f"Unknown graph type: {graph_type}")

    if add_data:
        # Scale the rug to 10% of the current y-range so it stays unobtrusive.
        _, upper = ax.get_ylim()
        sns.rugplot(data, height=0.1 * upper, ax=ax, color="black")
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def _plot_normal_density(
    ax,
    *,
    hat_mu: float,
    hat_sigma: float,
    color: str = "black",
):
    """Overlay a fitted Normal(hat_mu, hat_sigma) pdf on *ax* as a dashed line.

    Silently does nothing when hat_sigma is non-positive (degenerate fit).
    """
    if hat_sigma <= 0:
        return

    # Evaluate the pdf over mu +/- 3 sigma (covers ~99.7% of the mass).
    y_vect = np.linspace(hat_mu - 3 * hat_sigma, hat_mu + 3 * hat_sigma, 200)
    ax.plot(
        y_vect,
        norm.pdf(y_vect, hat_mu, hat_sigma),
        color=color,
        linestyle="--",
        label="Normal density",
    )
    ax.legend()
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def _plot_interval_band(
    ax,
    *,
    y_val: float,
    interval: Tuple[float, float],
    label: str,
    color: str,
):
    """Draw one horizontal interval bar at height *y_val* on *ax*.

    Renders the (low, high) span as a line, marks its midpoint, and places
    a boxed text label just past the right endpoint.
    """
    low, high = interval
    ax.hlines(y_val, low, high, color=color, linewidth=2)
    # Midpoint marker; zorder=5 keeps it above the interval line.
    ax.scatter((low + high) / 2.0, y_val, color=color, s=30, zorder=5)
    ax.text(
        high,
        y_val,
        f" {label}",
        va="center",
        fontsize=9,
        bbox=dict(
            boxstyle="round,pad=0.2",
            facecolor="whitesmoke",
            edgecolor="gray",
        ),
    )
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def plot_histogram_with_overlays(
    *,
    data: Iterable[float],
    graph_type: str,
    var_name: str,
    add_kde: bool,
    add_data: bool,
    add_normal: bool,
    hat_mu: Optional[float],
    hat_sigma: Optional[float],
    ci_mean_interval: Interval,
    ci_median_interval: Interval,
    pi_interval: Interval,
):
    """
    Return a matplotlib Figure for the histogram / PMF with optional overlays.

    When any of the three intervals is given, a second, shared-x axis is
    added below the main plot and each interval is drawn there as a labeled
    horizontal band (CI mean, CI median, prediction interval). add_normal
    overlays a fitted normal density when both hat_mu and hat_sigma are
    provided.
    """
    data = np.asarray(data)

    show_any_interval = (
        (ci_mean_interval is not None)
        or (ci_median_interval is not None)
        or (pi_interval is not None)
    )

    # Two stacked axes only when there is at least one interval to annotate.
    if show_any_interval:
        fig, (ax1, ax2) = plt.subplots(
            2,
            1,
            sharex=True,
            figsize=(8, 6),
        )
    else:
        fig, ax1 = plt.subplots(1, 1, figsize=(8, 4))
        ax2 = None

    _plot_hist_or_pmf(
        ax1,
        data=data,
        graph_type=graph_type,
        var_name=var_name,
        add_kde=add_kde,
        add_data=add_data,
    )

    if add_normal and hat_mu is not None and hat_sigma is not None:
        _plot_normal_density(ax1, hat_mu=hat_mu, hat_sigma=hat_sigma)

    # Interval annotations (confidence / prediction)
    if show_any_interval and ax2 is not None:
        # The lower axis is purely annotational: no y ticks, fixed y range.
        ax2.set_yticks([])
        ax2.set_xlabel(var_name)
        ax2.set_ylim(0, 0.5)

        # Stack the confidence bands near the top, prediction band lower.
        ci_base_y = 0.4
        if ci_mean_interval is not None:
            _plot_interval_band(
                ax2,
                y_val=ci_base_y,
                interval=ci_mean_interval,
                label="CI Mean",
                color="blue",
            )
        if ci_median_interval is not None:
            _plot_interval_band(
                ax2,
                y_val=ci_base_y - 0.1,
                interval=ci_median_interval,
                label="CI Median",
                color="green",
            )

        if pi_interval is not None:
            _plot_interval_band(
                ax2,
                y_val=0.1,
                interval=pi_interval,
                label="Prediction Interval",
                color="darkred",
            )

    fig.tight_layout()
    return fig
|
| 187 |
+
|
| 188 |
+
|
| 189 |
+
def plot_ecdf(
    *,
    data: Iterable[float],
    var_name: str,
    alpha: float,
    add_conf_band: bool,
    add_normal: bool,
    hat_mu: Optional[float],
    hat_sigma: Optional[float],
):
    """Modular version of the ECDF plot with optional DKW band and Normal CDF.

    alpha is the significance level for the DKW confidence band. When
    add_normal is set and a valid (hat_mu, hat_sigma > 0) fit is supplied,
    the fitted normal CDF is overlaid and the x-limits are widened to cover
    both the data and the mu +/- 3 sigma range. Returns the Figure.
    """
    # Imported here so the module loads without statsmodels installed
    # unless this plot is actually requested.
    from statsmodels.distributions.empirical_distribution import ECDF

    data = np.asarray(data)
    ecdf = ECDF(data)

    fig, ax = plt.subplots(figsize=(8, 5))

    # ECDF step
    ax.step(
        ecdf.x,
        ecdf.y,
        where="post",
        color="rebeccapurple",
        linewidth=2,
        label="ECDF",
    )
    ax.scatter(ecdf.x, ecdf.y, color="rebeccapurple", s=10, alpha=0.6)

    # DKW band
    if add_conf_band:
        # Dvoretzky–Kiefer–Wolfowitz half-width: sqrt(ln(2/alpha) / (2n)),
        # clipped so the band stays inside [0, 1].
        n = len(data)
        epsilon = np.sqrt(np.log(2.0 / alpha) / (2.0 * n))
        lower = np.clip(ecdf.y - epsilon, 0.0, 1.0)
        upper = np.clip(ecdf.y + epsilon, 0.0, 1.0)
        ax.fill_between(
            ecdf.x,
            lower,
            upper,
            step="post",
            color="plum",
            alpha=0.4,
            label="DKW CI",
        )

    # Optional Normal CDF
    if add_normal and hat_mu is not None and hat_sigma is not None and hat_sigma > 0:
        y_vals = np.linspace(hat_mu - 3.0 * hat_sigma, hat_mu + 3.0 * hat_sigma, 200)
        ax.plot(
            y_vals,
            norm.cdf(y_vals, hat_mu, hat_sigma),
            color="black",
            linestyle="--",
            linewidth=2,
            label="Normal CDF",
        )
        # Widen limits so neither the data nor the normal curve is clipped.
        ax.set_xlim(
            min(data.min(), y_vals.min()) - 0.1,
            max(data.max(), y_vals.max()) + 0.1,
        )
    else:
        ax.set_xlim(data.min() - 0.1, data.max() + 0.1)

    ax.set_title("Empirical Cumulative Distribution Function", fontsize=14)
    ax.set_xlabel(var_name, fontsize=12)
    ax.set_ylabel("ECDF", fontsize=12)
    ax.set_ylim(0, 1.05)
    ax.grid(True, linestyle="--", alpha=0.5)
    ax.legend(loc="lower right", fontsize=10)

    fig.tight_layout()
    return fig
|
core/estimation/inference/__pycache__/ci.cpython-312.pyc
ADDED
|
Binary file (552 Bytes). View file
|
|
|