Beam2513 commited on
Commit
798602c
·
verified ·
1 Parent(s): e53126f

Upload 127 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +2 -0
  2. .gradio/certificate.pem +31 -0
  3. __pycache__/app.cpython-313.pyc +0 -0
  4. app.py +8 -0
  5. controllers/__pycache__/data_controller.cpython-312.pyc +0 -0
  6. controllers/__pycache__/data_controller.cpython-313.pyc +0 -0
  7. controllers/__pycache__/hypothesis_controller.cpython-312.pyc +0 -0
  8. controllers/__pycache__/hypothesis_controller.cpython-313.pyc +0 -0
  9. controllers/__pycache__/linear_regression_controller.cpython-312.pyc +0 -0
  10. controllers/__pycache__/linear_regression_controller.cpython-313.pyc +0 -0
  11. controllers/data_controller.py +264 -0
  12. controllers/estimation/__init__.py +0 -0
  13. controllers/estimation/__pycache__/__init__.cpython-312.pyc +0 -0
  14. controllers/estimation/__pycache__/__init__.cpython-313.pyc +0 -0
  15. controllers/estimation/__pycache__/descriptive_controller.cpython-312.pyc +0 -0
  16. controllers/estimation/__pycache__/descriptive_controller.cpython-313.pyc +0 -0
  17. controllers/estimation/__pycache__/graphical_controller.cpython-312.pyc +0 -0
  18. controllers/estimation/__pycache__/graphical_controller.cpython-313.pyc +0 -0
  19. controllers/estimation/__pycache__/inference_controller.cpython-312.pyc +0 -0
  20. controllers/estimation/__pycache__/inference_controller.cpython-313.pyc +0 -0
  21. controllers/estimation/descriptive_controller.py +59 -0
  22. controllers/estimation/graphical_controller.py +383 -0
  23. controllers/estimation/inference_controller.py +300 -0
  24. controllers/hypothesis_controller.py +204 -0
  25. controllers/linear_regression_controller.py +160 -0
  26. controllers/utils/__pycache__/downloads.cpython-312.pyc +0 -0
  27. controllers/utils/__pycache__/downloads.cpython-313.pyc +0 -0
  28. controllers/utils/downloads.py +39 -0
  29. core/__init__.py +0 -0
  30. core/__pycache__/__init__.cpython-312.pyc +0 -0
  31. core/__pycache__/__init__.cpython-313.pyc +0 -0
  32. core/__pycache__/data_stats.cpython-312.pyc +0 -0
  33. core/__pycache__/data_stats.cpython-313.pyc +0 -0
  34. core/__pycache__/descriptive.cpython-313.pyc +0 -0
  35. core/__pycache__/hypothesis_tests.cpython-312.pyc +0 -0
  36. core/__pycache__/hypothesis_tests.cpython-313.pyc +0 -0
  37. core/__pycache__/linear_regression.cpython-312.pyc +0 -0
  38. core/__pycache__/linear_regression.cpython-313.pyc +0 -0
  39. core/__pycache__/statistic_plots.cpython-313.pyc +0 -0
  40. core/data_stats.py +150 -0
  41. core/estimation/__init__.py +0 -0
  42. core/estimation/__pycache__/__init__.cpython-312.pyc +0 -0
  43. core/estimation/__pycache__/__init__.cpython-313.pyc +0 -0
  44. core/estimation/__pycache__/descriptive.cpython-312.pyc +0 -0
  45. core/estimation/__pycache__/descriptive.cpython-313.pyc +0 -0
  46. core/estimation/__pycache__/graphical_analysis.cpython-312.pyc +0 -0
  47. core/estimation/__pycache__/graphical_analysis.cpython-313.pyc +0 -0
  48. core/estimation/descriptive.py +181 -0
  49. core/estimation/graphical_analysis.py +260 -0
  50. core/estimation/inference/__pycache__/ci.cpython-312.pyc +0 -0
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ ui/assets/logos/HimmapanLab.png filter=lfs diff=lfs merge=lfs -text
37
+ ui/assets/logos/ThotsakanStats.png filter=lfs diff=lfs merge=lfs -text
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ -----BEGIN CERTIFICATE-----
2
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
3
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
4
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
5
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
6
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
7
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
8
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
9
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
10
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
11
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
12
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
13
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
14
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
15
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
16
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
17
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
18
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
19
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
20
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
21
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
22
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
23
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
24
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
25
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
26
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
27
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
28
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
29
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
30
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
31
+ -----END CERTIFICATE-----
__pycache__/app.cpython-313.pyc ADDED
Binary file (940 Bytes). View file
 
app.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
"""Application entry point for the Gradio statistics app."""

from ui.layout import build_layout


def main():
    """Build the UI layout and launch the Gradio server (with a public share link)."""
    demo = build_layout()
    demo.launch(share=True)


if __name__ == "__main__":
    main()
controllers/__pycache__/data_controller.cpython-312.pyc ADDED
Binary file (6.96 kB). View file
 
controllers/__pycache__/data_controller.cpython-313.pyc ADDED
Binary file (6.71 kB). View file
 
controllers/__pycache__/hypothesis_controller.cpython-312.pyc ADDED
Binary file (6.3 kB). View file
 
controllers/__pycache__/hypothesis_controller.cpython-313.pyc ADDED
Binary file (6.28 kB). View file
 
controllers/__pycache__/linear_regression_controller.cpython-312.pyc ADDED
Binary file (5.56 kB). View file
 
controllers/__pycache__/linear_regression_controller.cpython-313.pyc ADDED
Binary file (5.5 kB). View file
 
controllers/data_controller.py ADDED
@@ -0,0 +1,264 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import gradio as gr

from core.data_stats import (
    load_dataset,
    dataset_summary,
    variable_types,
    infer_column_types,
    apply_category_filters,
    reclassify_as_categorical,
    reclassify_as_numeric,
)


def wire_callbacks(
    *,
    file_input,
    status_output,

    # RAW DATA
    preview_checkbox,
    overview_checkbox,
    csv_preview,
    desc_output,
    dtypes_output,

    # RECLASSIFICATION
    num_to_cat,
    cat_to_num,
    fix_to_categorical_button,
    fix_to_numeric_button,
    fix_dtype_status,

    # FILTERS
    cat_filter_cols,
    cat_val_1,
    cat_val_2,
    cat_val_3,
    apply_filter_button,
    filter_status,

    # FILTERED DATA
    preview_checkbox_filter,
    overview_checkbox_filter,
    csv_preview_filter,
    desc_output_filter,
    dtypes_output_filter,

    state,
):
    """Attach all data-tab event handlers to the given Gradio components.

    Parameters are the UI components built elsewhere (keyword-only so call
    sites stay readable) plus a mutable ``state`` object carrying the loaded
    dataframe (``state.df``), the filtered view (``state.filtered_df``),
    inferred column types, the active category filters, and dtype overrides.
    """

    # ==================================================
    # File upload
    # ==================================================
    def on_file_upload(file):
        """Load the dataset and (re)initialize every dependent widget."""
        df, status = load_dataset(file)

        if df is None:
            # Loading failed: clear previews and empty all dropdowns.
            return (
                status,
                None, None, None,
                gr.update(choices=[], value=None),
                gr.update(choices=[], value=None),
                gr.update(choices=[], value=[]),
            )

        numeric_cols, categorical_cols = infer_column_types(df)

        # Reset all session state for the freshly loaded dataset.
        state.df = df
        state.filtered_df = df
        state.numeric_cols = numeric_cols
        state.categorical_cols = categorical_cols
        state.active_filters = {}
        state.overrides = {"num_to_cat": [], "cat_to_num": []}

        return (
            status,
            df,
            dataset_summary(df),
            variable_types(df),

            # Reclassification dropdowns
            gr.update(choices=numeric_cols, value=None),
            gr.update(choices=categorical_cols, value=None),

            # Filter columns (categorical only)
            gr.update(choices=categorical_cols, value=[]),
        )

    file_input.change(
        on_file_upload,
        inputs=file_input,
        outputs=[
            status_output,
            csv_preview,
            desc_output,
            dtypes_output,
            num_to_cat,
            cat_to_num,
            cat_filter_cols,
        ],
    )

    # ==================================================
    # Category value dropdowns (Filter 1–3)
    # ==================================================
    def update_category_filters(selected_columns):
        """Show one value-picker per selected column (max 3); hide the rest."""
        df = state.df

        if df is None or not selected_columns:
            return (
                gr.update(visible=False, choices=[], value=[]),
                gr.update(visible=False, choices=[], value=[]),
                gr.update(visible=False, choices=[], value=[]),
            )

        updates = []
        for i in range(3):
            if i < len(selected_columns):
                col = selected_columns[i]
                values = sorted(df[col].dropna().unique().tolist())
                updates.append(
                    gr.update(visible=True, choices=values, value=[])
                )
            else:
                updates.append(
                    gr.update(visible=False, choices=[], value=[])
                )

        return tuple(updates)

    cat_filter_cols.change(
        update_category_filters,
        inputs=cat_filter_cols,
        outputs=[cat_val_1, cat_val_2, cat_val_3],
    )

    # ==================================================
    # Apply filters
    # ==================================================
    def on_apply_filter(cat_cols, v1, v2, v3):
        """Apply up to three category filters and remember the active set."""
        filtered_df, status = apply_category_filters(
            state.df,
            cat_cols,
            v1, v2, v3,
        )

        state.filtered_df = filtered_df
        # Only columns with at least one selected value count as active.
        state.active_filters = {
            col: vals
            for col, vals in zip(cat_cols[:3], [v1, v2, v3])
            if vals
        }

        return status

    apply_filter_button.click(
        on_apply_filter,
        inputs=[cat_filter_cols, cat_val_1, cat_val_2, cat_val_3],
        outputs=filter_status,
    )

    # ==================================================
    # RAW preview / summary
    # ==================================================
    preview_checkbox.change(
        lambda x: gr.update(visible=x),
        inputs=preview_checkbox,
        outputs=csv_preview,
    )

    overview_checkbox.change(
        lambda x: (
            gr.update(visible=x),
            gr.update(visible=x),
        ),
        inputs=overview_checkbox,
        outputs=[desc_output, dtypes_output],
    )

    # ==================================================
    # FILTERED preview / summary
    # ==================================================
    # FIX: the previous wiring listed the same component twice in `outputs`
    # and returned separate visibility/value updates for it. Gradio rejects
    # duplicate output components (and even where tolerated, the later update
    # clobbers the earlier one). A single gr.update carrying both `visible`
    # and `value` is the supported form.
    def on_toggle_filtered_preview(show):
        value = state.filtered_df if show else None
        return gr.update(visible=show, value=value)

    preview_checkbox_filter.change(
        on_toggle_filtered_preview,
        inputs=preview_checkbox_filter,
        outputs=csv_preview_filter,
    )

    def on_toggle_filtered_overview(show):
        # Guard against toggling before any dataset has been loaded.
        have_data = show and state.filtered_df is not None
        return (
            gr.update(
                visible=show,
                value=dataset_summary(state.filtered_df) if have_data else None,
            ),
            gr.update(
                visible=show,
                value=variable_types(state.filtered_df) if have_data else None,
            ),
        )

    overview_checkbox_filter.change(
        on_toggle_filtered_overview,
        inputs=overview_checkbox_filter,
        outputs=[desc_output_filter, dtypes_output_filter],
    )

    # ==================================================
    # Reclassification
    # ==================================================
    def _reclass_updates(msg):
        """Common widget refresh after a dtype reclassification.

        Refreshes the three column dropdowns from the (now-updated) state,
        reports the status message, and hides the stale per-column value
        pickers until filters are re-chosen.
        """
        return (
            gr.update(choices=state.categorical_cols, value=[]),
            gr.update(choices=state.numeric_cols, value=None),
            gr.update(choices=state.categorical_cols, value=None),
            msg,
            gr.update(visible=False),
            gr.update(visible=False),
            gr.update(visible=False),
        )

    def on_fix_to_categorical(column):
        _, msg = reclassify_as_categorical(state, column)
        return _reclass_updates(msg)

    def on_fix_to_numeric(column):
        _, msg = reclassify_as_numeric(state, column)
        return _reclass_updates(msg)

    fix_to_categorical_button.click(
        on_fix_to_categorical,
        inputs=num_to_cat,
        outputs=[
            cat_filter_cols,
            num_to_cat,
            cat_to_num,
            fix_dtype_status,
            cat_val_1,
            cat_val_2,
            cat_val_3,
        ],
    )

    fix_to_numeric_button.click(
        on_fix_to_numeric,
        inputs=cat_to_num,
        outputs=[
            cat_filter_cols,
            num_to_cat,
            cat_to_num,
            fix_dtype_status,
            cat_val_1,
            cat_val_2,
            cat_val_3,
        ],
    )
controllers/estimation/__init__.py ADDED
File without changes
controllers/estimation/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (239 Bytes). View file
 
controllers/estimation/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (206 Bytes). View file
 
controllers/estimation/__pycache__/descriptive_controller.cpython-312.pyc ADDED
Binary file (2.28 kB). View file
 
controllers/estimation/__pycache__/descriptive_controller.cpython-313.pyc ADDED
Binary file (2.3 kB). View file
 
controllers/estimation/__pycache__/graphical_controller.cpython-312.pyc ADDED
Binary file (8.93 kB). View file
 
controllers/estimation/__pycache__/graphical_controller.cpython-313.pyc ADDED
Binary file (8.87 kB). View file
 
controllers/estimation/__pycache__/inference_controller.cpython-312.pyc ADDED
Binary file (5.27 kB). View file
 
controllers/estimation/__pycache__/inference_controller.cpython-313.pyc ADDED
Binary file (5.16 kB). View file
 
controllers/estimation/descriptive_controller.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# ui/controllers/estimation/descriptive_controller.py

import pandas as pd
from core.estimation.descriptive import compute_descriptive_statistics


def run_descriptive_statistics(
    *,
    df: pd.DataFrame,
    column: str,
    quantile_probs: list[float],
    trim_alpha: float | None,
    winsor_limits: tuple[float, float] | None,
    weights_col: str | None,
    round_digits: int,
) -> pd.DataFrame:
    """Validate the inputs and compute the descriptive-statistics table.

    Args:
        df: Source dataframe (must not be None).
        column: Name of the numeric column to describe.
        quantile_probs: Quantile probabilities forwarded to the core routine.
        trim_alpha: Trimming fraction for the trimmed mean, or None.
        winsor_limits: (lower, upper) winsorizing fractions, or None.
        weights_col: Optional name of a non-negative numeric weights column.
        round_digits: Decimal places for the numeric result columns.

    Returns:
        The statistics table produced by ``compute_descriptive_statistics``,
        with its numeric columns rounded.

    Raises:
        ValueError: On missing data, unknown columns, non-numeric data,
            or invalid weights.
    """
    if df is None:
        raise ValueError("No dataset loaded.")

    if column not in df.columns:
        raise ValueError(f"Column '{column}' not found.")

    series = df[column].dropna()

    if series.empty:
        raise ValueError("Selected column has no valid data.")

    if not pd.api.types.is_numeric_dtype(series):
        raise ValueError("Selected column must be numeric.")

    weights = None

    if weights_col:
        if weights_col not in df.columns:
            raise ValueError(f"Weights column '{weights_col}' not found.")

        # Align weights with the surviving (non-NaN) rows of the series.
        weights = df.loc[series.index, weights_col]

        if not pd.api.types.is_numeric_dtype(weights):
            raise ValueError("Weights must be numeric.")

        if (weights < 0).any():
            raise ValueError("Weights must be non-negative.")

    stats_df = compute_descriptive_statistics(
        data=series.values,
        quantile_probs=quantile_probs,
        trim_alpha=trim_alpha,
        winsor_limits=winsor_limits,
        weights=weights.values if weights is not None else None,
    )

    # Round only the numeric result columns that are actually present, so a
    # schema change in compute_descriptive_statistics cannot raise KeyError.
    round_cols = [c for c in ("Value", "Bias Corrected") if c in stats_df.columns]
    if round_cols:
        stats_df[round_cols] = stats_df[round_cols].round(round_digits)

    return stats_df
controllers/estimation/graphical_controller.py ADDED
@@ -0,0 +1,383 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from typing import Optional, Tuple
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+
8
+ from core.estimation.inference.estimators import estimate_mean, estimate_sigma
9
+ from core.estimation.inference.ci import (
10
+ ci_mean_analytic,
11
+ ci_mean_bootstrap,
12
+ ci_median_analytic,
13
+ ci_median_bootstrap,
14
+ )
15
+ from core.estimation.inference.pi import (
16
+ pi_mean,
17
+ pi_median,
18
+ pi_iqr,
19
+ pi_bootstrap,
20
+ )
21
+ from core.estimation.graphical_analysis import (
22
+ plot_histogram_with_overlays,
23
+ plot_ecdf,
24
+ )
25
+
26
+
27
# ---------------------------------------------------------------------
# Utilities (aligned with inference_controller)
# ---------------------------------------------------------------------
def select_distribution(mean_estimator: str, sigma_estimator: str) -> str:
    """Pick the reference distribution for analytic intervals.

    The Student-t distribution applies only to the classic pairing of the
    sample mean with the 1-ddof sample deviation; every other estimator
    combination falls back to the normal distribution.
    """
    uses_t = (
        mean_estimator == "Sample Mean"
        and sigma_estimator == "Deviation (1 ddof)"
    )
    return "t" if uses_t else "norm"
37
+
38
+
39
def validate_deviation_estimator(*, sigma_estimator: str, n: int):
    """Reject the range-based sigma estimator for samples larger than 25.

    Raises:
        ValueError: If ``sigma_estimator`` is the bias-corrected range
            estimator and the sample size exceeds 25.
    """
    range_based = sigma_estimator == "Range (bias corrected)"
    if range_based and n > 25:
        raise ValueError(
            "Range-based confidence intervals require n ≤ 25. "
            "Use another estimator or bootstrap."
        )
45
+
46
+
47
+ def _prepare_series(
48
+ df: pd.DataFrame,
49
+ column: str,
50
+ weights_col: Optional[str],
51
+ ) -> tuple[np.ndarray, Optional[np.ndarray]]:
52
+ if df is None:
53
+ raise ValueError("No data loaded. Please load a dataset first.")
54
+
55
+ if column not in df.columns:
56
+ raise ValueError(f"Column '{column}' not found in the dataframe.")
57
+
58
+ series = df[column].dropna()
59
+ if series.empty:
60
+ raise ValueError(f"Column '{column}' has no non-missing values.")
61
+
62
+ weights = None
63
+ if weights_col is not None:
64
+ if weights_col not in df.columns:
65
+ raise ValueError(
66
+ f"Weights column '{weights_col}' not found in the dataframe."
67
+ )
68
+ weights_series = df[weights_col].reindex(series.index).dropna()
69
+ common_idx = series.index.intersection(weights_series.index)
70
+ series = series.loc[common_idx]
71
+ weights_series = weights_series.loc[common_idx]
72
+ weights = weights_series.to_numpy()
73
+
74
+ return series.to_numpy(), weights
75
+
76
+
77
def run_graphical_analysis(
    *,
    df: pd.DataFrame,
    column: str,
    graph_type: str,
    # Histogram / PMF controls
    add_kde: bool,
    add_data: bool,
    add_normal: bool,
    add_ci: bool,
    ci_choice: str,
    add_pi: bool,
    pi_choice: str,
    # Estimators
    mean_estimator: str,
    median_estimator: str,
    sigma_estimator: str,
    trim_param,
    winsor_limits,
    weights_col: Optional[str],
    # Normal μ source
    normal_mu_source: str,
    # Bootstrap options
    bootstrap_mean: bool,
    bootstrap_median: bool,
    bootstrap_sigma: bool,
    bootstrap_prediction: bool,
    bootstrap_samples: int,
    # CI/PI confidence level
    ci_pi_conf_level: float,
    # ECDF controls
    ecdf_add_conf: bool,
    ecdf_conf_level: float,
    ecdf_add_normal: bool,
):
    """Entry point for the graphical-analysis tab.

    Extracts the (optionally weighted) sample from ``df[column]`` and
    dispatches on ``graph_type`` to either the histogram/PMF renderer or the
    ECDF renderer, forwarding all relevant options.

    Raises:
        ValueError: For invalid confidence levels, unknown graph types, or
            any validation failure raised by the helpers.
    """
    data, weights = _prepare_series(df, column, weights_col)

    if not (0.0 < ci_pi_conf_level < 1.0):
        raise ValueError("Confidence level for CI/PI must be in (0, 1).")

    hist_like = ("Histogram", "Empirical Probability Mass Function")
    if graph_type in hist_like:
        return _run_hist_or_pmf(
            data=data,
            var_name=column,
            graph_type=graph_type,
            add_kde=add_kde,
            add_data=add_data,
            add_normal=add_normal,
            add_ci=add_ci,
            ci_choice=ci_choice,
            add_pi=add_pi,
            pi_choice=pi_choice,
            mean_estimator=mean_estimator,
            median_estimator=median_estimator,
            sigma_estimator=sigma_estimator,
            trim_param=trim_param,
            winsor_limits=winsor_limits,
            weights=weights,
            normal_mu_source=normal_mu_source,
            bootstrap_mean=bootstrap_mean,
            bootstrap_median=bootstrap_median,
            bootstrap_sigma=bootstrap_sigma,
            bootstrap_prediction=bootstrap_prediction,
            bootstrap_samples=bootstrap_samples,
            ci_pi_conf_level=ci_pi_conf_level,
        )

    if graph_type == "Empirical Cumulative Distribution Function (ECDF)":
        return _run_ecdf(
            data=data,
            var_name=column,
            ecdf_add_conf=ecdf_add_conf,
            ecdf_conf_level=ecdf_conf_level,
            ecdf_add_normal=ecdf_add_normal,
            mean_estimator=mean_estimator,
            sigma_estimator=sigma_estimator,
            trim_param=trim_param,
            winsor_limits=winsor_limits,
            weights=weights,
            normal_mu_source=normal_mu_source,
        )

    raise ValueError(f"Unknown graph type: {graph_type}")
160
+
161
+
162
def _run_hist_or_pmf(
    *,
    data: np.ndarray,
    var_name: str,
    graph_type: str,
    add_kde: bool,
    add_data: bool,
    add_normal: bool,
    add_ci: bool,
    ci_choice: str,
    add_pi: bool,
    pi_choice: str,
    mean_estimator: str,
    median_estimator: str,
    sigma_estimator: str,
    trim_param,
    winsor_limits,
    weights: Optional[np.ndarray],
    normal_mu_source: str,
    bootstrap_mean: bool,
    bootstrap_median: bool,
    bootstrap_sigma: bool,
    bootstrap_prediction: bool,
    bootstrap_samples: int,
    ci_pi_conf_level: float,
):
    """Render a histogram or empirical PMF with optional overlays.

    Overlays (Normal curve, mean/median confidence intervals, a prediction
    interval) are computed only when the corresponding flag is on, then
    handed to ``plot_histogram_with_overlays``.
    """
    alpha = 1.0 - ci_pi_conf_level

    validate_deviation_estimator(
        sigma_estimator=sigma_estimator,
        n=len(data),
    )

    # Overlay ingredients; each stays None unless its option is enabled.
    hat_mu = hat_sigma = None
    ci_mean_interval = ci_median_interval = pi_interval = None

    if add_ci or add_pi or add_normal:
        # --- Parameters for the Normal overlay ---
        if add_normal:
            # μ comes either from the configured mean estimator or from
            # the plain sample median, per the user's choice.
            if normal_mu_source == "Mean-based CI":
                hat_mu = estimate_mean(
                    data,
                    mean_estimator,
                    trim_param=trim_param,
                    winsor_limits=winsor_limits,
                    weights=weights,
                )
            else:
                hat_mu = float(np.median(data))

            hat_sigma = estimate_sigma(
                data=data,
                estimator=sigma_estimator,
            )

        # --- Confidence intervals ---
        if add_ci:
            ref_dist = select_distribution(mean_estimator, sigma_estimator)

            # CI for the mean: bootstrap or analytic.
            if bootstrap_mean:
                ci_mean_interval = ci_mean_bootstrap(
                    data=data,
                    estimator=mean_estimator,
                    alpha=alpha,
                    trim_param=trim_param,
                    winsor_limits=winsor_limits,
                    weights=weights,
                    B=bootstrap_samples,
                )
            else:
                ci_mean_interval = ci_mean_analytic(
                    data=data,
                    estimator=mean_estimator,
                    alpha=alpha,
                    dist=ref_dist,
                    sigma_estimator=sigma_estimator,
                    trim_param=trim_param,
                    winsor_limits=winsor_limits,
                    weights=weights,
                )

            # CI for the median: bootstrap or analytic.
            if bootstrap_median:
                ci_median_interval = ci_median_bootstrap(
                    data=data,
                    alpha=alpha,
                    B=bootstrap_samples,
                )
            else:
                ci_median_interval = ci_median_analytic(
                    data=data,
                    alpha=alpha,
                    sigma_estimator=sigma_estimator,
                )

            # Keep only the interval(s) the user asked to draw
            # (Mean / Median / Both).
            if ci_choice == "Mean":
                ci_median_interval = None
            elif ci_choice == "Median":
                ci_mean_interval = None

        # --- Prediction intervals ---
        if add_pi:
            ref_dist = select_distribution(mean_estimator, sigma_estimator)
            if pi_choice == "Mean":
                pi_interval = pi_mean(
                    data=data,
                    alpha=alpha,
                    estimator=mean_estimator,
                    dist=ref_dist,
                    sigma_estimator=sigma_estimator,
                    trim_param=trim_param,
                    winsor_limits=winsor_limits,
                    weights=weights,
                )
            elif pi_choice == "Median":
                # New API: pi_median only needs data, alpha and sigma_estimator
                pi_interval = pi_median(
                    data=data,
                    alpha=alpha,
                    sigma_estimator=sigma_estimator,
                )
            elif pi_choice == "IQR":
                pi_interval = pi_iqr(
                    data=data,
                    alpha=alpha,
                )
            elif pi_choice == "Bootstrap":
                # Guard: the bootstrap PI is opt-in via a separate toggle.
                if not bootstrap_prediction:
                    raise ValueError(
                        "To use the Bootstrap prediction interval, enable the "
                        "'Bootstrap Prediction' option in the estimator settings."
                    )
                pi_interval = pi_bootstrap(
                    data=data,
                    alpha=alpha,
                    B=bootstrap_samples,
                )
            else:
                raise ValueError(
                    f"Unknown prediction-interval choice: {pi_choice}"
                )

    return plot_histogram_with_overlays(
        data=data,
        graph_type=graph_type,
        var_name=var_name,
        add_kde=add_kde,
        add_data=add_data,
        add_normal=add_normal,
        hat_mu=hat_mu,
        hat_sigma=hat_sigma,
        ci_mean_interval=ci_mean_interval,
        ci_median_interval=ci_median_interval,
        pi_interval=pi_interval,
    )
327
+
328
+
329
def _run_ecdf(
    *,
    data: np.ndarray,
    var_name: str,
    ecdf_add_conf: bool,
    ecdf_conf_level: float,
    ecdf_add_normal: bool,
    mean_estimator: str,
    sigma_estimator: str,
    trim_param,
    winsor_limits,
    weights: Optional[np.ndarray],
    normal_mu_source: str,
):
    """Render the ECDF with an optional confidence band and Normal overlay.

    Raises:
        ValueError: If ``ecdf_conf_level`` is outside (0, 1) or the sigma
            estimator is invalid for this sample size.
    """
    if not (0.0 < ecdf_conf_level < 1.0):
        raise ValueError("ECDF confidence level must be in (0, 1).")

    validate_deviation_estimator(
        sigma_estimator=sigma_estimator,
        n=len(data),
    )

    # Normal-overlay parameters; only computed when the overlay is on.
    hat_mu = hat_sigma = None
    if ecdf_add_normal:
        # μ comes either from the configured mean estimator or from the
        # plain sample median, per the user's choice.
        if normal_mu_source == "Mean-based CI":
            hat_mu = estimate_mean(
                data,
                mean_estimator,
                trim_param=trim_param,
                winsor_limits=winsor_limits,
                weights=weights,
            )
        else:
            hat_mu = float(np.median(data))

        hat_sigma = estimate_sigma(
            data=data,
            estimator=sigma_estimator,
        )

    return plot_ecdf(
        data=data,
        var_name=var_name,
        alpha=1.0 - ecdf_conf_level,
        add_conf_band=ecdf_add_conf,
        add_normal=ecdf_add_normal,
        hat_mu=hat_mu,
        hat_sigma=hat_sigma,
    )
controllers/estimation/inference_controller.py ADDED
@@ -0,0 +1,300 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+
3
+ from core.estimation.inference.ci import (
4
+ ci_mean_analytic,
5
+ ci_median_analytic,
6
+ ci_deviation_analytic,
7
+ ci_mean_bootstrap,
8
+ ci_median_bootstrap,
9
+ ci_deviation_bootstrap,
10
+ )
11
+
12
+ from core.estimation.inference.pi import (
13
+ pi_mean,
14
+ pi_median,
15
+ pi_iqr,
16
+ pi_bootstrap,
17
+ )
18
+
19
+ from core.estimation.inference.confidence_regions import confidence_regions
20
+
21
+
22
+ # ---------------------------------------------------------------------
23
+ # Utilities
24
+ # ---------------------------------------------------------------------
25
+
26
def select_distribution(mean_estimator: str, sigma_estimator: str) -> str:
    """Return "t" for the sample-mean / 1-ddof-deviation pairing, else "norm"."""
    uses_t = (
        mean_estimator == "Sample Mean"
        and sigma_estimator == "Deviation (1 ddof)"
    )
    return "t" if uses_t else "norm"
30
+
31
+
32
def validate_deviation_estimator(*, sigma_estimator: str, n: int):
    """Raise ValueError when the range-based sigma estimator is used with n > 25."""
    range_based = sigma_estimator == "Range (bias corrected)"
    if range_based and n > 25:
        raise ValueError(
            "Range-based confidence intervals require n ≤ 25. "
            "Use another estimator or bootstrap."
        )
38
+
39
+
40
+ # ---------------------------------------------------------------------
41
+ # Confidence Intervals
42
+ # ---------------------------------------------------------------------
43
+
44
def run_confidence_intervals(
    *,
    data,
    alpha,
    mean_estimator,
    median_estimator,
    sigma_estimator,
    trim_param=None,
    winsor_limits=None,
    weights=None,
    bootstrap_mean=False,
    bootstrap_median=False,
    bootstrap_deviation=False,
    bootstrap_samples=1000,
):
    """Compute confidence intervals for the mean, median and deviation.

    Each statistic uses either its bootstrap routine or its analytic
    routine, per the corresponding ``bootstrap_*`` flag.

    Note: ``median_estimator`` is accepted for interface symmetry with the
    other controllers but is not used by any routine called here.

    Returns:
        A tuple ``(table, mean_ci, sigma_ci, median_ci)`` where ``table`` is
        a tidy DataFrame with columns
        ["Interval Type", "Statistic", "Lower", "Upper"].
    """
    validate_deviation_estimator(
        sigma_estimator=sigma_estimator,
        n=len(data),
    )

    ref_dist = select_distribution(mean_estimator, sigma_estimator)

    # ---------------- Mean ----------------
    if bootstrap_mean:
        mean_ci = ci_mean_bootstrap(
            data=data,
            estimator=mean_estimator,
            alpha=alpha,
            B=bootstrap_samples,
            trim_param=trim_param,
            winsor_limits=winsor_limits,
            weights=weights,
        )
    else:
        mean_ci = ci_mean_analytic(
            data=data,
            estimator=mean_estimator,
            alpha=alpha,
            dist=ref_dist,
            sigma_estimator=sigma_estimator,
            trim_param=trim_param,
            winsor_limits=winsor_limits,
            weights=weights,
        )

    # ---------------- Median ----------------
    if bootstrap_median:
        median_ci = ci_median_bootstrap(
            data=data,
            alpha=alpha,
            B=bootstrap_samples,
        )
    else:
        median_ci = ci_median_analytic(
            data=data,
            alpha=alpha,
            sigma_estimator=sigma_estimator,
        )

    # ---------------- Deviation ----------------
    if bootstrap_deviation:
        sigma_ci = ci_deviation_bootstrap(
            data=data,
            alpha=alpha,
            B=bootstrap_samples,
            estimator=sigma_estimator,
        )
    else:
        sigma_ci = ci_deviation_analytic(
            data=data,
            alpha=alpha,
            estimator=sigma_estimator,
        )

    summary = pd.DataFrame(
        [
            ["Confidence", "Mean", *mean_ci],
            ["Confidence", "Median", *median_ci],
            ["Confidence", "Deviation", *sigma_ci],
        ],
        columns=["Interval Type", "Statistic", "Lower", "Upper"],
    )

    return summary, mean_ci, sigma_ci, median_ci
130
+
131
+
132
+ # ---------------------------------------------------------------------
133
+ # Prediction Intervals
134
+ # ---------------------------------------------------------------------
135
+
136
def run_prediction_intervals(
    *,
    data,
    alpha,
    mean_estimator,
    median_estimator,
    sigma_estimator,
    trim_param=None,
    winsor_limits=None,
    weights=None,
    bootstrap=False,
    bootstrap_samples=1000,
):
    """
    Build a table of prediction intervals: mean-based, median-based,
    IQR-based, and optionally a bootstrap interval.

    Note: ``median_estimator`` is accepted for interface symmetry but is
    not used directly; the median-based PI relies on ``sigma_estimator``.

    Returns:
        pandas.DataFrame with columns
        ["Interval Type", "Statistic", "Lower", "Upper"].
    """
    sampling_dist = select_distribution(mean_estimator, sigma_estimator)

    records = [
        # Mean-based PI
        [
            "Prediction",
            "Mean",
            *pi_mean(
                data=data,
                alpha=alpha,
                estimator=mean_estimator,
                dist=sampling_dist,
                sigma_estimator=sigma_estimator,
                trim_param=trim_param,
                winsor_limits=winsor_limits,
                weights=weights,
            ),
        ],
        # Median-based PI (uses the same deviation estimator)
        [
            "Prediction",
            "Median",
            *pi_median(data=data, alpha=alpha, sigma_estimator=sigma_estimator),
        ],
        # IQR-based PI
        ["Prediction", "IQR", *pi_iqr(data=data, alpha=alpha)],
    ]

    # Optional bootstrap PI
    if bootstrap:
        records.append(
            [
                "Prediction",
                "Bootstrap",
                *pi_bootstrap(data=data, alpha=alpha, B=bootstrap_samples),
            ]
        )

    return pd.DataFrame(
        records,
        columns=["Interval Type", "Statistic", "Lower", "Upper"],
    )
194
+
195
+ # ---------------------------------------------------------------------
196
+ # Confidence Regions
197
+ # ---------------------------------------------------------------------
198
+
199
def run_confidence_regions(
    *,
    data,
    alpha,
    mean_estimator,
    median_estimator,
    sigma_estimator,
    trim_param,
    winsor_limits,
    weights,
    bootstrap_mean,
    bootstrap_median,
    bootstrap_deviation,
    bootstrap_samples,
    mu_ci_source,
    probs,
    eps_mu,
    eps_sigma,
    add_ci_box,
):
    """
    Build the likelihood-based confidence-region figure.

    Reuses the CI machinery to obtain intervals for the mean, median and
    deviation, selects the μ interval according to ``mu_ci_source``
    ("Median-based CI" picks the median interval, anything else the
    mean-based interval), and hands the chosen μ interval plus the σ
    interval to ``confidence_regions``.
    """
    _, mean_ci, sigma_ci, median_ci = run_confidence_intervals(
        data=data,
        alpha=alpha,
        mean_estimator=mean_estimator,
        median_estimator=median_estimator,
        sigma_estimator=sigma_estimator,
        trim_param=trim_param,
        winsor_limits=winsor_limits,
        weights=weights,
        bootstrap_mean=bootstrap_mean,
        bootstrap_median=bootstrap_median,
        bootstrap_deviation=bootstrap_deviation,
        bootstrap_samples=bootstrap_samples,
    )

    # Default is the mean-based CI; the median-based one is opt-in.
    mu_ci = median_ci if mu_ci_source == "Median-based CI" else mean_ci

    return confidence_regions(
        data=data,
        mean_ci=mu_ci,
        sigma_ci=sigma_ci,
        probs=probs,
        eps_mu=eps_mu,
        eps_sigma=eps_sigma,
        add_ci_box=add_ci_box,
    )
258
+
259
+
260
+ # ---------------------------------------------------------------------
261
+ # Combined Runner (used by UI)
262
+ # ---------------------------------------------------------------------
263
+
264
def run_intervals(
    *,
    data,
    alpha,
    mean_estimator,
    median_estimator,
    sigma_estimator,
    bootstrap_mean,
    bootstrap_median,
    bootstrap_deviation,
    bootstrap_samples,
):
    """
    Combined runner used by the UI: computes the confidence-interval table,
    the prediction-interval table, and their concatenation.

    Returns:
        (ci_table, pi_table, combined)

    Bug fix: ``run_confidence_intervals`` returns FOUR values
    (table, mean_ci, sigma_ci, median_ci); the previous three-name
    unpacking raised ``ValueError`` every time this runner was invoked.
    """
    ci_table, _mean_ci, _sigma_ci, _median_ci = run_confidence_intervals(
        data=data,
        alpha=alpha,
        mean_estimator=mean_estimator,
        median_estimator=median_estimator,
        sigma_estimator=sigma_estimator,
        bootstrap_mean=bootstrap_mean,
        bootstrap_median=bootstrap_median,
        bootstrap_deviation=bootstrap_deviation,
        bootstrap_samples=bootstrap_samples,
    )

    pi_table = run_prediction_intervals(
        data=data,
        alpha=alpha,
        mean_estimator=mean_estimator,
        median_estimator=median_estimator,
        sigma_estimator=sigma_estimator,
        # NOTE: the mean-bootstrap flag also toggles the bootstrap PI,
        # mirroring the original UI wiring.
        bootstrap=bootstrap_mean,
        bootstrap_samples=bootstrap_samples,
    )

    combined = pd.concat([ci_table, pi_table], ignore_index=True)

    return ci_table, pi_table, combined
controllers/hypothesis_controller.py ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from typing import Iterable, Tuple
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+
8
+ from core.hypothesis_tests import (
9
+ one_sample_ttest,
10
+ two_sample_ttest,
11
+ variance_test,
12
+ one_way_anova,
13
+ )
14
+
15
+ ROUND = 4
16
+
17
+
18
def _round_table(table: pd.DataFrame, decimals: int = ROUND) -> pd.DataFrame:
    """Return a copy of *table* with only its numeric columns rounded.

    Non-numeric columns are left untouched; a ``None`` input passes through.
    """
    if table is None:
        return table

    rounded = table.copy()
    numeric_columns = rounded.select_dtypes(include="number").columns
    if len(numeric_columns) > 0:
        rounded[numeric_columns] = rounded[numeric_columns].round(decimals)
    return rounded
27
+
28
+ def _ensure_numeric_series(df: pd.DataFrame, column: str) -> np.ndarray:
29
+ if df is None:
30
+ raise ValueError("No dataset loaded.")
31
+ if column not in df.columns:
32
+ raise ValueError(f"Column '{column}' not found in the dataset.")
33
+
34
+ series = df[column].dropna()
35
+ if series.empty:
36
+ raise ValueError("No valid data in the selected column.")
37
+ return series.to_numpy()
38
+
39
+
40
+ def _materialize_group(
41
+ df: pd.DataFrame,
42
+ numeric_col: str,
43
+ cat_col: str | None,
44
+ cat_vals: Iterable[str],
45
+ ) -> np.ndarray:
46
+ if cat_col is None:
47
+ raise ValueError("No categorical column selected.")
48
+
49
+ if cat_col not in df.columns:
50
+ raise ValueError(f"Categorical column '{cat_col}' not found in the dataset.")
51
+
52
+ # Cast selected values to the actual dtype of the column
53
+ if cat_vals is None:
54
+ values = []
55
+ else:
56
+ values = list(cat_vals)
57
+
58
+ if not values:
59
+ raise ValueError(f"No categories selected for column '{cat_col}'.")
60
+
61
+ cat_series = pd.Series(values).astype(df[cat_col].dtype)
62
+ mask = df[cat_col].isin(cat_series)
63
+ series = df.loc[mask, numeric_col].dropna()
64
+
65
+ if series.empty:
66
+ raise ValueError("One or more groups are empty after filtering.")
67
+ return series.to_numpy()
68
+
69
+
70
def run_hypothesis_testing(
    *,
    df: pd.DataFrame | None,
    numeric_col: str,
    hypo_test: str,
    mu0_text: str,
    alternative: str,
    include_graph: bool,
    bootstrap_samples: int,
    cat_col1: str | None,
    cat_vals1: list[str],
    name_group1: str,
    cat_col2: str | None,
    cat_vals2: list[str],
    name_group2: str,
    cat_col3: str | None,
    cat_vals3: list[str],
    plot_type: str,
    correction: bool,
    test_type: str,
) -> Tuple[pd.DataFrame, object | None]:
    """
    High-level dispatcher used by the Hypothesis Testing tab.

    Dispatches on the ``hypo_test`` display name to one of four core
    routines (one-sample t-test, two-sample t-test, two-group variance
    test, one-way ANOVA). Every result table is rounded via
    ``_round_table`` before being returned.

    Args:
        df: The loaded dataset; ``None`` raises ``ValueError``.
        numeric_col: Name of the numeric column under test.
        hypo_test: Display name selecting the test (see branches below).
        mu0_text: Raw text for μ₀ (used by the one-sample test only).
        alternative: Alternative-hypothesis spec, forwarded to the t-tests.
        include_graph: Whether the core routine should produce a figure.
        bootstrap_samples: Bootstrap resample count, forwarded to core tests.
        cat_col1 / cat_vals1 / name_group1: Column, selected categories and
            display name defining group 1 (two-sample branches).
        cat_col2 / cat_vals2 / name_group2: Same for group 2.
        cat_col3 / cat_vals3: Grouping column and categories for ANOVA.
        plot_type: Plot style, forwarded to the two-sample t-test.
        correction: Flag forwarded to ``two_sample_ttest``.
            NOTE(review): exact semantics live in core.hypothesis_tests —
            presumably a Welch/unequal-variance correction; confirm there.
        test_type: Variance-test variant, forwarded to ``variance_test``.

    Returns:
        (result_table, figure_or_none)

    Raises:
        ValueError: Missing data, non-numeric μ₀, empty groups after
            filtering, or an unrecognized ``hypo_test`` value.
    """
    if df is None:
        raise ValueError("No dataset loaded.")

    # Common numeric data check: validates that the numeric column exists
    # and has data; the returned array itself is intentionally discarded.
    _ = _ensure_numeric_series(df, numeric_col)

    # ------------------------------------------------------------
    # One-sample t-test
    # ------------------------------------------------------------
    if hypo_test == "One sample Student's t-test":
        if not mu0_text.strip():
            raise ValueError("μ₀ must be specified for the one-sample t-test.")
        try:
            mu0 = float(mu0_text)
        except Exception:
            raise ValueError("μ₀ must be a numeric value.")

        sample = df[numeric_col].dropna().to_numpy()

        table, fig = one_sample_ttest(
            sample=sample,
            mu0=mu0,
            alternative=alternative,
            numeric_col=numeric_col,
            bootstrap_samples=bootstrap_samples,
            include_graph=include_graph,
        )
        table = _round_table(table)
        return table, fig

    # ------------------------------------------------------------
    # Two-sample t-test
    # ------------------------------------------------------------
    if hypo_test == "Two samples Student's t-test":
        group1 = _materialize_group(df, numeric_col, cat_col1, cat_vals1)
        group2 = _materialize_group(df, numeric_col, cat_col2, cat_vals2)

        # If names are empty, fall back to defaults
        name1 = name_group1 or "Group 1"
        name2 = name_group2 or "Group 2"

        table, fig = two_sample_ttest(
            group1=group1,
            group2=group2,
            numeric_col=numeric_col,
            name_group1=name1,
            name_group2=name2,
            alternative=alternative,
            correction=correction,
            plot_type=plot_type,
            bootstrap_samples=bootstrap_samples,
            include_graph=include_graph,
        )
        table = _round_table(table)
        return table, fig

    # ------------------------------------------------------------
    # Equal variance between two groups
    # ------------------------------------------------------------
    if hypo_test == "Equal variance between two groups":
        group1 = _materialize_group(df, numeric_col, cat_col1, cat_vals1)
        group2 = _materialize_group(df, numeric_col, cat_col2, cat_vals2)

        name1 = name_group1 or "Group 1"
        name2 = name_group2 or "Group 2"

        table, fig = variance_test(
            group1=group1,
            group2=group2,
            name_group1=name1,
            name_group2=name2,
            test_type=test_type,
            include_graph=include_graph,
            bootstrap_samples=bootstrap_samples,
        )
        table = _round_table(table)
        return table, fig

    # ------------------------------------------------------------
    # One-way ANOVA
    # ------------------------------------------------------------
    if hypo_test == "One-way ANOVA":
        if cat_col3 is None:
            raise ValueError("A categorical column must be selected for ANOVA.")

        if cat_col3 not in df.columns:
            raise ValueError(
                f"Categorical column '{cat_col3}' not found in the dataset."
            )

        if not cat_vals3:
            raise ValueError("At least one category must be selected for ANOVA.")

        # Cast selected labels to the column's dtype so string selections
        # from the UI still match numeric categories, then keep only the
        # two columns the ANOVA routine needs.
        cat_series = pd.Series(cat_vals3).astype(df[cat_col3].dtype)
        data_group = df[df[cat_col3].isin(cat_series)][[numeric_col, cat_col3]].dropna()

        table, fig = one_way_anova(
            data_group=data_group,
            numeric_col=numeric_col,
            cat_col=cat_col3,
        )
        table = _round_table(table)
        return table, fig

    # ------------------------------------------------------------
    # Fallback
    # ------------------------------------------------------------
    raise ValueError(f"Unknown hypothesis test: {hypo_test}")
controllers/linear_regression_controller.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from typing import List, Optional, Sequence, Tuple
4
+
5
+ from matplotlib.figure import Figure
6
+ import numpy as np
7
+ import pandas as pd
8
+
9
+ from core.linear_regression import run_linear_regression as _run_linear_regression
10
+
11
+
12
+ def _select_working_dataframe(
13
+ df: Optional[pd.DataFrame],
14
+ filtered_df: Optional[pd.DataFrame],
15
+ ) -> pd.DataFrame:
16
+ """
17
+ Use the filtered dataframe if it is non-empty; otherwise fall back to the
18
+ original dataframe. This mirrors the behaviour used in other tabs.
19
+ """
20
+ if df is None:
21
+ raise ValueError("No dataset loaded.")
22
+
23
+ if filtered_df is not None and not filtered_df.empty:
24
+ return filtered_df
25
+
26
+ if df.empty:
27
+ raise ValueError("The dataset is empty.")
28
+
29
+ return df
30
+
31
+
32
+ def _parse_confidence_level(text: str) -> float:
33
+ """
34
+ Parse a confidence level like '0.95' into an alpha value for statsmodels.
35
+
36
+ Returns
37
+ -------
38
+ alpha : float
39
+ Significance level (e.g. 0.05 for a 95% confidence level).
40
+ """
41
+ s = str(text).strip()
42
+ if not s:
43
+ raise ValueError("Confidence level is required (e.g. 0.95).")
44
+ try:
45
+ level = float(s)
46
+ except ValueError as exc:
47
+ raise ValueError("Confidence level must be a numeric value between 0 and 1.") from exc
48
+
49
+ if not (0 < level < 1):
50
+ raise ValueError("Confidence level must be between 0 and 1 (e.g. 0.95).")
51
+
52
+ # statsmodels expects alpha, not the confidence level itself
53
+ return 1.0 - level
54
+
55
+
56
+ def _parse_range(text: str) -> Optional[np.ndarray]:
57
+ """
58
+ Parse a range string like '0, 10' into a numpy array suitable for predictions.
59
+
60
+ Returns
61
+ -------
62
+ np.ndarray or None
63
+ If the string is empty or only whitespace, returns None.
64
+ Otherwise returns a 1-D array of 100 evenly spaced values between
65
+ the parsed minimum and maximum.
66
+ """
67
+ s = str(text).strip()
68
+ if not s:
69
+ return None
70
+
71
+ parts = s.split(",")
72
+ if len(parts) != 2:
73
+ raise ValueError("Range must have the form 'min, max'.")
74
+
75
+ try:
76
+ lo = float(parts[0].strip())
77
+ hi = float(parts[1].strip())
78
+ except ValueError as exc:
79
+ raise ValueError("Range values must be numeric (e.g. '0, 10').") from exc
80
+
81
+ if lo >= hi:
82
+ raise ValueError("Range minimum must be strictly less than the maximum.")
83
+
84
+ return np.linspace(lo, hi, 100)
85
+
86
+
87
def run_linear_regression(
    *,
    df: Optional[pd.DataFrame],
    filtered_df: Optional[pd.DataFrame],
    formula_check: bool,
    formula_text: str,
    formula_latex: str,
    dependent_var: Optional[str],
    independent_vars: List[str],
    alpha_input: str,
    intercept: bool,
    graph_check: bool,
    graph_type: str,
    show_ci: bool,
    show_pi: bool,
    fit_to_obs: bool,
    x_range_text: str,
    round_digits: int = 4,
) -> Tuple[str, pd.DataFrame, Optional[Figure]]:
    """
    Controller entry point for the Linear Regression tab.

    Validates the raw UI input, parses the confidence level and the
    optional X range, delegates to the stats layer, and returns
    ``(summary_html, params_df_rounded, figure)``. Exceptions are expected
    to be caught by the tab layer and turned into user-facing messages.
    """
    working = _select_working_dataframe(df, filtered_df)

    if dependent_var is None or dependent_var == "":
        raise ValueError("Please select a dependent variable.")

    if not independent_vars:
        raise ValueError("Please select at least one independent variable.")

    wants_simple_plot = graph_check and graph_type == "Simple Regression"

    # The simple-regression plot only makes sense with a single predictor.
    if wants_simple_plot and len(independent_vars) != 1:
        raise ValueError(
            "The 'Simple Regression' graph is only available when exactly one "
            "independent variable is selected."
        )

    alpha = _parse_confidence_level(alpha_input)

    # An explicit X grid is only needed when plotting beyond the observed
    # range: Simple Regression + graph enabled + not fitting to observations.
    x_vector = _parse_range(x_range_text) if (wants_simple_plot and not fit_to_obs) else None

    summary_html, params_df, fig = _run_linear_regression(
        df=working,
        formula_check=formula_check,
        formula_text=formula_text,
        formula_latex=formula_latex,
        dependent_var=dependent_var,
        independent_vars=independent_vars,
        alpha=alpha,
        intercept=intercept,
        create_graph=graph_check,
        graph_type=graph_type,
        show_ci=show_ci,
        show_pi=show_pi,
        fit_to_obs=fit_to_obs,
        x_vector=x_vector,
    )

    # Rounding is a presentation concern, kept out of the stats layer.
    return summary_html, params_df.round(round_digits), fig
controllers/utils/__pycache__/downloads.cpython-312.pyc ADDED
Binary file (1.82 kB). View file
 
controllers/utils/__pycache__/downloads.cpython-313.pyc ADDED
Binary file (1.81 kB). View file
 
controllers/utils/downloads.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import tempfile
3
+ import gradio as gr
4
+
5
def sanitize_filename(name: str, default: str):
    """Strip characters that are illegal in filenames from *name*.

    Falls back to *default* when the input is missing, blank, or empty
    after cleaning.
    """
    if not name or not name.strip():
        return default
    cleaned = re.sub(r'[\\/*?:"<>|]', "", name).strip()
    return cleaned or default
10
+
11
+
12
def dataframe_to_csv(df, filename):
    """Write *df* to a temporary CSV file and return its path.

    Shows a Gradio warning and returns None when no table is available.
    The user-supplied *filename* is sanitized and used as the file prefix.
    """
    if df is None:
        gr.Warning("❌ No table available to download.")
        return None

    stem = sanitize_filename(filename, "descriptive_statistics")

    with tempfile.NamedTemporaryFile(
        mode="w",
        encoding="utf-8",
        suffix=".csv",
        prefix=stem + "_",
        delete=False,
    ) as handle:
        df.to_csv(handle.name, index=False)
        return handle.name
28
+
29
def figure_to_png(fig, filename: str):
    """Render *fig* to a temporary PNG file and return its path.

    Returns None when there is no figure. The user-supplied *filename*
    is sanitized before being used as a tempfile prefix — consistent with
    ``dataframe_to_csv`` — so names containing path separators or other
    illegal characters cannot make tempfile fail.
    """
    if fig is None:
        return None

    base = sanitize_filename(filename, "figure")

    tmp = tempfile.NamedTemporaryFile(
        delete=False,
        suffix=".png",
        prefix=base + "_",
    )
    # Close the handle before matplotlib writes to the path: avoids leaking
    # an open file descriptor and is required on Windows.
    tmp.close()
    fig.savefig(tmp.name, dpi=200, bbox_inches="tight")
    return tmp.name
core/__init__.py ADDED
File without changes
core/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (221 Bytes). View file
 
core/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (186 Bytes). View file
 
core/__pycache__/data_stats.cpython-312.pyc ADDED
Binary file (6.36 kB). View file
 
core/__pycache__/data_stats.cpython-313.pyc ADDED
Binary file (6.37 kB). View file
 
core/__pycache__/descriptive.cpython-313.pyc ADDED
Binary file (6.91 kB). View file
 
core/__pycache__/hypothesis_tests.cpython-312.pyc ADDED
Binary file (18.9 kB). View file
 
core/__pycache__/hypothesis_tests.cpython-313.pyc ADDED
Binary file (18.4 kB). View file
 
core/__pycache__/linear_regression.cpython-312.pyc ADDED
Binary file (12.4 kB). View file
 
core/__pycache__/linear_regression.cpython-313.pyc ADDED
Binary file (12.1 kB). View file
 
core/__pycache__/statistic_plots.cpython-313.pyc ADDED
Binary file (8.47 kB). View file
 
core/data_stats.py ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import gradio as gr
4
+ from pathlib import Path
5
+
6
+ ROUND = 4
7
+
8
def load_dataset(file):
    """
    Load a CSV or Excel file into a DataFrame.

    Parameters
    ----------
    file : object with a ``name`` attribute (e.g. a Gradio upload), or None.

    Returns
    -------
    (df, status_message) : tuple
        ``df`` is None on failure; ``status_message`` describes the outcome.
    """
    if file is None:
        return None, "No file uploaded."

    try:
        path = Path(file.name)
        # Compare case-insensitively so '.CSV' / '.XLSX' uploads also load.
        suffix = path.suffix.lower()

        if suffix == ".csv":
            df = pd.read_csv(path)
        elif suffix in (".xlsx", ".xls"):
            df = pd.read_excel(path)
        else:
            return None, "Unsupported file format."

        return df, f"Loaded dataset with {df.shape[0]} rows and {df.shape[1]} columns."

    except Exception as e:
        # Surface the error to the UI instead of crashing the app.
        return None, f"Error loading file: {e}"
31
+
32
+
33
def dataset_summary(df: pd.DataFrame):
    """Summarize every column of *df* as a tidy display table.

    Runs ``describe(include='all')``, adds a unique-value count for all
    columns, keeps only a fixed set of columns, and pre-formats numeric
    statistics as fixed-precision strings (ROUND decimals) for the UI.
    Returns None when no dataset is loaded.
    """
    if df is None:
        return None

    table = (
        df.describe(include="all")
        .transpose()
        .reset_index()
        .rename(columns={"index": "variable"})
    )

    # describe() reports 'unique' only for object columns; set it for all.
    table["unique"] = df.nunique(dropna=True).values

    # Restrict to the columns we display, in a fixed order.
    ordered = [
        "variable",
        "count",
        "unique",
        "mean",
        "std",
        "min",
        "25%",
        "50%",
        "75%",
        "max",
    ]
    table = table[[name for name in ordered if name in table.columns]]

    # Render numeric cells as fixed-precision strings for display; the
    # identifier/count columns stay untouched.
    untouched = ("variable", "count", "unique")
    for name in table.columns:
        if name in untouched:
            continue
        table[name] = table[name].apply(
            lambda cell: f"{cell:.{ROUND}f}" if isinstance(cell, (int, float)) else cell
        )

    return table
71
+
72
+
73
def variable_types(df):
    """Return a two-column table mapping each variable name to its dtype.

    Returns None when no dataset is loaded.
    """
    if df is None:
        return None

    dtype_table = df.dtypes.reset_index()
    return dtype_table.rename(columns={"index": "Variable", 0: "Type"})
82
+
83
+
84
def column_choices_single(cols: list[str]):
    """Refresh a single-select dropdown with *cols* and clear its value."""
    return gr.update(value=None, choices=cols)
86
+
87
+
88
def column_choices_multi(cols: list[str]):
    """Refresh a multi-select component with *cols* and clear the selection."""
    return gr.update(value=[], choices=cols)
90
+
91
+
92
def category_value_choices(df, col):
    """Populate the category-value picker for *col*.

    Hides the component when the dataset or column is unavailable;
    otherwise shows it with the column's sorted unique non-null values.
    """
    if df is None or col is None or col not in df.columns:
        return gr.update(visible=False, choices=[], value=[])

    options = sorted(df[col].dropna().unique().tolist())

    # value must be a list because the target component is a multiselect
    return gr.update(visible=True, choices=options, value=[])
103
+
104
+
105
def infer_column_types(df: pd.DataFrame):
    """Split columns into (numeric, categorical) name lists, each sorted."""
    numeric = df.select_dtypes(include=[np.number]).columns.tolist()
    categorical = df.select_dtypes(exclude=[np.number]).columns.tolist()
    return sorted(numeric), sorted(categorical)
110
+
111
+
112
def apply_category_filters(
    df,
    cat_cols,
    val1,
    val2,
    val3,
):
    """Filter *df* by up to three categorical columns.

    ``cat_cols`` pairs positionally with the three value selections; a
    column with no selected values is skipped. Returns
    ``(filtered_df, status_message)``; when nothing is selected the full
    dataset is returned unchanged (as a copy).
    """
    if df is None:
        return None, "❌ No data loaded."

    selections = [val1, val2, val3]
    if not cat_cols or not any(selections):
        return df.copy(), "⚠️ No filters selected. Using full dataset."

    result = df.copy()
    for column, chosen in zip(cat_cols[:3], selections):
        if chosen:
            result = result[result[column].isin(chosen)]

    return result, f"✅ Filter applied. Rows remaining: {len(result)}"
134
+
135
def reclassify_as_categorical(state, column):
    """Move *column* from the numeric list to the categorical list on *state*.

    Returns (ok, message); active filters are reset on success because they
    may reference the column's previous typing.
    """
    if not (column and column in state.numeric_cols):
        return False, f"Column '{column}' is not numeric."

    state.numeric_cols.remove(column)
    state.categorical_cols.append(column)
    state.active_filters = {}  # reset filters
    return True, f"Column '{column}' reclassified as categorical."
142
+
143
+
144
def reclassify_as_numeric(state, column):
    """Move *column* from the categorical list to the numeric list on *state*.

    Returns (ok, message); active filters are reset on success because they
    may reference the column's previous typing.
    """
    if not (column and column in state.categorical_cols):
        return False, f"Column '{column}' is not categorical."

    state.categorical_cols.remove(column)
    state.numeric_cols.append(column)
    state.active_filters = {}  # reset filters
    return True, f"Column '{column}' reclassified as numeric."
core/estimation/__init__.py ADDED
File without changes
core/estimation/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (232 Bytes). View file
 
core/estimation/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (196 Bytes). View file
 
core/estimation/__pycache__/descriptive.cpython-312.pyc ADDED
Binary file (7 kB). View file
 
core/estimation/__pycache__/descriptive.cpython-313.pyc ADDED
Binary file (6.92 kB). View file
 
core/estimation/__pycache__/graphical_analysis.cpython-312.pyc ADDED
Binary file (8.6 kB). View file
 
core/estimation/__pycache__/graphical_analysis.cpython-313.pyc ADDED
Binary file (8.48 kB). View file
 
core/estimation/descriptive.py ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ui/stats/estimation/descriptive.py
2
+
3
+ from functools import lru_cache
4
+ import numpy as np
5
+ import pandas as pd
6
+ from scipy.stats import (
7
+ trim_mean,
8
+ gmean,
9
+ hmean,
10
+ skew,
11
+ kurtosis,
12
+ norm
13
+ )
14
+ from scipy.special import loggamma
15
+ from scipy.integrate import quad
16
+ from scipy.stats import median_abs_deviation
17
+
18
+ # ------------------------------------------------------------------
19
+ # Bias-correction constants (user-approved implementations)
20
+ # ------------------------------------------------------------------
21
+
22
@lru_cache(maxsize=None)
def c4(n: int) -> float:
    """Bias-correction constant c4(n) for the sample standard deviation.

    c4(n) = sqrt(2/(n-1)) * Gamma(n/2) / Gamma((n-1)/2), evaluated in
    log-space via loggamma for numerical stability at large n.
    """
    log_c4 = (
        np.log(np.sqrt(2 / (n - 1)))
        + loggamma(n / 2)
        - loggamma((n - 1) / 2)
    )
    return np.exp(log_c4)
30
+
31
+
32
@lru_cache(maxsize=None)
def d2(n: int) -> float:
    """Bias-correction constant d2(n) for the sample range.

    Equals the expected range of n standard-normal observations, obtained
    by numerically integrating 1 - (1 - F(x))^n - F(x)^n over the real
    line, where F is the standard normal CDF.
    """
    def coverage(x, m):
        cdf = norm.cdf(x)
        return 1 - (1 - cdf) ** m - cdf ** m

    return quad(coverage, -np.inf, np.inf, args=(n,))[0]
37
+
38
+
39
+ # ------------------------------------------------------------------
40
+ # Main computation function
41
+ # ------------------------------------------------------------------
42
+
43
def compute_descriptive_statistics(
    data,
    *,
    quantile_probs=(0.25, 0.5, 0.75),
    trim_alpha=None,
    winsor_limits=None,
    weights=None,
):
    """
    Compute all descriptive statistics for a single numeric variable.

    Parameters
    ----------
    data : array-like
        Numeric sample; NaNs are dropped before any computation.
    quantile_probs : sequence of float
        Probabilities for the quantile rows (default: quartiles).
    trim_alpha : float or None
        If given, adds a trimmed-mean row with this trimming proportion.
    winsor_limits : pair of float or None
        If given, adds a winsorized-mean row with these limits.
    weights : array-like or None
        If given, adds a weighted-mean row (never replaces the plain mean).
        NOTE(review): the weights are aligned via ``.loc[x.index]``, which
        assumes *weights* shares the index of *data* — confirm at call sites.

    Returns
    -------
    pandas.DataFrame
        Columns: "Statistic Type", "Measure", "Value", "Bias Corrected",
        "Robust". "Robust" is a 0/1 flag; "Bias Corrected" is NaN for
        measures without a correction.
    """

    # --- preparation ------------------------------------------------
    x = pd.Series(data).dropna().astype(float)
    n = len(x)

    rows = []

    # ----------------------------------------------------------------
    # Quantiles
    # ----------------------------------------------------------------
    probs = np.atleast_1d(quantile_probs)
    q_vals = np.quantile(x, probs)
    for p, q in zip(probs, q_vals):
        rows.append([
            "Quantiles",
            f"Q{p}",
            q,
            np.nan,
            0
        ])

    # ----------------------------------------------------------------
    # Central Tendency
    # ----------------------------------------------------------------

    mean = x.mean()
    median = np.median(x)
    # 25%-trimmed mean on each side, i.e. the mean of the middle 50%.
    iq_mean = trim_mean(x, 0.25)

    rows.extend([
        ["Central Tendency", "Mean", mean, np.nan, 0],
        ["Central Tendency", "Median", median, np.nan, 1],
        ["Central Tendency", "Interquartile Mean", iq_mean, np.nan, 1],
    ])

    # Weighted mean (additional, never replaces mean)
    if weights is not None:
        # Align weights to the surviving (non-NaN) observations.
        w = pd.Series(weights).loc[x.index].astype(float)
        w_mean = np.average(x, weights=w)
        rows.append([
            "Central Tendency",
            "Weighted Mean",
            w_mean,
            np.nan,
            0
        ])

    # Trimmed mean
    if trim_alpha is not None:
        t_mean = trim_mean(x, trim_alpha)
        rows.append([
            "Central Tendency",
            f"Trimmed Mean ({trim_alpha})",
            t_mean,
            np.nan,
            1
        ])

    # Winsorized mean
    if winsor_limits is not None:
        from scipy.stats.mstats import winsorize
        xw = winsorize(x, winsor_limits)
        rows.append([
            "Central Tendency",
            f"Winsorized Mean {tuple(winsor_limits)}",
            np.mean(xw),
            np.nan,
            1
        ])

    # Geometric & harmonic means — defined only for strictly positive data.
    if np.all(x > 0):
        rows.extend([
            ["Central Tendency", "Geometric Mean", gmean(x), np.nan, 0],
            ["Central Tendency", "Harmonic Mean", hmean(x), np.nan, 0],
        ])

    # ----------------------------------------------------------------
    # Dispersion
    # ----------------------------------------------------------------

    var0 = np.var(x, ddof=0)
    var1 = np.var(x, ddof=1)  # unbiased
    std0 = np.std(x, ddof=0)
    std1 = np.std(x, ddof=1)
    rng = x.max() - x.min()
    iqr = np.subtract(*np.percentile(x, [75, 25]))
    mad = median_abs_deviation(x)
    aad = np.mean(np.abs(x - mean))

    # "Bias Corrected" column: each raw estimate paired with its
    # bias-corrected counterpart (c4/d2 corrections, and normal-theory
    # rescalings of AAD/IQR/MAD to estimate sigma).
    rows.extend([
        ["Dispersion", "Variance (ddof=0)", var0, var1, 0],
        ["Dispersion", "Variance (ddof=1)", var1, var1, 0],
        ["Dispersion", "Std (ddof=0)", std0, std0 * np.sqrt(n / (n - 1)) / c4(n), 0],
        ["Dispersion", "Std (ddof=1)", std1, std1 / c4(n), 0],
        ["Dispersion", "Range", rng, rng / d2(n), 0],
        ["Dispersion", "AAD", aad, aad * np.sqrt(np.pi / 2), 0],
        ["Dispersion", "IQR", iqr, iqr / (2 * norm.ppf(0.75)), 1],
        ["Dispersion", "MAD", mad, mad / norm.ppf(0.75), 1],
    ])

    # ----------------------------------------------------------------
    # Shape
    # ----------------------------------------------------------------

    # Both the plain central-moment estimators and the bias-adjusted
    # k-statistic variants (scipy's bias=False) are reported side by side.
    rows.extend([
        ["Shape", "Skewness (central moments)", skew(x), np.nan, 0],
        ["Shape", "Skewness (k-statistic)", skew(x, bias=False), np.nan, 0],
        ["Shape", "Kurtosis (central moments)", kurtosis(x, fisher=False), np.nan, 0],
        ["Shape", "Kurtosis (k-statistic)", kurtosis(x, fisher=False, bias=False), np.nan, 0],
        ["Shape", "Excess Kurtosis (central moments)", kurtosis(x, fisher=False) - 3, np.nan, 0],
        ["Shape", "Excess Kurtosis (k-statistic)", kurtosis(x, fisher=False, bias=False) - 3, np.nan, 0],
    ])

    # ----------------------------------------------------------------
    # Final table
    # ----------------------------------------------------------------

    return pd.DataFrame(
        rows,
        columns=[
            "Statistic Type",
            "Measure",
            "Value",
            "Bias Corrected",
            "Robust",
        ],
    )
core/estimation/graphical_analysis.py ADDED
@@ -0,0 +1,260 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from typing import Iterable, Optional, Tuple
4
+
5
+ import numpy as np
6
+ import matplotlib.pyplot as plt
7
+ import seaborn as sns
8
+ from scipy.stats import norm
9
+
10
+
11
+ Interval = Optional[Tuple[float, float]]
12
+
13
+
14
def _plot_hist_or_pmf(
    ax,
    *,
    data: np.ndarray,
    graph_type: str,
    var_name: str,
    add_kde: bool,
    add_data: bool,
):
    """Render the primary histogram or empirical PMF onto *ax*.

    Dispatches on *graph_type* ("Histogram" or
    "Empirical Probability Mass Function").  A KDE curve is overlaid when
    *add_kde* is set, and a rug of the raw observations is added at the
    bottom when *add_data* is set.

    Raises:
        ValueError: if *graph_type* is neither of the two supported kinds.
    """
    sns.set_style("whitegrid")

    if graph_type == "Histogram":
        sns.histplot(
            data,
            kde=add_kde,
            stat="density",
            color="rebeccapurple",
            alpha=0.5,
            ax=ax,
        )
        y_label = "Density"
        title = f"Distribution of {var_name}"
    elif graph_type == "Empirical Probability Mass Function":
        levels, freq = np.unique(data, return_counts=True)
        ax.stem(
            levels,
            freq / freq.sum(),
            basefmt="rebeccapurple",
            linefmt="rebeccapurple",
        )
        if add_kde:
            sns.kdeplot(data, ax=ax, color="rebeccapurple")
        y_label = "Probability"
        title = f"Empirical PMF of {var_name}"
    else:
        raise ValueError(f"Unknown graph type: {graph_type}")

    ax.set_ylabel(y_label)
    ax.set_xlabel(var_name)
    ax.set_title(title)

    if add_data:
        # NOTE(review): seaborn's rugplot `height` is an axes fraction
        # (0..1); scaling it by the ylim upper bound reproduces the
        # original behavior but can exceed 1.0 for peaked densities —
        # confirm this is intentional.
        _, y_top = ax.get_ylim()
        sns.rugplot(data, height=0.1 * y_top, ax=ax, color="black")
56
+
57
+
58
+ def _plot_normal_density(
59
+ ax,
60
+ *,
61
+ hat_mu: float,
62
+ hat_sigma: float,
63
+ color: str = "black",
64
+ ):
65
+ if hat_sigma <= 0:
66
+ return
67
+
68
+ y_vect = np.linspace(hat_mu - 3 * hat_sigma, hat_mu + 3 * hat_sigma, 200)
69
+ ax.plot(
70
+ y_vect,
71
+ norm.pdf(y_vect, hat_mu, hat_sigma),
72
+ color=color,
73
+ linestyle="--",
74
+ label="Normal density",
75
+ )
76
+ ax.legend()
77
+
78
+
79
+ def _plot_interval_band(
80
+ ax,
81
+ *,
82
+ y_val: float,
83
+ interval: Tuple[float, float],
84
+ label: str,
85
+ color: str,
86
+ ):
87
+ low, high = interval
88
+ ax.hlines(y_val, low, high, color=color, linewidth=2)
89
+ ax.scatter((low + high) / 2.0, y_val, color=color, s=30, zorder=5)
90
+ ax.text(
91
+ high,
92
+ y_val,
93
+ f" {label}",
94
+ va="center",
95
+ fontsize=9,
96
+ bbox=dict(
97
+ boxstyle="round,pad=0.2",
98
+ facecolor="whitesmoke",
99
+ edgecolor="gray",
100
+ ),
101
+ )
102
+
103
+
104
def plot_histogram_with_overlays(
    *,
    data: Iterable[float],
    graph_type: str,
    var_name: str,
    add_kde: bool,
    add_data: bool,
    add_normal: bool,
    hat_mu: Optional[float],
    hat_sigma: Optional[float],
    ci_mean_interval: Interval,
    ci_median_interval: Interval,
    pi_interval: Interval,
):
    """Build the histogram / PMF figure with its optional overlays.

    The top axis always carries the histogram (or empirical PMF) plus the
    optional KDE, rug, and fitted-Normal curves.  When at least one of the
    three intervals is supplied, a second shared-x axis is added below and
    each given interval is drawn there as a labelled horizontal band.

    Returns:
        matplotlib.figure.Figure: the assembled figure.
    """
    data = np.asarray(data)

    intervals = (ci_mean_interval, ci_median_interval, pi_interval)
    has_intervals = any(iv is not None for iv in intervals)

    # The second panel is only allocated when there is an interval to show.
    if has_intervals:
        fig, (ax_main, ax_bands) = plt.subplots(
            2,
            1,
            sharex=True,
            figsize=(8, 6),
        )
    else:
        fig, ax_main = plt.subplots(1, 1, figsize=(8, 4))
        ax_bands = None

    _plot_hist_or_pmf(
        ax_main,
        data=data,
        graph_type=graph_type,
        var_name=var_name,
        add_kde=add_kde,
        add_data=add_data,
    )

    if add_normal and hat_mu is not None and hat_sigma is not None:
        _plot_normal_density(ax_main, hat_mu=hat_mu, hat_sigma=hat_sigma)

    # Interval annotations (confidence / prediction) on the lower strip.
    if has_intervals and ax_bands is not None:
        ax_bands.set_yticks([])
        ax_bands.set_xlabel(var_name)
        ax_bands.set_ylim(0, 0.5)

        # Fixed vertical slots: CI-mean on top, CI-median just below it,
        # prediction interval near the bottom of the strip.
        ci_top = 0.4
        band_specs = [
            (ci_mean_interval, ci_top, "CI Mean", "blue"),
            (ci_median_interval, ci_top - 0.1, "CI Median", "green"),
            (pi_interval, 0.1, "Prediction Interval", "darkred"),
        ]
        for iv, height, band_label, band_color in band_specs:
            if iv is not None:
                _plot_interval_band(
                    ax_bands,
                    y_val=height,
                    interval=iv,
                    label=band_label,
                    color=band_color,
                )

    fig.tight_layout()
    return fig
187
+
188
+
189
def plot_ecdf(
    *,
    data: Iterable[float],
    var_name: str,
    alpha: float,
    add_conf_band: bool,
    add_normal: bool,
    hat_mu: Optional[float],
    hat_sigma: Optional[float],
):
    """Plot the ECDF of *data*, optionally with a DKW confidence band and a
    fitted Normal CDF overlay.

    Args:
        data: sample values.
        var_name: x-axis label.
        alpha: significance level for the DKW band (band level is 1 - alpha).
        add_conf_band: draw the Dvoretzky-Kiefer-Wolfowitz band when True.
        add_normal: overlay the Normal(hat_mu, hat_sigma) CDF when True and
            both parameters are supplied with hat_sigma > 0.

    Returns:
        matplotlib.figure.Figure: the ECDF figure.
    """
    from statsmodels.distributions.empirical_distribution import ECDF

    sample = np.asarray(data)
    emp = ECDF(sample)

    fig, ax = plt.subplots(figsize=(8, 5))

    # Step function plus the individual jump points.
    ax.step(
        emp.x,
        emp.y,
        where="post",
        color="rebeccapurple",
        linewidth=2,
        label="ECDF",
    )
    ax.scatter(emp.x, emp.y, color="rebeccapurple", s=10, alpha=0.6)

    if add_conf_band:
        # DKW inequality: P(sup |F_n - F| > eps) <= 2 exp(-2 n eps^2),
        # solved for eps at level alpha.
        n_obs = len(sample)
        eps = np.sqrt(np.log(2.0 / alpha) / (2.0 * n_obs))
        ax.fill_between(
            emp.x,
            np.clip(emp.y - eps, 0.0, 1.0),
            np.clip(emp.y + eps, 0.0, 1.0),
            step="post",
            color="plum",
            alpha=0.4,
            label="DKW CI",
        )

    overlay_normal = (
        add_normal
        and hat_mu is not None
        and hat_sigma is not None
        and hat_sigma > 0
    )
    if overlay_normal:
        grid = np.linspace(hat_mu - 3.0 * hat_sigma, hat_mu + 3.0 * hat_sigma, 200)
        ax.plot(
            grid,
            norm.cdf(grid, hat_mu, hat_sigma),
            color="black",
            linestyle="--",
            linewidth=2,
            label="Normal CDF",
        )
        # Widen the x-range so both the data and the Normal curve fit.
        ax.set_xlim(
            min(sample.min(), grid.min()) - 0.1,
            max(sample.max(), grid.max()) + 0.1,
        )
    else:
        ax.set_xlim(sample.min() - 0.1, sample.max() + 0.1)

    ax.set_title("Empirical Cumulative Distribution Function", fontsize=14)
    ax.set_xlabel(var_name, fontsize=12)
    ax.set_ylabel("ECDF", fontsize=12)
    ax.set_ylim(0, 1.05)
    ax.grid(True, linestyle="--", alpha=0.5)
    ax.legend(loc="lower right", fontsize=10)

    fig.tight_layout()
    return fig
core/estimation/inference/__pycache__/ci.cpython-312.pyc ADDED
Binary file (552 Bytes). View file