EnYa32 committed on
Commit
fef5b3a
·
verified ·
1 Parent(s): 6a9ee23

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +61 -152
src/streamlit_app.py CHANGED
@@ -4,178 +4,87 @@ import streamlit as st
4
  import joblib
5
  from pathlib import Path
6
 
7
- # -------------------------
8
- # Page config
9
- # -------------------------
10
- st.set_page_config(
11
- page_title='Clustering Predictor (GMM / KMeans)',
12
- page_icon='🧩',
13
- layout='centered'
14
- )
15
-
16
- st.title('🧩 Clustering Predictor')
17
- st.write('Predict cluster labels using saved preprocessing (Scaler + PCA) and a clustering model.')
18
-
19
- # -------------------------
20
- # Paths
21
- # -------------------------
22
  BASE_DIR = Path(__file__).resolve().parent
23
 
24
  FEATURES_PATH = BASE_DIR / 'feature_names.pkl'
25
  SCALER_PATH = BASE_DIR / 'scaler.pkl'
26
  PCA_PATH = BASE_DIR / 'pca.pkl'
27
-
28
- KMEANS_PATH = BASE_DIR / 'kmeans_model.pkl'
29
  GMM_PATH = BASE_DIR / 'gmm_model.pkl'
30
 
31
- # -------------------------
32
- # Load assets
33
- # -------------------------
34
  @st.cache_resource
35
  def load_assets():
36
- required = [FEATURES_PATH, SCALER_PATH, PCA_PATH]
37
- missing = [p.name for p in required if not p.exists()]
38
  if missing:
39
- raise FileNotFoundError(
40
- f'Missing required files in repo root: {missing}. Put them next to app.py.'
41
- )
42
 
43
  feature_names = joblib.load(FEATURES_PATH)
44
  scaler = joblib.load(SCALER_PATH)
45
  pca = joblib.load(PCA_PATH)
46
-
47
- models = {}
48
- if GMM_PATH.exists():
49
- models['GMM (best)'] = joblib.load(GMM_PATH)
50
- if KMEANS_PATH.exists():
51
- models['KMeans'] = joblib.load(KMEANS_PATH)
52
-
53
- if not models:
54
- raise FileNotFoundError(
55
- "No model found. Upload 'gmm_model_k9.pkl' (and optionally 'kmeans_model_k9.pkl') next to app.py."
56
  )
57
 
58
- return feature_names, scaler, pca, models
59
 
60
  try:
61
- feature_names, scaler, pca, models = load_assets()
62
  except Exception as e:
63
  st.error(str(e))
64
  st.stop()
65
 
66
- # -------------------------
67
- # Model selection
68
- # -------------------------
69
- model_name = st.selectbox('Select model', list(models.keys()), index=0)
70
- model = models[model_name]
71
-
72
- def predict_from_features(df_features: pd.DataFrame) -> np.ndarray:
73
- # Ensure correct order
74
- df_features = df_features[feature_names].copy()
75
-
76
- # Convert to numeric
77
- for c in df_features.columns:
78
- df_features[c] = pd.to_numeric(df_features[c], errors='coerce')
79
-
80
- if df_features.isna().any().any():
81
- bad_cols = df_features.columns[df_features.isna().any()].tolist()
82
- raise ValueError(f'NaN values found in columns: {bad_cols}. Please provide valid numeric values.')
83
-
84
- X_scaled = scaler.transform(df_features)
85
- X_pca = pca.transform(X_scaled)
86
- preds = model.predict(X_pca)
87
- return preds
88
-
89
- # -------------------------
90
- # UI Tabs
91
- # -------------------------
92
- tab_csv, tab_single = st.tabs(['📁 CSV Upload', '🧮 Single Prediction'])
93
-
94
- # =========================
95
- # Tab 1: CSV Upload
96
- # =========================
97
- with tab_csv:
98
- st.subheader('Upload a CSV and predict clusters')
99
- uploaded = st.file_uploader('Upload CSV', type=['csv'])
100
-
101
- if uploaded is not None:
102
- try:
103
- df_up = pd.read_csv(uploaded)
104
-
105
- # id handling (optional)
106
- id_col = None
107
- if 'id' in df_up.columns:
108
- id_col = 'id'
109
- elif 'Id' in df_up.columns:
110
- id_col = 'Id'
111
-
112
- ids = df_up[id_col].copy() if id_col else None
113
- X_up = df_up.drop(columns=[id_col], errors='ignore')
114
-
115
- missing_cols = [c for c in feature_names if c not in X_up.columns]
116
- if missing_cols:
117
- st.error(f'Missing required columns: {missing_cols}')
118
- st.stop()
119
-
120
- X_up = X_up[feature_names]
121
- preds = predict_from_features(X_up)
122
-
123
- out = pd.DataFrame({'Predicted': preds})
124
- if ids is not None:
125
- out.insert(0, 'Id', ids)
126
-
127
- st.success('✅ Predictions created successfully.')
128
- st.dataframe(out.head(30), use_container_width=True)
129
-
130
- st.download_button(
131
- 'Download predictions as CSV',
132
- data=out.to_csv(index=False).encode('utf-8'),
133
- file_name='predictions.csv',
134
- mime='text/csv'
135
- )
136
-
137
- st.subheader('Cluster distribution')
138
- st.write(pd.Series(preds).value_counts().sort_index())
139
-
140
- except Exception as e:
141
- st.error(str(e))
142
-
143
- else:
144
- st.info('Upload a CSV file to generate cluster predictions.')
145
-
146
- # =========================
147
- # Tab 2: Single Prediction
148
- # =========================
149
- with tab_single:
150
- st.subheader('Enter one row of features and predict its cluster')
151
-
152
- # Optional: user-friendly note
153
- st.caption('Tip: If you don’t know values, use a row from your dataset as an example.')
154
-
155
- # Build form inputs dynamically
156
- with st.form('single_pred_form'):
157
- cols = st.columns(2)
158
- values = {}
159
-
160
- for i, feat in enumerate(feature_names):
161
- # Default 0.0 is safe; you can also set dataset mean if you want
162
- if i % 2 == 0:
163
- values[feat] = cols[0].number_input(feat, value=0.0)
164
- else:
165
- values[feat] = cols[1].number_input(feat, value=0.0)
166
-
167
- submitted = st.form_submit_button('Predict cluster')
168
-
169
- if submitted:
170
- try:
171
- one_row = pd.DataFrame([values], columns=feature_names)
172
- pred = predict_from_features(one_row)[0]
173
- st.success(f'✅ Predicted cluster: **{int(pred)}**')
174
- except Exception as e:
175
- st.error(str(e))
176
-
177
- # -------------------------
178
- # Footer
179
- # -------------------------
180
  with st.expander('Show expected feature columns'):
181
- st.write(feature_names)
 
 
 
 
 
 
4
  import joblib
5
  from pathlib import Path
6
 
7
# --- Page chrome -------------------------------------------------------
st.set_page_config(
    page_title='Clustering Predictor (GMM)',
    page_icon='🧩',
    layout='centered',
)

st.title('🧩 Clustering Predictor (GMM)')
st.write('Single-row cluster prediction using saved preprocessing: StandardScaler → PCA → GaussianMixture.')

# --- Artifact locations: all pickles live next to this script ----------
BASE_DIR = Path(__file__).resolve().parent

FEATURES_PATH = BASE_DIR / 'feature_names.pkl'  # ordered list of training feature names
SCALER_PATH = BASE_DIR / 'scaler.pkl'           # fitted scaler (applied before PCA)
PCA_PATH = BASE_DIR / 'pca.pkl'                 # fitted PCA transformer
GMM_PATH = BASE_DIR / 'gmm_model.pkl'           # fitted GaussianMixture model
18
 
 
 
 
19
@st.cache_resource
def load_assets():
    """Load the persisted preprocessing artifacts and the GMM model.

    Returns:
        tuple: (feature_names, scaler, pca, model), all unpickled via joblib.

    Raises:
        FileNotFoundError: when any required pickle is absent.
        ValueError: when the feature list length disagrees with the PCA
            input width (artifacts from different training runs).
    """
    required = (FEATURES_PATH, SCALER_PATH, PCA_PATH, GMM_PATH)
    missing = [path.name for path in required if not path.exists()]
    if missing:
        raise FileNotFoundError(f'Missing files in repo root: {missing}. Put them next to app.py.')

    feature_names = joblib.load(FEATURES_PATH)
    scaler = joblib.load(SCALER_PATH)
    pca = joblib.load(PCA_PATH)
    model = joblib.load(GMM_PATH)

    # Hard safety check: the saved feature list must match what PCA was fit on.
    if hasattr(pca, 'n_features_in_'):
        expected = int(pca.n_features_in_)
        if len(feature_names) != expected:
            raise ValueError(
                f'Feature mismatch: feature_names has {len(feature_names)} features, '
                f'but PCA expects {expected}. '
                'Re-export feature_names.pkl and pca.pkl from the same training run.'
            )

    return feature_names, scaler, pca, model
39
 
40
# Abort rendering early — with a readable message instead of a traceback —
# if any artifact is missing or inconsistent.
try:
    feature_names, scaler, pca, model = load_assets()
except Exception as exc:
    st.error(str(exc))
    st.stop()
45
 
46
def predict_cluster(values_dict: dict) -> int:
    """Map one row of raw feature values to a GMM cluster label.

    The row is coerced to numeric, scaled, projected through PCA, and
    scored by the loaded model — mirroring the training pipeline.

    Raises:
        ValueError: when any value cannot be interpreted as a number.
    """
    row = pd.DataFrame([values_dict], columns=feature_names)

    # Coerce every column; unparseable entries become NaN.
    row = row.apply(pd.to_numeric, errors='coerce')

    nan_cols = row.isna().any()
    if nan_cols.any():
        bad = row.columns[nan_cols].tolist()
        raise ValueError(f'NaN values found in columns: {bad}. Please provide valid numeric values.')

    # scaler -> PCA -> model, in the same order used at training time
    projected = pca.transform(scaler.transform(row))
    return int(model.predict(projected)[0])
61
+
62
st.subheader('🧮 Single Prediction')
st.caption('Tip: Use a real row from your dataset for realistic values (all zeros may be unrealistic).')

with st.form('single_pred_form'):
    # Alternate the inputs between two columns to keep the form compact.
    left, right = st.columns(2)
    values = {}

    for idx, feature in enumerate(feature_names):
        target = left if idx % 2 == 0 else right
        values[feature] = target.number_input(feature, value=0.0)

    submitted = st.form_submit_button('Predict cluster')

if submitted:
    try:
        st.success(f'✅ Predicted cluster: **{predict_cluster(values)}**')
    except Exception as exc:
        st.error(str(exc))
83
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
# Reference panels for diagnosing artifact/app mismatches.
with st.expander('Show expected feature columns'):
    st.write(feature_names)

with st.expander('Debug shapes (advanced)'):
    expected_in = getattr(pca, 'n_features_in_', 'NA')
    n_components = getattr(pca, 'n_components_', 'NA')
    st.write('Number of input features:', len(feature_names))
    st.write('PCA expects n_features_in_:', expected_in)
    st.write('PCA output components:', n_components)