Spaces:

EnYa32
/

UnsupervisedCustumerPrediction

Sleeping

App Files Community

EnYa32 committed on Dec 27, 2025

Commit

fef5b3a

verified ·

1 Parent(s): 6a9ee23

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +61 -152

src/streamlit_app.py CHANGED Viewed

@@ -4,178 +4,87 @@ import streamlit as st
 import joblib
 from pathlib import Path
-# -------------------------
-# Page config
-# -------------------------
-st.set_page_config(
-    page_title='Clustering Predictor (GMM / KMeans)',
-    page_icon='🧩',
-    layout='centered'
-)
-st.title('🧩 Clustering Predictor')
-st.write('Predict cluster labels using saved preprocessing (Scaler + PCA) and a clustering model.')
-# -------------------------
-# Paths
-# -------------------------
 BASE_DIR = Path(__file__).resolve().parent
 FEATURES_PATH = BASE_DIR / 'feature_names.pkl'
 SCALER_PATH = BASE_DIR / 'scaler.pkl'
 PCA_PATH = BASE_DIR / 'pca.pkl'
-KMEANS_PATH = BASE_DIR / 'kmeans_model.pkl'
 GMM_PATH = BASE_DIR / 'gmm_model.pkl'
-# -------------------------
-# Load assets
-# -------------------------
 @st.cache_resource
 def load_assets():
-    required = [FEATURES_PATH, SCALER_PATH, PCA_PATH]
-    missing = [p.name for p in required if not p.exists()]
     if missing:
-        raise FileNotFoundError(
-            f'Missing required files in repo root: {missing}. Put them next to app.py.'
-        )
     feature_names = joblib.load(FEATURES_PATH)
     scaler = joblib.load(SCALER_PATH)
     pca = joblib.load(PCA_PATH)
-    models = {}
-    if GMM_PATH.exists():
-        models['GMM (best)'] = joblib.load(GMM_PATH)
-    if KMEANS_PATH.exists():
-        models['KMeans'] = joblib.load(KMEANS_PATH)
-    if not models:
-        raise FileNotFoundError(
-            "No model found. Upload 'gmm_model_k9.pkl' (and optionally 'kmeans_model_k9.pkl') next to app.py."
         )
-    return feature_names, scaler, pca, models
 try:
-    feature_names, scaler, pca, models = load_assets()
 except Exception as e:
     st.error(str(e))
     st.stop()
-# -------------------------
-# Model selection
-# -------------------------
-model_name = st.selectbox('Select model', list(models.keys()), index=0)
-model = models[model_name]
-def predict_from_features(df_features: pd.DataFrame) -> np.ndarray:
-    # Ensure correct order
-    df_features = df_features[feature_names].copy()
-    # Convert to numeric
-    for c in df_features.columns:
-        df_features[c] = pd.to_numeric(df_features[c], errors='coerce')
-    if df_features.isna().any().any():
-        bad_cols = df_features.columns[df_features.isna().any()].tolist()
-        raise ValueError(f'NaN values found in columns: {bad_cols}. Please provide valid numeric values.')
-    X_scaled = scaler.transform(df_features)
-    X_pca = pca.transform(X_scaled)
-    preds = model.predict(X_pca)
-    return preds
-# -------------------------
-# UI Tabs
-# -------------------------
-tab_csv, tab_single = st.tabs(['📁 CSV Upload', '🧮 Single Prediction'])
-# =========================
-# Tab 1: CSV Upload
-# =========================
-with tab_csv:
-    st.subheader('Upload a CSV and predict clusters')
-    uploaded = st.file_uploader('Upload CSV', type=['csv'])
-    if uploaded is not None:
-        try:
-            df_up = pd.read_csv(uploaded)
-            # id handling (optional)
-            id_col = None
-            if 'id' in df_up.columns:
-                id_col = 'id'
-            elif 'Id' in df_up.columns:
-                id_col = 'Id'
-            ids = df_up[id_col].copy() if id_col else None
-            X_up = df_up.drop(columns=[id_col], errors='ignore')
-            missing_cols = [c for c in feature_names if c not in X_up.columns]
-            if missing_cols:
-                st.error(f'Missing required columns: {missing_cols}')
-                st.stop()
-            X_up = X_up[feature_names]
-            preds = predict_from_features(X_up)
-            out = pd.DataFrame({'Predicted': preds})
-            if ids is not None:
-                out.insert(0, 'Id', ids)
-            st.success('✅ Predictions created successfully.')
-            st.dataframe(out.head(30), use_container_width=True)
-            st.download_button(
-                'Download predictions as CSV',
-                data=out.to_csv(index=False).encode('utf-8'),
-                file_name='predictions.csv',
-                mime='text/csv'
-            )
-            st.subheader('Cluster distribution')
-            st.write(pd.Series(preds).value_counts().sort_index())
-        except Exception as e:
-            st.error(str(e))
-    else:
-        st.info('Upload a CSV file to generate cluster predictions.')
-# =========================
-# Tab 2: Single Prediction
-# =========================
-with tab_single:
-    st.subheader('Enter one row of features and predict its cluster')
-    # Optional: user-friendly note
-    st.caption('Tip: If you don’t know values, use a row from your dataset as an example.')
-    # Build form inputs dynamically
-    with st.form('single_pred_form'):
-        cols = st.columns(2)
-        values = {}
-        for i, feat in enumerate(feature_names):
-            # Default 0.0 is safe; you can also set dataset mean if you want
-            if i % 2 == 0:
-                values[feat] = cols[0].number_input(feat, value=0.0)
-            else:
-                values[feat] = cols[1].number_input(feat, value=0.0)
-        submitted = st.form_submit_button('Predict cluster')
-    if submitted:
-        try:
-            one_row = pd.DataFrame([values], columns=feature_names)
-            pred = predict_from_features(one_row)[0]
-            st.success(f'✅ Predicted cluster: **{int(pred)}**')
-        except Exception as e:
-            st.error(str(e))
-# -------------------------
-# Footer
-# -------------------------
 with st.expander('Show expected feature columns'):
-    st.write(feature_names)

 import joblib
 from pathlib import Path
+st.set_page_config(page_title='Clustering Predictor (GMM)', page_icon='🧩', layout='centered')
+st.title('🧩 Clustering Predictor (GMM)')
+st.write('Single-row cluster prediction using saved preprocessing: StandardScaler → PCA → GaussianMixture.')
 BASE_DIR = Path(__file__).resolve().parent
 FEATURES_PATH = BASE_DIR / 'feature_names.pkl'
 SCALER_PATH = BASE_DIR / 'scaler.pkl'
 PCA_PATH = BASE_DIR / 'pca.pkl'
 GMM_PATH = BASE_DIR / 'gmm_model.pkl'
 @st.cache_resource
 def load_assets():
+    missing = [p.name for p in [FEATURES_PATH, SCALER_PATH, PCA_PATH, GMM_PATH] if not p.exists()]
     if missing:
+        raise FileNotFoundError(f'Missing files in repo root: {missing}. Put them next to app.py.')
     feature_names = joblib.load(FEATURES_PATH)
     scaler = joblib.load(SCALER_PATH)
     pca = joblib.load(PCA_PATH)
+    model = joblib.load(GMM_PATH)
+    # Hard safety checks
+    if hasattr(pca, 'n_features_in_') and len(feature_names) != int(pca.n_features_in_):
+        raise ValueError(
+            f'Feature mismatch: feature_names has {len(feature_names)} features, '
+            f'but PCA expects {int(pca.n_features_in_)}. '
+            'Re-export feature_names.pkl and pca.pkl from the same training run.'
         )
+    return feature_names, scaler, pca, model
 try:
+    feature_names, scaler, pca, model = load_assets()
 except Exception as e:
     st.error(str(e))
     st.stop()
+def predict_cluster(values_dict: dict) -> int:
+    df_one = pd.DataFrame([values_dict], columns=feature_names)
+    # Convert to numeric safely
+    for c in df_one.columns:
+        df_one[c] = pd.to_numeric(df_one[c], errors='coerce')
+    if df_one.isna().any().any():
+        bad = df_one.columns[df_one.isna().any()].tolist()
+        raise ValueError(f'NaN values found in columns: {bad}. Please provide valid numeric values.')
+    X_scaled = scaler.transform(df_one)   # (1, 29)
+    X_pca = pca.transform(X_scaled)       # (1, 27)
+    pred = model.predict(X_pca)[0]
+    return int(pred)
+st.subheader('🧮 Single Prediction')
+st.caption('Tip: Use a real row from your dataset for realistic values (all zeros may be unrealistic).')
+with st.form('single_pred_form'):
+    cols = st.columns(2)
+    values = {}
+    for i, feat in enumerate(feature_names):
+        if i % 2 == 0:
+            values[feat] = cols[0].number_input(feat, value=0.0)
+        else:
+            values[feat] = cols[1].number_input(feat, value=0.0)
+    submitted = st.form_submit_button('Predict cluster')
+if submitted:
+    try:
+        pred = predict_cluster(values)
+        st.success(f'✅ Predicted cluster: **{pred}**')
+    except Exception as e:
+        st.error(str(e))
 with st.expander('Show expected feature columns'):
+    st.write(feature_names)
+with st.expander('Debug shapes (advanced)'):
+    st.write('Number of input features:', len(feature_names))
+    st.write('PCA expects n_features_in_:', getattr(pca, 'n_features_in_', 'NA'))
+    st.write('PCA output components:', getattr(pca, 'n_components_', 'NA'))