Spaces:

EnYa32
/

SpaceshipTitanicClassification

Sleeping

App Files Files Community

EnYa32 commited on Dec 23, 2025

Commit

abc3298

verified ·

1 Parent(s): d8564b1

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +256 -36

src/streamlit_app.py CHANGED Viewed

@@ -1,40 +1,260 @@
-import altair as alt
 import numpy as np
 import pandas as pd
 import streamlit as st
-"""
-# Welcome to Streamlit!
-Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
-If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
-forums](https://discuss.streamlit.io).
-In the meantime, below is an example of what you can do with just a few lines of code:
-"""
-num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
-num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
-indices = np.linspace(0, 1, num_points)
-theta = 2 * np.pi * num_turns * indices
-radius = indices
-x = radius * np.cos(theta)
-y = radius * np.sin(theta)
-df = pd.DataFrame({
-    "x": x,
-    "y": y,
-    "idx": indices,
-    "rand": np.random.randn(num_points),
-})
-st.altair_chart(alt.Chart(df, height=700, width=700)
-    .mark_point(filled=True)
-    .encode(
-        x=alt.X("x", axis=None),
-        y=alt.Y("y", axis=None),
-        color=alt.Color("idx", legend=None, scale=alt.Scale()),
-        size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
-    ))

 import numpy as np
 import pandas as pd
 import streamlit as st
+import joblib
+from pathlib import Path
+st.set_page_config(page_title='Spaceship Titanic - Transported Predictor', page_icon='🚀', layout='wide')
+BASE_DIR = Path(__file__).resolve().parent
+MODEL_PATH = BASE_DIR / 'spaceship_titanic_gb.pkl'
+HOMEPLANET_OPTIONS = ['Earth', 'Europa', 'Mars', 'Unknown']
+DESTINATION_OPTIONS = ['TRAPPIST-1e', '55 Cancri e', 'PSO J318.5-22', 'Unknown']
+DECK_OPTIONS = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'T', 'Unknown']
+SIDE_OPTIONS = ['P', 'S', 'Unknown']
+BOOL_UNKNOWN_OPTIONS = ['True', 'False', 'Unknown']
+SPEND_COLS = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
+@st.cache_resource
+def load_artifact():
+    if not MODEL_PATH.exists():
+        raise FileNotFoundError(
+            f'File not found: {MODEL_PATH.name}. Please upload it to the repo root (same folder as app.py).'
+        )
+    artifact = joblib.load(MODEL_PATH)
+    # Expected: {'model': ..., 'feature_columns': [...]}
+    if isinstance(artifact, dict) and 'model' in artifact and 'feature_columns' in artifact:
+        return artifact['model'], artifact['feature_columns']
+    # Fallback: if user saved only model
+    return artifact, None
+def map_bool_unknown(val):
+    # Map to numeric like your notebook: True->1, False->0, Unknown->-1
+    d = {'False': 0, 'True': 1, 'Unknown': -1, False: 0, True: 1}
+    return d.get(val, -1)
+def build_features_from_row(row_dict):
+    df = pd.DataFrame([row_dict])
+    # Ensure categorical values exist
+    for c in ['HomePlanet', 'Destination', 'Deck', 'Side']:
+        if c not in df.columns:
+            df[c] = 'Unknown'
+        df[c] = df[c].fillna('Unknown').astype(str)
+    # Boolean-like with Unknown -> -1
+    for c in ['CryoSleep', 'VIP']:
+        if c not in df.columns:
+            df[c] = 'Unknown'
+        df[c] = df[c].apply(map_bool_unknown).astype(int)
+    # Numeric columns
+    if 'Age' not in df.columns:
+        df['Age'] = np.nan
+    for c in SPEND_COLS:
+        if c not in df.columns:
+            df[c] = 0.0
+        df[c] = pd.to_numeric(df[c], errors='coerce')
+    df['Age'] = pd.to_numeric(df['Age'], errors='coerce')
+    # Group features
+    if 'GroupSize' not in df.columns:
+        df['GroupSize'] = 1
+    df['GroupSize'] = pd.to_numeric(df['GroupSize'], errors='coerce').fillna(1).astype(int)
+    df['GroupSize'] = df['GroupSize'].clip(lower=1)
+    # Feature engineering (same logic as your notebook)
+    df['TotalSpend'] = df[SPEND_COLS].sum(axis=1)
+    df['NotSpend'] = (df['TotalSpend'] == 0).astype(int)
+    df['IsAlone'] = (df['GroupSize'] == 1).astype(int)
+    # Fill remaining numeric NaNs (simple median-like fallback for single row)
+    # For single prediction, we can fill with 0 for spends and with median-ish for Age (use 0 if missing)
+    df['Age'] = df['Age'].fillna(df['Age'].median() if df['Age'].notna().any() else 0)
+    return df
+def one_hot_and_align(df_features, feature_columns):
+    df_encoded = pd.get_dummies(df_features, drop_first=True)
+    if feature_columns is None:
+        # If no feature columns are stored, we return encoded as-is (may break if columns mismatch)
+        return df_encoded
+    # Add missing columns
+    missing = [c for c in feature_columns if c not in df_encoded.columns]
+    for c in missing:
+        df_encoded[c] = 0
+    # Drop extra columns
+    extra = [c for c in df_encoded.columns if c not in feature_columns]
+    if extra:
+        df_encoded = df_encoded.drop(columns=extra)
+    # Reorder
+    df_encoded = df_encoded[feature_columns]
+    return df_encoded
+st.title('🚀 Spaceship Titanic - Transported Predictor')
+with st.expander('What does this app do?', expanded=True):
+    st.write(
+        'This app predicts whether a passenger was transported to another dimension (Transported=True/False) '
+        'based on passenger features. It uses a Gradient Boosting Classifier trained on the Spaceship Titanic dataset.'
+    )
+try:
+    model, feature_columns = load_artifact()
+except Exception as e:
+    st.error(str(e))
+    st.stop()
+tab1, tab2 = st.tabs(['Single Prediction', 'Batch CSV Prediction'])
+with tab1:
+    st.subheader('Single passenger prediction')
+    colA, colB, colC = st.columns(3)
+    with colA:
+        homeplanet = st.selectbox('HomePlanet', HOMEPLANET_OPTIONS, index=0)
+        destination = st.selectbox('Destination', DESTINATION_OPTIONS, index=0)
+        age = st.number_input('Age', min_value=0.0, max_value=100.0, value=30.0, step=1.0)
+    with colB:
+        deck = st.selectbox('Deck', DECK_OPTIONS, index=DECK_OPTIONS.index('Unknown'))
+        side = st.selectbox('Side', SIDE_OPTIONS, index=SIDE_OPTIONS.index('Unknown'))
+        cryosleep = st.selectbox('CryoSleep', BOOL_UNKNOWN_OPTIONS, index=BOOL_UNKNOWN_OPTIONS.index('Unknown'))
+        vip = st.selectbox('VIP', BOOL_UNKNOWN_OPTIONS, index=BOOL_UNKNOWN_OPTIONS.index('Unknown'))
+    with colC:
+        groupsize = st.number_input('GroupSize', min_value=1, max_value=20, value=1, step=1)
+    st.markdown('### Spending')
+    s1, s2, s3, s4, s5 = st.columns(5)
+    with s1:
+        roomservice = st.number_input('RoomService', min_value=0.0, value=0.0, step=10.0)
+    with s2:
+        foodcourt = st.number_input('FoodCourt', min_value=0.0, value=0.0, step=10.0)
+    with s3:
+        shoppingmall = st.number_input('ShoppingMall', min_value=0.0, value=0.0, step=10.0)
+    with s4:
+        spa = st.number_input('Spa', min_value=0.0, value=0.0, step=10.0)
+    with s5:
+        vrdeck = st.number_input('VRDeck', min_value=0.0, value=0.0, step=10.0)
+    if st.button('Predict', type='primary'):
+        row = {
+            'HomePlanet': homeplanet,
+            'Destination': destination,
+            'Deck': deck,
+            'Side': side,
+            'CryoSleep': cryosleep,
+            'VIP': vip,
+            'Age': age,
+            'RoomService': roomservice,
+            'FoodCourt': foodcourt,
+            'ShoppingMall': shoppingmall,
+            'Spa': spa,
+            'VRDeck': vrdeck,
+            'GroupSize': groupsize
+        }
+        df_feat = build_features_from_row(row)
+        X = one_hot_and_align(df_feat, feature_columns)
+        pred = model.predict(X)[0]
+        proba = None
+        if hasattr(model, 'predict_proba'):
+            proba = model.predict_proba(X)[0, 1]
+        st.success(f'Prediction: Transported = {bool(pred)}')
+        if proba is not None:
+            st.write(f'Probability (Transported=True): {proba:.3f}')
+        st.caption('Note: This prediction is based on the trained ML model and engineered features.')
+with tab2:
+    st.subheader('Batch prediction from CSV')
+    st.write(
+        'Upload a CSV with columns like: HomePlanet, Destination, Deck, Side, CryoSleep, VIP, Age, '
+        'RoomService, FoodCourt, ShoppingMall, Spa, VRDeck, GroupSize. '
+        'If PassengerId exists, it will be kept in the output.'
+    )
+    uploaded = st.file_uploader('Upload CSV', type=['csv'])
+    if uploaded is not None:
+        df_in = pd.read_csv(uploaded)
+        # Keep PassengerId if provided
+        passenger_ids = None
+        if 'PassengerId' in df_in.columns:
+            passenger_ids = df_in['PassengerId'].copy()
+        # Prepare features
+        # Ensure required columns exist
+        for c in ['HomePlanet', 'Destination', 'Deck', 'Side', 'CryoSleep', 'VIP', 'Age', 'GroupSize']:
+            if c not in df_in.columns:
+                df_in[c] = 'Unknown' if c in ['HomePlanet', 'Destination', 'Deck', 'Side', 'CryoSleep', 'VIP'] else np.nan
+        for c in SPEND_COLS:
+            if c not in df_in.columns:
+                df_in[c] = 0.0
+        # Build engineered features for each row
+        df_feat = df_in.copy()
+        for c in ['HomePlanet', 'Destination', 'Deck', 'Side']:
+            df_feat[c] = df_feat[c].fillna('Unknown').astype(str)
+        for c in ['CryoSleep', 'VIP']:
+            df_feat[c] = df_feat[c].apply(map_bool_unknown).astype(int)
+        df_feat['Age'] = pd.to_numeric(df_feat['Age'], errors='coerce')
+        for c in SPEND_COLS:
+            df_feat[c] = pd.to_numeric(df_feat[c], errors='coerce').fillna(0.0)
+        df_feat['GroupSize'] = pd.to_numeric(df_feat['GroupSize'], errors='coerce').fillna(1).astype(int)
+        df_feat['GroupSize'] = df_feat['GroupSize'].clip(lower=1)
+        df_feat['TotalSpend'] = df_feat[SPEND_COLS].sum(axis=1)
+        df_feat['NotSpend'] = (df_feat['TotalSpend'] == 0).astype(int)
+        df_feat['IsAlone'] = (df_feat['GroupSize'] == 1).astype(int)
+        # Impute Age with median from uploaded file
+        age_med = df_feat['Age'].median()
+        df_feat['Age'] = df_feat['Age'].fillna(age_med if pd.notna(age_med) else 0)
+        Xb = one_hot_and_align(df_feat, feature_columns)
+        preds = model.predict(Xb).astype(bool)
+        out = pd.DataFrame({'Transported': preds})
+        if passenger_ids is not None:
+            out.insert(0, 'PassengerId', passenger_ids)
+        st.write('Preview:')
+        st.dataframe(out.head(20), use_container_width=True)
+        csv_bytes = out.to_csv(index=False).encode('utf-8')
+        st.download_button(
+            label='Download predictions CSV',
+            data=csv_bytes,
+            file_name='predictions.csv',
+            mime='text/csv'
+        )
+st.caption('Built with Streamlit • Model: Gradient Boosting Classifier')