EnYa32 committed on
Commit
abc3298
·
verified ·
1 Parent(s): d8564b1

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +256 -36
src/streamlit_app.py CHANGED
@@ -1,40 +1,260 @@
1
- import altair as alt
2
  import numpy as np
3
  import pandas as pd
4
  import streamlit as st
 
 
5
 
6
- """
7
- # Welcome to Streamlit!
8
-
9
- Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
10
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
11
- forums](https://discuss.streamlit.io).
12
-
13
- In the meantime, below is an example of what you can do with just a few lines of code:
14
- """
15
-
16
- num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
17
- num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
18
-
19
- indices = np.linspace(0, 1, num_points)
20
- theta = 2 * np.pi * num_turns * indices
21
- radius = indices
22
-
23
- x = radius * np.cos(theta)
24
- y = radius * np.sin(theta)
25
-
26
- df = pd.DataFrame({
27
- "x": x,
28
- "y": y,
29
- "idx": indices,
30
- "rand": np.random.randn(num_points),
31
- })
32
-
33
- st.altair_chart(alt.Chart(df, height=700, width=700)
34
- .mark_point(filled=True)
35
- .encode(
36
- x=alt.X("x", axis=None),
37
- y=alt.Y("y", axis=None),
38
- color=alt.Color("idx", legend=None, scale=alt.Scale()),
39
- size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
40
- ))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import numpy as np
2
  import pandas as pd
3
  import streamlit as st
4
+ import joblib
5
+ from pathlib import Path
6
 
7
+ st.set_page_config(page_title='Spaceship Titanic - Transported Predictor', page_icon='🚀', layout='wide')
8
+
9
+ BASE_DIR = Path(__file__).resolve().parent
10
+ MODEL_PATH = BASE_DIR / 'spaceship_titanic_gb.pkl'
11
+
12
+ HOMEPLANET_OPTIONS = ['Earth', 'Europa', 'Mars', 'Unknown']
13
+ DESTINATION_OPTIONS = ['TRAPPIST-1e', '55 Cancri e', 'PSO J318.5-22', 'Unknown']
14
+ DECK_OPTIONS = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'T', 'Unknown']
15
+ SIDE_OPTIONS = ['P', 'S', 'Unknown']
16
+ BOOL_UNKNOWN_OPTIONS = ['True', 'False', 'Unknown']
17
+
18
+ SPEND_COLS = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
19
+
20
+
21
@st.cache_resource
def load_artifact():
    """Load the trained model artifact from disk (cached across reruns).

    Returns:
        (model, feature_columns): ``feature_columns`` is the list of
        one-hot-encoded column names saved alongside the model at training
        time, or ``None`` if only the bare estimator was pickled.

    Raises:
        FileNotFoundError: if the pickle is not found next to this script.
    """
    if not MODEL_PATH.exists():
        # BUGFIX: the old message told users to upload the file to the repo
        # root "same folder as app.py", but MODEL_PATH resolves next to THIS
        # script (src/streamlit_app.py) — point them at the real location.
        raise FileNotFoundError(
            f'File not found: {MODEL_PATH.name}. Please upload it to '
            f'{MODEL_PATH.parent} (the same folder as this Streamlit script).'
        )
    artifact = joblib.load(MODEL_PATH)

    # Expected format: {'model': ..., 'feature_columns': [...]}
    if isinstance(artifact, dict) and 'model' in artifact and 'feature_columns' in artifact:
        return artifact['model'], artifact['feature_columns']

    # Fallback: the user saved only the bare model object.
    return artifact, None
35
+
36
+
37
def map_bool_unknown(val):
    """Encode a boolean-ish value numerically: True -> 1, False -> 0,
    anything unrecognised (including 'Unknown'/NaN/None) -> -1."""
    encoding = {True: 1, 'True': 1, False: 0, 'False': 0, 'Unknown': -1}
    return encoding.get(val, -1)
41
+
42
+
43
def build_features_from_row(row_dict):
    """Turn one raw input row (a dict) into the engineered feature frame.

    Mirrors the training notebook's preprocessing: categorical 'Unknown'
    fill, boolean -> {1, 0, -1} encoding, numeric coercion, and the
    TotalSpend / NotSpend / IsAlone derived features.

    Args:
        row_dict: mapping of raw column names to user-supplied values.

    Returns:
        A one-row pandas DataFrame of engineered (not yet one-hot) features.
    """
    df = pd.DataFrame([row_dict])

    # Categorical columns: default to 'Unknown', force string dtype.
    for c in ['HomePlanet', 'Destination', 'Deck', 'Side']:
        if c not in df.columns:
            df[c] = 'Unknown'
        df[c] = df[c].fillna('Unknown').astype(str)

    # Boolean-like columns with Unknown -> -1 (True -> 1, False -> 0).
    for c in ['CryoSleep', 'VIP']:
        if c not in df.columns:
            df[c] = 'Unknown'
        df[c] = df[c].apply(map_bool_unknown).astype(int)

    # Numeric columns.
    if 'Age' not in df.columns:
        df['Age'] = np.nan

    for c in SPEND_COLS:
        if c not in df.columns:
            df[c] = 0.0
        # BUGFIX: the old code coerced spends but never filled the resulting
        # NaNs, despite the stated intent to fill them with 0 — now fill
        # explicitly, matching the batch-CSV path.
        df[c] = pd.to_numeric(df[c], errors='coerce').fillna(0.0)

    df['Age'] = pd.to_numeric(df['Age'], errors='coerce')

    # Group features: default to a group of 1, never below 1.
    if 'GroupSize' not in df.columns:
        df['GroupSize'] = 1
    df['GroupSize'] = pd.to_numeric(df['GroupSize'], errors='coerce').fillna(1).astype(int)
    df['GroupSize'] = df['GroupSize'].clip(lower=1)

    # Feature engineering (same logic as the training notebook).
    df['TotalSpend'] = df[SPEND_COLS].sum(axis=1)
    df['NotSpend'] = (df['TotalSpend'] == 0).astype(int)
    df['IsAlone'] = (df['GroupSize'] == 1).astype(int)

    # For a single row the "median" is just the value itself when present;
    # fall back to 0 if Age is missing entirely.
    df['Age'] = df['Age'].fillna(df['Age'].median() if df['Age'].notna().any() else 0)

    return df
85
+
86
+
87
def one_hot_and_align(df_features, feature_columns):
    """One-hot encode the features and align them to the training layout.

    Missing training columns are added (filled with 0), columns unseen at
    training time are dropped, and the result is reordered to match
    ``feature_columns`` exactly. If no column list was stored with the
    model, the encoded frame is returned as-is (may mismatch the model).
    """
    encoded = pd.get_dummies(df_features, drop_first=True)

    if feature_columns is None:
        # No stored layout: hope the encoded columns already line up.
        return encoded

    # Fill in any training-time columns absent from this input.
    for col in feature_columns:
        if col not in encoded.columns:
            encoded[col] = 0

    # Discard columns the model never saw.
    surplus = [col for col in encoded.columns if col not in feature_columns]
    if surplus:
        encoded = encoded.drop(columns=surplus)

    # Final reorder to the exact training column order.
    return encoded[feature_columns]
108
+
109
+
110
+ st.title('🚀 Spaceship Titanic - Transported Predictor')
111
+
112
+ with st.expander('What does this app do?', expanded=True):
113
+ st.write(
114
+ 'This app predicts whether a passenger was transported to another dimension (Transported=True/False) '
115
+ 'based on passenger features. It uses a Gradient Boosting Classifier trained on the Spaceship Titanic dataset.'
116
+ )
117
+
118
+ try:
119
+ model, feature_columns = load_artifact()
120
+ except Exception as e:
121
+ st.error(str(e))
122
+ st.stop()
123
+
124
+ tab1, tab2 = st.tabs(['Single Prediction', 'Batch CSV Prediction'])
125
+
126
with tab1:
    st.subheader('Single passenger prediction')

    # Three columns of passenger attributes.
    col_left, col_mid, col_right = st.columns(3)

    with col_left:
        homeplanet = st.selectbox('HomePlanet', HOMEPLANET_OPTIONS, index=0)
        destination = st.selectbox('Destination', DESTINATION_OPTIONS, index=0)
        age = st.number_input('Age', min_value=0.0, max_value=100.0, value=30.0, step=1.0)

    with col_mid:
        deck = st.selectbox('Deck', DECK_OPTIONS, index=DECK_OPTIONS.index('Unknown'))
        side = st.selectbox('Side', SIDE_OPTIONS, index=SIDE_OPTIONS.index('Unknown'))
        cryosleep = st.selectbox('CryoSleep', BOOL_UNKNOWN_OPTIONS, index=BOOL_UNKNOWN_OPTIONS.index('Unknown'))
        vip = st.selectbox('VIP', BOOL_UNKNOWN_OPTIONS, index=BOOL_UNKNOWN_OPTIONS.index('Unknown'))

    with col_right:
        groupsize = st.number_input('GroupSize', min_value=1, max_value=20, value=1, step=1)

    st.markdown('### Spending')
    # One numeric input per amenity, laid out across five columns; the
    # widget labels are exactly the SPEND_COLS feature names.
    spend_values = {}
    for spend_col_widget, feat_name in zip(st.columns(5), SPEND_COLS):
        with spend_col_widget:
            spend_values[feat_name] = st.number_input(feat_name, min_value=0.0, value=0.0, step=10.0)

    if st.button('Predict', type='primary'):
        # Assemble the raw row; key order matches the training notebook's
        # column order (spends before GroupSize).
        row = {
            'HomePlanet': homeplanet,
            'Destination': destination,
            'Deck': deck,
            'Side': side,
            'CryoSleep': cryosleep,
            'VIP': vip,
            'Age': age,
            **spend_values,
            'GroupSize': groupsize,
        }

        df_feat = build_features_from_row(row)
        X = one_hot_and_align(df_feat, feature_columns)

        pred = model.predict(X)[0]
        proba = model.predict_proba(X)[0, 1] if hasattr(model, 'predict_proba') else None

        st.success(f'Prediction: Transported = {bool(pred)}')
        if proba is not None:
            st.write(f'Probability (Transported=True): {proba:.3f}')

    st.caption('Note: This prediction is based on the trained ML model and engineered features.')
188
+
189
with tab2:
    st.subheader('Batch prediction from CSV')

    st.write(
        'Upload a CSV with columns like: HomePlanet, Destination, Deck, Side, CryoSleep, VIP, Age, '
        'RoomService, FoodCourt, ShoppingMall, Spa, VRDeck, GroupSize. '
        'If PassengerId exists, it will be kept in the output.'
    )

    uploaded = st.file_uploader('Upload CSV', type=['csv'])
    if uploaded is not None:
        df_in = pd.read_csv(uploaded)

        # Keep PassengerId (if provided) for the output file.
        passenger_ids = None
        if 'PassengerId' in df_in.columns:
            passenger_ids = df_in['PassengerId'].copy()

        # Ensure every required raw column exists, with sensible defaults.
        cat_cols = ['HomePlanet', 'Destination', 'Deck', 'Side']
        bool_cols = ['CryoSleep', 'VIP']
        for c in cat_cols + bool_cols + ['Age', 'GroupSize']:
            if c not in df_in.columns:
                df_in[c] = 'Unknown' if c in cat_cols + bool_cols else np.nan

        for c in SPEND_COLS:
            if c not in df_in.columns:
                df_in[c] = 0.0

        # BUGFIX: keep only the raw feature columns. Previously every stray
        # CSV column (PassengerId, Name, Cabin, ...) survived into
        # pd.get_dummies, potentially creating thousands of useless dummy
        # columns — and feeding them to the model when no feature list was
        # stored with the artifact.
        raw_cols = cat_cols + bool_cols + ['Age'] + SPEND_COLS + ['GroupSize']
        df_feat = df_in[raw_cols].copy()

        # Categorical cleanup mirroring build_features_from_row.
        for c in cat_cols:
            df_feat[c] = df_feat[c].fillna('Unknown').astype(str)

        # Boolean-like with Unknown/NaN -> -1.
        for c in bool_cols:
            df_feat[c] = df_feat[c].apply(map_bool_unknown).astype(int)

        # Numeric coercion; unparseable spends become 0.
        df_feat['Age'] = pd.to_numeric(df_feat['Age'], errors='coerce')
        for c in SPEND_COLS:
            df_feat[c] = pd.to_numeric(df_feat[c], errors='coerce').fillna(0.0)

        df_feat['GroupSize'] = pd.to_numeric(df_feat['GroupSize'], errors='coerce').fillna(1).astype(int)
        df_feat['GroupSize'] = df_feat['GroupSize'].clip(lower=1)

        # Feature engineering (same logic as the training notebook).
        df_feat['TotalSpend'] = df_feat[SPEND_COLS].sum(axis=1)
        df_feat['NotSpend'] = (df_feat['TotalSpend'] == 0).astype(int)
        df_feat['IsAlone'] = (df_feat['GroupSize'] == 1).astype(int)

        # Impute Age with the median of the uploaded file (0 if all missing).
        age_med = df_feat['Age'].median()
        df_feat['Age'] = df_feat['Age'].fillna(age_med if pd.notna(age_med) else 0)

        Xb = one_hot_and_align(df_feat, feature_columns)

        preds = model.predict(Xb).astype(bool)

        out = pd.DataFrame({'Transported': preds})
        if passenger_ids is not None:
            out.insert(0, 'PassengerId', passenger_ids)

        st.write('Preview:')
        st.dataframe(out.head(20), use_container_width=True)

        csv_bytes = out.to_csv(index=False).encode('utf-8')
        st.download_button(
            label='Download predictions CSV',
            data=csv_bytes,
            file_name='predictions.csv',
            mime='text/csv'
        )

st.caption('Built with Streamlit • Model: Gradient Boosting Classifier')