enacimie committed on
Commit
94bd0c4
·
verified ·
1 Parent(s): 0e7d7d5

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +287 -38
src/streamlit_app.py CHANGED
@@ -1,40 +1,289 @@
1
- import altair as alt
2
- import numpy as np
3
- import pandas as pd
4
  import streamlit as st
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
- """
7
- # Welcome to Streamlit!
8
-
9
- Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
10
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
11
- forums](https://discuss.streamlit.io).
12
-
13
- In the meantime, below is an example of what you can do with just a few lines of code:
14
- """
15
-
16
- num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
17
- num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
18
-
19
- indices = np.linspace(0, 1, num_points)
20
- theta = 2 * np.pi * num_turns * indices
21
- radius = indices
22
-
23
- x = radius * np.cos(theta)
24
- y = radius * np.sin(theta)
25
-
26
- df = pd.DataFrame({
27
- "x": x,
28
- "y": y,
29
- "idx": indices,
30
- "rand": np.random.randn(num_points),
31
- })
32
-
33
- st.altair_chart(alt.Chart(df, height=700, width=700)
34
- .mark_point(filled=True)
35
- .encode(
36
- x=alt.X("x", axis=None),
37
- y=alt.Y("y", axis=None),
38
- color=alt.Color("idx", legend=None, scale=alt.Scale()),
39
- size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
40
- ))
 
 
 
 
1
  import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ from sklearn.model_selection import train_test_split
5
+ from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
6
+ from sklearn.linear_model import LogisticRegression, LinearRegression
7
+ from sklearn.metrics import (
8
+ accuracy_score, precision_score, recall_score, f1_score,
9
+ mean_absolute_error, mean_squared_error, r2_score,
10
+ classification_report, confusion_matrix
11
+ )
12
+ from sklearn.preprocessing import LabelEncoder
13
+ import plotly.express as px
14
+ import plotly.graph_objects as go
15
+ import seaborn as sns
16
+ import matplotlib.pyplot as plt
17
+ import io
18
+
19
# Project metadata; reused by the header line below and by the page footer.
AUTHOR = "Eduardo Nacimiento García"
EMAIL = "enacimie@ull.edu.es"
LICENSE = "Apache 2.0"

# Page configuration: wide layout, sidebar expanded on first load.
st.set_page_config(
    layout="wide",
    initial_sidebar_state="expanded",
    page_title="SimpleML",
    page_icon="🤖",
)

# Header: title, attribution line and a short description of the app.
st.title("🤖 SimpleML")
_byline_parts = [
    f"**Author:** {AUTHOR}",
    f"**Email:** {EMAIL}",
    f"**License:** {LICENSE}",
]
st.markdown(" | ".join(_byline_parts))
st.write("""
Upload a CSV or use the demo dataset to train a machine learning model (classification or regression) in seconds.
""")
38
+
39
+ # === GENERATE DEMO DATASET ===
40
+ @st.cache_data
41
def create_demo_data(task="classification"):
    """Build a reproducible 500-row synthetic dataset for the demo.

    The frame always contains the feature columns ``Age``, ``Income``,
    ``Experience``, ``Education_Level`` and ``City``. One target column is
    appended as the last column, depending on ``task``.

    Args:
        task: ``"classification"`` adds a binary ``Purchase`` column (0/1);
            ``"regression"`` adds a continuous ``Salary`` column.

    Returns:
        A ``pandas.DataFrame`` with the five features plus the target.

    Raises:
        ValueError: If ``task`` is neither ``"classification"`` nor
            ``"regression"`` (previously this silently returned ``None``).
    """
    if task not in ("classification", "regression"):
        raise ValueError(
            f"Unknown task {task!r}; expected 'classification' or 'regression'."
        )

    # A private RandomState seeded with 42 produces exactly the same stream as
    # np.random.seed(42) followed by np.random.* calls, but it does not reset
    # the process-wide RNG as a side effect of loading the demo.
    rng = np.random.RandomState(42)
    n = 500

    # The draw order below determines the generated values; keep it stable.
    df = pd.DataFrame({
        "Age": rng.normal(35, 12, n).astype(int),
        "Income": rng.normal(45000, 15000, n),
        "Experience": rng.randint(0, 20, n),
        "Education_Level": rng.choice(["High School", "Bachelor", "Master", "PhD"], n),
        "City": rng.choice(["Madrid", "Barcelona", "Valencia", "Seville"], n),
    })

    if task == "classification":
        # Binary target: base purchase probability raised by high income,
        # long experience and postgraduate education.
        purchase_prob = (
            0.3
            + (df["Income"] > df["Income"].median()) * 0.4
            + (df["Experience"] > 10) * 0.2
            + (df["Education_Level"] == "Master") * 0.1
            + (df["Education_Level"] == "PhD") * 0.15
        )
        # The terms can add up to 1.05, so clip to a valid probability.
        df["Purchase"] = rng.binomial(1, np.clip(purchase_prob, 0, 1), n)
    else:
        # Continuous target: linear in the features plus Gaussian noise.
        df["Salary"] = (
            25000
            + df["Experience"] * 1500
            + (df["Income"] / 100)
            + (df["Age"] * 100)
            + (df["Education_Level"] == "Master") * 8000
            + (df["Education_Level"] == "PhD") * 15000
            + rng.normal(0, 5000, n)
        )

    return df
77
+
78
# === LOAD DATA ===
# The active dataset and its origin are kept in st.session_state so that they
# are still available when the script runs again after a widget interaction.
if "demo_loaded" not in st.session_state:
    st.session_state.demo_loaded = False
    st.session_state.task_type = "classification"

if st.button("🧪 Load Classification Demo Dataset"):
    st.session_state.demo_loaded = True
    st.session_state.task_type = "classification"
    st.session_state.df = create_demo_data("classification")
    st.success("✅ Classification demo loaded!")

if st.button("🧪 Load Regression Demo Dataset"):
    st.session_state.demo_loaded = True
    st.session_state.task_type = "regression"
    st.session_state.df = create_demo_data("regression")
    st.success("✅ Regression demo loaded!")

uploaded_file = st.file_uploader("📂 Upload your CSV file", type=["csv"])

# Use demo or uploaded file. An uploaded file takes precedence over a
# previously loaded demo dataset.
if uploaded_file:
    try:
        df = pd.read_csv(uploaded_file)
    except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as exc:
        # Report unreadable uploads in the UI instead of crashing the app.
        st.error(f"❌ Could not read the CSV file: {exc}")
        st.stop()
    if df.empty:
        st.error("❌ The uploaded CSV file contains no rows.")
        st.stop()
    st.session_state.df = df
    st.session_state.demo_loaded = False
    st.success("✅ File uploaded successfully.")
elif "df" in st.session_state:
    df = st.session_state.df
    task_type = st.session_state.task_type
    if st.session_state.demo_loaded:
        st.info(f"Using **{task_type}** demo dataset.")
else:
    # Nothing to work with yet: show the hint and halt this script run.
    st.info("👆 Upload a CSV or load a demo dataset to begin.")
    st.stop()
112
+
113
# Show data preview
with st.expander("🔍 Data Preview (first 10 rows)"):
    st.dataframe(df.head(10))

# === TARGET & FEATURE SELECTION ===
st.subheader("🎯 Select Target Variable")

# At least one target and one feature column are required.
if len(df.columns) < 2:
    st.error("❌ The dataset needs at least two columns (one target and one feature).")
    st.stop()

# Default to the last column: the demo datasets append their target
# ("Purchase" / "Salary") last, so the first column is never the demo target.
target_col = st.selectbox(
    "Target column (y):",
    df.columns,
    index=len(df.columns) - 1,
)

# Auto-detect the task from the selected target column. This is done for demo
# data as well, so that picking a different target column in a demo dataset
# does not keep the task the demo was loaded with.
#  - non-numeric targets can only be classified;
#  - numeric targets with few distinct values are treated as class labels;
#  - everything else is regression.
target_values = df[target_col]
if not pd.api.types.is_numeric_dtype(target_values) or target_values.nunique() <= 10:
    task_type = "classification"
else:
    task_type = "regression"

st.write(f"**Detected task:** `{task_type}`")

# Select features: every column except the target is offered and pre-selected.
feature_cols = [col for col in df.columns if col != target_col]
selected_features = st.multiselect(
    "Select features (X):",
    feature_cols,
    default=feature_cols,
)

if not selected_features:
    st.warning("⚠️ Please select at least one feature.")
    st.stop()
145
+
146
# Prepare data
# Rows without a target value cannot be used for training or evaluation.
has_target = df[target_col].notna()
X = df.loc[has_target, selected_features].copy()
y = df.loc[has_target, target_col].copy()

if len(X) < 2:
    st.error("❌ Not enough rows with a target value to create a train/test split.")
    st.stop()

# Make every feature numeric. Non-numeric columns are label-encoded; the fitted
# encoders are kept in le_dict so the prediction form can encode user input
# the same way.
le_dict = {}
for col in X.columns:
    if pd.api.types.is_numeric_dtype(X[col]):
        # The linear models cannot fit missing values, so impute them with
        # the column median (0 if the whole column is missing).
        if X[col].isna().any():
            fill_value = X[col].median()
            X[col] = X[col].fillna(0 if pd.isna(fill_value) else fill_value)
        continue
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))
    le_dict[col] = le

# Encode any non-numeric classification target (object, category, string);
# class_names maps the encoded integers back to the original labels.
if task_type == "classification" and not pd.api.types.is_numeric_dtype(y):
    le_target = LabelEncoder()
    y = le_target.fit_transform(y.astype(str))
    class_names = le_target.classes_
else:
    class_names = None

# Train/test split: 80/20 with a fixed seed for reproducible metrics.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
166
+
167
# === MODEL SELECTION ===
st.subheader("⚙️ Choose Model")

# Map each menu label to a factory for the matching estimator. Only the model
# the user picks is instantiated.
if task_type == "classification":
    model_factories = {
        "Random Forest Classifier": lambda: RandomForestClassifier(n_estimators=100, random_state=42),
        "Logistic Regression": lambda: LogisticRegression(max_iter=1000, random_state=42),
    }
else:
    model_factories = {
        "Random Forest Regressor": lambda: RandomForestRegressor(n_estimators=100, random_state=42),
        "Linear Regression": lambda: LinearRegression(),
    }

model_choice = st.selectbox("Model:", list(model_factories))
model = model_factories[model_choice]()

# Fit on the training split and predict the held-out split for evaluation.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
186
+
187
# === RESULTS ===
st.header("📈 Results")

if task_type == "classification":
    # Fix the label set explicitly. The 20% test split may not contain every
    # class, and without `labels` the report and the confusion matrix would
    # then have fewer rows than there are class names.
    if class_names is not None:
        labels = np.arange(len(class_names))
        axis_names = [str(name) for name in class_names]
    else:
        labels = np.union1d(np.asarray(y_test), np.asarray(y_pred))
        # Use the actual label values, not their positions.
        axis_names = [f"Class {label}" for label in labels]

    # Metrics (weighted by class support). zero_division=0 scores classes that
    # are never predicted as 0 without emitting a warning.
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    rec = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

    st.subheader("📊 Classification Metrics")
    col1, col2, col3, col4 = st.columns(4)
    col1.metric("Accuracy", f"{acc:.3f}")
    col2.metric("Precision", f"{prec:.3f}")
    col3.metric("Recall", f"{rec:.3f}")
    col4.metric("F1-Score", f"{f1:.3f}")

    # Classification report
    with st.expander("📋 Detailed Classification Report"):
        if class_names is not None:
            report = classification_report(
                y_test, y_pred,
                labels=labels,
                target_names=class_names,
                output_dict=True,
                zero_division=0,
            )
        else:
            report = classification_report(
                y_test, y_pred,
                labels=labels,
                output_dict=True,
                zero_division=0,
            )
        st.dataframe(pd.DataFrame(report).transpose())

    # Confusion Matrix (rows: actual, columns: predicted)
    st.subheader("🧩 Confusion Matrix")
    cm = confusion_matrix(y_test, y_pred, labels=labels)
    fig = px.imshow(
        cm,
        text_auto=True,
        labels=dict(x="Predicted", y="Actual"),
        x=axis_names,
        y=axis_names,
        title="Confusion Matrix"
    )
    st.plotly_chart(fig, use_container_width=True)

else:  # regression
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    st.subheader("📊 Regression Metrics")
    col1, col2, col3, col4 = st.columns(4)
    col1.metric("MAE", f"{mae:.2f}")
    col2.metric("MSE", f"{mse:.2f}")
    col3.metric("RMSE", f"{rmse:.2f}")
    col4.metric("R²", f"{r2:.3f}")

    # Prediction vs Actual plot; the dashed diagonal marks a perfect fit.
    st.subheader("📉 Predicted vs Actual")
    fig = px.scatter(x=y_test, y=y_pred, labels={'x': 'Actual', 'y': 'Predicted'}, title="Predicted vs Actual Values")
    fig.add_trace(go.Scatter(x=[y_test.min(), y_test.max()], y=[y_test.min(), y_test.max()],
                             mode='lines', name='Ideal Fit', line=dict(dash='dash', color='red')))
    st.plotly_chart(fig, use_container_width=True)

# Feature Importance (for tree-based models)
if "Forest" in model_choice:
    st.subheader("🔑 Feature Importance")
    importance = model.feature_importances_
    feat_imp_df = pd.DataFrame({
        'Feature': selected_features,
        'Importance': importance
    }).sort_values('Importance', ascending=False)

    fig = px.bar(feat_imp_df, x='Importance', y='Feature', orientation='h', title="Feature Importance")
    st.plotly_chart(fig, use_container_width=True)

    with st.expander("📋 Feature Importance Table"):
        st.dataframe(feat_imp_df)
259
+
260
# === PREDICTION DEMO ===
st.header("🔮 Make a Prediction")

st.write("Enter values below to predict:")

# Build one input widget per selected feature and collect the values in the
# numeric form the model was trained on.
input_data = {}
for feature in selected_features:
    widget_key = f"pred_{feature}"
    if feature in le_dict:
        # Categorical: offer the original values and encode the choice with
        # the encoder fitted for this column.
        original_values = df[feature].dropna().unique()
        choice = st.selectbox(f"{feature}:", original_values, key=widget_key)
        input_data[feature] = le_dict[feature].transform([str(choice)])[0]
        continue

    # Numeric: default to the column median (0 if the column has no values,
    # because int()/float() of NaN is not a usable default).
    default_value = df[feature].median()
    if pd.isna(default_value):
        default_value = 0
    if pd.api.types.is_integer_dtype(df[feature]):
        val = st.number_input(f"{feature}:", value=int(default_value), step=1, key=widget_key)
    else:
        val = st.number_input(f"{feature}:", value=float(default_value), step=0.1, key=widget_key)
    input_data[feature] = val

if st.button("🚀 Predict"):
    # Pin the column order to the order the model was trained with.
    input_df = pd.DataFrame([input_data], columns=selected_features)
    prediction = model.predict(input_df)[0]
    if task_type == "classification" and class_names is not None:
        # Map the encoded class index back to its original label.
        prediction = class_names[prediction]
    st.success(f"**Prediction:** `{prediction}`")

# Footer
st.markdown("---")
st.caption(f"© {AUTHOR} | License {LICENSE} | Contact: {EMAIL}")