import streamlit as st
import io
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    mean_absolute_error,
    mean_squared_error,
    r2_score,
)


@st.cache_resource(show_spinner=False)
def get_llm():
    """Cached LLM initialization to prevent reloading on every rerun."""
    import os

    from langchain_google_genai import ChatGoogleGenerativeAI
    from langchain_groq import ChatGroq

    # Try Groq first, then fall back to Gemini; return None if neither is available.
    try:
        return ChatGroq(
            model="gemma2-9b-it",
            groq_api_key=os.getenv("GROQ_API_KEY"),
        )
    except Exception:
        try:
            return ChatGoogleGenerativeAI(
                model="gemini-2.0-flash-lite-preview-02-05",
                google_api_key=os.getenv("GEMINI_API_KEY"),
            )
        except Exception:
            return None


llm_insights = get_llm()


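# Usage note (an assumption, not from the original comments): get_llm() reads
# GROQ_API_KEY / GEMINI_API_KEY from the environment, so one of them should be set
# before launching the app, e.g.
#
#   export GROQ_API_KEY="..."      # or GEMINI_API_KEY="..."
#   streamlit run app.py           # "app.py" is a hypothetical entry point
#
# If neither key is configured, llm_insights stays None and the static fallback
# explanation in _get_insights_classification() is shown instead.

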
@st.cache_data(show_spinner=False)
def _compute_classification_metrics(y_test, y_pred):
    """Cached metric computation for classification."""
    return {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred, average="weighted", zero_division=0),
        'recall': recall_score(y_test, y_pred, average="weighted", zero_division=0),
        'f1': f1_score(y_test, y_pred, average="weighted", zero_division=0),
        'cm': confusion_matrix(y_test, y_pred),
    }


@st.cache_data
def _compute_regression_metrics(y_test, y_pred):
    """Cached metric computation for regression."""
    mse = mean_squared_error(y_test, y_pred)
    return {
        'mae': mean_absolute_error(y_test, y_pred),
        'mse': mse,
        'rmse': np.sqrt(mse),
        'r2': r2_score(y_test, y_pred),
    }


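# Worked example (hypothetical inputs, for reference only): with y_test = [3.0, 5.0]
# and y_pred = [2.5, 5.5], _compute_regression_metrics would return roughly
# {'mae': 0.5, 'mse': 0.25, 'rmse': 0.5, 'r2': 0.75}.

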
@st.cache_data(show_spinner=False)
def _plot_confusion_matrix(cm, classes):
    """Cached confusion matrix plotting; returns the figure as PNG bytes."""
    fig, ax = plt.subplots(figsize=(2, 2), dpi=200)
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=classes,
        yticklabels=classes,
        annot_kws={"size": 8},
        ax=ax,
    )
    ax.tick_params(axis="both", labelsize=5)

    buf = io.BytesIO()
    fig.savefig(buf, format="png", bbox_inches="tight", dpi=200)
    plt.close(fig)  # Release the figure so repeated Streamlit reruns don't leak memory
    buf.seek(0)
    # Return raw bytes rather than the BytesIO object: bytes are picklable, which
    # st.cache_data requires for its return value, and st.image accepts them directly.
    return buf.getvalue()


@st.cache_data(show_spinner=False)
def _get_insights_classification(accuracy, precision, recall, f1, cm_shape):
    """Cached insights generation based on metrics."""
    if llm_insights is None:
        # Static fallback when no LLM is configured.
        return (
            f"### Classification Metrics Explained\n\n"
            f"- **Accuracy** ({accuracy:.3f}): Correct predictions ratio\n"
            f"- **Precision** ({precision:.3f}): Positive prediction accuracy\n"
            f"- **Recall** ({recall:.3f}): Actual positives found\n"
            f"- **F1 Score** ({f1:.3f}): Precision-recall balance\n"
            f"- **Confusion Matrix** ({cm_shape[0]}x{cm_shape[1]}): Prediction vs actual distribution"
        )

    try:
        response = llm_insights.invoke(f"""
            Briefly explain these classification metrics (accuracy={accuracy:.3f},
            precision={precision:.3f}, recall={recall:.3f}, f1={f1:.3f})
            and the {cm_shape[0]}x{cm_shape[1]} confusion matrix.
            Use markdown bullet points.
            """)
        return response.content.strip()
    except Exception:
        return "Could not generate AI insights - showing basic metrics explanation."


def display_test_results(trained_model, X_test, y_test, task_type, label_encoder=None):
    """
    Displays test results, including metrics, confusion matrix (if classification),
    and LLM-based or fallback insights about the metrics.
    """
    st.markdown("## Test Results")
    loading_placeholder = st.empty()

    with loading_placeholder.container():
        st.info("⏳ Evaluating model performance on test data. This may take a moment for large datasets.")
        progress_bar = st.progress(0)

    if "test_results_calculated" not in st.session_state:
        st.session_state.test_results_calculated = False

    if not st.session_state.test_results_calculated:
        sampling_message = None
        MAX_SAMPLES = 5000

        with loading_placeholder.container():
            progress_bar.progress(10)

        if len(X_test) <= MAX_SAMPLES:
            # Small test set: evaluate on all of it.
            X_test_sample = X_test
            y_test_sample = y_test
            st.info("📊 Using all test data for evaluation...")
        else:
            sampling_message = (
                f"📊 Using {MAX_SAMPLES} samples from the test set for visualization "
                f"(out of {len(X_test)} total)"
            )
            st.info("📊 Sampling test data for evaluation...")

            # Draw a random subset; works for both pandas and NumPy inputs.
            idx = np.random.choice(len(X_test), size=MAX_SAMPLES, replace=False)
            X_test_sample = X_test.iloc[idx] if hasattr(X_test, 'iloc') else X_test[idx]
            y_test_sample = y_test.iloc[idx] if hasattr(y_test, 'iloc') else y_test[idx]

        with loading_placeholder.container():
            progress_bar.progress(30)
            st.info("📊 Generating predictions... Please wait")

        with st.spinner("Model working..."):
            if task_type == "regression":
                y_pred = trained_model.predict(X_test_sample)
            elif task_type == "classification":
                # `trained_model` may itself be a (pipeline, encoder) tuple; otherwise the
                # encoder (possibly None) is passed separately via `label_encoder`.
                pipeline, enc = trained_model if label_encoder is None else (trained_model, label_encoder)
                y_pred = pipeline.predict(X_test_sample)

                # Decode predictions and ground truth back to the original labels if encoded.
                if enc:
                    y_pred = enc.inverse_transform(y_pred)
                    y_test_decoded = enc.inverse_transform(y_test_sample)
                else:
                    y_test_decoded = y_test_sample

        with loading_placeholder.container():
            progress_bar.progress(60)
            st.info("📊 Computing metrics...")

        if task_type == "regression":
            metrics = _compute_regression_metrics(y_test_sample, y_pred)
        else:
            metrics = _compute_classification_metrics(y_test_decoded, y_pred)

        with loading_placeholder.container():
            progress_bar.progress(90)
            st.info("📊 Preparing visualizations...")

        if task_type == "classification":
            # Warm the caches now so the plot and insights below render instantly.
            _ = _plot_confusion_matrix(metrics['cm'], np.unique(y_test_decoded))
            _ = _get_insights_classification(
                metrics['accuracy'],
                metrics['precision'],
                metrics['recall'],
                metrics['f1'],
                metrics['cm'].shape,
            )

        with loading_placeholder.container():
            progress_bar.progress(100)
            st.success("✅ Test results ready!")

        st.session_state.test_results_calculated = True

        # Persist results so later reruns can redraw without re-evaluating the model.
        st.session_state.test_metrics = metrics
        if task_type == "classification":
            st.session_state.test_y_pred = y_pred
            st.session_state.test_y_test = y_test_decoded
        else:
            st.session_state.test_y_pred = y_pred
            st.session_state.test_y_test = y_test_sample

        st.session_state.sampling_message = sampling_message

        import time
        time.sleep(0.5)  # Brief pause so the success message is visible

    if "sampling_message" in st.session_state and st.session_state.sampling_message:
        st.info(st.session_state.sampling_message)

    if task_type == "regression":
        st.subheader("📊 Regression Metrics")

        if "test_metrics" in st.session_state and st.session_state.test_results_calculated:
            metrics = st.session_state.test_metrics
            y_pred = st.session_state.test_y_pred
            y_test = st.session_state.test_y_test

            mae, mse, rmse, r2 = metrics['mae'], metrics['mse'], metrics['rmse'], metrics['r2']

            col1, col2, col3, col4 = st.columns(4)
            col1.metric("📊 MAE", f"{mae:.4f}")
            col2.metric("📊 MSE", f"{mse:.4f}")
            col3.metric("📊 RMSE", f"{rmse:.4f}")
            col4.metric("📊 R² Score", f"{r2:.4f}")

            st.subheader("📊 Prediction vs Actual")
            df_results = pd.DataFrame({
                'Actual': y_test,
                'Predicted': y_pred,
            })
            fig = px.scatter(
                df_results, x='Actual', y='Predicted',
                title='Predicted vs Actual Values',
                labels={'Actual': 'Actual Values', 'Predicted': 'Predicted Values'},
            )
            # Dashed identity line: points on it are perfect predictions.
            fig.add_shape(
                type='line',
                x0=min(y_test), y0=min(y_test),
                x1=max(y_test), y1=max(y_test),
                line=dict(color='red', dash='dash'),
            )
            st.plotly_chart(fig, use_container_width=True)

    elif task_type == "classification":
        st.subheader("📊 Classification Metrics")

        if "test_metrics" in st.session_state and st.session_state.test_results_calculated:
            metrics = st.session_state.test_metrics
            y_pred = st.session_state.test_y_pred
            y_test_decoded = st.session_state.test_y_test

            accuracy, precision, recall, f1 = metrics['accuracy'], metrics['precision'], metrics['recall'], metrics['f1']

            col1, col2, col3, col4 = st.columns(4)
            col1.metric("✅ Accuracy", f"{accuracy:.4f}")
            col2.metric("🎯 Precision", f"{precision:.4f}")
            col3.metric("🔢 Recall", f"{recall:.4f}")
            col4.metric("🔥 F1 Score", f"{f1:.4f}")

            st.subheader("📊 Confusion Matrix")
            cm_png = _plot_confusion_matrix(metrics['cm'], np.unique(y_test_decoded))
            st.image(cm_png, width=450)

            st.markdown("---")
            st.markdown("#### Test Insights")
            classification_insights = _get_insights_classification(accuracy, precision, recall, f1, metrics['cm'].shape)
            st.markdown(classification_insights)
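

# Example usage (a minimal sketch, not part of the original module): the function is
# meant to be called from the main Streamlit app after training. All names below
# (`pipeline`, `regressor`, `X_test`, `y_test_encoded`, `label_enc`) are hypothetical
# placeholders supplied by the calling app.
#
#     # Classification: y values are label-encoded, so pass the encoder for decoding.
#     display_test_results(pipeline, X_test, y_test_encoded,
#                          task_type="classification", label_encoder=label_enc)
#
#     # Regression: predictions are used as-is, no encoder needed.
#     display_test_results(regressor, X_test, y_test, task_type="regression")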