"""ML model utilities — trains the same Decision Tree pipeline from Milestone 1.""" import os import numpy as np import pandas as pd import joblib from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, RobustScaler from sklearn.compose import ColumnTransformer from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold from sklearn.tree import DecisionTreeClassifier from imblearn.over_sampling import SMOTE ARTIFACTS_DIR = os.path.join(os.path.dirname(__file__), "artifacts") MODEL_PATH = os.path.join(ARTIFACTS_DIR, "model.joblib") PREPROCESSOR_PATH = os.path.join(ARTIFACTS_DIR, "preprocessor.joblib") COLUMNS_PATH = os.path.join(ARTIFACTS_DIR, "columns.joblib") ORDINAL_FEATURES = { "Maintenance_History": ["Poor", "Average", "Good"], "Tire_Condition": ["Worn Out", "Good", "New"], "Brake_Condition": ["Worn Out", "Good", "New"], "Battery_Status": ["Weak", "Good", "Strong"], } NOMINAL_FEATURES = ["Vehicle_Model", "Fuel_Type", "Transmission_Type", "Owner_Type"] NUMERICAL_FEATURES = [ "Mileage", "Reported_Issues", "Vehicle_Age", "Engine_Size", "Odometer_Reading", "Insurance_Premium", "Service_History", "Accident_History", "Fuel_Efficiency", "Last_Service_Date_days", "Warranty_Expiry_Date_days", ] VEHICLE_MODELS = ["Truck", "Van", "Bus", "SUV", "Sedan"] FUEL_TYPES = ["Electric", "Diesel", "Petrol", "Hybrid"] TRANSMISSION_TYPES = ["Automatic", "Manual"] OWNER_TYPES = ["First", "Second", "Third"] # --------------------------------------------------------------------------- # Synthetic data generation (mirrors the Kaggle dataset schema) # --------------------------------------------------------------------------- def generate_synthetic_data(n: int = 8000) -> pd.DataFrame: """Create a realistic synthetic dataset matching the vehicle maintenance schema.""" rng = np.random.RandomState(42) data = { "Vehicle_Model": rng.choice(VEHICLE_MODELS, n), "Mileage": rng.randint(5000, 150000, n), "Maintenance_History": rng.choice(["Poor", "Average", "Good"], n, p=[0.25, 0.40, 0.35]), "Reported_Issues": rng.randint(0, 6, n), "Vehicle_Age": rng.randint(1, 20, n), "Fuel_Type": rng.choice(FUEL_TYPES, n), "Transmission_Type": rng.choice(TRANSMISSION_TYPES, n), "Engine_Size": rng.choice([1000, 1500, 2000, 2500, 3000, 3500], n), "Odometer_Reading": rng.randint(5000, 250000, n), "Owner_Type": rng.choice(OWNER_TYPES, n, p=[0.5, 0.35, 0.15]), "Insurance_Premium": rng.randint(8000, 35000, n), "Service_History": rng.randint(0, 15, n), "Accident_History": rng.randint(0, 5, n), "Fuel_Efficiency": rng.uniform(8.0, 22.0, n), "Tire_Condition": rng.choice(["Worn Out", "Good", "New"], n, p=[0.25, 0.45, 0.30]), "Brake_Condition": rng.choice(["Worn Out", "Good", "New"], n, p=[0.20, 0.45, 0.35]), "Battery_Status": rng.choice(["Weak", "Good", "Strong"], n, p=[0.25, 0.40, 0.35]), "Last_Service_Date_days": rng.randint(30, 900, n), "Warranty_Expiry_Date_days": rng.randint(-400, 800, n), } df = pd.DataFrame(data) # Deterministic target: vehicles in bad shape need maintenance risk_score = np.zeros(n, dtype=float) risk_score += (df["Maintenance_History"] == "Poor").astype(float) * 2.0 risk_score += (df["Tire_Condition"] == "Worn Out").astype(float) * 1.5 risk_score += (df["Brake_Condition"] == "Worn Out").astype(float) * 1.5 risk_score += (df["Battery_Status"] == "Weak").astype(float) * 1.0 risk_score += (df["Reported_Issues"] / 5.0) * 1.5 risk_score += (df["Vehicle_Age"] / 20.0) * 1.0 risk_score += (df["Accident_History"] / 5.0) * 1.0 risk_score += (df["Last_Service_Date_days"] / 900.0) * 1.5 risk_score += (df["Mileage"] / 150000.0) * 1.0 risk_score -= (df["Service_History"] / 15.0) * 0.5 threshold = np.percentile(risk_score, 55) noise = rng.normal(0, 0.3, n) df["Need_Maintenance"] = ((risk_score + noise) >= threshold).astype(int) return df # --------------------------------------------------------------------------- # Preprocessing & training # --------------------------------------------------------------------------- def build_preprocessor(): ordinal_transformer = OrdinalEncoder( categories=[ORDINAL_FEATURES[k] for k in ORDINAL_FEATURES], handle_unknown="use_encoded_value", unknown_value=-1, ) nominal_transformer = OneHotEncoder(handle_unknown="ignore", sparse_output=False) numerical_transformer = RobustScaler() return ColumnTransformer( transformers=[ ("ord", ordinal_transformer, list(ORDINAL_FEATURES.keys())), ("nom", nominal_transformer, NOMINAL_FEATURES), ("num", numerical_transformer, NUMERICAL_FEATURES), ], remainder="drop", ) def train_model(df: pd.DataFrame | None = None): """Train Decision Tree with GridSearchCV + SMOTE (same pipeline as Milestone 1).""" if df is None: df = generate_synthetic_data() X = df.drop(columns=["Need_Maintenance"]) y = df["Need_Maintenance"] feature_columns = X.columns.tolist() X_train, _, y_train, _ = train_test_split( X, y, test_size=0.2, random_state=42, stratify=y ) preprocessor = build_preprocessor() X_train_proc = preprocessor.fit_transform(X_train) smote = SMOTE(random_state=42) X_train_sm, y_train_sm = smote.fit_resample(X_train_proc, y_train) cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) param_grid = { "max_depth": [5, 7, 10, None], "min_samples_leaf": [1, 5, 10], "criterion": ["gini", "entropy"], } grid = GridSearchCV( DecisionTreeClassifier(random_state=42), param_grid=param_grid, cv=cv, scoring="f1", n_jobs=-1, ) grid.fit(X_train_sm, y_train_sm) best_model = grid.best_estimator_ os.makedirs(ARTIFACTS_DIR, exist_ok=True) joblib.dump(best_model, MODEL_PATH) joblib.dump(preprocessor, PREPROCESSOR_PATH) joblib.dump(feature_columns, COLUMNS_PATH) return best_model, preprocessor, feature_columns def load_model(): """Load saved model artifacts, or train from scratch if missing.""" if all(os.path.exists(p) for p in [MODEL_PATH, PREPROCESSOR_PATH, COLUMNS_PATH]): return ( joblib.load(MODEL_PATH), joblib.load(PREPROCESSOR_PATH), joblib.load(COLUMNS_PATH), ) return train_model() # --------------------------------------------------------------------------- # Prediction # --------------------------------------------------------------------------- _model, _preprocessor, _feature_columns = None, None, None def _ensure_loaded(): global _model, _preprocessor, _feature_columns if _model is None: _model, _preprocessor, _feature_columns = load_model() def predict(vehicle_data: dict) -> dict: """Run maintenance prediction on a single vehicle. Returns dict with: prediction, probability, risk_level, feature_importances. """ _ensure_loaded() df = pd.DataFrame([vehicle_data]) # Ensure all expected columns exist for col in _feature_columns: if col not in df.columns: df[col] = 0 df = df[_feature_columns] X_proc = _preprocessor.transform(df) pred = int(_model.predict(X_proc)[0]) proba = float(_model.predict_proba(X_proc)[0][1]) if proba >= 0.75: risk_level = "CRITICAL" elif proba >= 0.50: risk_level = "HIGH" elif proba >= 0.30: risk_level = "MODERATE" else: risk_level = "LOW" # Top contributing features importances = _model.feature_importances_ proc_feature_names = _preprocessor.get_feature_names_out() top_indices = np.argsort(importances)[::-1][:5] top_features = [ {"feature": proc_feature_names[i], "importance": round(float(importances[i]), 4)} for i in top_indices ] return { "needs_maintenance": pred, "probability": round(proba, 4), "risk_level": risk_level, "top_features": top_features, } # Sample vehicles for the UI SAMPLE_VEHICLES = { "Truck-001: High Mileage Fleet Truck": { "Vehicle_Model": "Truck", "Mileage": 128000, "Maintenance_History": "Poor", "Reported_Issues": 4, "Vehicle_Age": 12, "Fuel_Type": "Diesel", "Transmission_Type": "Manual", "Engine_Size": 3500, "Odometer_Reading": 210000, "Owner_Type": "Second", "Insurance_Premium": 28000, "Service_History": 3, "Accident_History": 2, "Fuel_Efficiency": 9.5, "Tire_Condition": "Worn Out", "Brake_Condition": "Worn Out", "Battery_Status": "Weak", "Last_Service_Date_days": 420, "Warranty_Expiry_Date_days": -180, }, "Van-002: Recently Serviced Delivery Van": { "Vehicle_Model": "Van", "Mileage": 45000, "Maintenance_History": "Good", "Reported_Issues": 0, "Vehicle_Age": 3, "Fuel_Type": "Petrol", "Transmission_Type": "Automatic", "Engine_Size": 2000, "Odometer_Reading": 52000, "Owner_Type": "First", "Insurance_Premium": 15000, "Service_History": 8, "Accident_History": 0, "Fuel_Efficiency": 15.2, "Tire_Condition": "New", "Brake_Condition": "New", "Battery_Status": "Strong", "Last_Service_Date_days": 45, "Warranty_Expiry_Date_days": 500, }, "Bus-003: Aging City Bus": { "Vehicle_Model": "Bus", "Mileage": 95000, "Maintenance_History": "Average", "Reported_Issues": 2, "Vehicle_Age": 8, "Fuel_Type": "Diesel", "Transmission_Type": "Automatic", "Engine_Size": 3000, "Odometer_Reading": 180000, "Owner_Type": "Third", "Insurance_Premium": 22000, "Service_History": 6, "Accident_History": 1, "Fuel_Efficiency": 11.0, "Tire_Condition": "Good", "Brake_Condition": "Worn Out", "Battery_Status": "Good", "Last_Service_Date_days": 200, "Warranty_Expiry_Date_days": -90, }, "SUV-004: Low Risk Personal Vehicle": { "Vehicle_Model": "SUV", "Mileage": 22000, "Maintenance_History": "Good", "Reported_Issues": 0, "Vehicle_Age": 2, "Fuel_Type": "Hybrid", "Transmission_Type": "Automatic", "Engine_Size": 2000, "Odometer_Reading": 24000, "Owner_Type": "First", "Insurance_Premium": 18000, "Service_History": 10, "Accident_History": 0, "Fuel_Efficiency": 18.5, "Tire_Condition": "New", "Brake_Condition": "New", "Battery_Status": "Strong", "Last_Service_Date_days": 60, "Warranty_Expiry_Date_days": 700, }, }