| """ML model utilities — trains the same Decision Tree pipeline from Milestone 1.""" |
|
|
| import os |
| import numpy as np |
| import pandas as pd |
| import joblib |
|
|
| from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, RobustScaler |
| from sklearn.compose import ColumnTransformer |
| from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold |
| from sklearn.tree import DecisionTreeClassifier |
| from imblearn.over_sampling import SMOTE |
|
|
# Model artifacts live in an "artifacts" directory next to this module. The
# directory is resolved from the absolute module path so the locations stay
# valid even if __file__ is relative or the working directory changes later.
ARTIFACTS_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "artifacts")
MODEL_PATH = os.path.join(ARTIFACTS_DIR, "model.joblib")
PREPROCESSOR_PATH = os.path.join(ARTIFACTS_DIR, "preprocessor.joblib")
COLUMNS_PATH = os.path.join(ARTIFACTS_DIR, "columns.joblib")


# Ordered categorical features. Each list runs from the worst condition to the
# best one; build_preprocessor() passes these lists to OrdinalEncoder, so the
# position of a label in its list is the integer it is encoded as.
ORDINAL_FEATURES = {
    "Maintenance_History": ["Poor", "Average", "Good"],
    "Tire_Condition": ["Worn Out", "Good", "New"],
    "Brake_Condition": ["Worn Out", "Good", "New"],
    "Battery_Status": ["Weak", "Good", "Strong"],
}


# Unordered categorical features (one-hot encoded by build_preprocessor()).
NOMINAL_FEATURES = ["Vehicle_Model", "Fuel_Type", "Transmission_Type", "Owner_Type"]


# Numeric features (scaled with RobustScaler by build_preprocessor()).
NUMERICAL_FEATURES = [
    "Mileage",
    "Reported_Issues",
    "Vehicle_Age",
    "Engine_Size",
    "Odometer_Reading",
    "Insurance_Premium",
    "Service_History",
    "Accident_History",
    "Fuel_Efficiency",
    "Last_Service_Date_days",
    "Warranty_Expiry_Date_days",
]


# Label sets the synthetic data generator draws the nominal features from.
VEHICLE_MODELS = ["Truck", "Van", "Bus", "SUV", "Sedan"]
FUEL_TYPES = ["Electric", "Diesel", "Petrol", "Hybrid"]
TRANSMISSION_TYPES = ["Automatic", "Manual"]
OWNER_TYPES = ["First", "Second", "Third"]
|
|
|
|
| |
| |
| |
|
|
def generate_synthetic_data(n: int = 8000, random_state: int = 42) -> pd.DataFrame:
    """Create a synthetic dataset matching the vehicle maintenance schema.

    Features are drawn independently of each other. The binary target
    ``Need_Maintenance`` is derived from a hand-weighted risk score plus
    Gaussian noise: rows whose noisy score reaches the 55th percentile of the
    noise-free score are labelled 1.

    Args:
        n: Number of rows to generate. ``0`` yields an empty frame that still
            carries every column.
        random_state: Seed for the NumPy ``RandomState``. The default of 42
            reproduces the dataset this function has always generated.

    Returns:
        DataFrame with one column per model feature plus ``Need_Maintenance``.

    Raises:
        ValueError: If ``n`` is negative.
    """
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n!r}")

    rng = np.random.RandomState(random_state)

    # NOTE: the order of the draws below determines the generated values for a
    # given seed; do not reorder the entries.
    data = {
        "Vehicle_Model": rng.choice(VEHICLE_MODELS, n),
        "Mileage": rng.randint(5000, 150000, n),
        "Maintenance_History": rng.choice(["Poor", "Average", "Good"], n, p=[0.25, 0.40, 0.35]),
        "Reported_Issues": rng.randint(0, 6, n),
        "Vehicle_Age": rng.randint(1, 20, n),
        "Fuel_Type": rng.choice(FUEL_TYPES, n),
        "Transmission_Type": rng.choice(TRANSMISSION_TYPES, n),
        "Engine_Size": rng.choice([1000, 1500, 2000, 2500, 3000, 3500], n),
        "Odometer_Reading": rng.randint(5000, 250000, n),
        "Owner_Type": rng.choice(OWNER_TYPES, n, p=[0.5, 0.35, 0.15]),
        "Insurance_Premium": rng.randint(8000, 35000, n),
        "Service_History": rng.randint(0, 15, n),
        "Accident_History": rng.randint(0, 5, n),
        "Fuel_Efficiency": rng.uniform(8.0, 22.0, n),
        "Tire_Condition": rng.choice(["Worn Out", "Good", "New"], n, p=[0.25, 0.45, 0.30]),
        "Brake_Condition": rng.choice(["Worn Out", "Good", "New"], n, p=[0.20, 0.45, 0.35]),
        "Battery_Status": rng.choice(["Weak", "Good", "Strong"], n, p=[0.25, 0.40, 0.35]),
        "Last_Service_Date_days": rng.randint(30, 900, n),
        "Warranty_Expiry_Date_days": rng.randint(-400, 800, n),
    }

    df = pd.DataFrame(data)

    def flag(column: str, label: str) -> np.ndarray:
        """Return 1.0 where ``column`` equals ``label`` and 0.0 elsewhere."""
        return (df[column] == label).to_numpy(dtype=float)

    def ratio(column: str, full_scale: float) -> np.ndarray:
        """Return ``column`` divided by ``full_scale`` as a float array."""
        return df[column].to_numpy(dtype=float) / full_scale

    # Hand-weighted risk score: bad component condition and heavy or overdue
    # usage push the score up, a longer service history pulls it down.
    risk_score = np.zeros(n, dtype=float)
    risk_score += flag("Maintenance_History", "Poor") * 2.0
    risk_score += flag("Tire_Condition", "Worn Out") * 1.5
    risk_score += flag("Brake_Condition", "Worn Out") * 1.5
    risk_score += flag("Battery_Status", "Weak") * 1.0
    risk_score += ratio("Reported_Issues", 5.0) * 1.5
    risk_score += ratio("Vehicle_Age", 20.0) * 1.0
    risk_score += ratio("Accident_History", 5.0) * 1.0
    risk_score += ratio("Last_Service_Date_days", 900.0) * 1.5
    risk_score += ratio("Mileage", 150000.0) * 1.0
    risk_score -= ratio("Service_History", 15.0) * 0.5

    # A percentile of an empty sample is undefined, so skip it when n == 0;
    # the comparison below then simply produces an empty target column.
    threshold = np.percentile(risk_score, 55) if n > 0 else 0.0
    noise = rng.normal(0, 0.3, n)
    df["Need_Maintenance"] = ((risk_score + noise) >= threshold).astype(int)

    return df
|
|
|
|
| |
| |
| |
|
|
def build_preprocessor() -> ColumnTransformer:
    """Build the (unfitted) feature preprocessor.

    The returned ``ColumnTransformer`` applies:

    * ``OrdinalEncoder`` to the ``ORDINAL_FEATURES`` columns, using the
      category order declared there; labels outside those lists encode as -1.
    * ``OneHotEncoder`` to ``NOMINAL_FEATURES``; unseen labels are ignored and
      the output is dense.
    * ``RobustScaler`` to ``NUMERICAL_FEATURES``.

    Columns not named in any of the three groups are dropped.
    """
    # Take columns and categories from the same iteration so the i-th category
    # list always belongs to the i-th ordinal column.
    ordinal_columns = list(ORDINAL_FEATURES)
    ordinal_categories = [ORDINAL_FEATURES[column] for column in ordinal_columns]

    ordinal_transformer = OrdinalEncoder(
        categories=ordinal_categories,
        handle_unknown="use_encoded_value",
        unknown_value=-1,
    )
    nominal_transformer = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    numerical_transformer = RobustScaler()

    return ColumnTransformer(
        transformers=[
            ("ord", ordinal_transformer, ordinal_columns),
            ("nom", nominal_transformer, NOMINAL_FEATURES),
            ("num", numerical_transformer, NUMERICAL_FEATURES),
        ],
        remainder="drop",
    )
|
|
|
|
def train_model(df: pd.DataFrame | None = None):
    """Train the Decision Tree with GridSearchCV + SMOTE and persist it.

    The model family, hyper-parameter grid, scoring and seeds are unchanged.
    SMOTE is applied inside each cross-validation fold (through an imblearn
    ``Pipeline``) rather than once before the search: oversampling before
    splitting lets synthetic points built from validation rows leak into the
    training folds and inflates the F1 score used to pick hyper-parameters.
    The final refit still trains the tree on the SMOTE-resampled training set.

    Args:
        df: Training data including the ``Need_Maintenance`` target column.
            When ``None``, ``generate_synthetic_data()`` is used.

    Returns:
        Tuple ``(best_model, preprocessor, feature_columns)``: the fitted
        ``DecisionTreeClassifier``, the fitted ``ColumnTransformer`` and the
        list of input column names (every column of ``df`` except the target).

    Side effects:
        Creates ``ARTIFACTS_DIR`` if needed and writes the three returned
        objects to ``MODEL_PATH``, ``PREPROCESSOR_PATH`` and ``COLUMNS_PATH``.
    """
    # Local import: imblearn is already a dependency of this module, but its
    # Pipeline is only needed by this function.
    from imblearn.pipeline import Pipeline

    if df is None:
        df = generate_synthetic_data()

    X = df.drop(columns=["Need_Maintenance"])
    y = df["Need_Maintenance"]
    feature_columns = X.columns.tolist()

    # The 20% hold-out part of the split is discarded here; only the training
    # portion is used for fitting.
    X_train, _, y_train, _ = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    preprocessor = build_preprocessor()
    X_train_proc = preprocessor.fit_transform(X_train)

    # Resampling is a pipeline step so it only ever sees the training part of
    # each CV fold; imblearn's Pipeline skips samplers at predict time.
    resample_then_fit = Pipeline(
        steps=[
            ("smote", SMOTE(random_state=42)),
            ("clf", DecisionTreeClassifier(random_state=42)),
        ]
    )

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    param_grid = {
        "clf__max_depth": [5, 7, 10, None],
        "clf__min_samples_leaf": [1, 5, 10],
        "clf__criterion": ["gini", "entropy"],
    }

    grid = GridSearchCV(
        resample_then_fit,
        param_grid=param_grid,
        cv=cv,
        scoring="f1",
        n_jobs=-1,
    )
    grid.fit(X_train_proc, y_train)

    # Callers (and the saved artifact) expect the bare classifier, not the
    # pipeline, so unwrap the refit tree.
    best_model = grid.best_estimator_.named_steps["clf"]

    os.makedirs(ARTIFACTS_DIR, exist_ok=True)
    joblib.dump(best_model, MODEL_PATH)
    joblib.dump(preprocessor, PREPROCESSOR_PATH)
    joblib.dump(feature_columns, COLUMNS_PATH)

    return best_model, preprocessor, feature_columns
|
|
|
|
def load_model():
    """Load the saved model artifacts, or train from scratch if any is missing.

    Returns:
        Tuple ``(model, preprocessor, feature_columns)``, either read from
        ``MODEL_PATH``, ``PREPROCESSOR_PATH`` and ``COLUMNS_PATH`` or freshly
        produced by ``train_model()`` (which also writes those files).
    """
    artifact_paths = (MODEL_PATH, PREPROCESSOR_PATH, COLUMNS_PATH)

    # All three files are needed; a partial set is treated like no set at all.
    if not all(os.path.exists(path) for path in artifact_paths):
        return train_model()

    # NOTE(security): joblib.load unpickles its input and can execute
    # arbitrary code, so these files must only ever come from a trusted source.
    return tuple(joblib.load(path) for path in artifact_paths)
|
|
|
|
| |
| |
| |
|
|
# Module-level cache of the model artifacts; filled on first use by
# _ensure_loaded() so importing this module stays cheap.
_model, _preprocessor, _feature_columns = None, None, None


def _ensure_loaded() -> None:
    """Populate the module-level artifact cache if it is still empty.

    Calls ``load_model()`` the first time it runs (which may train a model)
    and does nothing once ``_model`` is set.
    """
    global _model, _preprocessor, _feature_columns
    if _model is None:
        _model, _preprocessor, _feature_columns = load_model()
|
|
|
|
def predict(vehicle_data: dict) -> dict:
    """Run the maintenance prediction for a single vehicle.

    Args:
        vehicle_data: Mapping of feature name to raw value. Any feature the
            model expects but that is absent here is filled with ``0``;
            keys the model does not expect are ignored.

    Returns:
        dict with the keys:

        * ``needs_maintenance`` - predicted class as ``int``.
        * ``probability`` - model probability of class ``1``, rounded to four
          decimals (``0.0`` if the model has never seen class ``1``).
        * ``risk_level`` - ``"CRITICAL"`` (>= 0.75), ``"HIGH"`` (>= 0.50),
          ``"MODERATE"`` (>= 0.30) or ``"LOW"``, from the unrounded probability.
        * ``top_features`` - the five largest entries of the model's
          ``feature_importances_`` as ``{"feature", "importance"}`` dicts.
          These describe the fitted model as a whole, not this vehicle.
    """
    _ensure_loaded()

    df = pd.DataFrame([vehicle_data])

    # Best-effort handling of incomplete input: absent features default to 0.
    for col in _feature_columns:
        if col not in df.columns:
            df[col] = 0

    # Select and order the columns the way the preprocessor was fitted.
    df = df[_feature_columns]
    X_proc = _preprocessor.transform(df)

    pred = int(_model.predict(X_proc)[0])

    # Look the positive class up in classes_ instead of assuming it sits at
    # index 1, which would raise IndexError for a single-class model.
    class_probs = _model.predict_proba(X_proc)[0]
    classes = list(_model.classes_)
    proba = float(class_probs[classes.index(1)]) if 1 in classes else 0.0

    if proba >= 0.75:
        risk_level = "CRITICAL"
    elif proba >= 0.50:
        risk_level = "HIGH"
    elif proba >= 0.30:
        risk_level = "MODERATE"
    else:
        risk_level = "LOW"

    importances = _model.feature_importances_
    proc_feature_names = _preprocessor.get_feature_names_out()
    top_indices = np.argsort(importances)[::-1][:5]
    top_features = [
        {
            # Plain str / float so the result contains no NumPy scalar types.
            "feature": str(proc_feature_names[i]),
            "importance": round(float(importances[i]), 4),
        }
        for i in top_indices
    ]

    return {
        "needs_maintenance": pred,
        "probability": round(proba, 4),
        "risk_level": risk_level,
        "top_features": top_features,
    }
|
|
|
|
| |
# Example vehicles keyed by a display label. Every entry supplies all 19 model
# input features, so each value can be passed to predict() as-is.
SAMPLE_VEHICLES = {
    "Truck-001: High Mileage Fleet Truck": {
        "Vehicle_Model": "Truck",
        "Mileage": 128000,
        "Maintenance_History": "Poor",
        "Reported_Issues": 4,
        "Vehicle_Age": 12,
        "Fuel_Type": "Diesel",
        "Transmission_Type": "Manual",
        "Engine_Size": 3500,
        "Odometer_Reading": 210000,
        "Owner_Type": "Second",
        "Insurance_Premium": 28000,
        "Service_History": 3,
        "Accident_History": 2,
        "Fuel_Efficiency": 9.5,
        "Tire_Condition": "Worn Out",
        "Brake_Condition": "Worn Out",
        "Battery_Status": "Weak",
        "Last_Service_Date_days": 420,
        "Warranty_Expiry_Date_days": -180,
    },
    "Van-002: Recently Serviced Delivery Van": {
        "Vehicle_Model": "Van",
        "Mileage": 45000,
        "Maintenance_History": "Good",
        "Reported_Issues": 0,
        "Vehicle_Age": 3,
        "Fuel_Type": "Petrol",
        "Transmission_Type": "Automatic",
        "Engine_Size": 2000,
        "Odometer_Reading": 52000,
        "Owner_Type": "First",
        "Insurance_Premium": 15000,
        "Service_History": 8,
        "Accident_History": 0,
        "Fuel_Efficiency": 15.2,
        "Tire_Condition": "New",
        "Brake_Condition": "New",
        "Battery_Status": "Strong",
        "Last_Service_Date_days": 45,
        "Warranty_Expiry_Date_days": 500,
    },
    "Bus-003: Aging City Bus": {
        "Vehicle_Model": "Bus",
        "Mileage": 95000,
        "Maintenance_History": "Average",
        "Reported_Issues": 2,
        "Vehicle_Age": 8,
        "Fuel_Type": "Diesel",
        "Transmission_Type": "Automatic",
        "Engine_Size": 3000,
        "Odometer_Reading": 180000,
        "Owner_Type": "Third",
        "Insurance_Premium": 22000,
        "Service_History": 6,
        "Accident_History": 1,
        "Fuel_Efficiency": 11.0,
        "Tire_Condition": "Good",
        "Brake_Condition": "Worn Out",
        "Battery_Status": "Good",
        "Last_Service_Date_days": 200,
        "Warranty_Expiry_Date_days": -90,
    },
    "SUV-004: Low Risk Personal Vehicle": {
        "Vehicle_Model": "SUV",
        "Mileage": 22000,
        "Maintenance_History": "Good",
        "Reported_Issues": 0,
        "Vehicle_Age": 2,
        "Fuel_Type": "Hybrid",
        "Transmission_Type": "Automatic",
        "Engine_Size": 2000,
        "Odometer_Reading": 24000,
        "Owner_Type": "First",
        "Insurance_Premium": 18000,
        "Service_History": 10,
        "Accident_History": 0,
        "Fuel_Efficiency": 18.5,
        "Tire_Condition": "New",
        "Brake_Condition": "New",
        "Battery_Status": "Strong",
        "Last_Service_Date_days": 60,
        "Warranty_Expiry_Date_days": 700,
    },
}
|
|