kaori02's picture
feat: Milestone 2 — Agentic AI Fleet Management Assistant
66f7e75
"""ML model utilities — trains the same Decision Tree pipeline from Milestone 1."""
import os
import numpy as np
import pandas as pd
import joblib
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import SMOTE
# Persisted training artifacts live in an "artifacts" folder next to this module.
ARTIFACTS_DIR = os.path.join(os.path.dirname(__file__), "artifacts")
MODEL_PATH, PREPROCESSOR_PATH, COLUMNS_PATH = (
    os.path.join(ARTIFACTS_DIR, artifact_name)
    for artifact_name in ("model.joblib", "preprocessor.joblib", "columns.joblib")
)

# Ordinal columns mapped to their categories, listed from worst to best.
# The list order is the order handed to the OrdinalEncoder.
ORDINAL_FEATURES = {
    "Maintenance_History": ["Poor", "Average", "Good"],
    "Tire_Condition": ["Worn Out", "Good", "New"],
    "Brake_Condition": ["Worn Out", "Good", "New"],
    "Battery_Status": ["Weak", "Good", "Strong"],
}

# Unordered categorical columns (one-hot encoded).
NOMINAL_FEATURES = [
    "Vehicle_Model",
    "Fuel_Type",
    "Transmission_Type",
    "Owner_Type",
]

# Numeric columns (robust-scaled).
NUMERICAL_FEATURES = [
    "Mileage",
    "Reported_Issues",
    "Vehicle_Age",
    "Engine_Size",
    "Odometer_Reading",
    "Insurance_Premium",
    "Service_History",
    "Accident_History",
    "Fuel_Efficiency",
    "Last_Service_Date_days",
    "Warranty_Expiry_Date_days",
]

# Value vocabularies used when sampling the nominal columns.
VEHICLE_MODELS = ["Truck", "Van", "Bus", "SUV", "Sedan"]
FUEL_TYPES = ["Electric", "Diesel", "Petrol", "Hybrid"]
TRANSMISSION_TYPES = ["Automatic", "Manual"]
OWNER_TYPES = ["First", "Second", "Third"]
# ---------------------------------------------------------------------------
# Synthetic data generation (mirrors the Kaggle dataset schema)
# ---------------------------------------------------------------------------
def generate_synthetic_data(n: int = 8000, seed: int | None = 42) -> pd.DataFrame:
    """Create a synthetic dataset matching the vehicle maintenance schema.

    Args:
        n: Number of rows to generate.
        seed: Seed for the NumPy random generator. The default (42) reproduces
            the same dataset on every call; pass another value for a different
            sample, or ``None`` to let NumPy seed itself.

    Returns:
        A DataFrame with one row per vehicle, the feature columns sampled below
        and a binary ``Need_Maintenance`` target column.
    """
    rng = np.random.RandomState(seed)

    # NOTE: the sampling order below determines the generated values for a
    # given seed, so keep it stable.
    data = {
        "Vehicle_Model": rng.choice(VEHICLE_MODELS, n),
        "Mileage": rng.randint(5000, 150000, n),
        "Maintenance_History": rng.choice(["Poor", "Average", "Good"], n, p=[0.25, 0.40, 0.35]),
        "Reported_Issues": rng.randint(0, 6, n),
        "Vehicle_Age": rng.randint(1, 20, n),
        "Fuel_Type": rng.choice(FUEL_TYPES, n),
        "Transmission_Type": rng.choice(TRANSMISSION_TYPES, n),
        "Engine_Size": rng.choice([1000, 1500, 2000, 2500, 3000, 3500], n),
        "Odometer_Reading": rng.randint(5000, 250000, n),
        "Owner_Type": rng.choice(OWNER_TYPES, n, p=[0.5, 0.35, 0.15]),
        "Insurance_Premium": rng.randint(8000, 35000, n),
        "Service_History": rng.randint(0, 15, n),
        "Accident_History": rng.randint(0, 5, n),
        "Fuel_Efficiency": rng.uniform(8.0, 22.0, n),
        "Tire_Condition": rng.choice(["Worn Out", "Good", "New"], n, p=[0.25, 0.45, 0.30]),
        "Brake_Condition": rng.choice(["Worn Out", "Good", "New"], n, p=[0.20, 0.45, 0.35]),
        "Battery_Status": rng.choice(["Weak", "Good", "Strong"], n, p=[0.25, 0.40, 0.35]),
        "Last_Service_Date_days": rng.randint(30, 900, n),
        "Warranty_Expiry_Date_days": rng.randint(-400, 800, n),
    }
    df = pd.DataFrame(data)

    # Rule-based risk score: vehicles in bad shape need maintenance.
    # Poor condition flags and heavy usage raise it; a long service history
    # lowers it slightly.
    risk_score = np.zeros(n, dtype=float)
    risk_score += (df["Maintenance_History"] == "Poor").astype(float) * 2.0
    risk_score += (df["Tire_Condition"] == "Worn Out").astype(float) * 1.5
    risk_score += (df["Brake_Condition"] == "Worn Out").astype(float) * 1.5
    risk_score += (df["Battery_Status"] == "Weak").astype(float) * 1.0
    risk_score += (df["Reported_Issues"] / 5.0) * 1.5
    risk_score += (df["Vehicle_Age"] / 20.0) * 1.0
    risk_score += (df["Accident_History"] / 5.0) * 1.0
    risk_score += (df["Last_Service_Date_days"] / 900.0) * 1.5
    risk_score += (df["Mileage"] / 150000.0) * 1.0
    risk_score -= (df["Service_History"] / 15.0) * 0.5

    # The cut-off is the 55th percentile of the noise-free score. Gaussian
    # noise is added before thresholding, so the label is not a pure function
    # of the features (it is still reproducible for a fixed seed).
    threshold = np.percentile(risk_score, 55)
    noise = rng.normal(0, 0.3, n)
    df["Need_Maintenance"] = ((risk_score + noise) >= threshold).astype(int)
    return df
# ---------------------------------------------------------------------------
# Preprocessing & training
# ---------------------------------------------------------------------------
def build_preprocessor():
    """Build the unfitted ColumnTransformer used for training and inference.

    Ordinal columns are integer-encoded in the order given by
    ``ORDINAL_FEATURES`` (unseen values become -1), nominal columns are
    one-hot encoded (unseen values are ignored), numeric columns are scaled
    with ``RobustScaler``, and every other column is dropped.
    """
    ordinal_columns = list(ORDINAL_FEATURES)
    ordinal_categories = list(ORDINAL_FEATURES.values())

    encoders = [
        (
            "ord",
            OrdinalEncoder(
                categories=ordinal_categories,
                handle_unknown="use_encoded_value",
                unknown_value=-1,
            ),
            ordinal_columns,
        ),
        (
            "nom",
            OneHotEncoder(handle_unknown="ignore", sparse_output=False),
            NOMINAL_FEATURES,
        ),
        ("num", RobustScaler(), NUMERICAL_FEATURES),
    ]
    return ColumnTransformer(transformers=encoders, remainder="drop")
def train_model(df: pd.DataFrame | None = None):
    """Train a Decision Tree with GridSearchCV + SMOTE and persist the artifacts.

    SMOTE runs inside the cross-validation pipeline, so each fold oversamples
    only its own training portion. Oversampling the whole training set before
    the grid search would let synthetic neighbours of validation rows leak
    into the training folds and inflate the F1 score used to pick the
    hyper-parameters.

    Args:
        df: Training data containing a ``Need_Maintenance`` target column.
            When ``None``, a synthetic dataset is generated.

    Returns:
        A ``(model, preprocessor, feature_columns)`` tuple: the fitted
        ``DecisionTreeClassifier``, the fitted ``ColumnTransformer`` and the
        list of input column names (every column of ``df`` except the target).
        The same three objects are written to ``ARTIFACTS_DIR`` with joblib.
    """
    # Function-scope import: imblearn's Pipeline accepts samplers as steps and
    # applies them during fit only, never when scoring/predicting.
    from imblearn.pipeline import Pipeline as ImbPipeline

    if df is None:
        df = generate_synthetic_data()

    X = df.drop(columns=["Need_Maintenance"])
    y = df["Need_Maintenance"]
    feature_columns = X.columns.tolist()

    # Only the training portion of the 80/20 stratified split is used here.
    X_train, _, y_train, _ = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    preprocessor = build_preprocessor()
    X_train_proc = preprocessor.fit_transform(X_train)

    pipeline = ImbPipeline(
        steps=[
            ("smote", SMOTE(random_state=42)),
            ("clf", DecisionTreeClassifier(random_state=42)),
        ]
    )
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    param_grid = {
        "clf__max_depth": [5, 7, 10, None],
        "clf__min_samples_leaf": [1, 5, 10],
        "clf__criterion": ["gini", "entropy"],
    }
    grid = GridSearchCV(
        pipeline,
        param_grid=param_grid,
        cv=cv,
        scoring="f1",
        n_jobs=-1,
    )
    grid.fit(X_train_proc, y_train)

    # With the default refit=True the best pipeline is refit on the full
    # training set (SMOTE, then the tree). Return the bare tree so callers
    # keep getting a DecisionTreeClassifier.
    best_model = grid.best_estimator_.named_steps["clf"]

    os.makedirs(ARTIFACTS_DIR, exist_ok=True)
    joblib.dump(best_model, MODEL_PATH)
    joblib.dump(preprocessor, PREPROCESSOR_PATH)
    joblib.dump(feature_columns, COLUMNS_PATH)
    return best_model, preprocessor, feature_columns
def load_model():
    """Return ``(model, preprocessor, feature_columns)``.

    The three artifacts are read from disk with joblib when all of them
    exist; if any file is missing, the model is trained from scratch.
    """
    artifact_paths = (MODEL_PATH, PREPROCESSOR_PATH, COLUMNS_PATH)
    for artifact_path in artifact_paths:
        if not os.path.exists(artifact_path):
            # At least one artifact is absent: rebuild everything.
            return train_model()
    return tuple(joblib.load(artifact_path) for artifact_path in artifact_paths)
# ---------------------------------------------------------------------------
# Prediction
# ---------------------------------------------------------------------------
# Module-level cache for the loaded artifacts; populated on first use.
_model = _preprocessor = _feature_columns = None


def _ensure_loaded():
    """Populate the module-level artifact cache if no model is cached yet."""
    global _model, _preprocessor, _feature_columns
    if _model is not None:
        return
    _model, _preprocessor, _feature_columns = load_model()
def predict(vehicle_data: dict) -> dict:
    """Run maintenance prediction on a single vehicle.

    Args:
        vehicle_data: Mapping of feature name to value for one vehicle.
            Missing numeric features default to 0; missing categorical
            features are treated as an unknown category.

    Returns:
        A dict with the keys:
          - ``needs_maintenance``: predicted class as an int.
          - ``probability``: predicted probability of class 1, rounded to
            four decimals.
          - ``risk_level``: ``"CRITICAL"`` (>= 0.75), ``"HIGH"`` (>= 0.50),
            ``"MODERATE"`` (>= 0.30) or ``"LOW"``.
          - ``top_features``: the five largest entries of the model's
            ``feature_importances_`` (properties of the trained model, not of
            this particular vehicle), each as
            ``{"feature": name, "importance": value}``.
    """
    _ensure_loaded()
    df = pd.DataFrame([vehicle_data])

    # Ensure all expected columns exist. Categorical columns get a string
    # placeholder rather than the integer 0, so the encoders see a
    # type-consistent value and apply their unknown-category handling.
    categorical_columns = set(ORDINAL_FEATURES) | set(NOMINAL_FEATURES)
    for col in _feature_columns:
        if col not in df.columns:
            df[col] = "Unknown" if col in categorical_columns else 0
    df = df[_feature_columns]

    X_proc = _preprocessor.transform(df)
    pred = int(_model.predict(X_proc)[0])

    # Find the probability column for class 1 via classes_ instead of assuming
    # it sits at index 1 (a model fitted on a single class has one column).
    positive_columns = np.flatnonzero(_model.classes_ == 1)
    if positive_columns.size:
        proba = float(_model.predict_proba(X_proc)[0][positive_columns[0]])
    else:
        proba = 0.0

    if proba >= 0.75:
        risk_level = "CRITICAL"
    elif proba >= 0.50:
        risk_level = "HIGH"
    elif proba >= 0.30:
        risk_level = "MODERATE"
    else:
        risk_level = "LOW"

    # Top features by model-level importance, highest first.
    importances = _model.feature_importances_
    proc_feature_names = _preprocessor.get_feature_names_out()
    top_indices = np.argsort(importances)[::-1][:5]
    top_features = [
        {"feature": proc_feature_names[i], "importance": round(float(importances[i]), 4)}
        for i in top_indices
    ]

    return {
        "needs_maintenance": pred,
        "probability": round(proba, 4),
        "risk_level": risk_level,
        "top_features": top_features,
    }
# Sample vehicles for the UI: display label -> feature record accepted by predict().
SAMPLE_VEHICLES = {
    "Truck-001: High Mileage Fleet Truck": dict(
        Vehicle_Model="Truck",
        Mileage=128000,
        Maintenance_History="Poor",
        Reported_Issues=4,
        Vehicle_Age=12,
        Fuel_Type="Diesel",
        Transmission_Type="Manual",
        Engine_Size=3500,
        Odometer_Reading=210000,
        Owner_Type="Second",
        Insurance_Premium=28000,
        Service_History=3,
        Accident_History=2,
        Fuel_Efficiency=9.5,
        Tire_Condition="Worn Out",
        Brake_Condition="Worn Out",
        Battery_Status="Weak",
        Last_Service_Date_days=420,
        Warranty_Expiry_Date_days=-180,
    ),
    "Van-002: Recently Serviced Delivery Van": dict(
        Vehicle_Model="Van",
        Mileage=45000,
        Maintenance_History="Good",
        Reported_Issues=0,
        Vehicle_Age=3,
        Fuel_Type="Petrol",
        Transmission_Type="Automatic",
        Engine_Size=2000,
        Odometer_Reading=52000,
        Owner_Type="First",
        Insurance_Premium=15000,
        Service_History=8,
        Accident_History=0,
        Fuel_Efficiency=15.2,
        Tire_Condition="New",
        Brake_Condition="New",
        Battery_Status="Strong",
        Last_Service_Date_days=45,
        Warranty_Expiry_Date_days=500,
    ),
    "Bus-003: Aging City Bus": dict(
        Vehicle_Model="Bus",
        Mileage=95000,
        Maintenance_History="Average",
        Reported_Issues=2,
        Vehicle_Age=8,
        Fuel_Type="Diesel",
        Transmission_Type="Automatic",
        Engine_Size=3000,
        Odometer_Reading=180000,
        Owner_Type="Third",
        Insurance_Premium=22000,
        Service_History=6,
        Accident_History=1,
        Fuel_Efficiency=11.0,
        Tire_Condition="Good",
        Brake_Condition="Worn Out",
        Battery_Status="Good",
        Last_Service_Date_days=200,
        Warranty_Expiry_Date_days=-90,
    ),
    "SUV-004: Low Risk Personal Vehicle": dict(
        Vehicle_Model="SUV",
        Mileage=22000,
        Maintenance_History="Good",
        Reported_Issues=0,
        Vehicle_Age=2,
        Fuel_Type="Hybrid",
        Transmission_Type="Automatic",
        Engine_Size=2000,
        Odometer_Reading=24000,
        Owner_Type="First",
        Insurance_Premium=18000,
        Service_History=10,
        Accident_History=0,
        Fuel_Efficiency=18.5,
        Tire_Condition="New",
        Brake_Condition="New",
        Battery_Status="Strong",
        Last_Service_Date_days=60,
        Warranty_Expiry_Date_days=700,
    ),
}