| |
| |
| |
| |
|
|
| |
| |
| |
| import pandas as pd |
| import numpy as np |
| from datetime import date |
| import warnings |
| warnings.filterwarnings("ignore") |
|
|
# Optional IPython support: rich rendering inside notebooks, plain printing
# everywhere else.
try:
    from IPython.display import display, Markdown
except ImportError:
    IPYTHON_OK = False

    def display(x):
        """Stand-in for IPython's ``display``: print the object."""
        print(x)

    def Markdown(x):
        """Stand-in for IPython's ``Markdown``: return the text unchanged."""
        return x
else:
    IPYTHON_OK = True
|
|
# Optional matplotlib support: MATPLOTLIB_OK records whether plotting is
# available. The warning text was mis-encoded (UTF-8 shown through a Greek
# code page) and is restored here.
try:
    import matplotlib.pyplot as plt
    MATPLOTLIB_OK = True
except ImportError:
    MATPLOTLIB_OK = False
    print("⚠️ matplotlib non installé — pas de graphes. pip install matplotlib")
|
|
| |
| |
| |
|
|
# School holiday calendar: school year -> zone -> list of (start, end) dates.
# Both bounds are treated as inclusive by the lookup helpers in this file.
# Within each zone the periods are listed chronologically (autumn, Christmas,
# winter, spring, summer).
#
# NOTE(review): the February-April 2027 entries reuse the 2026 day numbers and
# therefore start on Sundays (e.g. 2027-02-14) - presumably placeholders.
# NOTE(review): the winter/spring 2025 ranges for zones A and B look swapped
# relative to the published calendar - TODO confirm against the official source.
VACANCES = {
    "2023-2024": {
        "A": [
            (date(2023, 10, 21), date(2023, 11, 5)),
            (date(2023, 12, 23), date(2024, 1, 7)),
            (date(2024, 2, 17), date(2024, 3, 3)),
            (date(2024, 4, 13), date(2024, 4, 28)),
            (date(2024, 7, 6), date(2024, 9, 1)),
        ],
        "B": [
            (date(2023, 10, 21), date(2023, 11, 5)),
            (date(2023, 12, 23), date(2024, 1, 7)),
            (date(2024, 2, 24), date(2024, 3, 10)),
            (date(2024, 4, 20), date(2024, 5, 5)),
            (date(2024, 7, 6), date(2024, 9, 1)),
        ],
        "C": [
            (date(2023, 10, 21), date(2023, 11, 5)),
            (date(2023, 12, 23), date(2024, 1, 7)),
            (date(2024, 2, 10), date(2024, 2, 25)),
            (date(2024, 4, 6), date(2024, 4, 21)),
            (date(2024, 7, 6), date(2024, 9, 1)),
        ],
    },
    "2024-2025": {
        "A": [
            (date(2024, 10, 19), date(2024, 11, 3)),
            (date(2024, 12, 21), date(2025, 1, 5)),
            (date(2025, 2, 8), date(2025, 2, 23)),
            (date(2025, 4, 5), date(2025, 4, 20)),
            (date(2025, 7, 5), date(2025, 8, 31)),
        ],
        "B": [
            (date(2024, 10, 19), date(2024, 11, 3)),
            (date(2024, 12, 21), date(2025, 1, 5)),
            (date(2025, 2, 22), date(2025, 3, 9)),
            (date(2025, 4, 19), date(2025, 5, 4)),
            (date(2025, 7, 5), date(2025, 8, 31)),
        ],
        "C": [
            (date(2024, 10, 19), date(2024, 11, 3)),
            (date(2024, 12, 21), date(2025, 1, 5)),
            (date(2025, 2, 15), date(2025, 3, 2)),
            (date(2025, 4, 12), date(2025, 4, 27)),
            (date(2025, 7, 5), date(2025, 8, 31)),
        ],
    },
    "2025-2026": {
        "A": [
            (date(2025, 10, 18), date(2025, 11, 2)),
            (date(2025, 12, 20), date(2026, 1, 4)),
            (date(2026, 2, 14), date(2026, 3, 1)),
            (date(2026, 4, 4), date(2026, 4, 19)),
            (date(2026, 7, 4), date(2026, 8, 31)),
        ],
        "B": [
            (date(2025, 10, 18), date(2025, 11, 2)),
            (date(2025, 12, 20), date(2026, 1, 4)),
            (date(2026, 2, 21), date(2026, 3, 8)),
            (date(2026, 4, 11), date(2026, 4, 26)),
            (date(2026, 7, 4), date(2026, 8, 31)),
        ],
        "C": [
            (date(2025, 10, 18), date(2025, 11, 2)),
            (date(2025, 12, 20), date(2026, 1, 4)),
            (date(2026, 2, 7), date(2026, 2, 22)),
            (date(2026, 3, 28), date(2026, 4, 12)),
            (date(2026, 7, 4), date(2026, 8, 31)),
        ],
    },
    "2026-2027": {
        "A": [
            (date(2026, 10, 17), date(2026, 11, 1)),
            (date(2026, 12, 19), date(2027, 1, 3)),
            (date(2027, 2, 14), date(2027, 3, 1)),
            (date(2027, 4, 4), date(2027, 4, 19)),
            (date(2027, 7, 3), date(2027, 8, 31)),
        ],
        "B": [
            (date(2026, 10, 17), date(2026, 11, 1)),
            (date(2026, 12, 19), date(2027, 1, 3)),
            (date(2027, 2, 21), date(2027, 3, 8)),
            (date(2027, 4, 11), date(2027, 4, 26)),
            (date(2027, 7, 3), date(2027, 8, 31)),
        ],
        "C": [
            (date(2026, 10, 17), date(2026, 11, 1)),
            (date(2026, 12, 19), date(2027, 1, 3)),
            (date(2027, 2, 7), date(2027, 2, 22)),
            (date(2027, 3, 28), date(2027, 4, 12)),
            (date(2027, 7, 3), date(2027, 8, 31)),
        ],
    },
}
|
|
# Maps a DR name (spelled like the French académies, without accents) to its
# school-holiday zone.
# Fix: Nancy-Metz belongs to zone B in the official zoning (it was listed
# under C). "Normandie" (the merged Caen/Rouen académie) is added alongside
# the historical names so either spelling resolves to zone B.
DR_TO_ZONE = {
    # Zone A
    "Besancon": "A", "Bordeaux": "A", "Clermont-Ferrand": "A",
    "Dijon": "A", "Grenoble": "A", "Lyon": "A", "Limoges": "A", "Poitiers": "A",
    # Zone B
    "Aix-Marseille": "B", "Amiens": "B", "Caen": "B", "Lille": "B",
    "Nancy-Metz": "B", "Nantes": "B", "Nice": "B", "Normandie": "B",
    "Orleans-Tours": "B", "Reims": "B", "Rennes": "B", "Rouen": "B",
    "Strasbourg": "B",
    # Zone C
    "Creteil": "C", "Montpellier": "C",
    "Paris": "C", "Toulouse": "C", "Versailles": "C",
    # NOTE(review): "AFC" is not an académie; presumably a project-specific
    # entity deliberately aligned on zone C - confirm with the data owner.
    "AFC": "C",
}
|
|
def get_zone(dr):
    """Return the holiday zone for a DR name.

    Names that are not listed in ``DR_TO_ZONE`` fall back to zone "C".
    """
    if dr in DR_TO_ZONE:
        return DR_TO_ZONE[dr]
    return "C"
|
|
def is_vacances(d, zone, vac):
    """Tell whether date ``d`` falls inside a holiday period of ``zone``.

    Args:
        d: the date to test.
        zone: key into ``vac``.
        vac: mapping zone -> list of (start, end) pairs; both bounds are
            inclusive. A zone missing from the mapping has no holidays.

    Returns:
        True if ``d`` lies in at least one period, False otherwise.
    """
    periodes = vac.get(zone, [])
    return any(debut <= d <= fin for debut, fin in periodes)
|
|
def get_annee_scolaire(d):
    """Return the school-year label ("YYYY-YYYY") that contains date ``d``.

    A school year is taken to start on 1 September: September-December dates
    belong to the year starting that calendar year, January-August dates to
    the year that started the previous calendar year.
    """
    annee_debut = d.year if d.month >= 9 else d.year - 1
    return f"{annee_debut}-{annee_debut + 1}"
|
|
def _nom_periode(debut):
    """Name a holiday period from its start date, or None if unrecognised."""
    mois = debut.month
    if mois in (10, 11):
        return "Toussaint"
    if mois in (12, 1):
        return "Noel"
    # Winter breaks start in February or, at the latest, in the first days of
    # March; spring breaks start from late March onwards.
    if mois == 2 or (mois == 3 and debut.day < 15):
        return "Hiver"
    if mois in (3, 4, 5):
        return "Printemps"
    if mois in (7, 8):
        return "Ete"
    return None


def get_periode_vacances(d, vac):
    """Return the name of the holiday period containing ``d`` in any zone.

    Zones are scanned in the order A, B, C and the first period containing
    ``d`` decides the label, regardless of which zone it belongs to.

    The label is derived from the period's *start* date rather than from the
    month of ``d``. Labelling by the month of ``d`` mislabelled a spring break
    starting in late March as "Hiver", and returned "Hors_vacances" for
    1 September even when it was inside a summer period.

    Args:
        d: the date to classify.
        vac: mapping zone -> list of (start, end) pairs, bounds inclusive.

    Returns:
        One of "Toussaint", "Noel", "Hiver", "Printemps", "Ete", or
        "Hors_vacances" when no recognised period contains ``d``.
    """
    for zone in ("A", "B", "C"):
        for debut, fin in vac.get(zone, []):
            if not (debut <= d <= fin):
                continue
            nom = _nom_periode(debut)
            if nom is not None:
                return nom
    return "Hors_vacances"
|
|
def add_vacances(df):
    """Return a copy of ``df`` enriched with school-holiday columns.

    Expects the columns ``Date`` (anything ``pd.to_datetime`` accepts) and
    ``DR``. The input frame is not modified.

    Added / rewritten columns:
        Date             : timezone-naive timestamps.
        zone_vacances    : holiday zone of the row's DR (see ``get_zone``).
        annee_scolaire   : school-year label of the date.
        is_vacances_zone : True when the date is a holiday in the row's zone.
        periode_vacances : name of the holiday period if the date is a
                           holiday in *any* zone, else "Hors_vacances".

    The derived columns are built with list comprehensions instead of
    row-wise ``DataFrame.apply``: on an empty frame ``apply(axis=1)`` hands
    back a DataFrame, which cannot be assigned to a single column, and it is
    also much slower on large frames.
    """
    df = df.copy()
    df["Date"] = pd.to_datetime(df["Date"]).dt.tz_localize(None)
    df["zone_vacances"] = df["DR"].apply(get_zone)

    jours = [ts.date() for ts in df["Date"]]
    annees = [get_annee_scolaire(jour) for jour in jours]
    # A school year absent from VACANCES yields an empty calendar, i.e. the
    # row is never flagged as holiday.
    calendriers = [VACANCES.get(annee, {}) for annee in annees]

    df["annee_scolaire"] = annees
    df["is_vacances_zone"] = [
        is_vacances(jour, zone, cal)
        for jour, zone, cal in zip(jours, df["zone_vacances"], calendriers)
    ]
    df["periode_vacances"] = [
        get_periode_vacances(jour, cal) for jour, cal in zip(jours, calendriers)
    ]
    return df
|
|
| |
| |
| |
|
|
def ecart_absolu(y_true, y_pred):
    """Mean absolute difference between observed and predicted values."""
    reel = np.asarray(y_true)
    prevu = np.asarray(y_pred)
    return np.mean(np.abs(reel - prevu))
|
|
def ecart_relatif_pct(y_true, y_pred):
    """Mean absolute error relative to the observed value, in percent.

    The denominator is floored at 1 so that observed values below 1 (zero in
    particular) do not cause a division by zero.
    """
    reel = np.asarray(y_true)
    prevu = np.asarray(y_pred)
    denominateur = np.maximum(reel, 1)
    erreurs = np.abs((reel - prevu) / denominateur)
    return np.mean(erreurs) * 100
|
|
| |
| |
| |
|
|
def display_md(text):
    """Show ``text`` as Markdown under IPython, or print it as-is otherwise."""
    if not IPYTHON_OK:
        print(text)
        return
    display(Markdown(text))
|
|
def display_df(df, title=None):
    """Display a DataFrame, optionally preceded by a level-3 Markdown title.

    Under IPython the frame is shown with centred cells and bold, grey-shaded
    headers; otherwise it is printed as plain text without the index.
    """
    if title:
        display_md(f"### {title}")

    if not IPYTHON_OK:
        print(df.to_string(index=False))
        return

    style_entete = {
        'selector': 'th',
        'props': [
            ('text-align', 'center'),
            ('font-weight', 'bold'),
            ('background-color', '#f0f0f0'),
        ],
    }
    rendu = df.style.set_properties(**{'text-align': 'center'})
    rendu = rendu.set_table_styles([style_entete])
    display(rendu)
|
|
| |
| |
| |
|
|
def analyse_globale_avant(df):
    """Report prediction errors BEFORE any correction, holidays vs. the rest.

    Meant to be run on the raw frame (original ``prediction_XGB`` column).
    Only rows with ``count > 0`` and a non-null prediction are analysed.
    Requires the columns ``count``, ``prediction_XGB`` and
    ``is_vacances_zone``.

    Changes from the previous version:
      * the diagnostic no longer claims the volume "drops" during holidays
        when it actually rises - the verb follows the sign of the difference;
      * user-facing texts, which were mis-encoded, are restored.

    Args:
        df: DataFrame with the columns listed above.

    Returns:
        A summary DataFrame with one row per period ("Vacances scolaires",
        "Hors vacances") that has data, or None when no row is usable.
    """
    dfp = df[(df["count"] > 0) & (df["prediction_XGB"].notna())].copy()
    if dfp.empty:
        display_md("❌ **Aucune donnée passée avec prédiction valide.**")
        return None

    # Force a real boolean mask so that ``~`` is a logical negation even if
    # the column was loaded with object dtype.
    mask_v = dfp["is_vacances_zone"].astype(bool)
    mask_h = ~mask_v

    rows = []
    for mask, label in [(mask_v, "Vacances scolaires"), (mask_h, "Hors vacances")]:
        sub = dfp[mask]
        if sub.empty:
            continue
        yt = sub["count"].values
        yp = sub["prediction_XGB"].values
        rows.append({
            "Periode": label,
            "Nb_jours": len(sub),
            "Vol_reel": round(yt.mean(), 1),
            "Vol_pred": round(yp.mean(), 1),
            # Denominator floored at 1 to avoid dividing by a near-zero mean.
            "Surprediction_%": round(((yp.mean() - yt.mean()) / max(yt.mean(), 1)) * 100, 1),
            "Ecart_Absolu": round(ecart_absolu(yt, yp), 1),
            "Ecart_Relatif_%": round(ecart_relatif_pct(yt, yp), 1),
        })

    df_res = pd.DataFrame(rows)

    display_md("\n".join([
        "",
        "## 📊 ÉTAT DES LIEUX — AVANT CORRECTION (données brutes)",
        "",
        "**Procédure :**",
        "1. Identification des jours de vacances scolaires par zone (A/B/C)",
        "2. Comparaison volume réel d'appels vs prédiction XGBoost",
        "3. Métriques :",
        "   - **Ecart_Absolu** = erreur moyenne en nombre d'appels/jour",
        "   - **Ecart_Relatif_%** = erreur moyenne relative (% du volume réel)",
        "4. **Objectif** : mesurer le biais lié aux vacances scolaires",
        "",
    ]))

    display_df(df_res, "📋 TABLEAU RÉCAPITULATIF AVANT CORRECTION")

    # The comparison needs both periods to be present.
    if len(df_res) >= 2:
        row_v = df_res[df_res["Periode"] == "Vacances scolaires"].iloc[0]
        row_h = df_res[df_res["Periode"] == "Hors vacances"].iloc[0]
        variation = ((row_v["Vol_reel"] - row_h["Vol_reel"]) / max(row_h["Vol_reel"], 1)) * 100

        # Word the diagnostic according to the direction actually observed.
        if variation < 0:
            verbe, nom = "baisse", "baisse"
        else:
            verbe, nom = "augmente", "hausse"
        sens = "**sur-prédit**" if row_v["Surprediction_%"] > 0 else "**sous-prédit**"

        display_md("\n".join([
            "",
            "## 🔍 DIAGNOSTIC",
            "",
            f"→ Pendant les vacances scolaires, le volume **{verbe} de {abs(variation):.1f}%**",
            f"- **{row_v['Vol_reel']:.0f}** appels/jour en vacances",
            f"- **{row_h['Vol_reel']:.0f}** appels/jour hors vacances",
            "",
            f"→ Le modèle {sens}",
            f"de **{abs(row_v['Surprediction_%']):.1f}%** en période de vacances",
            f"→ **Biais détecté** : le modèle ne capte pas complètement cette {nom}",
            "",
            f"→ **Ecart_Absolu** = **{row_v['Ecart_Absolu']:.1f}** appels/jour en vacances",
            f"(marge d'erreur de **{row_v['Ecart_Relatif_%']:.1f}%** du volume réel)",
            "",
        ]))

    return df_res
|
|
| |
| |
| |
|
|
def calcule_facteurs_split(df, split_date):
    """Compute holiday correction factors from the TRAIN period only.

    TRAIN = rows strictly before ``split_date`` with ``count > 0`` and a
    non-null ``prediction_XGB``. Each factor is
    ``mean(count) / max(mean(prediction_XGB), 1)`` over holiday rows.

    Changes from the previous version:
      * an empty TRAIN set no longer crashes on ``NaT.strftime``; a warning
        is shown and an empty dict is returned;
      * the docstring no longer promises a metadata dict - only the factors
        were ever returned;
      * user-facing texts, which were mis-encoded, are restored.

    Args:
        df: DataFrame with columns Date, count, prediction_XGB,
            is_vacances_zone, zone_vacances, sous_type_accueil.
        split_date: str or Timestamp, e.g. "2024-10-01".

    Returns:
        dict mapping ``("GLOBAL", "ALL")`` and ``(zone, sous_type)`` to a
        float factor. The global key is absent when TRAIN has no holiday row.
    """
    split_dt = pd.to_datetime(split_date)

    exploitable = (df["count"] > 0) & (df["prediction_XGB"].notna())
    df_train = df[(df["Date"] < split_dt) & exploitable].copy()

    if df_train.empty:
        split_txt = split_dt.strftime("%Y-%m-%d")
        display_md(
            f"⚠️ Aucune donnée TRAIN exploitable avant `{split_txt}` "
            "→ aucun facteur calculé"
        )
        return {}

    debut_txt = df_train["Date"].min().strftime("%Y-%m-%d")
    fin_txt = df_train["Date"].max().strftime("%Y-%m-%d")
    display_md("\n".join([
        "",
        "### 🔧 Calcul des facteurs sur TRAIN",
        f"- Période TRAIN : `{debut_txt}` → `{fin_txt}`",
        f"- Nb de jours TRAIN : **{len(df_train)}**",
        "",
    ]))

    def _facteur(masque):
        # Ratio of mean observed volume to mean predicted volume; the
        # denominator is floored at 1 to avoid exploding on tiny predictions.
        reel = df_train.loc[masque, "count"].mean()
        prevu = df_train.loc[masque, "prediction_XGB"].mean()
        return reel / max(prevu, 1)

    facteurs = {}
    m_v = df_train["is_vacances_zone"]

    # Global factor: every holiday row of TRAIN, all zones and sub-types.
    if m_v.sum() > 0:
        facteur_global = _facteur(m_v)
        facteurs[("GLOBAL", "ALL")] = facteur_global
        display_md(f"✓ **Facteur global** calculé sur TRAIN : `{facteur_global:.4f}`")
    else:
        display_md("⚠️ Aucun jour de vacances dans la période TRAIN → facteur global = 1.0")

    # Granular factors: one per (zone, sub-type), kept only when at least
    # 3 holiday rows support it.
    n_gran = 0
    sous_types = df_train["sous_type_accueil"].dropna().unique()
    for zone in ["A", "B", "C"]:
        for st in sous_types:
            m = (
                (df_train["zone_vacances"] == zone)
                & (df_train["sous_type_accueil"] == st)
                & df_train["is_vacances_zone"]
            )
            if m.sum() < 3:
                continue
            facteurs[(zone, st)] = _facteur(m)
            n_gran += 1

    display_md(f"✓ **{n_gran}** facteurs granulaires (zone × sous-type) calculés")

    return facteurs
|
|
| |
| |
| |
|
|
def applique_correction(df, facteurs):
    """Apply the correction factors to every holiday row of ``df``.

    Adds the column ``prediction_XGB_corrige``: equal to ``prediction_XGB``
    outside holidays, and to ``prediction_XGB * factor`` on holiday rows.
    The factor is the ``(zone, sous_type)`` one when present in ``facteurs``,
    otherwise the ``("GLOBAL", "ALL")`` one, otherwise 1.0.

    Changes from the previous version:
      * holiday rows whose ``sous_type_accueil`` is missing used to be left
        uncorrected; they now receive the global factor like any other row
        without a granular factor;
      * the confirmation message, whose literal was broken across two lines
        by a mis-encoded character, is restored.

    Args:
        df: DataFrame with columns prediction_XGB, is_vacances_zone (assumed
            boolean), zone_vacances and sous_type_accueil. Not modified.
        facteurs: dict as returned by ``calcule_facteurs_split``.

    Returns:
        A copy of ``df`` with the ``prediction_XGB_corrige`` column.
    """
    df = df.copy()
    prediction = df["prediction_XGB"].astype(float)
    df["prediction_XGB_corrige"] = prediction
    m_v = df["is_vacances_zone"]

    # 1) Global fallback on every holiday row.
    cle_globale = ("GLOBAL", "ALL")
    facteur_global = facteurs.get(cle_globale, 1.0)
    df.loc[m_v, "prediction_XGB_corrige"] = prediction[m_v] * facteur_global

    # 2) Granular factors override the fallback where one exists.
    for cle, f in facteurs.items():
        if cle == cle_globale:
            continue
        zone, st = cle
        m = m_v & (df["zone_vacances"] == zone) & (df["sous_type_accueil"] == st)
        if m.any():
            df.loc[m, "prediction_XGB_corrige"] = prediction[m] * f

    display_md("✅ Correction appliquée sur toutes les lignes (futures + passées)")
    return df
|
|
| |
| |
| |
|
|
def evalue_correction_split(df, split_date):
    """Evaluate the correction on the TEST period only.

    TEST = rows on or after ``split_date`` with ``count > 0`` and a non-null
    ``prediction_XGB``, i.e. observed data not used to compute the factors.
    Requires the ``prediction_XGB_corrige`` column.

    Changes from the previous version:
      * ``split_date`` may be a str: the title formats the parsed Timestamp
        instead of calling ``.strftime`` on the raw argument;
      * when no period has at least 2 rows, the result frame still has its
        columns, so the later lookup on "Periode" no longer raises KeyError;
      * the gain is relative to the actual "before" error (0 when that error
        is 0) instead of an error floored at 1, which distorted the
        percentage whenever the error was below 1;
      * the interpretation only claims a gain / robustness when the error
        actually decreased;
      * user-facing texts, which were mis-encoded, are restored.

    Args:
        df: corrected DataFrame (Date, count, prediction_XGB,
            prediction_XGB_corrige, is_vacances_zone).
        split_date: str or Timestamp marking the start of TEST.

    Returns:
        A DataFrame with one row per evaluated period, or None when TEST has
        no usable row.
    """
    split_dt = pd.to_datetime(split_date)

    df_test = df[
        (df["Date"] >= split_dt)
        & (df["count"] > 0)
        & (df["prediction_XGB"].notna())
    ].copy()

    if df_test.empty:
        display_md(
            "⚠️ **Aucune donnée de test avec count > 0 après split_date.**\n"
            "→ Essaye une split_date plus ancienne, ou vérifie que tu as des "
            "données réelles post-split."
        )
        return None

    # Real boolean mask so that ``~`` is a logical negation.
    en_vacances = df_test["is_vacances_zone"].astype(bool)
    toutes = pd.Series(True, index=df_test.index)

    colonnes = [
        "Periode", "Nb_jours",
        "Ecart_Absolu_avant", "Ecart_Absolu_apres", "Gain_Ecart_Absolu_%",
        "Ecart_Relatif_%_avant", "Ecart_Relatif_%_apres",
    ]
    rows = []
    for label, mask in [
        ("Toutes_periodes", toutes),
        ("Vacances", en_vacances),
        ("Hors_vacances", ~en_vacances),
    ]:
        # A single row is not enough for a meaningful average.
        if mask.sum() < 2:
            continue
        yt = df_test.loc[mask, "count"].values
        y_avant = df_test.loc[mask, "prediction_XGB"].values
        y_apres = df_test.loc[mask, "prediction_XGB_corrige"].values

        ea_avant = ecart_absolu(yt, y_avant)
        ea_apres = ecart_absolu(yt, y_apres)
        er_avant = ecart_relatif_pct(yt, y_avant)
        er_apres = ecart_relatif_pct(yt, y_apres)
        gain = ((ea_avant - ea_apres) / ea_avant) * 100 if ea_avant > 0 else 0.0

        rows.append({
            "Periode": label,
            "Nb_jours": int(mask.sum()),
            "Ecart_Absolu_avant": round(ea_avant, 2),
            "Ecart_Absolu_apres": round(ea_apres, 2),
            "Gain_Ecart_Absolu_%": round(gain, 1),
            "Ecart_Relatif_%_avant": round(er_avant, 1),
            "Ecart_Relatif_%_apres": round(er_apres, 1),
        })

    df_eval = pd.DataFrame(rows, columns=colonnes)

    split_txt = split_dt.strftime("%Y-%m-%d")
    display_md("\n".join([
        "",
        f"## 📊 ÉVALUATION HONNÊTE — PÉRIODE TEST (APRES {split_txt})",
        "",
        "⚠️ **Règle d'or** : les facteurs ont été calculés sur le passé (TRAIN).",
        "On évalue leur efficacité sur une période **jamais vue** (TEST).",
        "",
    ]))

    display_df(df_eval, "📋 Résultats sur TEST")

    vac_row = df_eval[df_eval["Periode"] == "Vacances"]
    if len(vac_row) > 0:
        gain_vac = vac_row.iloc[0]["Gain_Ecart_Absolu_%"]
        ea_av = vac_row.iloc[0]["Ecart_Absolu_avant"]
        ea_ap = vac_row.iloc[0]["Ecart_Absolu_apres"]

        if gain_vac > 0:
            ligne_gain = f"- **Gain de {gain_vac:.1f}%** de précision grâce au post-processing"
            conclusion = "→ **Robustesse** : le facteur calculé sur TRAIN généralise sur TEST"
        else:
            ligne_gain = f"- **Dégradation de {abs(gain_vac):.1f}%** de précision après post-processing"
            conclusion = "⚠️ **Attention** : le facteur calculé sur TRAIN ne généralise pas sur TEST"

        display_md("\n".join([
            "",
            "## 🔍 INTERPRÉTATION SUR TEST",
            "",
            "→ Sur les jours de vacances de la période TEST :",
            f"- **Ecart_Absolu** passe de **{ea_av:.2f}** → **{ea_ap:.2f}** appels/jour",
            ligne_gain,
            "",
            "→ Hors vacances (TEST) :",
            "- Aucune modification → pas de sur-apprentissage du post-processing",
            "",
            conclusion,
            "",
        ]))
    else:
        display_md("ℹ️ Pas assez de jours de vacances dans la période TEST pour évaluer spécifiquement.")

    return df_eval
|
|
| |
| |
| |
|
|
def pipeline_split(df, split_date):
    """Run the full train/test post-processing pipeline.

    Steps:
      1. Error analysis before correction (whole history).
      2. Correction factors computed on TRAIN (dates < split_date).
      3. Correction applied to the whole frame.
      4. Evaluation on TEST (dates >= split_date).

    ``df`` must already carry the holiday columns (``is_vacances_zone``,
    ``zone_vacances``) in addition to Date, count, prediction_XGB and
    sous_type_accueil; this function does not add them.

    Changes from the previous version:
      * the parsed Timestamp, not the raw ``split_date``, is forwarded to the
        downstream steps, so a string split date cannot break a step that
        formats it;
      * user-facing texts, which were mis-encoded, are restored.

    Args:
        df: input DataFrame (not modified).
        split_date: str or Timestamp separating TRAIN from TEST.

    Returns:
        Tuple ``(df_corrige, facteurs, df_eval)``; ``df_eval`` is None when
        the TEST period has no usable row.
    """
    split_dt = pd.to_datetime(split_date)
    split_txt = split_dt.strftime("%Y-%m-%d")
    bandeau = "🔵" + "═" * 78 + "🔵"

    display_md("\n".join([
        "",
        bandeau,
        "## PIPELINE SPLIT — TRAIN/TEST",
        f"- **TRAIN** : dates < `{split_txt}` → calcul des facteurs",
        f"- **TEST** : dates ≥ `{split_txt}` → évaluation honnête",
        bandeau,
        "",
    ]))

    # 1. State of play before correction (result only displayed).
    analyse_globale_avant(df)

    # 2. Factors learnt on TRAIN only.
    facteurs = calcule_facteurs_split(df, split_dt)

    # 3. Correction applied to every row, past and future.
    df = applique_correction(df, facteurs)

    # 4. Evaluation on rows the factors have never seen.
    df_eval = evalue_correction_split(df, split_dt)

    return df, facteurs, df_eval
|
|
| |
| |
| |
| |
| |
| |
|
|