| from __future__ import annotations |
|
|
| import argparse |
| import json |
| import logging |
| from pathlib import Path |
| from typing import Any, Dict, Iterable, Mapping |
|
|
| import pandas as pd |
|
|
| from src import data_prep |
|
|
| LOGGER = logging.getLogger(__name__) |
|
|
|
|
| DEFAULT_META_CONFIG: Dict[str, Dict[str, Any]] = { |
| "14_EU.csv": { |
| "type_scrutin": "europeennes", |
| "date_scrutin": "2014-05-25", |
| "tour_column": "N° tour", |
| "code_bv_cols": ["Code de la commune", "N° de bureau de vote"], |
| "rename_map": { |
| "Inscrits": "inscrits", |
| "Votants": "votants", |
| "Exprimés": "exprimes", |
| "Exprimés": "exprimes", |
| "Nombre de voix du candidat": "voix", |
| "Voix": "voix", |
| "Nom du candidat": "nom_candidature", |
| "Prénom du candidat": "nom_candidature", |
| "Code nuance du candidat": "code_candidature", |
| }, |
| }, |
| "14_MN14_T1T2.csv": { |
| "type_scrutin": "municipales", |
| "date_scrutin": "2014-03-23", |
| "tour_column": "N° tour", |
| "code_bv_cols": ["Code commune", "N° de bureau de vote"], |
| "rename_map": { |
| "Inscrits": "inscrits", |
| "Votants": "votants", |
| "Exprimés": "exprimes", |
| "Nombre de voix": "voix", |
| "Nom du candidat tête de liste": "nom_candidature", |
| "Prénom du candidat tête de liste": "nom_candidature", |
| "Code nuance de la liste": "code_candidature", |
| }, |
| }, |
| "17_L_T1.csv": { |
| "type_scrutin": "legislatives", |
| "date_scrutin": "2017-06-11", |
| "tour": 1, |
| "code_bv_cols": ["Code de la commune", "Code du b.vote"], |
| "rename_map": { |
| "Inscrits": "inscrits", |
| "Abstentions": "abstentions", |
| "Votants": "votants", |
| "Blancs": "blancs", |
| "Nuls": "nuls", |
| "Exprimés": "exprimes", |
| "Voix": "voix", |
| "Nuance": "code_candidature", |
| "Nom": "nom_candidature", |
| }, |
| }, |
| "17_L_T2.csv": { |
| "type_scrutin": "legislatives", |
| "date_scrutin": "2017-06-18", |
| "tour": 2, |
| "code_bv_cols": ["Code de la commune", "Code du b.vote"], |
| "rename_map": { |
| "Inscrits": "inscrits", |
| "Abstentions": "abstentions", |
| "Votants": "votants", |
| "Blancs": "blancs", |
| "Nuls": "nuls", |
| "Exprimés": "exprimes", |
| "Voix": "voix", |
| "Nuance": "code_candidature", |
| "Nom": "nom_candidature", |
| }, |
| }, |
| "17_PR_T1.csv": { |
| "type_scrutin": "presidentielles", |
| "date_scrutin": "2017-04-23", |
| "tour": 1, |
| "code_bv_cols": ["Code de la commune", "Code du b.vote"], |
| "rename_map": { |
| "Inscrits": "inscrits", |
| "Abstentions": "abstentions", |
| "Votants": "votants", |
| "Blancs": "blancs", |
| "Nuls": "nuls", |
| "Exprimés": "exprimes", |
| "Voix": "voix", |
| "Nom": "nom_candidature", |
| "Code nuance du candidat": "code_candidature", |
| }, |
| }, |
| "17_PR_T2.csv": { |
| "type_scrutin": "presidentielles", |
| "date_scrutin": "2017-05-07", |
| "tour": 2, |
| "code_bv_cols": ["Code de la commune", "Code du b.vote"], |
| "rename_map": { |
| "Inscrits": "inscrits", |
| "Abstentions": "abstentions", |
| "Votants": "votants", |
| "Blancs": "blancs", |
| "Nuls": "nuls", |
| "Exprimés": "exprimes", |
| "Voix": "voix", |
| "Nom": "nom_candidature", |
| "Code nuance du candidat": "code_candidature", |
| }, |
| }, |
| "19_EU.csv": { |
| "type_scrutin": "europeennes", |
| "date_scrutin": "2019-05-26", |
| "tour": 1, |
| "code_bv_cols": ["Code de la commune", "Code du b.vote"], |
| "rename_map": { |
| "Inscrits": "inscrits", |
| "Abstentions": "abstentions", |
| "Votants": "votants", |
| "Blancs": "blancs", |
| "Nuls": "nuls", |
| "Exprimés": "exprimes", |
| "Voix": "voix", |
| "Nom Tête de Liste": "nom_candidature", |
| "Nuance Liste": "code_candidature", |
| }, |
| }, |
| "20_MN_T1.csv": { |
| "type_scrutin": "municipales", |
| "date_scrutin": "2020-03-15", |
| "tour": 1, |
| "sep": ";", |
| "code_bv_cols": ["Code de la commune", "Code B.Vote"], |
| "rename_map": { |
| "Inscrits": "inscrits", |
| "Abstentions": "abstentions", |
| "Votants": "votants", |
| "Blancs": "blancs", |
| "Nuls": "nuls", |
| "Exprimés": "exprimes", |
| "Voix": "voix", |
| "Nom": "nom_candidature", |
| "Liste": "nom_candidature", |
| "Code Nuance": "code_candidature", |
| }, |
| }, |
| "20_MN_T2.csv": { |
| "type_scrutin": "municipales", |
| "date_scrutin": "2020-06-28", |
| "tour": 2, |
| "code_bv_cols": ["Code de la commune", "Code B.Vote"], |
| "rename_map": { |
| "Inscrits": "inscrits", |
| "Abstentions": "abstentions", |
| "Votants": "votants", |
| "Blancs": "blancs", |
| "Nuls": "nuls", |
| "Exprimés": "exprimes", |
| "Voix": "voix", |
| "Nom": "nom_candidature", |
| "Liste": "nom_candidature", |
| "Code Nuance": "code_candidature", |
| }, |
| }, |
| "21_DEP_T1.csv": { |
| "type_scrutin": "departementales", |
| "date_scrutin": "2021-06-20", |
| "tour": 1, |
| "code_bv_cols": ["Code de la commune", "Code du b.vote"], |
| "rename_map": { |
| "Inscrits": "inscrits", |
| "Abstentions": "abstentions", |
| "Votants": "votants", |
| "Blancs": "blancs", |
| "Nuls": "nuls", |
| "Exprimés": "exprimes", |
| "Voix": "voix", |
| "Nuance": "code_candidature", |
| "Binôme": "nom_candidature", |
| }, |
| }, |
| "21_DEP_T2.csv": { |
| "type_scrutin": "departementales", |
| "date_scrutin": "2021-06-27", |
| "tour": 2, |
| "code_bv_cols": ["Code de la commune", "Code du b.vote"], |
| "rename_map": { |
| "Inscrits": "inscrits", |
| "Abstentions": "abstentions", |
| "Votants": "votants", |
| "Blancs": "blancs", |
| "Nuls": "nuls", |
| "Exprimés": "exprimes", |
| "Voix": "voix", |
| "Nuance": "code_candidature", |
| "Binôme": "nom_candidature", |
| }, |
| }, |
| "21_REG_T1.csv": { |
| "type_scrutin": "regionales", |
| "date_scrutin": "2021-06-20", |
| "tour": 1, |
| "code_bv_cols": ["Code de la commune", "Code du b.vote"], |
| "rename_map": { |
| "Inscrits": "inscrits", |
| "Abstentions": "abstentions", |
| "Votants": "votants", |
| "Blancs": "blancs", |
| "Nuls": "nuls", |
| "Exprimés": "exprimes", |
| "Voix": "voix", |
| "Nuance Liste": "code_candidature", |
| "Libellé Abrégé Liste": "nom_candidature", |
| }, |
| }, |
| "21_REG_T2.csv": { |
| "type_scrutin": "regionales", |
| "date_scrutin": "2021-06-27", |
| "tour": 2, |
| "code_bv_cols": ["Code de la commune", "Code du b.vote"], |
| "rename_map": { |
| "Inscrits": "inscrits", |
| "Abstentions": "abstentions", |
| "Votants": "votants", |
| "Blancs": "blancs", |
| "Nuls": "nuls", |
| "Exprimés": "exprimes", |
| "Voix": "voix", |
| "Nuance Liste": "code_candidature", |
| "Libellé Abrégé Liste": "nom_candidature", |
| }, |
| }, |
| "22_L_T1.csv": { |
| "type_scrutin": "legislatives", |
| "date_scrutin": "2022-06-12", |
| "tour": 1, |
| "code_bv_cols": ["Code de la commune", "Code du b.vote"], |
| "rename_map": { |
| "Inscrits": "inscrits", |
| "Abstentions": "abstentions", |
| "Votants": "votants", |
| "Blancs": "blancs", |
| "Nuls": "nuls", |
| "Exprimés": "exprimes", |
| "Voix": "voix", |
| "Nuance": "code_candidature", |
| "Nom": "nom_candidature", |
| }, |
| }, |
| "22_L_T2.csv": { |
| "type_scrutin": "legislatives", |
| "date_scrutin": "2022-06-19", |
| "tour": 2, |
| "code_bv_cols": ["Code de la commune", "Code du b.vote"], |
| "rename_map": { |
| "Inscrits": "inscrits", |
| "Abstentions": "abstentions", |
| "Votants": "votants", |
| "Blancs": "blancs", |
| "Nuls": "nuls", |
| "Exprimés": "exprimes", |
| "Voix": "voix", |
| "Nuance": "code_candidature", |
| "Nom": "nom_candidature", |
| }, |
| }, |
| "22_PR_T1.csv": { |
| "type_scrutin": "presidentielles", |
| "date_scrutin": "2022-04-10", |
| "tour": 1, |
| "code_bv_cols": ["Code de la commune", "Code du b.vote"], |
| "rename_map": { |
| "Inscrits": "inscrits", |
| "Abstentions": "abstentions", |
| "Votants": "votants", |
| "Blancs": "blancs", |
| "Nuls": "nuls", |
| "Exprimés": "exprimes", |
| "Voix": "voix", |
| "Nom": "nom_candidature", |
| "Code nuance du candidat": "code_candidature", |
| }, |
| }, |
| "22_PR_T2.csv": { |
| "type_scrutin": "presidentielles", |
| "date_scrutin": "2022-04-24", |
| "tour": 2, |
| "code_bv_cols": ["Code de la commune", "Code du b.vote"], |
| "rename_map": { |
| "Inscrits": "inscrits", |
| "Abstentions": "abstentions", |
| "Votants": "votants", |
| "Blancs": "blancs", |
| "Nuls": "nuls", |
| "Exprimés": "exprimes", |
| "Voix": "voix", |
| "Nom": "nom_candidature", |
| "Code nuance du candidat": "code_candidature", |
| }, |
| }, |
| "24_EU.csv": { |
| "type_scrutin": "europeennes", |
| "date_scrutin": "2024-06-09", |
| "tour": 1, |
| "code_bv_cols": ["Code commune", "Code BV"], |
| "rename_map": { |
| "Inscrits": "inscrits", |
| "Abstentions": "abstentions", |
| "Votants": "votants", |
| "Blancs": "blancs", |
| "Nuls": "nuls", |
| "Exprimés": "exprimes", |
| "Voix 1": "voix", |
| "Voix": "voix", |
| "Nuance liste 1": "code_candidature", |
| "Libellé abrégé de liste 1": "nom_candidature", |
| }, |
| }, |
| "24_L_T1.csv": { |
| "type_scrutin": "legislatives", |
| "date_scrutin": "2024-06-30", |
| "tour": 1, |
| "code_bv_cols": ["Code commune", "Code BV"], |
| "rename_map": { |
| "Inscrits": "inscrits", |
| "Abstentions": "abstentions", |
| "Votants": "votants", |
| "Blancs": "blancs", |
| "Nuls": "nuls", |
| "Exprimés": "exprimes", |
| "Voix": "voix", |
| "Nuance Liste": "code_candidature", |
| "Libellé Abrégé Liste": "nom_candidature", |
| "Binôme": "nom_candidature", |
| }, |
| }, |
| "24_L_T2.csv": { |
| "type_scrutin": "legislatives", |
| "date_scrutin": "2024-07-07", |
| "tour": 2, |
| "code_bv_cols": ["Code commune", "Code BV"], |
| "rename_map": { |
| "Inscrits": "inscrits", |
| "Abstentions": "abstentions", |
| "Votants": "votants", |
| "Blancs": "blancs", |
| "Nuls": "nuls", |
| "Exprimés": "exprimes", |
| "Voix": "voix", |
| "Nuance Liste": "code_candidature", |
| "Libellé Abrégé Liste": "nom_candidature", |
| "Binôme": "nom_candidature", |
| }, |
| }, |
| } |
|
|
| DEFAULT_META_CONFIG_PATH = Path(__file__).resolve().parents[2] / "config" / "raw_sources.yaml" |
|
|
|
|
| def _resolve_meta_config(raw: Mapping[str, Mapping[str, Any]]) -> Dict[str, Dict[str, Any]]: |
| resolved: Dict[str, Dict[str, Any]] = {} |
|
|
| def resolve_one(key: str, stack: list[str]) -> Dict[str, Any]: |
| if key in resolved: |
| return resolved[key] |
| if key in stack: |
| raise ValueError(f"Cycle detecte dans meta-config: {' -> '.join(stack + [key])}") |
| meta = dict(raw[key]) |
| base_key = meta.pop("copy_from", None) |
| if base_key: |
| if base_key not in raw: |
| raise KeyError(f"copy_from cible introuvable: {base_key}") |
| base = resolve_one(base_key, stack + [key]) |
| merged = dict(base) |
| rename_base = dict(base.get("rename_map", {})) |
| rename_override = dict(meta.get("rename_map", {})) |
| merged.update(meta) |
| if rename_base or rename_override: |
| merged["rename_map"] = {**rename_base, **rename_override} |
| resolved[key] = merged |
| else: |
| resolved[key] = meta |
| return resolved[key] |
|
|
| for name in raw: |
| resolve_one(name, []) |
| return resolved |
|
|
|
|
| def load_meta_config(meta_path: Path | None) -> Dict[str, Dict[str, Any]]: |
| if meta_path is None: |
| if DEFAULT_META_CONFIG_PATH.exists(): |
| meta_path = DEFAULT_META_CONFIG_PATH |
| else: |
| return DEFAULT_META_CONFIG |
| if not meta_path.exists(): |
| raise FileNotFoundError(f"Meta-config file not found: {meta_path}") |
| if meta_path.suffix in {".yml", ".yaml"}: |
| try: |
| import yaml |
| except Exception as exc: |
| raise RuntimeError("PyYAML is required to read YAML meta-config files.") from exc |
| raw = yaml.safe_load(meta_path.read_text()) or {} |
| else: |
| raw = json.loads(meta_path.read_text()) |
| if not isinstance(raw, dict): |
| raise ValueError("Meta-config invalide: attendu un mapping de fichiers vers meta-donnees.") |
| return _resolve_meta_config(raw) |
|
|
|
|
| def preprocess_all(raw_dir: Path, output_dir: Path, meta_config: Mapping[str, Mapping[str, Any]]) -> pd.DataFrame: |
| frames = [] |
| missing: list[str] = [] |
| for file_name, meta in meta_config.items(): |
| path = raw_dir / file_name |
| if not path.exists(): |
| missing.append(file_name) |
| continue |
| LOGGER.info("Standardisation de %s", file_name) |
| df_std = data_prep.standardize_election( |
| path, |
| meta, |
| rename_map=meta.get("rename_map", {}), |
| sep=meta.get("sep", ";"), |
| encoding=meta.get("encoding", ("cp1252", "utf-8-sig", "latin-1")), |
| decimal=meta.get("decimal", ","), |
| ) |
| frames.append(df_std) |
| if missing: |
| LOGGER.warning("Fichiers manquants ignorés: %s", ", ".join(sorted(missing))) |
| if not frames: |
| raise RuntimeError("Aucune donnée chargée : vérifier le dossier raw et la configuration meta.") |
|
|
| elections_long = pd.concat(frames, ignore_index=True) |
| elections_long["date_scrutin"] = pd.to_datetime(elections_long["date_scrutin"]) |
| elections_long["annee"] = elections_long["date_scrutin"].dt.year |
| elections_long["type_scrutin"] = elections_long["type_scrutin"].str.lower() |
| elections_long["code_commune"] = elections_long["code_bv"].astype(str).str.split("-").str[0] |
|
|
| issues = data_prep.validate_consistency(elections_long) |
| for name, df_issue in issues.items(): |
| if len(df_issue) > 0: |
| LOGGER.warning("%s : %s lignes a inspecter", name, len(df_issue)) |
|
|
| output_dir.mkdir(parents=True, exist_ok=True) |
| parquet_path = output_dir / "elections_long.parquet" |
| csv_path = output_dir / "elections_long.csv" |
| elections_long.to_parquet(parquet_path, index=False) |
| elections_long.to_csv(csv_path, sep=";", index=False) |
| LOGGER.info("Long format sauvegarde (%s lignes) -> %s / %s", len(elections_long), parquet_path, csv_path) |
| return elections_long |
|
|
|
|
| def parse_args() -> argparse.Namespace: |
| parser = argparse.ArgumentParser(description="Prétraitement des fichiers bruts en format long standardisé.") |
| parser.add_argument("--raw-dir", type=Path, default=Path("data/raw"), help="Répertoire des fichiers bruts CSV.") |
| parser.add_argument("--output-dir", type=Path, default=Path("data/interim"), help="Destination du format long harmonisé.") |
| parser.add_argument( |
| "--meta-config", |
| type=Path, |
| default=None, |
| help="Chemin vers un fichier JSON/YAML décrivant les meta-données des scrutins. Par défaut, utilise la configuration embarquée.", |
| ) |
| return parser.parse_args() |
|
|
|
|
| def main() -> None: |
| logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s") |
| args = parse_args() |
| meta_config = load_meta_config(args.meta_config) |
| preprocess_all(args.raw_dir, args.output_dir, meta_config) |
|
|
|
|
| if __name__ == "__main__": |
| main() |
|
|