# CUD-Traffic-AI/src/data/gcc_sources.py
# Rajeev Ranjan Pandey
# Refine UI and fix model selection bugs
# e078b1d
from __future__ import annotations
from dataclasses import dataclass
from pathlib import Path
from typing import Any
import pandas as pd
from src.data.utils import load_config
@dataclass
class GCCSource:
    """Descriptor for a single GCC data source taken from the configuration.

    Instances are created by ``get_gcc_sources`` from the per-source mappings
    stored under ``gcc.sources``. The class is deliberately a plain dataclass
    (no ``slots``) because the manifest builder reads each instance's
    ``__dict__``.
    """

    source_id: str  # key of the entry inside the ``gcc.sources`` mapping
    label: str  # copied into the ``source_label`` column when the CSV is loaded
    official_url: str  # copied into the ``official_url`` column when loaded
    access_type: str  # NOTE(review): set of allowed values is not visible here
    local_sample_csv: str  # path of the CSV file that the loaders read
    status: str  # "bundled_sample" when the config entry omits it
    notes: str  # empty string when the config entry omits it
def get_gcc_sources(config_path: str | Path = "config.yaml") -> list[GCCSource]:
    """Build a ``GCCSource`` for every entry under ``gcc.sources`` in the config.

    Args:
        config_path: Location of the configuration file; forwarded unchanged
            to ``load_config``.

    Returns:
        One ``GCCSource`` per configured source, in the iteration order of the
        ``sources`` mapping. The list is empty when the ``gcc`` section or its
        ``sources`` mapping is missing, null, or empty.

    Raises:
        KeyError: If a source entry lacks ``label``, ``official_url``,
            ``access_type`` or ``local_sample_csv``.
    """
    # ``dict.get(key, {})`` only falls back when the key is absent. A key that
    # is present but left blank (e.g. a bare ``gcc:`` line -- presumably the
    # config is YAML, given the default file name) comes back as ``None``, so
    # ``or {}`` is used to treat "null" the same as "missing" instead of
    # failing with AttributeError on ``None.get`` / ``None.items``.
    cfg = load_config(config_path) or {}
    gcc_section: dict[str, Any] = cfg.get("gcc") or {}
    sources: dict[str, Any] = gcc_section.get("sources") or {}
    return [
        GCCSource(
            source_id=source_id,
            label=meta["label"],
            official_url=meta["official_url"],
            access_type=meta["access_type"],
            local_sample_csv=meta["local_sample_csv"],
            # Optional keys fall back to fixed defaults.
            status=meta.get("status", "bundled_sample"),
            notes=meta.get("notes", ""),
        )
        for source_id, meta in sources.items()
    ]
def build_source_manifest(config_path: str | Path = "config.yaml") -> pd.DataFrame:
    """Return the configured GCC sources as a table with one row per source.

    Args:
        config_path: Configuration file location, passed through to
            ``get_gcc_sources``.

    Returns:
        A DataFrame built from each source's instance attributes, so the
        columns are the attribute names of the source objects.
    """
    # ``vars(obj)`` hands back the instance's attribute dict, which pandas
    # turns into one row per source.
    records = list(map(vars, get_gcc_sources(config_path)))
    manifest = pd.DataFrame(records)
    return manifest
def _read_source_frame(source: GCCSource) -> pd.DataFrame:
    """Read one source's local CSV and tag every row with the source metadata."""
    # The configured path is used as given, so a relative path is resolved
    # against the current working directory.
    path = Path(source.local_sample_csv)
    if not path.exists():
        raise FileNotFoundError(f"Bundled GCC sample file is missing: {path}")
    df = pd.read_csv(path)
    # Provenance columns let rows be traced back to their source after
    # several sources are concatenated.
    df["source_id"] = source.source_id
    df["source_label"] = source.label
    df["official_url"] = source.official_url
    return df


def load_local_gcc_source(source_id: str, config_path: str | Path = "config.yaml") -> pd.DataFrame:
    """Load the local CSV of the configured source whose id is ``source_id``.

    Args:
        source_id: Identifier of the source to load.
        config_path: Configuration file location, passed through to
            ``get_gcc_sources``.

    Returns:
        The CSV contents with ``source_id``, ``source_label`` and
        ``official_url`` columns added (or overwritten if already present).

    Raises:
        FileNotFoundError: If the source is configured but its CSV path does
            not exist.
        ValueError: If no configured source has the given ``source_id``.
    """
    for source in get_gcc_sources(config_path):
        if source.source_id == source_id:
            return _read_source_frame(source)
    raise ValueError(f"Unknown GCC source: {source_id}")


def load_all_gcc_sources(config_path: str | Path = "config.yaml") -> pd.DataFrame:
    """Load every configured GCC source and stack them into one DataFrame.

    Args:
        config_path: Configuration file location, passed through to
            ``get_gcc_sources``.

    Returns:
        The row-wise concatenation of all source frames with a fresh
        0..n-1 index, or an empty DataFrame when no sources are configured.

    Raises:
        FileNotFoundError: If any configured source's CSV path does not exist.
    """
    # Read each source straight from the list already in hand. Routing through
    # ``load_local_gcc_source`` would reload the configuration and rescan the
    # source list once per source.
    parts = [_read_source_frame(source) for source in get_gcc_sources(config_path)]
    if not parts:
        # ``pd.concat`` rejects an empty list, so return an empty frame instead.
        return pd.DataFrame()
    return pd.concat(parts, ignore_index=True)