File size: 2,093 Bytes
e078b1d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
from __future__ import annotations

from dataclasses import dataclass
from pathlib import Path
from typing import Any

import pandas as pd

from src.data.utils import load_config


@dataclass
class GCCSource:
    """Metadata for one GCC data source as declared in the project config.

    Instances are built by ``get_gcc_sources`` from the ``gcc.sources``
    mapping of the config file. Values are stored exactly as given; the
    dataclass performs no validation or conversion.
    """

    # Key of this entry in the config's ``gcc.sources`` mapping.
    source_id: str
    # Label from the config; ``load_local_gcc_source`` copies it into the
    # ``source_label`` column of the loaded data.
    label: str
    # URL from the config; ``load_local_gcc_source`` copies it into the
    # ``official_url`` column of the loaded data.
    official_url: str
    # Access-type string from the config (the allowed values are not defined
    # in this file).
    access_type: str
    # Path of the CSV file that ``load_local_gcc_source`` reads for this source.
    local_sample_csv: str
    # Status string; ``get_gcc_sources`` uses "bundled_sample" when the config
    # entry omits it.
    status: str
    # Free-form notes; ``get_gcc_sources`` uses "" when the config entry omits it.
    notes: str


def get_gcc_sources(config_path: str | Path = "config.yaml") -> list[GCCSource]:
    """Return the GCC sources declared under ``gcc.sources`` in the config.

    Args:
        config_path: Path of the config file handed to ``load_config``.

    Returns:
        One ``GCCSource`` per entry of the ``gcc.sources`` mapping, in the
        mapping's iteration order. The list is empty when the ``gcc``
        section or its ``sources`` mapping is missing or empty.

    Raises:
        KeyError: If an entry lacks ``label``, ``official_url``,
            ``access_type`` or ``local_sample_csv``.
    """
    cfg = load_config(config_path) or {}
    # A key that is present but has no value (e.g. a bare ``gcc:`` line,
    # presuming the config is YAML as the default file name suggests) loads as
    # None, not as a missing key. ``.get(key, {})`` would then return None and
    # the chained ``.get`` would raise AttributeError; ``or {}`` treats an
    # empty section exactly like an absent one.
    gcc_section: dict[str, Any] = cfg.get("gcc") or {}
    sources: dict[str, Any] = gcc_section.get("sources") or {}
    return [
        GCCSource(
            source_id=source_id,
            label=meta["label"],
            official_url=meta["official_url"],
            access_type=meta["access_type"],
            local_sample_csv=meta["local_sample_csv"],
            # Optional keys: fall back to defaults when the entry omits them.
            status=meta.get("status", "bundled_sample"),
            notes=meta.get("notes", ""),
        )
        for source_id, meta in sources.items()
    ]


def build_source_manifest(config_path: str | Path = "config.yaml") -> pd.DataFrame:
    """Tabulate the configured GCC sources, one row per source.

    Args:
        config_path: Path of the config file, passed to ``get_gcc_sources``.

    Returns:
        A DataFrame built from the attribute dictionaries of the sources
        returned by ``get_gcc_sources``.
    """
    manifest_rows = []
    for entry in get_gcc_sources(config_path):
        # vars() returns the instance's own attribute dict (its __dict__).
        manifest_rows.append(vars(entry))
    return pd.DataFrame(manifest_rows)


def _read_gcc_source_frame(source: GCCSource) -> pd.DataFrame:
    """Read one source's local CSV and tag every row with the source metadata."""
    # A relative path is resolved against the current working directory.
    path = Path(source.local_sample_csv)
    if not path.exists():
        raise FileNotFoundError(f"Bundled GCC sample file is missing: {path}")
    df = pd.read_csv(path)
    # These assignments add the columns, or overwrite them if the CSV already
    # has columns with the same names.
    df["source_id"] = source.source_id
    df["source_label"] = source.label
    df["official_url"] = source.official_url
    return df


def load_local_gcc_source(source_id: str, config_path: str | Path = "config.yaml") -> pd.DataFrame:
    """Load the local CSV of a single configured GCC source.

    Args:
        source_id: Identifier of the source, compared against
            ``GCCSource.source_id`` of each configured source.
        config_path: Path of the config file, passed to ``get_gcc_sources``.

    Returns:
        The CSV contents with ``source_id``, ``source_label`` and
        ``official_url`` columns set from the source's metadata.

    Raises:
        ValueError: If no configured source has the given ``source_id``.
        FileNotFoundError: If the source's CSV path does not exist.
    """
    for source in get_gcc_sources(config_path):
        if source.source_id == source_id:
            return _read_gcc_source_frame(source)
    raise ValueError(f"Unknown GCC source: {source_id}")


def load_all_gcc_sources(config_path: str | Path = "config.yaml") -> pd.DataFrame:
    """Load every configured GCC source and stack them into one DataFrame.

    Args:
        config_path: Path of the config file, passed to ``get_gcc_sources``.

    Returns:
        The per-source frames concatenated in configuration order with a
        fresh 0..n-1 index, or an empty DataFrame when no sources are
        configured.

    Raises:
        FileNotFoundError: If any source's CSV path does not exist.
    """
    # Read the config once and load each source directly. Routing every source
    # back through load_local_gcc_source would re-read the config and rescan
    # the source list once per source.
    parts = [_read_gcc_source_frame(source) for source in get_gcc_sources(config_path)]
    if not parts:
        # pd.concat raises on an empty list, so return an empty frame instead.
        return pd.DataFrame()
    return pd.concat(parts, ignore_index=True)