"""
make_edgelists.py

Create a canonical edgelist (or a directory of edgelists).

"Canonical" means each undirected edge is written once as `u v` with u < v,
self-loops are dropped, and the lines are sorted.

Usage
-----
    python make_edgelists.py [--data_root <root>] <dataset_name> <edges_out>

Arguments
---------
dataset_name
    The name of the PyG dataset (e.g. "Cora", or the name of a TUDataset).
edges_out
    * If the dataset contains a single graph (e.g. Planetoid Cora) – this is a
      file path (`graph.txt`, `edges.txt`, …).
    * If the dataset contains many graphs (e.g. TUDataset) – this is a
      directory path where each graph is written as
      `graph_000000.txt`, `graph_000001.txt`, …

Examples
--------
    # One‑graph dataset (Planetoid Cora)
    python make_edgelists.py Cora ./cora_edges.txt

    # Many‑graph dataset (TUDataset Facebook)
    python make_edgelists.py Facebook ./facebook_edgelists
"""
|
|
| from __future__ import annotations |
|
|
| import argparse |
| from pathlib import Path |
| from typing import Iterable, Tuple, Set |
|
|
| |
|
|
def canonical_edges(edge_index) -> Set[Tuple[int, int]]:
    """Return the undirected, self-loop-free edge set described by `edge_index`.

    Parameters
    ----------
    edge_index
        Any object whose ``.t().tolist()`` yields ``[u, v]`` pairs. Callers in
        this file pass the ``edge_index`` attribute of a PyG dataset item.

    Returns
    -------
    set of (int, int)
        One ``(u, v)`` tuple per undirected edge, always ordered so that
        ``u < v``. Pairs with ``u == v`` (self-loops) are omitted, and the two
        directions of the same edge collapse into a single entry.
    """
    pairs = edge_index.t().tolist()
    # Order each pair as (low, high) so that (u, v) and (v, u) deduplicate.
    return {(u, v) if u < v else (v, u) for u, v in pairs if u != v}
|
|
|
|
def write_edges(out_file: Path, edges: Iterable[Tuple[int, int]]) -> None:
    """Write one `u v` line per edge to `out_file`, in sorted order.

    Parameters
    ----------
    out_file
        Destination file. Missing parent directories are created; an existing
        file is overwritten.
    edges
        Iterable of ``(u, v)`` pairs. They are sorted before writing, so the
        output does not depend on the iteration order of the input.
    """
    out_file.parent.mkdir(parents=True, exist_ok=True)
    # The output is plain ASCII digits; the explicit encoding just makes that
    # independent of the platform's locale default.
    with out_file.open("w", encoding="utf-8") as handle:
        handle.writelines(f"{u} {v}\n" for u, v in sorted(edges))
|
|
|
|
def process_planetoid_dataset(root: Path, name: str, out_dir: Path) -> None:
    """Write the canonical edgelist of a single-graph Planetoid dataset.

    Parameters
    ----------
    root
        Directory handed to the PyG `Planetoid` loader as its `root`.
    name
        Planetoid dataset name, passed through to the loader unchanged.
    out_dir
        Output location. If it is an existing directory the edgelist is
        written to ``<out_dir>/graph_000000.txt``; otherwise it is used as the
        output file path itself. A plain string is accepted as well.
    """
    # Imported lazily so the module can be imported without torch_geometric.
    from torch_geometric.datasets import Planetoid

    dataset = Planetoid(root=str(root), name=name)
    # Only the first item of the dataset is exported.
    edges = canonical_edges(dataset[0].edge_index)

    # Coerce so that a str argument works instead of failing later on `.parent`.
    target = Path(out_dir)
    if target.is_dir():
        target = target / "graph_000000.txt"

    write_edges(target, edges)
| |
|
|
|
|
def process_tudataset(root: Path, name: str, out_dir: Path) -> None:
    """Write one canonical edgelist per graph of a TUDataset.

    Parameters
    ----------
    root
        Directory handed to the PyG `TUDataset` loader as its `root`.
    name
        TUDataset name, passed through to the loader unchanged.
    out_dir
        Directory that receives ``graph_000000.txt``, ``graph_000001.txt``, …
        (zero-padded to six digits, numbered in dataset iteration order). It is
        created if missing, even when the dataset yields no graphs.
    """
    # Imported lazily so the module can be imported without torch_geometric.
    from torch_geometric.datasets import TUDataset

    dataset = TUDataset(root=str(root), name=name)
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    for index, graph in enumerate(dataset):
        write_edges(
            out_dir / f"graph_{index:06d}.txt",
            canonical_edges(graph.edge_index),
        )
|
|
|
|
def main() -> None:
    """Command-line entry point: parse arguments and export the dataset.

    Names that match a Planetoid dataset are exported as a single edgelist;
    every other name is treated as a TUDataset and exported as a directory of
    edgelists.

    Raises
    ------
    ValueError
        If a multi-graph dataset is requested but `edges_out` is an existing
        regular file.
    """
    parser = argparse.ArgumentParser(
        # __doc__ is None when docstrings are stripped (python -OO).
        description=(__doc__ or "").strip(),
        formatter_class=argparse.RawTextHelpFormatter,
    )
    parser.add_argument(
        "--data_root", default="./data", help="Root directory for PyG datasets"
    )
    parser.add_argument("dataset_name", help="PyG dataset name (e.g. Cora)")
    parser.add_argument(
        "edges_out",
        help=(
            "File path (for single‑graph datasets) or directory "
            "(for multi‑graph datasets) to write the canonical edgelist(s)"
        ),
    )
    args = parser.parse_args()

    root = Path(args.data_root)
    out_path = Path(args.edges_out)

    # Decide the dataset family from the name instead of instantiating
    # Planetoid inside a catch-all try/except: that hid real failures (e.g. a
    # network error while fetching Cora), attempted a bogus download for every
    # TU dataset, and loaded each dataset twice.
    # NOTE(review): the PyG docs list exactly these three Planetoid names; the
    # comparison is case-insensitive on the assumption that the loader
    # lower-cases the name itself -- confirm against the installed version.
    planetoid_names = {"cora", "citeseer", "pubmed"}

    if args.dataset_name.lower() in planetoid_names:
        process_planetoid_dataset(root, args.dataset_name, out_path)
        return

    # Validate the destination before any dataset download starts.
    if out_path.is_file():
        raise ValueError(
            "For multi‑graph datasets (e.g. TUDataset) the output must be a directory"
        )
    process_tudataset(root, args.dataset_name, out_path)
|
|
|
|
if __name__ == "__main__":
    # Run the CLI only when executed as a script, not when imported.
    main()
|
|