clique / src /export_edgelist.py

qingy2024

Upload folder using huggingface_hub

f74dd01 verified 7 months ago

4.23 kB

	"""
	export_edgelist.py

	Create a canonical edgelist (or a directory of edgelists) from a PyG dataset.
	"Canonical" means: undirected, no self-loops, each edge written once as
	`u v` with u < v, lines sorted.

	Usage
	-----
	python export_edgelist.py [--data_root <root>] <dataset_name> <edges_out>

	Arguments
	---------
	dataset_name
	    The name of the PyG dataset (e.g. "Cora", or the name of a TUDataset).
	edges_out
	    * If the dataset contains a single graph (e.g. Planetoid Cora) – this is a
	      file path (`graph.txt`, `edges.txt`, …).
	    * If the dataset contains many graphs (e.g. TUDataset) – this is a
	      directory path where each graph is written as
	      `graph_000000.txt`, `graph_000001.txt`, …

	Examples
	--------
	# One‑graph dataset (Planetoid Cora)
	python export_edgelist.py Cora ./cora_edges.txt

	# Many‑graph dataset (TUDataset Facebook)
	python export_edgelist.py Facebook ./facebook_edgelists
	"""

	from __future__ import annotations

	import argparse
	from pathlib import Path
	from typing import Iterable, Tuple, Set

	# -------------------------------------------------------------

	def canonical_edges(edge_index) -> Set[Tuple[int, int]]:
	    """Return the undirected, self-loop-free edge set described by ``edge_index``.

	    Every row of ``edge_index.t().tolist()`` is read as a ``(u, v)`` pair.
	    Self-loops (``u == v``) are dropped, and each remaining pair is stored
	    with the smaller endpoint first, so ``(u, v)`` and ``(v, u)`` collapse
	    into a single entry.

	    Args:
	        edge_index: Object exposing ``.t().tolist()`` that yields two-element
	            rows. Presumably a PyG ``[2, num_edges]`` tensor -- confirm
	            against callers.

	    Returns:
	        A set of ``(u, v)`` tuples with ``u < v``.
	    """
	    return {
	        (u, v) if u < v else (v, u)
	        for u, v in edge_index.t().tolist()
	        if u != v  # skip self-loops
	    }


	def write_edges(out_file: Path, edges: Iterable[Tuple[int, int]]) -> None:
	    """Write ``edges`` to ``out_file`` as one ``u v`` pair per line.

	    The pairs are written in sorted order. Missing parent directories of
	    ``out_file`` are created first, and an existing file is overwritten.

	    Args:
	        out_file: Destination file path.
	        edges: Iterable of ``(u, v)`` pairs; it is fully consumed by ``sorted``.
	    """
	    out_file.parent.mkdir(parents=True, exist_ok=True)
	    # Explicit encoding keeps the output independent of the platform default.
	    with out_file.open("w", encoding="utf-8") as f:
	        f.writelines(f"{u} {v}\n" for u, v in sorted(edges))


	def process_planetoid_dataset(root: Path, name: str, out_dir: Path) -> None:
	    """Export the single graph of a Planetoid dataset as a canonical edgelist.

	    Args:
	        root: Directory passed to ``Planetoid`` as its ``root``.
	        name: Planetoid dataset name (e.g. "Cora").
	        out_dir: Output location. If it is an existing directory, the graph
	            is written to ``<out_dir>/graph_000000.txt``; otherwise the path
	            itself is used as the output file.
	    """
	    # Function-scope import: the module can be imported without torch_geometric.
	    from torch_geometric.datasets import Planetoid

	    ds = Planetoid(root=str(root), name=name)
	    # Only the first (index 0) graph of the dataset is exported.
	    edges = canonical_edges(ds[0].edge_index)

	    if isinstance(out_dir, Path) and out_dir.is_dir():
	        out_file = out_dir / "graph_000000.txt"
	    else:
	        out_file = out_dir

	    # Nothing is printed to stdout; the edgelist is only written to disk.
	    write_edges(out_file, edges)


	def process_tudataset(root: Path, name: str, out_dir: Path) -> None:
	    """Export every graph of a TUDataset to ``<out_dir>/graph_XXXXXX.txt``.

	    Graphs are numbered in iteration order, starting at 0, using a
	    zero-padded six-digit index.

	    Args:
	        root: Directory passed to ``TUDataset`` as its ``root``.
	        name: TUDataset name.
	        out_dir: Directory that receives one edgelist file per graph; it is
	            created (including parents) if it does not exist.
	    """
	    # Function-scope import: the module can be imported without torch_geometric.
	    from torch_geometric.datasets import TUDataset

	    ds = TUDataset(root=str(root), name=name)
	    # Created up front so the directory exists even when no graph is written.
	    out_dir.mkdir(parents=True, exist_ok=True)

	    for i, data in enumerate(ds):
	        edges = canonical_edges(data.edge_index)
	        write_edges(out_dir / f"graph_{i:06d}.txt", edges)


	def main() -> None:
	    """Command-line entry point: parse arguments and export the edgelist(s).

	    The dataset is first opened as a ``Planetoid`` dataset; if that raises,
	    it is opened as a ``TUDataset`` instead. Planetoid data is handed to
	    ``process_planetoid_dataset`` and TUDataset data to ``process_tudataset``.

	    Raises:
	        ValueError: If the dataset was loaded as a TUDataset and ``edges_out``
	            is an existing regular file.
	    """
	    parser = argparse.ArgumentParser(
	        # __doc__ is None under ``python -OO``; fall back to an empty description.
	        description=(__doc__ or "").strip(),
	        formatter_class=argparse.RawTextHelpFormatter,
	    )
	    parser.add_argument(
	        "--data_root", default="./data", help="Root directory for PyG datasets"
	    )
	    parser.add_argument("dataset_name", help="PyG dataset name (e.g. Cora)")
	    parser.add_argument(
	        "edges_out",
	        help=(
	            "File path (for single‑graph datasets) or directory "
	            "(for multi‑graph datasets) to write the canonical edgelist(s)"
	        ),
	    )
	    args = parser.parse_args()

	    root = Path(args.data_root)
	    out_path = Path(args.edges_out)

	    # Imported outside the ``try`` so that a missing torch_geometric install is
	    # reported directly instead of being caught by the broad handler below.
	    from torch_geometric.datasets import Planetoid, TUDataset

	    # We try to guess whether the requested dataset is a Planetoid or TUDataset.
	    # If it can be loaded as a Planetoid we use that; otherwise we fall back to
	    # TUDataset.
	    try:
	        Planetoid(root=str(root), name=args.dataset_name)
	    except Exception:  # pragma: no cover – normal branch failure
	        # Any Planetoid failure is treated as "not a Planetoid dataset"; a
	        # failure here as well propagates to the caller.
	        TUDataset(root=str(root), name=args.dataset_name)
	        is_planetoid = False
	    else:
	        is_planetoid = True

	    # Dispatch
	    if is_planetoid:
	        process_planetoid_dataset(root, args.dataset_name, out_path)
	        return

	    # TUDataset: one file per graph, so the target cannot be a regular file.
	    if out_path.is_file():
	        raise ValueError(
	            "For multi‑graph datasets (e.g. TUDataset) the output must be a directory"
	        )
	    process_tudataset(root, args.dataset_name, out_path)


	# Run the CLI only when executed as a script, not when imported.
	if __name__ == "__main__":
	    main()