clique / src /old /make_proteins_edgelists_and_seeds.py

qingy2024

Upload folder using huggingface_hub

f74dd01 verified 7 months ago

7.66 kB

	# make_proteins_edgelists_and_seeds.py
	# Originally exported PROTEINS/Cora edgelists and called a Java LRMC seeder.
	# Extended to canonicalize edgelists (0-indexed, undirected unique edges) and
	# to call the Java Reddit_streamsafe seeder for a single file as well.

	import argparse, subprocess
	from pathlib import Path
	from typing import Tuple, Iterable
	from torch_geometric.datasets import TUDataset, Planetoid

	from rich import print


	def export_edgelists(root: Path, out_dir: Path):
	    """
	    Write one edgelist file per graph of the Planetoid 'Cora' dataset.

	    Despite the script's name, the dataset loaded here is Cora. Each graph
	    ``i`` is written to ``out_dir / graph_{i:06d}.txt`` with one "u v" line
	    per entry of ``edge_index`` whose target id is larger than its source id.

	    Args:
	        root: Directory handed to ``Planetoid`` as its dataset root.
	        out_dir: Directory receiving the edgelist files (created if missing).
	    """
	    ds = Planetoid(root=str(root), name='Cora')
	    out_dir.mkdir(parents=True, exist_ok=True)
	    for i, data in enumerate(ds):
	        with (out_dir / f"graph_{i:06d}.txt").open('w') as f:
	            # Keeping only v > u drops self-loops and writes an edge that is
	            # stored in both directions exactly once.
	            # NOTE(review): an edge stored only as (v, u) with v > u would be
	            # lost -- this presumes edge_index is symmetric; confirm.
	            f.writelines(
	                f"{u} {v}\n"
	                for u, v in data.edge_index.t().tolist()
	                if v > u
	            )
	    # `print` is rich.print in this file: an unescaped "[export]" is parsed as
	    # a console-markup tag and vanishes from the output, so escape the bracket.
	    print(rf"\[export] wrote {len(ds)} edge lists to {out_dir}")


	def run_java_seeder(edges_dir: Path, seeds_dir: Path, alpha="DIAM", eps="1e-6"):
	    """
	    Compile and run the directory-mode Java seeder.

	    The output directory is created first, then ``javac`` and ``java`` are
	    invoked in that order. The Java sources are named by bare file name, so
	    they are resolved against the current working directory. A non-zero exit
	    status from either command raises ``subprocess.CalledProcessError``.

	    Args:
	        edges_dir: Directory of edgelist files, passed as the first argument.
	        seeds_dir: Directory for the seeds output (created if missing).
	        alpha: Forwarded verbatim as the third program argument.
	        eps: Forwarded verbatim as the fourth program argument.
	    """
	    # Use LRMCseedsProteins_streamsafe2 in directory mode (after our patch)
	    compile_cmd = [
	        "javac",
	        "LRMCseedsProteins_streamsafe2.java",
	        "clique2_ablations_parallel2.java",
	    ]
	    launch_cmd = [
	        "java",
	        "LRMCseedsProteins_streamsafe2",
	        str(edges_dir),
	        str(seeds_dir),
	        alpha,
	        eps,
	    ]
	    seeds_dir.mkdir(parents=True, exist_ok=True)
	    for command in (compile_cmd, launch_cmd):
	        subprocess.run(command, check=True)


	def canonicalize_edgelist_file(in_path: Path, out_path: Path, input_is_one_indexed: bool = False) -> Tuple[int, int]:
	    """
	    Rewrite an edgelist as a canonical 0-indexed, undirected, duplicate-free one.

	    Input lines are "u v" (whitespace- or comma-separated; extra columns are
	    ignored). Blank lines, lines starting with '#', lines with fewer than two
	    fields and lines whose first two fields are not integers are skipped.
	    The output has exactly one "u v" line per edge with u < v, sorted, and no
	    self-loops. Both files are read/written as UTF-8.

	    Args:
	        in_path: Edgelist file to read.
	        out_path: Destination file; its parent directory is created if missing.
	        input_is_one_indexed: Subtract 1 from every id before canonicalizing.

	    Returns:
	        (n_nodes_approx, n_edges_out), where n_nodes_approx is the largest id
	        that appears in a kept edge plus one (0 when no edge is kept).

	    Raises:
	        ValueError: If an id is negative after the optional index shift, e.g.
	            when ``input_is_one_indexed`` is set but the file contains id 0.
	            Nothing is written to ``out_path`` in that case.
	    """
	    shift = 1 if input_is_one_indexed else 0
	    unique_edges = set()
	    with in_path.open('r', encoding='utf-8') as src:
	        for lineno, raw in enumerate(src, start=1):
	            text = raw.strip()
	            if not text or text.startswith('#'):
	                continue
	            fields = text.replace(',', ' ').split()
	            if len(fields) < 2:
	                continue
	            try:
	                u = int(fields[0]) - shift
	                v = int(fields[1]) - shift
	            except ValueError:
	                # Non-numeric row (e.g. a header): skipped on purpose.
	                continue
	            if u < 0 or v < 0:
	                # A negative id cannot appear in a 0-indexed edgelist; fail
	                # loudly instead of emitting a corrupt file.
	                raise ValueError(
	                    f"{in_path}:{lineno}: negative node id after index shift "
	                    f"({u}, {v}); is the input really "
	                    f"{'1' if input_is_one_indexed else '0'}-indexed?"
	                )
	            if u == v:
	                continue
	            unique_edges.add((u, v) if u < v else (v, u))
	    edges = sorted(unique_edges)
	    out_path.parent.mkdir(parents=True, exist_ok=True)
	    with out_path.open('w', encoding='utf-8') as dst:
	        dst.writelines(f"{u} {v}\n" for u, v in edges)
	    # Every stored pair has u < v, so the largest id is the largest v.
	    max_id = max((v for _, v in edges), default=-1)
	    return (max_id + 1, len(edges))


	def run_java_seeder_single(edges_file: Path, seeds_out: Path, alpha: str = "DIAM", eps: str = "1e-6"):
	    """
	    Compile and run the Reddit_streamsafe Java seeder on one edgelist file.

	    ``javac`` runs first, then ``java``; the Java sources are named by bare
	    file name, so they are resolved against the current working directory.
	    A non-zero exit status from either command raises
	    ``subprocess.CalledProcessError``.

	    Args:
	        edges_file: Edgelist file passed as the first program argument.
	        seeds_out: Output path passed as the second program argument.
	        alpha: Forwarded verbatim as the third program argument.
	        eps: Forwarded verbatim as the fourth program argument.
	    """
	    java_sources = ("LRMCseedsReddit_streamsafe.java", "clique2_ablations_parallel2.java")
	    program_args = (str(edges_file), str(seeds_out), alpha, eps)
	    subprocess.run(["javac", *java_sources], check=True)
	    subprocess.run(["java", "LRMCseedsReddit_streamsafe", *program_args], check=True)


	def export_cora_edgelist_from_content_cites(content: Path, cites: Path, out_path: Path) -> Tuple[int, int]:
	    """
	    Export a canonical 0-indexed undirected edgelist from Cora content/cites files.

	    Node indices are assigned in order of first appearance of the paper id
	    (first whitespace-separated field) in ``content``. Per the original
	    author this mirrors the mapping used by the Java loader; that loader is
	    not part of this file, so treat the claim as something to verify.

	    ``cites`` lines are "a b" (whitespace- or comma-separated). Blank lines,
	    lines starting with '#', lines with fewer than two fields, self-citations
	    and lines naming a paper id absent from ``content`` are skipped. The
	    output holds one sorted "u v" line per undirected edge with u < v.

	    Args:
	        content: Path to the content file (one paper per line, id first).
	        cites: Path to the cites file (one citation pair per line).
	        out_path: Destination file; its parent directory is created if missing.

	    Returns:
	        (n_nodes, m_edges): number of distinct paper ids in ``content`` and
	        number of edges written.
	    """
	    id2idx = {}
	    # Pass 1: build mapping from content (insertion order)
	    with content.open('r', encoding='utf-8') as f:
	        for line in f:
	            fields = line.split()
	            if fields:
	                # len() is evaluated before insertion, so a new id receives
	                # the next free index and a repeated id keeps its first one.
	                id2idx.setdefault(fields[0], len(id2idx))
	    unique_edges = set()
	    # Pass 2: read cites and map to indices; dedup undirected edges
	    with cites.open('r', encoding='utf-8') as f:
	        for line in f:
	            text = line.strip()
	            if not text or text.startswith('#'):
	                continue
	            fields = text.replace(',', ' ').split()
	            if len(fields) < 2:
	                continue
	            u = id2idx.get(fields[0])
	            v = id2idx.get(fields[1])
	            if u is None or v is None or u == v:
	                continue
	            unique_edges.add((u, v) if u < v else (v, u))
	    edges = sorted(unique_edges)
	    out_path.parent.mkdir(parents=True, exist_ok=True)
	    with out_path.open('w', encoding='utf-8') as w:
	        w.writelines(f"{u} {v}\n" for u, v in edges)
	    return (len(id2idx), len(edges))


	if __name__ == "__main__":
	    # Command-line entry point. Three optional single-file modes are handled
	    # first; the directory-based export (+ Java seeder) runs unless
	    # --canonicalize_in or --java_single_in was given.
	    ap = argparse.ArgumentParser()
	    ap.add_argument("--data_root", type=str, default="./data")
	    ap.add_argument("--edges_out", type=str, default="./proteins_edgelists")
	    ap.add_argument("--seeds_out", type=str, default="./proteins_seeds")
	    ap.add_argument("--alpha", type=str, default="DIAM")
	    ap.add_argument("--eps", type=str, default="1e-6")
	    ap.add_argument("--no_java", action="store_true")
	    # Canonicalization / single-file seeding for Cora edgelists
	    ap.add_argument("--canonicalize_in", type=str, default="", help="Input edgelist to canonicalize (optional)")
	    ap.add_argument("--canonicalize_out", type=str, default="", help="Output path for canonicalized edgelist")
	    ap.add_argument("--one_indexed", action="store_true", help="Treat input as 1-indexed during canonicalization")
	    ap.add_argument("--java_single_in", type=str, default="", help="Canonical edgelist to feed into Java seeder (overrides directory mode)")
	    ap.add_argument("--java_single_out", type=str, default="", help="Output seeds JSON for single-file Java run")
	    ap.add_argument("--cora_content", type=str, default="", help="Path to cora.content (to export canonical edgelist)")
	    ap.add_argument("--cora_cites", type=str, default="", help="Path to cora.cites (to export canonical edgelist)")
	    ap.add_argument("--export_cora_out", type=str, default="", help="Where to write the canonical Cora edgelist")
	    args = ap.parse_args()

	    root = Path(args.data_root)
	    edges_dir = Path(args.edges_out)
	    seeds_dir = Path(args.seeds_out)

	    # The three optional modes below were previously commented out, which made
	    # their flags silently do nothing even though --help advertises them.
	    # `print` is rich.print in this file, so the bracketed log prefixes are
	    # escaped; unescaped they would be consumed as console-markup tags.

	    # Optional canonicalization path (e.g., for src/cora/graph_000000.txt)
	    if args.canonicalize_in and args.canonicalize_out:
	        n, m = canonicalize_edgelist_file(
	            Path(args.canonicalize_in),
	            Path(args.canonicalize_out),
	            input_is_one_indexed=args.one_indexed,
	        )
	        print(rf"\[canonicalize] wrote {args.canonicalize_out} (n≈{n}, m={m})")

	    # Optional: export edgelist from Cora content/cites using Java's mapping
	    if args.cora_content and args.cora_cites and args.export_cora_out:
	        n, m = export_cora_edgelist_from_content_cites(
	            Path(args.cora_content),
	            Path(args.cora_cites),
	            Path(args.export_cora_out),
	        )
	        print(rf"\[cora-export] wrote {args.export_cora_out} (n={n}, m={m})")

	    # Optional single-file Java run on a canonical edgelist
	    if args.java_single_in and args.java_single_out:
	        run_java_seeder_single(Path(args.java_single_in), Path(args.java_single_out), args.alpha, args.eps)
	        print(rf"\[java] wrote seeds to {args.java_single_out}")

	    # Original directory-based export + Java for dataset; this is what runs
	    # when none of the single-file inputs is supplied (the default).
	    if not args.canonicalize_in and not args.java_single_in:
	        export_edgelists(root, edges_dir)
	        if not args.no_java:
	            run_java_seeder(edges_dir, seeds_dir, args.alpha, args.eps)