clique / src /old /make_proteins_edgelists_and_seeds.py
qingy2024's picture
Upload folder using huggingface_hub
f74dd01 verified
# make_proteins_edgelists_and_seeds.py
# Originally exported PROTEINS/Cora edgelists and called a Java LRMC seeder.
# Extended to canonicalize edgelists (0-indexed, undirected unique edges) and
# to call the Java Reddit_streamsafe seeder for a single file as well.
import argparse, subprocess
from pathlib import Path
from typing import Tuple, Iterable
from torch_geometric.datasets import TUDataset, Planetoid
from rich import print
def export_edgelists(root: Path, out_dir: Path):
    """Write one text edgelist per graph of the Planetoid 'Cora' dataset.

    The dataset is hard-coded to ``Planetoid(root=str(root), name='Cora')``.
    Graph number ``i`` is written to ``out_dir / "graph_{i:06d}.txt"`` with one
    ``"u v"`` line for every column of ``edge_index`` where ``v > u``. Pairs
    with ``v <= u`` (reverse directions and self-loops) are not written.

    Args:
        root: Directory handed to ``Planetoid`` as its dataset root.
        out_dir: Directory receiving the edgelist files; created if missing.
    """
    dataset = Planetoid(root=str(root), name='Cora')
    out_dir.mkdir(parents=True, exist_ok=True)

    for graph_idx, graph in enumerate(dataset):
        target = out_dir / f"graph_{graph_idx:06d}.txt"
        with target.open('w') as handle:
            # Keep only the (smaller, larger) orientation of each pair.
            handle.writelines(
                f"{src} {dst}\n"
                for src, dst in graph.edge_index.t().tolist()
                if dst > src
            )

    # NOTE(review): `print` is rich's print in this file; a lowercase
    # bracketed prefix such as "[export]" may be parsed as console markup
    # instead of being shown literally -- confirm the rendered output.
    print(f"[export] wrote {len(dataset)} edge lists to {out_dir}")
def run_java_seeder(edges_dir: Path, seeds_dir: Path, alpha="DIAM", eps="1e-6"):
    """Compile the Java seeder sources and run them over a directory of edgelists.

    Two commands are executed in order, each with ``check=True`` so a non-zero
    exit status raises ``subprocess.CalledProcessError``:

    1. ``javac`` on the two ``.java`` sources (referenced by bare filename, so
       they are resolved relative to the current working directory).
    2. ``java LRMCseedsProteins_streamsafe2 <edges_dir> <seeds_dir> <alpha> <eps>``.

    Args:
        edges_dir: Directory of edgelist files passed to the Java program.
        seeds_dir: Output directory for the seeds; created if missing.
        alpha: Passed through verbatim as the third program argument.
        eps: Passed through verbatim as the fourth program argument.
    """
    seeds_dir.mkdir(parents=True, exist_ok=True)

    # Use LRMCseedsProteins_streamsafe2 in directory mode (after our patch)
    compile_cmd = [
        "javac",
        "LRMCseedsProteins_streamsafe2.java",
        "clique2_ablations_parallel2.java",
    ]
    launch_cmd = [
        "java",
        "LRMCseedsProteins_streamsafe2",
        str(edges_dir),
        str(seeds_dir),
        alpha,
        eps,
    ]
    for command in (compile_cmd, launch_cmd):
        subprocess.run(command, check=True)
def canonicalize_edgelist_file(in_path: Path, out_path: Path, input_is_one_indexed: bool = False) -> Tuple[int, int]:
    """
    Rewrite an edgelist file as a sorted, de-duplicated undirected edgelist.

    Each input line is expected to start with two integer node ids separated
    by whitespace and/or commas; extra fields are ignored. Blank lines, lines
    starting with '#', lines with fewer than two fields and lines whose first
    two fields are not integers are skipped silently. When
    ``input_is_one_indexed`` is true, both ids are decremented by one.
    Self-loops are dropped, and each remaining pair is stored as (u, v) with
    u < v so that both directions of an edge collapse into one entry.

    The output file (parent directories are created if needed) holds one
    "u v" line per unique edge, in ascending order.

    Returns (n_nodes_approx, n_edges_out), where n_nodes_approx is the largest
    id seen plus one (0 when no id >= 0 was seen).
    """
    offset = 1 if input_is_one_indexed else 0
    unique_pairs = set()
    highest = -1  # floor of -1 makes an empty edge set report 0 nodes

    with in_path.open('r') as reader:
        for raw in reader:
            text = raw.strip()
            if text == "" or text.startswith('#'):
                continue
            fields = text.replace(',', ' ').split()
            if len(fields) < 2:
                continue
            try:
                a = int(fields[0]) - offset
                b = int(fields[1]) - offset
            except ValueError:
                continue
            if a == b:
                continue
            lo, hi = (a, b) if a < b else (b, a)
            unique_pairs.add((lo, hi))
            # hi is the larger endpoint, so it alone decides the running max.
            highest = max(highest, hi)

    ordered = sorted(unique_pairs)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with out_path.open('w') as writer:
        writer.writelines(f"{lo} {hi}\n" for lo, hi in ordered)
    return (highest + 1, len(ordered))
def run_java_seeder_single(edges_file: Path, seeds_out: Path, alpha: str = "DIAM", eps: str = "1e-6"):
    """Compile the Reddit_streamsafe Java seeder and run it on one edgelist file.

    Runs ``javac`` on the two sources (bare filenames, resolved relative to the
    current working directory) and then
    ``java LRMCseedsReddit_streamsafe <edges_file> <seeds_out> <alpha> <eps>``.
    Both calls use ``check=True``, so a failing command raises
    ``subprocess.CalledProcessError``.

    Args:
        edges_file: Edgelist file passed as the first program argument.
        seeds_out: Output path passed as the second program argument.
        alpha: Passed through verbatim as the third program argument.
        eps: Passed through verbatim as the fourth program argument.
    """
    java_sources = ["LRMCseedsReddit_streamsafe.java", "clique2_ablations_parallel2.java"]
    program_args = [str(edges_file), str(seeds_out), alpha, eps]

    subprocess.run(["javac", *java_sources], check=True)
    subprocess.run(["java", "LRMCseedsReddit_streamsafe", *program_args], check=True)
def export_cora_edgelist_from_content_cites(content: Path, cites: Path, out_path: Path) -> Tuple[int, int]:
    """
    Export a canonical 0-indexed undirected edgelist (u < v) from Cora's
    content/cites files.

    Node indices are assigned in order of first appearance of each paper id
    (the first whitespace-separated field) in ``content``. According to the
    original author's note this is meant to reproduce the Java loader's
    mapping; that cannot be checked from this file.

    In ``cites``, blank lines, lines starting with '#', and lines with fewer
    than two fields (split on whitespace and/or commas) are skipped, as are
    lines naming a paper id that is absent from ``content``. Self-citations
    are dropped, and both directions of a citation collapse into one edge.

    The output file (parent directories are created if needed) holds one
    "u v" line per unique edge, in ascending order.

    Returns (n_nodes, m_edges), where n_nodes counts every distinct paper id
    in ``content`` (including papers with no edges) and m_edges is the number
    of lines written.
    """
    # Pass 1: build mapping from content (insertion order)
    id2idx = {}
    with content.open('r', encoding='utf-8') as f:
        for line in f:
            fields = line.split()
            if not fields:
                continue
            # len(id2idx) is evaluated before insertion, so a new id gets the
            # next free index and a repeated id keeps its first index.
            id2idx.setdefault(fields[0], len(id2idx))

    # Pass 2: read cites and map to indices; dedup undirected edges
    seen = set()
    with cites.open('r', encoding='utf-8') as f:
        for line in f:
            s = line.strip()
            if not s or s.startswith('#'):
                continue
            parts = s.replace(',', ' ').split()
            if len(parts) < 2:
                continue
            u = id2idx.get(parts[0])
            v = id2idx.get(parts[1])
            if u is None or v is None or u == v:
                continue
            seen.add((u, v) if u < v else (v, u))

    edges = sorted(seen)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with out_path.open('w', encoding='utf-8') as w:
        w.writelines(f"{u} {v}\n" for u, v in edges)
    return (len(id2idx), len(edges))
def _main():
    """Command-line entry point: export the Cora edgelists, then run the Java seeder.

    All flags below are parsed, but only --data_root, --edges_out, --seeds_out,
    --alpha, --eps and --no_java influence what runs: the canonicalize,
    cora-export and single-file Java modes are currently commented out, so
    their flags have no effect.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_root", type=str, default="./data")
    parser.add_argument("--edges_out", type=str, default="./proteins_edgelists")
    parser.add_argument("--seeds_out", type=str, default="./proteins_seeds")
    parser.add_argument("--alpha", type=str, default="DIAM")
    parser.add_argument("--eps", type=str, default="1e-6")
    parser.add_argument("--no_java", action="store_true")
    # Canonicalization / single-file seeding for Cora edgelists
    parser.add_argument("--canonicalize_in", type=str, default="", help="Input edgelist to canonicalize (optional)")
    parser.add_argument("--canonicalize_out", type=str, default="", help="Output path for canonicalized edgelist")
    parser.add_argument("--one_indexed", action="store_true", help="Treat input as 1-indexed during canonicalization")
    parser.add_argument("--java_single_in", type=str, default="", help="Canonical edgelist to feed into Java seeder (overrides directory mode)")
    parser.add_argument("--java_single_out", type=str, default="", help="Output seeds JSON for single-file Java run")
    parser.add_argument("--cora_content", type=str, default="", help="Path to cora.content (to export canonical edgelist)")
    parser.add_argument("--cora_cites", type=str, default="", help="Path to cora.cites (to export canonical edgelist)")
    parser.add_argument("--export_cora_out", type=str, default="", help="Where to write the canonical Cora edgelist")
    args = parser.parse_args()

    data_root = Path(args.data_root)
    edgelist_dir = Path(args.edges_out)
    seed_dir = Path(args.seeds_out)

    # --- Disabled modes (kept for reference) ---------------------------------
    # Optional canonicalization path (e.g., for src/cora/graph_000000.txt)
    # if args.canonicalize_in and args.canonicalize_out:
    #     n, m = canonicalize_edgelist_file(Path(args.canonicalize_in), Path(args.canonicalize_out), input_is_one_indexed=args.one_indexed)
    #     print(f"[canonicalize] wrote {args.canonicalize_out} (n≈{n}, m={m})")
    #
    # Optional: export edgelist from Cora content/cites using Java's mapping
    # if args.cora_content and args.cora_cites and args.export_cora_out:
    #     n, m = export_cora_edgelist_from_content_cites(Path(args.cora_content), Path(args.cora_cites), Path(args.export_cora_out))
    #     print(f"[cora-export] wrote {args.export_cora_out} (n={n}, m={m})")
    #
    # Optional single-file Java run on a canonical edgelist
    # if args.java_single_in and args.java_single_out:
    #     run_java_seeder_single(Path(args.java_single_in), Path(args.java_single_out), args.alpha, args.eps)
    #     print(f"[java] wrote seeds to {args.java_single_out}")
    # --------------------------------------------------------------------------

    # Original directory-based export + Java for dataset.
    # The former guard is disabled, so the export always runs:
    # if not args.canonicalize_in and not args.java_single_in:
    export_edgelists(data_root, edgelist_dir)
    if args.no_java:
        return
    run_java_seeder(edgelist_dir, seed_dir, args.alpha, args.eps)


if __name__ == "__main__":
    _main()