| |
| |
| |
| |
|
|
| import argparse, subprocess |
| from pathlib import Path |
| from typing import Tuple, Iterable |
| from torch_geometric.datasets import TUDataset, Planetoid |
|
|
| from rich import print |
|
|
|
|
def export_edgelists(root: Path, out_dir: Path):
    """Write each graph of the Planetoid 'Cora' dataset as a text edge list.

    The dataset is obtained with ``Planetoid(root=str(root), name='Cora')``.
    For the graph at position ``i`` a file ``graph_<i>.txt`` (``i`` zero-padded
    to six digits) is written into ``out_dir``, which is created if missing.
    Each line is ``"u v"``; of the columns in ``edge_index`` only those with
    ``v > u`` are written, so self-loops and the reverse orientation of an
    edge are dropped.
    """
    ds = Planetoid(root=str(root), name='Cora')
    out_dir.mkdir(parents=True, exist_ok=True)
    for i, data in enumerate(ds):
        with (out_dir / f"graph_{i:06d}.txt").open('w') as f:
            f.writelines(
                f"{u} {v}\n"
                for u, v in data.edge_index.t().tolist()
                if v > u
            )
    # `print` is rich.print in this module, and Rich parses "[...]" as console
    # markup; the backslash escapes the bracket so the "[export]" prefix is
    # printed literally instead of being consumed as a tag.
    print(f"\\[export] wrote {len(ds)} edge lists to {out_dir}")
|
|
|
|
def run_java_seeder(edges_dir: Path, seeds_dir: Path, alpha="DIAM", eps="1e-6"):
    """Compile the directory-mode Java seeder and run it on ``edges_dir``.

    ``seeds_dir`` is created first if it does not exist. ``javac`` is invoked
    on the two source files by bare name, then ``java`` runs
    ``LRMCseedsProteins_streamsafe2`` with the arguments
    ``edges_dir seeds_dir alpha eps``. Both calls use ``check=True``, so a
    non-zero exit status raises ``subprocess.CalledProcessError``.
    """
    seeds_dir.mkdir(parents=True, exist_ok=True)

    java_sources = ("LRMCseedsProteins_streamsafe2.java", "clique2_ablations_parallel2.java")
    subprocess.run(["javac", *java_sources], check=True)

    program_args = [str(edges_dir), str(seeds_dir), alpha, eps]
    subprocess.run(["java", "LRMCseedsProteins_streamsafe2", *program_args], check=True)
|
|
|
|
def canonicalize_edgelist_file(in_path: Path, out_path: Path, input_is_one_indexed: bool = False) -> Tuple[int, int]:
    """
    Normalise a text edgelist into a sorted, duplicate-free, undirected form.

    Input lines are "u v" (commas also accepted as separators; tokens after
    the second are ignored). Blank lines, lines starting with '#', lines with
    fewer than two tokens, and lines whose first two tokens are not integers
    are skipped. When ``input_is_one_indexed`` is true, 1 is subtracted from
    both endpoints. Self-loops are dropped and each edge is stored once with
    the smaller endpoint first.

    The result is written to ``out_path`` (parent directories are created),
    one "u v" line per edge in ascending order.
    Returns (largest_id_seen + 1, number_of_edges_written); the first value
    is 0 when no edge was kept.
    """
    shift = 1 if input_is_one_indexed else 0
    unique_edges = set()
    with in_path.open('r') as src:
        for raw in src:
            text = raw.strip()
            if text == '' or text.startswith('#'):
                continue
            tokens = text.replace(',', ' ').split()
            if len(tokens) < 2:
                continue
            try:
                a = int(tokens[0]) - shift
                b = int(tokens[1]) - shift
            except ValueError:
                # Non-numeric endpoints: ignore the line.
                continue
            if a != b:
                unique_edges.add((min(a, b), max(a, b)))

    ordered = sorted(unique_edges)
    # The second component is the larger endpoint of every stored edge; the
    # leading -1 keeps the node count at 0 when nothing was kept.
    highest = max([-1, *(hi for _, hi in ordered)])

    out_path.parent.mkdir(parents=True, exist_ok=True)
    with out_path.open('w') as sink:
        sink.writelines(f"{lo} {hi}\n" for lo, hi in ordered)
    return (highest + 1, len(ordered))
|
|
|
|
def run_java_seeder_single(edges_file: Path, seeds_out: Path, alpha: str = "DIAM", eps: str = "1e-6"):
    """Build the LRMCseedsReddit_streamsafe Java program and run it on one edgelist file.

    ``javac`` is invoked on the two source files by bare name, then ``java``
    runs ``LRMCseedsReddit_streamsafe`` with the arguments
    ``edges_file seeds_out alpha eps``. Both calls use ``check=True``, so a
    non-zero exit status raises ``subprocess.CalledProcessError``.
    """
    java_sources = ("LRMCseedsReddit_streamsafe.java", "clique2_ablations_parallel2.java")
    subprocess.run(["javac", *java_sources], check=True)

    program_args = [str(edges_file), str(seeds_out), alpha, eps]
    subprocess.run(["java", "LRMCseedsReddit_streamsafe", *program_args], check=True)
|
|
|
|
def export_cora_edgelist_from_content_cites(content: Path, cites: Path, out_path: Path) -> Tuple[int, int]:
    """
    Export a canonical 0-indexed undirected edgelist (u < v) from Cora's
    content/cites files.

    Node indices are assigned in order of first appearance of each paper id
    in ``content`` (the first whitespace-separated token of every non-blank
    line). This is meant to mirror the Java loader's insertion-order mapping;
    that loader is not part of this module, so confirm against it if the
    exact numbering matters.

    In ``cites``, blank lines and lines starting with '#' are skipped, commas
    are accepted as separators, and only the first two tokens are read. A
    citation is dropped when either id is absent from ``content`` or when it
    is a self-loop; duplicates and reversed pairs collapse to one edge.

    The edges are written to ``out_path`` (parent directories are created) as
    sorted "u v" lines.
    Returns (n_nodes, m_edges), where n_nodes counts the distinct ids found in
    ``content`` (isolated nodes included).
    """
    id2idx = {}
    with content.open('r', encoding='utf-8') as f:
        for line in f:
            parts = line.split()
            if not parts:
                continue
            # First occurrence wins; repeated ids keep their original index.
            id2idx.setdefault(parts[0], len(id2idx))

    seen = set()
    with cites.open('r', encoding='utf-8') as f:
        for line in f:
            s = line.strip()
            if not s or s.startswith('#'):
                continue
            parts = s.replace(',', ' ').split()
            if len(parts) < 2:
                continue
            u = id2idx.get(parts[0])
            v = id2idx.get(parts[1])
            if u is None or v is None or u == v:
                continue
            seen.add((u, v) if u < v else (v, u))

    edges = sorted(seen)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with out_path.open('w', encoding='utf-8') as w:
        w.writelines(f"{u} {v}\n" for u, v in edges)
    return (len(id2idx), len(edges))
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point.
    #
    # Default (directory) mode: export the dataset's edge lists to --edges_out
    # and, unless --no_java is given, run the directory Java seeder into
    # --seeds_out.
    #
    # Optional helper modes, each enabled by supplying its complete option
    # group (previously these options were parsed but never acted upon):
    #   * --cora_content/--cora_cites/--export_cora_out: build a canonical
    #     edgelist from Cora's raw files.
    #   * --canonicalize_in/--canonicalize_out [--one_indexed]: canonicalize an
    #     existing edgelist.
    #   * --java_single_in/--java_single_out: run the single-file Java seeder;
    #     per its help text this replaces directory mode.
    ap = argparse.ArgumentParser()
    ap.add_argument("--data_root", type=str, default="./data")
    ap.add_argument("--edges_out", type=str, default="./proteins_edgelists")
    ap.add_argument("--seeds_out", type=str, default="./proteins_seeds")
    ap.add_argument("--alpha", type=str, default="DIAM")
    ap.add_argument("--eps", type=str, default="1e-6")
    ap.add_argument("--no_java", action="store_true")

    ap.add_argument("--canonicalize_in", type=str, default="", help="Input edgelist to canonicalize (optional)")
    ap.add_argument("--canonicalize_out", type=str, default="", help="Output path for canonicalized edgelist")
    ap.add_argument("--one_indexed", action="store_true", help="Treat input as 1-indexed during canonicalization")
    ap.add_argument("--java_single_in", type=str, default="", help="Canonical edgelist to feed into Java seeder (overrides directory mode)")
    ap.add_argument("--java_single_out", type=str, default="", help="Output seeds JSON for single-file Java run")
    ap.add_argument("--cora_content", type=str, default="", help="Path to cora.content (to export canonical edgelist)")
    ap.add_argument("--cora_cites", type=str, default="", help="Path to cora.cites (to export canonical edgelist)")
    ap.add_argument("--export_cora_out", type=str, default="", help="Where to write the canonical Cora edgelist")
    args = ap.parse_args()

    root = Path(args.data_root)
    edges_dir = Path(args.edges_out)
    seeds_dir = Path(args.seeds_out)

    cora_group = ("cora_content", "cora_cites", "export_cora_out")
    canon_group = ("canonicalize_in", "canonicalize_out")
    single_group = ("java_single_in", "java_single_out")

    # Validate up front so a half-specified group fails before any work is
    # done, instead of being silently ignored.
    for group in (cora_group, canon_group, single_group):
        values = [getattr(args, name) for name in group]
        if any(values) and not all(values):
            ap.error("options " + ", ".join(f"--{name}" for name in group) + " must be given together")

    # NOTE: `print` is rich.print in this module; "\\[" keeps the bracketed
    # prefixes literal rather than letting Rich parse them as markup tags.
    if args.cora_content:
        n_nodes, n_edges = export_cora_edgelist_from_content_cites(
            Path(args.cora_content), Path(args.cora_cites), Path(args.export_cora_out)
        )
        print(f"\\[cora-export] wrote {args.export_cora_out} (n={n_nodes}, m={n_edges})")

    if args.canonicalize_in:
        n_nodes, n_edges = canonicalize_edgelist_file(
            Path(args.canonicalize_in),
            Path(args.canonicalize_out),
            input_is_one_indexed=args.one_indexed,
        )
        print(f"\\[canonicalize] wrote {args.canonicalize_out} (n~={n_nodes}, m={n_edges})")

    if args.java_single_in:
        # Single-file mode overrides directory mode entirely.
        if not args.no_java:
            run_java_seeder_single(
                Path(args.java_single_in), Path(args.java_single_out), args.alpha, args.eps
            )
    else:
        export_edgelists(root, edges_dir)
        if not args.no_java:
            run_java_seeder(edges_dir, seeds_dir, args.alpha, args.eps)
|
|