| |
| |
| import argparse |
| import glob |
| import json |
| import os |
| import re |
| import shutil |
| import subprocess |
| from collections import defaultdict |
| from typing import Dict, List, Tuple, Iterable, Set |
|
|
def ensure_dir(p: str):
    """Create directory *p* (including parents) if it does not already exist."""
    if not os.path.isdir(p):
        os.makedirs(p, exist_ok=True)
|
|
def read_edgelist(path: str) -> Iterable[Tuple[int, int]]:
    """Yield undirected edges from a whitespace-separated edgelist file.

    Each yielded pair is canonicalized as (min, max). Blank lines,
    '#'-comment lines, lines with fewer than two tokens, non-integer
    endpoints, and self-loops are all skipped silently.
    """
    with open(path, 'r') as fh:
        for raw in fh:
            tokens = raw.split()
            # Too short covers blank lines; a leading '#' marks a comment.
            if len(tokens) < 2 or tokens[0].startswith('#'):
                continue
            try:
                x, y = int(tokens[0]), int(tokens[1])
            except ValueError:
                continue
            if x == y:
                continue
            yield (min(x, y), max(x, y))
|
|
def write_edgelist(path: str, edges: Iterable[Tuple[int, int]]):
    """Write one 'u v' pair per line to *path*, overwriting any existing file."""
    with open(path, 'w') as fh:
        fh.writelines(f"{a} {b}\n" for a, b in edges)
|
|
def parse_seeds(path: str) -> Tuple[Dict[int, int], List[int]]:
    """
    Return (node_to_cluster_index, sorted_cluster_ids).
    The cluster indices are 0..C-1, sorted by cluster_id.
    - On overlapping membership, choose the cluster with higher 'score', then smaller cluster_id.
    """
    with open(path, 'r') as fh:
        data = json.load(fh)

    # Stable ordering by cluster_id defines the 0..C-1 index space.
    ordered = sorted(data.get('clusters', []), key=lambda c: c.get('cluster_id', 0))
    cluster_ids = [c.get('cluster_id', pos) for pos, c in enumerate(ordered)]
    id_to_index = {cid: pos for pos, cid in enumerate(cluster_ids)}

    # best[node] = (cluster index, score, cluster_id) of the winning cluster so far.
    best: Dict[int, Tuple[int, float, int]] = {}
    for cluster in ordered:
        cid = cluster.get('cluster_id')
        if cid is None:
            # Clusters without an id cannot be addressed; skip their members.
            continue
        idx = id_to_index[cid]
        sc = float(cluster.get('score', 0.0))
        for node in cluster.get('members', []):
            cur = best.get(node)
            # Prefer higher score; break ties toward the smaller cluster_id.
            if cur is None or sc > cur[1] or (sc == cur[1] and cid < cur[2]):
                best[node] = (idx, sc, cid)

    return {node: entry[0] for node, entry in best.items()}, cluster_ids
|
|
def coarsen_edgelist(prev_edgelist: str, seeds_json: str, out_edgelist: str) -> int:
    """Project edges of *prev_edgelist* onto the cluster indices in *seeds_json*.

    Writes the deduplicated, sorted coarse edgelist to *out_edgelist* and
    returns the number of edges skipped because an endpoint had no cluster
    assignment. Intra-cluster edges are dropped (no self-loops in the
    coarse graph).
    """
    mapping, _ = parse_seeds(seeds_json)
    coarse: Set[Tuple[int, int]] = set()
    skipped = 0
    for u, v in read_edgelist(prev_edgelist):
        try:
            cu, cv = mapping[u], mapping[v]
        except KeyError:
            # Endpoint not covered by any cluster — count and drop the edge.
            skipped += 1
            continue
        if cu != cv:
            coarse.add((min(cu, cv), max(cu, cv)))
    write_edgelist(out_edgelist, sorted(coarse))
    return skipped
|
|
def run_java(java_exec: str, class_name: str, edgelist_path: str, out_json_path: str,
             epsilon: str, java_opts: List[str]) -> None:
    """Invoke the Java LRMC tool once; raises CalledProcessError on non-zero exit."""
    argv = [java_exec, *java_opts, class_name, edgelist_path, out_json_path, epsilon]
    print("[run]", " ".join(argv))
    subprocess.run(argv, check=True)
|
|
def build_single_graph_levels(args):
    """Single-graph mode: alternate LRMC seeding and coarsening for args.levels stages.

    Stage k runs the Java tool on the stage-k edgelist to produce seeds.json,
    then (except at the last level) coarsens that edgelist into stage k+1.
    """
    ensure_dir(args.out_dir)

    stage0 = os.path.join(args.out_dir, "stage0")
    ensure_dir(stage0)
    if args.copy_inputs:
        # Keep a copy of the original input under stage0 for provenance.
        shutil.copyfile(args.input_edgelist, os.path.join(stage0, "edgelist_0.txt"))

    current = args.input_edgelist
    for level in range(args.levels):
        stage_dir = os.path.join(args.out_dir, f"stage{level}")
        ensure_dir(stage_dir)
        seeds_path = os.path.join(stage_dir, "seeds.json")
        run_java(args.java, args.class_name, current, seeds_path, args.epsilon, args.java_opts)

        # Every level except the last produces the next (coarser) edgelist.
        if level + 1 < args.levels:
            next_dir = os.path.join(args.out_dir, f"stage{level + 1}")
            ensure_dir(next_dir)
            next_path = os.path.join(next_dir, f"edgelist_{level + 1}.txt")
            dropped = coarsen_edgelist(current, seeds_path, next_path)
            if dropped > 0:
                print(f"[warn] stage{level}: {dropped} edges had nodes missing from seeds; skipped.")
            current = next_path
|
|
def build_multigraph_levels(args):
    """Multi-graph mode: run the seed/coarsen pipeline independently per graph file.

    For every file matching args.glob under args.graphs_dir, stage 0 seeds the
    original graph; each later stage coarsens the previous stage's edgelist
    using that stage's seeds, then seeds the coarse graph.
    """
    ensure_dir(args.out_dir)

    graph_files = sorted(glob.glob(os.path.join(args.graphs_dir, args.glob)))
    if not graph_files:
        raise SystemExit(f"No graph files found in {args.graphs_dir} with pattern {args.glob}")

    # Extract a zero-padded numeric id from the filename; fall back to the stem.
    id_pattern = re.compile(r'(.*?)(\d+)(\.\w+)$')

    def graph_id_from_path(p: str) -> str:
        base = os.path.basename(p)
        m = id_pattern.match(base)
        if m:
            return m.group(2).zfill(6)
        stem = os.path.splitext(base)[0]
        tail = re.search(r'(\d+)$', stem)
        return tail.group(1).zfill(6) if tail else stem

    # gid -> edgelist path produced for (or used at) the previous stage.
    prev_stage_edgelists: Dict[str, str] = {}
    for lvl in range(args.levels):
        stage_dir = os.path.join(args.out_dir, f"stage{lvl}")
        ensure_dir(stage_dir)

        for gpath in graph_files:
            gid = graph_id_from_path(gpath)
            seeds_out = os.path.join(stage_dir, f"graph_{gid}.json")
            if lvl == 0:
                # First stage seeds the original graph directly.
                source = gpath
            else:
                # Coarsen the previous stage's graph using its seeds.
                prev_seeds = os.path.join(args.out_dir, f"stage{lvl-1}", f"graph_{gid}.json")
                source = os.path.join(stage_dir, f"graph_{gid}.txt")
                missing = coarsen_edgelist(prev_stage_edgelists[gid], prev_seeds, source)
                if missing > 0:
                    print(f"[warn] stage{lvl-1} graph_{gid}: {missing} edges had nodes missing from seeds; skipped.")
            run_java(args.java, args.class_name, source, seeds_out, args.epsilon, args.java_opts)
            prev_stage_edgelists[gid] = source
|
|
|
|
def main():
    """Parse CLI arguments and dispatch to single- or multi-graph mode."""
    parser = argparse.ArgumentParser(description="Build LRMC seeds across multiple levels by invoking the Java LRMC tool and coarsening between levels.")
    source = parser.add_mutually_exclusive_group(required=True)
    source.add_argument('--input_edgelist', type=str, help='Single-graph mode: path to original edgelist.txt')
    source.add_argument('--graphs_dir', type=str, help='Multi-graph mode: directory containing per-graph edgelist files (e.g., graph_000000.txt)')
    parser.add_argument('--glob', type=str, default='graph_*.txt', help='Multi-graph mode: glob pattern for graph files (default: graph_*.txt)')
    parser.add_argument('--out_dir', type=str, required=True, help='Output directory (stages will be created here)')
    parser.add_argument('--levels', type=int, required=True, help='Number of levels to build (e.g., 3)')
    parser.add_argument('--java', type=str, default='java', help='Java executable (default: java)')
    parser.add_argument('--class_name', type=str, default='LRMCGenerateSingleCluster', help='Fully qualified Java class name')
    parser.add_argument('--epsilon', type=str, default='1e6', help='Epsilon argument for the Java tool (default: 1e6)')
    parser.add_argument('--java_opts', type=str, default='', help='Extra options for java (e.g., "-Xmx16g -cp my.jar")')
    parser.add_argument('--copy_inputs', action='store_true', help='Copy original edgelist under stage0 for record (single-graph mode)')
    args = parser.parse_args()

    # Tokenize the raw java options string once so callees receive a list.
    args.java_opts = args.java_opts.split() if args.java_opts else []

    if args.input_edgelist:
        build_single_graph_levels(args)
    else:
        build_multigraph_levels(args)


if __name__ == '__main__':
    main()
|
|