| import argparse |
| from rich import print |
| import glob |
| import os |
| import re |
| import shutil |
| import subprocess |
| from typing import Dict, List, Tuple, Iterable, Set |
| import random |
| from iv2_utils.iv2 import json_read, json_write |
| import tempfile |
|
|
| def ensure_dir(p: str): |
| os.makedirs(p, exist_ok=True) |
|
|
| def read_edgelist(path: str) -> Iterable[Tuple[int, int]]: |
| with open(path, 'r') as f: |
| for line in f: |
| s = line.strip() |
| if not s or s.startswith('#'): |
| continue |
| parts = s.split() |
| if len(parts) < 2: |
| continue |
| try: |
| u = int(parts[0]); v = int(parts[1]) |
| except ValueError: |
| continue |
| if u == v: |
| continue |
| a, b = (u, v) if u < v else (v, u) |
| yield a, b |
|
|
| def write_edgelist(path: str, edges: Iterable[Tuple[int, int]]): |
| with open(path, 'w') as f: |
| for u, v in edges: |
| f.write(f"{u} {v}\n") |
|
|
| def parse_seeds(path: str) -> Tuple[Dict[int, int], List[int]]: |
| """ |
| Return (node_to_cluster_index, sorted_cluster_ids). |
| The cluster indices are 0..C-1, sorted by cluster_id. |
| - On overlapping membership, choose the cluster with higher 'score', then smaller cluster_id. |
| """ |
| js = json_read(path) |
| clusters = js.get('clusters', []) |
| |
| clusters_sorted = sorted(clusters, key=lambda c: c.get('cluster_id', 0)) |
| cluster_id_list = [c.get('cluster_id', i) for i, c in enumerate(clusters_sorted)] |
| cluster_id_to_idx = {cid: i for i, cid in enumerate(cluster_id_list)} |
|
|
| |
| node_choice: Dict[int, Tuple[int, float, int]] = {} |
|
|
| for c in clusters_sorted: |
| cid = c.get('cluster_id', None) |
| if cid is None: |
| continue |
| idx = cluster_id_to_idx[cid] |
| members = c.get('members') |
| if members is None: |
| members = c.get('seed_nodes', []) |
| score = float(c.get('score', 0.0)) |
| for u in members: |
| prev = node_choice.get(u, None) |
| if prev is None or (score > prev[1]) or (score == prev[1] and cid < prev[2]): |
| node_choice[u] = (idx, score, cid) |
|
|
| node_to_cluster = {u: idx for u, (idx, score, cid) in node_choice.items()} |
| return node_to_cluster, cluster_id_list |
|
|
| def coarsen_edgelist(prev_edgelist: str, seeds_json: str, out_edgelist: str) -> int: |
| node_to_cluster, cluster_id_list = parse_seeds(seeds_json) |
| edges_set: Set[Tuple[int, int]] = set() |
| missing_nodes = 0 |
| for u, v in read_edgelist(prev_edgelist): |
| cu = node_to_cluster.get(u, None) |
| cv = node_to_cluster.get(v, None) |
| if cu is None or cv is None: |
| |
| missing_nodes += 1 |
| continue |
| if cu == cv: |
| continue |
| a, b = (cu, cv) if cu < cv else (cv, cu) |
| edges_set.add((a, b)) |
|
|
| write_edgelist(out_edgelist, sorted(edges_set)) |
| return missing_nodes |
|
|
| def run_java( |
| java_exec: str, |
| class_name: str, |
| edgelist_path: str, |
| out_json_path: str, |
| epsilon: str, |
| java_opts: List[str], |
| quiet: bool = False, |
| ) -> None: |
| java_file = class_name + '.java' |
| compile_cmd = ['javac', '-cp', '.', java_file] |
| if not quiet: |
| print(f"[blue]Compiling:[/blue] {' '.join(compile_cmd)}") |
| try: |
| _ = subprocess.run( |
| compile_cmd, check=True, capture_output=True, text=True, encoding='utf-8' |
| ) |
| except FileNotFoundError: |
| if not quiet: |
| print("[red]Error: `javac` command not found. Is JDK installed and in your PATH?[/red]") |
| raise |
| except subprocess.CalledProcessError as e: |
| if not quiet: |
| print(f"[red]Java compilation failed. Return code: {e.returncode}[/red]") |
| print("[red]stdout:[/red]\n" + e.stdout) |
| print("[red]stderr:[/red]\n" + e.stderr) |
| raise |
|
|
| cmd = [java_exec] + java_opts + [class_name, edgelist_path, out_json_path, epsilon] |
| if not quiet: |
| print("[blue]Running:[/blue]", " ".join(cmd)) |
| |
| run_kwargs = {'check': True} |
| if quiet: |
| run_kwargs['stdout'] = subprocess.DEVNULL |
| run_kwargs['stderr'] = subprocess.DEVNULL |
| subprocess.run(cmd, **run_kwargs) |
|
|
| def generate_lrmc_cluster(edges: List[Tuple[int, int]], epsilon: float, java_exec: str = 'java', java_opts: List[str] = None, quiet: bool = True) -> Dict: |
| """ |
| Generates a single cluster from an edge list using the L-RMC algorithm. |
| |
| Args: |
| edges: A list of 0-indexed integer tuples representing the graph edges. |
| epsilon: The epsilon value for the L-RMC algorithm. |
| java_exec: Path to the java executable. |
| java_opts: A list of options for the java executable. |
| quiet: If True, suppress stdout from this function and subprocesses. |
| |
| Returns: |
| A dictionary containing the 'seed_nodes' and 'score' of the found cluster. |
| Node IDs in 'seed_nodes' are 0-indexed. |
| """ |
| class_name = 'LRMCGenerateSingleCluster' |
|
|
| |
| with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.txt', encoding='utf-8') as edge_file, \ |
| tempfile.NamedTemporaryFile(mode='r', delete=False, suffix='.json', encoding='utf-8') as json_file: |
| edge_file_path = edge_file.name |
| json_file_path = json_file.name |
|
|
| try: |
| |
| with open(edge_file_path, 'w', encoding='utf-8') as f: |
| for u, v in edges: |
| f.write(f"{u} {v}\n") |
|
|
| |
| run_java(java_exec, class_name, edge_file_path, json_file_path, str(epsilon), java_opts or [], quiet=quiet) |
|
|
| |
| json_output = json_read(json_file_path) |
|
|
| finally: |
| |
| os.remove(edge_file_path) |
| os.remove(json_file_path) |
|
|
| |
| cluster = json_output['clusters'][0] |
|
|
| |
| seed_nodes_1_based = cluster['seed_nodes'] |
| seed_nodes_0_based = [node - 1 for node in seed_nodes_1_based] |
|
|
| return { |
| "seed_nodes": seed_nodes_0_based, |
| "score": cluster['score'] |
| } |
|
|
|
|
| def generate_random_seeds(edgelist_path: str, num_nodes: int, out_json_path: str): |
| """Generates a seeds file with a single cluster of randomly selected nodes.""" |
| nodes = set() |
| for u, v in read_edgelist(edgelist_path): |
| nodes.add(u) |
| nodes.add(v) |
|
|
| if num_nodes > len(nodes): |
| print(f"[yellow]Warning: requested {num_nodes} random nodes, but only {len(nodes)} unique nodes exist. Selecting all nodes.[/yellow]") |
| selected_nodes = list(nodes) |
| else: |
| selected_nodes = random.sample(list(nodes), num_nodes) |
|
|
| |
| selected_nodes_1_indexed = [n + 1 for n in selected_nodes] |
|
|
| seed_data = { |
| "clusters": [ |
| { |
| "cluster_id": 1, |
| "seed_nodes": selected_nodes_1_indexed, |
| "score": 1.0 |
| } |
| ] |
| } |
|
|
| json_write(seed_data, out_json_path) |
| print(f"[green]Generated random seeds with {len(selected_nodes)} nodes at {out_json_path}[/green]") |
|
|
| def build_single_graph_levels(args): |
| if args.baseline == 'random': |
| if not args.num_nodes: |
| raise SystemExit("--num_nodes is required for --baseline random") |
|
|
| stage_dir = os.path.join(args.out_dir, "stage0_rand") |
| ensure_dir(stage_dir) |
|
|
| if args.levels != 1: |
| print("[yellow]Warning: For random baseline, only one level is generated. Ignoring --levels setting.[/yellow]") |
|
|
| seeds_out = os.path.join(stage_dir, f"seeds_{args.num_nodes}.json") |
| generate_random_seeds(args.input_edgelist, args.num_nodes, seeds_out) |
| return |
|
|
| ensure_dir(args.out_dir) |
| |
| stage0_dir = os.path.join(args.out_dir, "stage0") |
| ensure_dir(stage0_dir) |
| e0_copy = os.path.join(stage0_dir, "edgelist_0.txt") |
| if args.copy_inputs: |
| shutil.copyfile(args.input_edgelist, e0_copy) |
|
|
| prev_edgelist = args.input_edgelist |
| for lvl in range(args.levels): |
| stage_dir = os.path.join(args.out_dir, f"stage{lvl}") |
| ensure_dir(stage_dir) |
| seeds_out = os.path.join(stage_dir, "seeds_"+str(args.epsilon)+".json") |
| |
| run_java(args.java, args.class_name, prev_edgelist, seeds_out, args.epsilon, args.java_opts) |
|
|
| |
| if lvl < args.levels - 1: |
| next_stage_dir = os.path.join(args.out_dir, f"stage{lvl+1}") |
| ensure_dir(next_stage_dir) |
| next_edgelist = os.path.join(next_stage_dir, f"edgelist_{lvl+1}.txt") |
| missing = coarsen_edgelist(prev_edgelist, seeds_out, next_edgelist) |
| if missing > 0: |
| print(f"[yellow]stage{lvl}: {missing} edges had nodes missing from seeds; skipped.[/yellow]") |
| prev_edgelist = next_edgelist |
|
|
| def build_multigraph_levels(args): |
| ensure_dir(args.out_dir) |
| |
| graph_files = sorted(glob.glob(os.path.join(args.graphs_dir, args.glob))) |
| if not graph_files: |
| raise SystemExit(f"No graph files found in {args.graphs_dir} with pattern {args.glob}") |
|
|
| pattern = re.compile(r'(.*?)(\d+)(\.\w+)$') |
| def graph_id_from_path(p: str) -> str: |
| base = os.path.basename(p) |
| m = pattern.match(base) |
| if m: |
| return m.group(2).zfill(6) |
| |
| stem = os.path.splitext(base)[0] |
| m2 = re.search(r'(\d+)$', stem) |
| return (m2.group(1).zfill(6) if m2 else stem) |
|
|
| |
| prev_stage_edgelists: Dict[str, str] = {} |
| for lvl in range(args.levels): |
| stage_dir = os.path.join(args.out_dir, f"stage{lvl}") |
| ensure_dir(stage_dir) |
|
|
| if lvl == 0: |
| for gpath in graph_files: |
| gid = graph_id_from_path(gpath) |
| seeds_out = os.path.join(stage_dir, f"graph_{gid}.json") |
| run_java(args.java, args.class_name, gpath, seeds_out, args.epsilon, args.java_opts) |
| prev_stage_edgelists[gid] = gpath |
| else: |
| |
| for gpath in graph_files: |
| gid = graph_id_from_path(gpath) |
| prev_edgelist = prev_stage_edgelists[gid] |
| prev_seeds = os.path.join(args.out_dir, f"stage{lvl-1}", f"graph_{gid}.json") |
| next_edgelist = os.path.join(stage_dir, f"graph_{gid}.txt") |
| missing = coarsen_edgelist(prev_edgelist, prev_seeds, next_edgelist) |
| if missing > 0: |
| print(f"[yellow]stage{lvl-1} graph_{gid}: {missing} edges had nodes missing from seeds; skipped.[/yellow]") |
|
|
| seeds_out = os.path.join(stage_dir, f"graph_{gid}.json") |
| run_java(args.java, args.class_name, next_edgelist, seeds_out, args.epsilon, args.java_opts) |
| prev_stage_edgelists[gid] = next_edgelist |
|
|
|
|
| def main(): |
| ap = argparse.ArgumentParser(description="Build LRMC seeds across multiple levels by invoking the Java LRMC tool and coarsening between levels.") |
| mode = ap.add_mutually_exclusive_group(required=True) |
| mode.add_argument('--input_edgelist', type=str, help='Single-graph mode: path to original edgelist.txt') |
| mode.add_argument('--graphs_dir', type=str, help='Multi-graph mode: directory containing per-graph edgelist files (e.g., graph_000000.txt)') |
| ap.add_argument('--glob', type=str, default='graph_*.txt', help='Multi-graph mode: glob pattern for graph files (default: graph_*.txt)') |
| ap.add_argument('--out_dir', type=str, required=True, help='Output directory (stages will be created here)') |
| ap.add_argument('--levels', type=int, required=True, help='Number of levels to build (e.g., 3)') |
| |
| ap.add_argument('--baseline', type=str, choices=['random'], help='Use a baseline method for seed generation.') |
| ap.add_argument('--num_nodes', type=int, help='Number of random nodes to select for random baseline.') |
| |
| ap.add_argument('--java', type=str, default='java', help='Java executable (default: java)') |
| ap.add_argument('--class_name', type=str, default='LRMCGenerateSingleCluster', help='Fully qualified Java class name') |
| ap.add_argument('--epsilon', type=str, default='1e6', help='Epsilon argument for the Java tool (default: 1e6)') |
| ap.add_argument('--java_opts', type=str, default='', help='Extra options for java (e.g., "-Xmx16g -cp my.jar")') |
| ap.add_argument('--copy_inputs', action='store_true', help='Copy original edgelist under stage0 for record (single-graph mode)') |
| args = ap.parse_args() |
|
|
| |
| args.java_opts = args.java_opts.split() if args.java_opts else [] |
|
|
| if args.input_edgelist: |
| build_single_graph_levels(args) |
| else: |
| if args.baseline == 'random': |
| raise SystemExit("--baseline random is only supported in single-graph mode (--input_edgelist)") |
| build_multigraph_levels(args) |
|
|
| class Args: |
| def __init__(self): |
| self.input_edgelist = None |
| self.graphs_dir = None |
| self.glob = 'graph_*.txt' |
| self.out_dir = '' |
| self.levels = 0 |
| self.java = 'java' |
| self.class_name = 'LRMCGenerateSingleCluster' |
| self.epsilon = '1e6' |
| self.java_opts = [] |
| self.copy_inputs = False |
| self.baseline = None |
| self.num_nodes = None |
|
|
| def build_lrmc_single_graph(input_edgelist: str, out_dir: str, levels: int, epsilon: str = '1e6', |
| java_exec: str = 'java', class_name: str = 'LRMCGenerateSingleCluster', |
| java_opts: List[str] = None, copy_inputs: bool = False) -> str: |
| """ |
| Build LRMC levels for a single graph. |
| |
| Args: |
| input_edgelist: Path to input edgelist file |
| out_dir: Output directory |
| levels: Number of levels to build |
| epsilon: Epsilon value for Java tool |
| java_exec: Java executable path |
| class_name: Fully qualified Java class name |
| java_opts: Extra options for Java |
| copy_inputs: Whether to copy input edgelist to stage0 |
| |
| Returns: |
| Path to the generated seeds JSON file |
| """ |
| args = Args() |
| args.input_edgelist = input_edgelist |
| args.out_dir = out_dir |
| args.levels = levels |
| args.epsilon = epsilon |
| args.java = java_exec |
| args.class_name = class_name |
| args.java_opts = java_opts or [] |
| args.copy_inputs = copy_inputs |
|
|
| build_single_graph_levels(args) |
|
|
| |
| return os.path.join(out_dir, "stage0", f"seeds_{epsilon}.json") |
|
|
| if __name__ == '__main__': |
| main() |
|
|