"""
Generic particle analyzer for ROOT files
Usage: python analyze_particles.py <prefix> [filepath]
Examples:
    python analyze_particles.py lep
    python analyze_particles.py photon
    python analyze_particles.py tau
    python analyze_particles.py jet
"""

import sys
import os
import uproot
import numpy as np
import argparse
|
def get_available_prefixes(filepath):
    """Return the sorted set of particle prefixes present in the ROOT file.

    A prefix is the text before the first underscore in a branch name
    of the 'mini;1' tree (e.g. 'lep' for 'lep_pt').
    """
    with uproot.open(filepath) as file:
        branch_names = list(file['mini;1'].keys())

    found = {name.split('_')[0] for name in branch_names if '_' in name}
    return sorted(found)
|
|
def analyze_particles(filepath, prefix, max_events=None):
    """Analyze all branches sharing a particle prefix in a ROOT file.

    Parameters
    ----------
    filepath : str
        Path to the ROOT file (must contain a 'mini;1' tree).
    prefix : str
        Particle prefix, e.g. 'lep', 'photon', 'tau', 'jet'.
    max_events : int, optional
        If given, restrict the analysis to the first N events.
    """
    print(f"Analyzing {prefix} properties in: {filepath}")
    print("=" * 60)

    with uproot.open(filepath) as file:
        tree = file['mini;1']
        branches = list(tree.keys())

        prefix_branches = [b for b in branches if b.startswith(prefix + '_')]

        if not prefix_branches:
            print(f"No branches found with prefix '{prefix}_'")
            print(f"Available prefixes: {get_available_prefixes(filepath)}")
            return

        print(f"Found {len(prefix_branches)} branches with prefix '{prefix}_':")
        for branch in sorted(prefix_branches):
            print(f" - {branch}")
        print()

        # Load each branch; skip (with a warning) any that cannot be read.
        data = {}
        for branch in prefix_branches:
            try:
                data[branch] = tree[branch].array()
                if max_events:
                    data[branch] = data[branch][:max_events]
            # Fix: was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit.
            except Exception as e:
                print(f"Warning: Could not load {branch}: {e}")
                continue

        if not data:
            print("No data could be loaded!")
            return

        n_events = len(next(iter(data.values())))
        print(f"Total events analyzed: {n_events}")
        print()

        # Variable multiplicity: a dedicated '<prefix>_n' counter branch exists.
        multiplicity_branch = f"{prefix}_n"
        has_variable_multiplicity = multiplicity_branch in data

        # Fixed multiplicity (photon samples): probe whether the ID branches
        # are rectangular with a second column, which fails for jagged arrays.
        has_fixed_multiplicity = False
        if prefix == 'photon':
            id_branches = [b for b in prefix_branches
                           if b in [f'{prefix}_isTightID', f'{prefix}_truthMatched', f'{prefix}_trigMatched']]
            # Fix: also require the branch to have actually loaded, otherwise
            # data[id_branches[0]] could raise KeyError.
            if id_branches and id_branches[0] in data:
                sample_id = data[id_branches[0]]
                try:
                    sample_id[:, 1]
                    has_fixed_multiplicity = True
                # Fix: narrowed from a bare `except:`.
                except Exception:
                    has_fixed_multiplicity = False

        if has_variable_multiplicity:
            analyze_multiplicity(data[multiplicity_branch], prefix)

        # Kinematic variables (stored in MeV where applicable).
        kinematic_vars = ['pt', 'eta', 'phi', 'E', 'm']
        for var in kinematic_vars:
            branch_name = f"{prefix}_{var}"
            if branch_name in data:
                analyze_kinematic(data[branch_name], var.upper(), prefix,
                                  has_variable_multiplicity, has_fixed_multiplicity)

        # Identification / flag variables.
        id_vars = ['type', 'charge', 'isTightID', 'truthMatched', 'trigMatched']
        for var in id_vars:
            branch_name = f"{prefix}_{var}"
            if branch_name in data:
                analyze_identification(data[branch_name], var, prefix,
                                       has_variable_multiplicity, has_fixed_multiplicity, prefix)
|
|
def analyze_multiplicity(mult_data, prefix):
    """Print how many events contain each particle count."""
    print(f"{prefix.upper()} multiplicity distribution:")
    total = len(mult_data)
    distinct, tallies = np.unique(mult_data, return_counts=True)
    for value, tally in zip(distinct, tallies):
        share = tally / total * 100
        print(" {} {}(s): {:6d} events ({:.1f}%)".format(value, prefix, tally, share))
    print()
|
|
def analyze_kinematic(var_data, var_name, prefix, has_variable_multiplicity=False, has_fixed_multiplicity=False):
    """Analyze a kinematic variable (PT/ETA/PHI/E/M) for a particle collection.

    Handles three storage layouts:
      * jagged per-event lists (has_variable_multiplicity),
      * rectangular per-event arrays (has_fixed_multiplicity),
      * one flat value per event (neither flag set).

    Energy-like quantities (PT, E, M) are assumed stored in MeV and are
    reported in GeV.
    """
    print(f"{prefix.upper()} {var_name} analysis:")

    if has_variable_multiplicity:
        # Jagged data: flatten everything, and track per-event leading /
        # subleading values where they exist.
        all_values = []
        leading_values = []
        subleading_values = []

        for event_values in var_data:
            if len(event_values) > 0:
                # Only pT defines an ordering; other variables keep event order.
                if var_name == 'PT':
                    sorted_values = sorted(event_values, reverse=True)
                else:
                    sorted_values = event_values

                all_values.extend(sorted_values)

                if len(sorted_values) >= 1:
                    leading_values.append(sorted_values[0])
                if len(sorted_values) >= 2:
                    subleading_values.append(sorted_values[1])

        values = np.array(all_values)
        leading = np.array(leading_values) if leading_values else None
        subleading = np.array(subleading_values) if subleading_values else None

        print(f" Total number of {prefix}(s): {len(values)}")
        print(f" Events with ≥1 {prefix}: {len(leading) if leading is not None else 0}")
        print(f" Events with ≥2 {prefix}(s): {len(subleading) if subleading is not None else 0}")
    elif has_fixed_multiplicity:
        # Rectangular data: column 0 is the leading object, column 1 the
        # subleading one.  (The original had an identical duplicated if/else
        # on var_name here; collapsed.)
        values = np.array(var_data)
        leading = values[:, 0] if values.shape[1] > 0 else None
        subleading = values[:, 1] if values.shape[1] > 1 else None
        print(f" Total number of {prefix}(s): {len(values)}")
    else:
        # One value per event; no leading/subleading concept.
        values = np.array(var_data)
        leading = None
        subleading = None
        print(f" Total number of {prefix}(s): {len(values)}")

    if len(values) == 0:
        print(" No data available")
        return

    # Convert MeV -> GeV for energy-like variables.
    if var_name in ['PT', 'E', 'M']:
        values_gev = values / 1000
        leading_gev = leading / 1000 if leading is not None else None
        subleading_gev = subleading / 1000 if subleading is not None else None
        unit = "GeV"
    else:
        values_gev = values
        leading_gev = leading
        subleading_gev = subleading
        unit = ""

    print(f" {var_name} statistics ({unit}) - All {prefix}(s):")
    print(" Mean: {:.3f}".format(np.mean(values_gev)))
    print(" Median: {:.3f}".format(np.median(values_gev)))
    print(" Min: {:.3f}".format(np.min(values_gev)))
    print(" Max: {:.3f}".format(np.max(values_gev)))
    print(" Std: {:.3f}".format(np.std(values_gev)))

    if leading_gev is not None and len(leading_gev) > 0:
        print(f" {var_name} statistics ({unit}) - Leading {prefix}:")
        print(" Mean: {:.3f}".format(np.mean(leading_gev)))
        print(" Median: {:.3f}".format(np.median(leading_gev)))
        print(" Min: {:.3f}".format(np.min(leading_gev)))
        print(" Max: {:.3f}".format(np.max(leading_gev)))
        print(" Std: {:.3f}".format(np.std(leading_gev)))

    if subleading_gev is not None and len(subleading_gev) > 0:
        print(f" {var_name} statistics ({unit}) - Subleading {prefix}:")
        print(" Mean: {:.3f}".format(np.mean(subleading_gev)))
        print(" Median: {:.3f}".format(np.median(subleading_gev)))
        print(" Min: {:.3f}".format(np.min(subleading_gev)))
        print(" Max: {:.3f}".format(np.max(subleading_gev)))
        print(" Std: {:.3f}".format(np.std(subleading_gev)))

    # Bug fix: the original only checked `leading_gev is not None`, so
    # `len(subleading_gev)` raised TypeError whenever leading values existed
    # but no event had a second object (subleading_gev is None).
    if (leading_gev is not None and subleading_gev is not None
            and len(leading_gev) == len(subleading_gev)):
        ratio = subleading_gev / leading_gev
        print(f" {var_name} ratio (Subleading/Leading):")
        print(" Mean: {:.3f}".format(np.mean(ratio)))
        print(" Median: {:.3f}".format(np.median(ratio)))
        print(" Min: {:.3f}".format(np.min(ratio)))
        print(" Max: {:.3f}".format(np.max(ratio)))

    print()
|
|
def analyze_identification(var_data, var_name, prefix, has_variable_multiplicity=False, has_fixed_multiplicity=False, particle_prefix=None):
    """Analyze an identification variable (type, charge, boolean flags).

    When the data is rectangular (fixed multiplicity) the leading and
    subleading columns are analyzed separately and, for boolean flags,
    their correlation is reported.  Otherwise the values are flattened
    and analyzed as a single distribution.
    """
    print(f"{prefix.upper()} {var_name} analysis:")

    # Rectangular layout is used for photons, or whenever there is no
    # per-event multiplicity counter to fall back on.
    use_fixed_multiplicity = has_fixed_multiplicity and (particle_prefix == 'photon' or not has_variable_multiplicity)

    if use_fixed_multiplicity:
        values = np.array(var_data)

        if values.shape[1] >= 2:
            leading_values = values[:, 0]
            subleading_values = values[:, 1]

            print(f" Overall {var_name} distribution:")
            analyze_id_distribution(values.flatten(), var_name, prefix)

            print(f" Leading {prefix} {var_name} distribution:")
            analyze_id_distribution(leading_values, var_name, prefix)

            print(f" Subleading {prefix} {var_name} distribution:")
            analyze_id_distribution(subleading_values, var_name, prefix)

            # Leading x subleading contingency table for boolean flags only.
            if var_name in ['isTightID', 'truthMatched', 'trigMatched']:
                analyze_correlation(leading_values, subleading_values, var_name, prefix)

            print()
            return
    elif has_variable_multiplicity:
        # Jagged layout: flatten every non-empty event.
        flattened = []
        for event_values in var_data:
            if len(event_values) > 0:
                flattened.extend(event_values)
        values = np.array(flattened)
    else:
        values = np.array(var_data)

    analyze_id_distribution(values, var_name, prefix)
    print()
|
|
def analyze_id_distribution(values, var_name, prefix):
    """Dispatch one identification variable to its specialized analyzer."""
    if var_name == 'type':
        analyze_particle_types(values, prefix)
    elif var_name in ['isTightID', 'truthMatched', 'trigMatched']:
        analyze_boolean_flags(values, var_name, prefix)
    elif var_name == 'charge':
        analyze_charges(values, prefix)
    else:
        # Generic fallback: show at most the first ten distinct values.
        distinct, tallies = np.unique(values, return_counts=True)
        total = len(values)
        print(f" Distribution:")
        for val, tally in zip(distinct[:10], tallies[:10]):
            share = tally / total * 100
            print(" {}: {:6d} ({:.1f}%)".format(val, tally, share))
        if len(distinct) > 10:
            print(f" ... and {len(distinct) - 10} more values")
|
|
def analyze_correlation(leading, subleading, var_name, prefix):
    """Print the 2x2 contingency table of a flag on leading vs subleading."""
    print(f" {prefix.upper()} {var_name} correlation (Leading × Subleading):")

    lead_t = (leading == True)
    lead_f = (leading == False)
    sub_t = (subleading == True)
    sub_f = (subleading == False)

    both_true = np.sum(lead_t & sub_t)
    leading_true_sub_false = np.sum(lead_t & sub_f)
    leading_false_sub_true = np.sum(lead_f & sub_t)
    both_false = np.sum(lead_f & sub_f)

    total = len(leading)

    print(" Both True: {:6d} ({:.1f}%)".format(both_true, both_true/total*100))
    print(" Leading True, Subleading False: {:6d} ({:.1f}%)".format(
        leading_true_sub_false, leading_true_sub_false/total*100))
    print(" Leading False, Subleading True: {:6d} ({:.1f}%)".format(
        leading_false_sub_true, leading_false_sub_true/total*100))
    print(" Both False: {:6d} ({:.1f}%)".format(both_false, both_false/total*100))
|
|
def analyze_particle_types(types, prefix):
    """Print the PDG-code breakdown of a particle collection."""
    pdg_names = {11: 'electron', 13: 'muon', 15: 'tau', 22: 'photon'}
    print(f" {prefix.upper()} type distribution:")

    total = len(types)
    for code, tally in zip(*np.unique(types, return_counts=True)):
        label = pdg_names.get(code, f'unknown({code})')
        print(" {}: {:6d} ({:.1f}%)".format(label, tally, tally / total * 100))
    print()
|
|
def analyze_boolean_flags(flags, flag_name, prefix):
    """Print the True/False split of a boolean flag array."""
    n_total = len(flags)
    n_true = np.sum(flags)
    n_false = n_total - n_true

    print(f" {prefix.upper()} {flag_name} distribution:")
    print(" True: {:6d} ({:.1f}%)".format(n_true, n_true / n_total * 100))
    print(" False: {:6d} ({:.1f}%)".format(n_false, n_false / n_total * 100))
    print()
|
|
def analyze_charges(charges, prefix):
    """Print the charge breakdown of a particle collection."""
    print(f" {prefix.upper()} charge distribution:")
    total = len(charges)
    for charge, tally in zip(*np.unique(charges, return_counts=True)):
        print(" {}: {:6d} ({:.1f}%)".format(charge, tally, tally / total * 100))
    print()
|
|
def main():
    """Parse command-line arguments and dispatch the requested analysis."""
    parser = argparse.ArgumentParser(description='Generic particle analyzer for ROOT files')
    parser.add_argument('--list-prefixes', action='store_true',
                        help='List all available prefixes in the file')
    parser.add_argument('prefix', nargs='?', help='Particle prefix (e.g., lep, photon, tau, jet)')
    parser.add_argument('filepath', nargs='?',
                        default="/global/cfs/projectdirs/atlas/eligd/llm_for_analysis_copy/data/mc_341081.ttH125_gamgam.GamGam.root",
                        help='Path to ROOT file')
    parser.add_argument('--max-events', type=int, help='Limit analysis to first N events')

    args = parser.parse_args()

    # Discovery mode: just enumerate prefixes and stop.
    if args.list_prefixes:
        if not os.path.exists(args.filepath):
            print(f"Error: File '{args.filepath}' does not exist!")
            return
        print("Available prefixes in the file:")
        for prefix in get_available_prefixes(args.filepath):
            print(f" - {prefix}")
        return

    # Guard clauses: a prefix and an existing file are both required.
    if not args.prefix:
        print("Error: Please specify a particle prefix (e.g., lep, photon, tau, jet)")
        print("Use --list-prefixes to see available options")
        return

    if not os.path.exists(args.filepath):
        print(f"Error: File '{args.filepath}' does not exist!")
        return

    analyze_particles(args.filepath, args.prefix, args.max_events)


if __name__ == "__main__":
    main()
|
|