import argparse
import json
import os
import random
import shutil
import subprocess
from concurrent.futures import ThreadPoolExecutor, wait

import numpy as np
import pandas as pd


def parse_arguments():
    parser = argparse.ArgumentParser(description='Active Learning Cycle for Ligand Prediction')
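
    # Data and output paths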
    parser.add_argument('--input_file', type=str, required=True,
                        help='Input CSV file containing ligand data (e.g., tyk2_fep.csv)')
    parser.add_argument('--results_dir_1', type=str, required=True,
                        help='Results directory for first model')
    parser.add_argument('--results_dir_2', type=str, required=True,
                        help='Results directory for second model')
    parser.add_argument('--al_batch_size', type=int, required=True,
                        help='Number of samples for each active learning batch')
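
    # Experiment settings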
    parser.add_argument('--num_repeats', type=int, default=5,
                        help='Number of repeated experiments (default: 5)')
    parser.add_argument('--num_cycles', type=int, required=True,
                        help='Number of active learning cycles')
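
    # Model and training settings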
    parser.add_argument('--arch_1', type=str, required=True,
                        help='First model architecture')
    parser.add_argument('--arch_2', type=str, required=True,
                        help='Second model architecture')
    parser.add_argument('--weight_path_1', type=str, required=True,
                        help='Path to first model pretrained weights')
    parser.add_argument('--weight_path_2', type=str, required=True,
                        help='Path to second model pretrained weights')
    parser.add_argument('--lr', type=float, default=0.001,
                        help='Learning rate (default: 0.001)')
    parser.add_argument('--master_port', type=int, default=29500,
                        help='Master port for distributed training (default: 29500)')
    parser.add_argument('--device', type=int, default=0,
                        help='Base device to run the models on (default: 0)')
    parser.add_argument('--begin_greedy', type=int, default=0,
                        help='Cycle index at which selection becomes purely greedy; '
                             'earlier cycles select half greedily, half at random (default: 0)')
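
    # Reproducibility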
    parser.add_argument('--base_seed', type=int, default=42,
                        help='Base random seed (default: 42)')

    return parser.parse_args()


def _run(cmd):
    # Execute from the project root so the launcher's relative paths resolve
    project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    subprocess.run(cmd, check=True, cwd=project_root)


def run_model(arch_1, arch_2, weight_path_1, weight_path_2, results_path_1, results_path_2, result_file, lr,
              master_port, train_ligf, test_ligf, device):
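    """Launch training and inference for both models in parallel via run_model.sh."""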
    cmd1 = [
        "bash", "./active_learning_scripts/run_model.sh",
        arch_1,
        weight_path_1,
        results_path_1,
        result_file,
        str(lr),
        str(master_port),
        train_ligf,
        test_ligf,
        str(device)
    ]

    # The second model gets its own rendezvous port and the next GPU
    cmd2 = [
        "bash", "./active_learning_scripts/run_model.sh",
        arch_2,
        weight_path_2,
        results_path_2,
        result_file,
        str(lr),
        str(master_port + 1),
        train_ligf,
        test_ligf,
        str(device + 1)
    ]

    with ThreadPoolExecutor(max_workers=2) as executor:
        task1 = executor.submit(_run, cmd=cmd1)
        task2 = executor.submit(_run, cmd=cmd2)
        wait([task1, task2])
        # Surface any exception raised in the worker threads;
        # wait() alone would swallow a failed subprocess silently
        task1.result()
        task2.result()


def read_predictions(results_path, result_file):
    """
    Read predictions from a single model's JSONL output.

    The first line carries the evaluated SMILES; each following line carries
    one set of predictions, which are averaged per compound.
    """
    predictions = {}

    jsonl_path = os.path.join(results_path, result_file)
    with open(jsonl_path, 'r') as f:
        first_line = json.loads(f.readline().strip())
        smiles_list = first_line["tyk2"]["smiles"]
        all_predictions = []
        for line in f:
            pred_line = json.loads(line.strip())
            all_predictions.append(pred_line["tyk2"]["pred"])

    # Average the prediction sets across all recorded lines
    pred_array = np.array(all_predictions)
    mean_predictions = np.mean(pred_array, axis=0)

    # Map each SMILES string to its mean prediction
    for smile, pred in zip(smiles_list, mean_predictions):
        predictions[smile] = float(pred)

    return predictions


def prepare_initial_split(input_file, results_dir_1, results_dir_2, al_batch_size, repeat_idx, cycle_idx, base_seed):
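    """Draw a random initial train/test split and write identical copies for both models."""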
    df = pd.read_csv(input_file)

    # Seed per repeat so each repeat draws a different, reproducible split
    random.seed(base_seed + repeat_idx)

    # Randomly draw the initial training batch; everything else is the pool
    all_indices = list(range(len(df)))
    train_indices = random.sample(all_indices, al_batch_size)
    test_indices = [i for i in all_indices if i not in train_indices]

    train_df = df.iloc[train_indices]
    test_df = df.iloc[test_indices]

    train_file_1 = os.path.join(results_dir_1, f"repeat_{repeat_idx}_cycle_{cycle_idx}_train.csv")
    test_file_1 = os.path.join(results_dir_1, f"repeat_{repeat_idx}_cycle_{cycle_idx}_test.csv")
    train_file_2 = os.path.join(results_dir_2, f"repeat_{repeat_idx}_cycle_{cycle_idx}_train.csv")
    test_file_2 = os.path.join(results_dir_2, f"repeat_{repeat_idx}_cycle_{cycle_idx}_test.csv")

    os.makedirs(os.path.dirname(train_file_1), exist_ok=True)
    os.makedirs(os.path.dirname(train_file_2), exist_ok=True)

    # Both models start from the same split
    train_df.to_csv(train_file_1, index=False)
    test_df.to_csv(test_file_1, index=False)
    train_df.to_csv(train_file_2, index=False)
    test_df.to_csv(test_file_2, index=False)

    return train_file_1, test_file_1, train_file_2, test_file_2


def read_and_combine_predictions(results_path_1, results_path_2, result_file):
    """
    Read predictions from both models and average them per compound.

    Note: the main loop below calls read_predictions per model and averages
    in update_splits; this helper is an equivalent standalone alternative.
    """
    predictions = {}

    # First model: the leading line carries the SMILES list
    jsonl_path_1 = os.path.join(results_path_1, result_file)
    with open(jsonl_path_1, 'r') as f:
        first_line = json.loads(f.readline().strip())
        smiles_list = first_line["tyk2"]["smiles"]
        all_predictions_1 = []
        for line in f:
            pred_line = json.loads(line.strip())
            all_predictions_1.append(pred_line["tyk2"]["pred"])

    # Second model: skip its SMILES header, assumed identical to the first
    jsonl_path_2 = os.path.join(results_path_2, result_file)
    with open(jsonl_path_2, 'r') as f:
        f.readline()
        all_predictions_2 = []
        for line in f:
            pred_line = json.loads(line.strip())
            all_predictions_2.append(pred_line["tyk2"]["pred"])

    pred_array_1 = np.array(all_predictions_1)
    pred_array_2 = np.array(all_predictions_2)

    # Ensemble: average each model's mean prediction
    mean_predictions = (np.mean(pred_array_1, axis=0) + np.mean(pred_array_2, axis=0)) / 2

    for smile, pred in zip(smiles_list, mean_predictions):
        predictions[smile] = float(pred)

    return predictions


def update_splits(results_dir_1, results_dir_2, predictions_1, predictions_2,
                  prev_train_file_1, prev_test_file_1,
                  prev_train_file_2, prev_test_file_2,
                  repeat_idx, cycle_idx, al_batch_size, begin_greedy):
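    """Select the next training batch from the test pool and write updated splits.

    From cycle `begin_greedy` onward the top-ranked `al_batch_size` compounds
    are taken greedily; before that, half are taken greedily and half at random.
    """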
    # Both results dirs hold identical split copies, so only the first is read
    test_df_1 = pd.read_csv(prev_test_file_1)

    # Attach each model's prediction and their ensemble average
    test_df_1['prediction_1'] = test_df_1['Smiles'].map(predictions_1)
    test_df_1['prediction_2'] = test_df_1['Smiles'].map(predictions_2)
    test_df_1['prediction'] = (test_df_1['prediction_1'] + test_df_1['prediction_2']) / 2

    # Rank the pool by predicted value, best first
    test_df_sorted = test_df_1.sort_values('prediction', ascending=False)

    train_df_1 = pd.read_csv(prev_train_file_1)
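
    # Output paths for this cycle's updated split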
    new_train_file_1 = os.path.join(results_dir_1, f"repeat_{repeat_idx}_cycle_{cycle_idx}_train.csv")
    new_test_file_1 = os.path.join(results_dir_1, f"repeat_{repeat_idx}_cycle_{cycle_idx}_test.csv")
    new_train_file_2 = os.path.join(results_dir_2, f"repeat_{repeat_idx}_cycle_{cycle_idx}_train.csv")
    new_test_file_2 = os.path.join(results_dir_2, f"repeat_{repeat_idx}_cycle_{cycle_idx}_test.csv")

    os.makedirs(os.path.dirname(new_train_file_1), exist_ok=True)
    os.makedirs(os.path.dirname(new_train_file_2), exist_ok=True)

    if cycle_idx >= begin_greedy:
        # Pure greedy: take the top-ranked compounds
        new_train_compounds = test_df_sorted.head(al_batch_size)
        remaining_test_compounds = test_df_sorted.iloc[al_batch_size:]
    else:
        # Half greedy, half random: top half of the batch by prediction,
        # the rest sampled uniformly from the remaining pool
        new_train_compounds_tmp_1 = test_df_sorted.head(al_batch_size // 2)
        remaining_test_compounds_tmp = test_df_sorted.iloc[al_batch_size // 2:]
        all_indices = list(range(len(remaining_test_compounds_tmp)))

        train_indices = random.sample(all_indices, al_batch_size - al_batch_size // 2)
        test_indices = [i for i in all_indices if i not in train_indices]
        remaining_test_compounds = remaining_test_compounds_tmp.iloc[test_indices]
        new_train_compounds_tmp_2 = remaining_test_compounds_tmp.iloc[train_indices]
        new_train_compounds = pd.concat([new_train_compounds_tmp_1, new_train_compounds_tmp_2])
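
    # Add the selected compounds to the training set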
    combined_train_df = pd.concat([train_df_1, new_train_compounds])

    # Recall of top compounds so far (printed three times for log visibility;
    # the denominators assume top-1/2/5% flags over a 10,000-compound library)
    for _ in range(3):
        print("########################################")
        print("Cycling: ", cycle_idx)
        print("top_1p: {}/100".format(combined_train_df['top_1p'].sum()))
        print("top_2p: {}/200".format(combined_train_df['top_2p'].sum()))
        print("top_5p: {}/500".format(combined_train_df['top_5p'].sum()))

    # Write identical updated splits for both models
    combined_train_df.to_csv(new_train_file_1, index=False)
    remaining_test_compounds.to_csv(new_test_file_1, index=False)
    combined_train_df.to_csv(new_train_file_2, index=False)
    remaining_test_compounds.to_csv(new_test_file_2, index=False)

    return (new_train_file_1, new_test_file_1,
            new_train_file_2, new_test_file_2)


def run_active_learning(args):
    # Start from clean results directories (portable replacement for rm -rf)
    shutil.rmtree(args.results_dir_1, ignore_errors=True)
    shutil.rmtree(args.results_dir_2, ignore_errors=True)
    os.makedirs(args.results_dir_1, exist_ok=True)
    os.makedirs(args.results_dir_2, exist_ok=True)

    for repeat_idx in range(args.num_repeats):
        print(f"Starting repeat {repeat_idx}")

        # Cycle 0: random initial split, shared by both models
        train_file_1, test_file_1, train_file_2, test_file_2 = prepare_initial_split(
            args.input_file,
            args.results_dir_1,
            args.results_dir_2,
            args.al_batch_size,
            repeat_idx,
            0,
            args.base_seed
        )

        for cycle_idx in range(args.num_cycles):
            print(f"Running cycle {cycle_idx} for repeat {repeat_idx}")

            # Remove stale result files so each cycle's JSONL starts fresh
            result_file = f"repeat_{repeat_idx}_cycle_{cycle_idx}_results.jsonl"
            if os.path.exists(f"{args.results_dir_1}/{result_file}"):
                os.remove(f"{args.results_dir_1}/{result_file}")
            if os.path.exists(f"{args.results_dir_2}/{result_file}"):
                os.remove(f"{args.results_dir_2}/{result_file}")

            # Train and evaluate both models on the current split
            run_model(
                arch_1=args.arch_1,
                arch_2=args.arch_2,
                weight_path_1=args.weight_path_1,
                weight_path_2=args.weight_path_2,
                results_path_1=args.results_dir_1,
                results_path_2=args.results_dir_2,
                result_file=result_file,
                lr=args.lr,
                master_port=args.master_port,
                train_ligf=train_file_1,
                test_ligf=test_file_1,
                device=args.device
            )

            # After every cycle but the last, grow the training set
            if cycle_idx < args.num_cycles - 1:
                predictions_1 = read_predictions(args.results_dir_1, result_file)
                predictions_2 = read_predictions(args.results_dir_2, result_file)

                train_file_1, test_file_1, train_file_2, test_file_2 = update_splits(
                    args.results_dir_1,
                    args.results_dir_2,
                    predictions_1,
                    predictions_2,
                    train_file_1,
                    test_file_1,
                    train_file_2,
                    test_file_2,
                    repeat_idx,
                    cycle_idx + 1,
                    args.al_batch_size,
                    args.begin_greedy
                )


if __name__ == "__main__":
    args = parse_arguments()
    run_active_learning(args)