LLM4HEP / run_smk_sequential.sh

initial commit

cfcbbc8 5 months ago

11.8 kB

	#!/bin/bash
	#
	# run_smk_sequential.sh - Run Snakemake workflows one at a time for debugging
	#
	# This script runs each Snakemake workflow sequentially to observe
	# the behavior of prompt scripts, supervisor, and coder in real time.
	#
	# Usage:
	# ./run_smk_sequential.sh # Run all steps
	# ./run_smk_sequential.sh --step1 # Run summarize_root (both rules)
	# ./run_smk_sequential.sh --step2 # Run create_numpy
	# ./run_smk_sequential.sh --step3 # Run preprocess
	# ./run_smk_sequential.sh --step4 # Run scores
	# ./run_smk_sequential.sh --step5 # Run categorization
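	# ./run_smk_sequential.sh --step1 --step3 # Run summarize_root + preprocess
	#
	# Additional options accepted by the parser below:
	# --validate, --out-dir DIR, --job-id ID, --auto-dir, --config FILE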
	#

	# Optional per-user setup file; presumably holds API keys, going by its name.
	if [[ -f ~/.apikeys.sh ]]; then
	  source ~/.apikeys.sh
	fi

	# Defaults for everything the command line can override.
	# Step selectors: all off until a --stepN flag turns one on.
	RUN_STEP1=false RUN_STEP2=false RUN_STEP3=false RUN_STEP4=false RUN_STEP5=false
	# --validate switches this to true.
	VALIDATE_STEPS=false
	# Destination directory and config file used when no option replaces them.
	OUTPUT_DIR="results"
	CONFIG="config.yml"

	# Directory the script was started from; relative config paths are
	# resolved against it later on.
	PROJECT_ROOT="$PWD"


	# Print the help text to stdout.
	print_usage() {
	  echo "Usage: $0 [OPTIONS]"
	  echo ""
	  echo "Run Snakemake workflows for ATLAS analysis"
	  echo ""
	  echo "Options:"
	  echo " --step1 Run summarize_root workflow (both rules: data generation + prompt processing)"
	  echo " --step2 Run create_numpy workflow"
	  echo " --step3 Run preprocess workflow"
	  echo " --step4 Run scores workflow"
	  echo " --step5 Run categorization workflow"
	  echo " --validate Run validation after each successful step"
	  echo " --out-dir DIR Custom output directory (default: results)"
	  echo " --job-id ID Create unique directory: results_job_ID"
	  echo " --auto-dir Create unique directory with timestamp: results_YYYYMMDD_HHMMSS"
	  echo " --config FILE Config file passed to Snakemake (default: config.yml)"
	  echo " --help, -h Show this help message"
	  echo ""
	  echo "Examples:"
	  echo " $0 --step1 --auto-dir # results_20250916_143052/"
	  echo " $0 --step1 --job-id 12345 # results_job_12345/"
	  echo " $0 --step1 --out-dir my_run_1 # my_run_1/"
	  echo ""
	  echo "If no options are provided, all steps are run sequentially."
	}

	# Abort when an option that takes a value was given none (or an empty one).
	# Arguments: the remaining command line, i.e. $1 = option name, $2 = value.
	require_option_value() {
	  if [[ $# -lt 2 || -z "$2" ]]; then
	    echo "Option $1 requires a non-empty value" >&2
	    echo "Use --help for usage information" >&2
	    exit 1
	  fi
	}

	#######################################
	# Translate command-line options into the script's global settings.
	# Globals written: RUN_STEP1..RUN_STEP5, VALIDATE_STEPS, OUTPUT_DIR,
	#                  CONFIG, TIMESTAMP (only with --auto-dir)
	# Arguments: the script's command line ("$@")
	# Exits: 0 after printing help; 1 on an unknown option or a missing value
	#######################################
	parse_args() {
	  while [[ $# -gt 0 ]]; do
	    case "$1" in
	      --step1)
	        RUN_STEP1=true
	        shift
	        ;;
	      --step2)
	        RUN_STEP2=true
	        shift
	        ;;
	      --step3)
	        RUN_STEP3=true
	        shift
	        ;;
	      --step4)
	        RUN_STEP4=true
	        shift
	        ;;
	      --step5)
	        RUN_STEP5=true
	        shift
	        ;;
	      --validate)
	        VALIDATE_STEPS=true
	        shift
	        ;;
	      --out-dir)
	        require_option_value "$@"
	        OUTPUT_DIR="$2"
	        shift 2
	        ;;
	      --job-id)
	        # Create unique directory based on job ID
	        require_option_value "$@"
	        OUTPUT_DIR="results_job_$2"
	        shift 2
	        ;;
	      --auto-dir)
	        # Create unique directory with timestamp
	        TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
	        OUTPUT_DIR="results_${TIMESTAMP}"
	        shift
	        ;;
	      --config)
	        require_option_value "$@"
	        CONFIG="$2"
	        shift 2
	        ;;
	      # The alternation bar must be unescaped: "\|" would only match the
	      # literal text "--help|-h".
	      --help | -h)
	        print_usage
	        exit 0
	        ;;
	      *)
	        echo "Unknown option: $1"
	        echo "Use --help for usage information"
	        exit 1
	        ;;
	    esac
	  done
	}

	parse_args "$@"

	# When every step selector still holds "false", no --stepN flag was given,
	# so the whole pipeline is switched on. Any other combination keeps the
	# user's selection untouched.
	case "${RUN_STEP1}:${RUN_STEP2}:${RUN_STEP3}:${RUN_STEP4}:${RUN_STEP5}" in
	  false:false:false:false:false)
	    RUN_STEP1=true RUN_STEP2=true RUN_STEP3=true RUN_STEP4=true RUN_STEP5=true
	    echo "=== Running All Snakemake Workflows Sequentially (Output: ${OUTPUT_DIR}) ==="
	    ;;
	  *)
	    echo "=== Running Selected Snakemake Workflows (Output: ${OUTPUT_DIR}) ==="
	    ;;
	esac
	echo ""

	# Activate the runtime environment the workflows expect.
	module load python
	conda activate llm_env

	# Snakemake is started from inside the output directory later on, so the
	# config file is pinned to an absolute path here: absolute values are used
	# as given, relative ones are anchored at the project root.
	case "${CONFIG}" in
	  /*) CONFIG_PATH="${CONFIG}" ;;
	  *) CONFIG_PATH="${PROJECT_ROOT}/${CONFIG}" ;;
	esac

	# Stop early when the resolved path is not an existing regular file.
	if ! [[ -f "${CONFIG_PATH}" ]]; then
	  echo "❌ Config file not found at ${CONFIG_PATH}"
	  exit 1
	fi

	# Copy and prepare workflow files

	# Drop one trailing slash so the paths built below do not contain "//".
	OUTPUT_DIR="${OUTPUT_DIR%/}"
	# An empty value (for example from "--out-dir /") would make every path
	# below resolve against the filesystem root, including the rm -f calls.
	if [[ -z "${OUTPUT_DIR}" ]]; then
	  echo "❌ Output directory must not be empty or /" >&2
	  exit 1
	fi
	# BASE_DIR is the absolute form of OUTPUT_DIR; it is what gets substituted
	# into the copied prompts and snakefiles.
	if [[ "${OUTPUT_DIR}" = /* ]]; then
	  BASE_DIR="${OUTPUT_DIR}"
	else
	  BASE_DIR="$PWD/${OUTPUT_DIR}"
	fi

	# Escape the characters that are special on the replacement side of the
	# sed "s#...#...#" commands below: "&", the "#" delimiter and "\".
	base_dir_sed=$(printf '%s' "${BASE_DIR}" | sed 's/[&#\\]/\\&/g')
	config_path_sed=$(printf '%s' "${CONFIG_PATH}" | sed 's/[&#\\]/\\&/g')

	echo "Preparing workflow files..."
	mkdir -p -- "${OUTPUT_DIR}/prompts_temp"
	cp -r prompts/* "${OUTPUT_DIR}/prompts_temp/"
	# Fill the {BASE_DIR} placeholder in the copied prompt templates.
	sed -i -e "s#{BASE_DIR}#${base_dir_sed}#g" -- "${OUTPUT_DIR}"/prompts_temp/*.txt

	# Work on private copies of the snakefiles and helper scripts so the
	# placeholder substitution never touches the originals.
	cp workflow/summarize_root.smk "${OUTPUT_DIR}/summarize_root_temp.smk"
	cp workflow/create_numpy.smk "${OUTPUT_DIR}/create_numpy_temp.smk"
	cp workflow/preprocess.smk "${OUTPUT_DIR}/preprocess_temp.smk"
	cp workflow/scores.smk "${OUTPUT_DIR}/scores_temp.smk"
	cp workflow/categorization.smk "${OUTPUT_DIR}/categorization_temp.smk"
	cp supervisor_coder.py "${OUTPUT_DIR}/supervisor_coder.py"
	cp write_prompt.py "${OUTPUT_DIR}/write_prompt.py"
	cp check_soln.py "${OUTPUT_DIR}/check_soln.py"

	sed -i -e "s#{BASE_DIR}#${base_dir_sed}#g" -- "${OUTPUT_DIR}"/*_temp.smk
	# Replace {CONFIG} in temp snakemake files with the absolute path to the project's config
	sed -i -e "s#{CONFIG}#${config_path_sed}#g" -- "${OUTPUT_DIR}"/*_temp.smk

	# Copy solutions for validation
	echo "Copying reference solution arrays for validation..."
	mkdir -p -- "${OUTPUT_DIR}/solution/arrays"
	# Remove any existing files first to avoid permission issues
	rm -f -- "${OUTPUT_DIR:?}"/solution/arrays/*
	cp solution/arrays/* "${OUTPUT_DIR}/solution/arrays/"

	# Create the directories for generated code and logs
	mkdir -p -- "${OUTPUT_DIR}/generated_code"
	mkdir -p -- "${OUTPUT_DIR}/logs"
	cp utils.py "${OUTPUT_DIR}/generated_code/utils.py"

	# Clean up any existing numpy files (store metrics under logs)
	rm -f -- "${OUTPUT_DIR:?}/logs/success.npy" "${OUTPUT_DIR:?}/logs/calls.npy" \
	  "${OUTPUT_DIR:?}/logs/input_tokens.npy" "${OUTPUT_DIR:?}/logs/output_tokens.npy"

	echo "Starting sequential execution..."
	echo ""

	#######################################
	# Run one Snakemake invocation from inside the output directory, report
	# the outcome, and optionally validate the step afterwards.
	# Globals (read):
	#   OUTPUT_DIR     - directory entered with pushd for the run
	#   BASE_DIR       - absolute output directory; handed to check_soln.py
	#                    and used to locate logs/success.npy
	#   CONFIG_PATH    - value passed to snakemake --configfile
	#   VALIDATE_STEPS - "true" enables the check_soln.py run
	# Arguments:
	#   $1 - workflow name; names the log files
	#   $2 - snakefile, relative to OUTPUT_DIR
	#   $3 - rule name given to --forcerun
	#   $4 - step number passed to check_soln.py --step
	# Outputs:
	#   Progress messages on stdout; snakemake output goes to
	#   logs/<name>.log and validation output is appended to
	#   logs/<name>_validation.log (both relative to OUTPUT_DIR).
	# Returns:
	#   0 when snakemake succeeded (whatever the validation result),
	#   1 when snakemake failed or OUTPUT_DIR could not be entered.
	#######################################
	run_workflow() {
	  local workflow_name=$1
	  local smk_file=$2
	  local target=$3
	  local step_number=$4
	  local log_file="logs/${workflow_name}.log"
	  # The working directory changes below, so the results file is addressed
	  # through the absolute BASE_DIR; a relative OUTPUT_DIR would be resolved
	  # a second time underneath itself.
	  local success_file="${BASE_DIR}/logs/success.npy"
	  local start_time duration validation_result

	  echo "========================================="
	  echo "Running: $workflow_name"
	  echo "Target: $target"
	  echo "Time: $(date)"
	  echo "========================================="

	  # cd into OUTPUT_DIR and do all the work there
	  if ! pushd "$OUTPUT_DIR" > /dev/null; then
	    echo "❌ Failed to cd into $OUTPUT_DIR"
	    return 1
	  fi

	  # Print the command that will be executed (run inside ${OUTPUT_DIR}).
	  # Earlier variant with --stats, kept for reference:
	  #   snakemake ... --verbose --stats logs/${workflow_name}.stats > logs/${workflow_name}.log 2>&1
	  echo "Command: snakemake -s \"$smk_file\" -j 1 --forcerun \"$target\" --rerun-incomplete --configfile \"${CONFIG_PATH}\" --latency-wait 120 --verbose > ${log_file} 2>&1"
	  echo ""

	  start_time=$SECONDS

	  # CONFIG_PATH is absolute, so Snakemake finds the config even though the
	  # current directory is now the job folder.
	  if snakemake -s "$smk_file" -j 1 --forcerun "$target" --rerun-incomplete --configfile "${CONFIG_PATH}" --latency-wait 120 --verbose > "${log_file}" 2>&1; then
	    duration=$((SECONDS - start_time))
	    echo ""
	    echo "✅ $workflow_name completed successfully in ${duration}s"
	    echo ""

	    # Run validation for this step if it completed successfully
	    if [[ "$VALIDATE_STEPS" == "true" ]]; then
	      echo "Running validation for Step $step_number..."
	      if python check_soln.py --out_dir "${BASE_DIR}" --step "$step_number" >> "logs/${workflow_name}_validation.log" 2>&1; then
	        echo "✅ Step $step_number validation completed"
	        # Report PASS/FAIL from the entry for this step (index step-1) in
	        # success.npy. Path and index travel as argv so that quotes in the
	        # path cannot break the Python snippet.
	        if [[ -f "${success_file}" ]]; then
	          validation_result=$(python -c "import sys; import numpy as np; print(np.load(sys.argv[1])[int(sys.argv[2]) - 1])" "${success_file}" "$step_number")
	          if [[ "$validation_result" == "1" ]]; then
	            echo "✅ Step $step_number validation: PASS"
	          else
	            echo "❌ Step $step_number validation: FAIL"
	          fi
	        fi
	      else
	        echo "❌ Step $step_number validation failed to run"
	      fi
	      echo ""
	    fi
	    popd > /dev/null
	    return 0
	  else
	    duration=$((SECONDS - start_time))
	    echo ""
	    echo "❌ $workflow_name failed after ${duration}s"
	    echo ""
	    popd > /dev/null
	    return 1
	  fi
	}

	# Walk the five pipeline steps in order and launch the ones whose
	# RUN_STEPn flag is "true". step_counter (global) only numbers the
	# progress lines: it advances once per executed step, so a partial
	# selection is still printed as 1., 2., ...
	run_selected_steps() {
	  local step flag
	  step_counter=1

	  for step in 1 2 3 4 5; do
	    flag="RUN_STEP${step}"
	    if [[ "${!flag}" != "true" ]]; then
	      continue
	    fi

	    case "$step" in
	      1)
	        echo "$step_counter. Running summarize_root workflow (both rules)..."
	        # Two rules from the same snakefile, both reported as step 1:
	        # summarize_root first, then insert_root_summary.
	        run_workflow "summarize_root" "summarize_root_temp.smk" "summarize_root" 1
	        run_workflow "insert_root_summary" "summarize_root_temp.smk" "insert_root_summary" 1
	        ;;
	      2)
	        echo "$step_counter. Running create_numpy workflow..."
	        run_workflow "create_numpy" "create_numpy_temp.smk" "create_numpy" 2
	        ;;
	      3)
	        echo "$step_counter. Running preprocess workflow..."
	        run_workflow "preprocess" "preprocess_temp.smk" "preprocess" 3
	        ;;
	      4)
	        echo "$step_counter. Running scores workflow..."
	        run_workflow "scores" "scores_temp.smk" "scores" 4
	        ;;
	      5)
	        echo "$step_counter. Running categorization workflow..."
	        run_workflow "categorization" "categorization_temp.smk" "categorization" 5
	        ;;
	    esac

	    step_counter=$((step_counter + 1))
	  done
	}

	run_selected_steps

	echo ""
	echo "=== Sequential Execution Complete ==="
	echo "Check ${OUTPUT_DIR}/ for output files"
	echo "Check ${OUTPUT_DIR}/logs/*.log files for detailed logs"
	if [[ "$VALIDATE_STEPS" == "true" ]]; then
	  echo "Check ${OUTPUT_DIR}/logs/*_validation.log files for validation results"
	fi

	#######################################
	# Optionally run check_soln.py once over the whole output directory.
	# Only offered when all five steps were selected. With --validate it runs
	# unconditionally; otherwise the user is asked and a single "y"/"Y"
	# keypress confirms.
	# Globals (read): RUN_STEP1..RUN_STEP5, VALIDATE_STEPS, OUTPUT_DIR
	# Globals (written): REPLY (by read)
	# Outputs: status messages and check_soln.py output on stdout
	#######################################
	run_final_validation() {
	  echo ""
	  if [[ "$RUN_STEP1" != "true" || "$RUN_STEP2" != "true" || "$RUN_STEP3" != "true" || "$RUN_STEP4" != "true" || "$RUN_STEP5" != "true" ]]; then
	    echo "Note: Final comprehensive validation skipped (not all steps were run)"
	    return 0
	  fi

	  if [[ "$VALIDATE_STEPS" == "false" ]]; then
	    read -p "Run final comprehensive validation? (y/n): " -n 1 -r
	    echo ""
	    if [[ ! $REPLY =~ ^[Yy]$ ]]; then
	      return 0
	    fi
	  fi

	  echo "Running final comprehensive validation..."
	  # Quoted so an output directory containing spaces stays one argument.
	  python check_soln.py --out_dir "${OUTPUT_DIR}"
	}

	run_final_validation

	# Clean up
	echo ""
	# echo "Cleaning up temporary files..."
	# Comment out the next line to keep prompts_temp for inspection
	# rm -rf prompts_temp
	# rm -f *_temp.smk
	# rm -rf .snakemake # Clean up Snakemake's default log directory

	echo -e "Done!\n"