Joblib
PeptiVerse / training_classifiers /src_bash /ml_uncertainty.bash
ynuozhang
major update
04c2975
#!/bin/bash
#SBATCH --job-name=ml-walltime
#SBATCH --partition=b200-mig45
#SBATCH --gpus=1
#SBATCH --cpus-per-task=5
#SBATCH --mem=50G
#SBATCH --time=6:00:00
#SBATCH --output=%x_%j.out
# =============================================================================
# Unified Bootstrap CI + Uncertainty + Wall-time Refit
# wt, smiles, chemberta embeddings
# Runs sequentially: bootstrap/uncertainty first, then wall-time refit
# =============================================================================
# Base locations. HOME_LOC keeps its trailing slash, so the derived paths
# contain a "//" after the home directory.
HOME_LOC=~/
SCRIPT_LOC="${HOME_LOC}/PeptiVerse/training_classifiers"
ALT_EMB_LOC="${HOME_LOC}/PeptiVerse/training_data_cleaned"
LOG_LOC="${SCRIPT_LOC}/src_bash/logs"

# Every python run appends to a file under LOG_LOC, so stop right away if the
# directory cannot be created.
mkdir -p -- "$LOG_LOC" || {
  echo "ERROR: cannot create log directory: ${LOG_LOC}" >&2
  exit 1
}

# Month_day stamp used as the prefix of the per-run log file names.
DATE=$(date +%m_%d)

# The python scripts are invoked by relative name, so the working directory
# must be SCRIPT_LOC; abort instead of running them from somewhere else.
cd -- "$SCRIPT_LOC" || {
  echo "ERROR: cannot cd to script directory: ${SCRIPT_LOC}" >&2
  exit 1
}
# =============================================================================
# Helper functions
# =============================================================================
# Bootstrap CI + uncertainty for one objective / embedding / model combination.
# $1=OBJECTIVE $2=WT $3=UNCERTAINTY_SCRIPT $4=MODEL_TYPE $5=UNC_MODE
# Globals read: SCRIPT_LOC, LOG_LOC, DATE
# Output: progress lines on stdout, failure lines on stderr; everything the
#         python script prints is appended to the per-combination log file.
# Returns: 0 when the combination is skipped or both python runs succeed,
#          1 when at least one python run exits non-zero.
run_bootstrap() {
  local OBJECTIVE=$1
  local WT=$2
  local SCRIPT=$3
  local MODEL_TYPE=$4
  local UNC_MODE=$5
  local OUT_DIR="${SCRIPT_LOC}/${OBJECTIVE}/${MODEL_TYPE}_${WT}"
  local VAL_PREDS="${OUT_DIR}/val_predictions.csv"
  local LOG_FILE="${LOG_LOC}/${DATE}_ci_${MODEL_TYPE}_${OBJECTIVE}_${WT}.log"
  local status=0

  # Both modes consume the validation predictions; without them there is
  # nothing to do, and that is not treated as a failure.
  if [[ ! -f "$VAL_PREDS" ]]; then
    echo " [SKIP bootstrap] val_predictions.csv not found: $VAL_PREDS"
    return 0
  fi

  echo " [bootstrap ci] ${MODEL_TYPE} / ${OBJECTIVE} / ${WT}"
  if ! python -u "$SCRIPT" \
    --mode ci \
    --val_preds "$VAL_PREDS" \
    --out_dir "$OUT_DIR" \
    --model_name "${MODEL_TYPE}_${WT}" \
    >> "$LOG_FILE" 2>&1; then
    echo " [FAIL bootstrap ci] ${MODEL_TYPE} / ${OBJECTIVE} / ${WT} - see ${LOG_FILE}" >&2
    status=1
  fi

  # The uncertainty pass is still attempted after a CI failure: it is a
  # separate invocation that takes the same inputs.
  echo " [bootstrap unc] ${MODEL_TYPE} / ${OBJECTIVE} / ${WT} (${UNC_MODE})"
  if ! python -u "$SCRIPT" \
    --mode "$UNC_MODE" \
    --val_preds "$VAL_PREDS" \
    --out_dir "$OUT_DIR" \
    --model_name "${MODEL_TYPE}_${WT}" \
    >> "$LOG_FILE" 2>&1; then
    echo " [FAIL bootstrap unc] ${MODEL_TYPE} / ${OBJECTIVE} / ${WT} (${UNC_MODE}) - see ${LOG_FILE}" >&2
    status=1
  fi

  # Only point at the output directory when both passes finished cleanly.
  if ((status == 0)); then
    echo " ${OUT_DIR}/"
  fi
  return "$status"
}
# Wall-time refit for one objective / embedding / model combination.
# $1=OBJECTIVE $2=WT $3=MODEL_TYPE $4=DATASET_PATH
# Globals read: SCRIPT_LOC, LOG_LOC, DATE
# Output: progress lines on stdout, failure lines on stderr; everything
#         refit_ml_walltime.py prints is appended to the per-combination log.
# Returns: 0 when the combination is skipped or the refit succeeds,
#          1 when refit_ml_walltime.py exits non-zero.
run_walltime() {
  local OBJECTIVE=$1
  local WT=$2
  local MODEL_TYPE=$3
  local DATASET_PATH=$4
  local MODEL_DIR="${SCRIPT_LOC}/${OBJECTIVE}/${MODEL_TYPE}_${WT}"
  local LOG_FILE="${LOG_LOC}/${DATE}_walltime_${MODEL_TYPE}_${OBJECTIVE}_${WT}.log"

  if [[ ! -d "$MODEL_DIR" ]]; then
    echo " [SKIP walltime] model_dir not found: $MODEL_DIR"
    return 0
  fi
  # An empty path means the caller had no dataset mapping for this pair;
  # say so explicitly instead of printing "dataset not found: " with no path.
  if [[ -z "$DATASET_PATH" ]]; then
    echo " [SKIP walltime] no dataset path configured for ${OBJECTIVE} / ${WT}"
    return 0
  fi
  if [[ ! -d "$DATASET_PATH" ]]; then
    echo " [SKIP walltime] dataset not found: $DATASET_PATH"
    return 0
  fi

  echo " [walltime] ${MODEL_TYPE} / ${OBJECTIVE} / ${WT}"
  if ! python -u refit_ml_walltime.py \
    --model_dir "$MODEL_DIR" \
    --dataset_path "$DATASET_PATH" \
    --logs_dir "$LOG_LOC" \
    >> "$LOG_FILE" 2>&1; then
    echo " [FAIL walltime] ${MODEL_TYPE} / ${OBJECTIVE} / ${WT} - see ${LOG_FILE}" >&2
    return 1
  fi

  # NOTE(review): this file name is only announced here, never checked;
  # presumably refit_ml_walltime.py writes it under --logs_dir - confirm.
  echo " logged to ${LOG_LOC}/${DATE}_wall_clock_ml.jsonl"
}
# =============================================================================
# Dataset directory resolver
# $1=OBJECTIVE $2=WT (embedding tag)
# Prints the embeddings dataset directory registered for the pair, or an empty
# line when the pair has no entry. Reads HOME_LOC and ALT_EMB_LOC.
# =============================================================================
get_dataset_path() {
  local objective=$1
  local embedding=$2
  local wt_root="${HOME_LOC}/projects/Classifier_Weight/training_data_cleaned"
  local resolved=""

  # Dispatch on the embedding tag first, then on the objective; any pair
  # without an entry leaves "resolved" empty.
  case "$embedding" in
    wt)
      # wt embeddings (ESM2 / original)
      case "$objective" in
        hemolysis) resolved="${wt_root}/hemolysis/hemo_wt_with_embeddings" ;;
        nf) resolved="${wt_root}/nf/nf_wt_with_embeddings" ;;
        solubility) resolved="${wt_root}/solubility/sol_wt_with_embeddings" ;;
        permeability_penetrance) resolved="${wt_root}/permeability_penetrance/perm_wt_with_embeddings_pooled" ;;
      esac
      ;;
    smiles)
      # smiles embeddings (PeptideCLM)
      case "$objective" in
        hemolysis) resolved="${ALT_EMB_LOC}/hemolysis_peptideclm/hemo_smiles_with_embeddings" ;;
        nf) resolved="${ALT_EMB_LOC}/nf_peptideclm/nf_smiles_with_embeddings" ;;
        permeability_pampa) resolved="${ALT_EMB_LOC}/permeability_pampa_peptideclm/pampa_smiles_with_embeddings" ;;
        permeability_caco2) resolved="${ALT_EMB_LOC}/permeability_caco2_peptideclm/caco2_smiles_with_embeddings" ;;
      esac
      ;;
    chemberta)
      # chemberta embeddings
      case "$objective" in
        hemolysis) resolved="${ALT_EMB_LOC}/hemolysis_chemberta/hemo_smiles_with_embeddings" ;;
        nf) resolved="${ALT_EMB_LOC}/nf_chemberta/nf_smiles_with_embeddings" ;;
        permeability_penetrance) resolved="${ALT_EMB_LOC}/permeability_chemberta/perm_smiles_with_embeddings" ;;
        permeability_pampa) resolved="${ALT_EMB_LOC}/permeability_pampa_chemberta/pampa_smiles_with_embeddings" ;;
        permeability_caco2) resolved="${ALT_EMB_LOC}/permeability_caco2_chemberta/caco2_smiles_with_embeddings" ;;
      esac
      ;;
    peptideclm)
      # The only entry registered under this tag.
      case "$objective" in
        permeability_penetrance) resolved="${ALT_EMB_LOC}/permeability_peptideclm/perm_smiles_with_embeddings" ;;
      esac
      ;;
  esac

  printf '%s\n' "$resolved"
}
# =============================================================================
# Grid driver: runs bootstrap + wall-time refit over every combination
# =============================================================================
# Number of bootstrap / wall-time steps that returned non-zero.
FAILED_STEPS=0

# Runs the bootstrap step and then the wall-time refit for one combination.
# $1=OBJECTIVE $2=WT $3=MODEL_TYPE $4=UNCERTAINTY_SCRIPT $5=UNC_MODE
# A failing step is counted in FAILED_STEPS; the remaining steps still run.
run_combination() {
  local objective=$1
  local wt=$2
  local model_type=$3
  local unc_script=$4
  local unc_mode=$5
  local dataset_path

  echo ""
  echo "-- ${objective} / ${wt} / ${model_type} --"
  run_bootstrap "$objective" "$wt" "$unc_script" "$model_type" "$unc_mode" \
    || FAILED_STEPS=$((FAILED_STEPS + 1))
  dataset_path=$(get_dataset_path "$objective" "$wt")
  run_walltime "$objective" "$wt" "$model_type" "$dataset_path" \
    || FAILED_STEPS=$((FAILED_STEPS + 1))
}

# =============================================================================
# SECTION 1 - Classification tasks
# =============================================================================
echo ""
echo "============================================================"
echo " SECTION 1: Classification bootstrap + walltime"
echo "============================================================"
CLS_MODEL_TYPES=("svm_gpu" "enet_gpu" "xgb")

# hemolysis, nf - wt + smiles + chemberta
for OBJECTIVE in "hemolysis" "nf"; do
  for WT in "wt" "smiles" "chemberta"; do
    for MODEL_TYPE in "${CLS_MODEL_TYPES[@]}"; do
      run_combination "$OBJECTIVE" "$WT" "$MODEL_TYPE" "ml_uncertainty.py" "uncertainty_prob"
    done
  done
done

# solubility, permeability_penetrance - wt + chemberta (no smiles embeddings)
# NOTE(review): get_dataset_path has no solubility/chemberta entry, so the
# wall-time refit is skipped for that pair - confirm this is intended.
for OBJECTIVE in "solubility" "permeability_penetrance"; do
  for WT in "wt" "chemberta"; do
    for MODEL_TYPE in "${CLS_MODEL_TYPES[@]}"; do
      run_combination "$OBJECTIVE" "$WT" "$MODEL_TYPE" "ml_uncertainty.py" "uncertainty_prob"
    done
  done
done

# =============================================================================
# SECTION 2 - Regression tasks (PAMPA, Caco-2)
# =============================================================================
echo ""
echo "============================================================"
echo " SECTION 2: Regression bootstrap + walltime"
echo "============================================================"
REG_MODEL_TYPES=("svr" "enet_gpu" "xgb")

for OBJECTIVE in "permeability_pampa" "permeability_caco2"; do
  for WT in "smiles" "chemberta"; do
    for MODEL_TYPE in "${REG_MODEL_TYPES[@]}"; do
      run_combination "$OBJECTIVE" "$WT" "$MODEL_TYPE" "ml_uncertainty_reg.py" "uncertainty_residual"
    done
  done
done

echo ""
echo "============================================================"
echo "All runs completed at $(date)"
echo "============================================================"

# No environment activation is visible in this script, so deactivation is
# best-effort only: it is skipped when conda is unavailable and its status is
# deliberately ignored so that it cannot become the job's exit status.
if command -v conda > /dev/null 2>&1; then
  conda deactivate || true
fi

# Make the batch job's exit status reflect the steps that were run.
if ((FAILED_STEPS > 0)); then
  echo "${FAILED_STEPS} step(s) failed; see logs in ${LOG_LOC}" >&2
  exit 1
fi