File size: 11,220 Bytes
5551585
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
#!/usr/bin/env bash
# ═══════════════════════════════════════════════════════════════════════════
# setup.sh — Medical VQA Environment Setup
# Hỗ trợ: Vast.ai (CUDA), Google Colab, local macOS (CPU/MPS)
#
# Cách dùng:
#   chmod +x setup.sh && bash setup.sh
#   bash setup.sh --colab        # Google Colab mode (skip git config)
#   bash setup.sh --offline      # Offline mode (không sync WandB)
#   bash setup.sh --skip-nltk    # Bỏ qua download NLTK data
# ═══════════════════════════════════════════════════════════════════════════

# Strict mode: abort on errors, on unset variables and on failures inside pipelines.
set -euo pipefail

# ── Parse flags ──────────────────────────────────────────────────────────────
# Mode switches; each one stays 0 unless its flag is passed on the command line.
# NOTE(review): COLAB_MODE is assigned here but not read anywhere in the visible
# part of this script — confirm whether a consumer is still planned.
COLAB_MODE=0
OFFLINE_MODE=0
SKIP_NLTK=0

#######################################
# Translate command-line flags into the mode switches above.
# Globals:   COLAB_MODE, OFFLINE_MODE, SKIP_NLTK (written)
# Arguments: the script's command-line arguments
# Outputs:   one warning per unrecognised argument, on stderr
# Returns:   0 always (unknown arguments are reported, not fatal)
#######################################
parse_flags() {
  local arg
  for arg in "$@"; do
    case "$arg" in
      --colab)     COLAB_MODE=1 ;;
      --offline)   OFFLINE_MODE=1 ;;
      --skip-nltk) SKIP_NLTK=1 ;;
      # A misspelled flag used to be dropped silently; make it visible.
      *) printf '[WARN]  Unknown option ignored: %s\n' "$arg" >&2 ;;
    esac
  done
}

parse_flags "$@"

# ── Colors ───────────────────────────────────────────────────────────────────
# ANSI colour escapes, stored in backslash form and expanded by printf's %b.
GREEN='\033[0;32m'; YELLOW='\033[1;33m'; RED='\033[0;31m'; NC='\033[0m'

# Logging helpers. The message is printed through %s, so backslashes inside it
# (e.g. in a path) are shown verbatim instead of being interpreted as escapes.
# info  -> stdout
# warn  -> stderr
# error -> stderr, then terminates the script with status 1
info()  { printf '%b[INFO]%b  %s\n' "$GREEN" "$NC" "$*"; }
warn()  { printf '%b[WARN]%b  %s\n' "$YELLOW" "$NC" "$*" >&2; }
error() { printf '%b[ERROR]%b %s\n' "$RED" "$NC" "$*" >&2; exit 1; }

# Opening banner: one printf call, one argument per output line
# (the empty arguments produce the blank lines above and below).
printf '%s\n' \
  "" \
  "════════════════════════════════════════════════════════════" \
  "  πŸ₯  Medical VQA β€” Environment Setup" \
  "  Project: DL Final 523H0173 & 523H0178" \
  "════════════════════════════════════════════════════════════" \
  ""

# ── 1. Python version check ──────────────────────────────────────────────────
# Locate an interpreter, preferring python3. Without the explicit fallback the
# script would stop under `set -e` with no explanation when neither exists.
PYTHON=$(command -v python3 || command -v python) \
  || error "No python3/python interpreter found on PATH"

# Ask the interpreter itself for "major.minor". The earlier `grep -oP` parse
# needs GNU grep; BSD grep on macOS has no -P and aborted the script here.
PY_VER=$("$PYTHON" -c 'import sys; print("%d.%d" % sys.version_info[:2])')
PY_MAJOR=${PY_VER%%.*}
PY_MINOR=${PY_VER#*.}

info "Python $PY_VER tαΊ‘i: $("$PYTHON" -c 'import sys; print(sys.executable)')"

# Require Python 3.10 or newer.
if (( PY_MAJOR < 3 || (PY_MAJOR == 3 && PY_MINOR < 10) )); then
  error "CαΊ§n Python β‰₯ 3.10 (hiện tαΊ‘i: $PY_VER)"
fi

# ── 2. GPU detection ─────────────────────────────────────────────────────────
# Probe torch for CUDA. Any failure of the probe (its stderr is discarded) is
# treated the same as "no GPU".
CUDA_AVAILABLE=$(
  $PYTHON -c "import torch; print(torch.cuda.is_available())" 2>/dev/null \
    || echo "False"
)

if [[ "$CUDA_AVAILABLE" != "True" ]]; then
  warn "KhΓ΄ng phΓ‘t hiện CUDA GPU β€” training sαΊ½ rαΊ₯t chαΊ­m trΓͺn CPU"
else
  # Name and total memory (in units of 1e9 bytes, one decimal) of device 0;
  # each value has its own placeholder if the query fails.
  GPU_NAME=$(
    $PYTHON -c "import torch; print(torch.cuda.get_device_name(0))" 2>/dev/null \
      || echo "Unknown"
  )
  VRAM=$(
    $PYTHON -c "import torch; print(round(torch.cuda.get_device_properties(0).total_memory/1e9,1))" 2>/dev/null \
      || echo "?"
  )
  info "GPU: $GPU_NAME | VRAM: ${VRAM}GB"
fi

# ── 3. Install pip packages ──────────────────────────────────────────────────
info "CΓ i Δ‘αΊ·t dependencies tα»« requirements.txt..."

# Absolute directory of this script; requirements.txt is expected next to it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REQ_FILE="$SCRIPT_DIR/requirements.txt"

# Nothing to install from if the requirements file is missing.
[[ -f "$REQ_FILE" ]] || error "KhΓ΄ng tΓ¬m thαΊ₯y $REQ_FILE"

# Bring pip itself up to date first.
$PYTHON -m pip install --upgrade pip --quiet

# Try a quiet install; if it fails, repeat without --quiet so pip's own
# output is visible.
if ! $PYTHON -m pip install -r "$REQ_FILE" --quiet; then
  warn "CΓ i Δ‘αΊ·t silent thαΊ₯t bαΊ‘i, thα»­ vα»›i verbose..."
  $PYTHON -m pip install -r "$REQ_FILE"
fi

# wandb is installed separately with a minimum-version constraint.
$PYTHON -m pip install "wandb>=0.16.0" --quiet
info "βœ… Dependencies Δ‘Γ£ cΓ i xong"

# ── 4. NLTK data download ─────────────────────────────────────────────────────
# Best-effort download of the NLTK corpora; skipped entirely with --skip-nltk.
# A failed package is reported but never aborts the setup.
if [[ "$SKIP_NLTK" -eq 0 ]]; then
  info "TαΊ£i NLTK data (punkt, wordnet)..."
  # Quoted heredoc delimiter: the shell passes the Python source through untouched.
  "$PYTHON" - <<'PYEOF'
import ssl

import nltk

# SECURITY NOTE(review): this switches HTTPS certificate verification off for
# the downloads below. Kept because the original script does it; confirm it is
# still needed on the target machines.
try:
    _unverified_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _unverified_context

failed = []
for pkg in ['punkt', 'punkt_tab', 'wordnet', 'averaged_perceptron_tagger', 'stopwords']:
    try:
        # nltk.download() signals most failures by returning False rather than
        # raising, so the return value has to be checked as well.
        if not nltk.download(pkg, quiet=True):
            failed.append(pkg)
            print(f'  [WARN] NLTK {pkg}: download failed')
    except Exception as e:
        failed.append(pkg)
        print(f'  [WARN] NLTK {pkg}: {e}')

if failed:
    print(f"  [WARN] NLTK data incomplete: {', '.join(failed)}")
else:
    print('  NLTK data OK')
PYEOF
fi

# ── 5. Python path configuration ─────────────────────────────────────────────
info "CαΊ₯u hΓ¬nh Python path..."

# Write a .pth file containing the project root into a site-packages
# directory. The first site.getsitepackages() entry is preferred; if that
# lookup fails, the per-user site directory is used instead.
SITE_PACKAGES=$(
  "$PYTHON" -c "import site; print(site.getsitepackages()[0])" 2>/dev/null \
    || "$PYTHON" -c "import site; print(site.getusersitepackages())"
)
PTH_FILE="$SITE_PACKAGES/medical_vqa.pth"

# Explicit if/else: with `A && B || C` the warning would also fire whenever
# the success message itself failed to print.
if printf '%s\n' "$SCRIPT_DIR" > "$PTH_FILE"; then
  info "βœ… Path cαΊ₯u hΓ¬nh tαΊ‘i: $PTH_FILE"
else
  warn "KhΓ΄ng thể ghi vΓ o site-packages, thα»­ export PYTHONPATH thα»§ cΓ΄ng."
fi

# Export PYTHONPATH for this script and the Python processes it starts below.
# (When run as `bash setup.sh` this does not reach the caller's shell.)
# The separator is added only when PYTHONPATH already has a value: a trailing
# ':' yields an empty entry, which Python treats as the current directory.
export PYTHONPATH="$SCRIPT_DIR${PYTHONPATH:+:$PYTHONPATH}"
info "PYTHONPATH = $PYTHONPATH"

# ── 6. .env file ─────────────────────────────────────────────────────────────
ENV_FILE="$SCRIPT_DIR/.env"
ENV_EXAMPLE="$SCRIPT_DIR/.env.example"

# First run: seed .env from the example template when only the template exists.
if [[ ! -f "$ENV_FILE" && -f "$ENV_EXAMPLE" ]]; then
  cp "$ENV_EXAMPLE" "$ENV_FILE"
  warn "Đã tαΊ‘o .env tα»« .env.example β€” HΓ£y Δ‘iền WANDB_API_KEY!"
fi

# Load .env with auto-export switched on, so every assignment in it becomes an
# environment variable. Comment lines and blank lines are filtered out first;
# errors while sourcing are deliberately silenced and ignored.
if [[ -f "$ENV_FILE" ]]; then
  set -o allexport
  . <(grep -v '^\s*#' "$ENV_FILE" | grep -v '^\s*$') 2>/dev/null || true
  set +o allexport
  info ".env Δ‘Γ£ được load"
fi

# ── 7. WandB login ───────────────────────────────────────────────────────────
# Three cases: --offline forces WANDB_MODE=offline; otherwise log in when an
# API key is available; otherwise only explain how to provide one.
if [[ "$OFFLINE_MODE" -eq 1 ]]; then
  export WANDB_MODE=offline
  info "WandB: OFFLINE mode (sync sau bαΊ±ng: wandb sync)"
elif [[ -n "${WANDB_API_KEY:-}" ]]; then
  # NOTE(review): the key is passed as a command-line argument and is therefore
  # visible in the process list while the login runs.
  # Explicit if/else instead of `A && B || C`, so the failure warning depends
  # only on the login command's exit status.
  if "$PYTHON" -m wandb login "$WANDB_API_KEY" --relogin --quiet 2>/dev/null; then
    info "βœ… WandB logged in (entity: SpringWang08)"
  else
    warn "WandB login thαΊ₯t bαΊ‘i β€” kiểm tra WANDB_API_KEY"
  fi
else
  warn "WANDB_API_KEY chΖ°a được set β€” WandB sαΊ½ bα»‹ bỏ qua khi training"
  warn "  Set bαΊ±ng: export WANDB_API_KEY=your_key"
  warn "  HoαΊ·c Δ‘iền vΓ o file .env"
fi

# ── 8. HuggingFace login ─────────────────────────────────────────────────────
# Log in to the HuggingFace Hub when HF_TOKEN is set; a failed login is only a
# warning.
if [[ -n "${HF_TOKEN:-}" ]]; then
  # The token is handed over through the environment and read with os.environ.
  # Splicing it into the Python source broke on tokens containing a quote or
  # backslash and allowed code injection through the token value.
  if HF_TOKEN="$HF_TOKEN" "$PYTHON" -c "import os; from huggingface_hub import login; login(token=os.environ['HF_TOKEN'], add_to_git_credential=False)" 2>/dev/null; then
    info "βœ… HuggingFace logged in"
  else
    warn "HF login thαΊ₯t bαΊ‘i β€” dataset cΓ΄ng khai vαΊ«n tαΊ£i được"
  fi
else
  warn "HF_TOKEN chΖ°a được set (khΓ΄ng cαΊ§n nαΊΏu dataset lΓ  public)"
fi

# ── 9. Create required directories ───────────────────────────────────────────
info "TαΊ‘o thΖ° mα»₯c dα»± Γ‘n..."

# Sub-directories (relative to the script directory) created if absent.
project_dirs=(
  checkpoints
  logs/history
  results/charts
  data
  scripts
)
for subdir in "${project_dirs[@]}"; do
  mkdir -p "$SCRIPT_DIR/$subdir"
done

info "βœ… ThΖ° mα»₯c sαΊ΅n sΓ ng"

# ── 10. Smoke test import ─────────────────────────────────────────────────────
# Import every third-party dependency once. If any import fails, the Python
# process exits with status 1, which aborts the setup under `set -e`.
info "Kiểm tra imports..."
"$PYTHON" - <<'PYEOF'
import sys, importlib

ok, fail = [], []
# (module to import, display name used in the report)
checks = [
    ("torch",             "PyTorch"),
    ("torchvision",       "TorchVision"),
    ("transformers",      "Transformers"),
    ("datasets",          "HF Datasets"),
    ("peft",              "PEFT (LoRA)"),
    ("trl",               "TRL (SFT/DPO)"),
    ("wandb",             "WandB"),
    ("nltk",              "NLTK"),
    ("bert_score",        "BERTScore"),
    ("rouge_score",       "ROUGE"),
    ("sklearn",           "Scikit-learn"),
    ("matplotlib",        "Matplotlib"),
    ("yaml",              "PyYAML"),
    ("dotenv",            "python-dotenv"),
    ("cv2",               "OpenCV"),
]
for mod, name in checks:
    try:
        importlib.import_module(mod)
        ok.append(name)
    except Exception as e:
        # A package can be installed yet fail to import with something other
        # than ImportError (e.g. OSError from a native library). Catch those
        # too so the summary is always printed, and include the reason.
        fail.append(f"{name} ({type(e).__name__}: {e})")

print(f"  βœ… OK ({len(ok)}): {', '.join(ok)}")
if fail:
    print(f"  ❌ MISSING ({len(fail)}): {', '.join(fail)}")
    sys.exit(1)
PYEOF

# ── 11. Check src modules ────────────────────────────────────────────────────
# Try to import each project module and print a summary. Failures are listed
# with their error text, but the Python process still exits normally.
info "Kiểm tra src modules..."
$PYTHON - <<'PYEOF'
import importlib

MODULES = (
    "src.models.medical_vqa_model",
    "src.models.transformer_decoder",
    "src.engine.trainer",
    "src.engine.medical_eval",
    "src.data.medical_dataset",
    "src.utils.text_utils",
    "src.utils.translator",
)

passed, failed = [], []
for dotted in MODULES:
    # Only the last path component is shown in the report.
    leaf = dotted.rsplit(".", 1)[-1]
    try:
        importlib.import_module(dotted)
    except Exception as exc:
        failed.append(f"{leaf} ({exc})")
    else:
        passed.append(leaf)

print(f"  βœ… src OK ({len(passed)}): {', '.join(passed)}")
if failed:
    print(f"  ❌ src FAIL ({len(failed)}): {', '.join(failed)}")
PYEOF

# ── Done ─────────────────────────────────────────────────────────────────────
# Closing banner with the suggested next commands. The quoted heredoc
# delimiter keeps the text literal (no shell expansion).
cat <<'DONE_BANNER'

════════════════════════════════════════════════════════════
  βœ…  Setup hoΓ n tαΊ₯t!

  TiαΊΏp theo:
    export WANDB_API_KEY=your_key    # nαΊΏu chΖ°a cΓ³
    python train_medical.py --variant A1
    python train_medical.py --variant A2
    python train_medical.py --variant B1
    python train_medical.py --variant B2
    python train_medical.py --variant DPO

  So sΓ‘nh 5 model sau khi train xong:
    python scripts/compare_models.py
════════════════════════════════════════════════════════════

DONE_BANNER