Spaces:

YashashMathur
/

aegis_training

Runtime error

YashashMathur committed 14 days ago

Commit

e945ec1

verified ·

1 Parent(s): 854a3f5

Upload folder using huggingface_hub

Files changed (4) hide show

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+aegis_env/dist/aegis_env-0.1.0-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text

Dockerfile CHANGED Viewed

@@ -1,22 +1,42 @@
-FROM pytorch/pytorch:2.3.0-cuda12.1-cudnn8-runtime
-ENV DEBIAN_FRONTEND=noninteractive
-RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
-WORKDIR /app
-# Install Unsloth and training dependencies
-RUN pip install --no-cache-dir \
-    "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git" \
-    && pip install --no-cache-dir --no-deps xformers \
-    && pip install --no-cache-dir \
-    trl peft accelerate bitsandbytes huggingface_hub safetensors
-# Copy training script and dataset
-COPY train.py .
-COPY aegis_training_data_500.json .
-EXPOSE 7860
-# -u for unbuffered stdout so logs appear in real time in HF Space console
-CMD ["python", "-u", "train.py"]

+# Training image: CUDA 12.1 devel base with Python 3.10, a per-user PyTorch/Unsloth
+# install, and the training sources copied in last so dependency layers stay cached.
+FROM nvidia/cuda:12.1.1-devel-ubuntu22.04
+# Build-time only: keeps apt non-interactive without persisting into the runtime environment.
+ARG DEBIAN_FRONTEND=noninteractive
+# Same effect as `python -u`: stdout/stderr are not buffered.
+ENV PYTHONUNBUFFERED=1
+# Install system essentials
+RUN apt-get update && apt-get install -y \
+    python3.10 \
+    python3-pip \
+    python3-dev \
+    git \
+    build-essential \
+    && rm -rf /var/lib/apt/lists/*
+# Set up Hugging Face user (u:1000) and give it /app while still root.
+# Depending on the builder, WORKDIR alone may create /app owned by root, which would
+# prevent the non-root process from writing into its working directory.
+RUN useradd -m -u 1000 user \
+    && mkdir -p /app \
+    && chown user /app
+USER user
+# pip runs as a non-root user from here on, so console scripts land in ~/.local/bin.
+ENV PATH="/home/user/.local/bin:${PATH}"
+WORKDIR /app
+# Sync PyTorch (CUDA 12.1 wheels)
+RUN pip install --no-cache-dir torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cu121
+# Install Unsloth
+RUN pip install --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
+# Install requirements (manifest copied on its own so this layer is reused until it changes)
+COPY --chown=user hf_training/requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+# Install Secondary (No-Deps)
+# Version specifiers must be quoted: in shell form an unquoted `<0.28` is parsed by
+# /bin/sh as an input redirection from a file named "0.28", not as a version constraint.
+RUN pip install --no-cache-dir --no-deps "xformers<0.28" "trl<0.13.0" peft accelerate bitsandbytes
+# Copy training files
+COPY --chown=user hf_training/ ./hf_training/
+COPY --chown=user aegis_env/ ./aegis_env/
+COPY --chown=user scripts/ ./scripts/
+COPY --chown=user aegis_training_data_500.json .
+# CMD (exec form, so python3 is PID 1 and receives stop signals directly)
+CMD ["python3", "hf_training/train.py"]

README.md CHANGED Viewed

@@ -1,16 +1,12 @@
----
-title: AEGIS Training
-emoji: 🛡️
-colorFrom: red
-colorTo: blue
-sdk: docker
-pinned: false
----
-# AEGIS Training Space
-This Space runs GRPO training for Qwen2.5-7B on the AEGIS fleet oversight task.
-**Status page is served on port 7860 — refresh to see current training step.**
-After training completes, downgrade hardware to CPU basic (free) in Space Settings.

+---
+title: Aegis Training
+emoji: 🛡️
+colorFrom: blue
+colorTo: gray
+sdk: docker
+pinned: false
+---
+# Aegis Training
+This space handles RL training for the AEGIS fleet oversight agents.

hf_training/requirements.txt ADDED Viewed

+transformers>=4.45.0
+huggingface_hub
+sentencepiece
+protobuf
+datasets
+scipy
+gradio
+safetensors