YashashMathur committed on
Commit
e945ec1
·
verified ·
1 Parent(s): 854a3f5

Upload folder using huggingface_hub

Browse files
Files changed (4) hide show
  1. .gitattributes +1 -0
  2. Dockerfile +42 -22
  3. README.md +12 -16
  4. hf_training/requirements.txt +8 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ aegis_env/dist/aegis_env-0.1.0-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
Dockerfile CHANGED
@@ -1,22 +1,42 @@
1
- FROM pytorch/pytorch:2.3.0-cuda12.1-cudnn8-runtime
2
-
3
- ENV DEBIAN_FRONTEND=noninteractive
4
- RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
5
-
6
- WORKDIR /app
7
-
8
- # Install Unsloth and training dependencies
9
- RUN pip install --no-cache-dir \
10
- "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git" \
11
- && pip install --no-cache-dir --no-deps xformers \
12
- && pip install --no-cache-dir \
13
- trl peft accelerate bitsandbytes huggingface_hub safetensors
14
-
15
- # Copy training script and dataset
16
- COPY train.py .
17
- COPY aegis_training_data_500.json .
18
-
19
- EXPOSE 7860
20
-
21
- # -u for unbuffered stdout so logs appear in real time in HF Space console
22
- CMD ["python", "-u", "train.py"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM nvidia/cuda:12.1.1-devel-ubuntu22.04
2
+
3
+ ENV DEBIAN_FRONTEND=noninteractive
4
+ ENV PYTHONUNBUFFERED=1
5
+
6
+ # Install system essentials
7
+ RUN apt-get update && apt-get install -y \
8
+ python3.10 \
9
+ python3-pip \
10
+ python3-dev \
11
+ git \
12
+ build-essential \
13
+ && rm -rf /var/lib/apt/lists/*
14
+
15
+ # Set up Hugging Face user (u:1000)
16
+ RUN useradd -m -u 1000 user
17
+ USER user
18
+ ENV PATH="/home/user/.local/bin:${PATH}"
19
+
20
+ WORKDIR /app
21
+
22
+ # Sync PyTorch
23
+ RUN pip install --no-cache-dir torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cu121
24
+
25
+ # Install Unsloth
26
+ RUN pip install --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
27
+
28
+ # Install requirements
29
+ COPY --chown=user hf_training/requirements.txt .
30
+ RUN pip install --no-cache-dir -r requirements.txt
31
+
32
+ # Install secondary packages (no deps). Specifiers are quoted so the shell does not treat "<" as input redirection. NOTE(review): xformers versions look like 0.0.x, so "<0.28" likely never excludes anything - confirm the intended bound.
33
+ RUN pip install --no-cache-dir --no-deps "xformers<0.28" "trl<0.13.0" peft accelerate bitsandbytes
34
+
35
+ # Copy training files
36
+ COPY --chown=user hf_training/ ./hf_training/
37
+ COPY --chown=user aegis_env/ ./aegis_env/
38
+ COPY --chown=user scripts/ ./scripts/
39
+ COPY --chown=user aegis_training_data_500.json .
40
+
41
+ # CMD
42
+ CMD ["python3", "hf_training/train.py"]
README.md CHANGED
@@ -1,16 +1,12 @@
1
- ---
2
- title: AEGIS Training
3
- emoji: 🛡️
4
- colorFrom: red
5
- colorTo: blue
6
- sdk: docker
7
- pinned: false
8
- ---
9
-
10
- # AEGIS Training Space
11
-
12
- This Space runs GRPO training for Qwen2.5-7B on the AEGIS fleet oversight task.
13
-
14
- **Status page is served on port 7860 — refresh to see current training step.**
15
-
16
- After training completes, downgrade hardware to CPU basic (free) in Space Settings.
 
1
+ ---
2
+ title: Aegis Training
3
+ emoji: 🛡️
4
+ colorFrom: blue
5
+ colorTo: gray
6
+ sdk: docker
7
+ pinned: false
8
+ ---
9
+
10
+ # Aegis Training
11
+
12
+ This Space handles RL training for the AEGIS fleet oversight agents.
 
 
 
 
hf_training/requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ transformers>=4.45.0
2
+ huggingface_hub
3
+ sentencepiece
4
+ protobuf
5
+ datasets
6
+ scipy
7
+ gradio
8
+ safetensors