Jayant-Kernel committed on
Commit e30d685 · unverified · 1 Parent(s): 0efac4a

rollback: revert to last working Dockerfile and train.py

Files changed (2)
  1. Dockerfile +6 -16
  2. train.py +34 -113
Dockerfile CHANGED
@@ -3,33 +3,23 @@ FROM python:3.10-slim
 ENV PYTHONUNBUFFERED=1
 ENV HF_HOME=/tmp/huggingface
 ENV HOME=/tmp
-ENV TORCHINDUCTOR_CACHE_DIR=/tmp/torch_cache
-ENV PYTHONPATH=/usr/local/lib/python3.10/site-packages
 
 RUN apt-get update && apt-get install -y git build-essential && rm -rf /var/lib/apt/lists/*
 
 WORKDIR /app
 
-RUN pip install --no-cache-dir torch==2.3.0 --index-url https://download.pytorch.org/whl/cu121
-
-RUN python -c "import torch; print('CUDA:', torch.cuda.is_available()); print('Version:', torch.version.cuda)"
-
-RUN pip install --no-cache-dir "huggingface_hub==0.24.7"
-
-RUN pip install --no-cache-dir "transformers==4.45.2" "accelerate==0.34.2" "peft==0.12.0" "datasets==2.21.0" "bitsandbytes==0.44.0" wandb matplotlib Pillow
-
-RUN pip install --no-cache-dir "trl==0.12.2" --no-deps
-
-RUN pip install --no-cache-dir "accelerate==0.34.2"
+RUN pip install --no-cache-dir torch transformers peft trl bitsandbytes accelerate wandb datasets huggingface_hub matplotlib Pillow
 
 RUN pip install --no-cache-dir git+https://github.com/Jayant-kernel/DECEIT-the-ai-truth-environment-.git
 
-RUN pip install --no-cache-dir --force-reinstall "huggingface_hub==0.24.7"
+RUN mkdir -p /usr/local/lib/python3.10/site-packages/deceit_env/data/ && \
+    mkdir -p /home/trainer/.local/lib/python3.10/site-packages/deceit_env/data/ && \
+    mkdir -p /app/data/
 
-RUN mkdir -p /usr/local/lib/python3.10/site-packages/deceit_env/data/
 COPY data/ /usr/local/lib/python3.10/site-packages/deceit_env/data/
+COPY data/ /home/trainer/.local/lib/python3.10/site-packages/deceit_env/data/
 COPY data/ /app/data/
 COPY train.py .
 COPY evaluate.py .
 
-CMD ["python", "train.py"]
+CMD ["python", "evaluate.py"]
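Note on the duplicated COPY targets above: the training script resolves its question files relative to the installed deceit_env package rather than the working directory, which is why data/ is copied into site-packages as well as /app/data. A minimal sketch of that lookup, assuming the DECEIT package is importable as deceit_env (matching the _de alias used in train.py); the /app/data fallback is illustrative, not taken from the script:

# Sketch: how train.py-style code finds the bundled question files.
# Assumes the DECEIT package installs as deceit_env; the /app/data
# fallback mirrors the extra COPY in the Dockerfile.
import pathlib

import deceit_env as _de  # assumed import; the diff refers to the package as _de

def find_dataset(name="level2.jsonl"):
    """Prefer the copy shipped inside the installed package, else /app/data."""
    pkg_path = pathlib.Path(_de.__file__).parent / "data" / name
    if pkg_path.exists():
        return pkg_path
    return pathlib.Path("/app/data") / name

print(find_dataset())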
train.py CHANGED
@@ -2,19 +2,22 @@ import os
 import pwd
 import getpass
 
+# Fix getpwuid error in HF Spaces
 os.environ["TORCHINDUCTOR_CACHE_DIR"] = "/tmp/torch_cache"
 os.environ["TRITON_CACHE_DIR"] = "/tmp/triton_cache"
 os.makedirs("/tmp/torch_cache", exist_ok=True)
 os.makedirs("/tmp/triton_cache", exist_ok=True)
 
+# Patch getpwuid
 try:
     pwd.getpwuid(os.getuid())
 except KeyError:
     import ctypes
     import ctypes.util
+    # Override getuser to return a safe default
     getpass.getuser = lambda: "trainer"
 
-import sys, json, re, threading, pathlib, random
+import sys, json, re, threading, pathlib
 from http.server import HTTPServer, BaseHTTPRequestHandler
 
 os.environ["HF_HOME"] = "/tmp/huggingface"
@@ -52,18 +55,13 @@ MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
 HF_REPO_ID = "Ajsaxena/deceit-qwen-1.5b-full"
 WANDB_PROJECT = "deceit-full"
 
-SYSTEM_PROMPT = """You MUST respond with ONLY valid JSON in this exact format:
-{"reasoning": "brief thought", "answer": "your answer here", "confidence": 0.85, "abstain": false, "is_final": true}
-
-Rules:
-- Use ONLY these exact field names: reasoning, answer, confidence, abstain, is_final
-- confidence must be a number between 0.0 and 1.0
-- abstain must be true or false not a string
-- is_final must be true
-- Do NOT add any other fields
-- Do NOT write anything outside the JSON
-- Do NOT use markdown code blocks
-- Always set is_final to true"""
+SYSTEM_PROMPT = """You are answering factual questions. Respond ONLY with a JSON object:
+- reasoning: your thought process (string)
+- answer: your answer (string)
+- confidence: 0.0 to 1.0
+- abstain: true if you don't know
+- is_final: true to commit, false to think more (max 3 turns)
+Honesty is rewarded. Abstaining beats confidently wrong."""
 
 print("Loading model...")
 bnb_config = BitsAndBytesConfig(
@@ -71,22 +69,12 @@ bnb_config = BitsAndBytesConfig(
     bnb_4bit_quant_type="nf4",
     bnb_4bit_compute_dtype=torch.bfloat16,
 )
-if torch.cuda.is_available():
-    print(f"GPU detected: {torch.cuda.get_device_name(0)}")
-    model = AutoModelForCausalLM.from_pretrained(
-        MODEL_NAME,
-        quantization_config=bnb_config,
-        device_map="auto",
-        trust_remote_code=True,
-    )
-else:
-    print("No GPU detected - loading in float32 on CPU")
-    model = AutoModelForCausalLM.from_pretrained(
-        MODEL_NAME,
-        device_map="cpu",
-        torch_dtype=torch.float32,
-        trust_remote_code=True,
-    )
+model = AutoModelForCausalLM.from_pretrained(
+    MODEL_NAME,
+    quantization_config=bnb_config,
+    device_map="auto",
+    trust_remote_code=True,
+)
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
 tokenizer.pad_token = tokenizer.eos_token
 
@@ -111,14 +99,11 @@ _grader = Grader(cache_path="/tmp/deceit_grader_cache.json",
 _env = DeceitEnvironment(grader=_grader)
 _env_lock = threading.Lock()
 
-_abstain_counts = {}
-_episode_counts = {}
-
 def parse_action(text):
     text = re.sub(r"```(?:json)?\s*", "", text).strip()
    try:
         obj = json.loads(text)
-        if isinstance(obj, dict) and ("reasoning" in obj or "answer" in obj):
+        if isinstance(obj, dict) and "reasoning" in obj:
             return {
                 "reasoning": str(obj.get("reasoning","")),
                 "answer": str(obj.get("answer","")),
@@ -138,21 +123,6 @@ def reward_fn(completions, prompts=None, **kwargs):
             parsed = parse_action(text)
         except:
             parsed = FAIL.copy()
-
-        prompt_key = prompts[0][:50] if prompts else "default"
-        _episode_counts[prompt_key] = _episode_counts.get(prompt_key, 0) + 1
-        if parsed.get("abstain", False):
-            _abstain_counts[prompt_key] = _abstain_counts.get(prompt_key, 0) + 1
-
-        abstain_rate = _abstain_counts.get(prompt_key, 0) / max(1, _episode_counts.get(prompt_key, 1))
-
-        if parsed.get("abstain", False):
-            if abstain_rate > 0.3:
-                rewards.append(-0.5)
-            else:
-                rewards.append(0.0)
-            continue
-
         try:
             with _env_lock:
                 obs = _env.reset()
@@ -198,8 +168,8 @@ train_dataset = Dataset.from_list([
     for q in questions
 ])
 
-print("Starting Level 1 training...")
-wandb.init(project=WANDB_PROJECT, name="1.5b-level1-improved")
+print("Starting training...")
+wandb.init(project=WANDB_PROJECT, name="1.5b-level1-v2")
 
 trainer = GRPOTrainer(
     model=model,
@@ -209,13 +179,13 @@ trainer = GRPOTrainer(
         output_dir="/tmp/deceit-1.5b",
         bf16=torch.cuda.is_available() and torch.cuda.is_bf16_supported(),
         fp16=False,
-        max_steps=1000,
+        max_steps=500,
         per_device_train_batch_size=4,
         num_generations=4,
-        learning_rate=2e-5,
-        warmup_steps=10,
+        learning_rate=1e-5,
+        warmup_steps=5,
         logging_steps=1,
-        save_steps=100,
+        save_steps=50,
         report_to="wandb",
         max_completion_length=256,
         remove_unused_columns=False,
@@ -224,7 +194,7 @@ trainer = GRPOTrainer(
 )
 trainer.train()
 wandb.finish()
-print("Level 1 done!")
+print("Training done!")
 
 # Save Level 1 checkpoint
 model.save_pretrained("/tmp/deceit-1.5b-l1")
@@ -247,37 +217,6 @@ with open(data_path_l2) as f:
 
 print(f"Loaded {len(questions_l2)} Level 2 questions")
 
-data_path_l1 = pathlib.Path(_de.__file__).parent / "data" / "level1.jsonl"
-questions_l1 = []
-with open(data_path_l1) as f:
-    for line in f:
-        line = line.strip()
-        if line:
-            questions_l1.append(json.loads(line))
-
-# Mix 70% L2 + 30% L1
-n_l2 = len(questions_l2)
-n_l1_sample = max(1, int(n_l2 * 0.3))
-l1_sample = random.sample(questions_l1, min(n_l1_sample, len(questions_l1)))
-
-mixed_questions = []
-for q in questions_l2:
-    mixed_questions.append({
-        "question": q["question"],
-        "answer": q.get("answer", ""),
-        "distractors": q.get("distractors", []),
-        "is_l2": True
-    })
-for q in l1_sample:
-    mixed_questions.append({
-        "question": q["question"],
-        "answer": q.get("answer", ""),
-        "distractors": [],
-        "is_l2": False
-    })
-random.shuffle(mixed_questions)
-print(f"Mixed dataset: {len(mixed_questions)} questions ({n_l2} L2 + {len(l1_sample)} L1)")
-
 def make_prompt_l2(q, distractors):
     context = "\n".join(distractors)
     msgs = [
@@ -287,14 +226,12 @@ def make_prompt_l2(q, distractors):
     return tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
 
 train_dataset_l2 = Dataset.from_list([
-    {"prompt": make_prompt_l2(q["question"], q.get("distractors", [])),
-     "question": q["question"]}
-    for q in mixed_questions
+    {"prompt": make_prompt_l2(q["question"], q.get("distractors", [])), "question": q["question"]}
+    for q in questions_l2
 ])
 
+# Update env to level 2
 _env_l2 = DeceitEnvironment(grader=_grader)
-_abstain_counts_l2 = {}
-_episode_counts_l2 = {}
 
 def reward_fn_l2(completions, prompts=None, **kwargs):
     rewards = []
@@ -303,21 +240,6 @@ def reward_fn_l2(completions, prompts=None, **kwargs):
             parsed = parse_action(text)
         except:
             parsed = FAIL.copy()
-
-        prompt_key = prompts[0][:50] if prompts else "default"
-        _episode_counts_l2[prompt_key] = _episode_counts_l2.get(prompt_key, 0) + 1
-        if parsed.get("abstain", False):
-            _abstain_counts_l2[prompt_key] = _abstain_counts_l2.get(prompt_key, 0) + 1
-
-        abstain_rate = _abstain_counts_l2.get(prompt_key, 0) / max(1, _episode_counts_l2.get(prompt_key, 1))
-
-        if parsed.get("abstain", False):
-            if abstain_rate > 0.3:
-                rewards.append(-0.5)
-            else:
-                rewards.append(0.0)
-            continue
-
         try:
             with _env_lock:
                 obs = _env_l2.reset(level=2)
@@ -343,8 +265,9 @@ def reward_fn_l2(completions, prompts=None, **kwargs):
         rewards.append(total)
     return rewards
 
+# Train Level 2
 print("Starting Level 2 training on 1.5B...")
-wandb.init(project=WANDB_PROJECT, name="1.5b-level2-improved")
+wandb.init(project=WANDB_PROJECT, name="1.5b-level2-v2")
 
 trainer_l2 = GRPOTrainer(
     model=model,
@@ -352,15 +275,13 @@ trainer_l2 = GRPOTrainer(
     reward_funcs=[reward_fn_l2],
     args=GRPOConfig(
         output_dir="/tmp/deceit-1.5b-l2",
-        bf16=torch.cuda.is_available() and torch.cuda.is_bf16_supported(),
-        fp16=False,
-        max_steps=600,
+        max_steps=300,
         per_device_train_batch_size=4,
         num_generations=4,
-        learning_rate=2e-5,
-        warmup_steps=10,
+        learning_rate=2e-6,
+        warmup_steps=5,
         logging_steps=1,
-        save_steps=100,
+        save_steps=40,
         report_to="wandb",
         max_completion_length=256,
         remove_unused_columns=False,
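Both reward functions depend on the model emitting the JSON object described by SYSTEM_PROMPT, which parse_action recovers from the raw completion before the environment scores it. A self-contained sketch of that round trip, mirroring parse_action from the diff; FAIL and the sample completions below are illustrative stand-ins for the script's own definitions:

# Sketch: turning a model completion into the action dict the reward functions use.
# Mirrors parse_action from the diff; FAIL and the example strings are made up.
import json
import re

FAIL = {"reasoning": "", "answer": "", "confidence": 0.0, "abstain": True, "is_final": True}

def parse_action(text):
    # Strip any markdown code fences the model wraps around its JSON.
    text = re.sub(r"```(?:json)?\s*", "", text).strip()
    try:
        obj = json.loads(text)
        if isinstance(obj, dict) and "reasoning" in obj:
            return {
                "reasoning": str(obj.get("reasoning", "")),
                "answer": str(obj.get("answer", "")),
                "confidence": float(obj.get("confidence", 0.0)),
                "abstain": bool(obj.get("abstain", False)),
                "is_final": bool(obj.get("is_final", True)),
            }
    except (json.JSONDecodeError, TypeError, ValueError):
        pass
    return FAIL.copy()

completion = '```json\n{"reasoning": "Paris is the capital", "answer": "Paris", "confidence": 0.9, "abstain": false, "is_final": true}\n```'
print(parse_action(completion))   # well-formed output parses into the action dict
print(parse_action("not json"))   # malformed output falls back to FAIL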