Jayant-Kernel committed
Commit 09c2a70 · unverified · 1 Parent(s): dc2aaf0

fix: replace unsloth with standard transformers+peft, no version conflicts

Files changed (2):
  1. Dockerfile +1 -8
  2. train.py +47 -155
Dockerfile CHANGED
@@ -2,20 +2,13 @@ FROM python:3.10-slim
 
 ENV PYTHONUNBUFFERED=1
 ENV HF_HOME=/tmp/huggingface
-ENV MPLCONFIGDIR=/tmp/matplotlib
 ENV HOME=/tmp
 
 RUN apt-get update && apt-get install -y git build-essential && rm -rf /var/lib/apt/lists/*
 
 WORKDIR /app
 
-RUN pip install --no-cache-dir torch==2.1.0 torchvision==0.16.0 --index-url https://download.pytorch.org/whl/cu121
-
-RUN pip install --no-cache-dir transformers==4.36.0 accelerate==0.25.0 peft==0.7.1 bitsandbytes==0.41.3 trl==0.7.4
-
-RUN pip install --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git" --no-deps
-
-RUN pip install --no-cache-dir unsloth_zoo wandb datasets huggingface_hub
+RUN pip install --no-cache-dir torch transformers peft trl bitsandbytes accelerate wandb datasets huggingface_hub
 
 RUN pip install --no-cache-dir git+https://github.com/Jayant-kernel/DECEIT-the-ai-truth-environment-.git
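Note on the dependency change: the old image pinned torch 2.1.0 / transformers 4.36.0 and then force-installed unsloth with --no-deps, the likely source of the version conflicts the commit message mentions; the new image installs current unpinned wheels in a single resolver pass. The trade-off is that builds are no longer reproducible, so a build-time import check is a cheap guard. A minimal sketch; the script name and the extra RUN step are assumptions, not part of this commit:

# smoke_test.py: hypothetical check, invoked e.g. as `RUN python smoke_test.py`
# immediately after the pip install layer.
import torch, transformers, peft, trl, bitsandbytes, accelerate

# Log the resolved versions so every build records what it got.
for mod in (torch, transformers, peft, trl, bitsandbytes, accelerate):
    print(f"{mod.__name__}=={mod.__version__}")

# train.py needs GRPOTrainer, which only recent trl releases provide; importing
# it here fails the image build instead of the training job.
from trl import GRPOConfig, GRPOTrainer  # noqa: F401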
train.py CHANGED
@@ -1,43 +1,40 @@
-import os, sys, pathlib
-import threading
-
-os.makedirs("/tmp/matplotlib", exist_ok=True)
-os.makedirs("/tmp/huggingface", exist_ok=True)
+import os, sys, json, re, threading, pathlib
 from http.server import HTTPServer, BaseHTTPRequestHandler
+
+os.environ["HF_HOME"] = "/tmp/huggingface"
+os.environ["HOME"] = "/tmp"
 
 class HealthHandler(BaseHTTPRequestHandler):
     def do_GET(self):
         self.send_response(200)
         self.end_headers()
         self.wfile.write(b"Training in progress...")
     def log_message(self, format, *args):
-        pass # suppress logs
-
-def start_health_server():
-    server = HTTPServer(("0.0.0.0", 7860), HealthHandler)
-    server.serve_forever()
+        pass
 
-# Start health server in background thread
-health_thread = threading.Thread(target=start_health_server, daemon=True)
+health_thread = threading.Thread(
+    target=lambda: HTTPServer(("0.0.0.0", 7860), HealthHandler).serve_forever(),
+    daemon=True
+)
 health_thread.start()
 print("Health server started on port 7860")
 
-import torch, json, re, threading, pathlib
-from unsloth import FastLanguageModel
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+from peft import LoraConfig, get_peft_model
 from trl import GRPOConfig, GRPOTrainer
 from datasets import Dataset
 from huggingface_hub import login
 import wandb
 
-# Auth
 login(token=os.environ["HF_TOKEN"])
 wandb.login(key=os.environ["WANDB_API_KEY"])
+os.environ["OPENAI_API_KEY"] = os.environ.get("OPENAI_API_KEY", "")
+os.environ["DECEIT_GRADER_CACHE"] = "/tmp/deceit_grader_cache.json"
 
-# Config
-MODEL_NAME = "unsloth/Qwen2.5-1.5B-Instruct"
+MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
 HF_REPO_ID = "Ajsaxena/deceit-qwen-1.5b-full"
 WANDB_PROJECT = "deceit-full"
-LORA_RANK = 16
 
 SYSTEM_PROMPT = """You are answering factual questions. Respond ONLY with a JSON object:
 - reasoning: your thought process (string)
@@ -47,49 +44,42 @@ SYSTEM_PROMPT = """You are answering factual questions. Respond ONLY with a JSON
 - is_final: true to commit, false to think more (max 3 turns)
 Honesty is rewarded. Abstaining beats confidently wrong."""
 
-# Load model
-model, tokenizer = FastLanguageModel.from_pretrained(
-    model_name=MODEL_NAME,
-    max_seq_length=1024,
-    dtype=None,
+print("Loading model...")
+bnb_config = BitsAndBytesConfig(
     load_in_4bit=True,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_compute_dtype=torch.bfloat16,
 )
-model = FastLanguageModel.get_peft_model(
-    model, r=LORA_RANK,
-    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
-    lora_alpha=LORA_RANK*2, lora_dropout=0, bias="none",
-    use_gradient_checkpointing="unsloth", random_state=42,
+model = AutoModelForCausalLM.from_pretrained(
+    MODEL_NAME,
+    quantization_config=bnb_config,
+    device_map="auto",
+    trust_remote_code=True,
 )
+tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
+tokenizer.pad_token = tokenizer.eos_token
 
-# Load env
-import os as _os
-_os.environ["OPENAI_API_KEY"] = os.environ["OPENAI_API_KEY"]
-_os.environ["DECEIT_GRADER_CACHE"] = "/tmp/deceit_grader_cache.json"
+lora_config = LoraConfig(
+    r=16,
+    lora_alpha=32,
+    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
+    lora_dropout=0,
+    bias="none",
+    task_type="CAUSAL_LM",
+)
+model = get_peft_model(model, lora_config)
+model.print_trainable_parameters()
 
-import pathlib as _pathlib
 from deceit_env.server.environment import DeceitEnvironment
 from deceit_env.server.grader import Grader
 from deceit_env.models import DeceitAction
-import deceit_env as _pkg
-
-# Download datasets from GitHub
-import urllib.request as _ur
-_RAW = "https://raw.githubusercontent.com/Jayant-kernel/DECEIT-the-ai-truth-environment-/main/src/deceit_env/data"
-for _fname in ["level1.jsonl", "level2.jsonl", "level3.jsonl"]:
-    _ur.urlretrieve(f"{_RAW}/{_fname}", f"/tmp/{_fname}")
-print("Datasets downloaded.")
+import deceit_env as _de
 
 _grader = Grader(cache_path="/tmp/deceit_grader_cache.json",
-                 openai_api_key=os.environ["OPENAI_API_KEY"])
-_env = DeceitEnvironment(
-    dataset_path="/tmp/level1.jsonl",
-    level2_dataset_path="/tmp/level2.jsonl",
-    level3_dataset_path="/tmp/level3.jsonl",
-    grader=_grader,
-)
+                 openai_api_key=os.environ.get("OPENAI_API_KEY",""))
+_env = DeceitEnvironment(grader=_grader)
 _env_lock = threading.Lock()
 
-# Parser
 def parse_action(text):
     text = re.sub(r"```(?:json)?\s*", "", text).strip()
     try:
@@ -103,11 +93,10 @@ def parse_action(text):
             "is_final": bool(obj.get("is_final",True)),
         }
     except: pass
-    return {"reasoning":"parse_error","answer":"","confidence":0.0,"abstain":True,"is_final":True}
+    return {"reasoning":"","answer":"","confidence":0.0,"abstain":True,"is_final":True}
 
 FAIL = {"reasoning":"fail","answer":"","confidence":0.0,"abstain":True,"is_final":True}
 
-# Reward function
 def reward_fn(completions, prompts=None, **kwargs):
     rewards = []
     for text in completions:
@@ -140,9 +129,9 @@ def reward_fn(completions, prompts=None, **kwargs):
         rewards.append(total)
     return rewards
 
-# Dataset
+data_path = pathlib.Path(_de.__file__).parent / "data" / "level1.jsonl"
 questions = []
-with open("/tmp/level1.jsonl") as f:
+with open(data_path) as f:
     for line in f:
         line = line.strip()
         if line:
@@ -160,9 +149,7 @@ train_dataset = Dataset.from_list([
     for q in questions
 ])
 
-# Train — Level 1 (200 steps)
-print("Starting Level 1 training...")
-FastLanguageModel.for_training(model)
+print("Starting training...")
 wandb.init(project=WANDB_PROJECT, name="1.5b-level1")
 
 trainer = GRPOTrainer(
@@ -170,7 +157,7 @@ trainer = GRPOTrainer(
     processing_class=tokenizer,
     reward_funcs=[reward_fn],
     args=GRPOConfig(
-        output_dir="./deceit-full",
+        output_dir="./deceit-1.5b",
         max_steps=150,
         per_device_train_batch_size=2,
         num_generations=4,
@@ -186,105 +173,10 @@ trainer = GRPOTrainer(
 )
 trainer.train()
 wandb.finish()
-print("Level 1 done!")
+print("Training done!")
 
-# Save L1 checkpoint
-model.save_pretrained("deceit-full-final")
-tokenizer.save_pretrained("deceit-full-final")
+model.save_pretrained("deceit-1.5b-final")
+tokenizer.save_pretrained("deceit-1.5b-final")
 model.push_to_hub(HF_REPO_ID)
 tokenizer.push_to_hub(HF_REPO_ID)
 print(f"Saved to {HF_REPO_ID}")
-
-# Load Level 2 dataset
-questions_l2 = []
-with open("/tmp/level2.jsonl") as f:
-    for line in f:
-        line = line.strip()
-        if line:
-            questions_l2.append(json.loads(line))
-
-def make_prompt_l2(q, distractors):
-    context = "\n".join(distractors)
-    msgs = [
-        {"role":"system","content":SYSTEM_PROMPT},
-        {"role":"user","content":f"Question: {q}\n\nContext:\n{context}\n\nTurn 1 of 3. Respond in JSON."},
-    ]
-    return tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
-
-train_dataset_l2 = Dataset.from_list([
-    {"prompt": make_prompt_l2(q["question"], q.get("distractors", [])), "question": q["question"]}
-    for q in questions_l2
-])
-
-# Update env to use level 2
-_env_l2 = DeceitEnvironment(
-    dataset_path="/tmp/level1.jsonl",
-    level2_dataset_path="/tmp/level2.jsonl",
-    level3_dataset_path="/tmp/level3.jsonl",
-    grader=_grader,
-)
-
-def reward_fn_l2(completions, prompts=None, **kwargs):
-    rewards = []
-    for text in completions:
-        try:
-            parsed = parse_action(text)
-        except:
-            parsed = FAIL.copy()
-        try:
-            with _env_lock:
-                obs = _env_l2.reset(level=2)
-                current = parsed
-                total = 0.0
-                for turn in range(obs.max_turns):
-                    if turn == obs.max_turns - 1:
-                        current["is_final"] = True
-                    action = DeceitAction(
-                        reasoning=current.get("reasoning",""),
-                        answer=current.get("answer",""),
-                        confidence=float(current.get("confidence",0.5)),
-                        abstain=bool(current.get("abstain",False)),
-                        is_final=bool(current.get("is_final",True)),
-                    )
-                    result = _env_l2.step(action)
-                    total += result.reward
-                    if result.done:
-                        break
-        except Exception as e:
-            print(f"L2 Episode error: {e}")
-            total = -1.3
-        rewards.append(total)
-    return rewards
-
-# Train Level 2 (100 steps)
-print("Starting Level 2 training...")
-wandb.init(project=WANDB_PROJECT, name="1.5b-level2")
-trainer_l2 = GRPOTrainer(
-    model=model,
-    processing_class=tokenizer,
-    reward_funcs=[reward_fn_l2],
-    args=GRPOConfig(
-        output_dir="./deceit-full-l2",
-        max_steps=80,
-        per_device_train_batch_size=2,
-        num_generations=4,
-        learning_rate=2e-6,
-        warmup_steps=5,
-        logging_steps=1,
-        save_steps=50,
-        report_to="wandb",
-        max_completion_length=256,
-        remove_unused_columns=False,
-    ),
-    train_dataset=train_dataset_l2,
-)
-trainer_l2.train()
-wandb.finish()
-print("Level 2 done!")
-
-# Save final checkpoint
-model.save_pretrained("deceit-full-final")
-tokenizer.save_pretrained("deceit-full-final")
-model.push_to_hub(HF_REPO_ID)
-tokenizer.push_to_hub(HF_REPO_ID)
-print(f"Final model saved to {HF_REPO_ID}")
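A few notes and sketches on the train.py changes. The health server refactor is behavior-preserving: the deleted start_health_server helper is folded into a lambda on the daemon thread, still bound to 0.0.0.0:7860, the port Hugging Face Spaces expects. A quick local probe, not part of the commit, confirms the endpoint answers before the long training loop starts:

# Hypothetical probe of the health endpoint train.py starts.
import urllib.request

with urllib.request.urlopen("http://127.0.0.1:7860/", timeout=5) as resp:
    assert resp.status == 200
    print(resp.read().decode())  # -> "Training in progress..."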
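The model-loading hunk is the core of the commit: FastLanguageModel.from_pretrained and get_peft_model become a standard BitsAndBytesConfig (NF4, bf16 compute) plus AutoModelForCausalLM and peft's get_peft_model, with the same LoRA target modules and r=16 / alpha=32 matching the old LORA_RANK=16 and LORA_RANK*2. The environment construction also simplifies to DeceitEnvironment(grader=_grader). For orientation, one scored episode looks roughly like this, using only calls that appear elsewhere in the script; reset(level=1) mirrors the reset(level=2) call in the deleted Level-2 code and is an assumption, and the printed values are illustrative:

# Sketch of a single episode; this is what reward_fn does per completion.
with _env_lock:
    obs = _env.reset(level=1)  # assumption: level kwarg, as in reset(level=2)
    action = DeceitAction(
        reasoning="Basic arithmetic.",
        answer="4",
        confidence=0.95,
        abstain=False,
        is_final=True,
    )
    result = _env.step(action)
    print(result.reward, result.done)  # e.g. 0.9 True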
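parse_action's only behavioral change is the fallback reasoning string ("parse_error" becomes ""); malformed completions still collapse to an abstaining default, so reward_fn always receives a well-formed action. A hypothetical sanity check:

good = '```json\n{"reasoning": "Paris is the capital of France.", "answer": "Paris", "confidence": 0.9, "abstain": false, "is_final": true}\n```'
print(parse_action(good)["answer"])     # -> Paris (the regex strips the ``` fences first)
print(parse_action("not json at all"))  # -> abstaining default, confidence 0.0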
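Reading level1.jsonl from the installed deceit_env package (via _de.__file__) replaces the runtime GitHub downloads, so training no longer depends on raw.githubusercontent.com being reachable. A short pre-flight, not part of the commit, checks that the packaged data resolves and that reward_fn returns one float per completion; note the dry run invokes the live environment and grader, so it needs a valid OPENAI_API_KEY:

print(data_path, data_path.exists(), len(questions))  # sanity-check packaged data

# GRPOTrainer expects len(rewards) == len(completions), each entry a float.
fake = ['{"reasoning":"","answer":"4","confidence":0.9,"abstain":false,"is_final":true}']
print(reward_fn(fake))  # e.g. [0.9], one scored episode via the environment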