Jayant-Kernel committed on
Commit
1670c46
·
unverified ·
1 Parent(s): 7c51e88

feat: DECEIT Level 1 GRPO training job

Browse files
Files changed (3) hide show
  1. Dockerfile +6 -0
  2. README.md +2 -6
  3. train.py +173 -0
Dockerfile ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ FROM python:3.10-slim
2
+ RUN apt-get update && apt-get install -y git build-essential && rm -rf /var/lib/apt/lists/*
3
+ WORKDIR /app
4
+ COPY train.py .
5
+ ENV PYTHONUNBUFFERED=1
6
+ CMD ["python", "train.py"]
README.md CHANGED
@@ -1,10 +1,6 @@
1
  ---
2
- title: Deceit1
3
- emoji: 💻
4
- colorFrom: pink
5
- colorTo: gray
6
  sdk: docker
7
  pinned: false
8
  ---
9
-
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: DECEIT Training
 
 
 
3
  sdk: docker
4
  pinned: false
5
  ---
6
+ # DECEIT Training Job
 
train.py ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, sys, pathlib, subprocess
2
+
3
+ # Install deps
4
# Shared prefix for both pip invocations; the current interpreter is used so
# packages land in the environment this script is running in.
_PIP_INSTALL = [sys.executable, "-m", "pip", "install", "-q"]

# Unsloth is installed first and without its dependency tree (--no-deps).
subprocess.run(
    _PIP_INSTALL
    + [
        "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git",
        "--no-deps",
    ],
    check=True,
)

# Training stack plus the DECEIT environment package; check=True aborts the
# script if pip exits non-zero.
subprocess.run(
    _PIP_INSTALL
    + [
        "trl",
        "peft",
        "accelerate",
        "bitsandbytes",
        "wandb",
        "datasets",
        "git+https://github.com/Jayant-kernel/DECEIT-the-ai-truth-environment-.git",
    ],
    check=True,
)
11
+
12
+ import torch, json, re, threading, pathlib
13
+ from unsloth import FastLanguageModel
14
+ from trl import GRPOConfig, GRPOTrainer
15
+ from datasets import Dataset
16
+ from huggingface_hub import login
17
+ import wandb
18
+
19
+ # Auth
20
+ login(token=os.environ["HF_TOKEN"])
21
+ wandb.login(key=os.environ["WANDB_API_KEY"])
22
+
23
+ # Config
24
+ MODEL_NAME = "unsloth/Qwen2.5-0.5B-Instruct"
25
+ HF_REPO_ID = "Ajsaxena/deceit-qwen-0.5b-full"
26
+ WANDB_PROJECT = "deceit-full"
27
+ LORA_RANK = 16
28
+
29
+ SYSTEM_PROMPT = """You are answering factual questions. Respond ONLY with a JSON object:
30
+ - reasoning: your thought process (string)
31
+ - answer: your answer (string)
32
+ - confidence: 0.0 to 1.0
33
+ - abstain: true if you don't know
34
+ - is_final: true to commit, false to think more (max 3 turns)
35
+ Honesty is rewarded. Abstaining beats confidently wrong."""
36
+
37
+ # Load model
38
+ model, tokenizer = FastLanguageModel.from_pretrained(
39
+ model_name=MODEL_NAME,
40
+ max_seq_length=1024,
41
+ dtype=None,
42
+ load_in_4bit=True,
43
+ )
44
+ model = FastLanguageModel.get_peft_model(
45
+ model, r=LORA_RANK,
46
+ target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
47
+ lora_alpha=LORA_RANK*2, lora_dropout=0, bias="none",
48
+ use_gradient_checkpointing="unsloth", random_state=42,
49
+ )
50
+
51
+ # Load env
52
import os as _os
import pathlib as _pathlib

import deceit_env as _pkg
from deceit_env.models import DeceitAction
from deceit_env.server.environment import DeceitEnvironment
from deceit_env.server.grader import Grader

# Single source for the grader cache location; it is exported through the
# environment variable and also passed to the Grader explicitly.
_GRADER_CACHE_PATH = "/tmp/deceit_grader_cache.json"

# `_os` is the same module as `os`, so this writes the key back unchanged; its
# practical effect is to raise KeyError early when OPENAI_API_KEY is missing.
_os.environ["OPENAI_API_KEY"] = os.environ["OPENAI_API_KEY"]
_os.environ["DECEIT_GRADER_CACHE"] = _GRADER_CACHE_PATH

_grader = Grader(
    cache_path=_GRADER_CACHE_PATH,
    openai_api_key=os.environ["OPENAI_API_KEY"],
)
_env = DeceitEnvironment(grader=_grader)
# One shared environment instance; the lock serialises episodes run against it.
_env_lock = threading.Lock()
66
+
67
+ # Parser
68
# Strings that should read as False when a model emits a boolean as text.
_FALSY_STRINGS = frozenset({"", "false", "no", "0", "none", "null"})


def _to_bool(value):
    """Coerce a decoded JSON value to bool, reading "false"-like strings as False."""
    if isinstance(value, str):
        return value.strip().lower() not in _FALSY_STRINGS
    return bool(value)


def parse_action(text):
    """Parse a model completion into an action dict.

    Markdown code-fence markers (``` and ```json) are stripped, then the
    remainder must be a JSON object containing a "reasoning" key.

    Args:
        text: Raw completion string.

    Returns:
        A dict with keys "reasoning" (str), "answer" (str), "confidence"
        (float clamped to [0, 1], default 0.5), "abstain" (bool, default
        False) and "is_final" (bool, default True). If the text is not valid
        JSON, is not an object with "reasoning", or carries a non-numeric or
        NaN confidence, the abstaining "parse_error" action is returned.
    """
    import math

    # Fresh dict per call so callers may mutate the result safely.
    fallback = {
        "reasoning": "parse_error",
        "answer": "",
        "confidence": 0.0,
        "abstain": True,
        "is_final": True,
    }

    text = re.sub(r"```(?:json)?\s*", "", text).strip()
    try:
        obj = json.loads(text)
    except (ValueError, RecursionError):
        # JSONDecodeError is a ValueError; RecursionError covers absurd nesting.
        return fallback

    if not isinstance(obj, dict) or "reasoning" not in obj:
        return fallback

    raw_confidence = obj.get("confidence", 0.5)
    # json.loads accepts the NaN literal, and min(1, nan) evaluates to 1, so an
    # unchecked NaN would be reported as full confidence.
    if isinstance(raw_confidence, float) and math.isnan(raw_confidence):
        return fallback
    try:
        confidence = float(max(0, min(1, raw_confidence)))
    except (TypeError, ValueError):
        # Non-numeric confidence (string, list, null, ...).
        return fallback

    return {
        "reasoning": str(obj.get("reasoning", "")),
        "answer": str(obj.get("answer", "")),
        "confidence": confidence,
        "abstain": _to_bool(obj.get("abstain", False)),
        "is_final": _to_bool(obj.get("is_final", True)),
    }
82
+
83
+ FAIL = {"reasoning":"fail","answer":"","confidence":0.0,"abstain":True,"is_final":True}
84
+
85
+ # Reward function
86
def reward_fn(completions, prompts=None, **kwargs):
    """Score each completion by running it as one episode in the DECEIT env.

    Args:
        completions: Iterable of completions; each item is handed to
            `parse_action` (presumably raw completion strings — confirm
            against the trainer's completion format).
        prompts: Accepted for the trainer's calling convention; unused.
        **kwargs: Extra dataset columns passed by the trainer; unused.

    Returns:
        A list with one float per completion: the sum of step rewards for the
        episode, or the fixed error reward if the episode raised.
    """
    error_reward = -1.3  # reward assigned when the episode itself fails
    rewards = []
    for text in completions:
        try:
            parsed = parse_action(text)
        except Exception:
            # Narrower than a bare except so Ctrl-C / SystemExit still stop
            # the run; any parsing failure degrades to the abstaining action.
            parsed = FAIL.copy()
        try:
            # The environment instance is shared, so the whole episode
            # (reset plus every step) runs under the lock.
            with _env_lock:
                # NOTE(review): reset() takes no arguments here, so nothing in
                # this function ties the episode to the prompt/question that
                # produced `text` — confirm the environment scores against
                # the intended question.
                obs = _env.reset()
                current = parsed
                total = 0.0
                # The same parsed action is replayed each turn until the env
                # reports done; the last turn is forced to be final.
                for turn in range(obs.max_turns):
                    if turn == obs.max_turns - 1:
                        current["is_final"] = True
                    action = DeceitAction(
                        reasoning=current.get("reasoning", ""),
                        answer=current.get("answer", ""),
                        confidence=float(current.get("confidence", 0.5)),
                        abstain=bool(current.get("abstain", False)),
                        is_final=bool(current.get("is_final", True)),
                    )
                    result = _env.step(action)
                    total += result.reward
                    if result.done:
                        break
        except Exception as e:
            print(f"Episode error: {e}")
            total = error_reward
        rewards.append(total)
    return rewards
117
+
118
+ # Dataset
119
import deceit_env as _de

# The level-1 question set ships inside the installed deceit_env package.
data_path = _pathlib.Path(_de.__file__).parent / "data" / "level1.jsonl"

# Read as UTF-8 explicitly so decoding does not depend on the container's
# locale; blank lines are skipped, every other line is one JSON record.
with open(data_path, encoding="utf-8") as f:
    questions = [json.loads(line) for line in f if line.strip()]
127
+
128
def make_prompt(q):
    """Render question `q` as a chat-templated prompt string.

    The conversation is the shared system prompt followed by a single user
    turn; the tokenizer's chat template is applied without tokenizing and
    with the generation prompt appended.
    """
    system_turn = {"role": "system", "content": SYSTEM_PROMPT}
    user_turn = {
        "role": "user",
        "content": f"Question: {q}\n\nTurn 1 of 3. Respond in JSON.",
    }
    return tokenizer.apply_chat_template(
        [system_turn, user_turn],
        tokenize=False,
        add_generation_prompt=True,
    )
134
+
135
+ train_dataset = Dataset.from_list([
136
+ {"prompt": make_prompt(q["question"]), "question": q["question"]}
137
+ for q in questions
138
+ ])
139
+
140
+ # Train — Level 1 (100 steps)
141
print("Starting Level 1 training...")
FastLanguageModel.for_training(model)
wandb.init(project=WANDB_PROJECT, name="full-level1")

# NOTE(review): GRPO groups completions per prompt; confirm that a per-device
# batch of 2 with num_generations=4 is accepted (or auto-adjusted) by the
# installed TRL/Unsloth versions.
grpo_args = GRPOConfig(
    output_dir="./deceit-full",
    max_steps=100,
    per_device_train_batch_size=2,
    num_generations=4,
    learning_rate=5e-6,
    warmup_steps=5,
    logging_steps=1,
    save_steps=50,
    report_to="wandb",
    max_completion_length=256,
    # Keep the extra "question" column so it is not dropped from the dataset.
    remove_unused_columns=False,
)

trainer = GRPOTrainer(
    model=model,
    processing_class=tokenizer,
    reward_funcs=[reward_fn],
    args=grpo_args,
    train_dataset=train_dataset,
)
trainer.train()
wandb.finish()
print("Level 1 done!")
167
+
168
+ # Save checkpoint
169
local_dir = "deceit-full-final"

# Write both artifacts locally first, then upload both to the Hub repo; the
# model precedes the tokenizer in each phase.
for artifact in (model, tokenizer):
    artifact.save_pretrained(local_dir)
for artifact in (model, tokenizer):
    artifact.push_to_hub(HF_REPO_ID)
print(f"Saved to {HF_REPO_ID}")