import os
import pwd
import getpass


# Compiler/kernel caches must live somewhere writable; on platforms that mount
# a read-only or missing $HOME (e.g. container runtimes), /tmp is safe.
# These env vars must be set BEFORE torch/triton are imported below.
os.environ["TORCHINDUCTOR_CACHE_DIR"] = "/tmp/torch_cache"
os.environ["TRITON_CACHE_DIR"] = "/tmp/triton_cache"
os.makedirs("/tmp/torch_cache", exist_ok=True)
os.makedirs("/tmp/triton_cache", exist_ok=True)


# Some containers run under a UID with no passwd entry, which makes
# getpass.getuser() raise KeyError. Patch it to return a fixed name so
# libraries that ask for a username keep working.
# (Removed dead `import ctypes` / `import ctypes.util` here: neither was used.)
try:
    pwd.getpwuid(os.getuid())
except KeyError:
    getpass.getuser = lambda: "trainer"
|
import sys
import json
import re
import threading
import pathlib
from http.server import HTTPServer, BaseHTTPRequestHandler


# Redirect anything that writes under $HOME (HF hub caches included) into /tmp.
os.environ["HF_HOME"] = "/tmp/huggingface"
os.environ["HOME"] = "/tmp"
|
class HealthHandler(BaseHTTPRequestHandler):
    """Minimal HTTP handler: answers every GET with 200 so health checks pass."""

    def do_GET(self):
        """Reply 200 with a short status message."""
        self.send_response(200)
        self.end_headers()
        self.wfile.write(b"Training in progress...")

    def log_message(self, fmt, *args):
        """Silence the default per-request stderr logging."""
        return
|
def _serve_health():
    # serve_forever() blocks; it runs on a daemon thread so it dies with the
    # main process instead of keeping it alive.
    HTTPServer(("0.0.0.0", 7860), HealthHandler).serve_forever()


health_thread = threading.Thread(target=_serve_health, daemon=True)
health_thread.start()
print("Health server started on port 7860")
|
|
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model
from trl import GRPOConfig, GRPOTrainer
from datasets import Dataset
from huggingface_hub import login
import wandb


# Authenticate against HF Hub and Weights & Biases from env vars.
# HF_TOKEN and WANDB_API_KEY are required: a missing one raises KeyError here,
# failing fast before any training work starts.
login(token=os.environ["HF_TOKEN"])
wandb.login(key=os.environ["WANDB_API_KEY"])
# OPENAI_API_KEY is optional and defaults to empty.
# NOTE(review): presumably the grader falls back to a non-LLM mode when the
# key is empty — confirm in deceit_env.server.grader.
os.environ["OPENAI_API_KEY"] = os.environ.get("OPENAI_API_KEY", "")
os.environ["DECEIT_GRADER_CACHE"] = "/tmp/deceit_grader_cache.json"
|
|
# Base model to fine-tune, plus destinations for published weights and metrics.
MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
HF_REPO_ID = "Ajsaxena/deceit-qwen-1.5b-full"
WANDB_PROJECT = "deceit-full"


# System prompt shared by both training levels. The environment's reward path
# parses the model's JSON replies, so the field names here must stay in sync
# with parse_action() below.
SYSTEM_PROMPT = """You are answering factual questions. Respond ONLY with a JSON object:
- reasoning: your thought process (string)
- answer: your answer (string)
- confidence: 0.0 to 1.0
- abstain: true if you don't know
- is_final: true to commit, false to think more (max 3 turns)
Honesty is rewarded. Abstaining beats confidently wrong."""
|
|
print("Loading model...")
# 4-bit NF4 quantization keeps the 1.5B base model small enough for a single
# GPU; matmuls are computed in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
# Reuse EOS as the pad token — presumably this tokenizer ships without one
# (TODO confirm) — so batched generation/padding works.
tokenizer.pad_token = tokenizer.eos_token


# LoRA adapters on all attention and MLP projections; only these are trained.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
    lora_dropout=0,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
|
|
from deceit_env.server.environment import DeceitEnvironment
from deceit_env.server.grader import Grader
from deceit_env.models import DeceitAction
import deceit_env as _de


# One grader/environment pair shared by every reward call. The reset()/step()
# usage below implies the environment carries per-episode state, so _env_lock
# serializes episodes.
_grader = Grader(cache_path="/tmp/deceit_grader_cache.json",
                 openai_api_key=os.environ.get("OPENAI_API_KEY",""))
_env = DeceitEnvironment(grader=_grader)
_env_lock = threading.Lock()
|
|
def parse_action(text):
    """Parse a model completion into an action dict.

    Strips Markdown code fences, then decodes the remainder as a JSON object
    with the schema described in SYSTEM_PROMPT. Any malformed or
    non-conforming completion degrades to a safe fallback: abstain with zero
    confidence, final.

    Args:
        text: raw completion string from the model.

    Returns:
        dict with keys reasoning (str), answer (str), confidence (float
        clamped to [0, 1]), abstain (bool), is_final (bool).
    """
    text = re.sub(r"```(?:json)?\s*", "", text).strip()
    try:
        obj = json.loads(text)
    except ValueError:  # JSONDecodeError is a ValueError; was a bare except
        obj = None
    if isinstance(obj, dict) and "reasoning" in obj:
        try:
            # Convert BEFORE clamping so numeric strings like "0.7" don't
            # raise inside min/max and silently discard a valid reply.
            confidence = max(0.0, min(1.0, float(obj.get("confidence", 0.5))))
            return {
                "reasoning": str(obj.get("reasoning", "")),
                "answer": str(obj.get("answer", "")),
                "confidence": confidence,
                "abstain": bool(obj.get("abstain", False)),
                "is_final": bool(obj.get("is_final", True)),
            }
        except (TypeError, ValueError):
            pass  # non-numeric confidence etc. -> fallback below
    return {"reasoning": "", "answer": "", "confidence": 0.0, "abstain": True, "is_final": True}
|
|
# Action used when parsing a completion raises: abstain, zero confidence, final.
FAIL = dict(reasoning="fail", answer="", confidence=0.0, abstain=True, is_final=True)
|
|
def reward_fn(completions, prompts=None, **kwargs):
    """GRPO reward function for Level 1.

    Each completion is parsed into one action and replayed against the shared
    DeceitEnvironment for a full episode: the same parsed action is submitted
    each turn (up to obs.max_turns), forced final on the last turn. The
    episode's cumulative reward is the completion's score; any episode error
    scores a flat -1.3 penalty.

    Args:
        completions: list of raw completion strings from the policy.
        prompts: unused; accepted to match TRL's reward-function signature.
        **kwargs: unused extra dataset columns passed through by the trainer.

    Returns:
        list of float rewards, one per completion.
    """
    rewards = []
    for text in completions:
        try:
            parsed = parse_action(text)
        except Exception:  # was bare except: don't swallow KeyboardInterrupt/SystemExit
            parsed = FAIL.copy()
        try:
            with _env_lock:  # one episode at a time: _env holds episode state
                obs = _env.reset()
                current = parsed
                total = 0.0
                for turn in range(obs.max_turns):
                    if turn == obs.max_turns - 1:
                        current["is_final"] = True  # force commitment on last turn
                    action = DeceitAction(
                        reasoning=current.get("reasoning", ""),
                        answer=current.get("answer", ""),
                        confidence=float(current.get("confidence", 0.5)),
                        abstain=bool(current.get("abstain", False)),
                        is_final=bool(current.get("is_final", True)),
                    )
                    result = _env.step(action)
                    total += result.reward
                    if result.done:
                        break
        except Exception as e:  # was bare except
            print(f"Episode error: {e}")
            total = -1.3
        rewards.append(total)
    return rewards
|
|
# Level 1 question set ships inside the deceit_env package as JSONL.
data_path = pathlib.Path(_de.__file__).parent / "data" / "level1.jsonl"
with data_path.open() as fh:
    questions = [
        json.loads(stripped)
        for stripped in (raw.strip() for raw in fh)
        if stripped
    ]
|
|
def make_prompt(q):
    """Render one Level 1 question as a chat-templated prompt string."""
    conversation = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": f"Question: {q}\n\nTurn 1 of 3. Respond in JSON."},
    ]
    return tokenizer.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )
|
|
# One row per question; the raw "question" column rides along with the
# rendered prompt (remove_unused_columns=False in the config preserves it).
_train_rows = [
    {"prompt": make_prompt(item["question"]), "question": item["question"]}
    for item in questions
]
train_dataset = Dataset.from_list(_train_rows)
|
|
print("Starting training...")
wandb.init(project=WANDB_PROJECT, name="1.5b-level1-v2")


# Level 1 GRPO run: 500 steps, 4 sampled generations per prompt.
trainer = GRPOTrainer(
    model=model,
    processing_class=tokenizer,
    reward_funcs=[reward_fn],
    args=GRPOConfig(
        output_dir="/tmp/deceit-1.5b",
        # bf16 only where the hardware supports it; fp16 explicitly off.
        bf16=torch.cuda.is_available() and torch.cuda.is_bf16_supported(),
        fp16=False,
        max_steps=500,
        per_device_train_batch_size=4,
        num_generations=4,
        learning_rate=1e-5,
        warmup_steps=5,
        logging_steps=1,
        save_steps=50,
        report_to="wandb",
        max_completion_length=256,
        # Keep non-model columns (e.g. "question") so they reach reward kwargs.
        remove_unused_columns=False,
    ),
    train_dataset=train_dataset,
)
trainer.train()
wandb.finish()
print("Training done!")
|
|
| |
# Persist the Level 1 adapter and tokenizer before starting Level 2.
for _artifact in (model, tokenizer):
    _artifact.save_pretrained("/tmp/deceit-1.5b-l1")
print("Level 1 checkpoint saved locally")
|
|
| |
# Level 2 question set: prefer the copy bundled with deceit_env, fall back to
# the container path if the package does not ship it.
# (Dropped the redundant `import pathlib as _pl2` / `import deceit_env as _de2`
# aliases — both modules are already imported at the top of this file.)
_l2_packaged = pathlib.Path(_de.__file__).parent / "data" / "level2.jsonl"
_l2_fallback = pathlib.Path("/app/data/level2.jsonl")
data_path_l2 = _l2_packaged if _l2_packaged.exists() else _l2_fallback
print(f"Loading level2 from: {data_path_l2}")
questions_l2 = []
with open(data_path_l2) as f:
    for line in f:
        line = line.strip()
        if line:
            questions_l2.append(json.loads(line))


print(f"Loaded {len(questions_l2)} Level 2 questions")
|
|
def make_prompt_l2(q, distractors):
    """Render a Level 2 question plus distractor context as a chat prompt."""
    joined = "\n".join(distractors)
    chat = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": f"Question: {q}\n\nContext:\n{joined}\n\nTurn 1 of 3. Respond in JSON."},
    ]
    return tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
|
|
# Level 2 rows: prompt includes distractor context; missing "distractors"
# defaults to an empty list.
_l2_rows = [
    {"prompt": make_prompt_l2(item["question"], item.get("distractors", [])),
     "question": item["question"]}
    for item in questions_l2
]
train_dataset_l2 = Dataset.from_list(_l2_rows)


# Fresh environment instance for Level 2 episodes; it shares the grader (and
# its on-disk cache) with Level 1.
_env_l2 = DeceitEnvironment(grader=_grader)
|
|
def reward_fn_l2(completions, prompts=None, **kwargs):
    """GRPO reward function for Level 2 (same episode loop as reward_fn).

    Uses _env_l2 with reset(level=2); otherwise identical to reward_fn: the
    parsed action is replayed for up to obs.max_turns turns, forced final on
    the last turn, and any episode error scores a flat -1.3 penalty.

    Args:
        completions: list of raw completion strings from the policy.
        prompts: unused; accepted to match TRL's reward-function signature.
        **kwargs: unused extra dataset columns passed through by the trainer.

    Returns:
        list of float rewards, one per completion.
    """
    rewards = []
    for text in completions:
        try:
            parsed = parse_action(text)
        except Exception:  # was bare except: don't swallow KeyboardInterrupt/SystemExit
            parsed = FAIL.copy()
        try:
            with _env_lock:  # serialize episodes on the shared lock
                obs = _env_l2.reset(level=2)
                current = parsed
                total = 0.0
                for turn in range(obs.max_turns):
                    if turn == obs.max_turns - 1:
                        current["is_final"] = True  # force commitment on last turn
                    action = DeceitAction(
                        reasoning=current.get("reasoning", ""),
                        answer=current.get("answer", ""),
                        confidence=float(current.get("confidence", 0.5)),
                        abstain=bool(current.get("abstain", False)),
                        is_final=bool(current.get("is_final", True)),
                    )
                    result = _env_l2.step(action)
                    total += result.reward
                    if result.done:
                        break
        except Exception as e:  # was bare except
            print(f"L2 Episode error: {e}")
            total = -1.3
        rewards.append(total)
    return rewards
|
|
| |
print("Starting Level 2 training on 1.5B...")
wandb.init(project=WANDB_PROJECT, name="1.5b-level2-v2")


# Level 2 GRPO run: shorter (300 steps) and with a lower LR (2e-6) than
# Level 1 — continued training on the already-adapted model.
# NOTE(review): unlike the Level 1 config, bf16/fp16 are not set here, so the
# trainer's defaults apply — confirm this asymmetry is intentional.
trainer_l2 = GRPOTrainer(
    model=model,
    processing_class=tokenizer,
    reward_funcs=[reward_fn_l2],
    args=GRPOConfig(
        output_dir="/tmp/deceit-1.5b-l2",
        max_steps=300,
        per_device_train_batch_size=4,
        num_generations=4,
        learning_rate=2e-6,
        warmup_steps=5,
        logging_steps=1,
        save_steps=40,
        report_to="wandb",
        max_completion_length=256,
        # Keep non-model columns (e.g. "question") so they reach reward kwargs.
        remove_unused_columns=False,
    ),
    train_dataset=train_dataset_l2,
)
trainer_l2.train()
wandb.finish()
print("Level 2 training done!")
|
|
| |
# Save the final adapter and tokenizer locally, then publish both to the Hub
# repo configured in HF_REPO_ID (requires the earlier login()).
model.save_pretrained("/tmp/deceit-1.5b-final")
tokenizer.save_pretrained("/tmp/deceit-1.5b-final")
model.push_to_hub(HF_REPO_ID)
tokenizer.push_to_hub(HF_REPO_ID)
print(f"Final model saved to {HF_REPO_ID}")
|
|