import os
import pwd
import getpass

# Fix getpwuid / cache-permission errors in HF Spaces: point compiler caches
# at writable /tmp paths.
os.environ["TORCHINDUCTOR_CACHE_DIR"] = "/tmp/torch_cache"
os.environ["TRITON_CACHE_DIR"] = "/tmp/triton_cache"
os.makedirs("/tmp/torch_cache", exist_ok=True)
os.makedirs("/tmp/triton_cache", exist_ok=True)

# The container UID may have no passwd entry, so getpwuid() raises KeyError;
# in that case patch getpass.getuser to return a safe default instead.
try:
    pwd.getpwuid(os.getuid())
except KeyError:
    getpass.getuser = lambda: "trainer"

import json
import re
import threading
import pathlib
from http.server import HTTPServer, BaseHTTPRequestHandler

os.environ["HF_HOME"] = "/tmp/huggingface"
os.environ["HOME"] = "/tmp"


class HealthHandler(BaseHTTPRequestHandler):
    """Trivial health endpoint so the Space shows as alive on port 7860."""

    def do_GET(self):
        self.send_response(200)
        self.end_headers()
        self.wfile.write(b"Training in progress...")

    def log_message(self, format, *args):
        pass  # silence per-request logging


health_thread = threading.Thread(
    target=lambda: HTTPServer(("0.0.0.0", 7860), HealthHandler).serve_forever(),
    daemon=True,
)
health_thread.start()
print("Health server started on port 7860")

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model
from trl import GRPOConfig, GRPOTrainer
from datasets import Dataset
from huggingface_hub import login
import wandb

login(token=os.environ["HF_TOKEN"])
wandb.login(key=os.environ["WANDB_API_KEY"])
os.environ["OPENAI_API_KEY"] = os.environ.get("OPENAI_API_KEY", "")
os.environ["DECEIT_GRADER_CACHE"] = "/tmp/deceit_grader_cache.json"

MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
HF_REPO_ID = "Ajsaxena/deceit-qwen-1.5b-full"
WANDB_PROJECT = "deceit-full"

SYSTEM_PROMPT = """You are answering factual questions. Respond ONLY with a JSON object:
- reasoning: your thought process (string)
- answer: your answer (string)
- confidence: 0.0 to 1.0
- abstain: true if you don't know
- is_final: true to commit, false to think more (max 3 turns)
Honesty is rewarded. Abstaining beats confidently wrong."""
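# For illustration, a completion that satisfies the schema above might look
# like this (hypothetical example, not drawn from the dataset):
#   {"reasoning": "The Eiffel Tower is in Paris.", "answer": "Paris",
#    "confidence": 0.95, "abstain": false, "is_final": true}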
print("Loading model...")
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.0,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

from deceit_env.server.environment import DeceitEnvironment
from deceit_env.server.grader import Grader
from deceit_env.models import DeceitAction
import deceit_env as _de

_grader = Grader(
    cache_path="/tmp/deceit_grader_cache.json",
    openai_api_key=os.environ.get("OPENAI_API_KEY", ""),
)
_env = DeceitEnvironment(grader=_grader)
_env_lock = threading.Lock()


def parse_action(text):
    """Parse a model completion into an action dict, tolerating ``` fences."""
    text = re.sub(r"```(?:json)?\s*", "", text).strip()
    try:
        obj = json.loads(text)
        if isinstance(obj, dict) and "reasoning" in obj:
            return {
                "reasoning": str(obj.get("reasoning", "")),
                "answer": str(obj.get("answer", "")),
                "confidence": max(0.0, min(1.0, float(obj.get("confidence", 0.5)))),
                "abstain": bool(obj.get("abstain", False)),
                "is_final": bool(obj.get("is_final", True)),
            }
    except (json.JSONDecodeError, TypeError, ValueError):
        pass
    # Unparseable output is treated as a final abstention.
    return {"reasoning": "", "answer": "", "confidence": 0.0, "abstain": True, "is_final": True}


FAIL = {"reasoning": "fail", "answer": "", "confidence": 0.0, "abstain": True, "is_final": True}


def make_reward_fn(env, level=None, tag="L1"):
    """Build a GRPO reward function that replays a parsed completion through `env`.

    GRPO yields a single completion per rollout, so the same parsed action is
    re-submitted each turn (with is_final forced on the last turn) until the
    environment reports done.
    """
    def reward_fn(completions, prompts=None, **kwargs):
        rewards = []
        for text in completions:
            try:
                parsed = parse_action(text)
            except Exception:
                parsed = FAIL.copy()
            try:
                with _env_lock:
                    obs = env.reset() if level is None else env.reset(level=level)
                    current = parsed
                    total = 0.0
                    for turn in range(obs.max_turns):
                        if turn == obs.max_turns - 1:
                            current["is_final"] = True
                        action = DeceitAction(
                            reasoning=current.get("reasoning", ""),
                            answer=current.get("answer", ""),
                            confidence=float(current.get("confidence", 0.5)),
                            abstain=bool(current.get("abstain", False)),
                            is_final=bool(current.get("is_final", True)),
                        )
                        result = env.step(action)
                        total += result.reward
                        if result.done:
                            break
            except Exception as e:
                print(f"{tag} episode error: {e}")
                total = -1.3  # flat penalty for episodes that crash
            rewards.append(total)
        return rewards
    return reward_fn


reward_fn = make_reward_fn(_env, tag="L1")

data_path = pathlib.Path(_de.__file__).parent / "data" / "level1.jsonl"
questions = []
with open(data_path) as f:
    for line in f:
        line = line.strip()
        if line:
            questions.append(json.loads(line))


def make_prompt(q):
    msgs = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": f"Question: {q}\n\nTurn 1 of 3. Respond in JSON."},
    ]
    return tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
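# Minimal sanity check of the parsing path before training starts. The payload
# is a made-up example, not a real model completion; it just confirms that
# fenced, well-formed JSON round-trips through parse_action.
_probe = parse_action('```json\n{"reasoning": "test", "answer": "Paris", '
                      '"confidence": 0.9, "abstain": false, "is_final": true}\n```')
assert _probe["answer"] == "Paris" and not _probe["abstain"]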
train_dataset = Dataset.from_list([
    {"prompt": make_prompt(q["question"]), "question": q["question"]}
    for q in questions
])

print("Starting training...")
wandb.init(project=WANDB_PROJECT, name="1.5b-level1-v2")
trainer = GRPOTrainer(
    model=model,
    processing_class=tokenizer,
    reward_funcs=[reward_fn],
    args=GRPOConfig(
        output_dir="/tmp/deceit-1.5b",
        bf16=torch.cuda.is_available() and torch.cuda.is_bf16_supported(),
        fp16=False,
        max_steps=500,
        per_device_train_batch_size=4,
        num_generations=4,
        learning_rate=1e-5,
        warmup_steps=5,
        logging_steps=1,
        save_steps=50,
        report_to="wandb",
        max_completion_length=256,
        remove_unused_columns=False,
    ),
    train_dataset=train_dataset,
)
trainer.train()
wandb.finish()
print("Training done!")

# Save Level 1 checkpoint
model.save_pretrained("/tmp/deceit-1.5b-l1")
tokenizer.save_pretrained("/tmp/deceit-1.5b-l1")
print("Level 1 checkpoint saved locally")

# Load Level 2 dataset, falling back to the image-local copy if the package
# ships without it
_pkg_l2 = pathlib.Path(_de.__file__).parent / "data" / "level2.jsonl"
_fallback = pathlib.Path("/app/data/level2.jsonl")
data_path_l2 = _pkg_l2 if _pkg_l2.exists() else _fallback
print(f"Loading level2 from: {data_path_l2}")
questions_l2 = []
with open(data_path_l2) as f:
    for line in f:
        line = line.strip()
        if line:
            questions_l2.append(json.loads(line))
print(f"Loaded {len(questions_l2)} Level 2 questions")


def make_prompt_l2(q, distractors):
    context = "\n".join(distractors)
    msgs = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": f"Question: {q}\n\nContext:\n{context}\n\nTurn 1 of 3. Respond in JSON."},
    ]
    return tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)


train_dataset_l2 = Dataset.from_list([
    {"prompt": make_prompt_l2(q["question"], q.get("distractors", [])), "question": q["question"]}
    for q in questions_l2
])

# Fresh environment instance for Level 2, reusing the same grader and reward shape
_env_l2 = DeceitEnvironment(grader=_grader)
reward_fn_l2 = make_reward_fn(_env_l2, level=2, tag="L2")

# Train Level 2: shorter run at a lower learning rate, continuing from the
# Level 1 weights
print("Starting Level 2 training on 1.5B...")
wandb.init(project=WANDB_PROJECT, name="1.5b-level2-v2")
trainer_l2 = GRPOTrainer(
    model=model,
    processing_class=tokenizer,
    reward_funcs=[reward_fn_l2],
    args=GRPOConfig(
        output_dir="/tmp/deceit-1.5b-l2",
        max_steps=300,
        per_device_train_batch_size=4,
        num_generations=4,
        learning_rate=2e-6,
        warmup_steps=5,
        logging_steps=1,
        save_steps=40,
        report_to="wandb",
        max_completion_length=256,
        remove_unused_columns=False,
    ),
    train_dataset=train_dataset_l2,
)
trainer_l2.train()
wandb.finish()
print("Level 2 training done!")
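# Optional smoke test: greedily sample one completion from the trained policy
# and parse it, just to eyeball its JSON discipline. The probe question and
# empty distractor list are placeholders, not taken from the training data.
_inputs = tokenizer(make_prompt_l2("What is the capital of France?", []), return_tensors="pt").to(model.device)
with torch.no_grad():
    _gen = model.generate(**_inputs, max_new_tokens=128, do_sample=False)
_reply = tokenizer.decode(_gen[0][_inputs["input_ids"].shape[1]:], skip_special_tokens=True)
print("Smoke test:", parse_action(_reply))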
# Save the final adapter locally and push it to the Hub
model.save_pretrained("/tmp/deceit-1.5b-final")
tokenizer.save_pretrained("/tmp/deceit-1.5b-final")
model.push_to_hub(HF_REPO_ID)
tokenizer.push_to_hub(HF_REPO_ID)
print(f"Final model saved to {HF_REPO_ID}")
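# Sketch of downstream loading (assumption: push_to_hub on a PEFT-wrapped
# model uploads the LoRA adapter, so consumers attach it to the base model;
# left commented out so this script does not reload anything itself):
# from peft import PeftModel
# base = AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map="auto")
# policy = PeftModel.from_pretrained(base, HF_REPO_ID)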