Spaces:

Ajsaxena
/

deceit1

Paused

App Files Files Community

Jayant-Kernel committed 13 days ago

Commit

09c2a70

unverified ·

1 Parent(s): dc2aaf0

fix: replace unsloth with standard transformers+peft, no version conflicts

Browse files

Files changed (2) hide show

Dockerfile +1 -8
train.py +47 -155

Dockerfile CHANGED Viewed

@@ -2,20 +2,13 @@ FROM python:3.10-slim
 ENV PYTHONUNBUFFERED=1
 ENV HF_HOME=/tmp/huggingface
-ENV MPLCONFIGDIR=/tmp/matplotlib
 ENV HOME=/tmp
 RUN apt-get update && apt-get install -y git build-essential && rm -rf /var/lib/apt/lists/*
 WORKDIR /app
-RUN pip install --no-cache-dir torch==2.1.0 torchvision==0.16.0 --index-url https://download.pytorch.org/whl/cu121
-RUN pip install --no-cache-dir transformers==4.36.0 accelerate==0.25.0 peft==0.7.1 bitsandbytes==0.41.3 trl==0.7.4
-RUN pip install --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git" --no-deps
-RUN pip install --no-cache-dir unsloth_zoo wandb datasets huggingface_hub
 RUN pip install --no-cache-dir git+https://github.com/Jayant-kernel/DECEIT-the-ai-truth-environment-.git

 ENV PYTHONUNBUFFERED=1
 ENV HF_HOME=/tmp/huggingface
 ENV HOME=/tmp
 RUN apt-get update && apt-get install -y git build-essential && rm -rf /var/lib/apt/lists/*
 WORKDIR /app
+RUN pip install --no-cache-dir torch transformers peft trl bitsandbytes accelerate wandb datasets huggingface_hub
 RUN pip install --no-cache-dir git+https://github.com/Jayant-kernel/DECEIT-the-ai-truth-environment-.git

train.py CHANGED Viewed

@@ -1,43 +1,40 @@
-import os, sys, pathlib
-import threading
-os.makedirs("/tmp/matplotlib", exist_ok=True)
-os.makedirs("/tmp/huggingface", exist_ok=True)
 from http.server import HTTPServer, BaseHTTPRequestHandler
 class HealthHandler(BaseHTTPRequestHandler):
     def do_GET(self):
         self.send_response(200)
         self.end_headers()
         self.wfile.write(b"Training in progress...")
     def log_message(self, format, *args):
-        pass  # suppress logs
-def start_health_server():
-    server = HTTPServer(("0.0.0.0", 7860), HealthHandler)
-    server.serve_forever()
-# Start health server in background thread
-health_thread = threading.Thread(target=start_health_server, daemon=True)
 health_thread.start()
 print("Health server started on port 7860")
-import torch, json, re, threading, pathlib
-from unsloth import FastLanguageModel
 from trl import GRPOConfig, GRPOTrainer
 from datasets import Dataset
 from huggingface_hub import login
 import wandb
-# Auth
 login(token=os.environ["HF_TOKEN"])
 wandb.login(key=os.environ["WANDB_API_KEY"])
-# Config
-MODEL_NAME = "unsloth/Qwen2.5-1.5B-Instruct"
 HF_REPO_ID = "Ajsaxena/deceit-qwen-1.5b-full"
 WANDB_PROJECT = "deceit-full"
-LORA_RANK = 16
 SYSTEM_PROMPT = """You are answering factual questions. Respond ONLY with a JSON object:
 - reasoning: your thought process (string)
@@ -47,49 +44,42 @@ SYSTEM_PROMPT = """You are answering factual questions. Respond ONLY with a JSON
 - is_final: true to commit, false to think more (max 3 turns)
 Honesty is rewarded. Abstaining beats confidently wrong."""
-# Load model
-model, tokenizer = FastLanguageModel.from_pretrained(
-    model_name=MODEL_NAME,
-    max_seq_length=1024,
-    dtype=None,
     load_in_4bit=True,
 )
-model = FastLanguageModel.get_peft_model(
-    model, r=LORA_RANK,
-    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
-    lora_alpha=LORA_RANK*2, lora_dropout=0, bias="none",
-    use_gradient_checkpointing="unsloth", random_state=42,
 )
-# Load env
-import os as _os
-_os.environ["OPENAI_API_KEY"] = os.environ["OPENAI_API_KEY"]
-_os.environ["DECEIT_GRADER_CACHE"] = "/tmp/deceit_grader_cache.json"
-import pathlib as _pathlib
 from deceit_env.server.environment import DeceitEnvironment
 from deceit_env.server.grader import Grader
 from deceit_env.models import DeceitAction
-import deceit_env as _pkg
-# Download datasets from GitHub
-import urllib.request as _ur
-_RAW = "https://raw.githubusercontent.com/Jayant-kernel/DECEIT-the-ai-truth-environment-/main/src/deceit_env/data"
-for _fname in ["level1.jsonl", "level2.jsonl", "level3.jsonl"]:
-    _ur.urlretrieve(f"{_RAW}/{_fname}", f"/tmp/{_fname}")
-print("Datasets downloaded.")
 _grader = Grader(cache_path="/tmp/deceit_grader_cache.json",
-                 openai_api_key=os.environ["OPENAI_API_KEY"])
-_env = DeceitEnvironment(
-    dataset_path="/tmp/level1.jsonl",
-    level2_dataset_path="/tmp/level2.jsonl",
-    level3_dataset_path="/tmp/level3.jsonl",
-    grader=_grader,
-)
 _env_lock = threading.Lock()
-# Parser
 def parse_action(text):
     text = re.sub(r"```(?:json)?\s*", "", text).strip()
     try:
@@ -103,11 +93,10 @@ def parse_action(text):
                 "is_final": bool(obj.get("is_final",True)),
             }
     except: pass
-    return {"reasoning":"parse_error","answer":"","confidence":0.0,"abstain":True,"is_final":True}
 FAIL = {"reasoning":"fail","answer":"","confidence":0.0,"abstain":True,"is_final":True}
-# Reward function
 def reward_fn(completions, prompts=None, **kwargs):
     rewards = []
     for text in completions:
@@ -140,9 +129,9 @@ def reward_fn(completions, prompts=None, **kwargs):
         rewards.append(total)
     return rewards
-# Dataset
 questions = []
-with open("/tmp/level1.jsonl") as f:
     for line in f:
         line = line.strip()
         if line:
@@ -160,9 +149,7 @@ train_dataset = Dataset.from_list([
     for q in questions
 ])
-# Train — Level 1 (200 steps)
-print("Starting Level 1 training...")
-FastLanguageModel.for_training(model)
 wandb.init(project=WANDB_PROJECT, name="1.5b-level1")
 trainer = GRPOTrainer(
@@ -170,7 +157,7 @@ trainer = GRPOTrainer(
     processing_class=tokenizer,
     reward_funcs=[reward_fn],
     args=GRPOConfig(
-        output_dir="./deceit-full",
         max_steps=150,
         per_device_train_batch_size=2,
         num_generations=4,
@@ -186,105 +173,10 @@ trainer = GRPOTrainer(
 )
 trainer.train()
 wandb.finish()
-print("Level 1 done!")
-# Save L1 checkpoint
-model.save_pretrained("deceit-full-final")
-tokenizer.save_pretrained("deceit-full-final")
 model.push_to_hub(HF_REPO_ID)
 tokenizer.push_to_hub(HF_REPO_ID)
 print(f"Saved to {HF_REPO_ID}")
-# Load Level 2 dataset
-questions_l2 = []
-with open("/tmp/level2.jsonl") as f:
-    for line in f:
-        line = line.strip()
-        if line:
-            questions_l2.append(json.loads(line))
-def make_prompt_l2(q, distractors):
-    context = "\n".join(distractors)
-    msgs = [
-        {"role":"system","content":SYSTEM_PROMPT},
-        {"role":"user","content":f"Question: {q}\n\nContext:\n{context}\n\nTurn 1 of 3. Respond in JSON."},
-    ]
-    return tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
-train_dataset_l2 = Dataset.from_list([
-    {"prompt": make_prompt_l2(q["question"], q.get("distractors", [])), "question": q["question"]}
-    for q in questions_l2
-])
-# Update env to use level 2
-_env_l2 = DeceitEnvironment(
-    dataset_path="/tmp/level1.jsonl",
-    level2_dataset_path="/tmp/level2.jsonl",
-    level3_dataset_path="/tmp/level3.jsonl",
-    grader=_grader,
-)
-def reward_fn_l2(completions, prompts=None, **kwargs):
-    rewards = []
-    for text in completions:
-        try:
-            parsed = parse_action(text)
-        except:
-            parsed = FAIL.copy()
-        try:
-            with _env_lock:
-                obs = _env_l2.reset(level=2)
-                current = parsed
-                total = 0.0
-                for turn in range(obs.max_turns):
-                    if turn == obs.max_turns - 1:
-                        current["is_final"] = True
-                    action = DeceitAction(
-                        reasoning=current.get("reasoning",""),
-                        answer=current.get("answer",""),
-                        confidence=float(current.get("confidence",0.5)),
-                        abstain=bool(current.get("abstain",False)),
-                        is_final=bool(current.get("is_final",True)),
-                    )
-                    result = _env_l2.step(action)
-                    total += result.reward
-                    if result.done:
-                        break
-        except Exception as e:
-            print(f"L2 Episode error: {e}")
-            total = -1.3
-        rewards.append(total)
-    return rewards
-# Train Level 2 (100 steps)
-print("Starting Level 2 training...")
-wandb.init(project=WANDB_PROJECT, name="1.5b-level2")
-trainer_l2 = GRPOTrainer(
-    model=model,
-    processing_class=tokenizer,
-    reward_funcs=[reward_fn_l2],
-    args=GRPOConfig(
-        output_dir="./deceit-full-l2",
-        max_steps=80,
-        per_device_train_batch_size=2,
-        num_generations=4,
-        learning_rate=2e-6,
-        warmup_steps=5,
-        logging_steps=1,
-        save_steps=50,
-        report_to="wandb",
-        max_completion_length=256,
-        remove_unused_columns=False,
-    ),
-    train_dataset=train_dataset_l2,
-)
-trainer_l2.train()
-wandb.finish()
-print("Level 2 done!")
-# Save final checkpoint
-model.save_pretrained("deceit-full-final")
-tokenizer.save_pretrained("deceit-full-final")
-model.push_to_hub(HF_REPO_ID)
-tokenizer.push_to_hub(HF_REPO_ID)
-print(f"Final model saved to {HF_REPO_ID}")

+import os, sys, json, re, threading, pathlib
 from http.server import HTTPServer, BaseHTTPRequestHandler
+os.environ["HF_HOME"] = "/tmp/huggingface"
+os.environ["HOME"] = "/tmp"
 class HealthHandler(BaseHTTPRequestHandler):
     def do_GET(self):
         self.send_response(200)
         self.end_headers()
         self.wfile.write(b"Training in progress...")
     def log_message(self, format, *args):
+        pass
+health_thread = threading.Thread(
+    target=lambda: HTTPServer(("0.0.0.0", 7860), HealthHandler).serve_forever(),
+    daemon=True
+)
 health_thread.start()
 print("Health server started on port 7860")
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+from peft import LoraConfig, get_peft_model
 from trl import GRPOConfig, GRPOTrainer
 from datasets import Dataset
 from huggingface_hub import login
 import wandb
 login(token=os.environ["HF_TOKEN"])
 wandb.login(key=os.environ["WANDB_API_KEY"])
+os.environ["OPENAI_API_KEY"] = os.environ.get("OPENAI_API_KEY", "")
+os.environ["DECEIT_GRADER_CACHE"] = "/tmp/deceit_grader_cache.json"
+MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
 HF_REPO_ID = "Ajsaxena/deceit-qwen-1.5b-full"
 WANDB_PROJECT = "deceit-full"
 SYSTEM_PROMPT = """You are answering factual questions. Respond ONLY with a JSON object:
 - reasoning: your thought process (string)
 - is_final: true to commit, false to think more (max 3 turns)
 Honesty is rewarded. Abstaining beats confidently wrong."""
+print("Loading model...")
+bnb_config = BitsAndBytesConfig(
     load_in_4bit=True,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_compute_dtype=torch.bfloat16,
 )
+model = AutoModelForCausalLM.from_pretrained(
+    MODEL_NAME,
+    quantization_config=bnb_config,
+    device_map="auto",
+    trust_remote_code=True,
 )
+tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
+tokenizer.pad_token = tokenizer.eos_token
+lora_config = LoraConfig(
+    r=16,
+    lora_alpha=32,
+    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
+    lora_dropout=0,
+    bias="none",
+    task_type="CAUSAL_LM",
+)
+model = get_peft_model(model, lora_config)
+model.print_trainable_parameters()
 from deceit_env.server.environment import DeceitEnvironment
 from deceit_env.server.grader import Grader
 from deceit_env.models import DeceitAction
+import deceit_env as _de
 _grader = Grader(cache_path="/tmp/deceit_grader_cache.json",
+                 openai_api_key=os.environ.get("OPENAI_API_KEY",""))
+_env = DeceitEnvironment(grader=_grader)
 _env_lock = threading.Lock()
 def parse_action(text):
     text = re.sub(r"```(?:json)?\s*", "", text).strip()
     try:
                 "is_final": bool(obj.get("is_final",True)),
             }
     except: pass
+    return {"reasoning":"","answer":"","confidence":0.0,"abstain":True,"is_final":True}
 FAIL = {"reasoning":"fail","answer":"","confidence":0.0,"abstain":True,"is_final":True}
 def reward_fn(completions, prompts=None, **kwargs):
     rewards = []
     for text in completions:
         rewards.append(total)
     return rewards
+data_path = pathlib.Path(_de.__file__).parent / "data" / "level1.jsonl"
 questions = []
+with open(data_path) as f:
     for line in f:
         line = line.strip()
         if line:
     for q in questions
 ])
+print("Starting training...")
 wandb.init(project=WANDB_PROJECT, name="1.5b-level1")
 trainer = GRPOTrainer(
     processing_class=tokenizer,
     reward_funcs=[reward_fn],
     args=GRPOConfig(
+        output_dir="./deceit-1.5b",
         max_steps=150,
         per_device_train_batch_size=2,
         num_generations=4,
 )
 trainer.train()
 wandb.finish()
+print("Training done!")
+model.save_pretrained("deceit-1.5b-final")
+tokenizer.save_pretrained("deceit-1.5b-final")
 model.push_to_hub(HF_REPO_ID)
 tokenizer.push_to_hub(HF_REPO_ID)
 print(f"Saved to {HF_REPO_ID}")