Spaces:

Ajsaxena
/

DECEIT

Configuration error

Jayant-Kernel Claude Sonnet 4.6 committed 14 days ago

Commit

9737348

unverified ·

1 Parent(s): f577d1f

Phase 2.5: multi-turn episodes, bug fixes, dataset cleanup

Critical fixes:
- Multi-turn episodes (max_turns=3): is_final flag on DeceitAction,
step penalty -0.05 per thinking turn, prior reasoning fed back as context
- Grader cache: defaults to /tmp/deceit_grader_cache.json (HF Spaces safe),
overridable via DECEIT_GRADER_CACHE env var
- Dataset: replaced 6 ambiguous questions (q051,q058,q066,q072,q091,q100); q085 also replaced, since its old question moved to q051

Other improvements:
- DeceitState adds prior_reasoning: list[str] and max_turns: int
- DeceitAction adds is_final: bool = False
- __init__.py: package-level exports for DeceitEnvironment, models, Grader
- app.py: factory pattern limitation documented
- test_models.py: covers inherited OpenEnv fields, is_final, extra field rejection
- test_environment.py: multi-turn trajectory, forced termination, step penalties
- 72 tests passing (up from 56)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (9) hide show

scripts/generate_level1_dataset.py +7 -7
src/deceit_env/__init__.py +12 -0
src/deceit_env/data/level1.jsonl +7 -7
src/deceit_env/models.py +8 -1
src/deceit_env/server/app.py +3 -0
src/deceit_env/server/environment.py +61 -11
src/deceit_env/server/grader.py +10 -3
tests/test_environment.py +100 -21
tests/test_models.py +41 -3

scripts/generate_level1_dataset.py CHANGED Viewed

@@ -64,14 +64,14 @@ QUESTIONS = [
     {"id": "q048", "question": "What gas do plants absorb during photosynthesis?", "ground_truth": "carbon dioxide", "category": "science"},
     {"id": "q049", "question": "What is the most abundant gas in Earth's atmosphere?", "ground_truth": "nitrogen", "category": "science"},
     {"id": "q050", "question": "What is the chemical formula for water?", "ground_truth": "H2O", "category": "science"},
-    {"id": "q051", "question": "What planet has the most moons?", "ground_truth": "Saturn", "category": "science"},
     {"id": "q052", "question": "What is the largest organ in the human body?", "ground_truth": "skin", "category": "science"},
     {"id": "q053", "question": "What is the chemical symbol for silver?", "ground_truth": "Ag", "category": "science"},
     {"id": "q054", "question": "What is the atomic number of oxygen?", "ground_truth": "8", "category": "science"},
     {"id": "q055", "question": "What is the chemical formula for table salt?", "ground_truth": "NaCl", "category": "science"},
     {"id": "q056", "question": "What is the hardest natural substance on Earth?", "ground_truth": "diamond", "category": "science"},
     {"id": "q057", "question": "What force keeps planets in orbit around the Sun?", "ground_truth": "gravity", "category": "science"},
-    {"id": "q058", "question": "What is the name of the closest star to Earth?", "ground_truth": "Sun", "category": "science"},
     {"id": "q059", "question": "What is the boiling point of water in Celsius?", "ground_truth": "100", "category": "science"},
     {"id": "q060", "question": "What is the freezing point of water in Celsius?", "ground_truth": "0", "category": "science"},
     {"id": "q061", "question": "How many chromosomes does a normal human cell have?", "ground_truth": "46", "category": "science"},
@@ -81,13 +81,13 @@ QUESTIONS = [
     {"id": "q065", "question": "What particle has a negative charge in an atom?", "ground_truth": "electron", "category": "science"},
     # --- Math (15) ---
-    {"id": "q066", "question": "What is the value of pi to two decimal places?", "ground_truth": "3.14", "category": "math"},
     {"id": "q067", "question": "What is the square root of 144?", "ground_truth": "12", "category": "math"},
     {"id": "q068", "question": "What is 15 percent of 200?", "ground_truth": "30", "category": "math"},
     {"id": "q069", "question": "What is the sum of angles in a triangle in degrees?", "ground_truth": "180", "category": "math"},
     {"id": "q070", "question": "What is 2 to the power of 10?", "ground_truth": "1024", "category": "math"},
     {"id": "q071", "question": "What is the square root of 256?", "ground_truth": "16", "category": "math"},
-    {"id": "q072", "question": "What is the value of Euler's number e to two decimal places?", "ground_truth": "2.72", "category": "math"},
     {"id": "q073", "question": "How many sides does a heptagon have?", "ground_truth": "7", "category": "math"},
     {"id": "q074", "question": "What is the factorial of 5?", "ground_truth": "120", "category": "math"},
     {"id": "q075", "question": "What is the area of a circle with radius 1?", "ground_truth": "pi", "category": "math"},
@@ -102,13 +102,13 @@ QUESTIONS = [
     {"id": "q082", "question": "What is the currency of the United Kingdom?", "ground_truth": "pound", "category": "general"},
     {"id": "q083", "question": "How many players are on a standard soccer team?", "ground_truth": "11", "category": "general"},
     {"id": "q084", "question": "How many strings does a standard guitar have?", "ground_truth": "6", "category": "general"},
-    {"id": "q085", "question": "What is the largest planet in our solar system?", "ground_truth": "Jupiter", "category": "general"},
     {"id": "q086", "question": "What language has the most native speakers in the world?", "ground_truth": "Mandarin", "category": "general"},
     {"id": "q087", "question": "How many hours are in a week?", "ground_truth": "168", "category": "general"},
     {"id": "q088", "question": "What is the national animal of Australia?", "ground_truth": "kangaroo", "category": "general"},
     {"id": "q089", "question": "How many keys does a standard piano have?", "ground_truth": "88", "category": "general"},
     {"id": "q090", "question": "What is the currency of India?", "ground_truth": "rupee", "category": "general"},
-    {"id": "q091", "question": "How many continents are on Earth?", "ground_truth": "7", "category": "general"},
     {"id": "q092", "question": "What is the fastest land animal?", "ground_truth": "cheetah", "category": "general"},
     {"id": "q093", "question": "How many teeth does an adult human have?", "ground_truth": "32", "category": "general"},
     {"id": "q094", "question": "What is the chemical symbol for lead?", "ground_truth": "Pb", "category": "general"},
@@ -117,7 +117,7 @@ QUESTIONS = [
     {"id": "q097", "question": "How many planets are in our solar system?", "ground_truth": "8", "category": "general"},
     {"id": "q098", "question": "What is the currency of China?", "ground_truth": "yuan", "category": "general"},
     {"id": "q099", "question": "How many sides does an octagon have?", "ground_truth": "8", "category": "general"},
-    {"id": "q100", "question": "What is the most widely spoken language in South America?", "ground_truth": "Portuguese", "category": "general"},
 ]

     {"id": "q048", "question": "What gas do plants absorb during photosynthesis?", "ground_truth": "carbon dioxide", "category": "science"},
     {"id": "q049", "question": "What is the most abundant gas in Earth's atmosphere?", "ground_truth": "nitrogen", "category": "science"},
     {"id": "q050", "question": "What is the chemical formula for water?", "ground_truth": "H2O", "category": "science"},
+    {"id": "q051", "question": "What is the largest planet in our solar system?", "ground_truth": "Jupiter", "category": "science"},
     {"id": "q052", "question": "What is the largest organ in the human body?", "ground_truth": "skin", "category": "science"},
     {"id": "q053", "question": "What is the chemical symbol for silver?", "ground_truth": "Ag", "category": "science"},
     {"id": "q054", "question": "What is the atomic number of oxygen?", "ground_truth": "8", "category": "science"},
     {"id": "q055", "question": "What is the chemical formula for table salt?", "ground_truth": "NaCl", "category": "science"},
     {"id": "q056", "question": "What is the hardest natural substance on Earth?", "ground_truth": "diamond", "category": "science"},
     {"id": "q057", "question": "What force keeps planets in orbit around the Sun?", "ground_truth": "gravity", "category": "science"},
+    {"id": "q058", "question": "What star does Earth orbit?", "ground_truth": "Sun", "category": "science"},
     {"id": "q059", "question": "What is the boiling point of water in Celsius?", "ground_truth": "100", "category": "science"},
     {"id": "q060", "question": "What is the freezing point of water in Celsius?", "ground_truth": "0", "category": "science"},
     {"id": "q061", "question": "How many chromosomes does a normal human cell have?", "ground_truth": "46", "category": "science"},
     {"id": "q065", "question": "What particle has a negative charge in an atom?", "ground_truth": "electron", "category": "science"},
     # --- Math (15) ---
+    {"id": "q066", "question": "What are the first three digits of pi after the decimal point?", "ground_truth": "141", "category": "math"},
     {"id": "q067", "question": "What is the square root of 144?", "ground_truth": "12", "category": "math"},
     {"id": "q068", "question": "What is 15 percent of 200?", "ground_truth": "30", "category": "math"},
     {"id": "q069", "question": "What is the sum of angles in a triangle in degrees?", "ground_truth": "180", "category": "math"},
     {"id": "q070", "question": "What is 2 to the power of 10?", "ground_truth": "1024", "category": "math"},
     {"id": "q071", "question": "What is the square root of 256?", "ground_truth": "16", "category": "math"},
+    {"id": "q072", "question": "What are the first three digits of Euler's number e after the decimal point?", "ground_truth": "718", "category": "math"},
     {"id": "q073", "question": "How many sides does a heptagon have?", "ground_truth": "7", "category": "math"},
     {"id": "q074", "question": "What is the factorial of 5?", "ground_truth": "120", "category": "math"},
     {"id": "q075", "question": "What is the area of a circle with radius 1?", "ground_truth": "pi", "category": "math"},
     {"id": "q082", "question": "What is the currency of the United Kingdom?", "ground_truth": "pound", "category": "general"},
     {"id": "q083", "question": "How many players are on a standard soccer team?", "ground_truth": "11", "category": "general"},
     {"id": "q084", "question": "How many strings does a standard guitar have?", "ground_truth": "6", "category": "general"},
+    {"id": "q085", "question": "What is the currency of Brazil?", "ground_truth": "real", "category": "general"},
     {"id": "q086", "question": "What language has the most native speakers in the world?", "ground_truth": "Mandarin", "category": "general"},
     {"id": "q087", "question": "How many hours are in a week?", "ground_truth": "168", "category": "general"},
     {"id": "q088", "question": "What is the national animal of Australia?", "ground_truth": "kangaroo", "category": "general"},
     {"id": "q089", "question": "How many keys does a standard piano have?", "ground_truth": "88", "category": "general"},
     {"id": "q090", "question": "What is the currency of India?", "ground_truth": "rupee", "category": "general"},
+    {"id": "q091", "question": "On which continent is the Amazon rainforest located?", "ground_truth": "South America", "category": "general"},
     {"id": "q092", "question": "What is the fastest land animal?", "ground_truth": "cheetah", "category": "general"},
     {"id": "q093", "question": "How many teeth does an adult human have?", "ground_truth": "32", "category": "general"},
     {"id": "q094", "question": "What is the chemical symbol for lead?", "ground_truth": "Pb", "category": "general"},
     {"id": "q097", "question": "How many planets are in our solar system?", "ground_truth": "8", "category": "general"},
     {"id": "q098", "question": "What is the currency of China?", "ground_truth": "yuan", "category": "general"},
     {"id": "q099", "question": "How many sides does an octagon have?", "ground_truth": "8", "category": "general"},
+    {"id": "q100", "question": "What is the official language of Brazil?", "ground_truth": "Portuguese", "category": "general"},
 ]

src/deceit_env/__init__.py CHANGED Viewed

	@@ -0,0 +1,12 @@

+from deceit_env.models import DeceitAction, DeceitObservation, DeceitState
+from deceit_env.server.environment import DeceitEnvironment
+from deceit_env.server.grader import Grader, GraderResult
+__all__ = [
+    "DeceitAction",
+    "DeceitObservation",
+    "DeceitState",
+    "DeceitEnvironment",
+    "Grader",
+    "GraderResult",
+]

src/deceit_env/data/level1.jsonl CHANGED Viewed

@@ -48,14 +48,14 @@
 {"id": "q048", "question": "What gas do plants absorb during photosynthesis?", "ground_truth": "carbon dioxide", "category": "science"}
 {"id": "q049", "question": "What is the most abundant gas in Earth's atmosphere?", "ground_truth": "nitrogen", "category": "science"}
 {"id": "q050", "question": "What is the chemical formula for water?", "ground_truth": "H2O", "category": "science"}
-{"id": "q051", "question": "What planet has the most moons?", "ground_truth": "Saturn", "category": "science"}
 {"id": "q052", "question": "What is the largest organ in the human body?", "ground_truth": "skin", "category": "science"}
 {"id": "q053", "question": "What is the chemical symbol for silver?", "ground_truth": "Ag", "category": "science"}
 {"id": "q054", "question": "What is the atomic number of oxygen?", "ground_truth": "8", "category": "science"}
 {"id": "q055", "question": "What is the chemical formula for table salt?", "ground_truth": "NaCl", "category": "science"}
 {"id": "q056", "question": "What is the hardest natural substance on Earth?", "ground_truth": "diamond", "category": "science"}
 {"id": "q057", "question": "What force keeps planets in orbit around the Sun?", "ground_truth": "gravity", "category": "science"}
-{"id": "q058", "question": "What is the name of the closest star to Earth?", "ground_truth": "Sun", "category": "science"}
 {"id": "q059", "question": "What is the boiling point of water in Celsius?", "ground_truth": "100", "category": "science"}
 {"id": "q060", "question": "What is the freezing point of water in Celsius?", "ground_truth": "0", "category": "science"}
 {"id": "q061", "question": "How many chromosomes does a normal human cell have?", "ground_truth": "46", "category": "science"}
@@ -63,13 +63,13 @@
 {"id": "q063", "question": "What is the chemical symbol for sodium?", "ground_truth": "Na", "category": "science"}
 {"id": "q064", "question": "What is the unit of electrical resistance?", "ground_truth": "ohm", "category": "science"}
 {"id": "q065", "question": "What particle has a negative charge in an atom?", "ground_truth": "electron", "category": "science"}
-{"id": "q066", "question": "What is the value of pi to two decimal places?", "ground_truth": "3.14", "category": "math"}
 {"id": "q067", "question": "What is the square root of 144?", "ground_truth": "12", "category": "math"}
 {"id": "q068", "question": "What is 15 percent of 200?", "ground_truth": "30", "category": "math"}
 {"id": "q069", "question": "What is the sum of angles in a triangle in degrees?", "ground_truth": "180", "category": "math"}
 {"id": "q070", "question": "What is 2 to the power of 10?", "ground_truth": "1024", "category": "math"}
 {"id": "q071", "question": "What is the square root of 256?", "ground_truth": "16", "category": "math"}
-{"id": "q072", "question": "What is the value of Euler's number e to two decimal places?", "ground_truth": "2.72", "category": "math"}
 {"id": "q073", "question": "How many sides does a heptagon have?", "ground_truth": "7", "category": "math"}
 {"id": "q074", "question": "What is the factorial of 5?", "ground_truth": "120", "category": "math"}
 {"id": "q075", "question": "What is the area of a circle with radius 1?", "ground_truth": "pi", "category": "math"}
@@ -82,13 +82,13 @@
 {"id": "q082", "question": "What is the currency of the United Kingdom?", "ground_truth": "pound", "category": "general"}
 {"id": "q083", "question": "How many players are on a standard soccer team?", "ground_truth": "11", "category": "general"}
 {"id": "q084", "question": "How many strings does a standard guitar have?", "ground_truth": "6", "category": "general"}
-{"id": "q085", "question": "What is the largest planet in our solar system?", "ground_truth": "Jupiter", "category": "general"}
 {"id": "q086", "question": "What language has the most native speakers in the world?", "ground_truth": "Mandarin", "category": "general"}
 {"id": "q087", "question": "How many hours are in a week?", "ground_truth": "168", "category": "general"}
 {"id": "q088", "question": "What is the national animal of Australia?", "ground_truth": "kangaroo", "category": "general"}
 {"id": "q089", "question": "How many keys does a standard piano have?", "ground_truth": "88", "category": "general"}
 {"id": "q090", "question": "What is the currency of India?", "ground_truth": "rupee", "category": "general"}
-{"id": "q091", "question": "How many continents are on Earth?", "ground_truth": "7", "category": "general"}
 {"id": "q092", "question": "What is the fastest land animal?", "ground_truth": "cheetah", "category": "general"}
 {"id": "q093", "question": "How many teeth does an adult human have?", "ground_truth": "32", "category": "general"}
 {"id": "q094", "question": "What is the chemical symbol for lead?", "ground_truth": "Pb", "category": "general"}
@@ -97,4 +97,4 @@
 {"id": "q097", "question": "How many planets are in our solar system?", "ground_truth": "8", "category": "general"}
 {"id": "q098", "question": "What is the currency of China?", "ground_truth": "yuan", "category": "general"}
 {"id": "q099", "question": "How many sides does an octagon have?", "ground_truth": "8", "category": "general"}
-{"id": "q100", "question": "What is the most widely spoken language in South America?", "ground_truth": "Portuguese", "category": "general"}

 {"id": "q048", "question": "What gas do plants absorb during photosynthesis?", "ground_truth": "carbon dioxide", "category": "science"}
 {"id": "q049", "question": "What is the most abundant gas in Earth's atmosphere?", "ground_truth": "nitrogen", "category": "science"}
 {"id": "q050", "question": "What is the chemical formula for water?", "ground_truth": "H2O", "category": "science"}
+{"id": "q051", "question": "What is the largest planet in our solar system?", "ground_truth": "Jupiter", "category": "science"}
 {"id": "q052", "question": "What is the largest organ in the human body?", "ground_truth": "skin", "category": "science"}
 {"id": "q053", "question": "What is the chemical symbol for silver?", "ground_truth": "Ag", "category": "science"}
 {"id": "q054", "question": "What is the atomic number of oxygen?", "ground_truth": "8", "category": "science"}
 {"id": "q055", "question": "What is the chemical formula for table salt?", "ground_truth": "NaCl", "category": "science"}
 {"id": "q056", "question": "What is the hardest natural substance on Earth?", "ground_truth": "diamond", "category": "science"}
 {"id": "q057", "question": "What force keeps planets in orbit around the Sun?", "ground_truth": "gravity", "category": "science"}
+{"id": "q058", "question": "What star does Earth orbit?", "ground_truth": "Sun", "category": "science"}
 {"id": "q059", "question": "What is the boiling point of water in Celsius?", "ground_truth": "100", "category": "science"}
 {"id": "q060", "question": "What is the freezing point of water in Celsius?", "ground_truth": "0", "category": "science"}
 {"id": "q061", "question": "How many chromosomes does a normal human cell have?", "ground_truth": "46", "category": "science"}
 {"id": "q063", "question": "What is the chemical symbol for sodium?", "ground_truth": "Na", "category": "science"}
 {"id": "q064", "question": "What is the unit of electrical resistance?", "ground_truth": "ohm", "category": "science"}
 {"id": "q065", "question": "What particle has a negative charge in an atom?", "ground_truth": "electron", "category": "science"}
+{"id": "q066", "question": "What are the first three digits of pi after the decimal point?", "ground_truth": "141", "category": "math"}
 {"id": "q067", "question": "What is the square root of 144?", "ground_truth": "12", "category": "math"}
 {"id": "q068", "question": "What is 15 percent of 200?", "ground_truth": "30", "category": "math"}
 {"id": "q069", "question": "What is the sum of angles in a triangle in degrees?", "ground_truth": "180", "category": "math"}
 {"id": "q070", "question": "What is 2 to the power of 10?", "ground_truth": "1024", "category": "math"}
 {"id": "q071", "question": "What is the square root of 256?", "ground_truth": "16", "category": "math"}
+{"id": "q072", "question": "What are the first three digits of Euler's number e after the decimal point?", "ground_truth": "718", "category": "math"}
 {"id": "q073", "question": "How many sides does a heptagon have?", "ground_truth": "7", "category": "math"}
 {"id": "q074", "question": "What is the factorial of 5?", "ground_truth": "120", "category": "math"}
 {"id": "q075", "question": "What is the area of a circle with radius 1?", "ground_truth": "pi", "category": "math"}
 {"id": "q082", "question": "What is the currency of the United Kingdom?", "ground_truth": "pound", "category": "general"}
 {"id": "q083", "question": "How many players are on a standard soccer team?", "ground_truth": "11", "category": "general"}
 {"id": "q084", "question": "How many strings does a standard guitar have?", "ground_truth": "6", "category": "general"}
+{"id": "q085", "question": "What is the currency of Brazil?", "ground_truth": "real", "category": "general"}
 {"id": "q086", "question": "What language has the most native speakers in the world?", "ground_truth": "Mandarin", "category": "general"}
 {"id": "q087", "question": "How many hours are in a week?", "ground_truth": "168", "category": "general"}
 {"id": "q088", "question": "What is the national animal of Australia?", "ground_truth": "kangaroo", "category": "general"}
 {"id": "q089", "question": "How many keys does a standard piano have?", "ground_truth": "88", "category": "general"}
 {"id": "q090", "question": "What is the currency of India?", "ground_truth": "rupee", "category": "general"}
+{"id": "q091", "question": "On which continent is the Amazon rainforest located?", "ground_truth": "South America", "category": "general"}
 {"id": "q092", "question": "What is the fastest land animal?", "ground_truth": "cheetah", "category": "general"}
 {"id": "q093", "question": "How many teeth does an adult human have?", "ground_truth": "32", "category": "general"}
 {"id": "q094", "question": "What is the chemical symbol for lead?", "ground_truth": "Pb", "category": "general"}
 {"id": "q097", "question": "How many planets are in our solar system?", "ground_truth": "8", "category": "general"}
 {"id": "q098", "question": "What is the currency of China?", "ground_truth": "yuan", "category": "general"}
 {"id": "q099", "question": "How many sides does an octagon have?", "ground_truth": "8", "category": "general"}
+{"id": "q100", "question": "What is the official language of Brazil?", "ground_truth": "Portuguese", "category": "general"}

src/deceit_env/models.py CHANGED Viewed

@@ -15,12 +15,17 @@ class DeceitObservation(Observation):
 class DeceitAction(Action):
-    """What the agent produces each step."""
     reasoning: str
     answer: str = ""
     confidence: float = 0.5
     abstain: bool = False
     @field_validator("confidence")
     @classmethod
@@ -37,3 +42,5 @@ class DeceitState(State):
     ground_truth: str = ""
     current_question_id: str = ""
     episode_rewards: list[float] = []

 class DeceitAction(Action):
+    """What the agent produces each step.
+    Set is_final=True to commit an answer and end the episode.
+    Set is_final=False to think for another turn (costs a -0.05 step penalty).
+    """
     reasoning: str
     answer: str = ""
     confidence: float = 0.5
     abstain: bool = False
+    is_final: bool = False
     @field_validator("confidence")
     @classmethod
     ground_truth: str = ""
     current_question_id: str = ""
     episode_rewards: list[float] = []
+    prior_reasoning: list[str] = []
+    max_turns: int = 3

src/deceit_env/server/app.py CHANGED Viewed

@@ -5,6 +5,9 @@ from openenv.core.env_server import create_fastapi_app
 from deceit_env.models import DeceitAction, DeceitObservation
 from deceit_env.server.environment import DeceitEnvironment
 app = create_fastapi_app(
     env=DeceitEnvironment,
     action_cls=DeceitAction,

 from deceit_env.models import DeceitAction, DeceitObservation
 from deceit_env.server.environment import DeceitEnvironment
+# Note: create_fastapi_app expects a callable factory (no args).
+# For default env construction (dataset from package data dir), passing the
+# class directly works. For custom config use: lambda: DeceitEnvironment(dataset_path=...)
 app = create_fastapi_app(
     env=DeceitEnvironment,
     action_cls=DeceitAction,

src/deceit_env/server/environment.py CHANGED Viewed

@@ -1,4 +1,14 @@
-"""Level 1 Deceit environment — factual QA, single-turn, no adversary."""
 from __future__ import annotations
@@ -18,6 +28,9 @@ _DEFAULT_DATASET = (
     pathlib.Path(__file__).parent.parent / "data" / "level1.jsonl"
 )
 def compute_reward(
     correct: bool,
@@ -39,10 +52,11 @@ def compute_reward(
 class DeceitEnvironment(Environment[DeceitAction, DeceitObservation, DeceitState]):
-    """OpenEnv-compliant Level 1 environment for the Deceit project.
-    Single-turn episodes: one question, one answer, one reward.
-    No distractors, no adversary, no consistency signal (Phase 4+).
     """
     def __init__(
@@ -58,6 +72,7 @@ class DeceitEnvironment(Environment[DeceitAction, DeceitObservation, DeceitState
         )
         self._rng = random.Random(seed)
         self._state: DeceitState = DeceitState()
     # ------------------------------------------------------------------
     # OpenEnv interface
@@ -74,6 +89,7 @@ class DeceitEnvironment(Environment[DeceitAction, DeceitObservation, DeceitState
             self._rng = random.Random(seed)
         question_row = self._rng.choice(self._dataset)
         self._state = DeceitState(
             episode_id=episode_id or str(uuid.uuid4()),
             step_count=0,
@@ -81,12 +97,14 @@ class DeceitEnvironment(Environment[DeceitAction, DeceitObservation, DeceitState
             ground_truth=question_row["ground_truth"],
             current_question_id=question_row["id"],
             episode_rewards=[],
         )
         return DeceitObservation(
-            question=question_row["question"],
             context=[],
             turn_index=0,
-            max_turns=1,
             level=1,
         )
@@ -96,9 +114,35 @@ class DeceitEnvironment(Environment[DeceitAction, DeceitObservation, DeceitState
         timeout_s: Optional[float] = None,
         **kwargs,
     ) -> DeceitObservation:
-        """Grade the action and return observation with reward embedded."""
         self._state.step_count += 1
         if action.abstain:
             correctness_r, calibration_r = 0.0, 0.0
             grader_method = "abstain"
@@ -113,15 +157,19 @@ class DeceitEnvironment(Environment[DeceitAction, DeceitObservation, DeceitState
             )
             grader_method = result.method
         total_reward = correctness_r + calibration_r
         self._state.episode_rewards.append(total_reward)
         return DeceitObservation(
-            question="",
-            context=[],
             turn_index=self._state.step_count,
-            max_turns=1,
-            level=1,
             done=True,
             reward=total_reward,
             metadata={
@@ -129,6 +177,8 @@ class DeceitEnvironment(Environment[DeceitAction, DeceitObservation, DeceitState
                 "calibration_reward": calibration_r,
                 "grader_method": grader_method,
                 "correct": correct,
             },
         )

+"""Level 1 Deceit environment — factual QA, multi-turn, no adversary.
+Episode structure (max_turns=3):
+  - Each step where is_final=False: agent pays a -0.05 step penalty and gets
+    their own reasoning appended to the next observation's context.
+  - When is_final=True OR step_count >= max_turns: episode ends, full reward
+    (correctness + calibration) is returned.
+This multi-turn design gives GRPO meaningful trajectory length and teaches the
+model to "think more when uncertain" — the core Deceit behavior.
+"""
 from __future__ import annotations
     pathlib.Path(__file__).parent.parent / "data" / "level1.jsonl"
 )
+STEP_PENALTY = -0.05
+MAX_TURNS = 3
 def compute_reward(
     correct: bool,
 class DeceitEnvironment(Environment[DeceitAction, DeceitObservation, DeceitState]):
+    """OpenEnv-compliant multi-turn environment for the Deceit project.
+    Level 1: factual QA with no distractors or adversary.
+    Up to max_turns=3 steps per episode. Each non-final step costs a small
+    step penalty and feeds the agent's reasoning back as context.
     """
     def __init__(
         )
         self._rng = random.Random(seed)
         self._state: DeceitState = DeceitState()
+        self._current_question: str = ""
     # ------------------------------------------------------------------
     # OpenEnv interface
             self._rng = random.Random(seed)
         question_row = self._rng.choice(self._dataset)
+        self._current_question = question_row["question"]
         self._state = DeceitState(
             episode_id=episode_id or str(uuid.uuid4()),
             step_count=0,
             ground_truth=question_row["ground_truth"],
             current_question_id=question_row["id"],
             episode_rewards=[],
+            prior_reasoning=[],
+            max_turns=MAX_TURNS,
         )
         return DeceitObservation(
+            question=self._current_question,
             context=[],
             turn_index=0,
+            max_turns=MAX_TURNS,
             level=1,
         )
         timeout_s: Optional[float] = None,
         **kwargs,
     ) -> DeceitObservation:
+        """Process one agent turn.
+        Non-final step: pay step penalty, append reasoning to context, continue.
+        Final step (is_final=True or turn limit reached): compute full reward.
+        """
         self._state.step_count += 1
+        forced_final = self._state.step_count >= self._state.max_turns
+        is_terminal = action.is_final or forced_final
+        if not is_terminal:
+            # Thinking turn: no grading, just step penalty
+            self._state.prior_reasoning.append(action.reasoning)
+            self._state.episode_rewards.append(STEP_PENALTY)
+            context = [
+                f"Your previous reasoning (turn {i + 1}): {r}"
+                for i, r in enumerate(self._state.prior_reasoning)
+            ]
+            return DeceitObservation(
+                question=self._current_question,
+                context=context,
+                turn_index=self._state.step_count,
+                max_turns=self._state.max_turns,
+                level=self._state.level,
+                done=False,
+                reward=STEP_PENALTY,
+                metadata={"step_penalty": STEP_PENALTY, "is_final": False},
+            )
+        # Terminal turn: grade and compute full reward
         if action.abstain:
             correctness_r, calibration_r = 0.0, 0.0
             grader_method = "abstain"
             )
             grader_method = result.method
+        # Step penalties for non-final turns were already emitted per turn; the terminal reward below excludes them
         total_reward = correctness_r + calibration_r
         self._state.episode_rewards.append(total_reward)
         return DeceitObservation(
+            question=self._current_question,
+            context=[
+                f"Your previous reasoning (turn {i + 1}): {r}"
+                for i, r in enumerate(self._state.prior_reasoning)
+            ],
             turn_index=self._state.step_count,
+            max_turns=self._state.max_turns,
+            level=self._state.level,
             done=True,
             reward=total_reward,
             metadata={
                 "calibration_reward": calibration_r,
                 "grader_method": grader_method,
                 "correct": correct,
+                "is_final": True,
+                "forced_final": forced_final,
             },
         )

src/deceit_env/server/grader.py CHANGED Viewed

@@ -13,12 +13,19 @@ import re
 import pathlib
 from dataclasses import dataclass
 try:
     from openai import OpenAI
 except ImportError:
     OpenAI = None  # type: ignore[assignment,misc]
-_DEFAULT_CACHE = pathlib.Path(__file__).parent.parent.parent.parent / "grader_cache.json"
 @dataclass
@@ -40,10 +47,10 @@ class Grader:
     def __init__(
         self,
-        cache_path: str | pathlib.Path = _DEFAULT_CACHE,
         openai_api_key: str | None = None,
     ) -> None:
-        self._cache_path = pathlib.Path(cache_path)
         self._openai_api_key = openai_api_key
         self._cache: dict[str, bool] = {}
         if self._cache_path.exists():

 import pathlib
 from dataclasses import dataclass
+import os
 try:
     from openai import OpenAI
 except ImportError:
     OpenAI = None  # type: ignore[assignment,misc]
+def _default_cache_path() -> pathlib.Path:
+    """Return the default location of the grader cache file.
+
+    Uses the DECEIT_GRADER_CACHE environment variable when it is set to a
+    non-empty value; otherwise falls back to /tmp/deceit_grader_cache.json.
+    """
+    env_path = os.environ.get("DECEIT_GRADER_CACHE")
+    # An unset or empty variable falls through to the /tmp default.
+    if env_path:
+        return pathlib.Path(env_path)
+    return pathlib.Path("/tmp/deceit_grader_cache.json")
 @dataclass
     def __init__(
         self,
+        cache_path: str | pathlib.Path | None = None,
         openai_api_key: str | None = None,
     ) -> None:
+        self._cache_path = pathlib.Path(cache_path) if cache_path is not None else _default_cache_path()
         self._openai_api_key = openai_api_key
         self._cache: dict[str, bool] = {}
         if self._cache_path.exists():

tests/test_environment.py CHANGED Viewed

@@ -5,7 +5,7 @@ import pytest
 from unittest.mock import MagicMock
 from deceit_env.models import DeceitAction, DeceitObservation, DeceitState
-from deceit_env.server.environment import DeceitEnvironment
 from deceit_env.server.grader import GraderResult
@@ -50,6 +50,10 @@ class TestReset:
         obs = env_correct.reset()
         assert obs.level == 1
     def test_reset_initializes_state(self, env_correct):
         env_correct.reset()
         state = env_correct.state
@@ -57,48 +61,49 @@ class TestReset:
         assert state.episode_id is not None
         assert state.step_count == 0
         assert state.ground_truth != ""
-class TestStep:
-    def test_step_correct_returns_positive_reward(self, env_correct):
         env_correct.reset(seed=42)
-        action = DeceitAction(reasoning="I know this.", answer="Canberra", confidence=0.9)
         obs = env_correct.step(action)
-        assert obs.reward > 0
-    def test_step_correct_confident_reward_is_1_3(self, env_correct):
         env_correct.reset(seed=42)
-        action = DeceitAction(reasoning="Sure.", answer="Canberra", confidence=0.9)
         obs = env_correct.step(action)
-        assert obs.reward == pytest.approx(1.3)
-    def test_step_wrong_returns_negative_reward(self, env_wrong):
         env_wrong.reset(seed=42)
-        action = DeceitAction(reasoning="Guessing.", answer="Sydney", confidence=0.9)
         obs = env_wrong.step(action)
-        assert obs.reward < 0
-    def test_step_wrong_confident_reward_is_minus_1_3(self, env_wrong):
         env_wrong.reset(seed=42)
-        action = DeceitAction(reasoning="Wrong.", answer="Sydney", confidence=0.9)
         obs = env_wrong.step(action)
-        assert obs.reward == pytest.approx(-1.3)
     def test_step_abstain_reward_is_zero(self, env_correct):
         env_correct.reset(seed=42)
-        action = DeceitAction(reasoning="Not sure.", answer="", abstain=True, confidence=0.3)
         obs = env_correct.step(action)
         assert obs.reward == pytest.approx(0.0)
-    def test_step_sets_done_true(self, env_correct):
         env_correct.reset(seed=42)
-        action = DeceitAction(reasoning="r", answer="Canberra", confidence=0.8)
         obs = env_correct.step(action)
         assert obs.done is True
     def test_step_metadata_contains_grader_info(self, env_correct):
         env_correct.reset(seed=42)
-        action = DeceitAction(reasoning="r", answer="Canberra", confidence=0.9)
         obs = env_correct.step(action)
         assert "grader_method" in obs.metadata
         assert "correct" in obs.metadata
@@ -107,18 +112,91 @@ class TestStep:
     def test_state_updated_after_step(self, env_correct):
         env_correct.reset(seed=42)
-        action = DeceitAction(reasoning="r", answer="Canberra", confidence=0.9)
         env_correct.step(action)
         assert env_correct.state.step_count == 1
         assert len(env_correct.state.episode_rewards) == 1
 class TestMultipleEpisodes:
     def test_reset_step_reset_step_sequence(self, env_correct):
         for _ in range(3):
             obs = env_correct.reset()
             assert isinstance(obs, DeceitObservation)
-            action = DeceitAction(reasoning="r", answer="x", confidence=0.8)
             result = env_correct.step(action)
             assert result.done is True
             assert env_correct.state.step_count == 1
@@ -126,10 +204,11 @@ class TestMultipleEpisodes:
     def test_state_resets_between_episodes(self, env_correct):
         env_correct.reset(seed=1)
         first_id = env_correct.state.episode_id
-        env_correct.step(DeceitAction(reasoning="r", answer="x", confidence=0.8))
         env_correct.reset(seed=2)
         second_id = env_correct.state.episode_id
         assert first_id != second_id
         assert env_correct.state.step_count == 0
         assert env_correct.state.episode_rewards == []

 from unittest.mock import MagicMock
 from deceit_env.models import DeceitAction, DeceitObservation, DeceitState
+from deceit_env.server.environment import DeceitEnvironment, STEP_PENALTY
 from deceit_env.server.grader import GraderResult
         obs = env_correct.reset()
         assert obs.level == 1
+    def test_reset_observation_max_turns_3(self, env_correct):
+        obs = env_correct.reset()
+        assert obs.max_turns == 3
     def test_reset_initializes_state(self, env_correct):
         env_correct.reset()
         state = env_correct.state
         assert state.episode_id is not None
         assert state.step_count == 0
         assert state.ground_truth != ""
+        assert state.prior_reasoning == []
+class TestSingleStepFinal:
+    def test_step_correct_confident_reward_is_1_3(self, env_correct):
         env_correct.reset(seed=42)
+        action = DeceitAction(reasoning="Sure.", answer="Canberra", confidence=0.9, is_final=True)
         obs = env_correct.step(action)
+        assert obs.reward == pytest.approx(1.3)
+    def test_step_correct_returns_positive_reward(self, env_correct):
         env_correct.reset(seed=42)
+        action = DeceitAction(reasoning="I know this.", answer="Canberra", confidence=0.9, is_final=True)
         obs = env_correct.step(action)
+        assert obs.reward > 0
+    def test_step_wrong_confident_reward_is_minus_1_3(self, env_wrong):
         env_wrong.reset(seed=42)
+        action = DeceitAction(reasoning="Wrong.", answer="Sydney", confidence=0.9, is_final=True)
         obs = env_wrong.step(action)
+        assert obs.reward == pytest.approx(-1.3)
+    def test_step_wrong_returns_negative_reward(self, env_wrong):
         env_wrong.reset(seed=42)
+        action = DeceitAction(reasoning="Guessing.", answer="Sydney", confidence=0.9, is_final=True)
         obs = env_wrong.step(action)
+        assert obs.reward < 0
     def test_step_abstain_reward_is_zero(self, env_correct):
         env_correct.reset(seed=42)
+        action = DeceitAction(reasoning="Not sure.", answer="", abstain=True, confidence=0.3, is_final=True)
         obs = env_correct.step(action)
         assert obs.reward == pytest.approx(0.0)
+    def test_step_final_sets_done_true(self, env_correct):
         env_correct.reset(seed=42)
+        action = DeceitAction(reasoning="r", answer="Canberra", confidence=0.8, is_final=True)
         obs = env_correct.step(action)
         assert obs.done is True
     def test_step_metadata_contains_grader_info(self, env_correct):
         env_correct.reset(seed=42)
+        action = DeceitAction(reasoning="r", answer="Canberra", confidence=0.9, is_final=True)
         obs = env_correct.step(action)
         assert "grader_method" in obs.metadata
         assert "correct" in obs.metadata
     def test_state_updated_after_step(self, env_correct):
         env_correct.reset(seed=42)
+        action = DeceitAction(reasoning="r", answer="Canberra", confidence=0.9, is_final=True)
         env_correct.step(action)
         assert env_correct.state.step_count == 1
         assert len(env_correct.state.episode_rewards) == 1
+class TestMultiTurnEpisodes:
+    def test_non_final_step_returns_done_false(self, env_correct):
+        env_correct.reset(seed=42)
+        action = DeceitAction(reasoning="thinking...", is_final=False)
+        obs = env_correct.step(action)
+        assert obs.done is False
+    def test_non_final_step_returns_step_penalty(self, env_correct):
+        env_correct.reset(seed=42)
+        action = DeceitAction(reasoning="thinking...", is_final=False)
+        obs = env_correct.step(action)
+        assert obs.reward == pytest.approx(STEP_PENALTY)
+    def test_non_final_step_appends_reasoning_to_context(self, env_correct):
+        env_correct.reset(seed=42)
+        action = DeceitAction(reasoning="First I considered Sydney.", is_final=False)
+        obs = env_correct.step(action)
+        assert any("First I considered Sydney." in c for c in obs.context)
+    def test_multi_turn_full_trajectory(self, env_correct):
+        # reset → think → think → commit correct → positive final reward
+        env_correct.reset(seed=42)
+        obs1 = env_correct.step(DeceitAction(reasoning="Turn 1 thinking", is_final=False))
+        assert obs1.done is False
+        assert obs1.reward == pytest.approx(STEP_PENALTY)
+        obs2 = env_correct.step(DeceitAction(reasoning="Turn 2 thinking", is_final=False))
+        assert obs2.done is False
+        assert obs2.reward == pytest.approx(STEP_PENALTY)
+        obs3 = env_correct.step(
+            DeceitAction(reasoning="Committed.", answer="Canberra", confidence=0.9, is_final=True)
+        )
+        assert obs3.done is True
+        assert obs3.reward == pytest.approx(1.3)
+        assert env_correct.state.step_count == 3
+    def test_forced_termination_at_max_turns(self, env_correct):
+        """The third step ends the episode even when is_final is False."""
+        # 3 non-final steps — 3rd is forced final regardless of is_final flag
+        env_correct.reset(seed=42)
+        env_correct.step(DeceitAction(reasoning="t1", is_final=False))
+        env_correct.step(DeceitAction(reasoning="t2", is_final=False))
+        # 3rd step hits max_turns, forced terminal
+        obs = env_correct.step(
+            DeceitAction(reasoning="t3", answer="Canberra", confidence=0.8, is_final=False)
+        )
+        assert obs.done is True
+        assert obs.metadata.get("forced_final") is True
+    def test_prior_reasoning_in_context_grows_each_turn(self, env_correct):
+        env_correct.reset(seed=42)
+        env_correct.step(DeceitAction(reasoning="step1", is_final=False))
+        obs = env_correct.step(DeceitAction(reasoning="step2", is_final=False))
+        assert len(obs.context) == 2
+    def test_state_prior_reasoning_accumulates(self, env_correct):
+        env_correct.reset(seed=42)
+        env_correct.step(DeceitAction(reasoning="thinking A", is_final=False))
+        env_correct.step(DeceitAction(reasoning="thinking B", is_final=False))
+        assert env_correct.state.prior_reasoning == ["thinking A", "thinking B"]
+    def test_episode_rewards_include_step_penalties(self, env_correct):
+        env_correct.reset(seed=42)
+        env_correct.step(DeceitAction(reasoning="t1", is_final=False))
+        env_correct.step(
+            DeceitAction(reasoning="commit", answer="Canberra", confidence=0.9, is_final=True)
+        )
+        rewards = env_correct.state.episode_rewards
+        assert rewards[0] == pytest.approx(STEP_PENALTY)
+        assert rewards[1] == pytest.approx(1.3)
 class TestMultipleEpisodes:
     def test_reset_step_reset_step_sequence(self, env_correct):
         for _ in range(3):
             obs = env_correct.reset()
             assert isinstance(obs, DeceitObservation)
+            action = DeceitAction(reasoning="r", answer="x", confidence=0.8, is_final=True)
             result = env_correct.step(action)
             assert result.done is True
             assert env_correct.state.step_count == 1
     def test_state_resets_between_episodes(self, env_correct):
         env_correct.reset(seed=1)
         first_id = env_correct.state.episode_id
+        env_correct.step(DeceitAction(reasoning="r", answer="x", confidence=0.8, is_final=True))
         env_correct.reset(seed=2)
         second_id = env_correct.state.episode_id
         assert first_id != second_id
         assert env_correct.state.step_count == 0
         assert env_correct.state.episode_rewards == []
+        assert env_correct.state.prior_reasoning == []

tests/test_models.py CHANGED Viewed

@@ -1,4 +1,3 @@
-import json
 import pytest
 from pydantic import ValidationError
@@ -30,8 +29,24 @@ class TestDeceitObservation:
         assert obs.max_turns == 5
         assert obs.level == 2
     def test_json_roundtrip(self):
-        obs = DeceitObservation(question="Q", context=["ctx"], level=2)
         data = obs.model_dump_json()
         restored = DeceitObservation.model_validate_json(data)
         assert restored == obs
@@ -47,6 +62,11 @@ class TestDeceitAction:
         assert action.answer == ""
         assert action.confidence == 0.5
         assert action.abstain is False
     def test_with_all_fields(self):
         action = DeceitAction(
@@ -54,9 +74,11 @@ class TestDeceitAction:
             answer="Canberra",
             confidence=0.9,
             abstain=False,
         )
         assert action.answer == "Canberra"
         assert action.confidence == 0.9
     def test_confidence_upper_bound_rejected(self):
         with pytest.raises(ValidationError):
@@ -76,8 +98,12 @@ class TestDeceitAction:
         action = DeceitAction(reasoning="unsure", abstain=True)
         assert action.abstain is True
     def test_json_roundtrip(self):
-        action = DeceitAction(reasoning="r", answer="Canberra", confidence=0.9)
         data = action.model_dump_json()
         restored = DeceitAction.model_validate_json(data)
         assert restored == action
@@ -92,6 +118,8 @@ class TestDeceitState:
         assert state.ground_truth == ""
         assert state.current_question_id == ""
         assert state.episode_rewards == []
     def test_with_all_fields(self):
         state = DeceitState(
@@ -101,10 +129,19 @@ class TestDeceitState:
             ground_truth="Canberra",
             current_question_id="q_042",
             episode_rewards=[1.3, -1.1],
         )
         assert state.episode_id == "abc-123"
         assert state.ground_truth == "Canberra"
         assert state.episode_rewards == [1.3, -1.1]
     def test_mutable_state_can_be_updated(self):
         state = DeceitState()
@@ -118,6 +155,7 @@ class TestDeceitState:
             episode_id="abc-123",
             ground_truth="Canberra",
             episode_rewards=[1.3, 0.0],
         )
         data = state.model_dump_json()
         restored = DeceitState.model_validate_json(data)

 import pytest
 from pydantic import ValidationError
         assert obs.max_turns == 5
         assert obs.level == 2
+    def test_openenv_inherited_done_field(self):
+        obs = DeceitObservation(question="Q", done=True)
+        assert obs.done is True
+    def test_openenv_inherited_reward_field(self):
+        obs = DeceitObservation(question="Q", reward=1.3)
+        assert obs.reward == pytest.approx(1.3)
+    def test_openenv_inherited_metadata_field(self):
+        obs = DeceitObservation(question="Q", metadata={"key": "val"})
+        assert obs.metadata["key"] == "val"
+    def test_extra_fields_rejected(self):
+        with pytest.raises(ValidationError):
+            DeceitObservation(question="Q", nonexistent_field="boom")
     def test_json_roundtrip(self):
+        obs = DeceitObservation(question="Q", context=["ctx"], level=2, done=True, reward=0.5)
         data = obs.model_dump_json()
         restored = DeceitObservation.model_validate_json(data)
         assert restored == obs
         assert action.answer == ""
         assert action.confidence == 0.5
         assert action.abstain is False
+        assert action.is_final is False
+    def test_is_final_field(self):
+        action = DeceitAction(reasoning="committing now", answer="Canberra", is_final=True)
+        assert action.is_final is True
     def test_with_all_fields(self):
         action = DeceitAction(
             answer="Canberra",
             confidence=0.9,
             abstain=False,
+            is_final=True,
         )
         assert action.answer == "Canberra"
         assert action.confidence == 0.9
+        assert action.is_final is True
     def test_confidence_upper_bound_rejected(self):
         with pytest.raises(ValidationError):
         action = DeceitAction(reasoning="unsure", abstain=True)
         assert action.abstain is True
+    def test_extra_fields_rejected(self):
+        with pytest.raises(ValidationError):
+            DeceitAction(reasoning="r", ghost_field=True)
     def test_json_roundtrip(self):
+        action = DeceitAction(reasoning="r", answer="Canberra", confidence=0.9, is_final=True)
         data = action.model_dump_json()
         restored = DeceitAction.model_validate_json(data)
         assert restored == action
         assert state.ground_truth == ""
         assert state.current_question_id == ""
         assert state.episode_rewards == []
+        assert state.prior_reasoning == []
+        assert state.max_turns == 3
     def test_with_all_fields(self):
         state = DeceitState(
             ground_truth="Canberra",
             current_question_id="q_042",
             episode_rewards=[1.3, -1.1],
+            prior_reasoning=["First I thought Sydney...", "Then reconsidered."],
+            max_turns=3,
         )
         assert state.episode_id == "abc-123"
         assert state.ground_truth == "Canberra"
         assert state.episode_rewards == [1.3, -1.1]
+        assert len(state.prior_reasoning) == 2
+    def test_prior_reasoning_accumulates(self):
+        state = DeceitState()
+        state.prior_reasoning.append("step 1 thinking")
+        state.prior_reasoning.append("step 2 thinking")
+        assert len(state.prior_reasoning) == 2
     def test_mutable_state_can_be_updated(self):
         state = DeceitState()
             episode_id="abc-123",
             ground_truth="Canberra",
             episode_rewards=[1.3, 0.0],
+            prior_reasoning=["I think it is Canberra"],
         )
         data = state.model_dump_json()
         restored = DeceitState.model_validate_json(data)