Spaces:

Amogh-kal1
/

forge-arena

Sleeping

App Files Files Community

forge-arena / tests / test_forge.py

Amogh-kal1

Upload folder using huggingface_hub

db75f77 verified 16 days ago

raw

history blame contribute delete

13.8 kB

	"""Tests for the Forge module — pass@k formula, difficulty classification, scheduler."""
	from __future__ import annotations

	import pytest

	from forge_arena.forge.estimator import pass_at_k
	from forge_arena.forge.generator import TaskGenerator
	from forge_arena.models.tasks import CorruptionType, DifficultyTier, Task, TaskDomain


	# ─────────────────────────────────────────────────────────────────────────────
	# pass@k unbiased estimator
	# ─────────────────────────────────────────────────────────────────────────────

	class TestPassAtK:
	    """Checks on ``pass_at_k``: end points, argument validation, ordering and range."""

	    def test_zero_correct(self):
	        """With no correct samples the estimate is 0.0."""
	        estimate = pass_at_k(n=10, c=0, k=8)
	        assert estimate == pytest.approx(0.0)

	    def test_all_correct(self):
	        """With every sample correct the estimate is 1.0."""
	        estimate = pass_at_k(n=10, c=10, k=8)
	        assert estimate == pytest.approx(1.0)

	    def test_half_correct_k1(self):
	        """pass@1 with 5 of 10 correct is 0.5 (absolute tolerance 0.01)."""
	        assert pytest.approx(0.5, abs=0.01) == pass_at_k(n=10, c=5, k=1)

	    def test_k_greater_than_n_raises(self):
	        """Asking for k=8 with only n=4 samples raises ValueError."""
	        with pytest.raises(ValueError):
	            pass_at_k(n=4, c=2, k=8)

	    def test_c_greater_than_n_raises(self):
	        """Reporting c=6 correct out of n=5 samples raises ValueError."""
	        with pytest.raises(ValueError):
	            pass_at_k(n=5, c=6, k=4)

	    def test_monotone_in_c(self):
	        """The estimate never decreases as the number of correct samples grows."""
	        estimates = [pass_at_k(n=10, c=correct, k=8) for correct in range(11)]
	        # Compare each estimate with its successor.
	        for lower, higher in zip(estimates, estimates[1:]):
	            assert lower <= higher

	    def test_output_in_unit_interval(self):
	        """Every estimate on a small grid of (n, c) values lies in [0, 1]."""
	        for n in (8, 10, 16):
	            k = min(8, n)  # k may not exceed n
	            for c in range(0, n + 1, 2):
	                val = pass_at_k(n=n, c=c, k=k)
	                assert 0.0 <= val <= 1.0, f"Out of range for n={n}, c={c}"


	# ─────────────────────────────────────────────────────────────────────────────
	# Difficulty classification
	# ─────────────────────────────────────────────────────────────────────────────

	class TestDifficultyClassification:
	    """Checks that ``classify_difficulty`` maps pass@k scores onto difficulty tiers.

	    Every test uses a mocked config with ``too_easy = 0.85`` and
	    ``too_hard = 0.20``.
	    """

	    def setup_method(self):
	        """Build the mocked config and a DifficultyEstimator before each test."""
	        from unittest.mock import MagicMock
	        from forge_arena.forge.estimator import DifficultyEstimator

	        cfg = MagicMock()
	        cfg.difficulty_thresholds.too_easy = 0.85
	        cfg.difficulty_thresholds.too_hard = 0.20
	        cfg.estimation_k = 8
	        cfg.estimation_n = 10
	        self.config = cfg
	        # DifficultyEstimator requires a shared mutable episode_counter list
	        self.estimator = DifficultyEstimator(cfg, [])

	    def _tier_for(self, score):
	        """Classify ``score`` against the config built in ``setup_method``."""
	        from forge_arena.forge.estimator import classify_difficulty

	        return classify_difficulty(score, self.config)

	    def test_too_easy(self):
	        """A score of 0.90 is above the too-easy threshold."""
	        assert self._tier_for(0.90) == DifficultyTier.TOO_EASY

	    def test_too_hard(self):
	        """A score of 0.10 is below the too-hard threshold."""
	        assert self._tier_for(0.10) == DifficultyTier.TOO_HARD

	    def test_learnable_midpoint(self):
	        """A score of 0.50 sits between both thresholds."""
	        assert self._tier_for(0.50) == DifficultyTier.LEARNABLE

	    def test_boundary_too_easy(self):
	        """A score exactly equal to the too-easy threshold is still learnable."""
	        assert self._tier_for(0.85) == DifficultyTier.LEARNABLE

	    def test_boundary_too_hard(self):
	        """A score exactly equal to the too-hard threshold is still learnable."""
	        assert self._tier_for(0.20) == DifficultyTier.LEARNABLE


	# ─────────────────────────────────────────────────────────────────────────────
	# TaskScheduler queue management
	# ─────────────────────────────────────────────────────────────────────────────

	class TestTaskScheduler:
	    """Queue-management tests for TaskScheduler.

	    The scheduler is built with MagicMock stand-ins for its config,
	    estimator and generator, so only the scheduler's own logic runs.
	    """

	    def setup_method(self):
	        """Create a fresh TaskScheduler with mocked collaborators before each test."""
	        from unittest.mock import MagicMock
	        from forge_arena.forge.scheduler import TaskScheduler

	        config = MagicMock()
	        config.queue_replenishment_threshold = 5
	        config.batch_reestimation_interval = 50
	        config.difficulty_thresholds.too_easy = 0.85
	        config.difficulty_thresholds.too_hard = 0.20

	        self.estimator = MagicMock()
	        self.generator = MagicMock()
	        self.scheduler = TaskScheduler(config, self.estimator, self.generator)

	    def _make_task(self, task_id: str) -> object:
	        """Return a MagicMock standing in for a non-generated customer-support task."""
	        from unittest.mock import MagicMock

	        task = MagicMock()
	        task.id = task_id
	        task.domain = TaskDomain.CUSTOMER_SUPPORT
	        task.is_generated = False
	        task.difficulty_tier = None
	        return task

	    def _make_snapshot(self, task_id: str, tier: DifficultyTier, pak: float):
	        """Return a MagicMock snapshot carrying a task id, tier and pass@k value."""
	        from unittest.mock import MagicMock

	        snap = MagicMock()
	        snap.task_id = task_id
	        snap.difficulty_tier = tier
	        snap.pass_at_k = pak
	        return snap

	    @pytest.mark.asyncio
	    async def test_initialise_routes_learnable_to_active_queue(self):
	        """Three seed tasks are all counted as learnable after initialise()."""
	        tasks = [self._make_task(f"t-{i}") for i in range(3)]
	        snaps = [
	            self._make_snapshot(f"t-{i}", DifficultyTier.LEARNABLE, 0.50)
	            for i in range(3)
	        ]
	        self.estimator.batch_estimate.return_value = snaps
	        await self.scheduler.initialise(tasks, lambda t: False)
	        state = self.scheduler.get_queue_state()
	        assert state.learnable_count == 3

	    @pytest.mark.asyncio
	    async def test_initialise_places_all_seed_tasks_in_active_queue(self):
	        """Seed tasks bypass estimation and are placed directly in the learnable queue.

	        Pre-estimating with a no-op policy (c=0) would classify every task
	        as too-hard. Seed tasks are hand-authored for the learnable zone, so
	        they skip estimation until real episodes are collected.
	        """
	        tasks = [self._make_task("easy-task")]
	        await self.scheduler.initialise(tasks, lambda t: False)
	        state = self.scheduler.get_queue_state()
	        # Task goes straight to active queue; estimator is never called.
	        assert state.learnable_count == 1
	        assert state.too_easy_count == 0
	        self.estimator.batch_estimate.assert_not_called()

	    def test_request_task_returns_task(self):
	        """A task appended to the active queue is handed back by request_task()."""
	        task = self._make_task("t-1")
	        self.scheduler._active_queue.append(task)
	        result = self.scheduler.request_task()
	        assert result.id == "t-1"

	    def test_request_task_empty_raises(self):
	        """request_task() on a never-initialised scheduler raises QueueEmptyError."""
	        from forge_arena.forge.scheduler import QueueEmptyError

	        # No initialise() called — both _active_queue and _seed_bank are empty.
	        with pytest.raises(QueueEmptyError):
	            self.scheduler.request_task()

	    def test_request_task_cycles_when_queue_exhausted(self):
	        """After all seed tasks are consumed the queue refills from the seed bank."""
	        import asyncio

	        tasks = [self._make_task(f"seed-{i}") for i in range(3)]
	        asyncio.run(self.scheduler.initialise(tasks, lambda t: False))

	        # Consume all 3 tasks
	        for _ in range(3):
	            self.scheduler.request_task()

	        # 4th call must not raise — it should cycle back through the seed bank
	        result = self.scheduler.request_task()
	        seed_ids = {t.id for t in tasks}
	        assert result.id in seed_ids

	    def test_batch_reestimate_does_not_wipe_queue_with_zero_accuracy_policy(self):
	        """_batch_reestimate with always-False policy must not leave the queue empty.

	        This guards against the episode-50 bug where lambda t: False caused
	        pass@k = 0.0 for every task, routing all tasks to the too-hard archive
	        and wiping the active queue.
	        """
	        import asyncio

	        tasks = [self._make_task(f"t-{i}") for i in range(5)]
	        # Simulate always-wrong policy: estimator returns too-hard for every task
	        snap_too_hard = [
	            self._make_snapshot(f"t-{i}", DifficultyTier.TOO_HARD, 0.0)
	            for i in range(5)
	        ]
	        self.estimator.batch_estimate.return_value = snap_too_hard
	        asyncio.run(self.scheduler.initialise(tasks, lambda t: False))

	        # Trigger batch re-estimation with the always-wrong policy
	        asyncio.run(self.scheduler._batch_reestimate(lambda t: False))

	        # Active queue must still be non-empty after the safety-net refill
	        assert len(self.scheduler._active_queue) > 0

	    def test_difficulty_history_empty_after_initialise(self):
	        """No snapshot history is recorded at init time.

	        History starts accumulating only after real episodes are collected
	        via update(). Seed tasks are placed in the queue without estimation.
	        """
	        import asyncio

	        tasks = [self._make_task("t-hist")]
	        asyncio.run(self.scheduler.initialise(tasks, lambda t: False))
	        curve = self.scheduler.get_difficulty_curve()
	        # No estimation ran, so history is empty.
	        assert "t-hist" not in curve


	# ─────────────────────────────────────────────────────────────────────────────
	# Generator quality validation
	# ─────────────────────────────────────────────────────────────────────────────

	def _make_task(
	    corruption_type: CorruptionType | None = None,
	    corruption_explanation: str = "",
	    corrupted_worker_output: str = "",
	) -> Task:
	    """Minimal Task fixture for validation tests.

	    Args:
	        corruption_type: Corruption applied to the task, or None for a clean task.
	        corruption_explanation: Explanation text; an empty string is stored as None.
	        corrupted_worker_output: Corrupted output text; an empty string is stored as None.

	    Returns:
	        A generated customer-support Task with fixed id, description,
	        source material and clean output.
	    """
	    return Task(
	        id="test-gen-001",
	        domain=TaskDomain.CUSTOMER_SUPPORT,
	        corruption_type=corruption_type,
	        task_description="Summarise the refund policy.",
	        source_material="Policy text.",
	        clean_worker_output="30-day returns accepted.",
	        # Empty strings are normalised to None.
	        corrupted_worker_output=corrupted_worker_output or None,
	        corruption_location=None,
	        corruption_explanation=corruption_explanation or None,
	        ground_truth_correction=None,
	        is_generated=True,
	    )


	class TestGeneratorValidation:
	    """Unit tests for TaskGenerator._validate_generated_task()."""

	    def test_clean_task_always_valid(self):
	        """A task with no corruption type passes validation."""
	        clean = _make_task(corruption_type=None)
	        verdict = TaskGenerator._validate_generated_task(clean)
	        assert verdict is True

	    def test_corrupted_task_with_trigger_vocab_in_explanation_valid(self):
	        """Trigger vocabulary found in the explanation makes the task valid."""
	        # "omits" is in FACTUAL_OMISSION trigger_vocab
	        omission = _make_task(
	            corruption_type=CorruptionType.FACTUAL_OMISSION,
	            corruption_explanation="The Worker omits the 30-day deadline.",
	        )
	        verdict = TaskGenerator._validate_generated_task(omission)
	        assert verdict is True

	    def test_corrupted_task_with_trigger_vocab_in_output_valid(self):
	        """Trigger vocabulary found in the corrupted output makes the task valid."""
	        # "consistently recommends" (exact phrase) is in BIAS_INJECTION trigger_vocab
	        biased = _make_task(
	            corruption_type=CorruptionType.BIAS_INJECTION,
	            corrupted_worker_output="The agent consistently recommends Product A for all clients.",
	        )
	        verdict = TaskGenerator._validate_generated_task(biased)
	        assert verdict is True

	    def test_corrupted_task_with_no_trigger_vocab_invalid(self):
	        """A corrupted task whose text lacks any trigger vocabulary is rejected."""
	        # Neither explanation nor output contains any AUTHORITY_FABRICATION vocab
	        vague = _make_task(
	            corruption_type=CorruptionType.AUTHORITY_FABRICATION,
	            corruption_explanation="Something is wrong with the answer.",
	            corrupted_worker_output="The policy says returns are accepted.",
	        )
	        verdict = TaskGenerator._validate_generated_task(vague)
	        assert verdict is False

	    def test_corrupted_task_with_empty_fields_invalid(self):
	        """A corrupted task with no explanation and no corrupted output is rejected."""
	        # Corrupt task with no text fields at all
	        blank = _make_task(corruption_type=CorruptionType.TEMPORAL_SHIFT)
	        verdict = TaskGenerator._validate_generated_task(blank)
	        assert verdict is False

	    @pytest.mark.parametrize("ctype", list(CorruptionType))
	    def test_trigger_vocab_detected_for_each_type(self, ctype):
	        """Each corruption type's own trigger vocabulary should make validation pass."""
	        from forge_arena.arena.corruptions.types import CORRUPTION_REGISTRY

	        # Use the first registered trigger phrase for this corruption type.
	        phrase = CORRUPTION_REGISTRY[ctype].trigger_vocab[0]
	        candidate = _make_task(
	            corruption_type=ctype,
	            corruption_explanation=f"The Worker {phrase} the correct value.",
	        )
	        verdict = TaskGenerator._validate_generated_task(candidate)
	        assert verdict is True