| """Unit tests for dense reward components.""" |
|
|
| import math |
| import sqlite3 |
|
|
| import pytest |
|
|
| from server.reward import ( |
| _bin_progress, |
| _cardinality_score, |
| _layer1_operational, |
| _layer2_progress, |
| _numeric_range_score, |
| _value_overlap_score, |
| compute_step_reward, |
| ) |
| from sql_env.models import EpisodeContext, QuestionRecord |
|
|
|
|
def _build_question_record() -> QuestionRecord:
    """Return the fixed ``QuestionRecord`` used by every test context."""
    record_fields = {
        "question_id": "q-episode-context",
        "question_text": "How many students are there?",
        "database_name": "student_assessment",
        "gold_sql": "SELECT COUNT(*) FROM students",
        "gold_answer": "0",
        "answer_type": "integer",
        "difficulty": "easy",
        "tables_involved": ["students"],
    }
    return QuestionRecord(**record_fields)
|
|
|
|
def _build_episode_context(**kwargs: object) -> EpisodeContext:
    """Create an ``EpisodeContext`` backed by a fresh in-memory SQLite database.

    Any keyword arguments are forwarded to ``EpisodeContext`` unchanged. The
    connection is not closed here; the tests in this file close
    ``db_connection`` themselves once they are done with the context.
    """
    connection = sqlite3.connect(":memory:")
    record = _build_question_record()
    return EpisodeContext(
        episode_id="ep-episode-context",
        db_connection=connection,
        question_record=record,
        **kwargs,
    )
|
|
|
|
class TestLayer1Operational:
    """Tests for ``_layer1_operational``.

    Each test builds its own context and closes the SQLite connection in a
    ``finally`` clause. Float rewards are compared with ``pytest.approx`` so
    the assertions do not depend on how the implementation's arithmetic
    happens to round.
    """

    def test_layer1_successful_query(self) -> None:
        """A first successful QUERY yields 0.025."""
        context = _build_episode_context()
        try:
            reward = _layer1_operational(
                context,
                action_type="QUERY",
                sql="SELECT 1",
                rows=[(1,)],
                error=None,
            )
            assert reward == pytest.approx(0.025)
        finally:
            context.db_connection.close()

    def test_layer1_successful_describe(self) -> None:
        """A first successful DESCRIBE yields 0.015."""
        context = _build_episode_context()
        try:
            reward = _layer1_operational(
                context,
                action_type="DESCRIBE",
                sql="DESCRIBE students",
                rows=[("id", "INTEGER")],
                error=None,
            )
            assert reward == pytest.approx(0.015)
        finally:
            context.db_connection.close()

    def test_layer1_successful_sample(self) -> None:
        """A first successful SAMPLE yields 0.015."""
        context = _build_episode_context()
        try:
            reward = _layer1_operational(
                context,
                action_type="SAMPLE",
                sql="SELECT * FROM students LIMIT 5",
                rows=[(1,)],
                error=None,
            )
            assert reward == pytest.approx(0.015)
        finally:
            context.db_connection.close()

    def test_layer1_error_query(self) -> None:
        """A QUERY that reports an error yields -0.005."""
        context = _build_episode_context()
        try:
            reward = _layer1_operational(
                context,
                action_type="QUERY",
                sql="SELECT missing FROM students",
                rows=None,
                error="no such column",
            )
            assert reward == pytest.approx(-0.005)
        finally:
            context.db_connection.close()

    def test_layer1_new_info_no_cap(self) -> None:
        """New info is awarded per unique query with no cumulative cap."""
        context = _build_episode_context()
        try:
            # Fifteen distinct statements against the same context.
            total = sum(
                _layer1_operational(
                    context,
                    action_type="QUERY",
                    sql=f"SELECT {idx}",
                    rows=[(idx,)],
                    error=None,
                )
                for idx in range(15)
            )
            assert total == pytest.approx(15 * 0.025)
        finally:
            context.db_connection.close()

    def test_layer1_repeat_penalty(self) -> None:
        """Re-issuing the same SQL on one context yields -0.015."""
        context = _build_episode_context()
        try:
            _layer1_operational(
                context,
                action_type="QUERY",
                sql="SELECT 1",
                rows=[(1,)],
                error=None,
            )
            reward = _layer1_operational(
                context,
                action_type="QUERY",
                sql="SELECT 1",
                rows=[(1,)],
                error=None,
            )
            assert reward == pytest.approx(-0.015)
        finally:
            context.db_connection.close()

    def test_layer1_repeat_no_exec_ok(self) -> None:
        """A repeated ``SELECT 2`` yields -0.015 on its second submission."""
        context = _build_episode_context()
        try:
            _layer1_operational(
                context,
                action_type="QUERY",
                sql="SELECT 2",
                rows=[(2,)],
                error=None,
            )
            reward = _layer1_operational(
                context,
                action_type="QUERY",
                sql="SELECT 2",
                rows=[(2,)],
                error=None,
            )
            # The earlier ``reward <= -0.005`` check was implied by this
            # stricter one and has been dropped.
            assert reward == pytest.approx(-0.015)
        finally:
            context.db_connection.close()

    def test_layer1_step_cost_always_applied(self) -> None:
        """A successful SAMPLE stays below 0.02; an erroring QUERY yields -0.005.

        NOTE(review): presumably 0.02 is the SAMPLE bonus before the step cost
        is subtracted -- confirm against ``server.reward``.
        """
        context = _build_episode_context()
        try:
            reward_success = _layer1_operational(
                context,
                action_type="SAMPLE",
                sql="SELECT * FROM students LIMIT 1",
                rows=[(1,)],
                error=None,
            )
            reward_error = _layer1_operational(
                context,
                action_type="QUERY",
                sql="SELECT bad",
                rows=None,
                error="bad query",
            )
            assert reward_success < 0.02
            assert reward_error == pytest.approx(-0.005)
        finally:
            context.db_connection.close()
|
|
|
|
class TestCardinalityScore:
    """Tests for ``_cardinality_score`` on predicted versus gold row lists."""

    def test_cardinality_exact_match(self) -> None:
        """Two rows against two rows score 1.0 even though the values differ."""
        predicted = [(1,), (2,)]
        gold = [(3,), (4,)]
        assert _cardinality_score(predicted, gold) == 1.0

    def test_cardinality_zero_pred(self) -> None:
        """No predicted rows against one gold row scores 0.0."""
        result = _cardinality_score([], [(1,)])
        assert result == 0.0

    def test_cardinality_zero_gold(self) -> None:
        """One predicted row against no gold rows scores 0.0."""
        result = _cardinality_score([(1,)], [])
        assert result == 0.0

    def test_cardinality_both_empty(self) -> None:
        """Two empty row lists score 1.0."""
        result = _cardinality_score([], [])
        assert result == 1.0

    def test_cardinality_pred_larger(self) -> None:
        """Ten predicted rows against one gold row score about 0.1."""
        predicted = [(value,) for value in range(10)]
        gold = [(1,)]
        assert _cardinality_score(predicted, gold) == pytest.approx(0.1)

    def test_cardinality_gold_larger(self) -> None:
        """One predicted row against four gold rows scores 0.25."""
        predicted = [(1,)]
        gold = [(value,) for value in range(4)]
        assert _cardinality_score(predicted, gold) == 0.25

    def test_cardinality_returns_float_in_range(self) -> None:
        """The score for mismatched counts lies within [0.0, 1.0]."""
        result = _cardinality_score([(1,), (2,)], [(1,)])
        assert 0.0 <= result <= 1.0
|
|
|
|
class TestValueOverlapScore:
    """Tests for ``_value_overlap_score`` on predicted versus gold rows."""

    def test_value_overlap_identical(self) -> None:
        """Identical single rows score 1.0."""
        predicted = [(1, "a")]
        gold = [(1, "a")]
        assert _value_overlap_score(predicted, gold) == 1.0

    def test_value_overlap_disjoint(self) -> None:
        """Rows that share no values score 0.0."""
        predicted = [(1, "x")]
        gold = [(2, "y")]
        assert _value_overlap_score(predicted, gold) == 0.0

    def test_value_overlap_partial(self) -> None:
        """Sharing one row out of two on each side scores 2/6."""
        predicted = [(1, "a"), (2, "b")]
        gold = [(1, "a"), (3, "c")]
        overlap = _value_overlap_score(predicted, gold)
        assert overlap == pytest.approx(2 / 6)

    def test_value_overlap_empty_pred(self) -> None:
        """No predicted rows score 0.0."""
        overlap = _value_overlap_score([], [(1,)])
        assert overlap == 0.0

    def test_value_overlap_empty_gold(self) -> None:
        """No gold rows score 0.0."""
        overlap = _value_overlap_score([(1,)], [])
        assert overlap == 0.0

    def test_value_overlap_both_empty(self) -> None:
        """Two empty row lists score 0.0."""
        overlap = _value_overlap_score([], [])
        assert overlap == 0.0

    def test_value_overlap_stringifies_values(self) -> None:
        """A row mixing int, float and ``None`` matches itself with score 1.0.

        NOTE(review): the test name suggests values are compared via their
        string form -- confirm against the implementation.
        """
        predicted = [(1, 2.5, None)]
        gold = [(1, 2.5, None)]
        overlap = _value_overlap_score(predicted, gold)
        assert overlap == 1.0

    def test_value_overlap_returns_float_in_range(self) -> None:
        """A partially matching row scores within [0.0, 1.0]."""
        overlap = _value_overlap_score([(1, "a")], [(1, "b")])
        assert 0.0 <= overlap <= 1.0
|
|
|
|
class TestNumericRangeScore:
    """Tests for ``_numeric_range_score`` on predicted versus gold rows."""

    def test_numeric_range_identical(self) -> None:
        """Identical numeric values score 1.0."""
        assert _numeric_range_score([(10,)], [(10,)]) == 1.0

    def test_numeric_range_no_numerics_in_gold(self) -> None:
        """Rows holding only strings score 1.0."""
        assert _numeric_range_score([("a",)], [("b",)]) == 1.0

    def test_numeric_range_close_values(self) -> None:
        """11 versus 10 scores strictly between 0.5 and 1.0."""
        score = _numeric_range_score([(11,)], [(10,)])
        assert 0.5 < score < 1.0

    def test_numeric_range_far_values(self) -> None:
        """1000000 versus 1 scores below 0.1."""
        score = _numeric_range_score([(1000000,)], [(1,)])
        assert score < 0.1

    def test_numeric_range_zero_distance(self) -> None:
        """0 versus 0 scores 1.0."""
        assert _numeric_range_score([(0,)], [(0,)]) == 1.0

    def test_numeric_range_negative_numbers(self) -> None:
        """-5 versus 5 scores ``1 / (1 + log1p(10))``.

        The expected value goes through ``math.log1p``, so it is compared with
        ``pytest.approx`` instead of exact float equality; an equivalent but
        differently ordered computation in the implementation must not fail
        the test over the last bit of rounding.
        """
        expected = 1.0 / (1.0 + math.log1p(10.0))
        score = _numeric_range_score([(-5,)], [(5,)])
        assert score == pytest.approx(expected)

    def test_numeric_range_mixed_types(self) -> None:
        """Equal numbers next to differing strings still score 1.0."""
        assert _numeric_range_score([(10, "a")], [(10, "b")]) == 1.0

    def test_numeric_range_empty_pred(self) -> None:
        """No predicted rows against a numeric gold row score 0.0."""
        assert _numeric_range_score([], [(1,)]) == 0.0

    def test_numeric_range_returns_float_in_range(self) -> None:
        """A multi-row prediction scores within [0.0, 1.0]."""
        score = _numeric_range_score([(5,), (10,)], [(7,)])
        assert 0.0 <= score <= 1.0
|
|
|
|
class TestBinProgress:
    """Tests for ``_bin_progress``.

    The cases pin outputs of 0.0, 0.25, 0.5, 0.75 and 1.0, with the
    switch-over inputs at 0.125, 0.375, 0.625 and 0.875.
    """

    def test_bin_progress_zero(self) -> None:
        """0.0 maps to 0.0."""
        binned = _bin_progress(0.0)
        assert binned == 0.0

    def test_bin_progress_low(self) -> None:
        """0.124, just under the first boundary, maps to 0.0."""
        binned = _bin_progress(0.124)
        assert binned == 0.0

    def test_bin_progress_boundary_0125(self) -> None:
        """0.125 maps to 0.25."""
        binned = _bin_progress(0.125)
        assert binned == 0.25

    def test_bin_progress_mid_low(self) -> None:
        """0.3 maps to 0.25."""
        binned = _bin_progress(0.3)
        assert binned == 0.25

    def test_bin_progress_boundary_0375(self) -> None:
        """0.375 maps to 0.5."""
        binned = _bin_progress(0.375)
        assert binned == 0.5

    def test_bin_progress_mid(self) -> None:
        """0.5 maps to 0.5."""
        binned = _bin_progress(0.5)
        assert binned == 0.5

    def test_bin_progress_boundary_0625(self) -> None:
        """0.625 maps to 0.75."""
        binned = _bin_progress(0.625)
        assert binned == 0.75

    def test_bin_progress_mid_high(self) -> None:
        """0.7 maps to 0.75."""
        binned = _bin_progress(0.7)
        assert binned == 0.75

    def test_bin_progress_boundary_0875(self) -> None:
        """0.875 maps to 1.0."""
        binned = _bin_progress(0.875)
        assert binned == 1.0

    def test_bin_progress_one(self) -> None:
        """1.0 maps to 1.0."""
        binned = _bin_progress(1.0)
        assert binned == 1.0
|
|
|
|
class TestLayer2Progress:
    """Tests for ``_layer2_progress`` and the ``previous_progress`` it updates."""

    def test_layer2_perfect_match(self) -> None:
        """Rows equal to the gold rows earn 0.15 and set progress to 1.0."""
        gold = [(1, "a", 10)]
        ctx = _build_episode_context(gold_rows=gold)
        try:
            delta_reward = _layer2_progress(ctx, rows=[(1, "a", 10)])
            assert delta_reward == pytest.approx(0.15)
            assert ctx.previous_progress == 1.0
        finally:
            ctx.db_connection.close()

    def test_layer2_no_change(self) -> None:
        """Submitting the same perfect rows twice earns 0.0 the second time."""
        gold = [(1, "a", 10)]
        ctx = _build_episode_context(gold_rows=gold)
        try:
            _layer2_progress(ctx, rows=[(1, "a", 10)])
            delta_reward = _layer2_progress(ctx, rows=[(1, "a", 10)])
            assert delta_reward == 0.0
            assert ctx.previous_progress == 1.0
        finally:
            ctx.db_connection.close()

    def test_layer2_improvement(self) -> None:
        """Moving from one of four gold rows to all four is rewarded stepwise."""
        gold = [(1,), (2,), (3,), (4,)]
        ctx = _build_episode_context(gold_rows=gold)
        try:
            # Partial result first: progress lands at 0.25.
            partial_reward = _layer2_progress(ctx, rows=[(1,)])
            assert partial_reward == pytest.approx(0.0375)
            assert ctx.previous_progress == 0.25

            # Complete result next: the remaining progress is rewarded.
            complete_reward = _layer2_progress(ctx, rows=[(1,), (2,), (3,), (4,)])
            assert complete_reward == pytest.approx(0.1125)
            assert ctx.previous_progress == 1.0
        finally:
            ctx.db_connection.close()

    def test_layer2_regression_penalized(self) -> None:
        """Delta-based: regression yields negative reward."""
        gold = [(1, "a", 10)]
        ctx = _build_episode_context(gold_rows=gold)
        try:
            _layer2_progress(ctx, rows=[(1, "a", 10)])
            assert ctx.previous_progress == 1.0

            # An empty result after a perfect one drops progress back to 0.0.
            delta_reward = _layer2_progress(ctx, rows=[])
            assert delta_reward < 0.0
            assert ctx.previous_progress == 0.0
        finally:
            ctx.db_connection.close()

    def test_layer2_recovery_rewarded(self) -> None:
        """Delta-based: recovery after regression IS rewarded."""
        gold = [(1, "a", 10)]
        ctx = _build_episode_context(gold_rows=gold)
        try:
            _layer2_progress(ctx, rows=[(1, "a", 10)])
            _layer2_progress(ctx, rows=[])
            delta_reward = _layer2_progress(ctx, rows=[(1, "a", 10)])
            assert delta_reward == pytest.approx(0.15)
            assert ctx.previous_progress == 1.0
        finally:
            ctx.db_connection.close()

    def test_layer2_empty_gold_rows(self) -> None:
        """With no gold rows the reward is 0.0 and progress stays at 0.0."""
        ctx = _build_episode_context(gold_rows=[])
        try:
            delta_reward = _layer2_progress(ctx, rows=[(1,)])
            assert delta_reward == 0.0
            assert ctx.previous_progress == 0.0
        finally:
            ctx.db_connection.close()

    def test_layer2_weighted_average(self) -> None:
        """One matching and one distant value earn 0.075 at progress 0.5."""
        gold = [(10,), (20,)]
        ctx = _build_episode_context(gold_rows=gold)
        try:
            delta_reward = _layer2_progress(ctx, rows=[(10,), (1000,)])
            assert delta_reward == pytest.approx(0.075)
            assert ctx.previous_progress == 0.5
        finally:
            ctx.db_connection.close()

    def test_layer2_updates_previous_progress(self) -> None:
        """``previous_progress`` starts at 0.0 and is 1.0 after a full match."""
        gold = [(1,), (2,), (3,), (4,)]
        ctx = _build_episode_context(gold_rows=gold)
        try:
            assert ctx.previous_progress == 0.0
            _layer2_progress(ctx, rows=[(1,), (2,), (3,), (4,)])
            assert ctx.previous_progress == 1.0
        finally:
            ctx.db_connection.close()
|
|
|
|
class TestComputeStepReward:
    """Tests for ``compute_step_reward``.

    Each test builds its own context and closes the SQLite connection in a
    ``finally`` clause. Float rewards are compared with ``pytest.approx``
    throughout, so the assertions do not depend on the exact rounding of the
    implementation's arithmetic.
    """

    def test_compute_reward_query_success(self) -> None:
        """A successful, partially correct QUERY yields 0.1."""
        context = _build_episode_context(gold_rows=[(10,), (20,)])
        try:
            reward = compute_step_reward(
                context,
                action_type="QUERY",
                sql="SELECT value FROM t",
                rows=[(10,), (1000,)],
                error=None,
            )
            assert reward == pytest.approx(0.1)
        finally:
            context.db_connection.close()

    def test_compute_reward_query_error(self) -> None:
        """A QUERY that reports an error yields -0.005."""
        context = _build_episode_context(gold_rows=[(1,)])
        try:
            reward = compute_step_reward(
                context,
                action_type="QUERY",
                sql="SELECT missing",
                rows=None,
                error="no such column",
            )
            assert reward == pytest.approx(-0.005)
        finally:
            context.db_connection.close()

    def test_compute_reward_describe(self) -> None:
        """A DESCRIBE yields 0.015 and leaves ``previous_progress`` at 0.0."""
        context = _build_episode_context(gold_rows=[(1,)])
        try:
            reward = compute_step_reward(
                context,
                action_type="DESCRIBE",
                sql="DESCRIBE students",
                rows=[("id", "INTEGER")],
                error=None,
            )
            assert reward == pytest.approx(0.015)
            assert context.previous_progress == 0.0
        finally:
            context.db_connection.close()

    def test_compute_reward_sample(self) -> None:
        """A SAMPLE yields 0.015 and leaves ``previous_progress`` at 0.0."""
        context = _build_episode_context(gold_rows=[(1,)])
        try:
            reward = compute_step_reward(
                context,
                action_type="SAMPLE",
                sql="SELECT * FROM students LIMIT 1",
                rows=[(1,)],
                error=None,
            )
            assert reward == pytest.approx(0.015)
            assert context.previous_progress == 0.0
        finally:
            context.db_connection.close()

    def test_compute_reward_per_step_cap(self) -> None:
        """Per-step clipping caps at 0.15."""
        context = _build_episode_context(gold_rows=[(1, "a", 10)])
        try:
            reward = compute_step_reward(
                context,
                action_type="QUERY",
                sql="SELECT 1, 'a', 10",
                rows=[(1, "a", 10)],
                error=None,
            )
            # NOTE(review): only the upper bound is checked; the other tests in
            # this file suggest the clipped value is 0.15 itself -- confirm
            # before tightening this to an equality.
            assert reward <= 0.15
        finally:
            context.db_connection.close()

    def test_compute_reward_per_step_floor(self) -> None:
        """Per-step clipping floors at -0.05."""
        context = _build_episode_context(gold_rows=[(1, "a", 10)])
        try:
            # First submission establishes the baseline for the repeat below.
            compute_step_reward(
                context,
                action_type="QUERY",
                sql="SELECT 1, 'a', 10",
                rows=[(1, "a", 10)],
                error=None,
            )
            # Identical second submission.
            # NOTE(review): this presumably yields only the repeat penalty and
            # so may never reach the floor -- confirm whether a stronger
            # scenario is needed to exercise the clipping.
            reward = compute_step_reward(
                context,
                action_type="QUERY",
                sql="SELECT 1, 'a', 10",
                rows=[(1, "a", 10)],
                error=None,
            )
            assert reward >= -0.05
        finally:
            context.db_connection.close()

    def test_compute_reward_no_cumulative_tracking(self) -> None:
        """Each step is independent -- no cumulative state."""
        context = _build_episode_context(gold_rows=[(1,)])
        try:
            assert not hasattr(context, "cumulative_step_reward")
        finally:
            context.db_connection.close()

    def test_compute_reward_layer2_skipped_for_describe(self) -> None:
        """A DESCRIBE leaves ``previous_progress`` at 0.0."""
        context = _build_episode_context(gold_rows=[(1,), (2,)])
        try:
            compute_step_reward(
                context,
                action_type="DESCRIBE",
                sql="DESCRIBE students",
                rows=[("id", "INTEGER")],
                error=None,
            )
            assert context.previous_progress == 0.0
        finally:
            context.db_connection.close()

    def test_compute_reward_layer2_skipped_when_rows_none(self) -> None:
        """A QUERY with ``rows=None`` leaves ``previous_progress`` at 0.0."""
        context = _build_episode_context(gold_rows=[(1,), (2,)])
        try:
            compute_step_reward(
                context,
                action_type="QUERY",
                sql="SELECT missing",
                rows=None,
                error="no such column",
            )
            assert context.previous_progress == 0.0
        finally:
            context.db_connection.close()

    def test_compute_reward_layer2_skipped_empty_gold(self) -> None:
        """With empty gold rows a QUERY yields 0.025 and progress stays 0.0."""
        context = _build_episode_context(gold_rows=[])
        try:
            reward = compute_step_reward(
                context,
                action_type="QUERY",
                sql="SELECT 1",
                rows=[(1,)],
                error=None,
            )
            assert reward == pytest.approx(0.025)
            assert context.previous_progress == 0.0
        finally:
            context.db_connection.close()
|
|