Spaces:

TheLinconX
/

contextforge-demo

Sleeping

App Files Files Community

Pablo committed 3 days ago

Commit

be03608

1 Parent(s): bd7899d

PBKVPredictor: 2nd-order Markov model, 19 tests (stub → production)

Browse files

Files changed (1) hide show

tests/test_pbkv_predictor.py +249 -9

tests/test_pbkv_predictor.py CHANGED Viewed

@@ -1,13 +1,20 @@
-"""Tests for PBKVPredictor — TASK-013."""
-import pytest
 import json
 import tempfile
 from pathlib import Path
-from contextforge.scheduling.pbkv_predictor import PBKVPredictor, WorkflowStepRecord, PredictionResult
 class TestPBKVPredictor:
-    """Tests for PBKV predictor stub."""
     @pytest.mark.asyncio
     async def test_log_workflow_step(self):
@@ -28,7 +35,7 @@ class TestPBKVPredictor:
     @pytest.mark.asyncio
     async def test_predict_next_agents_returns_prediction_result(self):
-        """predict_next_agents() returns PredictionResult."""
         with tempfile.TemporaryDirectory() as tmpdir:
             predictor = PBKVPredictor(log_dir=tmpdir, max_history_steps=10)
@@ -42,7 +49,9 @@ class TestPBKVPredictor:
                     cla_group=i % 2,
                 )
-            result = await predictor.predict_next_agents("agent_0", current_step=3, num_predictions=2)
             assert isinstance(result, PredictionResult)
             assert isinstance(result.predicted_agents, list)
@@ -54,7 +63,9 @@ class TestPBKVPredictor:
         with tempfile.TemporaryDirectory() as tmpdir:
             predictor = PBKVPredictor(log_dir=tmpdir, max_history_steps=10)
-            result = await predictor.predict_next_agents("agent_0", current_step=0, num_predictions=3)
             assert isinstance(result, PredictionResult)
             # Empty history → confidence 0, returns current agent as fallback
@@ -62,7 +73,7 @@ class TestPBKVPredictor:
     @pytest.mark.asyncio
     async def test_get_prefetch_candidates(self):
-        """get_prefetch_candidates() returns list of block IDs."""
         with tempfile.TemporaryDirectory() as tmpdir:
             predictor = PBKVPredictor(log_dir=tmpdir, max_history_steps=10)
@@ -110,4 +121,233 @@ class TestPBKVPredictor:
             stats = predictor.get_stats()
             assert stats["history_size"] == 0
             assert stats["max_history_steps"] == 50
-            assert "_pbkv_logs" in stats["log_file"]

+"""Tests for PBKVPredictor — Markov chain implementation."""
 import json
+import pytest
 import tempfile
 from pathlib import Path
+from contextforge.scheduling.pbkv_predictor import (
+    PBKVPredictor,
+    WorkflowStepRecord,
+    PredictionResult,
+)
 class TestPBKVPredictor:
+    """Tests for PBKV predictor Markov chain implementation."""
+    # ===== Existing stub tests (backward compatibility) =====
     @pytest.mark.asyncio
     async def test_log_workflow_step(self):
     @pytest.mark.asyncio
     async def test_predict_next_agents_returns_prediction_result(self):
+        """predict_next_agents() returns PredictionResult via async path."""
         with tempfile.TemporaryDirectory() as tmpdir:
             predictor = PBKVPredictor(log_dir=tmpdir, max_history_steps=10)
                     cla_group=i % 2,
                 )
+            result = await predictor._predict_next_agents_async(
+                "agent_0", current_step=3, num_predictions=2
+            )
             assert isinstance(result, PredictionResult)
             assert isinstance(result.predicted_agents, list)
         with tempfile.TemporaryDirectory() as tmpdir:
             predictor = PBKVPredictor(log_dir=tmpdir, max_history_steps=10)
+            result = await predictor._predict_next_agents_async(
+                "agent_0", current_step=0, num_predictions=3
+            )
             assert isinstance(result, PredictionResult)
             # Empty history → confidence 0, returns current agent as fallback
     @pytest.mark.asyncio
     async def test_get_prefetch_candidates(self):
+        """get_prefetch_candidates() returns list of agent IDs."""
         with tempfile.TemporaryDirectory() as tmpdir:
             predictor = PBKVPredictor(log_dir=tmpdir, max_history_steps=10)
             stats = predictor.get_stats()
             assert stats["history_size"] == 0
             assert stats["max_history_steps"] == 50
+            assert "workflow_steps.jsonl" in stats["log_file"]
+            assert stats["trained"] is False
+    # ===== Markov chain training tests =====
+    def test_train_from_jsonl(self):
+        """train_from_jsonl() builds the transition table from a JSONL log.
+
+        Writes the sequence A → B → C → A → B to ``workflow_steps.jsonl`` in a
+        temporary directory, trains on that directory, and checks that the
+        predictor is marked trained, saw exactly the agents A, B and C, and
+        holds a transition-table key for every distinct pair of consecutive
+        agents that has a successor in the sequence.
+        """
+        with tempfile.TemporaryDirectory() as tmpdir:
+            log_file = Path(tmpdir) / "workflow_steps.jsonl"
+            # Write JSONL with known sequence: A → B → C → A → B
+            records = [
+                {"step_idx": 0, "agent_id": "A", "anchor_hash": "h0", "token_length": 10, "cla_group": 1},
+                {"step_idx": 1, "agent_id": "B", "anchor_hash": "h1", "token_length": 10, "cla_group": 1},
+                {"step_idx": 2, "agent_id": "C", "anchor_hash": "h2", "token_length": 10, "cla_group": 1},
+                {"step_idx": 3, "agent_id": "A", "anchor_hash": "h3", "token_length": 10, "cla_group": 1},
+                {"step_idx": 4, "agent_id": "B", "anchor_hash": "h4", "token_length": 10, "cla_group": 1},
+            ]
+            with open(log_file, "w") as f:
+                f.writelines(json.dumps(rec) + "\n" for rec in records)
+            predictor = PBKVPredictor(log_dir=tmpdir)
+            predictor.train_from_jsonl(tmpdir)
+            assert predictor._trained is True
+            assert predictor._all_agents == {"A", "B", "C"}
+            # Check 2nd-order transitions exist. The sequence contains three
+            # distinct pair states: (A, B), (B, C) and (C, A). The trailing
+            # A → B step repeats the (A, B) pair, so it is not asserted twice.
+            for state in (("A", "B"), ("B", "C"), ("C", "A")):
+                assert state in predictor._transition_table
+    def test_train_from_jsonl_with_multiple_sequences(self):
+        """train_from_jsonl() trains on a log containing an empty ``{}`` record.
+
+        The log holds two groups of steps (A→B and C→D) with an empty JSON
+        object written between them. Note that the separator is the line
+        ``{}``, not a blank line. The test only checks that training succeeds
+        and that all four agents are registered.
+        """
+        with tempfile.TemporaryDirectory() as tmpdir:
+            log_file = Path(tmpdir) / "workflow_steps.jsonl"
+            # Two sequences: A→B and C→D, separated by an empty record.
+            # NOTE(review): whether the predictor treats ``{}`` as a sequence
+            # boundary or simply skips it is not verified here; no assertion
+            # inspects the transition table.
+            records = [
+                {"step_idx": 0, "agent_id": "A", "anchor_hash": "h0", "token_length": 10, "cla_group": 1},
+                {"step_idx": 1, "agent_id": "B", "anchor_hash": "h1", "token_length": 10, "cla_group": 1},
+                {},
+                {"step_idx": 0, "agent_id": "C", "anchor_hash": "h2", "token_length": 10, "cla_group": 1},
+                {"step_idx": 1, "agent_id": "D", "anchor_hash": "h3", "token_length": 10, "cla_group": 1},
+            ]
+            with open(log_file, "w") as f:
+                f.writelines(json.dumps(rec) + "\n" for rec in records)
+            predictor = PBKVPredictor(log_dir=tmpdir)
+            predictor.train_from_jsonl(tmpdir)
+            assert predictor._trained is True
+            assert predictor._all_agents == {"A", "B", "C", "D"}
+    def test_train_from_jsonl_missing_file(self):
+        """Training from a path that does not exist leaves the predictor untrained."""
+        with tempfile.TemporaryDirectory() as workdir:
+            model = PBKVPredictor(log_dir=workdir)
+            # Nothing is ever created at this path.
+            missing_path = Path(workdir) / "nonexistent.jsonl"
+            model.train_from_jsonl(str(missing_path))
+            assert model._trained is False
+    # ===== Prediction correctness tests =====
+    def test_predict_next_agents_sync(self):
+        """The synchronous predict_next_agents() call yields a list of agent IDs."""
+        with tempfile.TemporaryDirectory() as workdir:
+            model = PBKVPredictor(log_dir=workdir)
+            # Training data encodes the chain A → B → C.
+            jsonl_path = Path(workdir) / "workflow_steps.jsonl"
+            steps = [("A", "h0"), ("B", "h1"), ("C", "h2")]
+            rows = [
+                {
+                    "step_idx": idx,
+                    "agent_id": agent,
+                    "anchor_hash": digest,
+                    "token_length": 10,
+                    "cla_group": 1,
+                }
+                for idx, (agent, digest) in enumerate(steps)
+            ]
+            jsonl_path.write_text("".join(json.dumps(row) + "\n" for row in rows))
+            model.train_from_jsonl(workdir)
+            guesses = model.predict_next_agents("B", top_k=2)
+            assert isinstance(guesses, list)
+            # B → C is the only transition out of B in the training data.
+            assert "C" in guesses
+            assert len(guesses) <= 2
+    def test_predict_next_agents_fallback_on_empty_history(self):
+        """With nothing trained, predict_next_agents() echoes the queried agent."""
+        with tempfile.TemporaryDirectory() as workdir:
+            # Fresh predictor: never trained, no steps logged.
+            untrained = PBKVPredictor(log_dir=workdir)
+            fallback = untrained.predict_next_agents("X", top_k=3)
+            assert fallback == ["X"]
+    def test_predict_next_agents_fallback_1st_order(self):
+        """predict_next_agents() still finds C after B when only A → B → C was trained."""
+        with tempfile.TemporaryDirectory() as workdir:
+            model = PBKVPredictor(log_dir=workdir)
+            # Train: A → B → C (only 2nd-order state (A,B)→C)
+            jsonl_path = Path(workdir) / "workflow_steps.jsonl"
+            steps = [("A", "h0"), ("B", "h1"), ("C", "h2")]
+            rows = [
+                {
+                    "step_idx": idx,
+                    "agent_id": agent,
+                    "anchor_hash": digest,
+                    "token_length": 10,
+                    "cla_group": 1,
+                }
+                for idx, (agent, digest) in enumerate(steps)
+            ]
+            jsonl_path.write_text("".join(json.dumps(row) + "\n" for row in rows))
+            model.train_from_jsonl(workdir)
+            # Query for unseen state: should fall back to 1st-order
+            top_guess = model.predict_next_agents("B", top_k=1)
+            assert "C" in top_guess
+    def test_predict_next_agents_top_k(self):
+        """Asking for top_k=1 yields exactly one prediction."""
+        with tempfile.TemporaryDirectory() as workdir:
+            model = PBKVPredictor(log_dir=workdir)
+            jsonl_path = Path(workdir) / "workflow_steps.jsonl"
+            # Training sequence: A, B, A.
+            steps = [("A", "h0"), ("B", "h1"), ("A", "h2")]
+            rows = [
+                {
+                    "step_idx": idx,
+                    "agent_id": agent,
+                    "anchor_hash": digest,
+                    "token_length": 10,
+                    "cla_group": 1,
+                }
+                for idx, (agent, digest) in enumerate(steps)
+            ]
+            jsonl_path.write_text("".join(json.dumps(row) + "\n" for row in rows))
+            model.train_from_jsonl(workdir)
+            guesses = model.predict_next_agents("B", top_k=1)
+            assert len(guesses) == 1
+    # ===== blend_alpha tests =====
+    def test_blend_alpha_parameter(self):
+        """A blend_alpha passed to the constructor is kept on the instance."""
+        with tempfile.TemporaryDirectory() as workdir:
+            custom = PBKVPredictor(log_dir=workdir, blend_alpha=0.7)
+            stored_alpha = custom._blend_alpha
+            assert stored_alpha == 0.7
+    def test_blend_alpha_default(self):
+        """When blend_alpha is omitted the predictor uses 0.6."""
+        with tempfile.TemporaryDirectory() as workdir:
+            default_model = PBKVPredictor(log_dir=workdir)
+            stored_alpha = default_model._blend_alpha
+            assert stored_alpha == 0.6
+    @pytest.mark.asyncio
+    async def test_get_eviction_priority_without_step_graph(self):
+        """get_eviction_priority() returns a 3-item list when no step graph is given."""
+        with tempfile.TemporaryDirectory() as workdir:
+            model = PBKVPredictor(log_dir=workdir)
+            jsonl_path = Path(workdir) / "workflow_steps.jsonl"
+            # Training sequence: A, B, C.
+            steps = [("A", "h0"), ("B", "h1"), ("C", "h2")]
+            rows = [
+                {
+                    "step_idx": idx,
+                    "agent_id": agent,
+                    "anchor_hash": digest,
+                    "token_length": 10,
+                    "cla_group": 1,
+                }
+                for idx, (agent, digest) in enumerate(steps)
+            ]
+            jsonl_path.write_text("".join(json.dumps(row) + "\n" for row in rows))
+            model.train_from_jsonl(workdir)
+            candidates = ["A", "B", "C"]
+            ranking = await model.get_eviction_priority(candidates)
+            # Only the shape of the result is checked, not its ordering.
+            assert isinstance(ranking, list)
+            assert len(ranking) == 3
+    @pytest.mark.asyncio
+    async def test_get_eviction_priority_with_step_graph(self):
+        """get_eviction_priority() accepts a step_graph and returns a 2-item list.
+
+        Only the type and length of the result are checked; the blended
+        ordering itself is not asserted.
+        """
+        with tempfile.TemporaryDirectory() as workdir:
+            model = PBKVPredictor(log_dir=workdir, blend_alpha=0.6)
+            # Training sequence: retriever, then summarizer.
+            jsonl_path = Path(workdir) / "workflow_steps.jsonl"
+            steps = [("retriever", "h0"), ("summarizer", "h1")]
+            rows = [
+                {
+                    "step_idx": idx,
+                    "agent_id": agent,
+                    "anchor_hash": digest,
+                    "token_length": 10,
+                    "cla_group": 1,
+                }
+                for idx, (agent, digest) in enumerate(steps)
+            ]
+            jsonl_path.write_text("".join(json.dumps(row) + "\n" for row in rows))
+            model.train_from_jsonl(workdir)
+            # Build a two-step graph in which summarizer depends on retriever.
+            from contextforge.scheduling.step_graph import AgentStepGraph, AgentStep
+            dag = AgentStepGraph()
+            retrieve_step = AgentStep(agent_id="retriever", depends_on=[], step_index=0)
+            dag.add_step(retrieve_step)
+            summarize_step = AgentStep(agent_id="summarizer", depends_on=["retriever"], step_index=1)
+            dag.add_step(summarize_step)
+            candidates = ["retriever", "summarizer"]
+            ranking = await model.get_eviction_priority(candidates, step_graph=dag)
+            assert isinstance(ranking, list)
+            assert len(ranking) == 2
+    # ===== Stats tests =====
+    def test_get_stats_after_training(self):
+        """After training on A → B, get_stats() reports the trained state."""
+        with tempfile.TemporaryDirectory() as workdir:
+            model = PBKVPredictor(log_dir=workdir)
+            jsonl_path = Path(workdir) / "workflow_steps.jsonl"
+            steps = [("A", "h0"), ("B", "h1")]
+            rows = [
+                {
+                    "step_idx": idx,
+                    "agent_id": agent,
+                    "anchor_hash": digest,
+                    "token_length": 10,
+                    "cla_group": 1,
+                }
+                for idx, (agent, digest) in enumerate(steps)
+            ]
+            jsonl_path.write_text("".join(json.dumps(row) + "\n" for row in rows))
+            model.train_from_jsonl(workdir)
+            summary = model.get_stats()
+            assert summary["trained"] is True
+            assert summary["transition_table_size"] > 0
+            # Two distinct agents (A and B) appear in the training log.
+            assert summary["unique_agents"] == 2